/**
 * \file dnn/src/aarch64/matrix_mul/asm/common.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#pragma once
#include <cmath>
#include <cstdint>
#include <type_traits>
#include "src/arm_common/simd_macro/marm_neon.h"
#include "src/common/utils.h"
#include "src/fallback/conv_bias/common.h"

namespace megdnn {
namespace aarch64 {

/* ======================== Prefetch ======================== */
#define ASM_PREFETCH(address)    "PRFM PLDL1KEEP, " address "\n"
#define ASM_PREFETCHL2(address)  "PRFM PLDL2KEEP, " address "\n"
#define ASM_PREFETCHW(address)   "PRFM PSTL1KEEP, " address "\n"
#define ASM_PREFETCHWL2(address) "PRFM PSTL2KEEP, " address "\n"

static inline void prefetch_6x(const void* pfp) {
    // clang-format off
    asm volatile(ASM_PREFETCH("[%[pfp]]")
                 ASM_PREFETCH("[%[pfp], #64]")
                 ASM_PREFETCH("[%[pfp], #128]")
                 ASM_PREFETCH("[%[pfp], #192]")
                 ASM_PREFETCH("[%[pfp], #256]")
                 ASM_PREFETCH("[%[pfp], #320]")
                 :
                 : [pfp] "r"(pfp)
                 : "memory");
    // clang-format on
}

static inline void prefetch_5x(const void* pfp) {
    // clang-format off
    asm volatile(ASM_PREFETCH("[%[pfp]]")
                 ASM_PREFETCH("[%[pfp], #64]")
                 ASM_PREFETCH("[%[pfp], #128]")
                 ASM_PREFETCH("[%[pfp], #192]")
                 ASM_PREFETCH("[%[pfp], #256]")
                 :
                 : [pfp] "r"(pfp)
                 : "memory");
    // clang-format on
}

static inline void prefetch_4x(const void* pfp) {
    // clang-format off
    asm volatile(ASM_PREFETCH("[%[pfp]]")
                 ASM_PREFETCH("[%[pfp], #64]")
                 ASM_PREFETCH("[%[pfp], #128]")
                 ASM_PREFETCH("[%[pfp], #192]")
                 :
                 : [pfp] "r"(pfp)
                 : "memory");
    // clang-format on
}

static inline void prefetch_3x(const void* pfp) {
    // clang-format off
    asm volatile(ASM_PREFETCH("[%[pfp]]")
                 ASM_PREFETCH("[%[pfp], #64]")
                 ASM_PREFETCH("[%[pfp], #128]")
                 :
                 : [pfp] "r"(pfp)
                 : "memory");
    // clang-format on
}

static inline void prefetch_2x(const void* pfp) {
    // clang-format off
    asm volatile(ASM_PREFETCH("[%[pfp]]")
                 ASM_PREFETCH("[%[pfp], #64]")
                 :
                 : [pfp] "r"(pfp)
                 : "memory");
    // clang-format on
}

static inline void prefetch_1x(const void* pfp) {
    // clang-format off
    asm volatile(ASM_PREFETCH("[%[pfp]]") : : [pfp] "r"(pfp) : "memory");
    // clang-format on
}

/* ======================== interleave pack A ======================== */

/**
 * interleave_INTERLEAVE_UNROLLK_BATCH_type
 *
 * BATCH means process BATCH * UNROLL_K cols once, BATCH * sizeof(TYPE) *
 * UNROLL_K = 16bytes(128bits, a vector size).
 *
 * the elements traverse order:
 * rep(j, 0, INTERLEAVE) rep(i, 0, UNROLL_K) *ouptr++ = inptr[j, i]
 */

template <typename T>
static inline void interleave_24x1_8_h_helper(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
        T*& outptr, int skippf = 0) {
    static_assert(sizeof(T) == 2, "only support size == 2");
    asm volatile(
            // Load up 8 elements (1 vector) from each of 8 sources.
            "cbnz    %w[skippf], 1f\n"
            ASM_PREFETCH("[%[inptr0], #128]")
            ASM_PREFETCH("[%[inptr1], #128]")
            ASM_PREFETCH("[%[inptr2], #128]")
            ASM_PREFETCH("[%[inptr3], #128]")
            "1:\n"

            "ldr    q0, [%[inptr0]], #16\n" // q0=A0A1A2A3A4A5A6A7
            "ldr    q4, [%[inptr4]], #16\n" // q8=E0E1E2E3E4E5E6E7
            "ldr    q2, [%[inptr2]], #16\n" // q4=C0C1C2C3...
            "ldr    q6, [%[inptr6]], #16\n"
            "zip1    v8.8h, v0.8h, v4.8h\n"  // q8=A0E0A1E1A2E2A3E3
            "zip2    v16.8h, v0.8h, v4.8h\n" // q16=A4E4A5E5A6E6A7E7
            "zip1    v9.8h, v2.8h, v6.8h\n"  // q9=C0G0C1G1C2G2C3G3
            "zip2    v17.8h, v2.8h, v6.8h\n" // q17=C4G4C5G5C6G6C7G7
            "ldr    q1, [%[inptr1]], #16\n"  // q1=B0B1B2B3B4B5B6B7
            "ldr    q5, [%[inptr5]], #16\n"
            "ldr    q3, [%[inptr3]], #16\n" // q3=D0D1D2D3....
            "ldr    q7, [%[inptr7]], #16\n"
            "zip1    v10.8h, v1.8h, v5.8h\n" // q18=B0F0B1F1B2F2B3F3
            "zip2    v18.8h, v1.8h, v5.8h\n" // q18=B4F4B5F5B6F6B7F7
            "zip1    v11.8h, v3.8h, v7.8h\n" // q19=D0H0D1H1D2H2D3H3
            "zip2    v19.8h, v3.8h, v7.8h\n" // q19=D4H4D5H5D6H6D7H7

            "zip1    v12.8h,  v8.8h,  v9.8h\n" // q20=A0C0E0G0A1C1E1G1
            "zip2    v20.8h,  v8.8h,  v9.8h\n"
            "zip1    v13.8h, v10.8h, v11.8h\n" // q21=B0D0F0H0B1I1F1H1
            "zip2    v21.8h, v10.8h, v11.8h\n"

            "cbnz    %w[skippf], 2f\n"
            ASM_PREFETCH("[%[inptr4], #112]")
            ASM_PREFETCH("[%[inptr5], #112]")
            ASM_PREFETCH("[%[inptr6], #112]")
            ASM_PREFETCH("[%[inptr7], #112]")
            "2:\n"

            "zip1    v22.8h, v16.8h, v17.8h\n"
            "zip2    v30.8h, v16.8h, v17.8h\n"
            "zip1    v23.8h, v18.8h, v19.8h\n"
            "zip2    v31.8h, v18.8h, v19.8h\n"

            "zip1    v14.8h, v12.8h, v13.8h\n"    // q22=A0B0C0D0E0F0G0H0
            "zip2    v15.8h, v12.8h, v13.8h\n"    // q23=A1B1C1D1E1F1G1H1
            "str    q14, [%[outptr]], #48\n"
            "str    q15, [%[outptr]], #48\n"

            "zip1    v0.8h, v20.8h, v21.8h\n"
            "zip2    v1.8h, v20.8h, v21.8h\n"
            "str    q0, [%[outptr]], #48\n"
            "str    q1, [%[outptr]], #48\n"

            "zip1    v2.8h, v22.8h, v23.8h\n"
            "zip2    v3.8h, v22.8h, v23.8h\n"
            "str    q2, [%[outptr]], #48\n"
            "str    q3, [%[outptr]], #48\n"

            "zip1    v4.8h, v30.8h, v31.8h\n"
            "zip2    v5.8h, v30.8h, v31.8h\n"
            "str    q4, [%[outptr]], #48\n"
            "str    q5, [%[outptr]], #48\n"

            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1),
              [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3),
              [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5),
              [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7),
              [outptr] "+r"(outptr)
            : [skippf] "r"(skippf)
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
              "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
              "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
              "v31", "cc", "memory");
}

template <typename T>
static inline void interleave_16x1_8_h_helper(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
        T*& outptr, int skippf = 0) {
    static_assert(sizeof(T) == 2, "only support size == 2");
    asm volatile(
            // Load up 8 elements (1 vector) from each of 8 sources.
            "cbnz    %w[skippf], 1f\n"
            ASM_PREFETCH("[%[inptr0], #128]")
            ASM_PREFETCH("[%[inptr1], #128]")
            ASM_PREFETCH("[%[inptr2], #128]")
            ASM_PREFETCH("[%[inptr3], #128]")
            "1:\n"

            "ldr    q0, [%[inptr0]], #16\n" // q0=A0A1A2A3A4A5A6A7
            "ldr    q4, [%[inptr4]], #16\n" // q8=E0E1E2E3E4E5E6E7
            "ldr    q2, [%[inptr2]], #16\n" // q4=C0C1C2C3...
            "ldr    q6, [%[inptr6]], #16\n"
            "zip1    v8.8h, v0.8h, v4.8h\n"  // q8=A0E0A1E1A2E2A3E3
            "zip2    v16.8h, v0.8h, v4.8h\n" // q16=A4E4A5E5A6E6A7E7
            "zip1    v9.8h, v2.8h, v6.8h\n"  // q9=C0G0C1G1C2G2C3G3
            "zip2    v17.8h, v2.8h, v6.8h\n" // q17=C4G4C5G5C6G6C7G7
            "ldr    q1, [%[inptr1]], #16\n"  // q1=B0B1B2B3B4B5B6B7
            "ldr    q5, [%[inptr5]], #16\n"
            "ldr    q3, [%[inptr3]], #16\n" // q3=D0D1D2D3....
            "ldr    q7, [%[inptr7]], #16\n"
            "zip1    v10.8h, v1.8h, v5.8h\n" // q18=B0F0B1F1B2F2B3F3
            "zip2    v18.8h, v1.8h, v5.8h\n" // q18=B4F4B5F5B6F6B7F7
            "zip1    v11.8h, v3.8h, v7.8h\n" // q19=D0H0D1H1D2H2D3H3
            "zip2    v19.8h, v3.8h, v7.8h\n" // q19=D4H4D5H5D6H6D7H7

            "zip1    v12.8h,  v8.8h,  v9.8h\n" // q20=A0C0E0G0A1C1E1G1
            "zip2    v20.8h,  v8.8h,  v9.8h\n"
            "zip1    v13.8h, v10.8h, v11.8h\n" // q21=B0D0F0H0B1I1F1H1
            "zip2    v21.8h, v10.8h, v11.8h\n"

            "cbnz    %w[skippf], 2f\n"
            ASM_PREFETCH("[%[inptr4], #112]")
            ASM_PREFETCH("[%[inptr5], #112]")
            ASM_PREFETCH("[%[inptr6], #112]")
            ASM_PREFETCH("[%[inptr7], #112]")
            "2:\n"

            "zip1    v22.8h, v16.8h, v17.8h\n"
            "zip2    v30.8h, v16.8h, v17.8h\n"
            "zip1    v23.8h, v18.8h, v19.8h\n"
            "zip2    v31.8h, v18.8h, v19.8h\n"

            "zip1    v14.8h, v12.8h, v13.8h\n"    // q22=A0B0C0D0E0F0G0H0
            "zip2    v15.8h, v12.8h, v13.8h\n"    // q23=A1B1C1D1E1F1G1H1
            "str    q14, [%[outptr]], #32\n"
            "str    q15, [%[outptr]], #32\n"

            "zip1    v0.8h, v20.8h, v21.8h\n"
            "zip2    v1.8h, v20.8h, v21.8h\n"
            "str    q0, [%[outptr]], #32\n"
            "str    q1, [%[outptr]], #32\n"

            "zip1    v2.8h, v22.8h, v23.8h\n"
            "zip2    v3.8h, v22.8h, v23.8h\n"
            "str    q2, [%[outptr]], #32\n"
            "str    q3, [%[outptr]], #32\n"

            "zip1    v4.8h, v30.8h, v31.8h\n"
            "zip2    v5.8h, v30.8h, v31.8h\n"
            "str    q4, [%[outptr]], #32\n"
            "str    q5, [%[outptr]], #32\n"

            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1),
              [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3),
              [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5),
              [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7),
              [outptr] "+r"(outptr)
            : [skippf] "r"(skippf)
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
              "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
              "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
              "v31", "cc", "memory");
}

template <typename T>
static inline void interleave_8x1_8_h(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
        T*& outptr, int skippf = 0) {
    static_assert(sizeof(T) == 2, "only support size == 2");
    asm volatile(
       // Load up 8 elements (1 vector) from each of 8 sources.
       "cbnz    %w[skippf], 1f\n"
       ASM_PREFETCH("[%[inptr0], #128]")
       ASM_PREFETCH("[%[inptr1], #128]")
       ASM_PREFETCH("[%[inptr2], #128]")
       ASM_PREFETCH("[%[inptr3], #128]")
       "1:\n"


       "ldr    q0, [%[inptr0]], #16\n" // q0=A0A1A2A3A4A5A6A7
       "ldr    q4, [%[inptr4]], #16\n" // q8=E0E1E2E3E4E5E6E7
       "ldr    q2, [%[inptr2]], #16\n" // q4=C0C1C2C3...
       "ldr    q6, [%[inptr6]], #16\n"
       "zip1    v8.8h, v0.8h, v4.8h\n"  // q8=A0E0A1E1A2E2A3E3
       "zip2    v16.8h, v0.8h, v4.8h\n" // q16=A4E4A5E5A6E6A7E7
       "zip1    v9.8h, v2.8h, v6.8h\n"  // q9=C0G0C1G1C2G2C3G3
       "zip2    v17.8h, v2.8h, v6.8h\n" // q17=C4G4C5G5C6G6C7G7
       "ldr    q1, [%[inptr1]], #16\n"  // q1=B0B1B2B3B4B5B6B7
       "ldr    q5, [%[inptr5]], #16\n"
       "ldr    q3, [%[inptr3]], #16\n" // q3=D0D1D2D3....
       "ldr    q7, [%[inptr7]], #16\n"
       "zip1    v10.8h, v1.8h, v5.8h\n" // q18=B0F0B1F1B2F2B3F3
       "zip2    v18.8h, v1.8h, v5.8h\n" // q18=B4F4B5F5B6F6B7F7
       "zip1    v11.8h, v3.8h, v7.8h\n" // q19=D0H0D1H1D2H2D3H3
       "zip2    v19.8h, v3.8h, v7.8h\n" // q19=D4H4D5H5D6H6D7H7

       "zip1    v12.8h,  v8.8h,  v9.8h\n" // q20=A0C0E0G0A1C1E1G1
       "zip2    v20.8h,  v8.8h,  v9.8h\n"
       "zip1    v13.8h, v10.8h, v11.8h\n" // q21=B0D0F0H0B1I1F1H1
       "zip2    v21.8h, v10.8h, v11.8h\n"

       "cbnz    %w[skippf], 2f\n"
       ASM_PREFETCH("[%[inptr4], #112]")
       ASM_PREFETCH("[%[inptr5], #112]")
       ASM_PREFETCH("[%[inptr6], #112]")
       ASM_PREFETCH("[%[inptr7], #112]")
       "2:\n"

       "zip1    v22.8h, v16.8h, v17.8h\n"
       "zip2    v30.8h, v16.8h, v17.8h\n"
       "zip1    v23.8h, v18.8h, v19.8h\n"
       "zip2    v31.8h, v18.8h, v19.8h\n"

       "zip1    v14.8h, v12.8h, v13.8h\n"    // q22=A0B0C0D0E0F0G0H0
       "zip2    v15.8h, v12.8h, v13.8h\n"    // q23=A1B1C1D1E1F1G1H1
       "stp    q14, q15, [%[outptr]], #32\n" // Write back first two elements

       "zip1    v0.8h, v20.8h, v21.8h\n"
       "zip2    v1.8h, v20.8h, v21.8h\n"
       "stp    q0, q1, [%[outptr]], #32\n" // Write back next two elements

       "zip1    v2.8h, v22.8h, v23.8h\n"
       "zip2    v3.8h, v22.8h, v23.8h\n"
       "stp    q2, q3, [%[outptr]], #32\n" // Write back next two elements

       "zip1    v4.8h, v30.8h, v31.8h\n"
       "zip2    v5.8h, v30.8h, v31.8h\n"
       "stp    q4, q5, [%[outptr]], #32\n" // Write back last two elements

       : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1),
         [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3),
         [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5),
         [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7),
         [outptr] "+r"(outptr)

       : [skippf] "r"(skippf)
       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
         "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
         "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
         "v31", "cc", "memory");
}

template <typename T>
static inline void interleave_4x1_4_h(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        T*& outptr) {
    static_assert(sizeof(T) == 2, "only support size == 2");
    asm volatile(
            "ldr d0, [%[inptr0]], #8\n"   // d0 = A0A1A2A3
            "ldr d1, [%[inptr1]], #8\n"   // d1 = B0B1B2B3
            "ldr d2, [%[inptr2]], #8\n"   // d2 = C0C1C2C3
            "ldr d3, [%[inptr3]], #8\n"   // d3 = D0D1D2D3
            "zip1 v4.4h, v0.4h, v2.4h\n"  // d4 = A0C0A1C1
            "zip2 v8.4h, v0.4h, v2.4h\n"  // d8 = A2C2A3C3
            "zip1 v5.4h, v1.4h, v3.4h\n"  // d5 = B0D0B1D1
            "zip2 v9.4h, v1.4h, v3.4h\n"  // d9 = B2D2B3D3

            "zip1 v6.4h, v4.4h, v5.4h\n"  // d6 = A0B0C0D0
            "zip2 v7.4h, v4.4h, v5.4h\n"  // d7 = A1B1C1D1
            "stp d6, d7, [%[outptr]], #16\n"

            "zip1 v10.4h, v8.4h, v9.4h\n"  // d10 = A2B2C2D2
            "zip2 v11.4h, v8.4h, v9.4h\n"  // d11 = A3B3C3D3
            "stp d10, d11, [%[outptr]], #16\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
              "memory");
}

static inline void interleave_4x1_2_d(
        const int64_t*& inptr0, const int64_t*& inptr1, const int64_t*& inptr2,
        const int64_t*& inptr3, int64_t*& outptr) {
    asm volatile(
            "ld1 {v0.2d}, [%[inptr0]], #16\n"  // d0 = A0A1
            "ld1 {v1.2d}, [%[inptr1]], #16\n"  // d1 = B0B1
            "ld1 {v2.2d}, [%[inptr2]], #16\n"  // d2 = C0C1
            "ld1 {v3.2d}, [%[inptr3]], #16\n"  // d3 = D0D1

            "zip1 v4.2d, v0.2d, v1.2d\n"  // d8 = A0B0
            "zip2 v5.2d, v0.2d, v1.2d\n"  // d9 = A1B1
            "zip1 v6.2d, v2.2d, v3.2d\n"  // d10 = C0D0
            "zip2 v7.2d, v2.2d, v3.2d\n"  // d11 = C1D1

            "st1 {v4.2d}, [%[outptr]], #16\n"
            "st1 {v6.2d}, [%[outptr]], #16\n"
            "st1 {v5.2d}, [%[outptr]], #16\n"
            "st1 {v7.2d}, [%[outptr]], #16\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "cc", "memory");
}

static inline void interleave_4x2_2_d(
        const int64_t*& inptr0, const int64_t*& inptr1, const int64_t*& inptr2,
        const int64_t*& inptr3, int64_t*& outptr) {
    asm volatile(
            "ld1 {v0.2d}, [%[inptr0]], #16\n"  // d0 = A0
            "ld1 {v1.2d}, [%[inptr0]], #16\n"  // d1 = A1
            "ld1 {v2.2d}, [%[inptr1]], #16\n"  // d2 = B0
            "ld1 {v3.2d}, [%[inptr1]], #16\n"  // d3 = B1
            "ld1 {v4.2d}, [%[inptr2]], #16\n"  // d4 = C0
            "ld1 {v5.2d}, [%[inptr2]], #16\n"  // d5 = C1
            "ld1 {v6.2d}, [%[inptr3]], #16\n"  // d6 = D0
            "ld1 {v7.2d}, [%[inptr3]], #16\n"  // d7 = D1

            "st1 {v0.2d}, [%[outptr]], #16\n"
            "st1 {v2.2d}, [%[outptr]], #16\n"
            "st1 {v4.2d}, [%[outptr]], #16\n"
            "st1 {v6.2d}, [%[outptr]], #16\n"
            "st1 {v1.2d}, [%[outptr]], #16\n"
            "st1 {v3.2d}, [%[outptr]], #16\n"
            "st1 {v5.2d}, [%[outptr]], #16\n"
            "st1 {v7.2d}, [%[outptr]], #16\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "cc", "memory");
}

static inline void interleave_12x1_4_s(
        const int32_t*& inptr0, const int32_t*& inptr1, const int32_t*& inptr2,
        const int32_t*& inptr3, const int32_t*& inptr4, const int32_t*& inptr5,
        const int32_t*& inptr6, const int32_t*& inptr7, const int32_t*& inptr8,
        const int32_t*& inptr9, const int32_t*& inptr10, const int32_t*& inptr11,
        int32_t*& outptr) {
    asm volatile(
            "ld1 {v0.4s}, [%[inptr0]], #16\n"  // d0 = A0A1A2A3
            "ld1 {v1.4s}, [%[inptr1]], #16\n"  // d1 = B0B1B2B3
            "ld1 {v2.4s}, [%[inptr2]], #16\n"  // d2 = C0C1C2C3
            "ld1 {v3.4s}, [%[inptr3]], #16\n"  // d3 = D0D1D2D3
            "zip1 v12.4s, v0.4s, v2.4s\n"      // d12 = A0C0A1C1
            "zip2 v13.4s, v0.4s, v2.4s\n"      // d13 = A2C2A3C3
            "zip1 v14.4s, v1.4s, v3.4s\n"      // d14 = B0D0B1D1
            "zip2 v15.4s, v1.4s, v3.4s\n"      // d15 = B2D2B3D3
            "zip1 v0.4s, v12.4s, v14.4s\n"     // d0 = A0B0C0D0
            "zip2 v1.4s, v12.4s, v14.4s\n"     // d1 = A1B1C1D1
            "zip1 v2.4s, v13.4s, v15.4s\n"     // d2 = A2B2C2D2
            "zip2 v3.4s, v13.4s, v15.4s\n"     // d3 = A3B3C3D3

            "ld1 {v4.4s}, [%[inptr4]], #16\n"  // d4 = E0E1E2E3
            "ld1 {v5.4s}, [%[inptr5]], #16\n"  // d5 = F0F1F2F3
            "ld1 {v6.4s}, [%[inptr6]], #16\n"  // d6 = G0G1G2G3
            "ld1 {v7.4s}, [%[inptr7]], #16\n"  // d7 = H0H1H2H3
            "zip1 v16.4s, v4.4s, v6.4s\n"      // d16 = E0G0E1G1
            "zip2 v17.4s, v4.4s, v6.4s\n"      // d17 = E2G2E3G3
            "zip1 v18.4s, v5.4s, v7.4s\n"      // d18 = F0H0F1H1
            "zip2 v19.4s, v5.4s, v7.4s\n"      // d19 = F2H2F3H3
            "zip1 v4.4s, v16.4s, v18.4s\n"     // d4 = E0F0G0H0
            "zip2 v5.4s, v16.4s, v18.4s\n"     // d5 = E1F1G1H1
            "zip1 v6.4s, v17.4s, v19.4s\n"     // d6 = E2F2G2H2
            "zip2 v7.4s, v17.4s, v19.4s\n"     // d7 = E3F3G3H3

            "ld1 {v8.4s}, [%[inptr8]], #16\n"    // d8 = I0I1I2I3
            "ld1 {v9.4s}, [%[inptr9]], #16\n"    // d9 = J0J1J2J3
            "ld1 {v10.4s}, [%[inptr10]], #16\n"  // d10 = K0K1K2K3
            "ld1 {v11.4s}, [%[inptr11]], #16\n"  // d11 = L0L1L2L3
            "zip1 v20.4s, v8.4s, v10.4s\n"       // d20 = I0K0I1K1
            "zip2 v21.4s, v8.4s, v10.4s\n"       // d21 = I2K2I3K3
            "zip1 v22.4s, v9.4s, v11.4s\n"       // d22 = J0L0J1L1
            "zip2 v23.4s, v9.4s, v11.4s\n"       // d23 = J2L2J3L3
            "zip1 v8.4s, v20.4s, v22.4s\n"       // d8 = I0J0K0L0
            "zip2 v9.4s, v20.4s, v22.4s\n"       // d9 = I1J1K1L1
            "zip1 v10.4s, v21.4s, v23.4s\n"      // d10 = I2J2K2L2
            "zip2 v11.4s, v21.4s, v23.4s\n"      // d11 = I3J3K3L3

            "st1 {v0.4s}, [%[outptr]], #16\n"
            "st1 {v4.4s}, [%[outptr]], #16\n"
            "st1 {v8.4s}, [%[outptr]], #16\n"
            "st1 {v1.4s}, [%[outptr]], #16\n"
            "st1 {v5.4s}, [%[outptr]], #16\n"
            "st1 {v9.4s}, [%[outptr]], #16\n"
            "st1 {v2.4s}, [%[outptr]], #16\n"
            "st1 {v6.4s}, [%[outptr]], #16\n"
            "st1 {v10.4s}, [%[outptr]], #16\n"
            "st1 {v3.4s}, [%[outptr]], #16\n"
            "st1 {v7.4s}, [%[outptr]], #16\n"
            "st1 {v11.4s}, [%[outptr]], #16\n"

            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5),
              [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [inptr8] "+r"(inptr8),
              [inptr9] "+r"(inptr9), [inptr10] "+r"(inptr10), [inptr11] "+r"(inptr11),
              [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
              "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21",
              "v22", "v23", "cc", "memory");
}

template <typename T>
static inline void interleave_12x1_4_h(
        const T*& in0, const T*& in1, const T*& in2, const T*& in3, const T*& in4,
        const T*& in5, const T*& in6, const T*& in7, const T*& in8, const T*& in9,
        const T*& in10, const T*& in11, T*& out) {
    static_assert(
            std::is_same<T, int16_t>::value || std::is_same<T, uint16_t>::value,
            "interleave_12x1_4_h only support uint16_t and int16_t");
    const int16_t*& inptr0 = reinterpret_cast<const int16_t*&>(in0);
    const int16_t*& inptr1 = reinterpret_cast<const int16_t*&>(in1);
    const int16_t*& inptr2 = reinterpret_cast<const int16_t*&>(in2);
    const int16_t*& inptr3 = reinterpret_cast<const int16_t*&>(in3);
    const int16_t*& inptr4 = reinterpret_cast<const int16_t*&>(in4);
    const int16_t*& inptr5 = reinterpret_cast<const int16_t*&>(in5);
    const int16_t*& inptr6 = reinterpret_cast<const int16_t*&>(in6);
    const int16_t*& inptr7 = reinterpret_cast<const int16_t*&>(in7);
    const int16_t*& inptr8 = reinterpret_cast<const int16_t*&>(in8);
    const int16_t*& inptr9 = reinterpret_cast<const int16_t*&>(in9);
    const int16_t*& inptr10 = reinterpret_cast<const int16_t*&>(in10);
    const int16_t*& inptr11 = reinterpret_cast<const int16_t*&>(in11);
    int16_t*& outptr = reinterpret_cast<int16_t*&>(out);
    asm volatile(
            "ld1 {v0.4h}, [%[inptr0]], #8\n"  // d0 = A0A1A2A3
            "ld1 {v1.4h}, [%[inptr1]], #8\n"  // d1 = B0B1B2B3
            "ld1 {v2.4h}, [%[inptr2]], #8\n"  // d2 = C0C1C2C3
            "ld1 {v3.4h}, [%[inptr3]], #8\n"  // d3 = D0D1D2D3
            "zip1 v12.4h, v0.4h, v2.4h\n"     // d12 = A0C0A1C1
            "zip2 v13.4h, v0.4h, v2.4h\n"     // d13 = A2C2A3C3
            "zip1 v14.4h, v1.4h, v3.4h\n"     // d14 = B0D0B1D1
            "zip2 v15.4h, v1.4h, v3.4h\n"     // d15 = B2D2B3D3
            "zip1 v0.4h, v12.4h, v14.4h\n"    // d0 = A0B0C0D0
            "zip2 v1.4h, v12.4h, v14.4h\n"    // d1 = A1B1C1D1
            "zip1 v2.4h, v13.4h, v15.4h\n"    // d2 = A2B2C2D2
            "zip2 v3.4h, v13.4h, v15.4h\n"    // d3 = A3B3C3D3

            "ld1 {v4.4h}, [%[inptr4]], #8\n"  // d4 = E0E1E2E3
            "ld1 {v5.4h}, [%[inptr5]], #8\n"  // d5 = F0F1F2F3
            "ld1 {v6.4h}, [%[inptr6]], #8\n"  // d6 = G0G1G2G3
            "ld1 {v7.4h}, [%[inptr7]], #8\n"  // d7 = H0H1H2H3
            "zip1 v16.4h, v4.4h, v6.4h\n"     // d16 = E0G0E1G1
            "zip2 v17.4h, v4.4h, v6.4h\n"     // d17 = E2G2E3G3
            "zip1 v18.4h, v5.4h, v7.4h\n"     // d18 = F0H0F1H1
            "zip2 v19.4h, v5.4h, v7.4h\n"     // d19 = F2H2F3H3
            "zip1 v4.4h, v16.4h, v18.4h\n"    // d4 = E0F0G0H0
            "zip2 v5.4h, v16.4h, v18.4h\n"    // d5 = E1F1G1H1
            "zip1 v6.4h, v17.4h, v19.4h\n"    // d6 = E2F2G2H2
            "zip2 v7.4h, v17.4h, v19.4h\n"    // d7 = E3F3G3H3

            "ld1 {v8.4h}, [%[inptr8]], #8\n"    // d8 = I0I1I2I3
            "ld1 {v9.4h}, [%[inptr9]], #8\n"    // d9 = J0J1J2J3
            "ld1 {v10.4h}, [%[inptr10]], #8\n"  // d10 = K0K1K2K3
            "ld1 {v11.4h}, [%[inptr11]], #8\n"  // d11 = L0L1L2L3
            "zip1 v20.4h, v8.4h, v10.4h\n"      // d20 = I0K0I1K1
            "zip2 v21.4h, v8.4h, v10.4h\n"      // d21 = I2K2I3K3
            "zip1 v22.4h, v9.4h, v11.4h\n"      // d22 = J0L0J1L1
            "zip2 v23.4h, v9.4h, v11.4h\n"      // d23 = J2L2J3L3
            "zip1 v8.4h, v20.4h, v22.4h\n"      // d8 = I0J0K0L0
            "zip2 v9.4h, v20.4h, v22.4h\n"      // d9 = I1J1K1L1
            "zip1 v10.4h, v21.4h, v23.4h\n"     // d10 = I2J2K2L2
            "zip2 v11.4h, v21.4h, v23.4h\n"     // d11 = I3J3K3L3

            "st1 {v0.4h}, [%[outptr]], #8\n"   // d0 = A0B0C0D0
            "st1 {v4.4h}, [%[outptr]], #8\n"   // d4 = E0F0G0H0
            "st1 {v8.4h}, [%[outptr]], #8\n"   // d8 = I0J0K0L0
            "st1 {v1.4h}, [%[outptr]], #8\n"   // d1 = A1B1C1D1
            "st1 {v5.4h}, [%[outptr]], #8\n"   // d5 = E1F1G1H1
            "st1 {v9.4h}, [%[outptr]], #8\n"   // d9 = I1J1K1L1
            "st1 {v2.4h}, [%[outptr]], #8\n"   // d2 = A2B2C2D2
            "st1 {v6.4h}, [%[outptr]], #8\n"   // d6 = E2F2G2H2
            "st1 {v10.4h}, [%[outptr]], #8\n"  // d10 = I2J2K2L2
            "st1 {v3.4h}, [%[outptr]], #8\n"   // d3 = A3B3C3D3
            "st1 {v7.4h}, [%[outptr]], #8\n"   // d7 = E3F3G3H3
            "st1 {v11.4h}, [%[outptr]], #8\n"  // d11 = I3J3K3L3

            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5),
              [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [inptr8] "+r"(inptr8),
              [inptr9] "+r"(inptr9), [inptr10] "+r"(inptr10), [inptr11] "+r"(inptr11),
              [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
              "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21",
              "v22", "v23", "cc", "memory");
}

template <typename T>
static inline void interleave_12x4_4_b(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
        const T*& inptr8, const T*& inptr9, const T*& inptr10, const T*& inptr11,
        T*& outptr) {
    static_assert(
            std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
            "interleave_12x4_4_b only support uint8_t and int8_t");
    interleave_12x1_4_s(
            reinterpret_cast<const int32_t*&>(inptr0),
            reinterpret_cast<const int32_t*&>(inptr1),
            reinterpret_cast<const int32_t*&>(inptr2),
            reinterpret_cast<const int32_t*&>(inptr3),
            reinterpret_cast<const int32_t*&>(inptr4),
            reinterpret_cast<const int32_t*&>(inptr5),
            reinterpret_cast<const int32_t*&>(inptr6),
            reinterpret_cast<const int32_t*&>(inptr7),
            reinterpret_cast<const int32_t*&>(inptr8),
            reinterpret_cast<const int32_t*&>(inptr9),
            reinterpret_cast<const int32_t*&>(inptr10),
            reinterpret_cast<const int32_t*&>(inptr11),
            reinterpret_cast<int32_t*&>(outptr));
}

static inline void interleave_2x1_4_s(
        const int32_t*& inptr0, const int32_t*& inptr1, int32_t*& outptr) {
    asm volatile(
            "ld1 {v0.4s}, [%[inptr0]], #16\n"  // d0 = A0A1A2A3
            "ld1 {v1.4s}, [%[inptr1]], #16\n"  // d1 = B0B1B2B3
            "st1 {v0.4s}, [%[outptr]], #16\n"
            "st1 {v1.4s}, [%[outptr]], #16\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "cc", "memory");
}

static inline void interleave_8x1_4_s(
        const int32_t*& inptr0, const int32_t*& inptr1, const int32_t*& inptr2,
        const int32_t*& inptr3, const int32_t*& inptr4, const int32_t*& inptr5,
        const int32_t*& inptr6, const int32_t*& inptr7, int32_t*& outptr) {
    asm volatile(
            "ld1 {v0.4s}, [%[inptr0]], #16\n"  // d0 = A0A1A2A3
            "ld1 {v1.4s}, [%[inptr1]], #16\n"  // d1 = B0B1B2B3
            "ld1 {v2.4s}, [%[inptr2]], #16\n"  // d2 = C0C1C2C3
            "ld1 {v3.4s}, [%[inptr3]], #16\n"  // d3 = D0D1D2D3
            "zip1 v8.4s, v0.4s, v2.4s\n"       // d8 = A0C0A1C1
            "zip2 v9.4s, v0.4s, v2.4s\n"       // d9 = A2C2A3C3
            "zip1 v10.4s, v1.4s, v3.4s\n"      // d10 = B0D0B1D1
            "zip2 v11.4s, v1.4s, v3.4s\n"      // d11 = B2D2B3D3
            "zip1 v12.4s, v8.4s, v10.4s\n"     // d12 = A0B0C0D0
            "zip2 v13.4s, v8.4s, v10.4s\n"     // d13 = A1B1C1D1
            "zip1 v14.4s, v9.4s, v11.4s\n"     // d14 = A2B2C2D2
            "zip2 v15.4s, v9.4s, v11.4s\n"     // d15 = A3B3C3D3

            "ld1 {v4.4s}, [%[inptr4]], #16\n"  // d4 = E0E1E2E3
            "ld1 {v5.4s}, [%[inptr5]], #16\n"  // d5 = F0F1F2F3
            "ld1 {v6.4s}, [%[inptr6]], #16\n"  // d6 = G0G1G2G3
            "ld1 {v7.4s}, [%[inptr7]], #16\n"  // d7 = H0H1H2H3
            "zip1 v16.4s, v4.4s, v6.4s\n"      // d16 = E0G0E1G1
            "zip2 v17.4s, v4.4s, v6.4s\n"      // d17 = E2G2E3G3
            "zip1 v18.4s, v5.4s, v7.4s\n"      // d18 = F0H0F1H1
            "zip2 v19.4s, v5.4s, v7.4s\n"      // d19 = F2H2F3H3
            "zip1 v20.4s, v16.4s, v18.4s\n"    // d20 = E0F0G0H0
            "zip2 v21.4s, v16.4s, v18.4s\n"    // d21 = E1F1G1H1
            "zip1 v22.4s, v17.4s, v19.4s\n"    // d22 = E2F2G2H2
            "zip2 v23.4s, v17.4s, v19.4s\n"    // d23 = E3F3G3H3

            "st1 {v12.4s}, [%[outptr]], #16\n"
            "st1 {v20.4s}, [%[outptr]], #16\n"
            "st1 {v13.4s}, [%[outptr]], #16\n"
            "st1 {v21.4s}, [%[outptr]], #16\n"
            "st1 {v14.4s}, [%[outptr]], #16\n"
            "st1 {v22.4s}, [%[outptr]], #16\n"
            "st1 {v15.4s}, [%[outptr]], #16\n"
            "st1 {v23.4s}, [%[outptr]], #16\n"

            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5),
              [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
              "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21",
              "v22", "v23", "cc", "memory");
}

static inline void interleave_8x1_2_d(
        const int64_t*& inptr0, const int64_t*& inptr1, const int64_t*& inptr2,
        const int64_t*& inptr3, const int64_t*& inptr4, const int64_t*& inptr5,
        const int64_t*& inptr6, const int64_t*& inptr7, int64_t*& outptr) {
    asm volatile(
            "ld1 {v0.2d}, [%[inptr0]], #16\n"  // d0 = A0A1
            "ld1 {v1.2d}, [%[inptr1]], #16\n"  // d1 = B0B1
            "ld1 {v2.2d}, [%[inptr2]], #16\n"  // d2 = C0C1
            "ld1 {v3.2d}, [%[inptr3]], #16\n"  // d3 = D0D1
            "ld1 {v4.2d}, [%[inptr4]], #16\n"  // d4 = E0E1
            "ld1 {v5.2d}, [%[inptr5]], #16\n"  // d5 = F0F1
            "ld1 {v6.2d}, [%[inptr6]], #16\n"  // d6 = G0G1
            "ld1 {v7.2d}, [%[inptr7]], #16\n"  // d7 = H0H1

            "zip1 v8.2d, v0.2d, v1.2d\n"   // d8 = A0B0
            "zip2 v9.2d, v0.2d, v1.2d\n"   // d9 = A1B1
            "zip1 v10.2d, v2.2d, v3.2d\n"  // d10 = C0D0
            "zip2 v11.2d, v2.2d, v3.2d\n"  // d11 = C1D1
            "zip1 v12.2d, v4.2d, v5.2d\n"  // d12 = E0F0
            "zip2 v13.2d, v4.2d, v5.2d\n"  // d13 = E1F1
            "zip1 v14.2d, v6.2d, v7.2d\n"  // d14 = G0H0
            "zip2 v15.2d, v6.2d, v7.2d\n"  // d15 = G1H1

            "st1 {v8.2d}, [%[outptr]], #16\n"
            "st1 {v10.2d}, [%[outptr]], #16\n"
            "st1 {v12.2d}, [%[outptr]], #16\n"
            "st1 {v14.2d}, [%[outptr]], #16\n"
            "st1 {v9.2d}, [%[outptr]], #16\n"
            "st1 {v11.2d}, [%[outptr]], #16\n"
            "st1 {v13.2d}, [%[outptr]], #16\n"
            "st1 {v15.2d}, [%[outptr]], #16\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5),
              [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
              "v12", "v13", "v14", "v15", "cc", "memory");
}

static inline void interleave_8x2_2_d(
        const int64_t*& inptr0, const int64_t*& inptr1, const int64_t*& inptr2,
        const int64_t*& inptr3, const int64_t*& inptr4, const int64_t*& inptr5,
        const int64_t*& inptr6, const int64_t*& inptr7, int64_t*& outptr) {
    asm volatile(
            "ld1 {v0.2d}, [%[inptr0]], #16\n"   // d0 = A0
            "ld1 {v1.2d}, [%[inptr0]], #16\n"   // d1 = A1
            "ld1 {v2.2d}, [%[inptr1]], #16\n"   // d2 = B0
            "ld1 {v3.2d}, [%[inptr1]], #16\n"   // d3 = B1
            "ld1 {v4.2d}, [%[inptr2]], #16\n"   // d4 = C0
            "ld1 {v5.2d}, [%[inptr2]], #16\n"   // d5 = C1
            "ld1 {v6.2d}, [%[inptr3]], #16\n"   // d6 = D0
            "ld1 {v7.2d}, [%[inptr3]], #16\n"   // d7 = D1
            "ld1 {v8.2d}, [%[inptr4]], #16\n"   // d8 = E0
            "ld1 {v9.2d}, [%[inptr4]], #16\n"   // d9 = E1
            "ld1 {v10.2d}, [%[inptr5]], #16\n"  // d10 = F0
            "ld1 {v11.2d}, [%[inptr5]], #16\n"  // d11 = F1
            "ld1 {v12.2d}, [%[inptr6]], #16\n"  // d12 = G0
            "ld1 {v13.2d}, [%[inptr6]], #16\n"  // d13 = G1
            "ld1 {v14.2d}, [%[inptr7]], #16\n"  // d14 = H0
            "ld1 {v15.2d}, [%[inptr7]], #16\n"  // d15 = H1

            "st1 {v0.2d}, [%[outptr]], #16\n"
            "st1 {v2.2d}, [%[outptr]], #16\n"
            "st1 {v4.2d}, [%[outptr]], #16\n"
            "st1 {v6.2d}, [%[outptr]], #16\n"
            "st1 {v8.2d}, [%[outptr]], #16\n"
            "st1 {v10.2d}, [%[outptr]], #16\n"
            "st1 {v12.2d}, [%[outptr]], #16\n"
            "st1 {v14.2d}, [%[outptr]], #16\n"
            "st1 {v1.2d}, [%[outptr]], #16\n"
            "st1 {v3.2d}, [%[outptr]], #16\n"
            "st1 {v5.2d}, [%[outptr]], #16\n"
            "st1 {v7.2d}, [%[outptr]], #16\n"
            "st1 {v9.2d}, [%[outptr]], #16\n"
            "st1 {v11.2d}, [%[outptr]], #16\n"
            "st1 {v13.2d}, [%[outptr]], #16\n"
            "st1 {v15.2d}, [%[outptr]], #16\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5),
              [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
              "v12", "v13", "v14", "v15", "cc", "memory");
}

template <typename T>
static inline void interleave_2x4_4_b(const T*& inptr0, const T*& inptr1, T*& outptr) {
    static_assert(
            std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
            "interleave_2x4_4_b only support uint8_t and int8_t");
    interleave_2x1_4_s(
            reinterpret_cast<const int32_t*&>(inptr0),
            reinterpret_cast<const int32_t*&>(inptr1),
            reinterpret_cast<int32_t*&>(outptr));
}

template <typename T>
static inline void interleave_8x4_4_b(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
        T*& outptr) {
    static_assert(
            std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
            "interleave_8x4_4_b only support uint8_t and int8_t");
    interleave_8x1_4_s(
            reinterpret_cast<const int32_t*&>(inptr0),
            reinterpret_cast<const int32_t*&>(inptr1),
            reinterpret_cast<const int32_t*&>(inptr2),
            reinterpret_cast<const int32_t*&>(inptr3),
            reinterpret_cast<const int32_t*&>(inptr4),
            reinterpret_cast<const int32_t*&>(inptr5),
            reinterpret_cast<const int32_t*&>(inptr6),
            reinterpret_cast<const int32_t*&>(inptr7),
            reinterpret_cast<int32_t*&>(outptr));
}

template <typename T>
static inline void interleave_8x4_1_h(
        const T*& in0, const T*& in1, const T*& in2, const T*& in3, T* out) {
    static_assert(sizeof(T) == 2, "only support size == 2");
    asm volatile(
            "ldr q0, [%[in0]], #16\n"  // A1A2A3A4A5A6A7A8
            "ldr q1, [%[in1]], #16\n"  // B1B2B3B4B5B6B7B8
            "ldr q2, [%[in2]], #16\n"  // C1C2C3C4C5C6C7C8
            "ldr q3, [%[in3]], #16\n"  // D1D2D3D4D5D6D7D8

            "trn1 v4.8h, v0.8h, v1.8h\n"  // A1B1A3B3A5B5A7B7
            "trn2 v5.8h, v0.8h, v1.8h\n"  // A2B2A4B4A6B6A8B8
            "trn1 v6.8h, v2.8h, v3.8h\n"  // C1D1C3D3C5D5C7D7
            "trn2 v7.8h, v2.8h, v3.8h\n"  // C2D2C4D4C6D6C8D8

            "zip1 v8.4s, v4.4s, v6.4s\n"   // A1B1C1D1A3B3C3D3
            "zip2 v9.4s, v4.4s, v6.4s\n"   // A5B5C5D5A7B7C7D7
            "zip1 v10.4s, v5.4s, v7.4s\n"  // A2B2C2D2A4B4C4D4
            "zip2 v11.4s, v5.4s, v7.4s\n"  // A6B6C6D6A8B8C8D8

            "zip1 v12.2d, v8.2d, v10.2d\n"  // A1B1C1D1A2B2C2D2
            "zip2 v13.2d, v8.2d, v10.2d\n"  // A3B3C3D3A4B4C4D4
            "zip1 v14.2d, v9.2d, v11.2d\n"  // A5B5C5D5A6B6C6D6
            "zip2 v15.2d, v9.2d, v11.2d\n"  // A7B7C7D7A8B8C8D8

            "st1 {v12.2d}, [%[out]], #16\n"
            "st1 {v13.2d}, [%[out]], #16\n"
            "st1 {v14.2d}, [%[out]], #16\n"
            "st1 {v15.2d}, [%[out]], #16\n"
            : [in0] "+r"(in0), [in1] "+r"(in1), [in2] "+r"(in2), [in3] "+r"(in3),
              [out] "+r"(out)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
              "v12", "v13", "v14", "v15", "memory");
}

template <typename T>
static inline void interleave_8x8_2_b(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
        T*& outptr) {
    static_assert(
            std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
            "interleave_8x8_2_b only support uint8_t and int8_t");
    interleave_8x1_2_d(
            reinterpret_cast<const int64_t*&>(inptr0),
            reinterpret_cast<const int64_t*&>(inptr1),
            reinterpret_cast<const int64_t*&>(inptr2),
            reinterpret_cast<const int64_t*&>(inptr3),
            reinterpret_cast<const int64_t*&>(inptr4),
            reinterpret_cast<const int64_t*&>(inptr5),
            reinterpret_cast<const int64_t*&>(inptr6),
            reinterpret_cast<const int64_t*&>(inptr7),
            reinterpret_cast<int64_t*&>(outptr));
}

template <typename T>
static inline void interleave_8x8_2_h(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
        T*& outptr) {
    static_assert(
            std::is_same<T, int16_t>::value || std::is_same<T, uint16_t>::value,
            "interleave_8x8_2_h only support uint16_t and int16_t");
    interleave_8x2_2_d(
            reinterpret_cast<const int64_t*&>(inptr0),
            reinterpret_cast<const int64_t*&>(inptr1),
            reinterpret_cast<const int64_t*&>(inptr2),
            reinterpret_cast<const int64_t*&>(inptr3),
            reinterpret_cast<const int64_t*&>(inptr4),
            reinterpret_cast<const int64_t*&>(inptr5),
            reinterpret_cast<const int64_t*&>(inptr6),
            reinterpret_cast<const int64_t*&>(inptr7),
            reinterpret_cast<int64_t*&>(outptr));
}

template <typename T>
static inline void interleave_8x2_8_b(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
        T*& outptr) {
    static_assert(
            std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
            "interleave_8x2_8_b only support uint8_t and int8_t");
    interleave_8x1_8_h(
            reinterpret_cast<const int16_t*&>(inptr0),
            reinterpret_cast<const int16_t*&>(inptr1),
            reinterpret_cast<const int16_t*&>(inptr2),
            reinterpret_cast<const int16_t*&>(inptr3),
            reinterpret_cast<const int16_t*&>(inptr4),
            reinterpret_cast<const int16_t*&>(inptr5),
            reinterpret_cast<const int16_t*&>(inptr6),
            reinterpret_cast<const int16_t*&>(inptr7),
            reinterpret_cast<int16_t*&>(outptr));
}

template <typename T>
static inline void interleave_8x8_1_b(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
        T*& outptr) {
    static_assert(
            std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
            "interleave_8x8_1_b only support uint8_t and int8_t");
    asm volatile(
            "ld1 {v0.d}[0],  [%[inptr0]], 8\n"  // A1A2A3A4A5A6A7A8
            "ld1 {v0.d}[1],  [%[inptr1]], 8\n"  // B1B2B3B4B5B6B7B8
            "ld1 {v1.d}[0],  [%[inptr2]], 8\n"  // C1C2C3C4C5C6C7C8
            "ld1 {v1.d}[1],  [%[inptr3]], 8\n"  // D1D2D3D4D5D6D7D8
            "ld1 {v2.d}[0],  [%[inptr4]], 8\n"  // E1E2E3E4E5E6E7E8
            "ld1 {v2.d}[1],  [%[inptr5]], 8\n"  // F1F2F3F4F5F6F7F8
            "ld1 {v3.d}[0],  [%[inptr6]], 8\n"  // G1G2G3G4G5G6G7G8
            "ld1 {v3.d}[1],  [%[inptr7]], 8\n"  // H1H2H3H4H5H6H7H8

            "st1 {v0.2d},  [%[outptr]], 16\n"  // A1A2A3A4A5A6A7A8B1B2B3B4B5B6B7B8
            "st1 {v1.2d},  [%[outptr]], 16\n"  // C1C2C3C4C5C6C7C8D1D2D3D4D5D6D7D8
            "st1 {v2.2d},  [%[outptr]], 16\n"  // E1E2E3E4E5E6E7E8F1F2F3F4F5F6F7F8
            "st1 {v3.2d},  [%[outptr]], 16\n"  // G1G2G3G4G5G6G7G8H1H2H3H4H5H6H7H8
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5),
              [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "memory");
}

template <typename T>
static inline void interleave_8x4_1_b_with_shift(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
        T* outptr) {
    static_assert(sizeof(T) == 1, "only support size == 1");
    asm volatile(
            "ld1 {v0.s}[0], [%[inptr0]], #4\n"
            "ld1 {v0.s}[1], [%[inptr1]], #4\n"
            "ld1 {v0.s}[2], [%[inptr2]], #4\n"
            "ld1 {v0.s}[3], [%[inptr3]], #4\n"
            "ld1 {v1.s}[0], [%[inptr4]], #4\n"
            "ld1 {v1.s}[1], [%[inptr5]], #4\n"
            "ld1 {v1.s}[2], [%[inptr6]], #4\n"
            "ld1 {v1.s}[3], [%[inptr7]], #4\n"
            "shl  v2.16b, v0.16b,        #4\n"
            "shl  v5.16b, v1.16b,        #4\n"
            "sshr v3.16b, v0.16b,        #4\n"  // hig
            "sshr v4.16b, v2.16b,        #4\n"  // low
            "sshr v6.16b, v1.16b,        #4\n"  // hig
            "sshr v7.16b, v5.16b,        #4\n"  // low
            "zip1 v8.16b, v4.16b,    v3.16b\n"
            "zip2 v9.16b, v4.16b,    v3.16b\n"
            "zip1 v10.16b, v7.16b,   v6.16b\n"
            "zip2 v11.16b, v7.16b,   v6.16b\n"
            "st1 {v8.16b-v11.16b},[%[outptr]],#64"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5),
              [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
              "memory");
}

template <typename T>
static inline void interleave_8x8_1_h(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
        T*& outptr) {
    static_assert(
            std::is_same<T, int16_t>::value || std::is_same<T, uint16_t>::value,
            "interleave_8x8_1_h only support uint16_t and int16_t");
    asm volatile(
            "ld1 {v0.8h},  [%[inptr0]], #16\n"  // A1A2A3A4A5A6A7A8
            "ld1 {v1.8h},  [%[inptr1]], #16\n"  // B1B2B3B4B5B6B7B8
            "ld1 {v2.8h},  [%[inptr2]], #16\n"  // C1C2C3C4C5C6C7C8
            "ld1 {v3.8h},  [%[inptr3]], #16\n"  // D1D2D3D4D5D6D7D8
            "ld1 {v4.8h},  [%[inptr4]], #16\n"  // E1E2E3E4E5E6E7E8
            "ld1 {v5.8h},  [%[inptr5]], #16\n"  // F1F2F3F4F5F6F7F8
            "ld1 {v6.8h},  [%[inptr6]], #16\n"  // G1G2G3G4G5G6G7G8
            "ld1 {v7.8h},  [%[inptr7]], #16\n"  // H1H2H3H4H5H6H7H8

            "st1 {v0.8h},  [%[outptr]], #16\n"  // A1A2A3A4A5A6A7A8
            "st1 {v1.8h},  [%[outptr]], #16\n"  // B1B2B3B4B5B6B7B8
            "st1 {v2.8h},  [%[outptr]], #16\n"  // C1C2C3C4C5C6C7C8
            "st1 {v3.8h},  [%[outptr]], #16\n"  // D1D2D3D4D5D6D7D8
            "st1 {v4.8h},  [%[outptr]], #16\n"  // E1E2E3E4E5E6E7E8
            "st1 {v5.8h},  [%[outptr]], #16\n"  // F1F2F3F4F5F6F7F8
            "st1 {v6.8h},  [%[outptr]], #16\n"  // G1G2G3G4G5G6G7G8
            "st1 {v7.8h},  [%[outptr]], #16\n"  // H1H2H3H4H5H6H7H8
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5),
              [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory");
}

static inline void interleave_4x1_4_s(
        const int32_t*& inptr0, const int32_t*& inptr1, const int32_t*& inptr2,
        const int32_t*& inptr3, int32_t*& outptr) {
    asm volatile(
            "ld1 {v0.4s}, [%[inptr0]], #16\n"  // d0 = A0A1A2A3
            "ld1 {v1.4s}, [%[inptr1]], #16\n"  // d1 = B0B1B2B3
            "ld1 {v2.4s}, [%[inptr2]], #16\n"  // d2 = C0C1C2C3
            "ld1 {v3.4s}, [%[inptr3]], #16\n"  // d3 = D0D1D2D3
            "zip1 v8.4s, v0.4s, v2.4s\n"       // d8 = A0C0A1C1
            "zip2 v9.4s, v0.4s, v2.4s\n"       // d9 = A2C2A3C3
            "zip1 v10.4s, v1.4s, v3.4s\n"      // d10 = B0D0B1D1
            "zip2 v11.4s, v1.4s, v3.4s\n"      // d11 = B2D2B3D3
            "zip1 v12.4s, v8.4s, v10.4s\n"     // d12 = A0B0C0D0
            "zip2 v13.4s, v8.4s, v10.4s\n"     // d13 = A1B1C1D1
            "zip1 v14.4s, v9.4s, v11.4s\n"     // d14 = A2B2C2D2
            "zip2 v15.4s, v9.4s, v11.4s\n"     // d15 = A3B3C3D3

            "st1 {v12.4s}, [%[outptr]], #16\n"
            "st1 {v13.4s}, [%[outptr]], #16\n"
            "st1 {v14.4s}, [%[outptr]], #16\n"
            "st1 {v15.4s}, [%[outptr]], #16\n"

            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
              "v12", "v13", "v14", "v15", "cc", "memory");
}

template <typename T>
static inline void interleave_4x8_1_s(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        T*& outptr) {
    static_assert(sizeof(T) == 4, "only support size == 4");
    asm volatile(
            "ld1 {v0.4s, v1.4s}, [%[inptr0]], #32\n"
            "ld1 {v2.4s, v3.4s}, [%[inptr1]], #32\n"
            "ld1 {v4.4s, v5.4s}, [%[inptr2]], #32\n"
            "ld1 {v6.4s, v7.4s}, [%[inptr3]], #32\n"
            "st1 {v0.4s, v1.4s}, [%[outptr]], #32\n"
            "st1 {v2.4s, v3.4s}, [%[outptr]], #32\n"
            "st1 {v4.4s, v5.4s}, [%[outptr]], #32\n"
            "st1 {v6.4s, v7.4s}, [%[outptr]], #32\n"

            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "cc", "memory");
}

template <typename T>
static inline void interleave_4x12_1_s(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        T*& outptr) {
    static_assert(sizeof(T) == 4, "only support size == 4");
    asm volatile(
            "ld1 {v0.4s, v1.4s, v2.4s}, [%[inptr0]], #48\n"
            "ld1 {v4.4s, v5.4s, v6.4s}, [%[inptr1]], #48\n"
            "ld1 {v8.4s, v9.4s, v10.4s}, [%[inptr2]], #48\n"
            "ld1 {v12.4s, v13.4s, v14.4s}, [%[inptr3]], #48\n"
            "st1 {v0.4s, v1.4s, v2.4s}, [%[outptr]], #48\n"
            "st1 {v4.4s, v5.4s, v6.4s}, [%[outptr]], #48\n"
            "st1 {v8.4s, v9.4s, v10.4s}, [%[outptr]], #48\n"
            "st1 {v12.4s, v13.4s, v14.4s}, [%[outptr]], #48\n"

            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v4", "v5", "v6", "v8", "v9", "v10", "v12", "v13",
              "v14", "cc", "memory");
}

template <typename T>
static inline void interleave_4x16_1_b(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        T*& outptr) {
    static_assert(sizeof(T) == 1, "only support size == 1");
    asm volatile(
            "ld1 {v0.4s}, [%[inptr0]], #16\n"  // d0 = A0A1A2A3
            "ld1 {v1.4s}, [%[inptr1]], #16\n"  // d1 = B0B1B2B3
            "ld1 {v2.4s}, [%[inptr2]], #16\n"  // d2 = C0C1C2C3
            "ld1 {v3.4s}, [%[inptr3]], #16\n"  // d3 = D0D1D2D3
            "st1 {v0.4s}, [%[outptr]], #16\n"
            "st1 {v1.4s}, [%[outptr]], #16\n"
            "st1 {v2.4s}, [%[outptr]], #16\n"
            "st1 {v3.4s}, [%[outptr]], #16\n"

            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "cc", "memory");
}

template <typename T>
static inline void interleave_4x16_1_s(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        T*& outptr) {
    static_assert(sizeof(T) == 4, "only support size == 4");
    asm volatile(
            "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[inptr0]], #64\n"
            "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%[inptr1]], #64\n"
            "ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%[inptr2]], #64\n"
            "ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%[inptr3]], #64\n"
            "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[outptr]], #64\n"
            "st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%[outptr]], #64\n"
            "st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%[outptr]], #64\n"
            "st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%[outptr]], #64\n"

            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
              "v12", "v13", "v14", "v15", "cc", "memory");
}

template <typename T>
static inline void interleave_4x2_4_b(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        T*& outptr) {
    static_assert(
            std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
            "interleave_4x2_4_b only support uint8_t and int8_t");
    interleave_4x1_4_h(
            reinterpret_cast<const int16_t*&>(inptr0),
            reinterpret_cast<const int16_t*&>(inptr1),
            reinterpret_cast<const int16_t*&>(inptr2),
            reinterpret_cast<const int16_t*&>(inptr3),
            reinterpret_cast<int16_t*&>(outptr));
}

template <typename T>
static inline void interleave_4x4_4_b(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        T*& outptr) {
    static_assert(
            std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
            "interleave_4x4_4_b only support uint8_t and int8_t");
    interleave_4x1_4_s(
            reinterpret_cast<const int32_t*&>(inptr0),
            reinterpret_cast<const int32_t*&>(inptr1),
            reinterpret_cast<const int32_t*&>(inptr2),
            reinterpret_cast<const int32_t*&>(inptr3),
            reinterpret_cast<int32_t*&>(outptr));
}

template <typename T>
static inline void interleave_4x4_1_s(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        T*& outptr) {
    static_assert(sizeof(T) == 4, "interleave_4x4_1_s only support size == 4");
    asm volatile(
            "ld1 {v0.4s}, [%[inptr0]], #16\n"
            "ld1 {v1.4s}, [%[inptr1]], #16\n"
            "ld1 {v2.4s}, [%[inptr2]], #16\n"
            "ld1 {v3.4s}, [%[inptr3]], #16\n"
            "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[outptr]], #64\n"

            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "cc", "memory");
}

template <typename T>
static inline void interleave_2x4_4_s(const T*& inptr0, const T*& inptr1, T* outptr) {
    static_assert(sizeof(T) == 4, "interleave_2x4_4_s only support size == 4");
    asm volatile(
            "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[inptr0]], #64\n"
            "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%[inptr1]], #64\n"
            "stp q0, q4, [%[outptr]]\n"
            "stp q1, q5, [%[outptr], #32]\n"
            "stp q2, q6, [%[outptr], #64]\n"
            "stp q3, q7, [%[outptr], #96]\n"

            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory");
}

template <typename T>
static inline void interleave_1x4_4_s(const T*& inptr0, T* outptr) {
    static_assert(sizeof(T) == 4, "interleave_1x4_4_s only support size == 4");
    asm volatile(
            "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[inptr0]], #64\n"
            "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[outptr]]\n"

            : [inptr0] "+r"(inptr0), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "memory");
}

template <typename T>
static inline void interleave_4x8_2_b(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        T*& outptr) {
    static_assert(
            std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
            "interleave_4x8_2_b only support uint8_t and int8_t");
    interleave_4x1_2_d(
            reinterpret_cast<const int64_t*&>(inptr0),
            reinterpret_cast<const int64_t*&>(inptr1),
            reinterpret_cast<const int64_t*&>(inptr2),
            reinterpret_cast<const int64_t*&>(inptr3),
            reinterpret_cast<int64_t*&>(outptr));
}

template <typename T>
static inline void interleave_4x8_2_h(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        T*& outptr) {
    static_assert(
            std::is_same<T, int16_t>::value || std::is_same<T, uint16_t>::value,
            "interleave_4x8_2_h only support uint16_t and int16_t");
    interleave_4x2_2_d(
            reinterpret_cast<const int64_t*&>(inptr0),
            reinterpret_cast<const int64_t*&>(inptr1),
            reinterpret_cast<const int64_t*&>(inptr2),
            reinterpret_cast<const int64_t*&>(inptr3),
            reinterpret_cast<int64_t*&>(outptr));
}

template <typename T>
static inline void interleave_1x16_1_s(const T*& inptr0, T*& outptr) {
    static_assert(sizeof(T) == 4, "interleave_1x16_1_s only support size == 4");
    asm volatile(
            "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[inptr0]], #64\n"
            "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[outptr]], #64\n"

            : [inptr0] "+r"(inptr0), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "cc", "memory");
}
template <typename T>
static inline void interleave_1x12_1_s(const T*& inptr0, T*& outptr) {
    static_assert(sizeof(T) == 4, "interleave_1x12_1_s only support size == 4");
    asm volatile(
            "ld1 {v0.4s, v1.4s, v2.4s}, [%[inptr0]], #48\n"
            "st1 {v0.4s, v1.4s, v2.4s}, [%[outptr]], #48\n"

            : [inptr0] "+r"(inptr0), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "cc", "memory");
}

template <typename T>
static inline void interleave_1x8_1_s(const T*& inptr0, T*& outptr) {
    static_assert(sizeof(T) == 4, "interleave_1x8_1_s only support size == 4");
    asm volatile(
            "ld1 {v0.4s, v1.4s}, [%[inptr0]], #32\n"
            "st1 {v0.4s, v1.4s}, [%[outptr]], #32\n"

            : [inptr0] "+r"(inptr0), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "cc", "memory");
}

template <typename T>
static inline void interleave_1x4_1_s(const T*& inptr0, T*& outptr) {
    static_assert(sizeof(T) == 4, "interleave_1x4_1_s only support size == 4");
    asm volatile(
            "ld1 {v0.4s}, [%[inptr0]], #16\n"
            "st1 {v0.4s}, [%[outptr]], #16\n"

            : [inptr0] "+r"(inptr0), [outptr] "+r"(outptr)
            :
            : "v0", "cc", "memory");
}

template <typename T>
static inline void interleave_helper(
        const T*& inptr, T*& outptr, int unroll_k, int ksize, T val = 0) {
    int k = 0;
    for (; k < ksize; k++) {
        *outptr++ = *inptr++;
    }
    for (; k < unroll_k; k++) {
        *outptr++ = val;
    }
}

template <typename T>
static inline void interleave_1(
        const T*& inptr0, T*& outptr, int unroll_k, int ksize, T val = 0) {
    for (int k = 0; k < ksize; k += unroll_k) {
        int size = std::min(unroll_k, ksize - k);
        interleave_helper(inptr0, outptr, unroll_k, size, val);
    }
}

template <typename T>
static inline void interleave_4(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        T*& outptr, int unroll_k, int ksize, T val = 0) {
    for (int k = 0; k < ksize; k += unroll_k) {
        int size = std::min(unroll_k, ksize - k);
        interleave_helper(inptr0, outptr, unroll_k, size, val);
        interleave_helper(inptr1, outptr, unroll_k, size, val);
        interleave_helper(inptr2, outptr, unroll_k, size, val);
        interleave_helper(inptr3, outptr, unroll_k, size, val);
    }
}

template <typename T>
static inline void interleave_8(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
        T*& outptr, int unroll_k, int ksize, T val = 0) {
    for (int k = 0; k < ksize; k += unroll_k) {
        int size = std::min(unroll_k, ksize - k);
        interleave_helper(inptr0, outptr, unroll_k, size, val);
        interleave_helper(inptr1, outptr, unroll_k, size, val);
        interleave_helper(inptr2, outptr, unroll_k, size, val);
        interleave_helper(inptr3, outptr, unroll_k, size, val);
        interleave_helper(inptr4, outptr, unroll_k, size, val);
        interleave_helper(inptr5, outptr, unroll_k, size, val);
        interleave_helper(inptr6, outptr, unroll_k, size, val);
        interleave_helper(inptr7, outptr, unroll_k, size, val);
    }
}

template <typename T>
static inline void interleave_12(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
        const T*& inptr8, const T*& inptr9, const T*& inptr10, const T*& inptr11,
        T*& outptr, int unroll_k, int ksize) {
    for (int k = 0; k < ksize; k += unroll_k) {
        int size = std::min(unroll_k, ksize - k);
        interleave_helper(inptr0, outptr, unroll_k, size);
        interleave_helper(inptr1, outptr, unroll_k, size);
        interleave_helper(inptr2, outptr, unroll_k, size);
        interleave_helper(inptr3, outptr, unroll_k, size);
        interleave_helper(inptr4, outptr, unroll_k, size);
        interleave_helper(inptr5, outptr, unroll_k, size);
        interleave_helper(inptr6, outptr, unroll_k, size);
        interleave_helper(inptr7, outptr, unroll_k, size);
        interleave_helper(inptr8, outptr, unroll_k, size);
        interleave_helper(inptr9, outptr, unroll_k, size);
        interleave_helper(inptr10, outptr, unroll_k, size);
        interleave_helper(inptr11, outptr, unroll_k, size);
    }
}
/* ======================== transpose pack B ======================== */
/**
 * transpose_INTERLEAVE_UNROLLK_BATCH_type
 *
 * BATCH means process BATCH * INTERLEAVE cols once, BATCH * sizeof(TYPE) *
 * INTERLEAVE = 16bytes(128bits, a vector size).
 *
 * the elements traverse order:
 * rep(j, 0, INTERLEAVE) rep(i, 0, UNROLL_K) *ouptr++ = inptr[i, j]
 */
template <typename T>
static inline void transpose_24x4_1_h(
        const T*& in0, const T*& in1, const T*& in2, const T*& in3, T* out) {
    static_assert(sizeof(T) == 2, "only support size == 2");
    asm volatile(
        "ldp q0, q1, [%[in0]], #32\n"
        "stp q0, q1, [%[out]]\n"
        "ldr q2, [%[in0]], #16\n"
        ASM_PREFETCH("[%[in0], #192]")
        "ldp q3, q4, [%[in1]], #32\n"
        "stp q2, q3, [%[out], #32]\n"
        "ldr q5, [%[in1]], #16\n"
        ASM_PREFETCH("[%[in1], #192]")
        "stp q4, q5, [%[out], #64]\n"
        "ldp q6, q7, [%[in2]], #32\n"
        "stp q6, q7, [%[out], #96]\n"
        "ldr q8, [%[in2]], #16\n"
        ASM_PREFETCH("[%[in2], #192]")
        "ldp q9, q10, [%[in3]], #32\n"
        "stp q8, q9, [%[out], #128]\n"
        "ldr q11, [%[in3]], #16\n"
        "stp q10, q11, [%[out], #160]\n"
        ASM_PREFETCH("[%[in3], #192]")

        : [in0] "+r"(in0), [in1] "+r"(in1), [in2] "+r"(in2),
          [in3] "+r"(in3), [out] "+r"(out)
        :
        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
        "v11", "memory");
}

template <typename T>
static inline void transpose_16x4_1_h(
        const T*& in0, const T*& in1, const T*& in2, const T*& in3, T* out) {
    static_assert(sizeof(T) == 2, "only support size == 2");
    asm volatile(
            "ldp q0, q1, [%[in0]], #32\n"
            "stp q0, q1, [%[out]]\n"
            "ldp q2, q3, [%[in1]], #32\n"
            "stp q2, q3, [%[out], #32]\n"
            "ldp q4, q5, [%[in2]], #32\n"
            "stp q4, q5, [%[out], #64]\n"
            "ldp q6, q7, [%[in3]], #32\n"
            "stp q6, q7, [%[out], #96]\n"
            : [in0] "+r"(in0), [in1] "+r"(in1), [in2] "+r"(in2), [in3] "+r"(in3),
              [out] "+r"(out)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory");
}

template <typename T>
static inline void transpose_8x4_1_h(
        const T*& in0, const T*& in1, const T*& in2, const T*& in3, T* out) {
    static_assert(sizeof(T) == 2, "only support size == 2");
    asm volatile(
            "ldr q0, [%[in0]], #16\n"
            "str q0, [%[out]]\n"
            "ldr q1, [%[in1]], #16\n"
            "str q1, [%[out], #16]\n"
            "ldr q2, [%[in2]], #16\n"
            "str q2, [%[out], #32]\n"
            "ldr q3, [%[in3]], #16\n"
            "str q3, [%[out], #48]\n"
            : [in0] "+r"(in0), [in1] "+r"(in1), [in2] "+r"(in2), [in3] "+r"(in3),
              [out] "+r"(out)
            :
            : "v0", "v1", "v2", "v3", "memory");
}

template <typename T>
static inline void transpose_24x2_1_h(const T*& in0, const T*& in1, T* out) {
    static_assert(sizeof(T) == 2, "only support size == 2");
    asm volatile(
        "ldp q0, q1, [%[in0]], #32\n"
        "stp q0, q1, [%[out]]\n"
        "ldr q2, [%[in0]], #16\n"
        ASM_PREFETCH("[%[in0], #192]")
        "ldp q3, q4, [%[in1]], #32\n"
        "stp q2, q3, [%[out], #32]\n"
        "ldr q5, [%[in1]], #16\n"
        ASM_PREFETCH("[%[in1], #192]")
        "stp q4, q5, [%[out], #64]\n"
        : [in0] "+r"(in0), [in1] "+r"(in1), [out] "+r"(out)
        :
        : "v0", "v1", "v2", "v3", "v4", "v5", "memory");
}

template <typename T>
static inline void transpose_16x2_1_h(const T*& in0, const T*& in1, T* out) {
    static_assert(sizeof(T) == 2, "only support size == 2");
    asm volatile(
            "ldp q0, q1, [%[in0]], #32\n"
            "stp q0, q1, [%[out]]\n"
            "ldp q2, q3, [%[in1]], #32\n"
            "stp q2, q3, [%[out], #32]\n"
            : [in0] "+r"(in0), [in1] "+r"(in1), [out] "+r"(out)
            :
            : "v0", "v1", "v2", "v3", "memory");
}

template <typename T>
static inline void transpose_8x2_1_h(const T*& in0, const T*& in1, T* out) {
    static_assert(sizeof(T) == 2, "only support size == 2");
    asm volatile(
            "ldr q0, [%[in0]], #16\n"
            "str q0, [%[out]]\n"
            "ldr q1, [%[in1]], #16\n"
            "str q1, [%[out], #16]\n"
            : [in0] "+r"(in0), [in1] "+r"(in1), [out] "+r"(out)
            :
            : "v0", "v1", "memory");
}

template <typename T>
static inline void transpose_24x1_1_h(const T*& in0, T* out) {
    static_assert(sizeof(T) == 2, "only support size == 2");
    // clang-format off
    asm volatile(
            "ldp q0, q1, [%[in0]], #32\n"
            "stp q0, q1, [%[out]] \n"
            "ldr q2, [%[in0]], #16 \n"
            ASM_PREFETCH("[%[in0], #192]")
            "str q2, [%[out], #32] \n"
            : [in0] "+r"(in0), [out] "+r"(out)
            :
            : "v0", "v1", "v2", "memory");
    // clang-format on
}

template <typename T>
static inline void transpose_16x1_1_h(const T*& in0, T* out) {
    static_assert(sizeof(T) == 2, "only support size == 2");
    asm volatile(
            "ldp q0, q1, [%[in0]], #32\n"
            "stp q0, q1, [%[out]]\n"
            : [in0] "+r"(in0), [out] "+r"(out)
            :
            : "v0", "v1", "memory");
}

template <typename T>
static inline void transpose_12x1_1_h(const T*& in0, T* out) {
    static_assert(sizeof(T) == 2, "only support size == 2");
    // clang-format off
    asm volatile(
            "ld1 {v0.8h}, [%[in0]], #16\n"
            "ld1 {v1.4h}, [%[in0]], #8\n"
            "st1 {v0.8h}, [%[out]], #16\n"
            "st1 {v1.4h}, [%[out]], #8\n"
            : [in0] "+r"(in0), [out] "+r"(out)
            :
            : "v0", "v1", "memory");
    // clang-format on
}

template <typename T>
static inline void transpose_8x1_1_h(const T*& in0, T* out) {
    static_assert(sizeof(T) == 2, "only support size == 2");
    asm volatile(
            "ldr q0, [%[in0]], #16\n"
            "str q0, [%[out]]\n"
            : [in0] "+r"(in0), [out] "+r"(out)
            :
            : "v0", "memory");
}

template <typename T>
static inline void transpose_4x1_1_h(const T*& in0, T* out) {
    static_assert(sizeof(T) == 2, "only support size == 2");
    // clang-format off
    asm volatile(
            "ld1 {v0.4h}, [%[in0]], #8\n"
            "st1 {v0.4h}, [%[out]], #8\n"
            : [in0] "+r"(in0), [out] "+r"(out)
            :
            : "v0", "memory");
    // clang-format on
}

template <typename T>
static inline void transpose_4x4_1_s(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        T* outptr, int stride = 16) {
    static_assert(sizeof(T) == 4, "transpose_4x4_1_s only support sizeof(T) == 4");

    asm volatile(
            "ld1 {v0.4s},  [%[inptr0]], 16\n"  // A0A1A2A3
            "ld1 {v1.4s},  [%[inptr1]], 16\n"  // B0B1B2B3
            "ld1 {v2.4s},  [%[inptr2]], 16\n"  // C0C1C2C3
            "ld1 {v3.4s},  [%[inptr3]], 16\n"  // D0D1D2D3

            "zip1 v4.4s, v0.4s, v1.4s\n"
            "zip1 v5.4s, v2.4s, v3.4s\n"
            "zip2 v6.4s, v0.4s, v1.4s\n"
            "zip2 v7.4s, v2.4s, v3.4s\n"

            "zip1 v8.2d, v4.2d, v5.2d\n"
            "zip1 v9.2d, v6.2d, v7.2d\n"
            "zip2 v10.2d, v4.2d, v5.2d\n"
            "zip2 v11.2d, v6.2d, v7.2d\n"

            "st1 {v8.4s},  [%[outptr]], %x[stride]\n"
            "st1 {v10.4s},  [%[outptr]], %x[stride]\n"
            "st1 {v9.4s},  [%[outptr]], %x[stride]\n"
            "st1 {v11.4s},  [%[outptr]], %x[stride]\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [outptr] "+r"(outptr), [stride] "+r"(stride)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
              "memory");
}

template <typename T>
static inline void transpose_1x12_4_s(const T*& inptr0, T* outptr) {
    static_assert(sizeof(T) == 4, "transpose_1x12_4_s only support sizeof(T) == 4");

    asm volatile(
            "ld4 {v0.4s, v1.4s, v2.4s, v3.4s},  [%[inptr0]], #64\n"
            "ld4 {v4.4s, v5.4s, v6.4s, v7.4s},  [%[inptr0]], #64\n"
            "ld4 {v8.4s, v9.4s, v10.4s, v11.4s},[%[inptr0]], #64\n"

            "stp q0, q4, [%[outptr]] \n"
            "stp q8, q1, [%[outptr], #32] \n"
            "stp q5, q9, [%[outptr], #64] \n"
            "stp q2, q6, [%[outptr], #96] \n"
            "stp q10, q3, [%[outptr], #128] \n"
            "stp q7, q11, [%[outptr], #160] \n"
            : [inptr0] "+r"(inptr0), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
              "memory");
}

template <typename T>
static inline void transpose_1x4_4_s(const T*& inptr0, T* outptr) {
    static_assert(sizeof(T) == 4, "transpose_1x4_4_s only support sizeof(T) == 4");

    asm volatile(
            "ld4 {v0.4s, v1.4s, v2.4s, v3.4s},  [%[inptr0]], #64\n"
            "st1 {v0.4s, v1.4s, v2.4s, v3.4s},  [%[outptr]]\n"
            : [inptr0] "+r"(inptr0), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "memory");
}

template <typename T>
static inline void transpose_8x4_1_s(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
        T* outptr) {
    static_assert(sizeof(T) == 4, "transpose_8x4_1_s only support sizeof(T) == 4");

    asm volatile(
            "ld1 {v0.4s},  [%[inptr0]], 16\n"  // A0A1A2A3
            "ld1 {v1.4s},  [%[inptr1]], 16\n"  // B0B1B2B3
            "ld1 {v2.4s},  [%[inptr2]], 16\n"  // C0C1C2C3
            "ld1 {v3.4s},  [%[inptr3]], 16\n"  // D0D1D2D3
            "ld1 {v4.4s},  [%[inptr4]], 16\n"  // E0E1E2E3
            "ld1 {v5.4s},  [%[inptr5]], 16\n"  // F0F1F2F3
            "ld1 {v6.4s},  [%[inptr6]], 16\n"  // G0G1G2G3
            "ld1 {v7.4s},  [%[inptr7]], 16\n"  // H0H1H2H3

            "zip1 v8.4s, v0.4s, v1.4s\n"   // A0B0A1B1
            "zip2 v9.4s, v0.4s, v1.4s\n"   // A2B2A3B3
            "zip1 v10.4s, v2.4s, v3.4s\n"  // C0D0C1D1
            "zip2 v11.4s, v2.4s, v3.4s\n"  // C2D2C3D3
            "zip1 v12.4s, v4.4s, v5.4s\n"  // E0F0E1F1
            "zip2 v13.4s, v4.4s, v5.4s\n"  // E2F2E3F3
            "zip1 v14.4s, v6.4s, v7.4s\n"  // G0H0G1H1
            "zip2 v15.4s, v6.4s, v7.4s\n"  // G2H2G3H3

            "zip1 v0.2d, v8.2d, v10.2d\n"  // A0B0C0D0
            "zip2 v2.2d, v8.2d, v10.2d\n"  // A1B1C1D1

            "zip1 v4.2d, v9.2d, v11.2d\n"  // A2B2C2D2
            "zip2 v6.2d, v9.2d, v11.2d\n"  // A3B3C3D3

            "zip1 v1.2d, v12.2d, v14.2d\n"  // E0F0G0H0
            "zip2 v3.2d, v12.2d, v14.2d\n"  // E1F1G1H1

            "zip1 v5.2d, v13.2d, v15.2d\n"  // E2F2G2H2
            "zip2 v7.2d, v13.2d, v15.2d\n"  // E3F3G3H3

            "st1 {v0.4s,v1.4s,v2.4s,v3.4s},  [%[outptr]], #64\n"
            "st1 {v4.4s,v5.4s,v6.4s,v7.4s},  [%[outptr]], #64\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5),
              [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
              "v12", "v13", "v14", "v15", "memory");
}

template <typename T>
static inline void transpose_12x4_1_s(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
        const T*& inptr8, const T*& inptr9, const T*& inptr10, const T*& inptr11,
        T* outptr) {
    static_assert(sizeof(T) == 4, "transpose_12x4_1_s only support sizeof(T) == 4");
    asm volatile(
            "ld1 {v0.4s},  [%[inptr0]], 16\n"    // A0A1A2A3
            "ld1 {v1.4s},  [%[inptr1]], 16\n"    // B0B1B2B3
            "ld1 {v2.4s},  [%[inptr2]], 16\n"    // C0C1C2C3
            "ld1 {v3.4s},  [%[inptr3]], 16\n"    // D0D1D2D3
            "ld1 {v4.4s},  [%[inptr4]], 16\n"    // E0E1E2E3
            "ld1 {v5.4s},  [%[inptr5]], 16\n"    // F0F1F2F3
            "ld1 {v6.4s},  [%[inptr6]], 16\n"    // G0G1G2G3
            "ld1 {v7.4s},  [%[inptr7]], 16\n"    // H0H1H2H3
            "ld1 {v16.4s},  [%[inptr8]], 16\n"   // I0I1I2I3
            "ld1 {v17.4s},  [%[inptr9]], 16\n"   // J0J1J2J3
            "ld1 {v18.4s},  [%[inptr10]], 16\n"  // K0K1K2K3
            "ld1 {v19.4s},  [%[inptr11]], 16\n"  // L0L1L2L3

            "zip1 v8.4s, v0.4s, v1.4s\n"   // A0B0A1B1
            "zip2 v9.4s, v0.4s, v1.4s\n"   // A2B2A3B3
            "zip1 v10.4s, v2.4s, v3.4s\n"  // C0D0C1D1
            "zip2 v11.4s, v2.4s, v3.4s\n"  // C2D2C3D3

            "zip1 v12.4s, v4.4s, v5.4s\n"  // E0F0E1F1
            "zip2 v13.4s, v4.4s, v5.4s\n"  // E2F2E3F3
            "zip1 v14.4s, v6.4s, v7.4s\n"  // G0H0G1H1
            "zip2 v15.4s, v6.4s, v7.4s\n"  // G2H2G3H3

            "zip1 v20.4s, v16.4s, v17.4s\n"  // I0J0I1J1
            "zip2 v21.4s, v16.4s, v17.4s\n"  // I2J2I3J3
            "zip1 v22.4s, v18.4s, v19.4s\n"  // K0L0K1L1
            "zip2 v23.4s, v18.4s, v19.4s\n"  // K2L2K3L3

            "zip1 v0.2d, v8.2d, v10.2d\n"  // A0B0C0D0
            "zip2 v3.2d, v8.2d, v10.2d\n"  // A1B1C1D1

            "zip1 v6.2d, v9.2d, v11.2d\n"   // A2B2C2D2
            "zip2 v24.2d, v9.2d, v11.2d\n"  // A3B3C3D3

            "zip1 v1.2d, v12.2d, v14.2d\n"  // E0F0G0H0
            "zip2 v4.2d, v12.2d, v14.2d\n"  // E1F1G1H1

            "zip1 v7.2d, v13.2d, v15.2d\n"   // E2F2G2H2
            "zip2 v25.2d, v13.2d, v15.2d\n"  // E3F3G3H3

            "zip1 v2.2d, v20.2d, v22.2d\n"  // I0J0K0L0
            "zip2 v5.2d, v20.2d, v22.2d\n"  // I1J1K1L1

            "zip1 v8.2d, v21.2d, v23.2d\n"   // I2J2K2L2
            "zip2 v26.2d, v21.2d, v23.2d\n"  // I3J3K3L3

            "st1 {v0.4s,v1.4s,v2.4s},  [%[outptr]], #48\n"
            "st1 {v3.4s,v4.4s,v5.4s},  [%[outptr]], #48\n"
            "st1 {v6.4s,v7.4s,v8.4s},  [%[outptr]], #48\n"
            "st1 {v24.4s,v25.4s,v26.4s},  [%[outptr]], #48\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5),
              [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [inptr8] "+r"(inptr8),
              [inptr9] "+r"(inptr9), [inptr10] "+r"(inptr10), [inptr11] "+r"(inptr11),
              [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
              "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21",
              "v22", "v23", "v24", "v25", "v26", "memory");
}

template <typename T>
static inline void transpose_12x4_1_b(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        T* outptr) {
    static_assert(
            std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
            "transpose_12x4_1_b only support uint8_t and int8_t");
    asm volatile(
            "ldr q0,  [%[inptr0]], #12\n"  // A1A2A3A4A5A6A7A8A9A10A11A12A13A14A15A16
            "ldr q1,  [%[inptr1]], #12\n"  // B1B2B3B4B5B6B7B8B9B10B11B12B13B14B15B16
            "ldr q2,  [%[inptr2]], #12\n"  // C1C2C3C4C5C6C7C8C9C10C11C12C13C14C15C16
            //! \warning the last inptr3 may less than 16bytes, so we should
            //! split read it
            "ldr d3,  [%[inptr3]], #8\n"  // D1D2D3D4D5D6D7D8D9D10D11D12D13D14D15D16
            "ldr w1, [%[inptr3]], #4\n"
            "ins v3.s[2], w1\n"

            "trn1 v4.16b, v0.16b, v1.16b\n"  // v4: A1B1A3B3....
            "trn2 v5.16b, v0.16b, v1.16b\n"  // v5: A2B2A4B4....
            "trn1 v6.16b, v2.16b, v3.16b\n"  // v6: C1D1C3D3....
            "trn2 v7.16b, v2.16b, v3.16b\n"  // v7: C2D2C4D4....

            "trn1 v8.8h, v4.8h, v6.8h\n"   // v8: A1B1C1D1A5B5C5D5...
            "trn2 v9.8h, v4.8h, v6.8h\n"   // v9: A3B3C3D3A7B7C7D7...
            "trn1 v10.8h, v5.8h, v7.8h\n"  // v10: A2B2C2D2A6B6C6D6...
            "trn2 v11.8h, v5.8h, v7.8h\n"  // v11: A4B4C4D4A8B8C8D8...

            //! ABCD=E then
            //! v8: E1E5E9E13 v10: E2E6E10E14 v9: E3E7E11E15 v11:
            //! E4E8E12E16
            "zip1 v12.4s, v8.4s, v10.4s\n"   // v12: E1E2E5E6
            "zip2 v13.4s, v8.4s, v10.4s\n"   // v13: E9E10E13E14
            "zip1 v14.4s, v9.4s, v11.4s\n"   // v14: E3E4E7E8
            "zip2 v15.4s, v9.4s, v11.4s\n"   // v15: E11E12E15E16
            "zip1 v17.2d, v12.2d, v14.2d\n"  // v17: E1E2E3E4
            "zip2 v18.2d, v12.2d, v14.2d\n"  // v18: E5E6E7E8
            "zip1 v19.2d, v13.2d, v15.2d\n"  // v19: E8E10E11E12
            "zip2 v20.2d, v13.2d, v15.2d\n"  // v19: E13E14E15E16

            "stp q17, q18, [%[outptr]], #32\n"
            "str q19, [%[outptr]], #16\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [outptr] "+r"(outptr)
            :
            : "w1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
              "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "memory");
}

template <typename T>
static inline void transpose_8x4_1_b(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        T* outptr) {
    static_assert(
            std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
            "transpose_8x4_1_b only support uint8_t and int8_t");
    asm volatile(
            "ld1 {v0.d}[0],  [%[inptr0]], #8\n"  // A1A2A3A4A5A6A7A8
            "ld1 {v1.d}[0],  [%[inptr1]], #8\n"  // B1B2B3B4B5B6B7B8
            "ld1 {v0.d}[1],  [%[inptr2]], #8\n"  // C1C2C3C4C5C6C7C8
            "ld1 {v1.d}[1],  [%[inptr3]], #8\n"  // D1D2D3D4D5D6D7D8

            "zip1 v2.16b, v0.16b, v1.16b\n"  // A1B1A2B2A3B3A4B4A5B5A6B6A7B7A8B8
            "zip2 v3.16b, v0.16b, v1.16b\n"  // C1D1C2D2C3D3C4D4C5D5C6D6C7D7C8D8

            "zip1 v4.8h, v2.8h, v3.8h\n"  // A1B1C1D1A2B2C2D2A3B3C3D3A4B4C4D4
            "zip2 v5.8h, v2.8h, v3.8h\n"  // A5B5C5D5A6B6C6D6A7B7C7D7A8B8C8D8

            "st1 {v4.2d}, [%[outptr]], #16\n"
            "st1 {v5.2d}, [%[outptr]], #16\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "memory");
}

template <typename T>
static inline void transpose_4x8_1_b_with_shift(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
        T*& outptr) {
    static int8x16_t shuffle_idx = {0, 4, 8,  12, 1, 5, 9,  13,
                                    2, 6, 10, 14, 3, 7, 11, 15};
    static_assert(
            std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
            "transpose_8x4_1_b only support uint8_t and int8_t");
    asm volatile(
            "ld1 {v0.s}[0],  [%[inptr0]], #4\n"  // A1A2A3A4
            "ld1 {v0.s}[1],  [%[inptr1]], #4\n"  // B1B2B3B4
            "ld1 {v0.s}[2],  [%[inptr2]], #4\n"  // C1C2C3C4
            "ld1 {v0.s}[3],  [%[inptr3]], #4\n"  // D1D2D3D4
            "ld1 {v1.s}[0],  [%[inptr4]], #4\n"  // E1E2E3E4
            "ld1 {v1.s}[1],  [%[inptr5]], #4\n"  // F1F2F3F4
            "ld1 {v1.s}[2],  [%[inptr6]], #4\n"  // G1G2G3G4
            "ld1 {v1.s}[3],  [%[inptr7]], #4\n"  // H1H2H3H4

            "tbl v2.16b, {v0.16b}, %[shuffle_idx].16b \n"  // A1B1C1D1A2B2C2D2A3B3C3D3A4B4C4D4
            "tbl v3.16b, {v1.16b}, %[shuffle_idx].16b \n"  // E1F1G1H1E2F2G2H2E3F3G3H3E4F4G4H4

            "zip1 v4.4s, v2.4s, v3.4s\n"  // A1B1C1D1E1F1G1H1 A2B2C2D2E2F2G2H2
            "zip2 v5.4s, v2.4s, v3.4s\n"  // A3B3C3D3E3F3G3H3 A4B4C4D4E4F4G4H4

            "shl  v6.16b, v4.16b,        #4\n"
            "sshr v7.16b, v4.16b,        #4\n"  // hig
            "sshr v8.16b, v6.16b,        #4\n"  // low
            "shl  v9.16b, v5.16b,        #4\n"
            "sshr v10.16b, v5.16b,        #4\n"  // hig
            "sshr v11.16b, v9.16b,        #4\n"  // low
            "zip1 v0.2d,v8.2d,v7.2d\n"
            "zip2 v1.2d,v8.2d,v7.2d\n"
            "zip1 v2.2d,v11.2d,v10.2d\n"
            "zip2 v3.2d,v11.2d,v10.2d\n"
            "st1 {v0.2d-v3.2d},[%[outptr]],#64\n"

            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5),
              [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7),
              [shuffle_idx] "+w"(shuffle_idx), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
              "memory");
}
template <typename T>
static inline void transpose_8x8_1_b(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
        T* outptr) {
    static_assert(
            std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
            "transpose_8x8_1_b only support uint8_t and int8_t");
    asm volatile(
            "ld1 {v0.8b},  [%[inptr0]], #8\n"  // A1A2A3A4A5A6A7A8
            "ld1 {v1.8b},  [%[inptr1]], #8\n"  // B1B2B3B4B5B6B7B8
            "ld1 {v2.8b},  [%[inptr2]], #8\n"  // C1C2C3C4C5C6C7C8
            "ld1 {v3.8b},  [%[inptr3]], #8\n"  // D1D2D3D4D5D6D7D8
            "ld1 {v4.8b},  [%[inptr4]], #8\n"  // E1E2E3E4E5E6E7E8
            "ld1 {v5.8b},  [%[inptr5]], #8\n"  // F1F2F3F4F5F6F7F8
            "ld1 {v6.8b},  [%[inptr6]], #8\n"  // G1G2G3G4G5G6G7G8
            "ld1 {v7.8b},  [%[inptr7]], #8\n"  // H1H2H3H4H5H6H7H8

            "zip1 v8.16b, v0.16b, v1.16b\n"   // A1B1A2B2A3B3A4B4
                                              // A5B5A6B6A7B7A8B8
            "zip1 v9.16b, v2.16b, v3.16b\n"   // C1D1C2D2C3D3C4D4
                                              // C5D5C6D6C7D7C8D8
            "zip1 v10.16b, v4.16b, v5.16b\n"  // E1F1E2F2E3F3E4F4
                                              // E5F5E6F6E7F7E8F8
            "zip1 v11.16b, v6.16b, v7.16b\n"  // G1H1G2H2G3H3G4H4
                                              // G5H5G6H6G7H7G8H8

            "zip1 v12.8h, v8.8h, v9.8h\n"    // A1B1C1D1A2B2C2D2
                                             // A3B3C3D3A4B4C4D4
            "zip1 v13.8h, v10.8h, v11.8h\n"  // E1F1G1H1E2F2G2H2
                                             // E3F3G3H3E4F4G4H4
            "zip2 v14.8h, v8.8h, v9.8h\n"    // A5B5C5D5A6B6C6D6
                                             // A7B7C7D7A8B8C8D8
            "zip2 v15.8h, v10.8h, v11.8h\n"  // E5F5G5H5E6F6G6H6
                                             // E7F7G7H7E8F8G8H8

            "zip1 v16.4s, v12.4s, v13.4s\n"  // A1B1C1D1E1F1G1H1
                                             // A2B2C2D2E2F2G2H2
            "zip1 v18.4s, v14.4s, v15.4s\n"  // A5B5C5D5E5F5G5H5
                                             // A6B6C6D6E6F6G6H6
            "zip2 v17.4s, v12.4s, v13.4s\n"  // A3B3C3D3E3F3G3H3
                                             // A4B4C4D4E4F4G4H4
            "zip2 v19.4s, v14.4s, v15.4s\n"  // A7B7C7D7E7F7G7H7
                                             // A8B8C8D8E8F8G8H8

            "st1 {v16.16b},  [%[outptr]], #16\n"  // A1B1C1D1E1F1G1H1
                                                  // A2B2C2D2E2F2G2H2
            "st1 {v17.16b},  [%[outptr]], #16\n"  // A3B3C3D3E3F3G3H3
                                                  // A4B4C4D4E4F4G4H4
            "st1 {v18.16b},  [%[outptr]], #16\n"  // A5B5C5D5E5F5G5H5
                                                  // A6B6C6D6E6F6G6H6
            "st1 {v19.16b},  [%[outptr]], #16\n"  // A7B7C7D7E7F7G7H7
                                                  // A8B8C8D8E8F8G8H8
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5),
              [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
              "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "memory");
}

template <typename T>
static inline void transpose_4x16_1_b_helper(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
        T* outptr) {
    static_assert(sizeof(T) == 1, "only support size == 1");
    static int8x16_t shuffle_idx = {0, 4, 8,  12, 1, 5, 9,  13,
                                    2, 6, 10, 14, 3, 7, 11, 15};
    asm volatile(
            "ld1 {v0.s}[0], [%[inptr0]], #4\n"
            "ld1 {v0.s}[1], [%[inptr1]], #4\n"
            "ld1 {v0.s}[2], [%[inptr2]], #4\n"
            "ld1 {v0.s}[3], [%[inptr3]], #4\n"
            "ld1 {v1.s}[0], [%[inptr4]], #4\n"
            "ld1 {v1.s}[1], [%[inptr5]], #4\n"
            "ld1 {v1.s}[2], [%[inptr6]], #4\n"
            "ld1 {v1.s}[3], [%[inptr7]], #4\n"

            "tbl v2.16b, {v0.16b}, %[shuffle_idx].16b\n"
            "tbl v3.16b, {v1.16b}, %[shuffle_idx].16b\n"

            "zip1 v4.4s, v2.4s, v3.4s\n"
            "zip2 v5.4s, v2.4s, v3.4s\n"

            "dup v6.2d, v4.d[1]\n"
            "dup v7.2d, v5.d[1]\n"

            "str d4, [%[outptr]], #16\n"
            "str d6, [%[outptr]], #16\n"
            "str d5, [%[outptr]], #16\n"
            "str d7, [%[outptr]], #16\n"

            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5),
              [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr),
              [shuffle_idx] "+w"(shuffle_idx)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory");
}

template <typename T>
static inline void transpose_4(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        T* outptr, int interleave, int size, T val = 0) {
    megdnn_assert(size <= interleave);
    int i = 0;
    for (; i < size; i++) {
        *outptr++ = *inptr0++;
        *outptr++ = *inptr1++;
        *outptr++ = *inptr2++;
        *outptr++ = *inptr3++;
    }
    for (; i < interleave; i++) {
        *outptr++ = val;
        *outptr++ = val;
        *outptr++ = val;
        *outptr++ = val;
    }
}

template <typename T>
static inline void transpose_8(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
        T* outptr, int interleave, int size, T val = 0) {
    megdnn_assert(size <= interleave);
    int i = 0;
    for (; i < size; i++) {
        *outptr++ = *inptr0++;
        *outptr++ = *inptr1++;
        *outptr++ = *inptr2++;
        *outptr++ = *inptr3++;
        *outptr++ = *inptr4++;
        *outptr++ = *inptr5++;
        *outptr++ = *inptr6++;
        *outptr++ = *inptr7++;
    }
    for (; i < interleave; i++) {
        *outptr++ = val;
        *outptr++ = val;
        *outptr++ = val;
        *outptr++ = val;
        *outptr++ = val;
        *outptr++ = val;
        *outptr++ = val;
        *outptr++ = val;
    }
}
/***************************** Transpose then interleave ********************/

//! pack form {1, 4(icb), 4(ic), 4(oc)} to {1, 1, 4(oc), 16(ic)}
template <typename T>
static inline void transpose_interleave_4x4_4_b(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        T* outptr, int stride = 64) {
    static_assert(
            sizeof(T) == 1, "transpose_interleave_4x4_4_b only support sizeof(T) == 1");

    asm volatile(
            "ld4 {v0.16b, v1.16b, v2.16b, v3.16b},[%[inptr0]], 64\n"
            "ld4 {v4.16b, v5.16b, v6.16b, v7.16b},[%[inptr1]], 64\n"
            "ld4 {v8.16b, v9.16b, v10.16b, v11.16b},[%[inptr2]], 64\n"
            "ld4 {v12.16b, v13.16b, v14.16b, v15.16b},[%[inptr3]], 64\n"

            "st1 {v0.16b, v1.16b, v2.16b, v3.16b},[%[outptr]], %x[stride]\n"
            "st1 {v4.16b, v5.16b, v6.16b, v7.16b},[%[outptr]], %x[stride]\n"
            "st1 {v8.16b, v9.16b, v10.16b, v11.16b},[%[outptr]], %x[stride]\n"
            "st1 {v12.16b, v13.16b, v14.16b, v15.16b},[%[outptr]], %x[stride]\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [outptr] "+r"(outptr), [stride] "+r"(stride)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
              "v12", "v14", "v15", "memory");
}

template <typename T>
static inline void transpose_interleave_1x4_4_b(
        const T*& inptr0, T* outptr, int stride = 64) {
    static_assert(
            sizeof(T) == 1, "transpose_interleave_1x4_4_b only support sizeof(T) == 1");

    asm volatile(
            "ld4 {v0.16b, v1.16b, v2.16b, v3.16b},[%[inptr0]], 64\n"
            "st1 {v0.16b, v1.16b, v2.16b, v3.16b},[%[outptr]], %x[stride]\n"
            : [inptr0] "+r"(inptr0), [outptr] "+r"(outptr), [stride] "+r"(stride)
            :
            : "v0", "v1", "v2", "v3", "v4", "memory");
}

static inline void interleave_4x4_16x4_s8_s16(
        const int8_t* inptr0, const int8_t* inptr1, const int8_t* inptr2,
        const int8_t* inptr3, int16_t* outptr) {
    int8x16_t row0 = vld1q_s8(inptr0);
    int16x8_t row0_01 = vmovl_low_s8(row0);
    int16x8_t row0_23 = vmovl_high_s8(row0);
    int16x4_t row0_0 = vget_low_s16(row0_01);
    int16x4_t row0_1 = vget_high_s16(row0_01);
    int16x4_t row0_2 = vget_low_s16(row0_23);
    int16x4_t row0_3 = vget_high_s16(row0_23);

    int8x16_t row1 = vld1q_s8(inptr1);
    int16x8_t row1_01 = vmovl_low_s8(row1);
    int16x8_t row1_23 = vmovl_high_s8(row1);
    int16x4_t row1_0 = vget_low_s16(row1_01);
    int16x4_t row1_1 = vget_high_s16(row1_01);
    int16x4_t row1_2 = vget_low_s16(row1_23);
    int16x4_t row1_3 = vget_high_s16(row1_23);

    int8x16_t row2 = vld1q_s8(inptr2);
    int16x8_t row2_01 = vmovl_low_s8(row2);
    int16x8_t row2_23 = vmovl_high_s8(row2);
    int16x4_t row2_0 = vget_low_s16(row2_01);
    int16x4_t row2_1 = vget_high_s16(row2_01);
    int16x4_t row2_2 = vget_low_s16(row2_23);
    int16x4_t row2_3 = vget_high_s16(row2_23);

    int8x16_t row3 = vld1q_s8(inptr3);
    int16x8_t row3_01 = vmovl_low_s8(row3);
    int16x8_t row3_23 = vmovl_high_s8(row3);
    int16x4_t row3_0 = vget_low_s16(row3_01);
    int16x4_t row3_1 = vget_high_s16(row3_01);
    int16x4_t row3_2 = vget_low_s16(row3_23);
    int16x4_t row3_3 = vget_high_s16(row3_23);

    vst1_s16(outptr, row0_0);
    vst1_s16(outptr + 1 * 4, row1_0);
    vst1_s16(outptr + 2 * 4, row2_0);
    vst1_s16(outptr + 3 * 4, row3_0);
    vst1_s16(outptr + 4 * 4, row0_1);
    vst1_s16(outptr + 5 * 4, row1_1);
    vst1_s16(outptr + 6 * 4, row2_1);
    vst1_s16(outptr + 7 * 4, row3_1);
    vst1_s16(outptr + 8 * 4, row0_2);
    vst1_s16(outptr + 9 * 4, row1_2);
    vst1_s16(outptr + 10 * 4, row2_2);
    vst1_s16(outptr + 11 * 4, row3_2);
    vst1_s16(outptr + 12 * 4, row0_3);
    vst1_s16(outptr + 13 * 4, row1_3);
    vst1_s16(outptr + 14 * 4, row2_3);
    vst1_s16(outptr + 15 * 4, row3_3);
};
static inline void interleave_4x4_8x4_s8_s16(
        const int8_t* inptr0, const int8_t* inptr1, int16_t* outptr) {
    int8x16_t row0 = vld1q_s8(inptr0);
    int16x8_t row0_01 = vmovl_low_s8(row0);
    int16x8_t row0_23 = vmovl_high_s8(row0);
    int16x4_t row0_0 = vget_low_s16(row0_01);
    int16x4_t row0_1 = vget_high_s16(row0_01);
    int16x4_t row0_2 = vget_low_s16(row0_23);
    int16x4_t row0_3 = vget_high_s16(row0_23);

    int8x16_t row1 = vld1q_s8(inptr1);
    int16x8_t row1_01 = vmovl_low_s8(row1);
    int16x8_t row1_23 = vmovl_high_s8(row1);
    int16x4_t row1_0 = vget_low_s16(row1_01);
    int16x4_t row1_1 = vget_high_s16(row1_01);
    int16x4_t row1_2 = vget_low_s16(row1_23);
    int16x4_t row1_3 = vget_high_s16(row1_23);

    vst1_s16(outptr, row0_0);
    vst1_s16(outptr + 1 * 4, row1_0);
    vst1_s16(outptr + 2 * 4, row0_1);
    vst1_s16(outptr + 3 * 4, row1_1);
    vst1_s16(outptr + 4 * 4, row0_2);
    vst1_s16(outptr + 5 * 4, row1_2);
    vst1_s16(outptr + 6 * 4, row0_3);
    vst1_s16(outptr + 7 * 4, row1_3);
};

static inline void memcpy_s8_s16(const int8_t* inptr, int16_t* outptr, int count) {
    for (; count >= 32; count -= 32) {
        int8x8_t in0 = vld1_s8(inptr);
        int8x8_t in1 = vld1_s8(inptr + 1 * 8);
        int8x8_t in2 = vld1_s8(inptr + 2 * 8);
        int8x8_t in3 = vld1_s8(inptr + 3 * 8);
        vst1q_s16(outptr, vmovl_s8(in0));
        vst1q_s16(outptr + 1 * 8, vmovl_s8(in1));
        vst1q_s16(outptr + 2 * 8, vmovl_s8(in2));
        vst1q_s16(outptr + 3 * 8, vmovl_s8(in3));
        inptr += 32;
        outptr += 32;
    }
    for (; count >= 8; count -= 8) {
        int8x8_t in0 = vld1_s8(inptr);
        vst1q_s16(outptr, vmovl_s8(in0));
        inptr += 8;
        outptr += 8;
    }
    for (; count > 0; --count) {
        *outptr++ = (int16_t)(*inptr++);
    }
}

static inline void transpos_12x4_s8(const int8_t* inptr0, int8_t* outptr) {
    static const uint8_t src_idx_buffer[16] = {0, 4, 8,  12, 1, 5, 9,  13,
                                               2, 6, 10, 14, 3, 7, 11, 15};
    static const uint8x16_t vtbl = vld1q_u8(&src_idx_buffer[0]);
    int8x8x4_t input = vld4_s8(inptr0);
    int8x16_t input2 = vqtbl1q_s8(vld1q_s8(inptr0 + 4 * 8), vtbl);

    vst1_s8(outptr, input.val[0]);
    vst1q_lane_s32(
            reinterpret_cast<int32_t*>(outptr + 8), vreinterpretq_s32_s8(input2), 0);
    vst1_s8(outptr + 1 * 12, input.val[1]);
    vst1q_lane_s32(
            reinterpret_cast<int32_t*>(outptr + 1 * 12 + 8),
            vreinterpretq_s32_s8(input2), 1);
    vst1_s8(outptr + 2 * 12, input.val[2]);
    vst1q_lane_s32(
            reinterpret_cast<int32_t*>(outptr + 2 * 12 + 8),
            vreinterpretq_s32_s8(input2), 2);
    vst1_s8(outptr + 3 * 12, input.val[3]);
    vst1q_lane_s32(
            reinterpret_cast<int32_t*>(outptr + 3 * 12 + 8),
            vreinterpretq_s32_s8(input2), 3);
}

template <typename T>
static inline void interleave_8x8_mk4_b(
        const T*& inptr0, const T*& inptr1, T*& outptr) {
    static_assert(
            std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
            "transpose_8x4_1_b only support uint8_t and int8_t");
    asm volatile(
            "ld1 {v0.4s},  [%[inptr0]], #16\n"
            "ld1 {v1.4s},  [%[inptr1]], #16\n"
            "ld1 {v2.4s},  [%[inptr0]], #16\n"
            "ld1 {v3.4s},  [%[inptr1]], #16\n"

            "zip1 v4.4s, v0.4s, v1.4s \n"
            "zip2 v5.4s, v0.4s, v1.4s \n"

            "zip1 v6.4s, v2.4s, v3.4s\n"
            "zip2 v7.4s, v2.4s, v3.4s\n"

            "st1 {v4.4s},[%[outptr]],#16\n"
            "st1 {v5.4s},[%[outptr]],#16\n"
            "st1 {v6.4s},[%[outptr]],#16\n"
            "st1 {v7.4s},[%[outptr]],#16\n"

            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory");
}

template <typename T>
static inline void transpose_8x8_mk4_b(const T*& inptr0, const T*& inptr1, T* outptr) {
    static_assert(
            std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
            "transpose_8x4_1_b only support uint8_t and int8_t");
    asm volatile(
            "ld4 {v0.8b-v3.8b},  [%[inptr0]], #32\n"
            "ld4 {v4.8b-v7.8b},  [%[inptr1]], #32\n"
            "st1 {v0.2s},[%[outptr]],#8\n"
            "st1 {v1.2s},[%[outptr]],#8\n"
            "st1 {v2.2s},[%[outptr]],#8\n"
            "st1 {v3.2s},[%[outptr]],#8\n"
            "st1 {v4.2s},[%[outptr]],#8\n"
            "st1 {v5.2s},[%[outptr]],#8\n"
            "st1 {v6.2s},[%[outptr]],#8\n"
            "st1 {v7.2s},[%[outptr]],#8\n"

            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory");
}

}  // namespace aarch64
}  // namespace megdnn

// vim: syntax=cpp.doxygen