/**
 * \file dnn/src/cuda/memory_utils.cuh
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#if MEGDNN_CC_CUDA
#pragma once
#include "src/cuda/utils.cuh"

namespace megdnn {
namespace cuda {

/**
 * \brief Transpose a 4x4 block of bytes packed into four 32-bit words.
 *
 * Each srcN is read as four bytes, byte 0 being the least significant one.
 * On return dstK gathers byte K of src0, src1, src2 and src3 (in that order,
 * from least to most significant byte).
 *
 * \param src0,src1,src2,src3 input words, one per row of the byte block
 * \param[out] dst0,dst1,dst2,dst3 output words, one per column of the input
 */
MEGDNN_DEVICE __forceinline__ void transpose_int8_4x4_impl(
        const int src0, const int src1, const int src2, const int src3,
        int& dst0, int& dst1, int& dst2, int& dst3) {
    // __byte_perm selectors: nibble k chooses the source byte of result
    // byte k; values 0-3 address the first operand (a), 4-7 the second (b).
    const unsigned int zip_bytes_01 = 0x5140;  // {a0, b0, a1, b1}
    const unsigned int zip_bytes_23 = 0x7362;  // {a2, b2, a3, b3}
    const unsigned int take_lo_half = 0x5410;  // {a0, a1, b0, b1}
    const unsigned int take_hi_half = 0x7632;  // {a2, a3, b2, b3}

    // Stage 1: interleave the bytes of each pair of rows.
    const int rows01_b01 = __byte_perm(src0, src1, zip_bytes_01);
    const int rows23_b01 = __byte_perm(src2, src3, zip_bytes_01);
    const int rows01_b23 = __byte_perm(src0, src1, zip_bytes_23);
    const int rows23_b23 = __byte_perm(src2, src3, zip_bytes_23);

    // Stage 2: join the matching 16-bit halves of both row pairs so that
    // every output word carries one byte from each of the four inputs.
    dst0 = __byte_perm(rows01_b01, rows23_b01, take_lo_half);
    dst1 = __byte_perm(rows01_b01, rows23_b01, take_hi_half);
    dst2 = __byte_perm(rows01_b23, rows23_b23, take_lo_half);
    dst3 = __byte_perm(rows01_b23, rows23_b23, take_hi_half);
}

/**
 * \brief Transpose \p interleaved packed-int8 words into four vector outputs.
 *
 * Only the declaration appears here; the explicit specializations that
 * follow in this header (<4, int>, <8, int2>, <16, int4>) supply the
 * definitions.
 *
 * \tparam interleaved number of 32-bit words in \p src
 * \tparam vec_type element type of the four-entry \p dst array
 * \param src input words
 * \param[out] dst four output vectors, passed by reference to a fixed-size
 *      array
 */
template <uint32_t interleaved, typename vec_type>
MEGDNN_DEVICE __forceinline__ void transpose_int8_interleavedx4(
        const int src[interleaved], vec_type (&dst)[4]);

/**
 * \brief Specialization for 4 input words and scalar int outputs.
 *
 * Forwards src[0..3] to transpose_int8_4x4_impl and stores the four
 * transposed words in dst[0..3].
 */
template <>
MEGDNN_DEVICE __forceinline__ void transpose_int8_interleavedx4<4, int>(
        const int src[4], int (&dst)[4]) {
    int& out0 = dst[0];
    int& out1 = dst[1];
    int& out2 = dst[2];
    int& out3 = dst[3];
    transpose_int8_4x4_impl(src[0], src[1], src[2], src[3],  //
                            out0, out1, out2, out3);
}

/**
 * \brief Specialization for 8 input words and int2 outputs.
 *
 * src[0..3] is transposed into the .x components of dst[0..3] and
 * src[4..7] into the .y components, each via transpose_int8_4x4_impl.
 */
template <>
MEGDNN_DEVICE __forceinline__ void transpose_int8_interleavedx4<8, int2>(
        const int src[8], int2 (&dst)[4]) {
    // Split the input into consecutive groups of four words.
    const int* quad0 = src;
    const int* quad1 = src + 4;

    // First group -> .x lanes.
    transpose_int8_4x4_impl(quad0[0], quad0[1], quad0[2], quad0[3],  //
                            dst[0].x, dst[1].x, dst[2].x, dst[3].x);
    // Second group -> .y lanes.
    transpose_int8_4x4_impl(quad1[0], quad1[1], quad1[2], quad1[3],  //
                            dst[0].y, dst[1].y, dst[2].y, dst[3].y);
}

/**
 * \brief Specialization for 16 input words and int4 outputs.
 *
 * The input is processed as four consecutive groups of four words; groups
 * 0, 1, 2 and 3 are transposed (via transpose_int8_4x4_impl) into the
 * .x, .y, .z and .w components of dst[0..3] respectively.
 */
template <>
MEGDNN_DEVICE __forceinline__ void transpose_int8_interleavedx4<16, int4>(
        const int src[16], int4 (&dst)[4]) {
    // Split the input into consecutive groups of four words.
    const int* quad0 = src;
    const int* quad1 = src + 4;
    const int* quad2 = src + 8;
    const int* quad3 = src + 12;

    // src[0..3] -> .x lanes.
    transpose_int8_4x4_impl(quad0[0], quad0[1], quad0[2], quad0[3],  //
                            dst[0].x, dst[1].x, dst[2].x, dst[3].x);
    // src[4..7] -> .y lanes.
    transpose_int8_4x4_impl(quad1[0], quad1[1], quad1[2], quad1[3],  //
                            dst[0].y, dst[1].y, dst[2].y, dst[3].y);
    // src[8..11] -> .z lanes.
    transpose_int8_4x4_impl(quad2[0], quad2[1], quad2[2], quad2[3],  //
                            dst[0].z, dst[1].z, dst[2].z, dst[3].z);
    // src[12..15] -> .w lanes.
    transpose_int8_4x4_impl(quad3[0], quad3[1], quad3[2], quad3[3],  //
                            dst[0].w, dst[1].w, dst[2].w, dst[3].w);
}

}  // namespace cuda
}  // namespace megdnn
#endif

// vim: ft=cpp syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}