/**
 * \file dnn/src/cuda/memory_utils.cuh
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#if MEGDNN_CC_CUDA
#pragma once
#include "src/cuda/utils.cuh"

namespace megdnn {
namespace cuda {

/*!
 * \brief Transpose a 4x4 matrix of bytes held in four 32-bit words.
 *
 * Treating each srcN as a row of four bytes (byte 0 = least significant),
 * output word dstK collects byte K of src0, src1, src2 and src3, in that
 * order, from its least significant byte upwards.
 *
 * \param src0,src1,src2,src3 input rows
 * \param[out] dst0,dst1,dst2,dst3 transposed rows
 */
MEGDNN_DEVICE __forceinline__ void transpose_int8_4x4_impl(
        const int src0, const int src1, const int src2, const int src3,
        int& dst0, int& dst1, int& dst2, int& dst3) {
    // Stage 1: interleave bytes of two rows.
    //   selector 0x5140 -> {a.b0, b.b0, a.b1, b.b1}
    //   selector 0x7362 -> {a.b2, b.b2, a.b3, b.b3}
    int dst01_lo = __byte_perm(src0, src1, 0x5140);
    int dst01_hi = __byte_perm(src0, src1, 0x7362);
    int dst23_lo = __byte_perm(src2, src3, 0x5140);
    int dst23_hi = __byte_perm(src2, src3, 0x7362);

    // Stage 2: merge 16-bit halves of the row pairs.
    //   selector 0x5410 -> low halves of both operands
    //   selector 0x7632 -> high halves of both operands
    dst0 = __byte_perm(dst01_lo, dst23_lo, 0x5410);
    dst1 = __byte_perm(dst01_lo, dst23_lo, 0x7632);
    dst2 = __byte_perm(dst01_hi, dst23_hi, 0x5410);
    dst3 = __byte_perm(dst01_hi, dst23_hi, 0x7632);
}

/*!
 * \brief Transpose `interleaved` packed 4-byte words into four output vectors.
 *
 * Only declared here; the implementations visible in this file are the
 * explicit specializations below for <4, int>, <8, int2> and <16, int4>.
 *
 * \tparam interleaved number of 32-bit input words in \p src
 * \tparam vec_type element type of \p dst
 * \param src input words
 * \param[out] dst four output vectors
 */
template <unsigned int interleaved, typename vec_type>
MEGDNN_DEVICE __forceinline__ void transpose_int8_interleavedx4(
        const int src[interleaved], vec_type (&dst)[4]);

/*!
 * \brief 4-word case: a single 4x4 byte transpose of src[0..3] into dst[0..3].
 */
template <>
MEGDNN_DEVICE __forceinline__ void transpose_int8_interleavedx4<4, int>(
        const int src[4], int (&dst)[4]) {
    transpose_int8_4x4_impl(
            src[0], src[1], src[2], src[3], dst[0], dst[1], dst[2], dst[3]);
}

/*!
 * \brief 8-word case: src[0..3] is transposed into the .x components of
 * dst[0..3] and src[4..7] into the .y components.
 */
template <>
MEGDNN_DEVICE __forceinline__ void transpose_int8_interleavedx4<8, int2>(
        const int src[8], int2 (&dst)[4]) {
    transpose_int8_4x4_impl(
            src[0], src[1], src[2], src[3], dst[0].x, dst[1].x, dst[2].x,
            dst[3].x);
    transpose_int8_4x4_impl(
            src[4], src[5], src[6], src[7], dst[0].y, dst[1].y, dst[2].y,
            dst[3].y);
}

/*!
 * \brief 16-word case: each group of four consecutive input words is
 * transposed into one component (.x, .y, .z, .w in order) of dst[0..3].
 */
template <>
MEGDNN_DEVICE __forceinline__ void transpose_int8_interleavedx4<16, int4>(
        const int src[16], int4 (&dst)[4]) {
    transpose_int8_4x4_impl(
            src[0], src[1], src[2], src[3], dst[0].x, dst[1].x, dst[2].x,
            dst[3].x);
    transpose_int8_4x4_impl(
            src[4], src[5], src[6], src[7], dst[0].y, dst[1].y, dst[2].y,
            dst[3].y);
    transpose_int8_4x4_impl(
            src[8], src[9], src[10], src[11], dst[0].z, dst[1].z, dst[2].z,
            dst[3].z);
    transpose_int8_4x4_impl(
            src[12], src[13], src[14], src[15], dst[0].w, dst[1].w, dst[2].w,
            dst[3].w);
}

}  // namespace cuda
}  // namespace megdnn

#endif
// vim: ft=cpp syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}