From c74660ea880117a79389126ea5fc6202e67b0ac1 Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Wed, 24 Mar 2021 14:05:19 +0800 Subject: [PATCH] fix(dnn/cuda): fix invalid local read for relayout format kernel GitOrigin-RevId: 5a77b82212626072059bbd95b624f57195f7b36e --- dnn/src/cuda/relayout_format/relayout_format.cu | 24 ++++++++++----------- dnn/src/cuda/utils.cuh | 28 +++++++++---------------- 2 files changed, 22 insertions(+), 30 deletions(-) diff --git a/dnn/src/cuda/relayout_format/relayout_format.cu b/dnn/src/cuda/relayout_format/relayout_format.cu index a06787a5..0e74e761 100644 --- a/dnn/src/cuda/relayout_format/relayout_format.cu +++ b/dnn/src/cuda/relayout_format/relayout_format.cu @@ -321,11 +321,11 @@ struct Translayout<2, 64, SrcType, dtype::QuantizedS4, dtype::QuantizedS4, int* dst_frag = reinterpret_cast(dst_width); #pragma unroll for (int i = 0; i < 64; i += 8) { -#define unpack_int4x2(_idx) \ - intermediate[_idx][0] = unpack_integer_4bits( \ - reinterpret_cast(read_channel[i + _idx]), 0); \ - intermediate[_idx][1] = unpack_integer_4bits( \ - reinterpret_cast(read_channel[i + _idx]), 4); +#define unpack_int4x2(_idx) \ + intermediate[_idx][0] = unpack_integer_4bits( \ + reinterpret_cast(read_channel[i + _idx]), 0); \ + intermediate[_idx][1] = unpack_integer_4bits( \ + reinterpret_cast(read_channel[i + _idx]), 4); // clang-format off unpack_int4x2(0) unpack_int4x2(1) @@ -336,7 +336,7 @@ struct Translayout<2, 64, SrcType, dtype::QuantizedS4, dtype::QuantizedS4, unpack_int4x2(6) unpack_int4x2(7) // clang-format on - + int frag_idx = i / 8; dst_frag[0 * 8 + frag_idx] = pack_channel(0); dst_frag[1 * 8 + frag_idx] = pack_channel(1); @@ -428,11 +428,11 @@ struct Translayout<2, 64, SrcType, dtype::Quantized4Asymm, int* dst_frag = reinterpret_cast(dst_width); #pragma unroll for (int i = 0; i < 64; i += 8) { -#define unpack_int4x2(_idx) \ - intermediate[_idx][0] = unpack_integer_4bits( \ - reinterpret_cast(read_channel[i + _idx]), 0); \ - intermediate[_idx][1] = unpack_integer_4bits( \ - reinterpret_cast(read_channel[i + _idx]), 4); +#define unpack_int4x2(_idx) \ + intermediate[_idx][0] = unpack_integer_4bits( \ + reinterpret_cast(read_channel[i + _idx]), 0); \ + intermediate[_idx][1] = unpack_integer_4bits( \ + reinterpret_cast(read_channel[i + _idx]), 4); // clang-format off unpack_int4x2(0) unpack_int4x2(1) @@ -1257,7 +1257,7 @@ private: uint32_t mul; uint32_t shr; uint32_t mask[mask_size]; - size_t stride[accesses]; + size_t stride[lane_size_in_type / pack_size_in_type]; }; template (out); } -template -MEGDNN_DEVICE __forceinline__ static int unpack_integer_4bits(unsigned storage, - unsigned bits); - -template <> -MEGDNN_DEVICE __forceinline__ int unpack_integer_4bits(unsigned storage, - unsigned bits) { - uint8_t result = (uint8_t)((unsigned)(storage >> bits) & 0xf); - static constexpr uint8_t mask = (uint8_t)((1 << 4) - 1); - return (result & uint8_t(1 << 3)) ? ((int)(result) | ~(int)(mask)) - : (int)(result); -} - -template <> -MEGDNN_DEVICE __forceinline__ int unpack_integer_4bits(unsigned storage, - unsigned bits) { - uint8_t result = (uint8_t)((unsigned)(storage >> bits) & 0xf); - return (int)(result); +template +MEGDNN_DEVICE __forceinline__ static int unpack_integer_4bits(T storage, + int bits) { + uint8_t result = (uint8_t)((storage >> bits) & 0xf); + if (signedness) { + static constexpr uint8_t mask = (uint8_t)((1 << 4) - 1); + return (result & uint8_t(1 << 3)) ? ((int)(result) | ~(int)(mask)) + : (int)(result); + } + return int(result); } MEGDNN_DEVICE __forceinline__ static void transform_int4x8_to_int8(