feat(dnn/cuda): add relayout format when width is an odd number

GitOrigin-RevId: f059f1f56d
Branch: release-1.5
Author: Megvii Engine Team, 4 years ago
Commit: 8fef78d06d
6 changed files with 512 additions and 186 deletions

1. dnn/src/common/relayout_format.cpp (+1, -1)
2. dnn/src/cuda/relayout_format/relayout_format.cu (+435, -114)
3. dnn/test/common/benchmarker.h (+5, -4)
4. dnn/test/common/checker.cpp (+10, -64)
5. dnn/test/common/utils.h (+17, -0)
6. dnn/test/cuda/relayout_format.cpp (+44, -3)
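
The commit targets 4-bit quantized data (QuantizedS4/Quantized4Asymm), which packs two elements per byte, so a row of odd width would end in the middle of a byte. The new code path pads each row to an even width with `w_pad = DIVUP(w, 2) * 2` and masks off accesses to the padded tail column. A minimal standalone sketch of that padding arithmetic (div_up is a stand-in for the codebase's DIVUP macro):

#include <cstdio>

// Stand-in for the codebase's DIVUP macro.
constexpr int div_up(int a, int b) { return (a + b - 1) / b; }

int main() {
    for (int w : {2, 3, 7, 8, 31}) {
        int w_pad = div_up(w, 2) * 2;   // same arithmetic as DIVUP(w, 2) * 2
        int row_bytes = w_pad * 4 / 8;  // 4-bit elements, two per byte
        std::printf("w=%2d -> w_pad=%2d (%2d bytes per packed row)\n",
                    w, w_pad, row_bytes);
    }
    return 0;
}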

dnn/src/common/relayout_format.cpp (+1, -1)

@@ -380,7 +380,7 @@ void RelayoutFormat::deduce_format(TensorFormat src, TensorFormat& dst) {
            break;
    }

-   if (!dst.is_default() &&
+   if (dst.type() == TensorFormat::Type::IMAGE2D_PACK4 &&
        (handle()->type() != Handle::HandleType::NAIVE)) {
#if MEGDNN_ENABLE_MANGLING


dnn/src/cuda/relayout_format/relayout_format.cu (+435, -114)

@@ -10,10 +10,10 @@
* implied.
*/

#include <stdint.h>
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "cutlass/fast_math.h"
#include "cutlass/arch/memory.h"
#pragma GCC diagnostic pop
#include "src/cuda/query_blocksize.cuh"
@@ -112,6 +112,8 @@ struct CudaPostProcess<dtype::QuantizedS32, dtype::QuantizedS32, true> {

template <>
struct CudaPostProcess<dtype::QuantizedS4, dtype::QuantizedS4, false> {
+    using SrcType = dtype::QuantizedS4;
+    using DstType = dtype::QuantizedS4;
    CudaDTypeParamImpl<dt_qint4> m_dst_type_cvt;
    CudaDTypeParamImpl<dt_qint4> m_src_type_cvt;
    CudaPostProcess(float src_scale, uint8_t, float dst_scale, uint8_t) {
@@ -126,12 +128,16 @@ struct CudaPostProcess<dtype::QuantizedS4, dtype::QuantizedS4, false> {

template <>
struct CudaPostProcess<dtype::QuantizedS4, dtype::QuantizedS4, true> {
+    using SrcType = dtype::QuantizedS4;
+    using DstType = dtype::QuantizedS4;
    CudaPostProcess(float, uint8_t, float, uint8_t){};
    inline __device__ int8_t operator()(int8_t val) { return val; }
};

template <>
struct CudaPostProcess<dtype::Quantized4Asymm, dtype::Quantized4Asymm, false> {
+    using SrcType = dtype::Quantized4Asymm;
+    using DstType = dtype::Quantized4Asymm;
    CudaDTypeParamImpl<dt_quint4> m_dst_type_cvt;
    CudaDTypeParamImpl<dt_quint4> m_src_type_cvt;
    CudaPostProcess(float src_scale, uint8_t src_zero_point, float dst_scale,
@@ -149,6 +155,8 @@ struct CudaPostProcess<dtype::Quantized4Asymm, dtype::Quantized4Asymm, false> {

template <>
struct CudaPostProcess<dtype::Quantized4Asymm, dtype::Quantized4Asymm, true> {
+    using SrcType = dtype::Quantized4Asymm;
+    using DstType = dtype::Quantized4Asymm;
    uint8_t m_src_zero_point = 0;
    uint8_t m_dst_zero_point = 0;
    CudaPostProcess(float, uint8_t src_zero_point, float,
@@ -328,13 +336,20 @@ struct Translayout<2, 64, SrcType, dtype::QuantizedS4, dtype::QuantizedS4,
            unpack_int4x2(6)
            unpack_int4x2(7)
            // clang-format on
            int frag_idx = i / 8;
            dst_frag[0 * 8 + frag_idx] = pack_channel(0);
            dst_frag[1 * 8 + frag_idx] = pack_channel(1);
#undef unpack_int4x2
        }
    }
+    using Fragment = array_wrapper<SrcType, 64>;
+    static inline __device__ void trans(
+            Fragment& dst, Fragment& src,
+            CudaPostProcess<DnnSrcType, DnnDstType, same_scale>& post_process) {
+        trans(reinterpret_cast<DstDtype(&)[2]>(dst),
+              reinterpret_cast<InnerDtype(&)[64]>(src), post_process, 0);
+    }
};

template <typename SrcType, bool same_scale>
@@ -375,6 +390,13 @@ struct Translayout<8, 64, SrcType, dtype::QuantizedS4, dtype::QuantizedS4,
            dst_frag[7 * 8 + frag_idx] = pack_channel(7);
        }
    }
+    using Fragment = array_wrapper<unsigned, 64>;
+    static inline __device__ void trans(
+            Fragment& dst, Fragment& src,
+            CudaPostProcess<DnnSrcType, DnnDstType, same_scale>& post_process) {
+        trans(reinterpret_cast<DstDtype(&)[8]>(dst),
+              reinterpret_cast<InnerDtype(&)[64]>(src), post_process, 0);
+    }
};
#undef pack_channel

@@ -428,6 +450,13 @@ struct Translayout<2, 64, SrcType, dtype::Quantized4Asymm,
#undef unpack_int4x2
        }
    }
+    using Fragment = array_wrapper<SrcType, 64>;
+    static inline __device__ void trans(
+            Fragment& dst, Fragment& src,
+            CudaPostProcess<DnnSrcType, DnnDstType, same_scale>& post_process) {
+        trans(reinterpret_cast<DstDtype(&)[2]>(dst),
+              reinterpret_cast<InnerDtype(&)[64]>(src), post_process, 0);
+    }
};

template <typename SrcType, bool same_scale>
@@ -468,6 +497,13 @@ struct Translayout<8, 64, SrcType, dtype::Quantized4Asymm,
            dst_frag[7 * 8 + frag_idx] = pack_channel(7);
        }
    }
+    using Fragment = array_wrapper<unsigned, 64>;
+    static inline __device__ void trans(
+            Fragment& dst, Fragment& src,
+            CudaPostProcess<DnnSrcType, DnnDstType, same_scale>& post_process) {
+        trans(reinterpret_cast<DstDtype(&)[8]>(dst),
+              reinterpret_cast<InnerDtype(&)[64]>(src), post_process, 0);
+    }
};
#undef pack_channel
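
Each Translayout specialization gains a `trans(Fragment&, Fragment&, post_process)` overload that forwards to the existing array-reference implementation through a reinterpret_cast to a reference-to-array type, giving the new relayout kernel a uniform Fragment-based entry point. A simplified sketch of that adapter pattern (the wrapper and trans_impl below are stand-ins, not the real types):

#include <cstdio>

// Simplified stand-in for megdnn's array_wrapper.
template <typename T, int N>
struct array_wrapper {
    T data[N];
};

// "Legacy" interface working on references to raw arrays, like the existing
// trans(DstDtype(&)[2], InnerDtype(&)[64], ...) overloads.
void trans_impl(int (&dst)[2], const int (&src)[2]) {
    dst[0] = src[1];
    dst[1] = src[0];
}

// New-style overload on the wrapper type, forwarding via reinterpret_cast,
// mirroring the trans(Fragment&, Fragment&, ...) overloads in the diff.
void trans(array_wrapper<int, 2>& dst, const array_wrapper<int, 2>& src) {
    trans_impl(reinterpret_cast<int(&)[2]>(dst),
               reinterpret_cast<const int(&)[2]>(src));
}

int main() {
    array_wrapper<int, 2> src{{1, 2}}, dst{};
    trans(dst, src);
    std::printf("%d %d\n", dst.data[0], dst.data[1]);  // prints: 2 1
    return 0;
}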

@@ -1028,11 +1064,21 @@ public:
            : pointer{nullptr}, chan_stride_in_elements{0}, channel{0} {}
    MEGDNN_DEVICE TensorIteratorOverChannel(Type* pointer_,
                                            int chan_stride_in_elements_,
-                                           int channel_)
+                                           int channel_, int, int)
            : pointer{pointer_},
              chan_stride_in_elements{chan_stride_in_elements_},
              channel{channel_} {}

+    MEGDNN_DEVICE __forceinline__ void initialize(int c_idx, int hw_idx) {
+        pointer += (c_idx / pack_size) * chan_stride_in_elements +
+                   hw_idx * pack_size * size_nbits / (8 * sizeof(Type));
+    }
+
+    MEGDNN_DEVICE __forceinline__ void add_pointer_offset(
+            size_t offset_in_type) {
+        pointer += offset_in_type;
+    }

MEGDNN_DEVICE __forceinline__ void load(Fragment& frag) {
AccessType* frag_ptr = reinterpret_cast<AccessType*>(&frag);
Type* pointer_ = pointer;
@@ -1087,64 +1133,224 @@ private:
int channel;
};

-template <int pack_w, int pack_c, bool same_scale, typename SrcType,
-          typename DstType, typename DnnSrcType, typename DnnDstType,
-          int size_nbits = 8>
-__global__ void kern_nchwx_nchw(
-        const SrcType* src, DstType* dst, int ic, int ihw, int n_stride_src,
-        int ic_stride, int n_stride_dst, int oc_stride,
-        CudaPostProcess<DnnSrcType, DnnDstType, same_scale> post_process,
-        const char zero_point) {
-    using InnerDtype =
-            typename DTypeRWHelper<typename DTypeTrait<DnnSrcType>::ctype,
-                                   pack_w>::InnerDtype;
-    using SrcIterator = TensorIteratorOverChannel<SrcType, pack_c, pack_c,
-                                                  pack_w, size_nbits>;
-    using DstIteraotr = TensorIteratorOverChannel<InnerDtype, 1, pack_c, pack_w,
-                                                  size_nbits>;
-    using Transpose = Translayout<pack_c, pack_w, SrcType, DnnSrcType,
-                                  DnnDstType, same_scale>;
-    static constexpr int size_src_type = sizeof(SrcType);
-    static constexpr int size_dst_type = sizeof(DstType);
-    MEGDNN_STATIC_ASSERT(std::is_same<SrcType MEGDNN_COMMA DstType>::value,
-                         "Currently this kernel only support accessing tensor "
-                         "src and dst in same data type.");
-    n_stride_src /= size_src_type;
-    ic_stride /= size_src_type;
-    n_stride_dst /= size_dst_type;
-    oc_stride /= size_dst_type;
-#undef MEGDNN_COMMA
+template <typename Type_, int pack_size_, int chan_blk_, int width_,
+          int size_nbits_>
+class MaskedTensorIteratorOverChannel {
+public:
+    using Type = Type_;
+    static constexpr int pack_size = pack_size_;
+    static constexpr int chan_blk = chan_blk_;
+    static constexpr int width = width_;
+    static constexpr int size_nbits = size_nbits_;
+    static constexpr int elements_in_type =
+            chan_blk * width * size_nbits / (8 * sizeof(Type));
+    static constexpr int lane_size_in_type =
+            (width * pack_size * size_nbits) / (8 * sizeof(Type));
+    static constexpr int pack_size_in_type =
+            (pack_size * size_nbits) >= (8 * sizeof(Type))
+                    ? (pack_size * size_nbits / (8 * sizeof(Type)))
+                    : (width * pack_size * size_nbits / (8 * sizeof(Type)));
+    static constexpr int pack_size_in_byte = pack_size_in_type * sizeof(Type);
+    static constexpr int accesses = elements_in_type / pack_size_in_type;
+    static constexpr int mask_size = (accesses + 32 - 1) / 32;
+    using AccessType = array_wrapper<Type, pack_size_in_type>;
+    using Fragment = array_wrapper<Type, elements_in_type>;

-    const int n_idx = blockIdx.y;
-    const int ihw_block_idx = blockIdx.x * blockDim.x + threadIdx.x;
-    const int ihw_offset = ihw_block_idx * pack_w;
-    const int ihw_offset_in_type =
-            ihw_offset * size_nbits / (8 * size_src_type);
-    const int oc_stride_inner_dtype =
-            oc_stride * size_dst_type / sizeof(InnerDtype);
-    if (ihw_offset < ihw) {
-        const int ic_block = (ic + pack_c - 1) / pack_c;
-        const int src_offset_base =
-                n_idx * n_stride_src + ihw_offset_in_type * pack_c;
-        const int dst_offset_base = n_idx * n_stride_dst + ihw_offset_in_type;
-        SrcIterator src_iterator{const_cast<SrcType*>(src + src_offset_base),
-                                 ic_stride, ic};
-        DstIteraotr dst_iterator{
-                reinterpret_cast<InnerDtype*>(dst + dst_offset_base),
-                oc_stride_inner_dtype, ic};
-
-        for (int ic_blk_idx = 0; ic_blk_idx < ic_block; ++ic_blk_idx) {
-            typename SrcIterator::Fragment src_frag;
-            typename DstIteraotr::Fragment dst_frag;
-            src_iterator.load(src_frag);
-            Transpose::trans(
-                    reinterpret_cast<typename SrcIterator::Fragment&>(dst_frag),
-                    src_frag, post_process);
-            dst_iterator.store(dst_frag);
-            src_iterator.advance();
-            dst_iterator.advance();
+    MEGDNN_HOST MEGDNN_DEVICE MaskedTensorIteratorOverChannel()
+            : pointer{nullptr},
+              chan_stride_in_elements{0},
+              channel{0} {}
+    MEGDNN_HOST MEGDNN_DEVICE MaskedTensorIteratorOverChannel(
+            Type* pointer_, int chan_stride_in_elements_, int channel_,
+            int bound_, int div_)
+            : pointer{pointer_},
+              chan_stride_in_elements{chan_stride_in_elements_},
+              channel{channel_},
+              bound{bound_},
+              div{div_} {
+        cutlass::find_divisor(mul, shr, div);
+    }
+
+    MEGDNN_DEVICE __forceinline__ void initialize(int c_idx, int hw_idx) {
+        pointer += (c_idx / pack_size) * chan_stride_in_elements;
+#pragma unroll
+        for (int i = 0; i < mask_size; ++i) {
+            mask[i] = 0;
+        }
+#pragma unroll
+        for (int i = 0; i < chan_blk; i += pack_size) {
+#pragma unroll
+            for (int j = 0; j < lane_size_in_type / pack_size_in_type; j++) {
+                int offset = hw_idx + j;
+                int h, w;
+                cutlass::fast_divmod(h, w, offset, div, mul, shr);
+                bool guard = (i < channel) && (w < bound);
+                int index = (i / pack_size) *
+                                    (lane_size_in_type / pack_size_in_type) +
+                            j;
+                int mask_index = (index >> 5);
+                int mask_shift = (index & 0x1f);
+                mask[mask_index] |= (guard << mask_shift);
+                stride[j] = (h * bound + w) * pack_size * size_nbits /
+                            (8 * sizeof(Type));
+            }
+        }
+    }
+
+    MEGDNN_DEVICE __forceinline__ void add_pointer_offset(size_t offset_in_type) {
+        pointer += offset_in_type;
+    }
+
+    MEGDNN_DEVICE __forceinline__ void load(Fragment& frag) {
+        AccessType* frag_ptr = reinterpret_cast<AccessType*>(&frag);
+        Type* pointer_ = pointer;
+#pragma unroll
+        for (int i = 0; i < chan_blk; i += pack_size) {
+#pragma unroll
+            for (int j = 0; j < lane_size_in_type / pack_size_in_type; j++) {
+                int frag_idx = i / pack_size *
+                                       (lane_size_in_type / pack_size_in_type) +
+                               j;
+                int mask_index = (frag_idx >> 5);
+                int mask_shift = (frag_idx & 0x1f);
+                bool guard = (mask[mask_index] & (1 << mask_shift));
+                cutlass::arch::global_load<AccessType, pack_size_in_byte>(
+                        frag_ptr[frag_idx],
+                        reinterpret_cast<void*>(pointer_ + stride[j]), guard);
+            }
+            pointer_ += chan_stride_in_elements;
+        }
+    }
+
+    MEGDNN_DEVICE __forceinline__ void store(const Fragment& frag) {
+        const AccessType* frag_ptr = reinterpret_cast<const AccessType*>(&frag);
+        Type* pointer_ = pointer;
+#pragma unroll
+        for (int i = 0; i < chan_blk; i += pack_size) {
+#pragma unroll
+            for (int j = 0; j < lane_size_in_type / pack_size_in_type; j++) {
+                int frag_idx = i / pack_size *
+                                       (lane_size_in_type / pack_size_in_type) +
+                               j;
+                int mask_index = (frag_idx >> 5);
+                int mask_shift = (frag_idx & 0x1f);
+                bool guard = (mask[mask_index] & (1 << mask_shift));
+                cutlass::arch::global_store<AccessType, pack_size_in_byte>(
+                        frag_ptr[frag_idx],
+                        reinterpret_cast<void*>(pointer_ + stride[j]), guard);
+            }
+            pointer_ += chan_stride_in_elements;
+        }
+    }
+
+    MEGDNN_DEVICE __forceinline__ void advance() {
+        pointer += (chan_blk / pack_size) * chan_stride_in_elements;
+        channel -= chan_blk;
+    }
+
+private:
+    Type* pointer;
+    int chan_stride_in_elements;
+    int channel;
+    int bound;
+    int div;
+    uint32_t mul;
+    uint32_t shr;
+    uint32_t mask[mask_size];
+    size_t stride[accesses];
+};
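
MaskedTensorIteratorOverChannel precomputes, in initialize(), one predicate bit per memory access: the linear offset is decomposed by the padded row width with cutlass::fast_divmod, and the bit is set only when the column falls inside the real width (`bound`) and the channel is in range; load() and store() then feed each bit as the guard of cutlass::arch::global_load/global_store, so out-of-bounds lanes are skipped rather than branched over. A host-side sketch of the bit packing under small, assumed sizes:

#include <cstdint>
#include <cstdio>

int main() {
    // Assumed toy sizes; the real iterator derives these from template params.
    const int w = 3, w_pad = 4;       // odd width padded to even
    const int chan_blk = 8, pack_size = 4;
    const int channel = 6;            // valid channels in this block
    const int accesses_per_lane = 4;  // stand-in for lane_size_in_type / pack_size_in_type
    const int hw_idx = 0;
    uint32_t mask = 0;                // one predicate bit per access

    for (int i = 0; i < chan_blk; i += pack_size) {
        for (int j = 0; j < accesses_per_lane; ++j) {
            int offset = hw_idx + j;
            int col = offset % w_pad;  // the kernel uses cutlass::fast_divmod here
            bool guard = (i < channel) && (col < w);
            int index = (i / pack_size) * accesses_per_lane + j;
            mask |= (uint32_t(guard) << (index & 0x1f));
        }
    }
    // Bits 3 and 7 stay clear: they address the padded tail column.
    std::printf("mask = 0x%02x\n", unsigned(mask));  // prints mask = 0x77
    return 0;
}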

+template <bool padding_, typename Type_, int pack_size_, int chan_blk_,
+          int width_, int size_nbits_>
+struct TensorIteratorPolicy;
+template <typename Type_, int pack_size_, int chan_blk_, int width_,
+          int size_nbits_>
+struct TensorIteratorPolicy<true, Type_, pack_size_, chan_blk_, width_,
+                            size_nbits_> {
+    using TensorIterator =
+            MaskedTensorIteratorOverChannel<Type_, pack_size_, chan_blk_,
+                                            width_, size_nbits_>;
+};
+template <typename Type_, int pack_size_, int chan_blk_, int width_,
+          int size_nbits_>
+struct TensorIteratorPolicy<false, Type_, pack_size_, chan_blk_, width_,
+                            size_nbits_> {
+    using TensorIterator =
+            TensorIteratorOverChannel<Type_, pack_size_, chan_blk_, width_,
+                                      size_nbits_>;
+};
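
TensorIteratorPolicy selects between the two iterators at compile time, so even-width tensors keep the cheaper unpredicated TensorIteratorOverChannel path and only odd widths pay for masking. A minimal sketch of the same bool-parameter specialization pattern (with stand-in iterator types):

#include <cstdio>

// Stand-ins; the real types are TensorIteratorOverChannel and
// MaskedTensorIteratorOverChannel.
struct PlainIterator {
    static const char* name() { return "plain (no mask)"; }
};
struct MaskedIterator {
    static const char* name() { return "masked (predicated)"; }
};

// Same shape as the diff's TensorIteratorPolicy: a bool template parameter
// picks the iterator type at compile time.
template <bool padding>
struct IteratorPolicy {
    using TensorIterator = PlainIterator;
};
template <>
struct IteratorPolicy<true> {
    using TensorIterator = MaskedIterator;
};

int main() {
    // Mirrors `bool padding = w % 2 != 0;`, lifted into a template argument
    // by the DISPATCH_RAW macro expansion.
    std::printf("w even -> %s\n", IteratorPolicy<false>::TensorIterator::name());
    std::printf("w odd  -> %s\n", IteratorPolicy<true>::TensorIterator::name());
    return 0;
}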

+template <typename SrcIterator_, typename DstIterator_, typename Transpose_,
+          typename CudaPostProcess_>
+struct RelayoutProblem {
+    using SrcIterator = SrcIterator_;
+    using DstIterator = DstIterator_;
+    using Transpose = Transpose_;
+    using CudaPostProcess = CudaPostProcess_;
+    MEGDNN_STATIC_ASSERT(SrcIterator::chan_blk == DstIterator::chan_blk,
+                         "channel block mismatch");
+    MEGDNN_STATIC_ASSERT(SrcIterator::width == DstIterator::width,
+                         "width block mismatch");
+    MEGDNN_STATIC_ASSERT(SrcIterator::size_nbits == DstIterator::size_nbits,
+                         "size in bits of elements mismatch");
+    static constexpr int pack_chan = SrcIterator::chan_blk;
+    static constexpr int pack_width = SrcIterator::width;
+    using DnnSrcType = typename CudaPostProcess::SrcType;
+    using DnnDstType = typename CudaPostProcess::DstType;
+    struct Param {
+        SrcIterator src_iterator;
+        DstIterator dst_iterator;
+        CudaPostProcess post_process;
+        int n_stride_src;
+        int n_stride_dst;
+        int batch_size;
+        int channels;
+        int hw;
+        MEGDNN_HOST MEGDNN_DEVICE Param(SrcIterator src_iterator_,
+                                        DstIterator dst_iterator_,
+                                        CudaPostProcess post_process_,
+                                        int n_stride_src_, int n_stride_dst_,
+                                        int batch_size_, int channels_, int hw_)
+                : src_iterator{src_iterator_},
+                  dst_iterator{dst_iterator_},
+                  post_process{post_process_},
+                  n_stride_src{n_stride_src_},
+                  n_stride_dst{n_stride_dst_},
+                  batch_size{batch_size_},
+                  channels{channels_},
+                  hw{hw_} {}
+    };
+};
+
+template <typename RelayoutProblem_>
+__global__ void relayout_kern(typename RelayoutProblem_::Param param) {
+    using SrcIterator = typename RelayoutProblem_::SrcIterator;
+    using DstIterator = typename RelayoutProblem_::DstIterator;
+    static constexpr int pack_chan = RelayoutProblem_::pack_chan;
+    static constexpr int pack_width = RelayoutProblem_::pack_width;
+    const int thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    const int thread_offset = thread_idx * pack_width;
+    const int hw_idx = (thread_offset % param.hw);
+    const int nc_blks = thread_offset / param.hw;
+    const int c_blks = (param.channels + pack_chan - 1) / pack_chan;
+    const int n_idx = nc_blks / c_blks;
+    const int c_blk_idx = nc_blks % c_blks;
+    const int c_idx = c_blk_idx * pack_chan;
+    if (n_idx < param.batch_size) {
+        const int src_offset = n_idx * param.n_stride_src;
+        const int dst_offset = n_idx * param.n_stride_dst;
+        param.src_iterator.add_pointer_offset(src_offset);
+        param.dst_iterator.add_pointer_offset(dst_offset);
+        param.src_iterator.initialize(c_idx, hw_idx);
+        param.dst_iterator.initialize(c_idx, hw_idx);
+        typename SrcIterator::Fragment src_frag;
+        typename DstIterator::Fragment dst_frag;
+        param.src_iterator.load(src_frag);
+        RelayoutProblem_::Transpose::trans(
+                reinterpret_cast<typename SrcIterator::Fragment&>(dst_frag),
+                src_frag, param.post_process);
+        param.dst_iterator.store(dst_frag);
+    }
+}
} // namespace
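
relayout_kern flattens the whole problem into a 1-D grid: each thread owns pack_width pixels, and its linear offset is decomposed into batch, channel-block, and intra-image coordinates with plain div/mod. A small worked example of that decomposition (shapes assumed for illustration):

#include <cstdio>

int main() {
    // Assumed toy sizes; in the kernel these come from the problem Param.
    const int pack_chan = 64, pack_width = 8;
    const int channels = 128, hw = 56 * 56, batch_size = 2;
    const int c_blks = (channels + pack_chan - 1) / pack_chan;  // 2

    int thread_idx = 1000;                        // one thread of the grid
    int thread_offset = thread_idx * pack_width;  // 8000
    int hw_idx = thread_offset % hw;              // 1728
    int nc_blks = thread_offset / hw;             // 2
    int n_idx = nc_blks / c_blks;                 // 1
    int c_blk_idx = nc_blks % c_blks;             // 0
    int c_idx = c_blk_idx * pack_chan;            // 0
    if (n_idx < batch_size)
        std::printf("n=%d c_idx=%d hw_idx=%d\n", n_idx, c_idx, hw_idx);
    return 0;
}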

@@ -1175,21 +1381,23 @@ void relayout_format::relayout_format_cuda_nchw_nchwx(
"Unsupport pack size(pack_oc:%d, src:%s, dst:%s)", pack_oc,
stype.name(), dtype.name());
#undef DEF
-    const int in_n = src.layout[0];
-    const int out_n = dst.layout[0];
-    const int ic = src.layout[1];
-    const int h = src.layout[2];
-    const int w = src.layout[3];
-    const int oc = dst.layout[1] * pack_oc;
-    const int hw = h * w;
-    const int ocpg = oc / group;
-    // stride in byte
-    const int n_stride_src = src_layout.dtype.size(src_layout.stride[0]);
-    const int ic_stride = src_layout.dtype.size(src_layout.stride[1]);
-    const int n_stride_dst = dst_layout.dtype.size(dst_layout.stride[0]);
-    const int oc_stride = dst_layout.dtype.size(dst_layout.stride[1]);
+    // no padding
+    if (src.layout.stride[2] == static_cast<ptrdiff_t>(src.layout[3])) {
+        const int in_n = src.layout[0];
+        const int out_n = dst.layout[0];
+        const int ic = src.layout[1];
+        const int h = src.layout[2];
+        const int w = src.layout[3];
+        const int oc = dst.layout[1] * pack_oc;
+        const int hw = h * w;
+        const int ocpg = oc / group;
+        // stride in byte
+        const int n_stride_src = src_layout.dtype.size(src_layout.stride[0]);
+        const int ic_stride = src_layout.dtype.size(src_layout.stride[1]);
+        const int n_stride_dst = dst_layout.dtype.size(dst_layout.stride[0]);
+        const int oc_stride = dst_layout.dtype.size(dst_layout.stride[1]);

-    bool same_scale = src_scale == dst_scale;
+        bool same_scale = src_scale == dst_scale;
#define DISPATCH_RAW(_same_scale, _pack_w, _pack_oc, _src_type, _dst_type, \
_src_c_type, _dst_c_type, _size_nbits) \
if (same_scale == _same_scale && hw % _pack_w == 0 && \
@@ -1225,19 +1433,95 @@ void relayout_format::relayout_format_cuda_nchw_nchwx(
DISPATCH_RAW(false, 8, 64, _src_type, _dst_type, char, char, 4); \
DISPATCH_RAW(true, 2, 64, _src_type, _dst_type, char, char, 4); \
DISPATCH_RAW(false, 2, 64, _src_type, _dst_type, char, char, 4);
-    DISPATCH_INT(QuantizedS32, QuantizedS32);
-    DISPATCH_BYTE(Uint8, QuantizedS8);
-    DISPATCH_BYTE(Quantized8Asymm, QuantizedS8);
-    DISPATCH_BYTE(QuantizedS8, QuantizedS8);
-    DISPATCH_4BITS(QuantizedS4, QuantizedS4);
-    DISPATCH_4BITS(Quantized4Asymm, Quantized4Asymm);
+        DISPATCH_INT(QuantizedS32, QuantizedS32);
+        DISPATCH_BYTE(Uint8, QuantizedS8);
+        DISPATCH_BYTE(Quantized8Asymm, QuantizedS8);
+        DISPATCH_BYTE(QuantizedS8, QuantizedS8);
+        DISPATCH_4BITS(QuantizedS4, QuantizedS4);
+        DISPATCH_4BITS(Quantized4Asymm, Quantized4Asymm);
#undef DISPATCH_4BITS
#undef DISPATCH_BYTE
#undef DISPATCH_INT
#undef DISPATCH_RAW
-    megdnn_assert(false,
-                  "Unsupported data type(src:%s, dst:%s) or image size(%dx%d).",
-                  stype.name(), dtype.name(), h, w);
+        megdnn_assert(
+                false,
+                "Unsupported data type(src:%s, dst:%s) or image size(%dx%d).",
+                stype.name(), dtype.name(), h, w);
+    } else {
+        megdnn_assert(src_layout.dtype.is_low_bit());
+        int n = src.layout[0];
+        int c = src.layout[1];
+        int h = src.layout[2];
+        // align to byte
+        int w = src.layout[3];
+        int w_pad = DIVUP(w, 2) * 2;
+        int hw = h * w_pad;
+        int n_stride_src = src_layout.stride[0];
+        int ic_stride = src_layout.stride[1];
+        int n_stride_dst = dst_layout.stride[0];
+        int oc_stride = dst_layout.stride[1];
+        int problem_size = n * (c / pack_oc) * hw;
+        bool same_scale = src_scale == dst_scale;
+#define DISPATCH_RAW(_same_scale, _pack_w, _pack_oc, _src_type, _dst_type,   \
+                     _src_c_type, _dst_c_type, _size_nbits)                  \
+    if (same_scale == _same_scale && hw % _pack_w == 0 &&                    \
+        stype.enumv().ev == DTypeEnum::Ev::_src_type &&                      \
+        dtype.enumv().ev == DTypeEnum::Ev::_dst_type) {                      \
+        using InnerDtype_ = typename DTypeRWHelper<                          \
+                typename DTypeTrait<dtype::_src_type>::ctype,                \
+                _pack_w>::InnerDtype;                                        \
+        using SrcIterator_ =                                                 \
+                TensorIteratorOverChannel<InnerDtype_, 1, _pack_oc, _pack_w, \
+                                          _size_nbits>;                      \
+        using DstIterator_ = MaskedTensorIteratorOverChannel<                \
+                _dst_c_type, _pack_oc, _pack_oc, _pack_w, _size_nbits>;      \
+        using CudaPostProcess_ =                                             \
+                CudaPostProcess<dtype::_src_type, dtype::_dst_type,          \
+                                _same_scale>;                                \
+        using Transpose_ =                                                   \
+                Translayout<_pack_w, _pack_oc, _src_c_type, dtype::_src_type,\
+                            dtype::_dst_type, _same_scale>;                  \
+        using RelayoutProblem_ =                                             \
+                RelayoutProblem<SrcIterator_, DstIterator_, Transpose_,      \
+                                CudaPostProcess_>;                           \
+        n_stride_src = n_stride_src * _size_nbits / (8 * sizeof(InnerDtype_)); \
+        ic_stride = ic_stride * _size_nbits / (8 * sizeof(InnerDtype_));     \
+        n_stride_dst = n_stride_dst * _size_nbits / (8 * sizeof(_dst_c_type)); \
+        oc_stride = oc_stride * _size_nbits / (8 * sizeof(_dst_c_type));     \
+        typename RelayoutProblem_::Param param{                              \
+                SrcIterator_{(InnerDtype_*)src.raw_ptr, ic_stride, c, w,     \
+                             w_pad},                                         \
+                DstIterator_{(_dst_c_type*)dst.raw_ptr, oc_stride, c, w,     \
+                             w_pad},                                         \
+                CudaPostProcess_{src_scale, src_zero_point, dst_scale,       \
+                                 dst_zero_point},                            \
+                n_stride_src,                                                \
+                n_stride_dst,                                                \
+                n,                                                           \
+                c,                                                           \
+                hw};                                                         \
+        auto kernel = relayout_kern<RelayoutProblem_>;                       \
+        int nr_threads = query_blocksize_for_kernel(kernel);                 \
+        nr_threads = std::min(nr_threads, DIVUP(problem_size, _pack_w));     \
+        const dim3 block_dim(DIVUP(problem_size, nr_threads* _pack_w));      \
+        const dim3 thread_dim(nr_threads);                                   \
+        return kernel<<<block_dim, thread_dim, 0, stream>>>(param);          \
+    }
+#define DISPATCH_4BITS(_src_type, _dst_type)                              \
+    DISPATCH_RAW(true, 8, 64, _src_type, _dst_type, char, char, 4);       \
+    DISPATCH_RAW(false, 8, 64, _src_type, _dst_type, char, char, 4);      \
+    DISPATCH_RAW(true, 2, 64, _src_type, _dst_type, char, char, 4);       \
+    DISPATCH_RAW(false, 2, 64, _src_type, _dst_type, char, char, 4);
+        DISPATCH_4BITS(QuantizedS4, QuantizedS4);
+        DISPATCH_4BITS(Quantized4Asymm, Quantized4Asymm);
+#undef DISPATCH_4BITS
+#undef DISPATCH_RAW
+        megdnn_assert(
+                false,
+                "Unsupported data type(src:%s, dst:%s) or image size(%dx%d).",
+                stype.name(), dtype.name(), h, w);
+    }
+    after_kernel_launch();
}
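
The DISPATCH_RAW macros convert strides from counts of 4-bit elements into counts of the iterator's storage type via `stride * _size_nbits / (8 * sizeof(T))`. A standalone sketch with assumed sizes showing the same unit conversion:

#include <cstdio>

int main() {
    // Assumed toy shape; 4-bit elements as in QuantizedS4.
    const int size_nbits = 4;
    const int h = 7, w_pad = 8;
    long ic_stride_elems = long(h) * w_pad;  // stride counted in int4 elements
    // Two 4-bit elements per byte when the storage type is char...
    long ic_stride_bytes = ic_stride_elems * size_nbits / (8 * long(sizeof(char)));
    // ...and eight per word when the storage type is a 4-byte InnerDtype_.
    long ic_stride_words = ic_stride_elems * size_nbits / (8 * long(sizeof(int)));
    std::printf("%ld int4 elements -> %ld bytes -> %ld words\n",
                ic_stride_elems, ic_stride_bytes, ic_stride_words);  // 56 -> 28 -> 7
    return 0;
}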

bool relayout_format::relayout_format_cuda_usable(
@@ -1283,43 +1567,77 @@ void relayout_format::relayout_format_cuda_nchwx_nchw(
// clang-format on
megdnn_assert(pack_ic == 64, "Unsupport pack size(pack_ic:%d)", pack_ic);
#undef DEF
-    const int n = src.layout[0];
-    const int c = src.layout[1] * pack_ic;
-    const int h = src.layout[2];
+    int n = src.layout[0];
+    int c = src.layout[1] * pack_ic;
+    int h = src.layout[2];
    // align to byte
-    const int w = src.layout[3];
-    const int hw = h * w;
-    const int n_stride_src = src_layout.dtype.size(src_layout.stride[0]);
-    const int ic_stride = src_layout.dtype.size(src_layout.stride[1]);
-    const int n_stride_dst = dst_layout.dtype.size(dst_layout.stride[0]);
-    const int oc_stride = dst_layout.dtype.size(dst_layout.stride[1]);
+    int w = src.layout[3];
+    int w_pad = DIVUP(w, 2) * 2;
+    int hw = h * w_pad;
+    int n_stride_src = src_layout.stride[0];
+    int ic_stride = src_layout.stride[1];
+    int n_stride_dst = dst_layout.stride[0];
+    int oc_stride = dst_layout.stride[1];
+    int problem_size = n * (c / pack_ic) * hw;

bool same_scale = src_scale == dst_scale;
-#define DISPATCH_RAW(_same_scale, _pack_w, _pack_oc, _src_type, _dst_type,  \
-                     _src_c_type, _dst_c_type, _size_nbits)                 \
-    if (same_scale == _same_scale && hw % _pack_w == 0 &&                   \
-        stype.enumv().ev == DTypeEnum::Ev::_src_type &&                     \
-        dtype.enumv().ev == DTypeEnum::Ev::_dst_type) {                     \
-        auto kernel =                                                       \
-                kern_nchwx_nchw<_pack_w, _pack_oc, _same_scale, _src_c_type,\
-                                _dst_c_type, dtype::_src_type,              \
-                                dtype::_dst_type, _size_nbits>;             \
-        int nr_threads = query_blocksize_for_kernel(kernel);                \
-        const dim3 block_dim(DIVUP(hw, nr_threads* _pack_w), n);            \
-        const dim3 thread_dim(nr_threads);                                  \
-        return kernel<<<block_dim, thread_dim, 0, stream>>>(                \
-                (_src_c_type*)src.raw_ptr, (_dst_c_type*)dst.raw_ptr, c, hw,\
-                n_stride_src, ic_stride, n_stride_dst, oc_stride,           \
-                CudaPostProcess<dtype::_src_type, dtype::_dst_type,         \
-                                _same_scale>(src_scale, src_zero_point,     \
-                                             dst_scale, dst_zero_point),    \
-                src_zero_point);                                            \
+    bool padding = w % 2 != 0;
+#define DISPATCH_RAW(_padding, _same_scale, _pack_w, _pack_oc, _src_type,    \
+                     _dst_type, _src_c_type, _dst_c_type, _size_nbits)       \
+    if (padding == _padding && same_scale == _same_scale &&                  \
+        hw % _pack_w == 0 && stype.enumv().ev == DTypeEnum::Ev::_src_type && \
+        dtype.enumv().ev == DTypeEnum::Ev::_dst_type) {                      \
+        using SrcIterator_ =                                                 \
+                typename TensorIteratorPolicy<_padding, _src_c_type,         \
+                                              _pack_oc, _pack_oc, _pack_w,   \
+                                              _size_nbits>::TensorIterator;  \
+        using InnerDtype_ = typename DTypeRWHelper<                          \
+                typename DTypeTrait<dtype::_src_type>::ctype,                \
+                _pack_w>::InnerDtype;                                        \
+        using DstIterator_ =                                                 \
+                TensorIteratorOverChannel<InnerDtype_, 1, _pack_oc, _pack_w, \
+                                          _size_nbits>;                      \
+        using CudaPostProcess_ =                                             \
+                CudaPostProcess<dtype::_src_type, dtype::_dst_type,          \
+                                _same_scale>;                                \
+        using Transpose_ =                                                   \
+                Translayout<_pack_oc, _pack_w, _src_c_type, dtype::_src_type,\
+                            dtype::_dst_type, _same_scale>;                  \
+        using RelayoutProblem_ =                                             \
+                RelayoutProblem<SrcIterator_, DstIterator_, Transpose_,      \
+                                CudaPostProcess_>;                           \
+        n_stride_src = n_stride_src * _size_nbits / (8 * sizeof(_src_c_type)); \
+        ic_stride = ic_stride * _size_nbits / (8 * sizeof(_src_c_type));     \
+        n_stride_dst = n_stride_dst * _size_nbits / (8 * sizeof(InnerDtype_)); \
+        oc_stride = oc_stride * _size_nbits / (8 * sizeof(InnerDtype_));     \
+        typename RelayoutProblem_::Param param{                              \
+                SrcIterator_{(_src_c_type*)src.raw_ptr, ic_stride, c, w,     \
+                             w_pad},                                         \
+                DstIterator_{(InnerDtype_*)dst.raw_ptr, oc_stride, c, w,     \
+                             w_pad},                                         \
+                CudaPostProcess_{src_scale, src_zero_point, dst_scale,       \
+                                 dst_zero_point},                            \
+                n_stride_src,                                                \
+                n_stride_dst,                                                \
+                n,                                                           \
+                c,                                                           \
+                hw};                                                         \
+        auto kernel = relayout_kern<RelayoutProblem_>;                       \
+        int nr_threads = query_blocksize_for_kernel(kernel);                 \
+        nr_threads = std::min(nr_threads, DIVUP(problem_size, _pack_w));     \
+        const dim3 block_dim(DIVUP(problem_size, nr_threads* _pack_w));      \
+        const dim3 thread_dim(nr_threads);                                   \
+        return kernel<<<block_dim, thread_dim, 0, stream>>>(param);          \
+    }
-#define DISPATCH_4BITS(_src_type, _dst_type)                              \
-    DISPATCH_RAW(true, 8, 64, _src_type, _dst_type, char, char, 4);       \
-    DISPATCH_RAW(false, 8, 64, _src_type, _dst_type, char, char, 4);      \
-    DISPATCH_RAW(true, 2, 64, _src_type, _dst_type, char, char, 4);       \
-    DISPATCH_RAW(false, 2, 64, _src_type, _dst_type, char, char, 4);
+#define DISPATCH_4BITS(_src_type, _dst_type)                               \
+    DISPATCH_RAW(true, true, 8, 64, _src_type, _dst_type, char, char, 4);  \
+    DISPATCH_RAW(true, false, 8, 64, _src_type, _dst_type, char, char, 4); \
+    DISPATCH_RAW(true, true, 2, 64, _src_type, _dst_type, char, char, 4);  \
+    DISPATCH_RAW(true, false, 2, 64, _src_type, _dst_type, char, char, 4); \
+    DISPATCH_RAW(false, true, 8, 64, _src_type, _dst_type, char, char, 4); \
+    DISPATCH_RAW(false, false, 8, 64, _src_type, _dst_type, char, char, 4);\
+    DISPATCH_RAW(false, true, 2, 64, _src_type, _dst_type, char, char, 4); \
+    DISPATCH_RAW(false, false, 2, 64, _src_type, _dst_type, char, char, 4);
DISPATCH_4BITS(QuantizedS4, QuantizedS4);
DISPATCH_4BITS(Quantized4Asymm, Quantized4Asymm);
#undef DISPATCH_4BITS
@@ -1327,6 +1645,7 @@ void relayout_format::relayout_format_cuda_nchwx_nchw(
megdnn_assert(false,
"Unsupported data type(src:%s, dst:%s) or image size(%dx%d).",
stype.name(), dtype.name(), h, w);
+    after_kernel_launch();
}

void relayout_format::relayout_format_cuda_nchw4_nchw(
@@ -1344,6 +1663,7 @@ void relayout_format::relayout_format_cuda_nchw4_nchw(
const dim3 thread_dim(nr_threads);
kern_nchw4_nchw<<<block_dim, thread_dim, 0, stream>>>(
(int8_t*)src.raw_ptr, (int8_t*)dst.raw_ptr, n, ic, oc, h, w, group);
+    after_kernel_launch();
}

void relayout_format::relayout_format_cuda_nchw_nchw4_weight(
@@ -1372,4 +1692,5 @@ void relayout_format::relayout_format_cuda_nchw_nchw4_weight(
(char*)src.raw_ptr, (char*)dst.raw_ptr, oc, ic, hw, oc_stride_src,
ic_stride, oc_stride_dst, group_stride_src, group_stride_dst, 0,
{});
+    after_kernel_launch();
}

dnn/test/common/benchmarker.h (+5, -4)

@@ -87,10 +87,11 @@ public:
for (size_t i = 0; i < shapes.size(); ++i) {
DType dt = (m_dtype.find(i) != m_dtype.end() ? m_dtype[i]
: dtype::Float32());
-            TensorFormat fmt = (m_fmt.find(i) != m_fmt.end()
-                                        ? m_fmt[i]
-                                        : DefaultTensorFormat::make());
-            layouts[i] = TensorLayout(shapes[i], dt, fmt);
+            if (m_fmt.find(i) == m_fmt.end()) {
+                layouts[i] = TensorLayout(shapes[i], dt);
+                layouts[i].init_contiguous_stride();
+            } else
+                layouts[i] = TensorLayout(shapes[i], dt, m_fmt[i]);
}
return layouts;
}


dnn/test/common/checker.cpp (+10, -64)

@@ -19,7 +19,6 @@ using namespace megdnn;
using namespace test;

namespace {

template<typename ctype, class Iter>
::testing::AssertionResult assert_tensor_eq_with_iter(
const char *expr0, const char *expr1,
@@ -30,7 +29,7 @@ namespace {
double error_sum = 0;
double error_sum_biased = 0;
for (size_t i = 0; i < nr_elem; ++ i) {
-            ctype iv0 = *it0, iv1 = *it1;
+            ctype iv0 = ctype(*it0), iv1 = ctype(*it1);
float err = diff(iv0, iv1);
error_sum += std::abs(err);
error_sum_biased += err;
@@ -84,12 +83,14 @@ namespace {
const char *expr0, const char *expr1,
const TensorND &v0, const TensorND &v1,
float maxerr, float maxerr_avg, float maxerr_avg_biased) {

-        if (v0.layout.is_physical_contiguous() &&
-            v1.layout.is_physical_contiguous()) {
-            return assert_tensor_eq_with_iter<ctype>(
-                    expr0, expr1, v0.ptr<ctype>(), v1.ptr<ctype>(), v0.layout,
-                    maxerr, maxerr_avg, maxerr_avg_biased);
+        if (!std::is_same<ctype, dt_qint4>::value &&
+            !std::is_same<ctype, dt_quint4>::value) {
+            if (v0.layout.is_physical_contiguous() &&
+                v1.layout.is_physical_contiguous()) {
+                return assert_tensor_eq_with_iter<ctype>(
+                        expr0, expr1, v0.ptr<ctype>(), v1.ptr<ctype>(),
+                        v0.layout, maxerr, maxerr_avg, maxerr_avg_biased);
+            }
        }

auto it0 = megdnn::tensor_iter_valonly<ctype>(v0).begin(),
@@ -100,56 +101,6 @@ namespace {
maxerr_avg_biased);
}

-    template <typename ITYPE>
-    ::testing::AssertionResult assert_tensor_eq_with_lowbit4(
-            const char* expr0, const char* expr1,
-            const TensorND& v0, const TensorND& v1,
-            float maxerr, float maxerr_avg) {
-        if (!v0.layout.eq_layout(v1.layout)) {
-            return ::testing::AssertionFailure()
-                   << "Layout mismatch for testing equality of lowbit4\n"
-                   << "Value of: " << expr1 << "\n"
-                   << "  Actual: " << v1.layout.TensorShape::to_string() << "\n"
-                   << "Expected: " << expr0 << "\n"
-                   << "Which is: " << v0.layout.TensorShape::to_string() << "\n";
-        }
-        auto v0_ptr = static_cast<ITYPE*>(v0.raw_ptr) - v0.layout.span().low_byte;
-        auto v1_ptr = static_cast<ITYPE*>(v1.raw_ptr) - v1.layout.span().low_byte;
-        double error_sum = 0;
-        for (size_t i = 0; i < v0.layout.span().dist_elem(); ++i) {
-            ITYPE iv0 = (v0_ptr[i / 2] << (i ^ 1) * 4);
-            iv0 = iv0 >> 4;
-            ITYPE iv1 = (v1_ptr[i / 2] << (i ^ 1) * 4);
-            iv1 = iv1 >> 4;
-
-            float err = std::abs(diff(iv0, iv1));
-            error_sum += err;
-            if (!good_float(iv0) || !good_float(iv1) || err >= maxerr) {
-                Index index(v0.layout, i);
-                return ::testing::AssertionFailure()
-                       << "Unequal value\n"
-                       << "Value of: " << expr1 << "\n"
-                       << "  Actual: " << (iv1 + 0) << "\n"
-                       << "Expected: " << expr0 << "\n"
-                       << "Which is: " << (iv0 + 0) << "\n"
-                       << "At index: "
-                       << index.to_string() << "/" << v0.layout.TensorShape::to_string() << "\n"
-                       << "   Dtype: " << v0.layout.dtype.name() << "\n"
-                       << "   error: " << err << "/" << maxerr;
-            }
-        }
-        float error_avg = error_sum / v0.layout.total_nr_elems();
-        if (error_avg > maxerr_avg) {
-            return ::testing::AssertionFailure()
-                   << "Average error too high\n"
-                   << "Value of: " << expr1 << "\n"
-                   << "Expected: " << expr0 << "\n"
-                   << "Average error: " << error_avg << "/" << maxerr_avg;
-        }
-
-        return ::testing::AssertionSuccess();
-    }

template<class Impl>
void memcpy_noncontig(
void *dst, const void *src, const TensorLayout &layout,
@@ -215,12 +166,7 @@ namespace {
//! In order to avoid an unnecessary increase in binary size, we just
//! use QuantizedS16 dtype in winograd_filter_preprocess now.
cb(::megdnn::dtype::QuantizedS16)
-        case DTypeTrait<dtype::Quantized4Asymm>::enumv:
-            return assert_tensor_eq_with_lowbit4<uint8_t>(expr0, expr1, v0, v1,
-                                                          maxerr, maxerr_avg);
-        case DTypeTrait<dtype::QuantizedS4>::enumv:
-            return assert_tensor_eq_with_lowbit4<int8_t>(expr0, expr1, v0, v1,
-                                                         maxerr, maxerr_avg);
+        MEGDNN_FOREACH_QUANTIZED_LOWBIT_DTYPE(cb)
#undef cb
default:
megdnn_trap();


dnn/test/common/utils.h (+17, -0)

@@ -228,6 +228,14 @@ static inline int diff(dt_qint8 x, dt_qint8 y) {
return x.as_int8() - y.as_int8();
}

+static inline int diff(dt_qint4 x, dt_qint4 y) {
+    return x.as_int8() - y.as_int8();
+}
+
+static inline int diff(dt_quint4 x, dt_quint4 y) {
+    return x.as_uint8() - y.as_uint8();
+}

inline TensorShape cvt_src_or_dst_nchw2nhwc(const TensorShape& shape) {
megdnn_assert(shape.ndim == 4);
auto N = shape[0], C = shape[1], H = shape[2], W = shape[3];
@@ -356,6 +364,15 @@ static inline int operator+(dt_qint16 lhs, int rhs) {
return lhs.as_int16();
}

+static inline int operator+(dt_quint4 lhs, int rhs) {
+    megdnn_assert(rhs == 0, "unexpected rhs");
+    return lhs.as_uint8();
+}
+
+static inline int operator+(dt_qint4 lhs, int rhs) {
+    megdnn_assert(rhs == 0, "unexpected rhs");
+    return lhs.as_int8();
+}
} // namespace test

static inline bool operator==(const TensorLayout& a, const TensorLayout& b) {


dnn/test/cuda/relayout_format.cpp (+44, -3)

@@ -11,13 +11,14 @@
*/
#include "megdnn/dtype.h"
#include "megdnn/oprs.h"
#include "test/common/benchmarker.h"
#include "test/cuda/benchmark.h"
#include "test/common/checker.h"
#include "test/common/rng.h"
#include "test/cuda/fixture.h"

using namespace megdnn;
using namespace test;
#define MEGDNN_WITH_BENCHMARK 1

TEST_F(CUDA, RELAYOUT_FORMAT) {
Checker<RelayoutFormat> checker(handle_cuda());
@@ -246,7 +247,7 @@ TEST_F(CUDA, RELAYOUT_FORMAT_NCHW_NCHW64) {
for (size_t n : {1, 3}) {
for (size_t c : {64, 128}) {
for (size_t h : {7, 14, 16, 28}) {
-                for (size_t w : {2, 4, 14, 16}) {
+                for (size_t w : {2, 3, 7, 8, 16, 31}) {
checker.set_dtype(0, dtype::QuantizedS4{2.f})
.set_dtype(1, dtype::QuantizedS4{2.f})
.set_rng(0, &s4)
@@ -286,7 +287,7 @@ TEST_F(CUDA, RELAYOUT_FORMAT_NCHW64_NCHW) {
for (size_t n : {1, 3}) {
for (size_t c : {64, 128}) {
for (size_t h : {7, 14, 16, 28}) {
-                for (size_t w : {2, 4, 14, 16}) {
+                for (size_t w : {2, 3, 4, 7, 14, 16, 17}) {
checker.set_dtype(0, dtype::QuantizedS4{2.f})
.set_dtype(1, dtype::QuantizedS4{2.f})
.set_rng(0, &s4)
@@ -366,6 +367,46 @@ TEST_F(CUDA, BENCHMARK_RELAYOUT_FORMAT) {
run(shapes, param, default_param);
}
}

+TEST_F(CUDA, BENCHMARK_RELAYOUT_FORMAT_QS4) {
+    using Param = RelayoutFormat::Param;
+
+    auto run = [&](const TensorShapeArray& shapes, Param param) {
+        CUBenchmarker<RelayoutFormat> benchmarker(handle_cuda());
+        benchmarker.set_param(param);
+        benchmarker.set_dtype(0, dtype::QuantizedS4{1.19990307f})
+                .set_dtype(1, dtype::QuantizedS4{1.20210322f});
+
+        for (auto&& shape : shapes) {
+            double memaccess = double(shape.total_nr_elems()) * 1e-6;
+            auto time_ms = benchmarker.execs({shape, {}});
+            printf("execute %s, time %.4f ms, %.4f GB/s\n",
+                   shape.to_string().c_str(), time_ms, memaccess / time_ms);
+        }
+    };
+
+    {
+        TensorShapeArray shapes = {
+                {1, 64, 56, 56},  {16, 64, 56, 56}, {64, 64, 56, 56},
+                {1, 64, 56, 55},  {16, 64, 56, 55}, {64, 64, 56, 55},
+        };
+        Param param;
+        param.mode = param::RelayoutFormat::Mode::NCHW_NCHW64;
+        run(shapes, param);
+    }
+    {
+        TensorShapeArray shapes = {
+                {64, 1, 56, 56, 64},
+                {1, 32, 7, 7, 64},
+                {16, 32, 7, 7, 64},
+                {64, 32, 7, 7, 64},
+        };
+        Param param;
+        param.mode = param::RelayoutFormat::Mode::NCHW64_NCHW;
+        run(shapes, param);
+    }
+}

#endif

TEST_F(CUDA, RELAYOUT_FORMAT_NCHW4) {

