feat(dnn/cuda): add relayout format when width is an odd number

GitOrigin-RevId: f059f1f56d
Branch: release-1.5
Author: Megvii Engine Team, 4 years ago
Commit: 8fef78d06d
6 changed files with 512 additions and 186 deletions

1. dnn/src/common/relayout_format.cpp (+1, -1)
2. dnn/src/cuda/relayout_format/relayout_format.cu (+435, -114)
3. dnn/test/common/benchmarker.h (+5, -4)
4. dnn/test/common/checker.cpp (+10, -64)
5. dnn/test/common/utils.h (+17, -0)
6. dnn/test/cuda/relayout_format.cpp (+44, -3)
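
The commit targets 4-bit quantized data (QuantizedS4/Quantized4Asymm), which packs two elements per byte, so a row of odd width would end in the middle of a byte. The new code path pads each row to an even width with `w_pad = DIVUP(w, 2) * 2` and masks off accesses to the padded tail column. A minimal standalone sketch of that padding arithmetic (div_up is a stand-in for the codebase's DIVUP macro):

#include <cstdio>

// Stand-in for the codebase's DIVUP macro.
constexpr int div_up(int a, int b) { return (a + b - 1) / b; }

int main() {
    for (int w : {2, 3, 7, 8, 31}) {
        int w_pad = div_up(w, 2) * 2;   // same arithmetic as DIVUP(w, 2) * 2
        int row_bytes = w_pad * 4 / 8;  // 4-bit elements, two per byte
        std::printf("w=%2d -> w_pad=%2d (%2d bytes per packed row)\n",
                    w, w_pad, row_bytes);
    }
    return 0;
}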

dnn/src/common/relayout_format.cpp (+1, -1)

@@ -380,7 +380,7 @@ void RelayoutFormat::deduce_format(TensorFormat src, TensorFormat& dst) {
            break;
    }

-   if (!dst.is_default() &&
+   if (dst.type() == TensorFormat::Type::IMAGE2D_PACK4 &&
        (handle()->type() != Handle::HandleType::NAIVE)) {
#if MEGDNN_ENABLE_MANGLING


dnn/src/cuda/relayout_format/relayout_format.cu (+435, -114)

@@ -10,10 +10,10 @@
* implied.
*/

#include <stdint.h>
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "cutlass/fast_math.h"
#include "cutlass/arch/memory.h"
#pragma GCC diagnostic pop
#include "src/cuda/query_blocksize.cuh"
@@ -112,6 +112,8 @@ struct CudaPostProcess<dtype::QuantizedS32, dtype::QuantizedS32, true> {

template <>
struct CudaPostProcess<dtype::QuantizedS4, dtype::QuantizedS4, false> {
+    using SrcType = dtype::QuantizedS4;
+    using DstType = dtype::QuantizedS4;
    CudaDTypeParamImpl<dt_qint4> m_dst_type_cvt;
    CudaDTypeParamImpl<dt_qint4> m_src_type_cvt;
    CudaPostProcess(float src_scale, uint8_t, float dst_scale, uint8_t) {
@@ -126,12 +128,16 @@ struct CudaPostProcess<dtype::QuantizedS4, dtype::QuantizedS4, false> {

template <>
struct CudaPostProcess<dtype::QuantizedS4, dtype::QuantizedS4, true> {
+    using SrcType = dtype::QuantizedS4;
+    using DstType = dtype::QuantizedS4;
    CudaPostProcess(float, uint8_t, float, uint8_t){};
    inline __device__ int8_t operator()(int8_t val) { return val; }
};

template <>
struct CudaPostProcess<dtype::Quantized4Asymm, dtype::Quantized4Asymm, false> {
+    using SrcType = dtype::Quantized4Asymm;
+    using DstType = dtype::Quantized4Asymm;
    CudaDTypeParamImpl<dt_quint4> m_dst_type_cvt;
    CudaDTypeParamImpl<dt_quint4> m_src_type_cvt;
    CudaPostProcess(float src_scale, uint8_t src_zero_point, float dst_scale,
@@ -149,6 +155,8 @@ struct CudaPostProcess<dtype::Quantized4Asymm, dtype::Quantized4Asymm, false> {

template <>
struct CudaPostProcess<dtype::Quantized4Asymm, dtype::Quantized4Asymm, true> {
+    using SrcType = dtype::Quantized4Asymm;
+    using DstType = dtype::Quantized4Asymm;
    uint8_t m_src_zero_point = 0;
    uint8_t m_dst_zero_point = 0;
    CudaPostProcess(float, uint8_t src_zero_point, float,
@@ -328,13 +336,20 @@ struct Translayout<2, 64, SrcType, dtype::QuantizedS4, dtype::QuantizedS4,
            unpack_int4x2(6)
            unpack_int4x2(7)
            // clang-format on
            int frag_idx = i / 8;
            dst_frag[0 * 8 + frag_idx] = pack_channel(0);
            dst_frag[1 * 8 + frag_idx] = pack_channel(1);
#undef unpack_int4x2
        }
    }
+    using Fragment = array_wrapper<SrcType, 64>;
+    static inline __device__ void trans(
+            Fragment& dst, Fragment& src,
+            CudaPostProcess<DnnSrcType, DnnDstType, same_scale>& post_process) {
+        trans(reinterpret_cast<DstDtype(&)[2]>(dst),
+              reinterpret_cast<InnerDtype(&)[64]>(src), post_process, 0);
+    }
};

template <typename SrcType, bool same_scale>
@@ -375,6 +390,13 @@ struct Translayout<8, 64, SrcType, dtype::QuantizedS4, dtype::QuantizedS4,
            dst_frag[7 * 8 + frag_idx] = pack_channel(7);
        }
    }
+    using Fragment = array_wrapper<unsigned, 64>;
+    static inline __device__ void trans(
+            Fragment& dst, Fragment& src,
+            CudaPostProcess<DnnSrcType, DnnDstType, same_scale>& post_process) {
+        trans(reinterpret_cast<DstDtype(&)[8]>(dst),
+              reinterpret_cast<InnerDtype(&)[64]>(src), post_process, 0);
+    }
};
#undef pack_channel

@@ -428,6 +450,13 @@ struct Translayout<2, 64, SrcType, dtype::Quantized4Asymm,
#undef unpack_int4x2
        }
    }
+    using Fragment = array_wrapper<SrcType, 64>;
+    static inline __device__ void trans(
+            Fragment& dst, Fragment& src,
+            CudaPostProcess<DnnSrcType, DnnDstType, same_scale>& post_process) {
+        trans(reinterpret_cast<DstDtype(&)[2]>(dst),
+              reinterpret_cast<InnerDtype(&)[64]>(src), post_process, 0);
+    }
};

template <typename SrcType, bool same_scale>
@@ -468,6 +497,13 @@ struct Translayout<8, 64, SrcType, dtype::Quantized4Asymm,
            dst_frag[7 * 8 + frag_idx] = pack_channel(7);
        }
    }
+    using Fragment = array_wrapper<unsigned, 64>;
+    static inline __device__ void trans(
+            Fragment& dst, Fragment& src,
+            CudaPostProcess<DnnSrcType, DnnDstType, same_scale>& post_process) {
+        trans(reinterpret_cast<DstDtype(&)[8]>(dst),
+              reinterpret_cast<InnerDtype(&)[64]>(src), post_process, 0);
+    }
};
#undef pack_channel
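
Each Translayout specialization gains a `trans(Fragment&, Fragment&, post_process)` overload that forwards to the existing array-reference implementation through a reinterpret_cast to a reference-to-array type, giving the new relayout kernel a uniform Fragment-based entry point. A simplified sketch of that adapter pattern (the wrapper and trans_impl below are stand-ins, not the real types):

#include <cstdio>

// Simplified stand-in for megdnn's array_wrapper.
template <typename T, int N>
struct array_wrapper {
    T data[N];
};

// "Legacy" interface working on references to raw arrays, like the existing
// trans(DstDtype(&)[2], InnerDtype(&)[64], ...) overloads.
void trans_impl(int (&dst)[2], const int (&src)[2]) {
    dst[0] = src[1];
    dst[1] = src[0];
}

// New-style overload on the wrapper type, forwarding via reinterpret_cast,
// mirroring the trans(Fragment&, Fragment&, ...) overloads in the diff.
void trans(array_wrapper<int, 2>& dst, const array_wrapper<int, 2>& src) {
    trans_impl(reinterpret_cast<int(&)[2]>(dst),
               reinterpret_cast<const int(&)[2]>(src));
}

int main() {
    array_wrapper<int, 2> src{{1, 2}}, dst{};
    trans(dst, src);
    std::printf("%d %d\n", dst.data[0], dst.data[1]);  // prints: 2 1
    return 0;
}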

@@ -1028,11 +1064,21 @@ public:
            : pointer{nullptr}, chan_stride_in_elements{0}, channel{0} {}
    MEGDNN_DEVICE TensorIteratorOverChannel(Type* pointer_,
                                            int chan_stride_in_elements_,
-                                           int channel_)
+                                           int channel_, int, int)
            : pointer{pointer_},
              chan_stride_in_elements{chan_stride_in_elements_},
              channel{channel_} {}

+    MEGDNN_DEVICE __forceinline__ void initialize(int c_idx, int hw_idx) {
+        pointer += (c_idx / pack_size) * chan_stride_in_elements +
+                   hw_idx * pack_size * size_nbits / (8 * sizeof(Type));
+    }
+
+    MEGDNN_DEVICE __forceinline__ void add_pointer_offset(
+            size_t offset_in_type) {
+        pointer += offset_in_type;
+    }

MEGDNN_DEVICE __forceinline__ void load(Fragment& frag) {
AccessType* frag_ptr = reinterpret_cast<AccessType*>(&frag);
Type* pointer_ = pointer;
@@ -1087,64 +1133,224 @@ private:
int channel;
};

-template <int pack_w, int pack_c, bool same_scale, typename SrcType,
-          typename DstType, typename DnnSrcType, typename DnnDstType,
-          int size_nbits = 8>
-__global__ void kern_nchwx_nchw(
-        const SrcType* src, DstType* dst, int ic, int ihw, int n_stride_src,
-        int ic_stride, int n_stride_dst, int oc_stride,
-        CudaPostProcess<DnnSrcType, DnnDstType, same_scale> post_process,
-        const char zero_point) {
-    using InnerDtype =
-            typename DTypeRWHelper<typename DTypeTrait<DnnSrcType>::ctype,
-                                   pack_w>::InnerDtype;
-    using SrcIterator = TensorIteratorOverChannel<SrcType, pack_c, pack_c,
-                                                  pack_w, size_nbits>;
-    using DstIteraotr = TensorIteratorOverChannel<InnerDtype, 1, pack_c, pack_w,
-                                                  size_nbits>;
-    using Transpose = Translayout<pack_c, pack_w, SrcType, DnnSrcType,
-                                  DnnDstType, same_scale>;
-    static constexpr int size_src_type = sizeof(SrcType);
-    static constexpr int size_dst_type = sizeof(DstType);
-    MEGDNN_STATIC_ASSERT(std::is_same<SrcType MEGDNN_COMMA DstType>::value,
-                         "Currently this kernel only support accessing tensor "
-                         "src and dst in same data type.");
-    n_stride_src /= size_src_type;
-    ic_stride /= size_src_type;
-    n_stride_dst /= size_dst_type;
-    oc_stride /= size_dst_type;
-#undef MEGDNN_COMMA
+template <typename Type_, int pack_size_, int chan_blk_, int width_,
+          int size_nbits_>
+class MaskedTensorIteratorOverChannel {
+public:
+    using Type = Type_;
+    static constexpr int pack_size = pack_size_;
+    static constexpr int chan_blk = chan_blk_;
+    static constexpr int width = width_;
+    static constexpr int size_nbits = size_nbits_;
+    static constexpr int elements_in_type =
+            chan_blk * width * size_nbits / (8 * sizeof(Type));
+    static constexpr int lane_size_in_type =
+            (width * pack_size * size_nbits) / (8 * sizeof(Type));
+    static constexpr int pack_size_in_type =
+            (pack_size * size_nbits) >= (8 * sizeof(Type))
+                    ? (pack_size * size_nbits / (8 * sizeof(Type)))
+                    : (width * pack_size * size_nbits / (8 * sizeof(Type)));
+    static constexpr int pack_size_in_byte = pack_size_in_type * sizeof(Type);
+    static constexpr int accesses = elements_in_type / pack_size_in_type;
+    static constexpr int mask_size = (accesses + 32 - 1) / 32;
+    using AccessType = array_wrapper<Type, pack_size_in_type>;
+    using Fragment = array_wrapper<Type, elements_in_type>;

-    const int n_idx = blockIdx.y;
-    const int ihw_block_idx = blockIdx.x * blockDim.x + threadIdx.x;
-    const int ihw_offset = ihw_block_idx * pack_w;
-    const int ihw_offset_in_type =
-            ihw_offset * size_nbits / (8 * size_src_type);
-    const int oc_stride_inner_dtype =
-            oc_stride * size_dst_type / sizeof(InnerDtype);
-    if (ihw_offset < ihw) {
-        const int ic_block = (ic + pack_c - 1) / pack_c;
-        const int src_offset_base =
-                n_idx * n_stride_src + ihw_offset_in_type * pack_c;
-        const int dst_offset_base = n_idx * n_stride_dst + ihw_offset_in_type;
-        SrcIterator src_iterator{const_cast<SrcType*>(src + src_offset_base),
-                                 ic_stride, ic};
-        DstIteraotr dst_iterator{
-                reinterpret_cast<InnerDtype*>(dst + dst_offset_base),
-                oc_stride_inner_dtype, ic};
-
-        for (int ic_blk_idx = 0; ic_blk_idx < ic_block; ++ic_blk_idx) {
-            typename SrcIterator::Fragment src_frag;
-            typename DstIteraotr::Fragment dst_frag;
-            src_iterator.load(src_frag);
-            Transpose::trans(
-                    reinterpret_cast<typename SrcIterator::Fragment&>(dst_frag),
-                    src_frag, post_process);
-            dst_iterator.store(dst_frag);
-            src_iterator.advance();
-            dst_iterator.advance();
+    MEGDNN_HOST MEGDNN_DEVICE MaskedTensorIteratorOverChannel()
+            : pointer{nullptr},
+              chan_stride_in_elements{0},
+              channel{0} {}
+    MEGDNN_HOST MEGDNN_DEVICE MaskedTensorIteratorOverChannel(
+            Type* pointer_, int chan_stride_in_elements_, int channel_,
+            int bound_, int div_)
+            : pointer{pointer_},
+              chan_stride_in_elements{chan_stride_in_elements_},
+              channel{channel_},
+              bound{bound_},
+              div{div_} {
+        cutlass::find_divisor(mul, shr, div);
+    }
+
+    MEGDNN_DEVICE __forceinline__ void initialize(int c_idx, int hw_idx) {
+        pointer += (c_idx / pack_size) * chan_stride_in_elements;
+#pragma unroll
+        for (int i = 0; i < mask_size; ++i) {
+            mask[i] = 0;
+        }
+#pragma unroll
+        for (int i = 0; i < chan_blk; i += pack_size) {
+#pragma unroll
+            for (int j = 0; j < lane_size_in_type / pack_size_in_type; j++) {
+                int offset = hw_idx + j;
+                int h, w;
+                cutlass::fast_divmod(h, w, offset, div, mul, shr);
+                bool guard = (i < channel) && (w < bound);
+                int index = (i / pack_size) *
+                                    (lane_size_in_type / pack_size_in_type) +
+                            j;
+                int mask_index = (index >> 5);
+                int mask_shift = (index & 0x1f);
+                mask[mask_index] |= (guard << mask_shift);
+                stride[j] = (h * bound + w) * pack_size * size_nbits /
+                            (8 * sizeof(Type));
+            }
+        }
+    }
+
+    MEGDNN_DEVICE __forceinline__ void add_pointer_offset(size_t offset_in_type) {
+        pointer += offset_in_type;
+    }
+
+    MEGDNN_DEVICE __forceinline__ void load(Fragment& frag) {
+        AccessType* frag_ptr = reinterpret_cast<AccessType*>(&frag);
+        Type* pointer_ = pointer;
+#pragma unroll
+        for (int i = 0; i < chan_blk; i += pack_size) {
+#pragma unroll
+            for (int j = 0; j < lane_size_in_type / pack_size_in_type; j++) {
+                int frag_idx = i / pack_size *
+                                       (lane_size_in_type / pack_size_in_type) +
+                               j;
+                int mask_index = (frag_idx >> 5);
+                int mask_shift = (frag_idx & 0x1f);
+                bool guard = (mask[mask_index] & (1 << mask_shift));
+                cutlass::arch::global_load<AccessType, pack_size_in_byte>(
+                        frag_ptr[frag_idx],
+                        reinterpret_cast<void*>(pointer_ + stride[j]), guard);
+            }
+            pointer_ += chan_stride_in_elements;
+        }
+    }
+
+    MEGDNN_DEVICE __forceinline__ void store(const Fragment& frag) {
+        const AccessType* frag_ptr = reinterpret_cast<const AccessType*>(&frag);
+        Type* pointer_ = pointer;
+#pragma unroll
+        for (int i = 0; i < chan_blk; i += pack_size) {
+#pragma unroll
+            for (int j = 0; j < lane_size_in_type / pack_size_in_type; j++) {
+                int frag_idx = i / pack_size *
+                                       (lane_size_in_type / pack_size_in_type) +
+                               j;
+                int mask_index = (frag_idx >> 5);
+                int mask_shift = (frag_idx & 0x1f);
+                bool guard = (mask[mask_index] & (1 << mask_shift));
+                cutlass::arch::global_store<AccessType, pack_size_in_byte>(
+                        frag_ptr[frag_idx],
+                        reinterpret_cast<void*>(pointer_ + stride[j]), guard);
+            }
+            pointer_ += chan_stride_in_elements;
+        }
+    }
+
+    MEGDNN_DEVICE __forceinline__ void advance() {
+        pointer += (chan_blk / pack_size) * chan_stride_in_elements;
+        channel -= chan_blk;
+    }
+
+private:
+    Type* pointer;
+    int chan_stride_in_elements;
+    int channel;
+    int bound;
+    int div;
+    uint32_t mul;
+    uint32_t shr;
+    uint32_t mask[mask_size];
+    size_t stride[accesses];
+};
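
MaskedTensorIteratorOverChannel precomputes, in initialize(), one predicate bit per memory access: the linear offset is decomposed by the padded row width with cutlass::fast_divmod, and the bit is set only when the column falls inside the real width (`bound`) and the channel is in range; load() and store() then feed each bit as the guard of cutlass::arch::global_load/global_store, so out-of-bounds lanes are skipped rather than branched over. A host-side sketch of the bit packing under small, assumed sizes:

#include <cstdint>
#include <cstdio>

int main() {
    // Assumed toy sizes; the real iterator derives these from template params.
    const int w = 3, w_pad = 4;       // odd width padded to even
    const int chan_blk = 8, pack_size = 4;
    const int channel = 6;            // valid channels in this block
    const int accesses_per_lane = 4;  // stand-in for lane_size_in_type / pack_size_in_type
    const int hw_idx = 0;
    uint32_t mask = 0;                // one predicate bit per access

    for (int i = 0; i < chan_blk; i += pack_size) {
        for (int j = 0; j < accesses_per_lane; ++j) {
            int offset = hw_idx + j;
            int col = offset % w_pad;  // the kernel uses cutlass::fast_divmod here
            bool guard = (i < channel) && (col < w);
            int index = (i / pack_size) * accesses_per_lane + j;
            mask |= (uint32_t(guard) << (index & 0x1f));
        }
    }
    // Bits 3 and 7 stay clear: they address the padded tail column.
    std::printf("mask = 0x%02x\n", unsigned(mask));  // prints mask = 0x77
    return 0;
}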

+template <bool padding_, typename Type_, int pack_size_, int chan_blk_,
+          int width_, int size_nbits_>
+struct TensorIteratorPolicy;
+template <typename Type_, int pack_size_, int chan_blk_, int width_,
+          int size_nbits_>
+struct TensorIteratorPolicy<true, Type_, pack_size_, chan_blk_, width_,
+                            size_nbits_> {
+    using TensorIterator =
+            MaskedTensorIteratorOverChannel<Type_, pack_size_, chan_blk_,
+                                            width_, size_nbits_>;
+};
+template <typename Type_, int pack_size_, int chan_blk_, int width_,
+          int size_nbits_>
+struct TensorIteratorPolicy<false, Type_, pack_size_, chan_blk_, width_,
+                            size_nbits_> {
+    using TensorIterator =
+            TensorIteratorOverChannel<Type_, pack_size_, chan_blk_, width_,
+                                      size_nbits_>;
+};
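
TensorIteratorPolicy selects between the two iterators at compile time, so even-width tensors keep the cheaper unpredicated TensorIteratorOverChannel path and only odd widths pay for masking. A minimal sketch of the same bool-parameter specialization pattern (with stand-in iterator types):

#include <cstdio>

// Stand-ins; the real types are TensorIteratorOverChannel and
// MaskedTensorIteratorOverChannel.
struct PlainIterator {
    static const char* name() { return "plain (no mask)"; }
};
struct MaskedIterator {
    static const char* name() { return "masked (predicated)"; }
};

// Same shape as the diff's TensorIteratorPolicy: a bool template parameter
// picks the iterator type at compile time.
template <bool padding>
struct IteratorPolicy {
    using TensorIterator = PlainIterator;
};
template <>
struct IteratorPolicy<true> {
    using TensorIterator = MaskedIterator;
};

int main() {
    // Mirrors `bool padding = w % 2 != 0;`, lifted into a template argument
    // by the DISPATCH_RAW macro expansion.
    std::printf("w even -> %s\n", IteratorPolicy<false>::TensorIterator::name());
    std::printf("w odd  -> %s\n", IteratorPolicy<true>::TensorIterator::name());
    return 0;
}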

+template <typename SrcIterator_, typename DstIterator_, typename Transpose_,
+          typename CudaPostProcess_>
+struct RelayoutProblem {
+    using SrcIterator = SrcIterator_;
+    using DstIterator = DstIterator_;
+    using Transpose = Transpose_;
+    using CudaPostProcess = CudaPostProcess_;
+    MEGDNN_STATIC_ASSERT(SrcIterator::chan_blk == DstIterator::chan_blk,
+                         "channel block mismatch");
+    MEGDNN_STATIC_ASSERT(SrcIterator::width == DstIterator::width,
+                         "width block mismatch");
+    MEGDNN_STATIC_ASSERT(SrcIterator::size_nbits == DstIterator::size_nbits,
+                         "size in bits of elements mismatch");
+    static constexpr int pack_chan = SrcIterator::chan_blk;
+    static constexpr int pack_width = SrcIterator::width;
+    using DnnSrcType = typename CudaPostProcess::SrcType;
+    using DnnDstType = typename CudaPostProcess::DstType;
+    struct Param {
+        SrcIterator src_iterator;
+        DstIterator dst_iterator;
+        CudaPostProcess post_process;
+        int n_stride_src;
+        int n_stride_dst;
+        int batch_size;
+        int channels;
+        int hw;
+        MEGDNN_HOST MEGDNN_DEVICE Param(SrcIterator src_iterator_,
+                                        DstIterator dst_iterator_,
+                                        CudaPostProcess post_process_,
+                                        int n_stride_src_, int n_stride_dst_,
+                                        int batch_size_, int channels_, int hw_)
+                : src_iterator{src_iterator_},
+                  dst_iterator{dst_iterator_},
+                  post_process{post_process_},
+                  n_stride_src{n_stride_src_},
+                  n_stride_dst{n_stride_dst_},
+                  batch_size{batch_size_},
+                  channels{channels_},
+                  hw{hw_} {}
+    };
+};
+
+template <typename RelayoutProblem_>
+__global__ void relayout_kern(typename RelayoutProblem_::Param param) {
+    using SrcIterator = typename RelayoutProblem_::SrcIterator;
+    using DstIterator = typename RelayoutProblem_::DstIterator;
+    static constexpr int pack_chan = RelayoutProblem_::pack_chan;
+    static constexpr int pack_width = RelayoutProblem_::pack_width;
+    const int thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    const int thread_offset = thread_idx * pack_width;
+    const int hw_idx = (thread_offset % param.hw);
+    const int nc_blks = thread_offset / param.hw;
+    const int c_blks = (param.channels + pack_chan - 1) / pack_chan;
+    const int n_idx = nc_blks / c_blks;
+    const int c_blk_idx = nc_blks % c_blks;
+    const int c_idx = c_blk_idx * pack_chan;
+    if (n_idx < param.batch_size) {
+        const int src_offset = n_idx * param.n_stride_src;
+        const int dst_offset = n_idx * param.n_stride_dst;
+        param.src_iterator.add_pointer_offset(src_offset);
+        param.dst_iterator.add_pointer_offset(dst_offset);
+        param.src_iterator.initialize(c_idx, hw_idx);
+        param.dst_iterator.initialize(c_idx, hw_idx);
+        typename SrcIterator::Fragment src_frag;
+        typename DstIterator::Fragment dst_frag;
+        param.src_iterator.load(src_frag);
+        RelayoutProblem_::Transpose::trans(
+                reinterpret_cast<typename SrcIterator::Fragment&>(dst_frag),
+                src_frag, param.post_process);
+        param.dst_iterator.store(dst_frag);
+    }
+}
} // namespace
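
relayout_kern flattens the whole problem into a 1-D grid: each thread owns pack_width pixels, and its linear offset is decomposed into batch, channel-block, and intra-image coordinates with plain div/mod. A small worked example of that decomposition (shapes assumed for illustration):

#include <cstdio>

int main() {
    // Assumed toy sizes; in the kernel these come from the problem Param.
    const int pack_chan = 64, pack_width = 8;
    const int channels = 128, hw = 56 * 56, batch_size = 2;
    const int c_blks = (channels + pack_chan - 1) / pack_chan;  // 2

    int thread_idx = 1000;                        // one thread of the grid
    int thread_offset = thread_idx * pack_width;  // 8000
    int hw_idx = thread_offset % hw;              // 1728
    int nc_blks = thread_offset / hw;             // 2
    int n_idx = nc_blks / c_blks;                 // 1
    int c_blk_idx = nc_blks % c_blks;             // 0
    int c_idx = c_blk_idx * pack_chan;            // 0
    if (n_idx < batch_size)
        std::printf("n=%d c_idx=%d hw_idx=%d\n", n_idx, c_idx, hw_idx);
    return 0;
}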

@@ -1175,21 +1381,23 @@ void relayout_format::relayout_format_cuda_nchw_nchwx(
"Unsupport pack size(pack_oc:%d, src:%s, dst:%s)", pack_oc,
stype.name(), dtype.name());
#undef DEF
-    const int in_n = src.layout[0];
-    const int out_n = dst.layout[0];
-    const int ic = src.layout[1];
-    const int h = src.layout[2];
-    const int w = src.layout[3];
-    const int oc = dst.layout[1] * pack_oc;
-    const int hw = h * w;
-    const int ocpg = oc / group;
-    // stride in byte
-    const int n_stride_src = src_layout.dtype.size(src_layout.stride[0]);
-    const int ic_stride = src_layout.dtype.size(src_layout.stride[1]);
-    const int n_stride_dst = dst_layout.dtype.size(dst_layout.stride[0]);
-    const int oc_stride = dst_layout.dtype.size(dst_layout.stride[1]);
+    // no padding
+    if (src.layout.stride[2] == static_cast<ptrdiff_t>(src.layout[3])) {
+        const int in_n = src.layout[0];
+        const int out_n = dst.layout[0];
+        const int ic = src.layout[1];
+        const int h = src.layout[2];
+        const int w = src.layout[3];
+        const int oc = dst.layout[1] * pack_oc;
+        const int hw = h * w;
+        const int ocpg = oc / group;
+        // stride in byte
+        const int n_stride_src = src_layout.dtype.size(src_layout.stride[0]);
+        const int ic_stride = src_layout.dtype.size(src_layout.stride[1]);
+        const int n_stride_dst = dst_layout.dtype.size(dst_layout.stride[0]);
+        const int oc_stride = dst_layout.dtype.size(dst_layout.stride[1]);

-    bool same_scale = src_scale == dst_scale;
+        bool same_scale = src_scale == dst_scale;
#define DISPATCH_RAW(_same_scale, _pack_w, _pack_oc, _src_type, _dst_type, \
_src_c_type, _dst_c_type, _size_nbits) \
if (same_scale == _same_scale && hw % _pack_w == 0 && \
@@ -1225,19 +1433,95 @@ void relayout_format::relayout_format_cuda_nchw_nchwx(
DISPATCH_RAW(false, 8, 64, _src_type, _dst_type, char, char, 4); \
DISPATCH_RAW(true, 2, 64, _src_type, _dst_type, char, char, 4); \
DISPATCH_RAW(false, 2, 64, _src_type, _dst_type, char, char, 4);
-    DISPATCH_INT(QuantizedS32, QuantizedS32);
-    DISPATCH_BYTE(Uint8, QuantizedS8);
-    DISPATCH_BYTE(Quantized8Asymm, QuantizedS8);
-    DISPATCH_BYTE(QuantizedS8, QuantizedS8);
-    DISPATCH_4BITS(QuantizedS4, QuantizedS4);
-    DISPATCH_4BITS(Quantized4Asymm, Quantized4Asymm);
+        DISPATCH_INT(QuantizedS32, QuantizedS32);
+        DISPATCH_BYTE(Uint8, QuantizedS8);
+        DISPATCH_BYTE(Quantized8Asymm, QuantizedS8);
+        DISPATCH_BYTE(QuantizedS8, QuantizedS8);
+        DISPATCH_4BITS(QuantizedS4, QuantizedS4);
+        DISPATCH_4BITS(Quantized4Asymm, Quantized4Asymm);
#undef DISPATCH_4BITS
#undef DISPATCH_BYTE
#undef DISPATCH_INT
#undef DISPATCH_RAW
-    megdnn_assert(false,
-                  "Unsupported data type(src:%s, dst:%s) or image size(%dx%d).",
-                  stype.name(), dtype.name(), h, w);
+        megdnn_assert(
+                false,
+                "Unsupported data type(src:%s, dst:%s) or image size(%dx%d).",
+                stype.name(), dtype.name(), h, w);
+    } else {
+        megdnn_assert(src_layout.dtype.is_low_bit());
+        int n = src.layout[0];
+        int c = src.layout[1];
+        int h = src.layout[2];
+        // align to byte
+        int w = src.layout[3];
+        int w_pad = DIVUP(w, 2) * 2;
+        int hw = h * w_pad;
+        int n_stride_src = src_layout.stride[0];
+        int ic_stride = src_layout.stride[1];
+        int n_stride_dst = dst_layout.stride[0];
+        int oc_stride = dst_layout.stride[1];
+        int problem_size = n * (c / pack_oc) * hw;
+        bool same_scale = src_scale == dst_scale;
+#define DISPATCH_RAW(_same_scale, _pack_w, _pack_oc, _src_type, _dst_type,   \
+                     _src_c_type, _dst_c_type, _size_nbits)                  \
+    if (same_scale == _same_scale && hw % _pack_w == 0 &&                    \
+        stype.enumv().ev == DTypeEnum::Ev::_src_type &&                      \
+        dtype.enumv().ev == DTypeEnum::Ev::_dst_type) {                      \
+        using InnerDtype_ = typename DTypeRWHelper<                          \
+                typename DTypeTrait<dtype::_src_type>::ctype,                \
+                _pack_w>::InnerDtype;                                        \
+        using SrcIterator_ =                                                 \
+                TensorIteratorOverChannel<InnerDtype_, 1, _pack_oc, _pack_w, \
+                                          _size_nbits>;                      \
+        using DstIterator_ = MaskedTensorIteratorOverChannel<                \
+                _dst_c_type, _pack_oc, _pack_oc, _pack_w, _size_nbits>;      \
+        using CudaPostProcess_ =                                             \
+                CudaPostProcess<dtype::_src_type, dtype::_dst_type,          \
+                                _same_scale>;                                \
+        using Transpose_ =                                                   \
+                Translayout<_pack_w, _pack_oc, _src_c_type, dtype::_src_type,\
+                            dtype::_dst_type, _same_scale>;                  \
+        using RelayoutProblem_ =                                             \
+                RelayoutProblem<SrcIterator_, DstIterator_, Transpose_,      \
+                                CudaPostProcess_>;                           \
+        n_stride_src = n_stride_src * _size_nbits / (8 * sizeof(InnerDtype_)); \
+        ic_stride = ic_stride * _size_nbits / (8 * sizeof(InnerDtype_));     \
+        n_stride_dst = n_stride_dst * _size_nbits / (8 * sizeof(_dst_c_type)); \
+        oc_stride = oc_stride * _size_nbits / (8 * sizeof(_dst_c_type));     \
+        typename RelayoutProblem_::Param param{                              \
+                SrcIterator_{(InnerDtype_*)src.raw_ptr, ic_stride, c, w,     \
+                             w_pad},                                         \
+                DstIterator_{(_dst_c_type*)dst.raw_ptr, oc_stride, c, w,     \
+                             w_pad},                                         \
+                CudaPostProcess_{src_scale, src_zero_point, dst_scale,       \
+                                 dst_zero_point},                            \
+                n_stride_src,                                                \
+                n_stride_dst,                                                \
+                n,                                                           \
+                c,                                                           \
+                hw};                                                         \
+        auto kernel = relayout_kern<RelayoutProblem_>;                       \
+        int nr_threads = query_blocksize_for_kernel(kernel);                 \
+        nr_threads = std::min(nr_threads, DIVUP(problem_size, _pack_w));     \
+        const dim3 block_dim(DIVUP(problem_size, nr_threads* _pack_w));      \
+        const dim3 thread_dim(nr_threads);                                   \
+        return kernel<<<block_dim, thread_dim, 0, stream>>>(param);          \
+    }
+#define DISPATCH_4BITS(_src_type, _dst_type)                              \
+    DISPATCH_RAW(true, 8, 64, _src_type, _dst_type, char, char, 4);       \
+    DISPATCH_RAW(false, 8, 64, _src_type, _dst_type, char, char, 4);      \
+    DISPATCH_RAW(true, 2, 64, _src_type, _dst_type, char, char, 4);       \
+    DISPATCH_RAW(false, 2, 64, _src_type, _dst_type, char, char, 4);
+        DISPATCH_4BITS(QuantizedS4, QuantizedS4);
+        DISPATCH_4BITS(Quantized4Asymm, Quantized4Asymm);
+#undef DISPATCH_4BITS
+#undef DISPATCH_RAW
+        megdnn_assert(
+                false,
+                "Unsupported data type(src:%s, dst:%s) or image size(%dx%d).",
+                stype.name(), dtype.name(), h, w);
+    }
+    after_kernel_launch();
}
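
The DISPATCH_RAW macros convert strides from counts of 4-bit elements into counts of the iterator's storage type via `stride * _size_nbits / (8 * sizeof(T))`. A standalone sketch with assumed sizes showing the same unit conversion:

#include <cstdio>

int main() {
    // Assumed toy shape; 4-bit elements as in QuantizedS4.
    const int size_nbits = 4;
    const int h = 7, w_pad = 8;
    long ic_stride_elems = long(h) * w_pad;  // stride counted in int4 elements
    // Two 4-bit elements per byte when the storage type is char...
    long ic_stride_bytes = ic_stride_elems * size_nbits / (8 * long(sizeof(char)));
    // ...and eight per word when the storage type is a 4-byte InnerDtype_.
    long ic_stride_words = ic_stride_elems * size_nbits / (8 * long(sizeof(int)));
    std::printf("%ld int4 elements -> %ld bytes -> %ld words\n",
                ic_stride_elems, ic_stride_bytes, ic_stride_words);  // 56 -> 28 -> 7
    return 0;
}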

bool relayout_format::relayout_format_cuda_usable(
@@ -1283,43 +1567,77 @@ void relayout_format::relayout_format_cuda_nchwx_nchw(
// clang-format on
megdnn_assert(pack_ic == 64, "Unsupport pack size(pack_ic:%d)", pack_ic);
#undef DEF
-    const int n = src.layout[0];
-    const int c = src.layout[1] * pack_ic;
-    const int h = src.layout[2];
+    int n = src.layout[0];
+    int c = src.layout[1] * pack_ic;
+    int h = src.layout[2];
    // align to byte
-    const int w = src.layout[3];
-    const int hw = h * w;
-    const int n_stride_src = src_layout.dtype.size(src_layout.stride[0]);
-    const int ic_stride = src_layout.dtype.size(src_layout.stride[1]);
-    const int n_stride_dst = dst_layout.dtype.size(dst_layout.stride[0]);
-    const int oc_stride = dst_layout.dtype.size(dst_layout.stride[1]);
+    int w = src.layout[3];
+    int w_pad = DIVUP(w, 2) * 2;
+    int hw = h * w_pad;
+    int n_stride_src = src_layout.stride[0];
+    int ic_stride = src_layout.stride[1];
+    int n_stride_dst = dst_layout.stride[0];
+    int oc_stride = dst_layout.stride[1];
+    int problem_size = n * (c / pack_ic) * hw;

bool same_scale = src_scale == dst_scale;
-#define DISPATCH_RAW(_same_scale, _pack_w, _pack_oc, _src_type, _dst_type,  \
-                     _src_c_type, _dst_c_type, _size_nbits)                 \
-    if (same_scale == _same_scale && hw % _pack_w == 0 &&                   \
-        stype.enumv().ev == DTypeEnum::Ev::_src_type &&                     \
-        dtype.enumv().ev == DTypeEnum::Ev::_dst_type) {                     \
-        auto kernel =                                                       \
-                kern_nchwx_nchw<_pack_w, _pack_oc, _same_scale, _src_c_type,\
-                                _dst_c_type, dtype::_src_type,              \
-                                dtype::_dst_type, _size_nbits>;             \
-        int nr_threads = query_blocksize_for_kernel(kernel);                \
-        const dim3 block_dim(DIVUP(hw, nr_threads* _pack_w), n);            \
-        const dim3 thread_dim(nr_threads);                                  \
-        return kernel<<<block_dim, thread_dim, 0, stream>>>(                \
-                (_src_c_type*)src.raw_ptr, (_dst_c_type*)dst.raw_ptr, c, hw,\
-                n_stride_src, ic_stride, n_stride_dst, oc_stride,           \
-                CudaPostProcess<dtype::_src_type, dtype::_dst_type,         \
-                                _same_scale>(src_scale, src_zero_point,     \
-                                             dst_scale, dst_zero_point),    \
-                src_zero_point);                                            \
+    bool padding = w % 2 != 0;
+#define DISPATCH_RAW(_padding, _same_scale, _pack_w, _pack_oc, _src_type,    \
+                     _dst_type, _src_c_type, _dst_c_type, _size_nbits)       \
+    if (padding == _padding && same_scale == _same_scale &&                  \
+        hw % _pack_w == 0 && stype.enumv().ev == DTypeEnum::Ev::_src_type && \
+        dtype.enumv().ev == DTypeEnum::Ev::_dst_type) {                      \
+        using SrcIterator_ =                                                 \
+                typename TensorIteratorPolicy<_padding, _src_c_type,         \
+                                              _pack_oc, _pack_oc, _pack_w,   \
+                                              _size_nbits>::TensorIterator;  \
+        using InnerDtype_ = typename DTypeRWHelper<                          \
+                typename DTypeTrait<dtype::_src_type>::ctype,                \
+                _pack_w>::InnerDtype;                                        \
+        using DstIterator_ =                                                 \
+                TensorIteratorOverChannel<InnerDtype_, 1, _pack_oc, _pack_w, \
+                                          _size_nbits>;                      \
+        using CudaPostProcess_ =                                             \
+                CudaPostProcess<dtype::_src_type, dtype::_dst_type,          \
+                                _same_scale>;                                \
+        using Transpose_ =                                                   \
+                Translayout<_pack_oc, _pack_w, _src_c_type, dtype::_src_type,\
+                            dtype::_dst_type, _same_scale>;                  \
+        using RelayoutProblem_ =                                             \
+                RelayoutProblem<SrcIterator_, DstIterator_, Transpose_,      \
+                                CudaPostProcess_>;                           \
+        n_stride_src = n_stride_src * _size_nbits / (8 * sizeof(_src_c_type)); \
+        ic_stride = ic_stride * _size_nbits / (8 * sizeof(_src_c_type));     \
+        n_stride_dst = n_stride_dst * _size_nbits / (8 * sizeof(InnerDtype_)); \
+        oc_stride = oc_stride * _size_nbits / (8 * sizeof(InnerDtype_));     \
+        typename RelayoutProblem_::Param param{                              \
+                SrcIterator_{(_src_c_type*)src.raw_ptr, ic_stride, c, w,     \
+                             w_pad},                                         \
+                DstIterator_{(InnerDtype_*)dst.raw_ptr, oc_stride, c, w,     \
+                             w_pad},                                         \
+                CudaPostProcess_{src_scale, src_zero_point, dst_scale,       \
+                                 dst_zero_point},                            \
+                n_stride_src,                                                \
+                n_stride_dst,                                                \
+                n,                                                           \
+                c,                                                           \
+                hw};                                                         \
+        auto kernel = relayout_kern<RelayoutProblem_>;                       \
+        int nr_threads = query_blocksize_for_kernel(kernel);                 \
+        nr_threads = std::min(nr_threads, DIVUP(problem_size, _pack_w));     \
+        const dim3 block_dim(DIVUP(problem_size, nr_threads* _pack_w));      \
+        const dim3 thread_dim(nr_threads);                                   \
+        return kernel<<<block_dim, thread_dim, 0, stream>>>(param);          \
+    }
-#define DISPATCH_4BITS(_src_type, _dst_type)                              \
-    DISPATCH_RAW(true, 8, 64, _src_type, _dst_type, char, char, 4);       \
-    DISPATCH_RAW(false, 8, 64, _src_type, _dst_type, char, char, 4);      \
-    DISPATCH_RAW(true, 2, 64, _src_type, _dst_type, char, char, 4);       \
-    DISPATCH_RAW(false, 2, 64, _src_type, _dst_type, char, char, 4);
+#define DISPATCH_4BITS(_src_type, _dst_type)                               \
+    DISPATCH_RAW(true, true, 8, 64, _src_type, _dst_type, char, char, 4);  \
+    DISPATCH_RAW(true, false, 8, 64, _src_type, _dst_type, char, char, 4); \
+    DISPATCH_RAW(true, true, 2, 64, _src_type, _dst_type, char, char, 4);  \
+    DISPATCH_RAW(true, false, 2, 64, _src_type, _dst_type, char, char, 4); \
+    DISPATCH_RAW(false, true, 8, 64, _src_type, _dst_type, char, char, 4); \
+    DISPATCH_RAW(false, false, 8, 64, _src_type, _dst_type, char, char, 4);\
+    DISPATCH_RAW(false, true, 2, 64, _src_type, _dst_type, char, char, 4); \
+    DISPATCH_RAW(false, false, 2, 64, _src_type, _dst_type, char, char, 4);
DISPATCH_4BITS(QuantizedS4, QuantizedS4);
DISPATCH_4BITS(Quantized4Asymm, Quantized4Asymm);
#undef DISPATCH_4BITS
@@ -1327,6 +1645,7 @@ void relayout_format::relayout_format_cuda_nchwx_nchw(
megdnn_assert(false,
"Unsupported data type(src:%s, dst:%s) or image size(%dx%d).",
stype.name(), dtype.name(), h, w);
+    after_kernel_launch();
}

void relayout_format::relayout_format_cuda_nchw4_nchw(
@@ -1344,6 +1663,7 @@ void relayout_format::relayout_format_cuda_nchw4_nchw(
const dim3 thread_dim(nr_threads);
kern_nchw4_nchw<<<block_dim, thread_dim, 0, stream>>>(
(int8_t*)src.raw_ptr, (int8_t*)dst.raw_ptr, n, ic, oc, h, w, group);
+    after_kernel_launch();
}

void relayout_format::relayout_format_cuda_nchw_nchw4_weight(
@@ -1372,4 +1692,5 @@ void relayout_format::relayout_format_cuda_nchw_nchw4_weight(
(char*)src.raw_ptr, (char*)dst.raw_ptr, oc, ic, hw, oc_stride_src,
ic_stride, oc_stride_dst, group_stride_src, group_stride_dst, 0,
{});
+    after_kernel_launch();
}

dnn/test/common/benchmarker.h (+5, -4)

@@ -87,10 +87,11 @@ public:
for (size_t i = 0; i < shapes.size(); ++i) {
DType dt = (m_dtype.find(i) != m_dtype.end() ? m_dtype[i]
: dtype::Float32());
-            TensorFormat fmt = (m_fmt.find(i) != m_fmt.end()
-                                        ? m_fmt[i]
-                                        : DefaultTensorFormat::make());
-            layouts[i] = TensorLayout(shapes[i], dt, fmt);
+            if (m_fmt.find(i) == m_fmt.end()) {
+                layouts[i] = TensorLayout(shapes[i], dt);
+                layouts[i].init_contiguous_stride();
+            } else
+                layouts[i] = TensorLayout(shapes[i], dt, m_fmt[i]);
}
return layouts;
}


dnn/test/common/checker.cpp (+10, -64)

@@ -19,7 +19,6 @@ using namespace megdnn;
using namespace test;

namespace {

template<typename ctype, class Iter>
::testing::AssertionResult assert_tensor_eq_with_iter(
const char *expr0, const char *expr1,
@@ -30,7 +29,7 @@ namespace {
double error_sum = 0;
double error_sum_biased = 0;
for (size_t i = 0; i < nr_elem; ++ i) {
-            ctype iv0 = *it0, iv1 = *it1;
+            ctype iv0 = ctype(*it0), iv1 = ctype(*it1);
float err = diff(iv0, iv1);
error_sum += std::abs(err);
error_sum_biased += err;
@@ -84,12 +83,14 @@ namespace {
const char *expr0, const char *expr1,
const TensorND &v0, const TensorND &v1,
float maxerr, float maxerr_avg, float maxerr_avg_biased) {

-        if (v0.layout.is_physical_contiguous() &&
-            v1.layout.is_physical_contiguous()) {
-            return assert_tensor_eq_with_iter<ctype>(
-                    expr0, expr1, v0.ptr<ctype>(), v1.ptr<ctype>(), v0.layout,
-                    maxerr, maxerr_avg, maxerr_avg_biased);
+        if (!std::is_same<ctype, dt_qint4>::value &&
+            !std::is_same<ctype, dt_quint4>::value) {
+            if (v0.layout.is_physical_contiguous() &&
+                v1.layout.is_physical_contiguous()) {
+                return assert_tensor_eq_with_iter<ctype>(
+                        expr0, expr1, v0.ptr<ctype>(), v1.ptr<ctype>(),
+                        v0.layout, maxerr, maxerr_avg, maxerr_avg_biased);
+            }
        }

auto it0 = megdnn::tensor_iter_valonly<ctype>(v0).begin(),
@@ -100,56 +101,6 @@ namespace {
maxerr_avg_biased);
}

-    template <typename ITYPE>
-    ::testing::AssertionResult assert_tensor_eq_with_lowbit4(
-            const char* expr0, const char* expr1,
-            const TensorND& v0, const TensorND& v1,
-            float maxerr, float maxerr_avg) {
-        if (!v0.layout.eq_layout(v1.layout)) {
-            return ::testing::AssertionFailure()
-                   << "Layout mismatch for testing equality of lowbit4\n"
-                   << "Value of: " << expr1 << "\n"
-                   << "  Actual: " << v1.layout.TensorShape::to_string() << "\n"
-                   << "Expected: " << expr0 << "\n"
-                   << "Which is: " << v0.layout.TensorShape::to_string() << "\n";
-        }
-        auto v0_ptr = static_cast<ITYPE*>(v0.raw_ptr) - v0.layout.span().low_byte;
-        auto v1_ptr = static_cast<ITYPE*>(v1.raw_ptr) - v1.layout.span().low_byte;
-        double error_sum = 0;
-        for (size_t i = 0; i < v0.layout.span().dist_elem(); ++i) {
-            ITYPE iv0 = (v0_ptr[i / 2] << (i ^ 1) * 4);
-            iv0 = iv0 >> 4;
-            ITYPE iv1 = (v1_ptr[i / 2] << (i ^ 1) * 4);
-            iv1 = iv1 >> 4;
-
-            float err = std::abs(diff(iv0, iv1));
-            error_sum += err;
-            if (!good_float(iv0) || !good_float(iv1) || err >= maxerr) {
-                Index index(v0.layout, i);
-                return ::testing::AssertionFailure()
-                       << "Unequal value\n"
-                       << "Value of: " << expr1 << "\n"
-                       << "  Actual: " << (iv1 + 0) << "\n"
-                       << "Expected: " << expr0 << "\n"
-                       << "Which is: " << (iv0 + 0) << "\n"
-                       << "At index: "
-                       << index.to_string() << "/" << v0.layout.TensorShape::to_string() << "\n"
-                       << "   Dtype: " << v0.layout.dtype.name() << "\n"
-                       << "   error: " << err << "/" << maxerr;
-            }
-        }
-        float error_avg = error_sum / v0.layout.total_nr_elems();
-        if (error_avg > maxerr_avg) {
-            return ::testing::AssertionFailure()
-                   << "Average error too high\n"
-                   << "Value of: " << expr1 << "\n"
-                   << "Expected: " << expr0 << "\n"
-                   << "Average error: " << error_avg << "/" << maxerr_avg;
-        }
-
-        return ::testing::AssertionSuccess();
-    }

template<class Impl>
void memcpy_noncontig(
void *dst, const void *src, const TensorLayout &layout,
@@ -215,12 +166,7 @@ namespace {
//! In order to avoid an unnecessary increase in binary size, we just
//! use QuantizedS16 dtype in winograd_filter_preprocess now.
cb(::megdnn::dtype::QuantizedS16)
-        case DTypeTrait<dtype::Quantized4Asymm>::enumv:
-            return assert_tensor_eq_with_lowbit4<uint8_t>(expr0, expr1, v0, v1,
-                                                          maxerr, maxerr_avg);
-        case DTypeTrait<dtype::QuantizedS4>::enumv:
-            return assert_tensor_eq_with_lowbit4<int8_t>(expr0, expr1, v0, v1,
-                                                         maxerr, maxerr_avg);
+        MEGDNN_FOREACH_QUANTIZED_LOWBIT_DTYPE(cb)
#undef cb
default:
megdnn_trap();


dnn/test/common/utils.h (+17, -0)

@@ -228,6 +228,14 @@ static inline int diff(dt_qint8 x, dt_qint8 y) {
return x.as_int8() - y.as_int8();
}

+static inline int diff(dt_qint4 x, dt_qint4 y) {
+    return x.as_int8() - y.as_int8();
+}
+
+static inline int diff(dt_quint4 x, dt_quint4 y) {
+    return x.as_uint8() - y.as_uint8();
+}

inline TensorShape cvt_src_or_dst_nchw2nhwc(const TensorShape& shape) {
megdnn_assert(shape.ndim == 4);
auto N = shape[0], C = shape[1], H = shape[2], W = shape[3];
@@ -356,6 +364,15 @@ static inline int operator+(dt_qint16 lhs, int rhs) {
return lhs.as_int16();
}

+static inline int operator+(dt_quint4 lhs, int rhs) {
+    megdnn_assert(rhs == 0, "unexpected rhs");
+    return lhs.as_uint8();
+}
+
+static inline int operator+(dt_qint4 lhs, int rhs) {
+    megdnn_assert(rhs == 0, "unexpected rhs");
+    return lhs.as_int8();
+}
} // namespace test

static inline bool operator==(const TensorLayout& a, const TensorLayout& b) {


dnn/test/cuda/relayout_format.cpp (+44, -3)

@@ -11,13 +11,14 @@
*/
#include "megdnn/dtype.h"
#include "megdnn/oprs.h"
#include "test/common/benchmarker.h"
#include "test/cuda/benchmark.h"
#include "test/common/checker.h"
#include "test/common/rng.h"
#include "test/cuda/fixture.h"

using namespace megdnn;
using namespace test;
#define MEGDNN_WITH_BENCHMARK 1

TEST_F(CUDA, RELAYOUT_FORMAT) {
Checker<RelayoutFormat> checker(handle_cuda());
@@ -246,7 +247,7 @@ TEST_F(CUDA, RELAYOUT_FORMAT_NCHW_NCHW64) {
for (size_t n : {1, 3}) {
for (size_t c : {64, 128}) {
for (size_t h : {7, 14, 16, 28}) {
-                for (size_t w : {2, 4, 14, 16}) {
+                for (size_t w : {2, 3, 7, 8, 16, 31}) {
checker.set_dtype(0, dtype::QuantizedS4{2.f})
.set_dtype(1, dtype::QuantizedS4{2.f})
.set_rng(0, &s4)
@@ -286,7 +287,7 @@ TEST_F(CUDA, RELAYOUT_FORMAT_NCHW64_NCHW) {
for (size_t n : {1, 3}) {
for (size_t c : {64, 128}) {
for (size_t h : {7, 14, 16, 28}) {
-                for (size_t w : {2, 4, 14, 16}) {
+                for (size_t w : {2, 3, 4, 7, 14, 16, 17}) {
checker.set_dtype(0, dtype::QuantizedS4{2.f})
.set_dtype(1, dtype::QuantizedS4{2.f})
.set_rng(0, &s4)
@@ -366,6 +367,46 @@ TEST_F(CUDA, BENCHMARK_RELAYOUT_FORMAT) {
run(shapes, param, default_param);
}
}

+TEST_F(CUDA, BENCHMARK_RELAYOUT_FORMAT_QS4) {
+    using Param = RelayoutFormat::Param;
+
+    auto run = [&](const TensorShapeArray& shapes, Param param) {
+        CUBenchmarker<RelayoutFormat> benchmarker(handle_cuda());
+        benchmarker.set_param(param);
+        benchmarker.set_dtype(0, dtype::QuantizedS4{1.19990307f})
+                .set_dtype(1, dtype::QuantizedS4{1.20210322f});
+
+        for (auto&& shape : shapes) {
+            double memaccess = double(shape.total_nr_elems()) * 1e-6;
+            auto time_ms = benchmarker.execs({shape, {}});
+            printf("execute %s, time %.4f ms, %.4f GB/s\n",
+                   shape.to_string().c_str(), time_ms, memaccess / time_ms);
+        }
+    };
+
+    {
+        TensorShapeArray shapes = {
+                {1, 64, 56, 56},  {16, 64, 56, 56}, {64, 64, 56, 56},
+                {1, 64, 56, 55},  {16, 64, 56, 55}, {64, 64, 56, 55},
+        };
+        Param param;
+        param.mode = param::RelayoutFormat::Mode::NCHW_NCHW64;
+        run(shapes, param);
+    }
+    {
+        TensorShapeArray shapes = {
+                {64, 1, 56, 56, 64},
+                {1, 32, 7, 7, 64},
+                {16, 32, 7, 7, 64},
+                {64, 32, 7, 7, 64},
+        };
+        Param param;
+        param.mode = param::RelayoutFormat::Mode::NCHW64_NCHW;
+        run(shapes, param);
+    }
+}

#endif

TEST_F(CUDA, RELAYOUT_FORMAT_NCHW4) {

