
fix(dnn): fixes for int4

GitOrigin-RevId: 845e164fd3
release-1.5
Megvii Engine Team 4 years ago
commit 86b69cacd0
22 changed files with 184 additions and 569 deletions
  1. +7    -0     dnn/include/megdnn/basic_types.h
  2. +9    -0     dnn/include/megdnn/dtype.h
  3. +1    -1     dnn/include/megdnn/tensor_format.h
  4. +5    -0     dnn/src/common/basic_types.cpp
  5. +6    -1     dnn/src/common/convolution.cpp
  6. +19   -14    dnn/src/common/tensor_format.cpp
  7. +4    -4     dnn/src/cuda/conv_bias/fallback_nchw_qs4.cpp
  8. +0    -1     dnn/src/cuda/conv_bias/opr_impl.h
  9. +3    -4     dnn/src/naive/matrix_mul/matrix_mul_helper.h
 10. +3    -4     dnn/src/naive/pooling/opr_impl.cpp
 11. +0    -1     dnn/test/common/checker.h
 12. +2    -7     dnn/test/common/test_basic_types.cpp
 13. +8    -0     dnn/test/common/utils.h
 14. +19   -20    dnn/test/naive/warp_perspective.cpp
 15. +26   -12    imperative/python/src/helper.cpp
 16. +1    -1     imperative/src/impl/physical_tensor.cpp
 17. +40   -13    src/core/impl/dtype.cpp
 18. +7    -3     src/core/impl/graph/var_node_mem_mgr.cpp
 19. +11   -0     src/core/include/megbrain/dtype.h
 20. +0    -308   src/gopt/test/inference.cpp
 21. +4    -172   src/opr/test/dnn/convolution.cpp
 22. +9    -3     src/serialization/impl/serializer_oss.cpp

+7 -0   dnn/include/megdnn/basic_types.h

@@ -281,6 +281,13 @@ struct TensorLayout : public TensorShape {
        add_axis_inplace(axis, 1, stride[axis] * shape[axis]);
    }

+   /*!
+    * \brief modify data type of the layout inplace
+    *
+    * Note that this API also updates the format according to the new data type
+    */
+   void modify_dtype_inplace(DType dtype);
+
    /* =================== generate new layout =================== */

    /**


+9 -0   dnn/include/megdnn/dtype.h

@@ -513,6 +513,15 @@ class DType {


    bool is_low_bit() const { return low_bit() != 0; }

+   bool is_quantized_lowbit() const {
+       return low_bit() != 0 &&
+   #if MEGDNN_CC_HOST
+              category() == DTypeCategory::QUANTIZED;
+   #else
+              category().ev == DTypeCategory::Ev::QUANTIZED;
+   #endif
+   }
+
    /*!
     * \brief size of this data type, in bytes
     */
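A minimal sketch of the distinction the new predicate draws (illustrative only, not part of the patch; assumes the megdnn headers above): a parametrized quantized 4-bit type satisfies both is_low_bit() and is_quantized_lowbit(), while a plain low-bit type such as IntB2 satisfies only the former, so only the quantized kinds pick up the byte-aligned format by default.

    // Illustrative sketch, not from the patch; assumes megdnn is available.
    #include "megdnn/dtype.h"
    #include <cstdio>

    int main() {
        megdnn::DType q4 = megdnn::dtype::QuantizedS4(1.2f);  // quantized 4-bit
        megdnn::DType b2 = megdnn::dtype::IntB2();             // plain 2-bit
        std::printf("QuantizedS4: low_bit=%d quantized_lowbit=%d\n",
                    (int)q4.is_low_bit(), (int)q4.is_quantized_lowbit());  // 1 1
        std::printf("IntB2:       low_bit=%d quantized_lowbit=%d\n",
                    (int)b2.is_low_bit(), (int)b2.is_quantized_lowbit());  // 1 0
    }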


+1 -1   dnn/include/megdnn/tensor_format.h

@@ -226,7 +226,7 @@ public:
    std::string to_string() const override;

    //! raise exception if given layout is illegal
-   void assert_valid(const TensorLayout& layout) const;
+   void assert_valid(const TensorLayout& layout) const override;

    void serialize_append(std::string& result) const override;




+5 -0   dnn/src/common/basic_types.cpp

@@ -282,6 +282,11 @@ void TensorLayout::add_axis_inplace(size_t axis, size_t shape,
    this->stride[axis] = stride;
}

+ void TensorLayout::modify_dtype_inplace(DType dtype_) {
+     dtype = dtype_;
+     format = Format(dtype);
+ }
+
bool TensorLayout::is_contiguous() const {
    return format.impl()->is_contiguous_spec(*this);
}
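A hedged usage sketch of the new helper (shapes and scales are arbitrary; assumes the megdnn headers above): assigning layout.dtype directly would leave the previously derived format in place, whereas modify_dtype_inplace re-derives the format from the new dtype, so a following init_contiguous_stride() applies the byte-aligned rules for quantized 4-bit data.

    // Sketch only; assumes megdnn. The function name is hypothetical.
    #include "megdnn/basic_types.h"
    #include "megdnn/dtype.h"

    void retype_to_int4(megdnn::TensorLayout& layout) {
        // layout.dtype = megdnn::dtype::QuantizedS4(1.f);  // would keep the stale format
        layout.modify_dtype_inplace(megdnn::dtype::QuantizedS4(1.f));  // dtype + format together
        layout.init_contiguous_stride();  // strides now follow the lowbit-aligned format
    }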


+6 -1   dnn/src/common/convolution.cpp

@@ -952,7 +952,12 @@ ConvolutionBase<Parameter>::deduce_layout_fwd(const TensorLayout& src,
        megdnn_assert(src[4] == 4);
        dst[4] = 4;
    }
-   dst.format = src.format;
+   if (!src.format.is_default() &&
+       !src.format.is_lowbit_aligned()) {  // propagate
+       dst.format = src.format;
+   } else {  // determined by dtype
+       dst.format = TensorFormat(dst.dtype);
+   }
    dst.init_contiguous_stride();
    return cflt;
}
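The effect of the new branch, restated as a hedged sketch (hypothetical free function, not the real member): formats that are neither the default nor lowbit-aligned, e.g. image formats, are still propagated from the source, while for everything else the destination format is re-derived from the destination dtype, so an int4 output picks up the byte-aligned format even when the source carried a plain default one.

    // Illustrative only; mirrors the branch above with a hypothetical helper.
    #include "megdnn/basic_types.h"
    #include "megdnn/tensor_format.h"

    megdnn::TensorFormat deduce_dst_format(const megdnn::TensorLayout& src,
                                           megdnn::DType dst_dtype) {
        if (!src.format.is_default() && !src.format.is_lowbit_aligned())
            return src.format;                    // propagate special formats
        return megdnn::TensorFormat(dst_dtype);   // otherwise follow the dtype
    }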


+19 -14   dnn/src/common/tensor_format.cpp

@@ -46,14 +46,15 @@ TensorFormat TensorFormat::deserialize(const std::string& bin,
TensorFormat::Format() : m_impl{DefaultTensorFormat::make().m_impl} {}

TensorFormat::Format(DType dtype) {
-   megdnn_assert(dtype.valid());
-   if (dtype.is_low_bit()) {
+   if (dtype.valid() &&
+       dtype.is_quantized_lowbit()) {  // quantized lowbit, by default
+                                       // aligned to bytes
        size_t size_nbits = dtype.low_bit();
        megdnn_assert(size_nbits == 1 || size_nbits == 2 || size_nbits == 4,
                      "unsupported lowbits data type(%s, size in bits: %zu)",
                      dtype.name(), size_nbits);
        m_impl = LowbitsAlignedToBytesTensorFormat::make(size_nbits).m_impl;
-   } else {
+   } else {  // non parameterized lowbit, default format
        m_impl = DefaultTensorFormat::make().m_impl;
    }
}
@@ -89,8 +90,8 @@ bool TensorFormat::is_lowbit_aligned() const {
/* ===================== DefaultFormat ===================== */
void DefaultTensorFormat::assert_valid(const TensorLayout& layout) const {
    megdnn_assert(
-           !layout.dtype.valid() || !layout.dtype.is_low_bit(),
-           "DefaultTensorFormat does not support low-bits tensor(dtype:%s)",
+           !layout.dtype.valid() || !layout.dtype.is_quantized_lowbit(),
+           "DefaultTensorFormat does not support quantized lowbit tensor(dtype:%s)",
            layout.dtype.name());
}


@@ -271,7 +272,8 @@ void Image2DPackedTensorFormatBase<PIXEL_SIZE>::assert_valid(
    auto m_align_axis = align_axis();
    megdnn_assert(!(layout.shape[layout.ndim - 1] % PIXEL_SIZE),
                  "bad shape: %zu", layout.shape[layout.ndim - 1]);
-   megdnn_assert(layout.dtype.valid() && layout.ndim > m_align_axis);
+   megdnn_assert(layout.dtype.valid() && !layout.dtype.is_quantized_lowbit() &&
+                 layout.ndim > m_align_axis);
    ptrdiff_t first_non_zero_stride = 0;
    for (int i = layout.ndim - 1; i >= 0; --i) {
        megdnn_assert(layout.shape[i] && layout.stride[i] >= 0);
@@ -478,6 +480,7 @@ void LowbitsAlignedTensorFormatBase::assert_valid(
    megdnn_assert(layout.dtype.valid() && layout.dtype.is_low_bit() &&
                  layout.dtype.low_bit() == m_size_nbits);
    bool has_dim_unity_stride = false;
+   bool has_dim_aligned_stride = false;
    for (int i = layout.ndim - 1; i >= 0; --i) {
        if (!has_dim_unity_stride && layout.stride[i] == 1)
            has_dim_unity_stride = true;
@@ -485,15 +488,16 @@ void LowbitsAlignedTensorFormatBase::assert_valid(
                      layout.stride[i] >= 0 &&
                              (layout.stride[i] % m_align_size_in_elements == 0 ||
                               layout.stride[i] == 1),
-                     "bad stride:%s, %zu", layout.to_string().c_str(),
-                     layout.stride[i]);
+                     "bad stride:%s, %ld", layout.to_string().c_str(),
+                     static_cast<long>(layout.stride[i]));
+       if (!has_dim_aligned_stride &&
+           static_cast<size_t>(layout.stride[i]) == m_align_size_in_elements)
+           has_dim_aligned_stride = true;
    }
-   if (!has_dim_unity_stride &&
-       (int)layout.stride[layout.ndim - 1] ==
-               round_up(1, (int)m_align_size_in_elements))
-       has_dim_unity_stride = true;
-   megdnn_assert(layout.ndim == 0 || has_dim_unity_stride,
-                 "innermost dim not contiguous");

+   megdnn_assert(
+           layout.ndim == 0 || has_dim_unity_stride || has_dim_aligned_stride,
+           "innermost dim not contiguous");
}

void LowbitsAlignedTensorFormatBase::serialize_append(
@@ -542,6 +546,7 @@ size_t LowbitsAlignedTensorFormatBase::init_contiguous_stride(
        multiplier = round_up(multiplier, m_align_size_in_elements);
        accum = mul(accum, multiplier);
    }
+   assert_valid(layout);
    return accum;
}
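To make the alignment rule concrete: a standalone sketch (plain C++, no megdnn dependency; helper and shape are hypothetical) of byte-aligned contiguous strides for 4-bit elements. As the updated test expectation further below suggests, only the innermost extent appears to be rounded up to a whole byte, which is also the stride pattern the relaxed has_dim_aligned_stride check accepts.

    // Standalone illustration of byte-aligned strides for int4 (assumption:
    // only the innermost extent is rounded up to the 2-elements-per-byte unit).
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    int main() {
        const std::size_t bits = 4, align_elems = 8 / bits;   // 2 int4 per byte
        std::vector<std::size_t> shape = {1, 32, 1, 1};
        std::vector<std::size_t> stride(shape.size());
        std::size_t accum = 1;
        for (std::size_t i = shape.size(); i; --i) {
            stride[i - 1] = accum;
            std::size_t extent = shape[i - 1];
            if (i == shape.size())   // innermost dim: pad to a whole byte
                extent = (extent + align_elems - 1) / align_elems * align_elems;
            accum *= extent;
        }
        for (std::size_t s : stride) std::printf("%zu ", s);   // 64 2 2 1
        std::printf("\nspan: %zu elems, %zu bytes\n", accum, accum * bits / 8);
    }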




+4 -4   dnn/src/cuda/conv_bias/fallback_nchw_qs4.cpp

@@ -12,6 +12,7 @@


#include "./algo.h"
#include "src/cuda/utils.h"
+ #include "src/common/conv_bias.h"

using namespace megdnn;
using namespace cuda;
@@ -27,8 +28,7 @@ bool ConvBiasForwardImpl::AlgoFallbackNCHWQS4::is_available(
    bool available = true;
    auto&& param = args.opr->param();
    auto&& fm = args.filter_meta;
-   if (!conv_bias::check_bias_share_in_channel(*(args.bias_layout),
-                                               param.format))
+   if (!check_bias_share_in_channel(*(args.bias_layout), param.format))
        return false;
    if (param.format != Format::NCHW)
        return false;
@@ -128,7 +128,7 @@ void ConvBiasForwardImpl::AlgoFallbackNCHWQS4::exec(
    conv_op->param() = args.opr->param();
    using Format = param::ConvBias::Format;
    conv_op->param().format = Format::NCHW64;
-   ExecArgs args_{dynamic_cast<ConvBiasForwardImpl*>(conv_op.get()),
+   ExecArgs args_{reinterpret_cast<ConvBiasForwardImpl*>(conv_op.get()),
                   src_,
                   filter_,
                   bias_,
@@ -190,7 +190,7 @@ WorkspaceBundle ConvBiasForwardImpl::AlgoFallbackNCHWQS4::get_workspace_bundle(
    conv_op->param() = args.opr->param();
    using Format = param::ConvBias::Format;
    conv_op->param().format = Format::NCHW64;
-   SizeArgs args_{dynamic_cast<ConvBiasForwardImpl*>(conv_op.get()),
+   SizeArgs args_{reinterpret_cast<ConvBiasForwardImpl*>(conv_op.get()),
                   layouts[0],
                   layouts[1],
                   layouts[2],


+0 -1   dnn/src/cuda/conv_bias/opr_impl.h

@@ -64,7 +64,6 @@ public:
    class AlgoInt8CHWN4IMMAImplicitGemmReorderFilter;
    class AlgoInt8CHWN4IMMAImplicitGemmUnrollWidth;
    class AlgoInt8NCHW32IMMAImplicitGemm;
-   class AlgoFallbackNCHWQS4;
    class AlgoBFloat16;

    class AlgoPack;


+3 -4   dnn/src/naive/matrix_mul/matrix_mul_helper.h

@@ -151,9 +151,9 @@ void exec_matrix_mul_quint4x4x32_helper(
    MEGDNN_MARK_USED_VAR(format);
    MEGDNN_MARK_USED_VAR(compute_mode);
    auto convert_layout = [](const TensorLayout& layout) {
-       auto ret = layout;
        auto param = layout.dtype.param<dtype::Quantized4Asymm>();
-       ret.dtype = dtype::Quantized8Asymm(param.scale, param.zero_point);
+       TensorLayout ret(layout,
+                        dtype::Quantized8Asymm(param.scale, param.zero_point));
        return ret;
    };
    TensorLayout A_layout, B_layout;
@@ -205,9 +205,8 @@ void exec_matrix_mul_qint4x4x16_helper(
    MEGDNN_MARK_USED_VAR(format);
    MEGDNN_MARK_USED_VAR(compute_mode);
    auto convert_layout = [](const TensorLayout& layout) {
-       auto ret = layout;
        auto param = layout.dtype.param<dtype::QuantizedS4>();
-       ret.dtype = dtype::QuantizedS8(param.scale);
+       TensorLayout ret(layout, dtype::QuantizedS8(param.scale));
        return ret;
    };
    TensorLayout A_layout, B_layout;


+3 -4   dnn/src/naive/pooling/opr_impl.cpp

@@ -406,8 +406,7 @@ size_t PoolingForwardImpl::get_workspace_in_bytes(const TensorLayout& src,
}
namespace {

- void post_process(const TensorND& dst, TensorND& comp_dst, Handle* handle,
-                   WorkspaceBundle& workspace_bundle) {
+ void post_process(const TensorND& dst, TensorND& comp_dst) {
    if (dst.layout.dtype.enumv() == DTypeEnum::QuantizedS4) {
        int8_to_int4(comp_dst, dst);
    } else if (dst.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) {
@@ -427,8 +426,8 @@ void PoolingForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
    if (src.layout.dtype.enumv() == DTypeEnum::QuantizedS4) {
        float scale = src.layout.dtype.param<dtype::QuantizedS4>().scale;
        comp_src.layout.dtype = dtype::QuantizedS8(scale);
-       comp_src.layout.init_contiguous_stride();
        comp_src.layout.format = TensorLayout::Format(comp_src.layout.dtype);
+       comp_src.layout.init_contiguous_stride();
        comp_src.raw_ptr = wsb.get(0);
        comp_dst.layout.dtype = dtype::QuantizedS8(scale);
        comp_dst.layout.format = TensorLayout::Format(comp_dst.layout.dtype);
@@ -571,7 +570,7 @@ void PoolingForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
        default: \
            megdnn_assert(0, "not support mode"); \
    } \
-   post_process(dst, comp_dst, handle(), wsb); \
+   post_process(dst, comp_dst); \
    return; \
    }
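A short note on why the two statements above were swapped, as a hedged sketch (hypothetical helper, assuming megdnn; not the actual opr_impl code): init_contiguous_stride() consults the layout's format, so the format must already describe the int8 compute dtype before the strides are derived; with the old order the int8 scratch tensor would still get strides computed under the 4-bit byte-aligned rules.

    // Sketch of the corrected ordering only.
    #include "megdnn/basic_types.h"
    #include "megdnn/dtype.h"

    megdnn::TensorLayout make_compute_layout(const megdnn::TensorLayout& src_int4,
                                             float scale) {
        megdnn::TensorLayout ly = src_int4;
        ly.dtype = megdnn::dtype::QuantizedS8(scale);
        ly.format = megdnn::TensorLayout::Format(ly.dtype);  // refresh format first
        ly.init_contiguous_stride();                          // then derive dense strides
        return ly;
    }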




+0 -1   dnn/test/common/checker.h

@@ -132,7 +132,6 @@ public:
                        : dtype::Float32());
        if (m_fmt.find(i) == m_fmt.end()) {
            layouts[i] = TensorLayout(shapes[i], dt);
-           layouts[i].init_contiguous_stride();
        } else
            layouts[i] = TensorLayout(shapes[i], dt, m_fmt[i]);
    }


+2 -7   dnn/test/common/test_basic_types.cpp

@@ -325,13 +325,8 @@ TEST(BASIC_TYPES, TENSOR_LAYOUT_FMT_LOW_BITS) {


    layout = TensorLayout{{1, 32, 1, 1}, dtype::QuantizedS4{1.2f}};
    layout = layout.broadcast({16, 32, 7, 7});
-   EXPECT_EQ(make_layout({16, 32, 49}, {0, 1, 0}, dtype::QuantizedS4{1.2}),
+   EXPECT_EQ(make_layout({16, 32, 49}, {0, 2, 0}, dtype::QuantizedS4{1.2}),
              layout.collapse_contiguous());
-
-   layout = TensorLayout{{1, 32, 1, 1}, dtype::QuantizedS4{1.2f}};
-   layout.init_contiguous_stride();
-   layout = layout.broadcast({16, 32, 7, 7});
-   ASSERT_THROW(layout.span(), MegDNNError);
}

TEST(BASIC_TYPES, TENSOR_LAYOUT_FMT_LOW_BITS_VALID) {
@@ -342,7 +337,7 @@ TEST(BASIC_TYPES, TENSOR_LAYOUT_FMT_LOW_BITS_VALID) {
                             LowbitsAlignedToBytesTensorFormat::make(4_z)),
                 MegDNNError);
    ASSERT_THROW(TensorLayout({16, 32, 7, 7}, dtype::IntB2{},
-                             LowbitsAlignedToBytesTensorFormat::make(2_z)),
+                             LowbitsAlignedToBytesTensorFormat::make(4_z)),
                 MegDNNError);
}
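For readers puzzled by the expectation change from {0, 1, 0} to {0, 2, 0} above: with the byte-aligned 4-bit format, the contiguous strides of {1, 32, 1, 1} round the innermost extent (a single element) up to one whole byte, i.e. two int4 elements, so the channel stride becomes 2; broadcasting then zeroes the other strides and collapse_contiguous keeps the 2. A hedged sketch in the spirit of the surrounding test:

    // Sketch only, using the same megdnn types as the test above.
    #include "megdnn/basic_types.h"
    #include "megdnn/dtype.h"

    void broadcast_stride_demo() {
        megdnn::TensorLayout layout{{1, 32, 1, 1}, megdnn::dtype::QuantizedS4{1.2f}};
        // layout.stride is {64, 2, 2, 1}: the innermost extent is padded to a byte.
        layout = layout.broadcast({16, 32, 7, 7});   // strides become {0, 2, 0, 0}
        megdnn::TensorLayout c = layout.collapse_contiguous();
        // c: shape {16, 32, 49}, stride {0, 2, 0} -- the value asserted above.
        (void)c;
    }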




+8 -0   dnn/test/common/utils.h

@@ -343,6 +343,14 @@ static inline bool good_float(dt_qint32) {
    return true;
}

+ static inline bool good_float(dt_qint4) {
+     return true;
+ }
+
+ static inline bool good_float(dt_quint4) {
+     return true;
+ }
+
// A hack for the (x+0) promote to int trick on dt_quint8.
static inline int operator+(dt_quint8 lhs, int rhs) {
    megdnn_assert(rhs == 0, "unexpected rhs");


+19 -20   dnn/test/naive/warp_perspective.cpp

@@ -545,12 +545,12 @@ TEST_F(NAIVE, WARP_PERSPECTIVE_NCHW64) {
    using Param = WarpPerspective::Param;

    auto convert_true_format = [](const TensorLayout& layout) {
-       if (layout.ndim == 4)
-           return layout
-                   .reshape({layout[0], layout[1] / 64, layout[2], layout[3],
-                             64})
-                   .dimshuffle({0, 1, 4, 2, 3});
-       else
+       if (layout.ndim == 4) {
+           TensorLayout ret{
+                   {layout[0], layout[1] / 64, layout[2], layout[3], 64},
+                   layout.dtype};
+           return ret.dimshuffle({0, 1, 4, 2, 3});
+       } else
            return layout;
    };


@@ -563,15 +563,16 @@ TEST_F(NAIVE, WARP_PERSPECTIVE_NCHW64) {


    TensorNDArray nchw_tensors;
    for (size_t i = 0; i < tensors.size(); ++i) {
+       TensorLayout ly;
        auto layout = tensors[i].layout;
-       if (layout.dtype.enumv() == DTypeEnum::QuantizedS4)
-           layout.dtype = dtype::QuantizedS4();
-       if (layout.ndim == 5) {
-           layout = layout.reshape({layout[0], layout[1] * layout[4],
-                                    layout[2], layout[3]});
+       if (tensors[i].layout.ndim == 5) {
+           ly = TensorLayout{{layout[0], layout[1] * layout[4], layout[2],
+                              layout[3]},
+                             layout.dtype};
+       } else {
+           ly = layout;
        }
-       nchw_tensors.emplace_back(malloc(layout.span().dist_byte()),
-                                 layout);
+       nchw_tensors.emplace_back(malloc(ly.span().dist_byte()), ly);
    }
    TensorNDArray nchw64_tensors;
    for (size_t i = 0; i < tensors.size(); ++i) {
@@ -617,13 +618,11 @@ TEST_F(NAIVE, WARP_PERSPECTIVE_NCHW64) {
        checker.set_param(param);
        checker.execs({{2, 1, 10, 10, 64}, {2, 3, 3}, {2, 1, 10, 12, 64}});
        checker.execs(
-               {{20, 30, 10, 12, 64}, {20, 3, 3}, {20, 30, 11, 12, 64}});
-       checker.execs(
-               {{220, 3, 10, 10, 64}, {220, 3, 3}, {220, 3, 10, 12, 64}});
-       checker.execs({{1, 25, 25, 24, 64}, {1, 3, 3}, {1, 25, 25, 510, 64}});
-       checker.execs({{1, 25, 25, 510, 64}, {1, 3, 3}, {1, 25, 25, 24, 64}});
-       checker.execs({{1, 25, 25, 24, 64}, {1, 3, 3}, {1, 25, 51, 50, 64}});
-       checker.execs({{1, 25, 51, 50, 64}, {1, 3, 3}, {1, 25, 25, 24, 64}});
+               {{20, 3, 10, 12, 64}, {20, 3, 3}, {20, 3, 11, 12, 64}});
+       checker.execs({{1, 3, 25, 24, 64}, {1, 3, 3}, {1, 3, 25, 51, 64}});
+       checker.execs({{1, 3, 25, 51, 64}, {1, 3, 3}, {1, 3, 25, 24, 64}});
+       checker.execs({{1, 3, 25, 24, 64}, {1, 3, 3}, {1, 3, 51, 50, 64}});
+       checker.execs({{1, 3, 51, 50, 64}, {1, 3, 3}, {1, 3, 25, 24, 64}});
    }
}
// vim: syntax=cpp.doxygen

+26 -12   imperative/python/src/helper.cpp

@@ -18,6 +18,7 @@
#include "megbrain/graph/cg.h"
#include "megbrain/tensor.h"
#include "megbrain/utils/mempool.h"
+
#include "./numpy_dtypes.h"

namespace py = pybind11;
@@ -390,16 +391,24 @@ HostTensorND lowbit_ndarray_to_host_tensor(
    } else {
        mgb_assert(layout.ndim && layout.ndim <= TensorShape::MAX_NDIM,
                   "unsupported ndim %zu", layout.ndim);
-       for (size_t i = 0; i < layout.ndim; ++ i) {
-           layout.shape[i] = PyArray_SHAPE(input)[i];
-           layout.stride[i] = PyArray_STRIDE(input, i);
+       TensorLayout ly;
+       ly.ndim = layout.ndim;
+       for (size_t i = 0; i < layout.ndim; ++i) {
+           ly.shape[i] = layout.shape[i] = PyArray_SHAPE(input)[i];
+           ly.stride[i] = PyArray_STRIDE(input, i);
            mgb_assert(layout.shape[i], "zero shape not supported");
        }
-       mgb_assert(layout.is_contiguous());
+       mgb_assert(ly.is_physical_contiguous());
+       layout.init_contiguous_stride();
    }
    HostTensorND ret{comp_node, layout};
-   lowbit_memcpy_byte2compact(layout.dtype, ret.raw_ptr(), src_ptr,
-                              layout.total_nr_elems());
+   if (layout.format.is_lowbit_aligned()) {
+       mgb_assert(layout.is_contiguous());
+       lowbit_memcpy_byte2aligned(ret.raw_ptr(), src_ptr, layout);
+   } else {
+       lowbit_memcpy_byte2compact(layout.dtype, ret.raw_ptr(), src_ptr,
+                                  layout.total_nr_elems());
+   }
    return ret;
}


@@ -423,10 +432,8 @@ std::pair<HostTensorND, bool> np2tensor_try_borrow(
    }

    // make result from PyArrayObject; its reference may be stolen
-   auto make_from_arr = [&](PyArrayObject *input, bool allow_borrow) {
-
-       TensorLayout layout;
-       layout.dtype = dtype_np2mgb_descr(PyArray_DESCR(input));
+   auto make_from_arr = [&](PyArrayObject* input, bool allow_borrow) {
+       TensorLayout layout{{}, dtype_np2mgb_descr(PyArray_DESCR(input))};
        if (dtype.valid())
            mgb_assert(dtype == layout.dtype);
        layout.ndim = PyArray_NDIM(input);
@@ -605,8 +612,15 @@ PyObject* ndarray_from_tensor(
    if (val.dtype().is_low_bit()) {
        mgb_assert(share_type != ShareType::MUST_SHARE,
                   "can not share memory for lowbit dtype");
-       lowbit_memcpy_compact2byte(val.dtype(), alloc_new_ret(), val.raw_ptr(),
-                                  val.layout().total_nr_elems());
+       const auto& layout = val.layout();
+       if (layout.format.is_lowbit_aligned()) {
+           lowbit_memcpy_aligned2byte(alloc_new_ret(), val.raw_ptr(),
+                                      val.layout());
+       } else {
+           lowbit_memcpy_compact2byte(val.dtype(), alloc_new_ret(),
+                                      val.raw_ptr(),
+                                      val.layout().total_nr_elems());
+       }
    } else if (share_type == ShareType::MUST_UNSHARE) {
        memcpy(alloc_new_ret(), val.raw_ptr(), val.layout().span().dist_byte());
    } else {
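Both call sites above follow the same dispatch; a hedged summary sketch (the wrapper name is hypothetical, the copy helpers are the mgb ones declared in src/core/include/megbrain/dtype.h further down): numpy always sees one byte per low-bit element, while the tensor side is bit-packed, either fully compact or row-padded to whole bytes depending on the layout's format.

    // Hypothetical wrapper summarizing the dispatch; assumes megbrain headers.
    #include "megbrain/dtype.h"

    void tensor_to_byte_repr(void* dst_bytes, const void* src,
                             const megdnn::TensorLayout& layout) {
        if (layout.format.is_lowbit_aligned()) {
            // rows padded to whole bytes: copy must walk row by row
            mgb::lowbit_memcpy_aligned2byte(dst_bytes, src, layout);
        } else {
            // fully compact packing: the element count alone is enough
            mgb::lowbit_memcpy_compact2byte(layout.dtype, dst_bytes, src,
                                            layout.total_nr_elems());
        }
    }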


+1 -1   imperative/src/impl/physical_tensor.cpp

@@ -290,7 +290,7 @@ Tensor::Tensor(const DeviceTensorND &dv, const HostTensorND& hv) {
}

Tensor::Tensor(const TensorLayout& layout, const CompNode& cn)
-       : m_layout{layout}, m_blob{Blob::make(cn, layout.dtype.size(layout.total_nr_elems()))},
+       : m_layout{layout}, m_blob{Blob::make(cn, layout.span().dist_byte())},
          m_offset{0} {}

Tensor::Tensor(const BlobPtr blob, const size_t offset, const TensorLayout& layout)


+40 -13   src/core/impl/dtype.cpp

@@ -359,19 +359,6 @@ struct LowbitMemcpy<bits, true> {
    }
};

- template<typename DT>
- struct QuantizedLowbitTrait;
-
- template<>
- struct QuantizedLowbitTrait<dtype::Quantized4Asymm> {
-     static constexpr int8_t SHIFT = 0;
- };
-
- template<>
- struct QuantizedLowbitTrait<dtype::QuantizedS4> {
-     static constexpr int8_t SHIFT = 8;
- };
-
template <typename DT, bool div_byte = (DTypeTrait<DT>::category ==
                                        DTypeCategory::QUANTIZED) &&
                                       (8 % DTypeTrait<DT>::low_bit == 0)>
@@ -452,4 +439,44 @@ void mgb::lowbit_memcpy_compact2byte(
    mgb_throw(MegBrainError, "bad dtype for lowbit: %s", dtype.name());
}

+ void mgb::lowbit_memcpy_byte2aligned(void* dest, const void* src,
+                                      const ::megdnn::TensorLayout& layout) {
+     size_t low_bit = layout.dtype.low_bit();
+     size_t dim = layout.shape[layout.ndim - 1];
+     if ((dim * low_bit) % 8) {  // padding
+         size_t n = layout.total_nr_elems();
+         size_t stride = divup<size_t>(dim * low_bit, 8);
+         dt_byte* dest_ptr = reinterpret_cast<dt_byte*>(dest);
+         const dt_byte* src_ptr = reinterpret_cast<const dt_byte*>(src);
+         for (size_t i = 0; i < n / dim; ++i) {
+             lowbit_memcpy_byte2compact(layout.dtype, dest_ptr, src_ptr, dim);
+             dest_ptr += stride;
+             src_ptr += dim;
+         }
+     } else {
+         lowbit_memcpy_byte2compact(layout.dtype, dest, src,
+                                    layout.total_nr_elems());
+     }
+ }
+
+ void mgb::lowbit_memcpy_aligned2byte(void* dest, const void* src,
+                                      const ::megdnn::TensorLayout& layout) {
+     size_t low_bit = layout.dtype.low_bit();
+     size_t dim = layout.shape[layout.ndim - 1];
+     if ((dim * low_bit) % 8) {  // padding
+         size_t n = layout.total_nr_elems();
+         size_t stride = divup<size_t>(dim * low_bit, 8);
+         dt_byte* dest_ptr = reinterpret_cast<dt_byte*>(dest);
+         const dt_byte* src_ptr = reinterpret_cast<const dt_byte*>(src);
+         for (size_t i = 0; i < n / dim; ++i) {
+             lowbit_memcpy_compact2byte(layout.dtype, dest_ptr, src_ptr, dim);
+             dest_ptr += dim;
+             src_ptr += stride;
+         }
+     } else {
+         lowbit_memcpy_compact2byte(layout.dtype, dest, src,
+                                    layout.total_nr_elems());
+     }
+ }
+
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
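As a dependency-free illustration of the row padding these helpers deal with (hypothetical code, not the library implementation; the nibble order is an assumption): when the innermost dimension times the bit width is not a multiple of 8, each row of the aligned representation occupies ceil(dim * low_bit / 8) bytes, so the two sides advance by different amounts per row, exactly as in the loops above.

    // Standalone sketch: expand a row-padded int4 buffer to one byte per
    // element (the aligned2byte direction). Nibble order is an assumption.
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    std::vector<std::uint8_t> aligned_int4_to_bytes(const std::uint8_t* src,
                                                    std::size_t rows,
                                                    std::size_t dim) {
        const std::size_t row_bytes = (dim * 4 + 7) / 8;  // padded bytes per row
        std::vector<std::uint8_t> out(rows * dim);
        for (std::size_t r = 0; r < rows; ++r) {
            const std::uint8_t* row = src + r * row_bytes;  // padded stride
            for (std::size_t i = 0; i < dim; ++i)           // one byte per element
                out[r * dim + i] = (row[i / 2] >> ((i % 2) * 4)) & 0xF;
        }
        return out;
    }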

+7 -3   src/core/impl/graph/var_node_mem_mgr.cpp

@@ -1340,15 +1340,19 @@ void VarNodeMemManager::make_dev_tensor_from_mem_plan_single(
void VarNodeMemManager::var_alloc_with_shape(VarNode* var,
                                             const TensorShape& shape,
                                             size_t size_req) {
-   mgb_assert(var->format().is_default(),
+   bool cond_default = var->format().is_default();
+   bool cond_lowbit = var->dtype().is_quantized_lowbit() &&
+                      var->format().is_lowbit_aligned();
+   mgb_assert(cond_default || cond_lowbit,
               "dynamic shape is currently only supported for var with "
               "default format; got %s",
               var->format().to_string().c_str());
    var->shape(shape);
+   TensorLayout ly{shape, var->dtype()};
    if (size_req != 0) {
-       mgb_assert(var->dtype().size(shape.total_nr_elems()) <= size_req);
+       mgb_assert(ly.span().dist_byte() <= size_req);
    } else {
-       size_req = var->dtype().size(shape.total_nr_elems());
+       size_req = ly.span().dist_byte();
    }

    auto&& mplan = var->m_mem_plan;
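Why the switch from dtype().size(total_nr_elems()) to span().dist_byte() matters, as a hedged arithmetic sketch (shape and dtype are illustrative): for a byte-aligned 4-bit layout the rows may carry padding, so the packed element count underestimates the bytes the layout actually spans.

    // Standalone arithmetic only; an 8x5 QuantizedS4 tensor with rows padded
    // to whole bytes.
    #include <cstddef>
    #include <cstdio>

    int main() {
        const std::size_t rows = 8, cols = 5, bits = 4;
        std::size_t compact = (rows * cols * bits + 7) / 8;  // dtype-size view: 20 bytes
        std::size_t row_bytes = (cols * bits + 7) / 8;        // ceil(5 * 4 / 8) = 3
        std::size_t aligned = rows * row_bytes;               // span view: 24 bytes
        std::printf("compact=%zu aligned=%zu\n", compact, aligned);
    }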


+11 -0   src/core/include/megbrain/dtype.h

@@ -202,6 +202,17 @@ void lowbit_memcpy_byte2compact(
void lowbit_memcpy_compact2byte(
        DType dtype, void *dest, const void *src, size_t n);

+ /*!
+  * \brief copy from byte representation to an aligned tensor for lowbit types
+  */
+ void lowbit_memcpy_byte2aligned(void* dest, const void* src,
+                                 const ::megdnn::TensorLayout& ly);
+
+ /*!
+  * \brief copy from an aligned tensor to byte representation for lowbit types
+  */
+ void lowbit_memcpy_aligned2byte(void* dest, const void* src,
+                                 const ::megdnn::TensorLayout& ly);

} // namespace mgb




+0 -308   src/gopt/test/inference.cpp

@@ -4454,314 +4454,6 @@ TEST(TestGoptInference, PaddingChannelsWithWarpPerspective) {
    MGB_ASSERT_TENSOR_EQ(t1, t2);
}


TEST(TestGoptInference, EnableNCHW64Basic) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
cn.activate();
REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);

HostTensorGenerator<dtype::Int8> gen;
auto graph = ComputingGraph::make();
graph->options().graph_opt_level = 0;
auto mkvar = [&](const char* name, const TensorShape& shp,
const DType& dtype) {
return opr::TypeCvt::make(
opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
dtype);
};
auto mkcvar = [&](const char* name, const TensorShape& shp,
const DType& dtype) {
return opr::TypeCvt::make(
opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
.rename(name),
dtype);
};

auto x = mkvar("x", {16, 4, 14, 14}, dtype::QuantizedS8(2.5f)),
w = mkcvar("w", {16, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
b = mkcvar("b", {1, 16, 1, 1}, dtype::QuantizedS32(6.25f));
opr::ConvBias::Param param;
param.format = opr::ConvBias::Param::Format::NCHW;
param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY;
param.stride_h = param.stride_w = 1;
param.pad_h = param.pad_w = 1;

auto y = opr::ConvBias::make(x, w, b, param, {},
OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
auto w1 = mkcvar("w1", {32, 16, 3, 3}, dtype::QuantizedS8(2.5f)),
b1 = mkcvar("b1", {1, 32, 1, 1}, dtype::QuantizedS32(6.25f));
auto y1 = opr::ConvBias::make(y, w1, b1, param, {},
OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
auto w2 = mkcvar("w2", {64, 32, 3, 3}, dtype::QuantizedS8(2.5f)),
b2 = mkcvar("b2", {1, 64, 1, 1}, dtype::QuantizedS32(6.25f));
auto y2 = opr::ConvBias::make(y1, w2, b2, param, {},
OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
y2 = opr::TypeCvt::make(y2, dtype::QuantizedS4{40.f});
auto w3 = mkcvar("w3", {64, 64, 3, 3}, dtype::QuantizedS4(2.5f)),
b3 = mkcvar("b3", {1, 64, 1, 1}, dtype::QuantizedS32(100.f));
auto y3 = opr::ConvBias::make(y2, w3, b3, param, {},
OperatorNodeConfig{dtype::QuantizedS4{40.f}});
y3 = opr::TypeCvt::make(y3, dtype::QuantizedS8{2.5f});
auto w4 = mkcvar("w4", {16, 64, 3, 3}, dtype::QuantizedS8(2.5f)),
b4 = mkcvar("b4", {1, 16, 1, 1}, dtype::QuantizedS32(6.25f));
auto y4 = opr::ConvBias::make(y3, w4, b4, param, {},
OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
using ElemMultiMode = opr::ElemwiseMultiType::Param::Mode;
auto y5 = opr::ElemwiseMultiType::make(
{y, y4}, {ElemMultiMode::QFUSE_ADD_RELU},
OperatorNodeConfig{dtype::QuantizedS8{1.3f}});
y5 = opr::TypeCvt::make(y5, dtype::Float32());
SymbolVar y5_pad;
unpack_vector(
gopt::GraphOptimizer{}
.add_pass(gopt::EnableNCHW64Pass::make_nchw64_converter())
.apply({{y5}})
.endpoint_vars(),
y5_pad);
EXPECT_TRUE(y5.node()->shape().eq_shape(y5_pad.node()->shape()));
SmallVector<cg::OperatorNodeBase*> oprs;
auto cb = [&oprs](cg::OperatorNodeBase* opr) {
if (opr->same_type<opr::ConvBias>()) {
oprs.push_back(opr);
}
};
cg::DepOprIter{cb}.add(y5_pad.node()->owner_opr());
ASSERT_EQ(oprs.size(), 5);
using Format = opr::ConvBiasForward::Param::Format;
#define CHECK(_i, _fmt) \
{ \
const auto& o = oprs[_i]->cast_final<opr::ConvBias>(); \
ASSERT_EQ(o.param().format, Format::_fmt); \
}
CHECK(0, NCHW4);
CHECK(1, NCHW4);
CHECK(2, NCHW32);
CHECK(3, NCHW64);
CHECK(4, NCHW4);
#undef CHECK
HostTensorND t1, t2;
auto func1 = graph->compile({make_callback_copy(y5, t1)});
func1->execute();
auto func2 = graph->compile({make_callback_copy(y5_pad, t2)});
func2->execute();
MGB_ASSERT_TENSOR_EQ(t1, t2);
}

TEST(TestGoptInference, EnableNCHW64PaddingChannel) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
cn.activate();
REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);

HostTensorGenerator<dtype::Int8> gen;
auto graph = ComputingGraph::make();
graph->options().graph_opt_level = 0;
auto mkvar = [&](const char* name, const TensorShape& shp,
const DType& dtype) {
return opr::TypeCvt::make(
opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
dtype);
};
auto mkcvar = [&](const char* name, const TensorShape& shp,
const DType& dtype) {
return opr::TypeCvt::make(
opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
.rename(name),
dtype);
};

auto x = mkvar("x", {16, 4, 14, 14}, dtype::QuantizedS8(2.5f)),
w = mkcvar("w", {20, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
b = mkcvar("b", {1, 20, 1, 1}, dtype::QuantizedS32(6.25f));
opr::ConvBias::Param param;
param.format = opr::ConvBias::Param::Format::NCHW;
param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY;
param.stride_h = param.stride_w = 1;
param.pad_h = param.pad_w = 1;

auto y = opr::ConvBias::make(x, w, b, param, {},
OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
opr::Pooling::Param pool;
pool.format = opr::Pooling::Param::Format::NCHW;
y = opr::Pooling::make(y, pool);

auto w1 = mkcvar("w1", {24, 20, 3, 3}, dtype::QuantizedS8(2.5f)),
b1 = mkcvar("b1", {1, 24, 1, 1}, dtype::QuantizedS32(6.25f));
auto y1 = opr::ConvBias::make(y, w1, b1, param, {},
OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
auto w2 = mkcvar("w2", {20, 24, 3, 3}, dtype::QuantizedS8(2.5f)),
b2 = mkcvar("b2", {1, 20, 1, 1}, dtype::QuantizedS32(6.25f));
auto y2 = opr::ConvBias::make(y1, w2, b2, param, {},
OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
y2 = opr::TypeCvt::make(y2, dtype::QuantizedS4{40.f});
auto w3 = mkcvar("w3", {64, 20, 3, 3}, dtype::QuantizedS4(2.5f)),
b3 = mkcvar("b3", {1, 64, 1, 1}, dtype::QuantizedS32(100.f));
auto y3 = opr::ConvBias::make(y2, w3, b3, param, {},
OperatorNodeConfig{dtype::QuantizedS4{40.f}});
auto w4 = mkcvar("w4", {20, 64, 3, 3}, dtype::QuantizedS4(2.5f)),
b4 = mkcvar("b4", {1, 20, 1, 1}, dtype::QuantizedS32(100.f));
auto y4 = opr::ConvBias::make(y3, w4, b4, param, {},
OperatorNodeConfig{dtype::QuantizedS4{40.f}});
y4 = opr::TypeCvt::make(y4, dtype::QuantizedS8{2.5f});
using ElemMultiMode = opr::ElemwiseMultiType::Param::Mode;
auto y5 = opr::ElemwiseMultiType::make(
{y, y4}, {ElemMultiMode::QFUSE_ADD_RELU},
OperatorNodeConfig{dtype::QuantizedS8{1.2f}});
opr::ConvolutionBackwardData::Param deconv;
deconv.format = opr::ConvolutionBackwardData::Param::Format::NCHW;
deconv.stride_h = deconv.stride_w = 2;
deconv.pad_h = deconv.pad_w = 1;
auto w6 = mkcvar("w6", {20, 64, 4, 4}, dtype::QuantizedS8{2.5f});
auto y6 = opr::ConvolutionBackwardData::make(
w6, y5, deconv, {},
OperatorNodeConfig{dtype::QuantizedS8(2.0f)});
y6 = opr::TypeCvt::make(y6, dtype::QuantizedS4{32.f});

std::shared_ptr<HostTensorND> mat = std::make_shared<HostTensorND>(
cn, TensorShape{16, 3, 3}, dtype::Float32());
warp_perspective_mat_gen(*mat, 16, 14, 14);
auto mat_var = opr::Host2DeviceCopy::make(*graph, mat).rename("mat");
opr::WarpPerspective::Param warp_param;
warp_param.format = opr::WarpPerspective::Param::Format::NCHW;
auto y7 = opr::WarpPerspective::make(y6, mat_var, TensorShape{14, 14},
warp_param);
y7 = opr::TypeCvt::make(y7, dtype::Float32());
SymbolVar y7_pad;
auto opt = gopt::OptimizeForInferenceOptions{};
opt.enable_nchw64();
unpack_vector(gopt::optimize_for_inference({y7}, opt), y7_pad);
EXPECT_TRUE(y7.node()->shape().eq_shape(y7_pad.node()->shape()));
HostTensorND t1, t2;
auto func1 = graph->compile({make_callback_copy(y7, t1)});
func1->execute();
auto func2 = graph->compile({make_callback_copy(y7_pad, t2)});
func2->execute();
MGB_ASSERT_TENSOR_EQ(t1, t2);
using Format = opr::ConvBiasForward::Param::Format;
SmallVector<cg::OperatorNodeBase*> oprs;
auto cb = [&oprs](cg::OperatorNodeBase* opr) {
if (opr->same_type<opr::ConvBias>()) {
oprs.push_back(opr);
}
};
cg::DepOprIter{cb}.add(y7_pad.node()->owner_opr());
ASSERT_EQ(oprs.size(), 5);
#define CHECK(_i, _fmt) \
{ \
const auto& o = oprs[_i]->cast_final<opr::ConvBias>(); \
ASSERT_EQ(o.param().format, Format::_fmt); \
}
CHECK(0, NCHW4);
CHECK(1, NCHW32);
CHECK(2, NCHW32);
CHECK(3, NCHW64);
CHECK(4, NCHW64);
#undef CHECK
{
const auto& deconv = find_opr<opr::ConvolutionBackwardData>(y7_pad);
ASSERT_EQ(deconv.param().format, Format::NCHW4);
const auto& pool = find_opr<opr::PoolingForward>(y7_pad);
ASSERT_EQ(pool.param().format, Format::NCHW4);
const auto& warp = find_opr<opr::WarpPerspectiveForward>(y7_pad);
ASSERT_EQ(warp.param().format, Format::NCHW64);
}
size_t nr_dimshuffle = find_opr_num<opr::Dimshuffle>(y7_pad);
ASSERT_EQ(nr_dimshuffle, 8);
}

TEST(TestGoptInference, EnableNCHW64FuseConvBiasZ) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
cn.activate();
REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);

HostTensorND t1, t2;
HostTensorGenerator<dtype::Int8> gen;
auto graph = ComputingGraph::make();
graph->options().graph_opt_level = 0;
auto mkvar = [&](const char* name, const TensorShape& shp,
const DType& dtype) {
return opr::TypeCvt::make(
opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
dtype);
};
auto mkcvar = [&](const char* name, const TensorShape& shp,
const DType& dtype) {
return opr::TypeCvt::make(
opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
.rename(name),
dtype);
};

auto x = mkvar("x", {16, 4, 14, 14}, dtype::QuantizedS8(2.5f)),
w = mkcvar("w", {32, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
b = mkcvar("b", {1, 32, 1, 1}, dtype::QuantizedS32(6.25f));
opr::ConvBias::Param param;
param.format = opr::ConvBias::Param::Format::NCHW;
param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY;
param.stride_h = param.stride_w = 1;
param.pad_h = param.pad_w = 1;

auto y = opr::ConvBias::make(x, w, b, param, {},
OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
auto w1 = mkcvar("w1", {64, 32, 3, 3}, dtype::QuantizedS8(2.5f)),
b1 = mkcvar("b1", {1, 64, 1, 1}, dtype::QuantizedS32(6.25f));
auto y1 = opr::ConvBias::make(y, w1, b1, param, {},
OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
y1 = opr::TypeCvt::make(y1, dtype::QuantizedS4{40.f});
auto w2 = mkcvar("w2", {64, 64, 3, 3}, dtype::QuantizedS4(2.5f)),
b2 = mkcvar("b2", {1, 64, 1, 1}, dtype::QuantizedS32(100.f));
auto y2 = opr::ConvBias::make(y1, w2, b2, param, {},
OperatorNodeConfig{dtype::QuantizedS4{40.f}});
auto w3 = mkcvar("w3", {64, 64, 3, 3}, dtype::QuantizedS4(2.5f)),
b3 = mkcvar("b3", {1, 64, 1, 1}, dtype::QuantizedS32(100.f));
auto y3 = opr::ConvBias::make(y2, w3, b3, param, {},
OperatorNodeConfig{dtype::QuantizedS4(40.f)});
using ElemMultiMode = opr::ElemwiseMultiType::Param::Mode;
auto y4 = opr::ElemwiseMultiType::make(
{y1, y3}, {ElemMultiMode::QFUSE_ADD_RELU},
OperatorNodeConfig{dtype::QuantizedS4{40.f}});
y4 = opr::TypeCvt::make(y4, dtype::Float32());
auto y5 = opr::ConvBias::make(y2, w3, b3, y1, param, {},
OperatorNodeConfig{dtype::QuantizedS4(40.f)});
y5 = opr::TypeCvt::make(y5, dtype::Float32());
SymbolVar y4_pad;
auto opt = gopt::OptimizeForInferenceOptions{};
opt.enable_nchw64();
unpack_vector(gopt::optimize_for_inference({y4}, opt), y4_pad);
EXPECT_TRUE(y4.node()->shape().eq_shape(y4_pad.node()->shape()));
size_t nr_elem_mult_type = find_opr_num<opr::ElemwiseMultiType>(y4_pad);
ASSERT_EQ(nr_elem_mult_type, 0);
auto func = graph->compile({make_callback_copy(y4_pad, t1)});
func->execute();

{
opr::ConvBias::Param param;
param.format = opr::ConvBias::Param::Format::NCHW;
param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY;
param.stride_h = param.stride_w = 1;
param.pad_h = param.pad_w = 1;

auto y = opr::ConvBias::make(
x, w, b, param, {},
OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
auto y1 = opr::ConvBias::make(
y, w1, b1, param, {},
OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
y1 = opr::TypeCvt::make(y1, dtype::QuantizedS4{40.f});
auto y2 = opr::ConvBias::make(
y1, w2, b2, param, {},
OperatorNodeConfig{dtype::QuantizedS4{40.f}});
param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
auto y3 = opr::ConvBias::make(
y2, w3, b3, y1, param, {},
OperatorNodeConfig{dtype::QuantizedS4(40.f)});
y3 = opr::TypeCvt::make(y3, dtype::Float32());
auto func = graph->compile({make_callback_copy(y3, t2)});
func->execute();
}
MGB_ASSERT_TENSOR_EQ(t1, t2);
}


#endif




+4 -172   src/opr/test/dnn/convolution.cpp

@@ -2604,174 +2604,6 @@ TEST_F(TestNoWeightPreprocess, NoPreprocess) {
#endif


namespace {
TEST(TestOprDNN, ConvBiasInt4NCHW) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
cn.activate();
auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
auto sm_ver = prop.major * 10 + prop.minor;
if (sm_ver != 75) {
printf("This testcast ignored due to insufficient cuda cap(got: %d, "
"expected: %d)\n",
sm_ver, 75);
return;
}

auto run = [&cn](size_t N, size_t C, size_t H, size_t W, size_t F, size_t S,
size_t P) {
auto graph = ComputingGraph::make();

HostTensorGenerator<dtype::Int8> gen;
auto mkvar = [&gen](const char* name, const TensorShape& shp,
const DType& dtype,
std::shared_ptr<ComputingGraph> graph,
const CompNode& cn) {
return opr::TypeCvt::make(
opr::Host2DeviceCopy::make(*graph, gen(shp, cn))
.rename(name),
dtype);
};
auto mkcvar = [&gen](const char* name, const TensorShape& shp,
const DType& dtype,
std::shared_ptr<ComputingGraph> graph,
const CompNode& cn) {
return opr::TypeCvt::make(
opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
.rename(name),
dtype);
};

using Policy = opr::ConvBias::ExecutionPolicy;
using Strategy = Policy::Strategy;
auto x = mkvar("x", {N, C * 4, H, W}, dtype::QuantizedS4(1.19960327f),
graph, cn),
w = mkcvar("w1", {C, C * 4, F, F}, dtype::QuantizedS4(1.19970327f),
graph, cn),
b = mkcvar("b1", {1, C, 1, 1},
dtype::QuantizedS32(1.19960327f * 1.19970327f), graph,
cn);
opr::ConvBias::Param param;
param.format = opr::ConvBias::Param::Format::NCHW;
param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
param.stride_h = param.stride_w = S;
param.pad_h = param.pad_w = P;
Policy policy;
policy.strategy = Strategy::PROFILE;

auto y = opr::ConvBias::make(
x, w, b, param, policy,
OperatorNodeConfig{dtype::QuantizedS4(11.9960501f)});
y = opr::TypeCvt::make(y, dtype::Float32());
auto x_f32 = opr::TypeCvt::make(x, dtype::Float32()),
w_f32 = opr::TypeCvt::make(w, dtype::Float32()),
b_f32 = opr::TypeCvt::make(b, dtype::Float32());
auto y_f32 = opr::ConvBias::make(x_f32, w_f32, b_f32, param, policy);
auto y_q4 = opr::TypeCvt::make(y_f32, dtype::QuantizedS4{11.9960501f});
y_q4 = opr::TypeCvt::make(y_q4, dtype::Float32());
HostTensorND host_y, host_y_q4;
auto func = graph->compile({make_callback_copy(y, host_y),
make_callback_copy(y_q4, host_y_q4)});
func->execute();
MGB_ASSERT_TENSOR_NEAR(host_y, host_y_q4, 1e-3);
};
run(2, 64, 14, 14, 3, 2, 1);
run(2, 64, 7, 7, 3, 1, 1);
run(2, 64, 14, 14, 1, 2, 0);
run(2, 64, 7, 7, 1, 1, 0);
}

TEST(TestOprDNN, ConvBiasInt4NCHW64) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
cn.activate();
auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
auto sm_ver = prop.major * 10 + prop.minor;
if (sm_ver != 75) {
printf("This testcast ignored due to insufficient cuda cap(got: %d, "
"expected: %d)\n",
sm_ver, 75);
return;
}

auto nchw2nchw64 = [](SymbolVar x) {
auto y = opr::RelayoutFormat::make(
x, opr::RelayoutFormat::Param::Mode::NCHW_NCHW64);
return y;
};

auto nchw642nchw = [](SymbolVar x) {
auto y = opr::RelayoutFormat::make(
x, opr::RelayoutFormat::Param::Mode::NCHW64_NCHW);
return y;
};

auto run = [&](size_t N, size_t C, size_t H, size_t W, size_t F, size_t S,
size_t P) {
auto graph = ComputingGraph::make();

HostTensorGenerator<dtype::Int8> gen;
auto mkvar = [&gen](const char* name, const TensorShape& shp,
const DType& dtype,
std::shared_ptr<ComputingGraph> graph,
const CompNode& cn) {
return opr::TypeCvt::make(
opr::Host2DeviceCopy::make(*graph, gen(shp, cn))
.rename(name),
dtype);
};
auto mkcvar = [&gen](const char* name, const TensorShape& shp,
const DType& dtype,
std::shared_ptr<ComputingGraph> graph,
const CompNode& cn) {
return opr::TypeCvt::make(
opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
.rename(name),
dtype);
};

using Policy = opr::ConvBias::ExecutionPolicy;
using Strategy = Policy::Strategy;
auto x = mkvar("x", {N, C / 16, H, W, 64},
dtype::QuantizedS4(1.19960327f), graph, cn),
w = mkcvar("w1", {C, C / 16, F, F, 64},
dtype::QuantizedS4(1.19970327f), graph, cn),
b = mkcvar("b1", {1, C / 64, 1, 1, 64},
dtype::QuantizedS32(1.19960327f * 1.19970327f), graph,
cn);
opr::ConvBias::Param param;
param.format = opr::ConvBias::Param::Format::NCHW64;
param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
param.stride_h = param.stride_w = S;
param.pad_h = param.pad_w = P;
Policy policy;
policy.strategy = Strategy::PROFILE;

auto y = opr::ConvBias::make(
x, w, b, param, policy,
OperatorNodeConfig{dtype::QuantizedS4(11.9960501f)});
y = opr::TypeCvt::make(y, dtype::Float32());
x = nchw642nchw(x);
w = nchw642nchw(w);
b = nchw642nchw(b);
auto x_f32 = opr::TypeCvt::make(x, dtype::Float32()),
w_f32 = opr::TypeCvt::make(w, dtype::Float32()),
b_f32 = opr::TypeCvt::make(b, dtype::Float32());
param.format = opr::ConvBias::Param::Format::NCHW;
auto y_f32 = opr::ConvBias::make(x_f32, w_f32, b_f32, param, policy);
auto y_q4 = opr::TypeCvt::make(y_f32, dtype::QuantizedS4{11.9960501f});
y_q4 = opr::TypeCvt::make(y_q4, dtype::Float32());
y_q4 = nchw2nchw64(y_q4);
HostTensorND host_y, host_y_q4;
auto func = graph->compile({make_callback_copy(y, host_y),
make_callback_copy(y_q4, host_y_q4)});
func->execute();
MGB_ASSERT_TENSOR_NEAR(host_y, host_y_q4, 1e-3);
};
run(2, 64, 14, 14, 3, 2, 1);
run(2, 64, 7, 7, 3, 1, 1);
run(2, 64, 14, 14, 1, 2, 0);
run(2, 64, 7, 7, 1, 1, 0);
}


TEST(TestOprDNN, ConvBiasInt4Serialize) {
    using namespace serialization;
@@ -2783,7 +2615,7 @@ TEST(TestOprDNN, ConvBiasInt4Serialize) {


    HostTensorGenerator<dtype::Int8> gen;
    std::shared_ptr<HostTensorND> xv;
-   auto mkvar = [&gen](const char* name, const DType& dtype,
+   auto mkvar = [](const char* name, const DType& dtype,
                    std::shared_ptr<ComputingGraph> graph,
                    std::shared_ptr<HostTensorND> val) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, val).rename(name), dtype);


HostTensorGenerator<dtype::Int8> gen; HostTensorGenerator<dtype::Int8> gen;
std::shared_ptr<HostTensorND> xv; std::shared_ptr<HostTensorND> xv;
auto mkvar = [&gen](const char* name, const DType& dtype,
std::shared_ptr<ComputingGraph> graph,
std::shared_ptr<HostTensorND> val) {
auto mkvar = [](const char* name, const DType& dtype,
std::shared_ptr<ComputingGraph> graph,
std::shared_ptr<HostTensorND> val) {
return opr::TypeCvt::make( return opr::TypeCvt::make(
opr::Host2DeviceCopy::make(*graph, val).rename(name), dtype); opr::Host2DeviceCopy::make(*graph, val).rename(name), dtype);
}; };


+9 -3   src/serialization/impl/serializer_oss.cpp

@@ -62,7 +62,12 @@ bool contains_any_in_set(const SmallVector<T>& list,


void check_tensor_value_valid(const std::string& name,
                              const HostTensorND& tensor) {
-   mgb_assert(tensor.layout().is_physical_contiguous(),
+   bool cond_normal = tensor.layout().format.is_default() &&
+                      tensor.layout().is_physical_contiguous();
+   bool cond_lowbit = tensor.layout().dtype.is_quantized_lowbit() &&
+                      tensor.layout().format.is_lowbit_aligned() &&
+                      tensor.layout().is_contiguous();
+   mgb_assert(cond_normal || cond_lowbit,
               "non-contiguous tensor: name=%s layout=%s", name.c_str(),
               tensor.layout().to_string().c_str());
    if (tensor.dtype() == dtype::Float32()) {
@@ -585,11 +590,12 @@ TensorLayout load_tensor_layout(const fbs::Tensor* tensor) {
        layout.ndim = tensor->shape()->size();
        std::copy(tensor->shape()->begin(), tensor->shape()->end(),
                  layout.shape);
-       layout.init_contiguous_stride();
    }
    if (tensor->dtype()) {
-       layout.dtype = fbs::intl::load_dtype(tensor->dtype());
+       // modify data type inplace for TensorLayout
+       layout.modify_dtype_inplace(fbs::intl::load_dtype(tensor->dtype()));
    }
+   layout.init_contiguous_stride();
    return layout;
}



