From 86b69cacd0c17a3d02a66f6f226cd16e7ab534ea Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Mon, 26 Apr 2021 17:20:49 +0800 Subject: [PATCH] fix(dnn): fixes for int4 GitOrigin-RevId: 845e164fd3143d2092627374167e74f37b952aed --- dnn/include/megdnn/basic_types.h | 7 + dnn/include/megdnn/dtype.h | 9 + dnn/include/megdnn/tensor_format.h | 2 +- dnn/src/common/basic_types.cpp | 5 + dnn/src/common/convolution.cpp | 7 +- dnn/src/common/tensor_format.cpp | 33 +-- dnn/src/cuda/conv_bias/fallback_nchw_qs4.cpp | 8 +- dnn/src/cuda/conv_bias/opr_impl.h | 1 - dnn/src/naive/matrix_mul/matrix_mul_helper.h | 7 +- dnn/src/naive/pooling/opr_impl.cpp | 7 +- dnn/test/common/checker.h | 1 - dnn/test/common/test_basic_types.cpp | 9 +- dnn/test/common/utils.h | 8 + dnn/test/naive/warp_perspective.cpp | 39 ++-- imperative/python/src/helper.cpp | 38 ++-- imperative/src/impl/physical_tensor.cpp | 2 +- src/core/impl/dtype.cpp | 53 +++-- src/core/impl/graph/var_node_mem_mgr.cpp | 10 +- src/core/include/megbrain/dtype.h | 11 + src/gopt/test/inference.cpp | 308 --------------------------- src/opr/test/dnn/convolution.cpp | 176 +-------------- src/serialization/impl/serializer_oss.cpp | 12 +- 22 files changed, 184 insertions(+), 569 deletions(-) diff --git a/dnn/include/megdnn/basic_types.h b/dnn/include/megdnn/basic_types.h index 2019f936..c10cd3b9 100644 --- a/dnn/include/megdnn/basic_types.h +++ b/dnn/include/megdnn/basic_types.h @@ -281,6 +281,13 @@ struct TensorLayout : public TensorShape { add_axis_inplace(axis, 1, stride[axis] * shape[axis]); } + /*! + * \brief modify data type of the layout inplace + * + * By the way this API will modify the format according to the data type + */ + void modify_dtype_inplace(DType dtype); + /* =================== generate new layout =================== */ /** diff --git a/dnn/include/megdnn/dtype.h b/dnn/include/megdnn/dtype.h index 74101e3a..dcc57cfe 100644 --- a/dnn/include/megdnn/dtype.h +++ b/dnn/include/megdnn/dtype.h @@ -513,6 +513,15 @@ class DType { bool is_low_bit() const { return low_bit() != 0; } + bool is_quantized_lowbit() const { + return low_bit() != 0 && +#if MEGDNN_CC_HOST + category() == DTypeCategory::QUANTIZED; +#else + category().ev == DTypeCategory::Ev::QUANTIZED; +#endif + } + /*! * \brief size of this data type, in bytes */ diff --git a/dnn/include/megdnn/tensor_format.h b/dnn/include/megdnn/tensor_format.h index 4a0bb570..bb9b68a7 100644 --- a/dnn/include/megdnn/tensor_format.h +++ b/dnn/include/megdnn/tensor_format.h @@ -226,7 +226,7 @@ public: std::string to_string() const override; //! 
raise exception if given layout is illegal - void assert_valid(const TensorLayout& layout) const; + void assert_valid(const TensorLayout& layout) const override; void serialize_append(std::string& result) const override; diff --git a/dnn/src/common/basic_types.cpp b/dnn/src/common/basic_types.cpp index d6800bd1..96de41d3 100644 --- a/dnn/src/common/basic_types.cpp +++ b/dnn/src/common/basic_types.cpp @@ -282,6 +282,11 @@ void TensorLayout::add_axis_inplace(size_t axis, size_t shape, this->stride[axis] = stride; } +void TensorLayout::modify_dtype_inplace(DType dtype_) { + dtype = dtype_; + format = Format(dtype); +} + bool TensorLayout::is_contiguous() const { return format.impl()->is_contiguous_spec(*this); } diff --git a/dnn/src/common/convolution.cpp b/dnn/src/common/convolution.cpp index be94b9ec..0730b2bf 100644 --- a/dnn/src/common/convolution.cpp +++ b/dnn/src/common/convolution.cpp @@ -952,7 +952,12 @@ ConvolutionBase::deduce_layout_fwd(const TensorLayout& src, megdnn_assert(src[4] == 4); dst[4] = 4; } - dst.format = src.format; + if (!src.format.is_default() && + !src.format.is_lowbit_aligned()) { // propagate + dst.format = src.format; + } else { // determined by dtype + dst.format = TensorFormat(dst.dtype); + } dst.init_contiguous_stride(); return cflt; } diff --git a/dnn/src/common/tensor_format.cpp b/dnn/src/common/tensor_format.cpp index ae1841ad..ac4736ad 100644 --- a/dnn/src/common/tensor_format.cpp +++ b/dnn/src/common/tensor_format.cpp @@ -46,14 +46,15 @@ TensorFormat TensorFormat::deserialize(const std::string& bin, TensorFormat::Format() : m_impl{DefaultTensorFormat::make().m_impl} {} TensorFormat::Format(DType dtype) { - megdnn_assert(dtype.valid()); - if (dtype.is_low_bit()) { + if (dtype.valid() && + dtype.is_quantized_lowbit()) { // quantized lowbit, by default + // aligned to bytes size_t size_nbits = dtype.low_bit(); megdnn_assert(size_nbits == 1 || size_nbits == 2 || size_nbits == 4, "unsupported lowbits data type(%s, size in bits: %zu)", dtype.name(), size_nbits); m_impl = LowbitsAlignedToBytesTensorFormat::make(size_nbits).m_impl; - } else { + } else { // non parameterized lowbit, default format m_impl = DefaultTensorFormat::make().m_impl; } } @@ -89,8 +90,8 @@ bool TensorFormat::is_lowbit_aligned() const { /* ===================== DefaultFormat ===================== */ void DefaultTensorFormat::assert_valid(const TensorLayout& layout) const { megdnn_assert( - !layout.dtype.valid() || !layout.dtype.is_low_bit(), - "DefaultTensorFormat does not support low-bits tensor(dtype:%s)", + !layout.dtype.valid() || !layout.dtype.is_quantized_lowbit(), + "DefaultTensorFormat does not support quantized lowbit tensor(dtype:%s)", layout.dtype.name()); } @@ -271,7 +272,8 @@ void Image2DPackedTensorFormatBase::assert_valid( auto m_align_axis = align_axis(); megdnn_assert(!(layout.shape[layout.ndim - 1] % PIXEL_SIZE), "bad shape: %zu", layout.shape[layout.ndim - 1]); - megdnn_assert(layout.dtype.valid() && layout.ndim > m_align_axis); + megdnn_assert(layout.dtype.valid() && !layout.dtype.is_quantized_lowbit() && + layout.ndim > m_align_axis); ptrdiff_t first_non_zero_stride = 0; for (int i = layout.ndim - 1; i >= 0; --i) { megdnn_assert(layout.shape[i] && layout.stride[i] >= 0); @@ -478,6 +480,7 @@ void LowbitsAlignedTensorFormatBase::assert_valid( megdnn_assert(layout.dtype.valid() && layout.dtype.is_low_bit() && layout.dtype.low_bit() == m_size_nbits); bool has_dim_unity_stride = false; + bool has_dim_aligned_stride = false; for (int i = layout.ndim - 1; i >= 0; --i) { if 
(!has_dim_unity_stride && layout.stride[i] == 1) has_dim_unity_stride = true; @@ -485,15 +488,16 @@ void LowbitsAlignedTensorFormatBase::assert_valid( layout.stride[i] >= 0 && (layout.stride[i] % m_align_size_in_elements == 0 || layout.stride[i] == 1), - "bad stride:%s, %zu", layout.to_string().c_str(), - layout.stride[i]); + "bad stride:%s, %ld", layout.to_string().c_str(), + static_cast(layout.stride[i])); + if (!has_dim_aligned_stride && + static_cast(layout.stride[i]) == m_align_size_in_elements) + has_dim_aligned_stride = true; } - if (!has_dim_unity_stride && - (int)layout.stride[layout.ndim - 1] == - round_up(1, (int)m_align_size_in_elements)) - has_dim_unity_stride = true; - megdnn_assert(layout.ndim == 0 || has_dim_unity_stride, - "innermost dim not contiguous"); + + megdnn_assert( + layout.ndim == 0 || has_dim_unity_stride || has_dim_aligned_stride, + "innermost dim not contiguous"); } void LowbitsAlignedTensorFormatBase::serialize_append( @@ -542,6 +546,7 @@ size_t LowbitsAlignedTensorFormatBase::init_contiguous_stride( multiplier = round_up(multiplier, m_align_size_in_elements); accum = mul(accum, multiplier); } + assert_valid(layout); return accum; } diff --git a/dnn/src/cuda/conv_bias/fallback_nchw_qs4.cpp b/dnn/src/cuda/conv_bias/fallback_nchw_qs4.cpp index f89a7562..3d14f690 100644 --- a/dnn/src/cuda/conv_bias/fallback_nchw_qs4.cpp +++ b/dnn/src/cuda/conv_bias/fallback_nchw_qs4.cpp @@ -12,6 +12,7 @@ #include "./algo.h" #include "src/cuda/utils.h" +#include "src/common/conv_bias.h" using namespace megdnn; using namespace cuda; @@ -27,8 +28,7 @@ bool ConvBiasForwardImpl::AlgoFallbackNCHWQS4::is_available( bool available = true; auto&& param = args.opr->param(); auto&& fm = args.filter_meta; - if (!conv_bias::check_bias_share_in_channel(*(args.bias_layout), - param.format)) + if (!check_bias_share_in_channel(*(args.bias_layout), param.format)) return false; if (param.format != Format::NCHW) return false; @@ -128,7 +128,7 @@ void ConvBiasForwardImpl::AlgoFallbackNCHWQS4::exec( conv_op->param() = args.opr->param(); using Format = param::ConvBias::Format; conv_op->param().format = Format::NCHW64; - ExecArgs args_{dynamic_cast(conv_op.get()), + ExecArgs args_{reinterpret_cast(conv_op.get()), src_, filter_, bias_, @@ -190,7 +190,7 @@ WorkspaceBundle ConvBiasForwardImpl::AlgoFallbackNCHWQS4::get_workspace_bundle( conv_op->param() = args.opr->param(); using Format = param::ConvBias::Format; conv_op->param().format = Format::NCHW64; - SizeArgs args_{dynamic_cast(conv_op.get()), + SizeArgs args_{reinterpret_cast(conv_op.get()), layouts[0], layouts[1], layouts[2], diff --git a/dnn/src/cuda/conv_bias/opr_impl.h b/dnn/src/cuda/conv_bias/opr_impl.h index de3abd14..9d2b6ff5 100644 --- a/dnn/src/cuda/conv_bias/opr_impl.h +++ b/dnn/src/cuda/conv_bias/opr_impl.h @@ -64,7 +64,6 @@ public: class AlgoInt8CHWN4IMMAImplicitGemmReorderFilter; class AlgoInt8CHWN4IMMAImplicitGemmUnrollWidth; class AlgoInt8NCHW32IMMAImplicitGemm; - class AlgoFallbackNCHWQS4; class AlgoBFloat16; class AlgoPack; diff --git a/dnn/src/naive/matrix_mul/matrix_mul_helper.h b/dnn/src/naive/matrix_mul/matrix_mul_helper.h index 9ef5c484..610000c0 100644 --- a/dnn/src/naive/matrix_mul/matrix_mul_helper.h +++ b/dnn/src/naive/matrix_mul/matrix_mul_helper.h @@ -151,9 +151,9 @@ void exec_matrix_mul_quint4x4x32_helper( MEGDNN_MARK_USED_VAR(format); MEGDNN_MARK_USED_VAR(compute_mode); auto convert_layout = [](const TensorLayout& layout) { - auto ret = layout; auto param = layout.dtype.param(); - ret.dtype = 
dtype::Quantized8Asymm(param.scale, param.zero_point); + TensorLayout ret(layout, + dtype::Quantized8Asymm(param.scale, param.zero_point)); return ret; }; TensorLayout A_layout, B_layout; @@ -205,9 +205,8 @@ void exec_matrix_mul_qint4x4x16_helper( MEGDNN_MARK_USED_VAR(format); MEGDNN_MARK_USED_VAR(compute_mode); auto convert_layout = [](const TensorLayout& layout) { - auto ret = layout; auto param = layout.dtype.param(); - ret.dtype = dtype::QuantizedS8(param.scale); + TensorLayout ret(layout, dtype::QuantizedS8(param.scale)); return ret; }; TensorLayout A_layout, B_layout; diff --git a/dnn/src/naive/pooling/opr_impl.cpp b/dnn/src/naive/pooling/opr_impl.cpp index 902780c4..82cf4d31 100644 --- a/dnn/src/naive/pooling/opr_impl.cpp +++ b/dnn/src/naive/pooling/opr_impl.cpp @@ -406,8 +406,7 @@ size_t PoolingForwardImpl::get_workspace_in_bytes(const TensorLayout& src, } namespace { -void post_process(const TensorND& dst, TensorND& comp_dst, Handle* handle, - WorkspaceBundle& workspace_bundle) { +void post_process(const TensorND& dst, TensorND& comp_dst) { if (dst.layout.dtype.enumv() == DTypeEnum::QuantizedS4) { int8_to_int4(comp_dst, dst); } else if (dst.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) { @@ -427,8 +426,8 @@ void PoolingForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, if (src.layout.dtype.enumv() == DTypeEnum::QuantizedS4) { float scale = src.layout.dtype.param().scale; comp_src.layout.dtype = dtype::QuantizedS8(scale); - comp_src.layout.init_contiguous_stride(); comp_src.layout.format = TensorLayout::Format(comp_src.layout.dtype); + comp_src.layout.init_contiguous_stride(); comp_src.raw_ptr = wsb.get(0); comp_dst.layout.dtype = dtype::QuantizedS8(scale); comp_dst.layout.format = TensorLayout::Format(comp_dst.layout.dtype); @@ -571,7 +570,7 @@ void PoolingForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, default: \ megdnn_assert(0, "not support mode"); \ } \ - post_process(dst, comp_dst, handle(), wsb); \ + post_process(dst, comp_dst); \ return; \ } diff --git a/dnn/test/common/checker.h b/dnn/test/common/checker.h index e7d6e032..4241250d 100644 --- a/dnn/test/common/checker.h +++ b/dnn/test/common/checker.h @@ -132,7 +132,6 @@ public: : dtype::Float32()); if (m_fmt.find(i) == m_fmt.end()) { layouts[i] = TensorLayout(shapes[i], dt); - layouts[i].init_contiguous_stride(); } else layouts[i] = TensorLayout(shapes[i], dt, m_fmt[i]); } diff --git a/dnn/test/common/test_basic_types.cpp b/dnn/test/common/test_basic_types.cpp index 3df876eb..c377fde5 100644 --- a/dnn/test/common/test_basic_types.cpp +++ b/dnn/test/common/test_basic_types.cpp @@ -325,13 +325,8 @@ TEST(BASIC_TYPES, TENSOR_LAYOUT_FMT_LOW_BITS) { layout = TensorLayout{{1, 32, 1, 1}, dtype::QuantizedS4{1.2f}}; layout = layout.broadcast({16, 32, 7, 7}); - EXPECT_EQ(make_layout({16, 32, 49}, {0, 1, 0}, dtype::QuantizedS4{1.2}), + EXPECT_EQ(make_layout({16, 32, 49}, {0, 2, 0}, dtype::QuantizedS4{1.2}), layout.collapse_contiguous()); - - layout = TensorLayout{{1, 32, 1, 1}, dtype::QuantizedS4{1.2f}}; - layout.init_contiguous_stride(); - layout = layout.broadcast({16, 32, 7, 7}); - ASSERT_THROW(layout.span(), MegDNNError); } TEST(BASIC_TYPES, TENSOR_LAYOUT_FMT_LOW_BITS_VALID) { @@ -342,7 +337,7 @@ TEST(BASIC_TYPES, TENSOR_LAYOUT_FMT_LOW_BITS_VALID) { LowbitsAlignedToBytesTensorFormat::make(4_z)), MegDNNError); ASSERT_THROW(TensorLayout({16, 32, 7, 7}, dtype::IntB2{}, - LowbitsAlignedToBytesTensorFormat::make(2_z)), + LowbitsAlignedToBytesTensorFormat::make(4_z)), MegDNNError); } diff --git 
a/dnn/test/common/utils.h b/dnn/test/common/utils.h index 9974022f..6cb505dd 100644 --- a/dnn/test/common/utils.h +++ b/dnn/test/common/utils.h @@ -343,6 +343,14 @@ static inline bool good_float(dt_qint32) { return true; } +static inline bool good_float(dt_qint4) { + return true; +} + +static inline bool good_float(dt_quint4) { + return true; +} + // A hack for the (x+0) promote to int trick on dt_quint8. static inline int operator+(dt_quint8 lhs, int rhs) { megdnn_assert(rhs == 0, "unexpected rhs"); diff --git a/dnn/test/naive/warp_perspective.cpp b/dnn/test/naive/warp_perspective.cpp index 15900d43..35ce09a9 100644 --- a/dnn/test/naive/warp_perspective.cpp +++ b/dnn/test/naive/warp_perspective.cpp @@ -545,12 +545,12 @@ TEST_F(NAIVE, WARP_PERSPECTIVE_NCHW64) { using Param = WarpPerspective::Param; auto convert_true_format = [](const TensorLayout& layout) { - if (layout.ndim == 4) - return layout - .reshape({layout[0], layout[1] / 64, layout[2], layout[3], - 64}) - .dimshuffle({0, 1, 4, 2, 3}); - else + if (layout.ndim == 4) { + TensorLayout ret{ + {layout[0], layout[1] / 64, layout[2], layout[3], 64}, + layout.dtype}; + return ret.dimshuffle({0, 1, 4, 2, 3}); + } else return layout; }; @@ -563,15 +563,16 @@ TEST_F(NAIVE, WARP_PERSPECTIVE_NCHW64) { TensorNDArray nchw_tensors; for (size_t i = 0; i < tensors.size(); ++i) { + TensorLayout ly; auto layout = tensors[i].layout; - if (layout.dtype.enumv() == DTypeEnum::QuantizedS4) - layout.dtype = dtype::QuantizedS4(); - if (layout.ndim == 5) { - layout = layout.reshape({layout[0], layout[1] * layout[4], - layout[2], layout[3]}); + if (tensors[i].layout.ndim == 5) { + ly = TensorLayout{{layout[0], layout[1] * layout[4], layout[2], + layout[3]}, + layout.dtype}; + } else { + ly = layout; } - nchw_tensors.emplace_back(malloc(layout.span().dist_byte()), - layout); + nchw_tensors.emplace_back(malloc(ly.span().dist_byte()), ly); } TensorNDArray nchw64_tensors; for (size_t i = 0; i < tensors.size(); ++i) { @@ -617,13 +618,11 @@ TEST_F(NAIVE, WARP_PERSPECTIVE_NCHW64) { checker.set_param(param); checker.execs({{2, 1, 10, 10, 64}, {2, 3, 3}, {2, 1, 10, 12, 64}}); checker.execs( - {{20, 30, 10, 12, 64}, {20, 3, 3}, {20, 30, 11, 12, 64}}); - checker.execs( - {{220, 3, 10, 10, 64}, {220, 3, 3}, {220, 3, 10, 12, 64}}); - checker.execs({{1, 25, 25, 24, 64}, {1, 3, 3}, {1, 25, 25, 510, 64}}); - checker.execs({{1, 25, 25, 510, 64}, {1, 3, 3}, {1, 25, 25, 24, 64}}); - checker.execs({{1, 25, 25, 24, 64}, {1, 3, 3}, {1, 25, 51, 50, 64}}); - checker.execs({{1, 25, 51, 50, 64}, {1, 3, 3}, {1, 25, 25, 24, 64}}); + {{20, 3, 10, 12, 64}, {20, 3, 3}, {20, 3, 11, 12, 64}}); + checker.execs({{1, 3, 25, 24, 64}, {1, 3, 3}, {1, 3, 25, 51, 64}}); + checker.execs({{1, 3, 25, 51, 64}, {1, 3, 3}, {1, 3, 25, 24, 64}}); + checker.execs({{1, 3, 25, 24, 64}, {1, 3, 3}, {1, 3, 51, 50, 64}}); + checker.execs({{1, 3, 51, 50, 64}, {1, 3, 3}, {1, 3, 25, 24, 64}}); } } // vim: syntax=cpp.doxygen diff --git a/imperative/python/src/helper.cpp b/imperative/python/src/helper.cpp index b9ad570f..d4f32536 100644 --- a/imperative/python/src/helper.cpp +++ b/imperative/python/src/helper.cpp @@ -18,6 +18,7 @@ #include "megbrain/graph/cg.h" #include "megbrain/tensor.h" #include "megbrain/utils/mempool.h" + #include "./numpy_dtypes.h" namespace py = pybind11; @@ -390,16 +391,24 @@ HostTensorND lowbit_ndarray_to_host_tensor( } else { mgb_assert(layout.ndim && layout.ndim <= TensorShape::MAX_NDIM, "unsupported ndim %zu", layout.ndim); - for (size_t i = 0; i < layout.ndim; ++ i) { - layout.shape[i] = 
PyArray_SHAPE(input)[i]; - layout.stride[i] = PyArray_STRIDE(input, i); + TensorLayout ly; + ly.ndim = layout.ndim; + for (size_t i = 0; i < layout.ndim; ++i) { + ly.shape[i] = layout.shape[i] = PyArray_SHAPE(input)[i]; + ly.stride[i] = PyArray_STRIDE(input, i); mgb_assert(layout.shape[i], "zero shape not supported"); } - mgb_assert(layout.is_contiguous()); + mgb_assert(ly.is_physical_contiguous()); + layout.init_contiguous_stride(); } HostTensorND ret{comp_node, layout}; - lowbit_memcpy_byte2compact(layout.dtype, ret.raw_ptr(), src_ptr, - layout.total_nr_elems()); + if (layout.format.is_lowbit_aligned()) { + mgb_assert(layout.is_contiguous()); + lowbit_memcpy_byte2aligned(ret.raw_ptr(), src_ptr, layout); + } else { + lowbit_memcpy_byte2compact(layout.dtype, ret.raw_ptr(), src_ptr, + layout.total_nr_elems()); + } return ret; } @@ -423,10 +432,8 @@ std::pair np2tensor_try_borrow( } // make result from PyArrayObject; its reference may be stolen - auto make_from_arr = [&](PyArrayObject *input, bool allow_borrow) { - - TensorLayout layout; - layout.dtype = dtype_np2mgb_descr(PyArray_DESCR(input)); + auto make_from_arr = [&](PyArrayObject* input, bool allow_borrow) { + TensorLayout layout{{}, dtype_np2mgb_descr(PyArray_DESCR(input))}; if (dtype.valid()) mgb_assert(dtype == layout.dtype); layout.ndim = PyArray_NDIM(input); @@ -605,8 +612,15 @@ PyObject* ndarray_from_tensor( if (val.dtype().is_low_bit()) { mgb_assert(share_type != ShareType::MUST_SHARE, "can not share memory for lowbit dtype"); - lowbit_memcpy_compact2byte(val.dtype(), alloc_new_ret(), val.raw_ptr(), - val.layout().total_nr_elems()); + const auto& layout = val.layout(); + if (layout.format.is_lowbit_aligned()) { + lowbit_memcpy_aligned2byte(alloc_new_ret(), val.raw_ptr(), + val.layout()); + } else { + lowbit_memcpy_compact2byte(val.dtype(), alloc_new_ret(), + val.raw_ptr(), + val.layout().total_nr_elems()); + } } else if (share_type == ShareType::MUST_UNSHARE) { memcpy(alloc_new_ret(), val.raw_ptr(), val.layout().span().dist_byte()); } else { diff --git a/imperative/src/impl/physical_tensor.cpp b/imperative/src/impl/physical_tensor.cpp index 4261c4f0..c6831de2 100644 --- a/imperative/src/impl/physical_tensor.cpp +++ b/imperative/src/impl/physical_tensor.cpp @@ -290,7 +290,7 @@ Tensor::Tensor(const DeviceTensorND &dv, const HostTensorND& hv) { } Tensor::Tensor(const TensorLayout& layout, const CompNode& cn) - : m_layout{layout}, m_blob{Blob::make(cn, layout.dtype.size(layout.total_nr_elems()))}, + : m_layout{layout}, m_blob{Blob::make(cn, layout.span().dist_byte())}, m_offset{0} {} Tensor::Tensor(const BlobPtr blob, const size_t offset, const TensorLayout& layout) diff --git a/src/core/impl/dtype.cpp b/src/core/impl/dtype.cpp index 91c8f23b..5d03ac0d 100644 --- a/src/core/impl/dtype.cpp +++ b/src/core/impl/dtype.cpp @@ -359,19 +359,6 @@ struct LowbitMemcpy { } }; -template -struct QuantizedLowbitTrait; - -template<> -struct QuantizedLowbitTrait { - static constexpr int8_t SHIFT = 0; -}; - -template<> -struct QuantizedLowbitTrait { - static constexpr int8_t SHIFT = 8; -}; - template ::category == DTypeCategory::QUANTIZED) && (8 % DTypeTrait
::low_bit == 0)> @@ -452,4 +439,44 @@ void mgb::lowbit_memcpy_compact2byte( mgb_throw(MegBrainError, "bad dtype for lowbit: %s", dtype.name()); } +void mgb::lowbit_memcpy_byte2aligned(void* dest, const void* src, + const ::megdnn::TensorLayout& layout) { + size_t low_bit = layout.dtype.low_bit(); + size_t dim = layout.shape[layout.ndim - 1]; + if ((dim * low_bit) % 8) { // padding + size_t n = layout.total_nr_elems(); + size_t stride = divup(dim * low_bit, 8); + dt_byte* dest_ptr = reinterpret_cast(dest); + const dt_byte* src_ptr = reinterpret_cast(src); + for (size_t i = 0; i < n / dim; ++i) { + lowbit_memcpy_byte2compact(layout.dtype, dest_ptr, src_ptr, dim); + dest_ptr += stride; + src_ptr += dim; + } + } else { + lowbit_memcpy_byte2compact(layout.dtype, dest, src, + layout.total_nr_elems()); + } +} + +void mgb::lowbit_memcpy_aligned2byte(void* dest, const void* src, + const ::megdnn::TensorLayout& layout) { + size_t low_bit = layout.dtype.low_bit(); + size_t dim = layout.shape[layout.ndim - 1]; + if ((dim * low_bit) % 8) { // padding + size_t n = layout.total_nr_elems(); + size_t stride = divup(dim * low_bit, 8); + dt_byte* dest_ptr = reinterpret_cast(dest); + const dt_byte* src_ptr = reinterpret_cast(src); + for (size_t i = 0; i < n / dim; ++i) { + lowbit_memcpy_compact2byte(layout.dtype, dest_ptr, src_ptr, dim); + dest_ptr += dim; + src_ptr += stride; + } + } else { + lowbit_memcpy_compact2byte(layout.dtype, dest, src, + layout.total_nr_elems()); + } +} + // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/src/core/impl/graph/var_node_mem_mgr.cpp b/src/core/impl/graph/var_node_mem_mgr.cpp index 12b9d386..c5e65a8a 100644 --- a/src/core/impl/graph/var_node_mem_mgr.cpp +++ b/src/core/impl/graph/var_node_mem_mgr.cpp @@ -1340,15 +1340,19 @@ void VarNodeMemManager::make_dev_tensor_from_mem_plan_single( void VarNodeMemManager::var_alloc_with_shape(VarNode* var, const TensorShape& shape, size_t size_req) { - mgb_assert(var->format().is_default(), + bool cond_default = var->format().is_default(); + bool cond_lowbit = var->dtype().is_quantized_lowbit() && + var->format().is_lowbit_aligned(); + mgb_assert(cond_default || cond_lowbit, "dynamic shape is currently only supported for var with " "default format; got %s", var->format().to_string().c_str()); var->shape(shape); + TensorLayout ly{shape, var->dtype()}; if (size_req != 0) { - mgb_assert(var->dtype().size(shape.total_nr_elems()) <= size_req); + mgb_assert(ly.span().dist_byte() <= size_req); } else { - size_req = var->dtype().size(shape.total_nr_elems()); + size_req = ly.span().dist_byte(); } auto&& mplan = var->m_mem_plan; diff --git a/src/core/include/megbrain/dtype.h b/src/core/include/megbrain/dtype.h index 38191c6e..0e86b6fe 100644 --- a/src/core/include/megbrain/dtype.h +++ b/src/core/include/megbrain/dtype.h @@ -202,6 +202,17 @@ void lowbit_memcpy_byte2compact( void lowbit_memcpy_compact2byte( DType dtype, void *dest, const void *src, size_t n); +/*! + * \brief copy from byte representation to an aligend tensor for lowbit types + */ +void lowbit_memcpy_byte2aligned(void* dest, const void* src, + const ::megdnn::TensorLayout& ly); + +/*! 
+ * \brief copy from an aligend tensor to byte representation for lowbit types + */ +void lowbit_memcpy_aligned2byte(void* dest, const void* src, + const ::megdnn::TensorLayout& ly); } // namespace mgb diff --git a/src/gopt/test/inference.cpp b/src/gopt/test/inference.cpp index a875f3cc..447f0f53 100644 --- a/src/gopt/test/inference.cpp +++ b/src/gopt/test/inference.cpp @@ -4454,314 +4454,6 @@ TEST(TestGoptInference, PaddingChannelsWithWarpPerspective) { MGB_ASSERT_TENSOR_EQ(t1, t2); } -TEST(TestGoptInference, EnableNCHW64Basic) { - REQUIRE_GPU(1); - auto cn = CompNode::load("gpu0"); - cn.activate(); - REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5); - - HostTensorGenerator gen; - auto graph = ComputingGraph::make(); - graph->options().graph_opt_level = 0; - auto mkvar = [&](const char* name, const TensorShape& shp, - const DType& dtype) { - return opr::TypeCvt::make( - opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), - dtype); - }; - auto mkcvar = [&](const char* name, const TensorShape& shp, - const DType& dtype) { - return opr::TypeCvt::make( - opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)) - .rename(name), - dtype); - }; - - auto x = mkvar("x", {16, 4, 14, 14}, dtype::QuantizedS8(2.5f)), - w = mkcvar("w", {16, 4, 3, 3}, dtype::QuantizedS8(2.5f)), - b = mkcvar("b", {1, 16, 1, 1}, dtype::QuantizedS32(6.25f)); - opr::ConvBias::Param param; - param.format = opr::ConvBias::Param::Format::NCHW; - param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY; - param.stride_h = param.stride_w = 1; - param.pad_h = param.pad_w = 1; - - auto y = opr::ConvBias::make(x, w, b, param, {}, - OperatorNodeConfig{dtype::QuantizedS8(2.5f)}); - auto w1 = mkcvar("w1", {32, 16, 3, 3}, dtype::QuantizedS8(2.5f)), - b1 = mkcvar("b1", {1, 32, 1, 1}, dtype::QuantizedS32(6.25f)); - auto y1 = opr::ConvBias::make(y, w1, b1, param, {}, - OperatorNodeConfig{dtype::QuantizedS8(2.5f)}); - auto w2 = mkcvar("w2", {64, 32, 3, 3}, dtype::QuantizedS8(2.5f)), - b2 = mkcvar("b2", {1, 64, 1, 1}, dtype::QuantizedS32(6.25f)); - auto y2 = opr::ConvBias::make(y1, w2, b2, param, {}, - OperatorNodeConfig{dtype::QuantizedS8(2.5f)}); - y2 = opr::TypeCvt::make(y2, dtype::QuantizedS4{40.f}); - auto w3 = mkcvar("w3", {64, 64, 3, 3}, dtype::QuantizedS4(2.5f)), - b3 = mkcvar("b3", {1, 64, 1, 1}, dtype::QuantizedS32(100.f)); - auto y3 = opr::ConvBias::make(y2, w3, b3, param, {}, - OperatorNodeConfig{dtype::QuantizedS4{40.f}}); - y3 = opr::TypeCvt::make(y3, dtype::QuantizedS8{2.5f}); - auto w4 = mkcvar("w4", {16, 64, 3, 3}, dtype::QuantizedS8(2.5f)), - b4 = mkcvar("b4", {1, 16, 1, 1}, dtype::QuantizedS32(6.25f)); - auto y4 = opr::ConvBias::make(y3, w4, b4, param, {}, - OperatorNodeConfig{dtype::QuantizedS8(2.5f)}); - using ElemMultiMode = opr::ElemwiseMultiType::Param::Mode; - auto y5 = opr::ElemwiseMultiType::make( - {y, y4}, {ElemMultiMode::QFUSE_ADD_RELU}, - OperatorNodeConfig{dtype::QuantizedS8{1.3f}}); - y5 = opr::TypeCvt::make(y5, dtype::Float32()); - SymbolVar y5_pad; - unpack_vector( - gopt::GraphOptimizer{} - .add_pass(gopt::EnableNCHW64Pass::make_nchw64_converter()) - .apply({{y5}}) - .endpoint_vars(), - y5_pad); - EXPECT_TRUE(y5.node()->shape().eq_shape(y5_pad.node()->shape())); - SmallVector oprs; - auto cb = [&oprs](cg::OperatorNodeBase* opr) { - if (opr->same_type()) { - oprs.push_back(opr); - } - }; - cg::DepOprIter{cb}.add(y5_pad.node()->owner_opr()); - ASSERT_EQ(oprs.size(), 5); - using Format = opr::ConvBiasForward::Param::Format; -#define CHECK(_i, _fmt) \ - { \ - const auto& o = oprs[_i]->cast_final(); \ - 
ASSERT_EQ(o.param().format, Format::_fmt); \ - } - CHECK(0, NCHW4); - CHECK(1, NCHW4); - CHECK(2, NCHW32); - CHECK(3, NCHW64); - CHECK(4, NCHW4); -#undef CHECK - HostTensorND t1, t2; - auto func1 = graph->compile({make_callback_copy(y5, t1)}); - func1->execute(); - auto func2 = graph->compile({make_callback_copy(y5_pad, t2)}); - func2->execute(); - MGB_ASSERT_TENSOR_EQ(t1, t2); -} - -TEST(TestGoptInference, EnableNCHW64PaddingChannel) { - REQUIRE_GPU(1); - auto cn = CompNode::load("gpu0"); - cn.activate(); - REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5); - - HostTensorGenerator gen; - auto graph = ComputingGraph::make(); - graph->options().graph_opt_level = 0; - auto mkvar = [&](const char* name, const TensorShape& shp, - const DType& dtype) { - return opr::TypeCvt::make( - opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), - dtype); - }; - auto mkcvar = [&](const char* name, const TensorShape& shp, - const DType& dtype) { - return opr::TypeCvt::make( - opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)) - .rename(name), - dtype); - }; - - auto x = mkvar("x", {16, 4, 14, 14}, dtype::QuantizedS8(2.5f)), - w = mkcvar("w", {20, 4, 3, 3}, dtype::QuantizedS8(2.5f)), - b = mkcvar("b", {1, 20, 1, 1}, dtype::QuantizedS32(6.25f)); - opr::ConvBias::Param param; - param.format = opr::ConvBias::Param::Format::NCHW; - param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY; - param.stride_h = param.stride_w = 1; - param.pad_h = param.pad_w = 1; - - auto y = opr::ConvBias::make(x, w, b, param, {}, - OperatorNodeConfig{dtype::QuantizedS8(2.5f)}); - opr::Pooling::Param pool; - pool.format = opr::Pooling::Param::Format::NCHW; - y = opr::Pooling::make(y, pool); - - auto w1 = mkcvar("w1", {24, 20, 3, 3}, dtype::QuantizedS8(2.5f)), - b1 = mkcvar("b1", {1, 24, 1, 1}, dtype::QuantizedS32(6.25f)); - auto y1 = opr::ConvBias::make(y, w1, b1, param, {}, - OperatorNodeConfig{dtype::QuantizedS8(2.5f)}); - auto w2 = mkcvar("w2", {20, 24, 3, 3}, dtype::QuantizedS8(2.5f)), - b2 = mkcvar("b2", {1, 20, 1, 1}, dtype::QuantizedS32(6.25f)); - auto y2 = opr::ConvBias::make(y1, w2, b2, param, {}, - OperatorNodeConfig{dtype::QuantizedS8(2.5f)}); - y2 = opr::TypeCvt::make(y2, dtype::QuantizedS4{40.f}); - auto w3 = mkcvar("w3", {64, 20, 3, 3}, dtype::QuantizedS4(2.5f)), - b3 = mkcvar("b3", {1, 64, 1, 1}, dtype::QuantizedS32(100.f)); - auto y3 = opr::ConvBias::make(y2, w3, b3, param, {}, - OperatorNodeConfig{dtype::QuantizedS4{40.f}}); - auto w4 = mkcvar("w4", {20, 64, 3, 3}, dtype::QuantizedS4(2.5f)), - b4 = mkcvar("b4", {1, 20, 1, 1}, dtype::QuantizedS32(100.f)); - auto y4 = opr::ConvBias::make(y3, w4, b4, param, {}, - OperatorNodeConfig{dtype::QuantizedS4{40.f}}); - y4 = opr::TypeCvt::make(y4, dtype::QuantizedS8{2.5f}); - using ElemMultiMode = opr::ElemwiseMultiType::Param::Mode; - auto y5 = opr::ElemwiseMultiType::make( - {y, y4}, {ElemMultiMode::QFUSE_ADD_RELU}, - OperatorNodeConfig{dtype::QuantizedS8{1.2f}}); - opr::ConvolutionBackwardData::Param deconv; - deconv.format = opr::ConvolutionBackwardData::Param::Format::NCHW; - deconv.stride_h = deconv.stride_w = 2; - deconv.pad_h = deconv.pad_w = 1; - auto w6 = mkcvar("w6", {20, 64, 4, 4}, dtype::QuantizedS8{2.5f}); - auto y6 = opr::ConvolutionBackwardData::make( - w6, y5, deconv, {}, - OperatorNodeConfig{dtype::QuantizedS8(2.0f)}); - y6 = opr::TypeCvt::make(y6, dtype::QuantizedS4{32.f}); - - std::shared_ptr mat = std::make_shared( - cn, TensorShape{16, 3, 3}, dtype::Float32()); - warp_perspective_mat_gen(*mat, 16, 14, 14); - auto mat_var = 
opr::Host2DeviceCopy::make(*graph, mat).rename("mat"); - opr::WarpPerspective::Param warp_param; - warp_param.format = opr::WarpPerspective::Param::Format::NCHW; - auto y7 = opr::WarpPerspective::make(y6, mat_var, TensorShape{14, 14}, - warp_param); - y7 = opr::TypeCvt::make(y7, dtype::Float32()); - SymbolVar y7_pad; - auto opt = gopt::OptimizeForInferenceOptions{}; - opt.enable_nchw64(); - unpack_vector(gopt::optimize_for_inference({y7}, opt), y7_pad); - EXPECT_TRUE(y7.node()->shape().eq_shape(y7_pad.node()->shape())); - HostTensorND t1, t2; - auto func1 = graph->compile({make_callback_copy(y7, t1)}); - func1->execute(); - auto func2 = graph->compile({make_callback_copy(y7_pad, t2)}); - func2->execute(); - MGB_ASSERT_TENSOR_EQ(t1, t2); - using Format = opr::ConvBiasForward::Param::Format; - SmallVector oprs; - auto cb = [&oprs](cg::OperatorNodeBase* opr) { - if (opr->same_type()) { - oprs.push_back(opr); - } - }; - cg::DepOprIter{cb}.add(y7_pad.node()->owner_opr()); - ASSERT_EQ(oprs.size(), 5); -#define CHECK(_i, _fmt) \ - { \ - const auto& o = oprs[_i]->cast_final(); \ - ASSERT_EQ(o.param().format, Format::_fmt); \ - } - CHECK(0, NCHW4); - CHECK(1, NCHW32); - CHECK(2, NCHW32); - CHECK(3, NCHW64); - CHECK(4, NCHW64); -#undef CHECK - { - const auto& deconv = find_opr(y7_pad); - ASSERT_EQ(deconv.param().format, Format::NCHW4); - const auto& pool = find_opr(y7_pad); - ASSERT_EQ(pool.param().format, Format::NCHW4); - const auto& warp = find_opr(y7_pad); - ASSERT_EQ(warp.param().format, Format::NCHW64); - } - size_t nr_dimshuffle = find_opr_num(y7_pad); - ASSERT_EQ(nr_dimshuffle, 8); -} - -TEST(TestGoptInference, EnableNCHW64FuseConvBiasZ) { - REQUIRE_GPU(1); - auto cn = CompNode::load("gpu0"); - cn.activate(); - REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5); - - HostTensorND t1, t2; - HostTensorGenerator gen; - auto graph = ComputingGraph::make(); - graph->options().graph_opt_level = 0; - auto mkvar = [&](const char* name, const TensorShape& shp, - const DType& dtype) { - return opr::TypeCvt::make( - opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), - dtype); - }; - auto mkcvar = [&](const char* name, const TensorShape& shp, - const DType& dtype) { - return opr::TypeCvt::make( - opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)) - .rename(name), - dtype); - }; - - auto x = mkvar("x", {16, 4, 14, 14}, dtype::QuantizedS8(2.5f)), - w = mkcvar("w", {32, 4, 3, 3}, dtype::QuantizedS8(2.5f)), - b = mkcvar("b", {1, 32, 1, 1}, dtype::QuantizedS32(6.25f)); - opr::ConvBias::Param param; - param.format = opr::ConvBias::Param::Format::NCHW; - param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY; - param.stride_h = param.stride_w = 1; - param.pad_h = param.pad_w = 1; - - auto y = opr::ConvBias::make(x, w, b, param, {}, - OperatorNodeConfig{dtype::QuantizedS8(2.5f)}); - auto w1 = mkcvar("w1", {64, 32, 3, 3}, dtype::QuantizedS8(2.5f)), - b1 = mkcvar("b1", {1, 64, 1, 1}, dtype::QuantizedS32(6.25f)); - auto y1 = opr::ConvBias::make(y, w1, b1, param, {}, - OperatorNodeConfig{dtype::QuantizedS8(2.5f)}); - y1 = opr::TypeCvt::make(y1, dtype::QuantizedS4{40.f}); - auto w2 = mkcvar("w2", {64, 64, 3, 3}, dtype::QuantizedS4(2.5f)), - b2 = mkcvar("b2", {1, 64, 1, 1}, dtype::QuantizedS32(100.f)); - auto y2 = opr::ConvBias::make(y1, w2, b2, param, {}, - OperatorNodeConfig{dtype::QuantizedS4{40.f}}); - auto w3 = mkcvar("w3", {64, 64, 3, 3}, dtype::QuantizedS4(2.5f)), - b3 = mkcvar("b3", {1, 64, 1, 1}, dtype::QuantizedS32(100.f)); - auto y3 = opr::ConvBias::make(y2, w3, b3, param, {}, - 
OperatorNodeConfig{dtype::QuantizedS4(40.f)}); - using ElemMultiMode = opr::ElemwiseMultiType::Param::Mode; - auto y4 = opr::ElemwiseMultiType::make( - {y1, y3}, {ElemMultiMode::QFUSE_ADD_RELU}, - OperatorNodeConfig{dtype::QuantizedS4{40.f}}); - y4 = opr::TypeCvt::make(y4, dtype::Float32()); - auto y5 = opr::ConvBias::make(y2, w3, b3, y1, param, {}, - OperatorNodeConfig{dtype::QuantizedS4(40.f)}); - y5 = opr::TypeCvt::make(y5, dtype::Float32()); - SymbolVar y4_pad; - auto opt = gopt::OptimizeForInferenceOptions{}; - opt.enable_nchw64(); - unpack_vector(gopt::optimize_for_inference({y4}, opt), y4_pad); - EXPECT_TRUE(y4.node()->shape().eq_shape(y4_pad.node()->shape())); - size_t nr_elem_mult_type = find_opr_num(y4_pad); - ASSERT_EQ(nr_elem_mult_type, 0); - auto func = graph->compile({make_callback_copy(y4_pad, t1)}); - func->execute(); - - { - opr::ConvBias::Param param; - param.format = opr::ConvBias::Param::Format::NCHW; - param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY; - param.stride_h = param.stride_w = 1; - param.pad_h = param.pad_w = 1; - - auto y = opr::ConvBias::make( - x, w, b, param, {}, - OperatorNodeConfig{dtype::QuantizedS8(2.5f)}); - auto y1 = opr::ConvBias::make( - y, w1, b1, param, {}, - OperatorNodeConfig{dtype::QuantizedS8(2.5f)}); - y1 = opr::TypeCvt::make(y1, dtype::QuantizedS4{40.f}); - auto y2 = opr::ConvBias::make( - y1, w2, b2, param, {}, - OperatorNodeConfig{dtype::QuantizedS4{40.f}}); - param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU; - auto y3 = opr::ConvBias::make( - y2, w3, b3, y1, param, {}, - OperatorNodeConfig{dtype::QuantizedS4(40.f)}); - y3 = opr::TypeCvt::make(y3, dtype::Float32()); - auto func = graph->compile({make_callback_copy(y3, t2)}); - func->execute(); - } - MGB_ASSERT_TENSOR_EQ(t1, t2); -} #endif diff --git a/src/opr/test/dnn/convolution.cpp b/src/opr/test/dnn/convolution.cpp index b30aab09..734528e3 100644 --- a/src/opr/test/dnn/convolution.cpp +++ b/src/opr/test/dnn/convolution.cpp @@ -2604,174 +2604,6 @@ TEST_F(TestNoWeightPreprocess, NoPreprocess) { #endif namespace { -TEST(TestOprDNN, ConvBiasInt4NCHW) { - REQUIRE_GPU(1); - auto cn = CompNode::load("gpu0"); - cn.activate(); - auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; - auto sm_ver = prop.major * 10 + prop.minor; - if (sm_ver != 75) { - printf("This testcast ignored due to insufficient cuda cap(got: %d, " - "expected: %d)\n", - sm_ver, 75); - return; - } - - auto run = [&cn](size_t N, size_t C, size_t H, size_t W, size_t F, size_t S, - size_t P) { - auto graph = ComputingGraph::make(); - - HostTensorGenerator gen; - auto mkvar = [&gen](const char* name, const TensorShape& shp, - const DType& dtype, - std::shared_ptr graph, - const CompNode& cn) { - return opr::TypeCvt::make( - opr::Host2DeviceCopy::make(*graph, gen(shp, cn)) - .rename(name), - dtype); - }; - auto mkcvar = [&gen](const char* name, const TensorShape& shp, - const DType& dtype, - std::shared_ptr graph, - const CompNode& cn) { - return opr::TypeCvt::make( - opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)) - .rename(name), - dtype); - }; - - using Policy = opr::ConvBias::ExecutionPolicy; - using Strategy = Policy::Strategy; - auto x = mkvar("x", {N, C * 4, H, W}, dtype::QuantizedS4(1.19960327f), - graph, cn), - w = mkcvar("w1", {C, C * 4, F, F}, dtype::QuantizedS4(1.19970327f), - graph, cn), - b = mkcvar("b1", {1, C, 1, 1}, - dtype::QuantizedS32(1.19960327f * 1.19970327f), graph, - cn); - opr::ConvBias::Param param; - param.format = opr::ConvBias::Param::Format::NCHW; - 
param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU; - param.stride_h = param.stride_w = S; - param.pad_h = param.pad_w = P; - Policy policy; - policy.strategy = Strategy::PROFILE; - - auto y = opr::ConvBias::make( - x, w, b, param, policy, - OperatorNodeConfig{dtype::QuantizedS4(11.9960501f)}); - y = opr::TypeCvt::make(y, dtype::Float32()); - auto x_f32 = opr::TypeCvt::make(x, dtype::Float32()), - w_f32 = opr::TypeCvt::make(w, dtype::Float32()), - b_f32 = opr::TypeCvt::make(b, dtype::Float32()); - auto y_f32 = opr::ConvBias::make(x_f32, w_f32, b_f32, param, policy); - auto y_q4 = opr::TypeCvt::make(y_f32, dtype::QuantizedS4{11.9960501f}); - y_q4 = opr::TypeCvt::make(y_q4, dtype::Float32()); - HostTensorND host_y, host_y_q4; - auto func = graph->compile({make_callback_copy(y, host_y), - make_callback_copy(y_q4, host_y_q4)}); - func->execute(); - MGB_ASSERT_TENSOR_NEAR(host_y, host_y_q4, 1e-3); - }; - run(2, 64, 14, 14, 3, 2, 1); - run(2, 64, 7, 7, 3, 1, 1); - run(2, 64, 14, 14, 1, 2, 0); - run(2, 64, 7, 7, 1, 1, 0); -} - -TEST(TestOprDNN, ConvBiasInt4NCHW64) { - REQUIRE_GPU(1); - auto cn = CompNode::load("gpu0"); - cn.activate(); - auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; - auto sm_ver = prop.major * 10 + prop.minor; - if (sm_ver != 75) { - printf("This testcast ignored due to insufficient cuda cap(got: %d, " - "expected: %d)\n", - sm_ver, 75); - return; - } - - auto nchw2nchw64 = [](SymbolVar x) { - auto y = opr::RelayoutFormat::make( - x, opr::RelayoutFormat::Param::Mode::NCHW_NCHW64); - return y; - }; - - auto nchw642nchw = [](SymbolVar x) { - auto y = opr::RelayoutFormat::make( - x, opr::RelayoutFormat::Param::Mode::NCHW64_NCHW); - return y; - }; - - auto run = [&](size_t N, size_t C, size_t H, size_t W, size_t F, size_t S, - size_t P) { - auto graph = ComputingGraph::make(); - - HostTensorGenerator gen; - auto mkvar = [&gen](const char* name, const TensorShape& shp, - const DType& dtype, - std::shared_ptr graph, - const CompNode& cn) { - return opr::TypeCvt::make( - opr::Host2DeviceCopy::make(*graph, gen(shp, cn)) - .rename(name), - dtype); - }; - auto mkcvar = [&gen](const char* name, const TensorShape& shp, - const DType& dtype, - std::shared_ptr graph, - const CompNode& cn) { - return opr::TypeCvt::make( - opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)) - .rename(name), - dtype); - }; - - using Policy = opr::ConvBias::ExecutionPolicy; - using Strategy = Policy::Strategy; - auto x = mkvar("x", {N, C / 16, H, W, 64}, - dtype::QuantizedS4(1.19960327f), graph, cn), - w = mkcvar("w1", {C, C / 16, F, F, 64}, - dtype::QuantizedS4(1.19970327f), graph, cn), - b = mkcvar("b1", {1, C / 64, 1, 1, 64}, - dtype::QuantizedS32(1.19960327f * 1.19970327f), graph, - cn); - opr::ConvBias::Param param; - param.format = opr::ConvBias::Param::Format::NCHW64; - param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU; - param.stride_h = param.stride_w = S; - param.pad_h = param.pad_w = P; - Policy policy; - policy.strategy = Strategy::PROFILE; - - auto y = opr::ConvBias::make( - x, w, b, param, policy, - OperatorNodeConfig{dtype::QuantizedS4(11.9960501f)}); - y = opr::TypeCvt::make(y, dtype::Float32()); - x = nchw642nchw(x); - w = nchw642nchw(w); - b = nchw642nchw(b); - auto x_f32 = opr::TypeCvt::make(x, dtype::Float32()), - w_f32 = opr::TypeCvt::make(w, dtype::Float32()), - b_f32 = opr::TypeCvt::make(b, dtype::Float32()); - param.format = opr::ConvBias::Param::Format::NCHW; - auto y_f32 = opr::ConvBias::make(x_f32, w_f32, b_f32, param, policy); - auto y_q4 = 
opr::TypeCvt::make(y_f32, dtype::QuantizedS4{11.9960501f}); - y_q4 = opr::TypeCvt::make(y_q4, dtype::Float32()); - y_q4 = nchw2nchw64(y_q4); - HostTensorND host_y, host_y_q4; - auto func = graph->compile({make_callback_copy(y, host_y), - make_callback_copy(y_q4, host_y_q4)}); - func->execute(); - MGB_ASSERT_TENSOR_NEAR(host_y, host_y_q4, 1e-3); - }; - run(2, 64, 14, 14, 3, 2, 1); - run(2, 64, 7, 7, 3, 1, 1); - run(2, 64, 14, 14, 1, 2, 0); - run(2, 64, 7, 7, 1, 1, 0); -} TEST(TestOprDNN, ConvBiasInt4Serialize) { using namespace serialization; @@ -2783,7 +2615,7 @@ TEST(TestOprDNN, ConvBiasInt4Serialize) { HostTensorGenerator gen; std::shared_ptr xv; - auto mkvar = [&gen](const char* name, const DType& dtype, + auto mkvar = [](const char* name, const DType& dtype, std::shared_ptr graph, std::shared_ptr val) { return opr::TypeCvt::make( @@ -2856,9 +2688,9 @@ TEST(TestOprDNN, ConvBiasInt4SerializeWithParamFuse) { HostTensorGenerator gen; std::shared_ptr xv; - auto mkvar = [&gen](const char* name, const DType& dtype, - std::shared_ptr graph, - std::shared_ptr val) { + auto mkvar = [](const char* name, const DType& dtype, + std::shared_ptr graph, + std::shared_ptr val) { return opr::TypeCvt::make( opr::Host2DeviceCopy::make(*graph, val).rename(name), dtype); }; diff --git a/src/serialization/impl/serializer_oss.cpp b/src/serialization/impl/serializer_oss.cpp index 22223f17..393e38bf 100644 --- a/src/serialization/impl/serializer_oss.cpp +++ b/src/serialization/impl/serializer_oss.cpp @@ -62,7 +62,12 @@ bool contains_any_in_set(const SmallVector& list, void check_tensor_value_valid(const std::string& name, const HostTensorND& tensor) { - mgb_assert(tensor.layout().is_physical_contiguous(), + bool cond_normal = tensor.layout().format.is_default() && + tensor.layout().is_physical_contiguous(); + bool cond_lowbit = tensor.layout().dtype.is_quantized_lowbit() && + tensor.layout().format.is_lowbit_aligned() && + tensor.layout().is_contiguous(); + mgb_assert(cond_normal || cond_lowbit, "non-contiguous tensor: name=%s layout=%s", name.c_str(), tensor.layout().to_string().c_str()); if (tensor.dtype() == dtype::Float32()) { @@ -585,11 +590,12 @@ TensorLayout load_tensor_layout(const fbs::Tensor* tensor) { layout.ndim = tensor->shape()->size(); std::copy(tensor->shape()->begin(), tensor->shape()->end(), layout.shape); - layout.init_contiguous_stride(); } if (tensor->dtype()) { - layout.dtype = fbs::intl::load_dtype(tensor->dtype()); + // modify data type inplace for TensorLayout + layout.modify_dtype_inplace(fbs::intl::load_dtype(tensor->dtype())); } + layout.init_contiguous_stride(); return layout; }
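Not part of the patch: a minimal sketch of how the byte-aligned int4 layout machinery added above is expected to be used. The function name, the {2, 3} shape, the scale values, and the quoted byte counts are illustrative assumptions, not taken from the diff; only TensorLayout, dtype::QuantizedS4/QuantizedS8, modify_dtype_inplace(), span().dist_byte(), is_lowbit_aligned() and mgb::lowbit_memcpy_byte2aligned() come from the changes shown here.

#include <cstdint>
#include <vector>
#include "megbrain/dtype.h"       // mgb::lowbit_memcpy_byte2aligned (declared by this patch)
#include "megdnn/basic_types.h"   // megdnn::TensorLayout

// Sketch only: exercises the lowbit-aligned layout path introduced in this patch.
void int4_layout_sketch() {
    using namespace megdnn;
    // With the patched TensorFormat::Format(DType), a quantized lowbit dtype
    // selects LowbitsAlignedToBytesTensorFormat, so the innermost dim is padded
    // up to a whole byte.
    TensorLayout layout{{2, 3}, dtype::QuantizedS4{1.f}};
    // Expected under that assumption:
    //   layout.format.is_lowbit_aligned() == true
    //   layout.span().dist_byte() == 4  (each 3-element row padded to 2 bytes),
    //   rather than the 3 bytes a fully compact packing would need.

    // Copy a one-value-per-byte buffer into the aligned int4 storage using the
    // helper this patch declares in megbrain/dtype.h.
    std::vector<int8_t> expanded(layout.total_nr_elems(), 0);  // 6 int4 values, one per byte
    std::vector<uint8_t> packed(layout.span().dist_byte());    // aligned destination
    mgb::lowbit_memcpy_byte2aligned(packed.data(), expanded.data(), layout);

    // modify_dtype_inplace() keeps the format consistent with the dtype, so
    // switching to an 8-bit dtype falls back to the default tensor format.
    layout.modify_dtype_inplace(dtype::QuantizedS8{1.f});
    //   layout.format.is_default() == true
}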