@@ -281,6 +281,13 @@ struct TensorLayout : public TensorShape {
add_axis_inplace(axis, 1, stride[axis] * shape[axis]);
}
/*!
* \brief modify data type of the layout inplace
*
* Note that this API also resets the format according to the new data type.
*/
void modify_dtype_inplace(DType dtype);
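A minimal usage sketch of the new API (illustrative only, not part of the patch; the header paths, shapes and scales are assumptions). The point is that the format is re-derived from the new dtype, so strides should be recomputed afterwards:

    #include "megdnn/basic_types.h"  // TensorLayout (assumed header path)
    #include "megdnn/dtype.h"        // dtype::QuantizedS8 / dtype::QuantizedS4

    using namespace megdnn;

    void modify_dtype_example() {
        // start from a plain int8 layout using the default format
        TensorLayout layout({16, 32, 7, 7}, dtype::QuantizedS8(1.f));
        // switching to a 4-bit dtype also re-derives the format, so it becomes
        // a lowbit byte-aligned format instead of the default one
        layout.modify_dtype_inplace(dtype::QuantizedS4(1.f));
        // strides must be recomputed under the new format
        layout.init_contiguous_stride();
    }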
/* =================== generate new layout =================== */
/**
@@ -513,6 +513,15 @@ class DType {
bool is_low_bit() const { return low_bit() != 0; }
bool is_quantized_lowbit() const {
return low_bit() != 0 &&
#if MEGDNN_CC_HOST
category() == DTypeCategory::QUANTIZED;
#else
category().ev == DTypeCategory::Ev::QUANTIZED;
#endif
}
/*!
* \brief size of this data type, in bytes
*/
@@ -226,7 +226,7 @@ public:
std::string to_string() const override;
//! raise exception if given layout is illegal
void assert_valid(const TensorLayout& layout) const;
void assert_valid(const TensorLayout& layout) const override;
void serialize_append(std::string& result) const override;
@@ -282,6 +282,11 @@ void TensorLayout::add_axis_inplace(size_t axis, size_t shape,
this->stride[axis] = stride;
}
void TensorLayout::modify_dtype_inplace(DType dtype_) {
dtype = dtype_;
format = Format(dtype);
}
bool TensorLayout::is_contiguous() const {
return format.impl()->is_contiguous_spec(*this);
}
@@ -952,7 +952,12 @@ ConvolutionBase<Parameter>::deduce_layout_fwd(const TensorLayout& src,
megdnn_assert(src[4] == 4);
dst[4] = 4;
}
dst.format = src.format;
if (!src.format.is_default() &&
!src.format.is_lowbit_aligned()) { // propagate
dst.format = src.format;
} else { // determined by dtype
dst.format = TensorFormat(dst.dtype);
}
dst.init_contiguous_stride();
return cflt;
}
@@ -46,14 +46,15 @@ TensorFormat TensorFormat::deserialize(const std::string& bin,
TensorFormat::Format() : m_impl{DefaultTensorFormat::make().m_impl} {}
TensorFormat::Format(DType dtype) {
megdnn_assert(dtype.valid());
if (dtype.is_low_bit()) {
if (dtype.valid() &&
dtype.is_quantized_lowbit()) { // quantized lowbit, by default
// aligned to bytes
size_t size_nbits = dtype.low_bit();
megdnn_assert(size_nbits == 1 || size_nbits == 2 || size_nbits == 4,
"unsupported lowbits data type(%s, size in bits: %zu)",
dtype.name(), size_nbits);
m_impl = LowbitsAlignedToBytesTensorFormat::make(size_nbits).m_impl;
} else {
} else { // not a quantized lowbit dtype: use the default format
m_impl = DefaultTensorFormat::make().m_impl;
}
}
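For reference, a small sketch of what the revised constructor is expected to produce for different dtypes (values illustrative, written against the API shown in this diff):

    using namespace megdnn;

    TensorFormat f32_fmt(dtype::Float32());        // not low-bit -> default format
    TensorFormat q4_fmt(dtype::QuantizedS4(1.f));  // quantized 4-bit -> aligned to bytes
    // f32_fmt.is_default() == true
    // q4_fmt.is_lowbit_aligned() == true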
@@ -89,8 +90,8 @@ bool TensorFormat::is_lowbit_aligned() const {
/* ===================== DefaultFormat ===================== */
void DefaultTensorFormat::assert_valid(const TensorLayout& layout) const {
megdnn_assert(
!layout.dtype.valid() || !layout.dtype.is_low_bit(),
"DefaultTensorFormat does not support low-bits tensor(dtype:%s)",
!layout.dtype.valid() || !layout.dtype.is_quantized_lowbit(),
"DefaultTensorFormat does not support quantized lowbit tensor(dtype:%s)",
layout.dtype.name());
}
@@ -271,7 +272,8 @@ void Image2DPackedTensorFormatBase<PIXEL_SIZE>::assert_valid(
auto m_align_axis = align_axis();
megdnn_assert(!(layout.shape[layout.ndim - 1] % PIXEL_SIZE),
"bad shape: %zu", layout.shape[layout.ndim - 1]);
megdnn_assert(layout.dtype.valid() && layout.ndim > m_align_axis);
megdnn_assert(layout.dtype.valid() && !layout.dtype.is_quantized_lowbit() &&
layout.ndim > m_align_axis);
ptrdiff_t first_non_zero_stride = 0;
for (int i = layout.ndim - 1; i >= 0; --i) {
megdnn_assert(layout.shape[i] && layout.stride[i] >= 0);
@@ -478,6 +480,7 @@ void LowbitsAlignedTensorFormatBase::assert_valid(
megdnn_assert(layout.dtype.valid() && layout.dtype.is_low_bit() &&
layout.dtype.low_bit() == m_size_nbits);
bool has_dim_unity_stride = false;
bool has_dim_aligned_stride = false;
for (int i = layout.ndim - 1; i >= 0; --i) {
if (!has_dim_unity_stride && layout.stride[i] == 1)
has_dim_unity_stride = true;
@@ -485,15 +488,16 @@ void LowbitsAlignedTensorFormatBase::assert_valid(
layout.stride[i] >= 0 &&
(layout.stride[i] % m_align_size_in_elements == 0 ||
layout.stride[i] == 1),
"bad stride:%s, %zu", layout.to_string().c_str(),
layout.stride[i]);
"bad stride:%s, %ld", layout.to_string().c_str(),
static_cast<long>(layout.stride[i]));
if (!has_dim_aligned_stride &&
static_cast<size_t>(layout.stride[i]) == m_align_size_in_elements)
has_dim_aligned_stride = true;
}
if (!has_dim_unity_stride &&
(int)layout.stride[layout.ndim - 1] ==
round_up(1, (int)m_align_size_in_elements))
has_dim_unity_stride = true;
megdnn_assert(layout.ndim == 0 || has_dim_unity_stride,
"innermost dim not contiguous");
megdnn_assert(
layout.ndim == 0 || has_dim_unity_stride || has_dim_aligned_stride,
"innermost dim not contiguous");
}
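A sketch of the kind of layout the new has_dim_aligned_stride branch accepts (it mirrors the low-bit broadcast test further below; QuantizedS4 packs two 4-bit elements per byte, so the alignment is 2 elements):

    using namespace megdnn;

    TensorLayout layout{{1, 32, 1, 1}, dtype::QuantizedS4{1.2f}};
    layout = layout.broadcast({16, 32, 7, 7});
    // collapse_contiguous() yields shape {16, 32, 49} with stride {0, 2, 0}:
    // no dim has stride 1, but the channel stride equals the 2-element
    // alignment, so assert_valid() no longer reports
    // "innermost dim not contiguous"
    auto collapsed = layout.collapse_contiguous();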
void LowbitsAlignedTensorFormatBase::serialize_append(
@@ -542,6 +546,7 @@ size_t LowbitsAlignedTensorFormatBase::init_contiguous_stride(
multiplier = round_up(multiplier, m_align_size_in_elements);
accum = mul(accum, multiplier);
}
assert_valid(layout);
return accum;
}
@@ -12,6 +12,7 @@
#include "./algo.h"
#include "src/cuda/utils.h"
#include "src/common/conv_bias.h"
using namespace megdnn;
using namespace cuda;
@@ -27,8 +28,7 @@ bool ConvBiasForwardImpl::AlgoFallbackNCHWQS4::is_available(
bool available = true;
auto&& param = args.opr->param();
auto&& fm = args.filter_meta;
if (!conv_bias::check_bias_share_in_channel(*(args.bias_layout),
param.format))
if (!check_bias_share_in_channel(*(args.bias_layout), param.format))
return false;
if (param.format != Format::NCHW)
return false;
@@ -128,7 +128,7 @@ void ConvBiasForwardImpl::AlgoFallbackNCHWQS4::exec(
conv_op->param() = args.opr->param();
using Format = param::ConvBias::Format;
conv_op->param().format = Format::NCHW64;
ExecArgs args_{dynamic_cast<ConvBiasForwardImpl*>(conv_op.get()),
ExecArgs args_{reinterpret_cast<ConvBiasForwardImpl*>(conv_op.get()),
src_,
filter_,
bias_,
@@ -190,7 +190,7 @@ WorkspaceBundle ConvBiasForwardImpl::AlgoFallbackNCHWQS4::get_workspace_bundle(
conv_op->param() = args.opr->param();
using Format = param::ConvBias::Format;
conv_op->param().format = Format::NCHW64;
SizeArgs args_{dynamic_cast<ConvBiasForwardImpl*>(conv_op.get()),
SizeArgs args_{reinterpret_cast<ConvBiasForwardImpl*>(conv_op.get()),
layouts[0],
layouts[1],
layouts[2],
@@ -64,7 +64,6 @@ public:
class AlgoInt8CHWN4IMMAImplicitGemmReorderFilter;
class AlgoInt8CHWN4IMMAImplicitGemmUnrollWidth;
class AlgoInt8NCHW32IMMAImplicitGemm;
class AlgoFallbackNCHWQS4;
class AlgoBFloat16;
class AlgoPack;
@@ -151,9 +151,9 @@ void exec_matrix_mul_quint4x4x32_helper(
MEGDNN_MARK_USED_VAR(format);
MEGDNN_MARK_USED_VAR(compute_mode);
auto convert_layout = [](const TensorLayout& layout) {
auto ret = layout;
auto param = layout.dtype.param<dtype::Quantized4Asymm>();
ret.dtype = dtype::Quantized8Asymm(param.scale, param.zero_point);
TensorLayout ret(layout,
dtype::Quantized8Asymm(param.scale, param.zero_point));
return ret;
};
TensorLayout A_layout, B_layout;
@@ -205,9 +205,8 @@ void exec_matrix_mul_qint4x4x16_helper(
MEGDNN_MARK_USED_VAR(format);
MEGDNN_MARK_USED_VAR(compute_mode);
auto convert_layout = [](const TensorLayout& layout) {
auto ret = layout;
auto param = layout.dtype.param<dtype::QuantizedS4>();
ret.dtype = dtype::QuantizedS8(param.scale);
TensorLayout ret(layout, dtype::QuantizedS8(param.scale));
return ret;
};
TensorLayout A_layout, B_layout;
@@ -406,8 +406,7 @@ size_t PoolingForwardImpl::get_workspace_in_bytes(const TensorLayout& src,
}
namespace {
void post_process(const TensorND& dst, TensorND& comp_dst, Handle* handle,
WorkspaceBundle& workspace_bundle) {
void post_process(const TensorND& dst, TensorND& comp_dst) {
if (dst.layout.dtype.enumv() == DTypeEnum::QuantizedS4) {
int8_to_int4(comp_dst, dst);
} else if (dst.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) {
@@ -427,8 +426,8 @@ void PoolingForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
if (src.layout.dtype.enumv() == DTypeEnum::QuantizedS4) {
float scale = src.layout.dtype.param<dtype::QuantizedS4>().scale;
comp_src.layout.dtype = dtype::QuantizedS8(scale);
comp_src.layout.init_contiguous_stride();
comp_src.layout.format = TensorLayout::Format(comp_src.layout.dtype);
comp_src.layout.init_contiguous_stride();
comp_src.raw_ptr = wsb.get(0);
comp_dst.layout.dtype = dtype::QuantizedS8(scale);
comp_dst.layout.format = TensorLayout::Format(comp_dst.layout.dtype);
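The reordering above matters because init_contiguous_stride() consults the layout's format: when the 4-bit source is retargeted to an 8-bit compute dtype, the format has to be replaced before the strides are recomputed. A sketch of the intended sequence (names follow the surrounding code, the scale value is illustrative):

    float scale = src.layout.dtype.param<dtype::QuantizedS4>().scale;
    comp_src.layout.dtype = dtype::QuantizedS8(scale);
    // switch to the format implied by the new dtype *before* computing strides,
    // otherwise they would still honour the low-bit byte alignment
    comp_src.layout.format = TensorLayout::Format(comp_src.layout.dtype);
    comp_src.layout.init_contiguous_stride();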
@@ -571,7 +570,7 @@ void PoolingForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
default: \
megdnn_assert(0, "not support mode"); \
} \
post_process(dst, comp_dst, handle(), wsb); \
post_process(dst, comp_dst); \
return; \
}
@@ -132,7 +132,6 @@ public:
: dtype::Float32());
if (m_fmt.find(i) == m_fmt.end()) {
layouts[i] = TensorLayout(shapes[i], dt);
layouts[i].init_contiguous_stride();
} else
layouts[i] = TensorLayout(shapes[i], dt, m_fmt[i]);
}
@@ -325,13 +325,8 @@ TEST(BASIC_TYPES, TENSOR_LAYOUT_FMT_LOW_BITS) {
layout = TensorLayout{{1, 32, 1, 1}, dtype::QuantizedS4{1.2f}};
layout = layout.broadcast({16, 32, 7, 7});
EXPECT_EQ(make_layout({16, 32, 49}, {0, 1, 0}, dtype::QuantizedS4{1.2}),
EXPECT_EQ(make_layout({16, 32, 49}, {0, 2, 0}, dtype::QuantizedS4{1.2}),
layout.collapse_contiguous());
layout = TensorLayout{{1, 32, 1, 1}, dtype::QuantizedS4{1.2f}};
layout.init_contiguous_stride();
layout = layout.broadcast({16, 32, 7, 7});
ASSERT_THROW(layout.span(), MegDNNError);
}
TEST(BASIC_TYPES, TENSOR_LAYOUT_FMT_LOW_BITS_VALID) {
@@ -342,7 +337,7 @@ TEST(BASIC_TYPES, TENSOR_LAYOUT_FMT_LOW_BITS_VALID) {
LowbitsAlignedToBytesTensorFormat::make(4_z)),
MegDNNError);
ASSERT_THROW(TensorLayout({16, 32, 7, 7}, dtype::IntB2{},
LowbitsAlignedToBytesTensorFormat::make(2_z)),
LowbitsAlignedToBytesTensorFormat::make(4_z)),
MegDNNError);
}
@@ -343,6 +343,14 @@ static inline bool good_float(dt_qint32) {
return true;
}
static inline bool good_float(dt_qint4) {
return true;
}
static inline bool good_float(dt_quint4) {
return true;
}
// A hack for the (x+0) promote to int trick on dt_quint8.
static inline int operator+(dt_quint8 lhs, int rhs) {
megdnn_assert(rhs == 0, "unexpected rhs");
@@ -545,12 +545,12 @@ TEST_F(NAIVE, WARP_PERSPECTIVE_NCHW64) {
using Param = WarpPerspective::Param;
auto convert_true_format = [](const TensorLayout& layout) {
if (layout.ndim == 4)
return layout
.reshape({layout[0], layout[1] / 64, layout[2], layout[3],
64})
.dimshuffle({0, 1, 4, 2, 3});
else
if (layout.ndim == 4) {
TensorLayout ret{
{layout[0], layout[1] / 64, layout[2], layout[3], 64},
layout.dtype};
return ret.dimshuffle({0, 1, 4, 2, 3});
} else
return layout;
};
@@ -563,15 +563,16 @@ TEST_F(NAIVE, WARP_PERSPECTIVE_NCHW64) {
TensorNDArray nchw_tensors;
for (size_t i = 0; i < tensors.size(); ++i) {
TensorLayout ly;
auto layout = tensors[i].layout;
if (layout.dtype.enumv() == DTypeEnum::QuantizedS4)
layout.dtype = dtype::QuantizedS4();
if (layout.ndim == 5) {
layout = layout.reshape({layout[0], layout[1] * layout[4],
layout[2], layout[3]});
if (tensors[i].layout.ndim == 5) {
ly = TensorLayout{{layout[0], layout[1] * layout[4], layout[2],
layout[3]},
layout.dtype};
} else {
ly = layout;
}
nchw_tensors.emplace_back(malloc(layout.span().dist_byte()),
layout);
nchw_tensors.emplace_back(malloc(ly.span().dist_byte()), ly);
}
TensorNDArray nchw64_tensors;
for (size_t i = 0; i < tensors.size(); ++i) {
@@ -617,13 +618,11 @@ TEST_F(NAIVE, WARP_PERSPECTIVE_NCHW64) {
checker.set_param(param);
checker.execs({{2, 1, 10, 10, 64}, {2, 3, 3}, {2, 1, 10, 12, 64}});
checker.execs(
{{20, 30, 10, 12, 64}, {20, 3, 3}, {20, 30, 11, 12, 64}});
checker.execs(
{{220, 3, 10, 10, 64}, {220, 3, 3}, {220, 3, 10, 12, 64}});
checker.execs({{1, 25, 25, 24, 64}, {1, 3, 3}, {1, 25, 25, 510, 64}});
checker.execs({{1, 25, 25, 510, 64}, {1, 3, 3}, {1, 25, 25, 24, 64}});
checker.execs({{1, 25, 25, 24, 64}, {1, 3, 3}, {1, 25, 51, 50, 64}});
checker.execs({{1, 25, 51, 50, 64}, {1, 3, 3}, {1, 25, 25, 24, 64}});
{{20, 3, 10, 12, 64}, {20, 3, 3}, {20, 3, 11, 12, 64}});
checker.execs({{1, 3, 25, 24, 64}, {1, 3, 3}, {1, 3, 25, 51, 64}});
checker.execs({{1, 3, 25, 51, 64}, {1, 3, 3}, {1, 3, 25, 24, 64}});
checker.execs({{1, 3, 25, 24, 64}, {1, 3, 3}, {1, 3, 51, 50, 64}});
checker.execs({{1, 3, 51, 50, 64}, {1, 3, 3}, {1, 3, 25, 24, 64}});
}
}
// vim: syntax=cpp.doxygen
@@ -18,6 +18,7 @@
#include "megbrain/graph/cg.h"
#include "megbrain/tensor.h"
#include "megbrain/utils/mempool.h"
#include "./numpy_dtypes.h"
namespace py = pybind11;
@@ -390,16 +391,24 @@ HostTensorND lowbit_ndarray_to_host_tensor(
} else {
mgb_assert(layout.ndim && layout.ndim <= TensorShape::MAX_NDIM,
"unsupported ndim %zu", layout.ndim);
for (size_t i = 0; i < layout.ndim; ++ i) {
layout.shape[i] = PyArray_SHAPE(input)[i];
layout.stride[i] = PyArray_STRIDE(input, i);
TensorLayout ly;
ly.ndim = layout.ndim;
for (size_t i = 0; i < layout.ndim; ++i) {
ly.shape[i] = layout.shape[i] = PyArray_SHAPE(input)[i];
ly.stride[i] = PyArray_STRIDE(input, i);
mgb_assert(layout.shape[i], "zero shape not supported");
}
mgb_assert(layout.is_contiguous());
mgb_assert(ly.is_physical_contiguous());
layout.init_contiguous_stride();
}
HostTensorND ret{comp_node, layout};
lowbit_memcpy_byte2compact(layout.dtype, ret.raw_ptr(), src_ptr,
layout.total_nr_elems());
if (layout.format.is_lowbit_aligned()) {
mgb_assert(layout.is_contiguous());
lowbit_memcpy_byte2aligned(ret.raw_ptr(), src_ptr, layout);
} else {
lowbit_memcpy_byte2compact(layout.dtype, ret.raw_ptr(), src_ptr,
layout.total_nr_elems());
}
return ret;
}
@@ -423,10 +432,8 @@ std::pair<HostTensorND, bool> np2tensor_try_borrow(
}
// make result from PyArrayObject; its reference may be stolen
auto make_from_arr = [&](PyArrayObject *input, bool allow_borrow) {
TensorLayout layout;
layout.dtype = dtype_np2mgb_descr(PyArray_DESCR(input));
auto make_from_arr = [&](PyArrayObject* input, bool allow_borrow) {
TensorLayout layout{{}, dtype_np2mgb_descr(PyArray_DESCR(input))};
if (dtype.valid())
mgb_assert(dtype == layout.dtype);
layout.ndim = PyArray_NDIM(input);
@@ -605,8 +612,15 @@ PyObject* ndarray_from_tensor(
if (val.dtype().is_low_bit()) {
mgb_assert(share_type != ShareType::MUST_SHARE,
"can not share memory for lowbit dtype");
lowbit_memcpy_compact2byte(val.dtype(), alloc_new_ret(), val.raw_ptr(),
val.layout().total_nr_elems());
const auto& layout = val.layout();
if (layout.format.is_lowbit_aligned()) {
lowbit_memcpy_aligned2byte(alloc_new_ret(), val.raw_ptr(),
val.layout());
} else {
lowbit_memcpy_compact2byte(val.dtype(), alloc_new_ret(),
val.raw_ptr(),
val.layout().total_nr_elems());
}
} else if (share_type == ShareType::MUST_UNSHARE) {
memcpy(alloc_new_ret(), val.raw_ptr(), val.layout().span().dist_byte());
} else {
@@ -290,7 +290,7 @@ Tensor::Tensor(const DeviceTensorND &dv, const HostTensorND& hv) {
}
Tensor::Tensor(const TensorLayout& layout, const CompNode& cn)
: m_layout{layout}, m_blob{Blob::make(cn, layout.dtype.size(layout.total_nr_elems()))},
: m_layout{layout}, m_blob{Blob::make(cn, layout.span().dist_byte())},
m_offset{0} {}
Tensor::Tensor(const BlobPtr blob, const size_t offset, const TensorLayout& layout)
@@ -359,19 +359,6 @@ struct LowbitMemcpy<bits, true> {
}
};
template<typename DT>
struct QuantizedLowbitTrait;
template<>
struct QuantizedLowbitTrait<dtype::Quantized4Asymm> {
static constexpr int8_t SHIFT = 0;
};
template<>
struct QuantizedLowbitTrait<dtype::QuantizedS4> {
static constexpr int8_t SHIFT = 8;
};
template <typename DT, bool div_byte = (DTypeTrait<DT>::category ==
DTypeCategory::QUANTIZED) &&
(8 % DTypeTrait<DT>::low_bit == 0)>
@@ -452,4 +439,44 @@ void mgb::lowbit_memcpy_compact2byte(
mgb_throw(MegBrainError, "bad dtype for lowbit: %s", dtype.name());
}
void mgb::lowbit_memcpy_byte2aligned(void* dest, const void* src,
const ::megdnn::TensorLayout& layout) {
size_t low_bit = layout.dtype.low_bit();
size_t dim = layout.shape[layout.ndim - 1];
if ((dim * low_bit) % 8) { // padding
size_t n = layout.total_nr_elems();
size_t stride = divup<size_t>(dim * low_bit, 8);
dt_byte* dest_ptr = reinterpret_cast<dt_byte*>(dest);
const dt_byte* src_ptr = reinterpret_cast<const dt_byte*>(src);
for (size_t i = 0; i < n / dim; ++i) {
lowbit_memcpy_byte2compact(layout.dtype, dest_ptr, src_ptr, dim);
dest_ptr += stride;
src_ptr += dim;
}
} else {
lowbit_memcpy_byte2compact(layout.dtype, dest, src,
layout.total_nr_elems());
}
}
void mgb::lowbit_memcpy_aligned2byte(void* dest, const void* src,
const ::megdnn::TensorLayout& layout) {
size_t low_bit = layout.dtype.low_bit();
size_t dim = layout.shape[layout.ndim - 1];
if ((dim * low_bit) % 8) { // padding
size_t n = layout.total_nr_elems();
size_t stride = divup<size_t>(dim * low_bit, 8);
dt_byte* dest_ptr = reinterpret_cast<dt_byte*>(dest);
const dt_byte* src_ptr = reinterpret_cast<const dt_byte*>(src);
for (size_t i = 0; i < n / dim; ++i) {
lowbit_memcpy_compact2byte(layout.dtype, dest_ptr, src_ptr, dim);
dest_ptr += dim;
src_ptr += stride;
}
} else {
lowbit_memcpy_compact2byte(layout.dtype, dest, src,
layout.total_nr_elems());
}
}
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
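A usage sketch for the two new helpers (the shape and buffer types are illustrative assumptions; as in the divup-based stride above, each row of 4-bit values is padded to whole bytes in the aligned form):

    // plus <vector>, <cstdint> and the megdnn/megbrain headers for the types used
    using namespace mgb;

    megdnn::TensorLayout layout{{2, 7}, megdnn::dtype::QuantizedS4{1.f}};
    std::vector<uint8_t> bytes(layout.total_nr_elems());      // one value per byte
    std::vector<uint8_t> aligned(layout.span().dist_byte());  // byte-aligned packed form
    // pack: 7 values per row -> divup(7 * 4, 8) = 4 bytes per row in `aligned`
    lowbit_memcpy_byte2aligned(aligned.data(), bytes.data(), layout);
    // unpack back to one value per byte
    lowbit_memcpy_aligned2byte(bytes.data(), aligned.data(), layout);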
@@ -1340,15 +1340,19 @@ void VarNodeMemManager::make_dev_tensor_from_mem_plan_single(
void VarNodeMemManager::var_alloc_with_shape(VarNode* var,
const TensorShape& shape,
size_t size_req) {
mgb_assert(var->format().is_default(),
bool cond_default = var->format().is_default();
bool cond_lowbit = var->dtype().is_quantized_lowbit() &&
var->format().is_lowbit_aligned();
mgb_assert(cond_default || cond_lowbit,
"dynamic shape is currently only supported for var with "
"default format; got %s",
var->format().to_string().c_str());
var->shape(shape);
TensorLayout ly{shape, var->dtype()};
if (size_req != 0) {
mgb_assert(var->dtype().size(shape.total_nr_elems()) <= size_req);
mgb_assert(ly.span().dist_byte() <= size_req);
} else {
size_req = var->dtype().size(shape.total_nr_elems());
size_req = ly.span().dist_byte();
}
auto&& mplan = var->m_mem_plan;
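The switch to span().dist_byte() matters for low-bit layouts: with byte-aligned rows the allocated span can be larger than dtype().size(total_nr_elems()), which only counts densely packed elements. A rough illustration (assuming the layout picks up the byte-aligned low-bit format as in the TensorFormat change above):

    megdnn::TensorLayout ly{{3, 3}, megdnn::dtype::QuantizedS4{1.f}};
    size_t packed = ly.dtype.size(ly.total_nr_elems());  // bytes for densely packed nibbles
    size_t padded = ly.span().dist_byte();               // >= packed: rows padded to whole bytes
    // allocating only `packed` bytes could be too small for the aligned layout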
@@ -202,6 +202,17 @@ void lowbit_memcpy_byte2compact(
void lowbit_memcpy_compact2byte(
DType dtype, void *dest, const void *src, size_t n);
/*!
* \brief copy from byte representation to an aligned tensor for lowbit types
*/
void lowbit_memcpy_byte2aligned(void* dest, const void* src,
const ::megdnn::TensorLayout& ly);
/*!
* \brief copy from an aligned tensor to byte representation for lowbit types
*/
void lowbit_memcpy_aligned2byte(void* dest, const void* src,
const ::megdnn::TensorLayout& ly);
} // namespace mgb
@@ -4454,314 +4454,6 @@ TEST(TestGoptInference, PaddingChannelsWithWarpPerspective) {
MGB_ASSERT_TENSOR_EQ(t1, t2);
}
TEST(TestGoptInference, EnableNCHW64Basic) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
cn.activate();
REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
HostTensorGenerator<dtype::Int8> gen;
auto graph = ComputingGraph::make();
graph->options().graph_opt_level = 0;
auto mkvar = [&](const char* name, const TensorShape& shp,
const DType& dtype) {
return opr::TypeCvt::make(
opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
dtype);
};
auto mkcvar = [&](const char* name, const TensorShape& shp,
const DType& dtype) {
return opr::TypeCvt::make(
opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
.rename(name),
dtype);
};
auto x = mkvar("x", {16, 4, 14, 14}, dtype::QuantizedS8(2.5f)),
w = mkcvar("w", {16, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
b = mkcvar("b", {1, 16, 1, 1}, dtype::QuantizedS32(6.25f));
opr::ConvBias::Param param;
param.format = opr::ConvBias::Param::Format::NCHW;
param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY;
param.stride_h = param.stride_w = 1;
param.pad_h = param.pad_w = 1;
auto y = opr::ConvBias::make(x, w, b, param, {},
OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
auto w1 = mkcvar("w1", {32, 16, 3, 3}, dtype::QuantizedS8(2.5f)),
b1 = mkcvar("b1", {1, 32, 1, 1}, dtype::QuantizedS32(6.25f));
auto y1 = opr::ConvBias::make(y, w1, b1, param, {},
OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
auto w2 = mkcvar("w2", {64, 32, 3, 3}, dtype::QuantizedS8(2.5f)),
b2 = mkcvar("b2", {1, 64, 1, 1}, dtype::QuantizedS32(6.25f));
auto y2 = opr::ConvBias::make(y1, w2, b2, param, {},
OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
y2 = opr::TypeCvt::make(y2, dtype::QuantizedS4{40.f});
auto w3 = mkcvar("w3", {64, 64, 3, 3}, dtype::QuantizedS4(2.5f)),
b3 = mkcvar("b3", {1, 64, 1, 1}, dtype::QuantizedS32(100.f));
auto y3 = opr::ConvBias::make(y2, w3, b3, param, {},
OperatorNodeConfig{dtype::QuantizedS4{40.f}});
y3 = opr::TypeCvt::make(y3, dtype::QuantizedS8{2.5f});
auto w4 = mkcvar("w4", {16, 64, 3, 3}, dtype::QuantizedS8(2.5f)),
b4 = mkcvar("b4", {1, 16, 1, 1}, dtype::QuantizedS32(6.25f));
auto y4 = opr::ConvBias::make(y3, w4, b4, param, {},
OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
using ElemMultiMode = opr::ElemwiseMultiType::Param::Mode;
auto y5 = opr::ElemwiseMultiType::make(
{y, y4}, {ElemMultiMode::QFUSE_ADD_RELU},
OperatorNodeConfig{dtype::QuantizedS8{1.3f}});
y5 = opr::TypeCvt::make(y5, dtype::Float32());
SymbolVar y5_pad;
unpack_vector(
gopt::GraphOptimizer{}
.add_pass(gopt::EnableNCHW64Pass::make_nchw64_converter())
.apply({{y5}})
.endpoint_vars(),
y5_pad);
EXPECT_TRUE(y5.node()->shape().eq_shape(y5_pad.node()->shape()));
SmallVector<cg::OperatorNodeBase*> oprs;
auto cb = [&oprs](cg::OperatorNodeBase* opr) {
if (opr->same_type<opr::ConvBias>()) {
oprs.push_back(opr);
}
};
cg::DepOprIter{cb}.add(y5_pad.node()->owner_opr());
ASSERT_EQ(oprs.size(), 5);
using Format = opr::ConvBiasForward::Param::Format;
#define CHECK(_i, _fmt)                                                  \
{                                                                        \
const auto& o = oprs[_i]->cast_final<opr::ConvBias>();                   \
ASSERT_EQ(o.param().format, Format::_fmt);                               \
}
CHECK(0, NCHW4);
CHECK(1, NCHW4);
CHECK(2, NCHW32);
CHECK(3, NCHW64);
CHECK(4, NCHW4);
#undef CHECK
HostTensorND t1, t2;
auto func1 = graph->compile({make_callback_copy(y5, t1)});
func1->execute();
auto func2 = graph->compile({make_callback_copy(y5_pad, t2)});
func2->execute();
MGB_ASSERT_TENSOR_EQ(t1, t2);
}
TEST(TestGoptInference, EnableNCHW64PaddingChannel) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
cn.activate();
REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
HostTensorGenerator<dtype::Int8> gen;
auto graph = ComputingGraph::make();
graph->options().graph_opt_level = 0;
auto mkvar = [&](const char* name, const TensorShape& shp,
const DType& dtype) {
return opr::TypeCvt::make(
opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
dtype);
};
auto mkcvar = [&](const char* name, const TensorShape& shp,
const DType& dtype) {
return opr::TypeCvt::make(
opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
.rename(name),
dtype);
};
auto x = mkvar("x", {16, 4, 14, 14}, dtype::QuantizedS8(2.5f)),
w = mkcvar("w", {20, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
b = mkcvar("b", {1, 20, 1, 1}, dtype::QuantizedS32(6.25f));
opr::ConvBias::Param param;
param.format = opr::ConvBias::Param::Format::NCHW;
param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY;
param.stride_h = param.stride_w = 1;
param.pad_h = param.pad_w = 1;
auto y = opr::ConvBias::make(x, w, b, param, {},
OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
opr::Pooling::Param pool;
pool.format = opr::Pooling::Param::Format::NCHW;
y = opr::Pooling::make(y, pool);
auto w1 = mkcvar("w1", {24, 20, 3, 3}, dtype::QuantizedS8(2.5f)),
b1 = mkcvar("b1", {1, 24, 1, 1}, dtype::QuantizedS32(6.25f));
auto y1 = opr::ConvBias::make(y, w1, b1, param, {},
OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
auto w2 = mkcvar("w2", {20, 24, 3, 3}, dtype::QuantizedS8(2.5f)),
b2 = mkcvar("b2", {1, 20, 1, 1}, dtype::QuantizedS32(6.25f));
auto y2 = opr::ConvBias::make(y1, w2, b2, param, {},
OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
y2 = opr::TypeCvt::make(y2, dtype::QuantizedS4{40.f});
auto w3 = mkcvar("w3", {64, 20, 3, 3}, dtype::QuantizedS4(2.5f)),
b3 = mkcvar("b3", {1, 64, 1, 1}, dtype::QuantizedS32(100.f));
auto y3 = opr::ConvBias::make(y2, w3, b3, param, {},
OperatorNodeConfig{dtype::QuantizedS4{40.f}});
auto w4 = mkcvar("w4", {20, 64, 3, 3}, dtype::QuantizedS4(2.5f)),
b4 = mkcvar("b4", {1, 20, 1, 1}, dtype::QuantizedS32(100.f));
auto y4 = opr::ConvBias::make(y3, w4, b4, param, {},
OperatorNodeConfig{dtype::QuantizedS4{40.f}});
y4 = opr::TypeCvt::make(y4, dtype::QuantizedS8{2.5f});
using ElemMultiMode = opr::ElemwiseMultiType::Param::Mode;
auto y5 = opr::ElemwiseMultiType::make(
{y, y4}, {ElemMultiMode::QFUSE_ADD_RELU},
OperatorNodeConfig{dtype::QuantizedS8{1.2f}});
opr::ConvolutionBackwardData::Param deconv;
deconv.format = opr::ConvolutionBackwardData::Param::Format::NCHW;
deconv.stride_h = deconv.stride_w = 2;
deconv.pad_h = deconv.pad_w = 1;
auto w6 = mkcvar("w6", {20, 64, 4, 4}, dtype::QuantizedS8{2.5f});
auto y6 = opr::ConvolutionBackwardData::make(
w6, y5, deconv, {},
OperatorNodeConfig{dtype::QuantizedS8(2.0f)});
y6 = opr::TypeCvt::make(y6, dtype::QuantizedS4{32.f});
std::shared_ptr<HostTensorND> mat = std::make_shared<HostTensorND>(
cn, TensorShape{16, 3, 3}, dtype::Float32());
warp_perspective_mat_gen(*mat, 16, 14, 14);
auto mat_var = opr::Host2DeviceCopy::make(*graph, mat).rename("mat");
opr::WarpPerspective::Param warp_param;
warp_param.format = opr::WarpPerspective::Param::Format::NCHW;
auto y7 = opr::WarpPerspective::make(y6, mat_var, TensorShape{14, 14},
warp_param);
y7 = opr::TypeCvt::make(y7, dtype::Float32());
SymbolVar y7_pad;
auto opt = gopt::OptimizeForInferenceOptions{};
opt.enable_nchw64();
unpack_vector(gopt::optimize_for_inference({y7}, opt), y7_pad);
EXPECT_TRUE(y7.node()->shape().eq_shape(y7_pad.node()->shape()));
HostTensorND t1, t2;
auto func1 = graph->compile({make_callback_copy(y7, t1)});
func1->execute();
auto func2 = graph->compile({make_callback_copy(y7_pad, t2)});
func2->execute();
MGB_ASSERT_TENSOR_EQ(t1, t2);
using Format = opr::ConvBiasForward::Param::Format;
SmallVector<cg::OperatorNodeBase*> oprs;
auto cb = [&oprs](cg::OperatorNodeBase* opr) {
if (opr->same_type<opr::ConvBias>()) {
oprs.push_back(opr);
}
};
cg::DepOprIter{cb}.add(y7_pad.node()->owner_opr());
ASSERT_EQ(oprs.size(), 5);
#define CHECK(_i, _fmt)                                                  \
{                                                                        \
const auto& o = oprs[_i]->cast_final<opr::ConvBias>();                   \
ASSERT_EQ(o.param().format, Format::_fmt);                               \
}
CHECK(0, NCHW4);
CHECK(1, NCHW32);
CHECK(2, NCHW32);
CHECK(3, NCHW64);
CHECK(4, NCHW64);
#undef CHECK
{
const auto& deconv = find_opr<opr::ConvolutionBackwardData>(y7_pad);
ASSERT_EQ(deconv.param().format, Format::NCHW4);
const auto& pool = find_opr<opr::PoolingForward>(y7_pad);
ASSERT_EQ(pool.param().format, Format::NCHW4);
const auto& warp = find_opr<opr::WarpPerspectiveForward>(y7_pad);
ASSERT_EQ(warp.param().format, Format::NCHW64);
}
size_t nr_dimshuffle = find_opr_num<opr::Dimshuffle>(y7_pad);
ASSERT_EQ(nr_dimshuffle, 8);
}
TEST(TestGoptInference, EnableNCHW64FuseConvBiasZ) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
cn.activate();
REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
HostTensorND t1, t2;
HostTensorGenerator<dtype::Int8> gen;
auto graph = ComputingGraph::make();
graph->options().graph_opt_level = 0;
auto mkvar = [&](const char* name, const TensorShape& shp,
const DType& dtype) {
return opr::TypeCvt::make(
opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
dtype);
};
auto mkcvar = [&](const char* name, const TensorShape& shp,
const DType& dtype) {
return opr::TypeCvt::make(
opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
.rename(name),
dtype);
};
auto x = mkvar("x", {16, 4, 14, 14}, dtype::QuantizedS8(2.5f)),
w = mkcvar("w", {32, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
b = mkcvar("b", {1, 32, 1, 1}, dtype::QuantizedS32(6.25f));
opr::ConvBias::Param param;
param.format = opr::ConvBias::Param::Format::NCHW;
param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY;
param.stride_h = param.stride_w = 1;
param.pad_h = param.pad_w = 1;
auto y = opr::ConvBias::make(x, w, b, param, {},
OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
auto w1 = mkcvar("w1", {64, 32, 3, 3}, dtype::QuantizedS8(2.5f)),
b1 = mkcvar("b1", {1, 64, 1, 1}, dtype::QuantizedS32(6.25f));
auto y1 = opr::ConvBias::make(y, w1, b1, param, {},
OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
y1 = opr::TypeCvt::make(y1, dtype::QuantizedS4{40.f});
auto w2 = mkcvar("w2", {64, 64, 3, 3}, dtype::QuantizedS4(2.5f)),
b2 = mkcvar("b2", {1, 64, 1, 1}, dtype::QuantizedS32(100.f));
auto y2 = opr::ConvBias::make(y1, w2, b2, param, {},
OperatorNodeConfig{dtype::QuantizedS4{40.f}});
auto w3 = mkcvar("w3", {64, 64, 3, 3}, dtype::QuantizedS4(2.5f)),
b3 = mkcvar("b3", {1, 64, 1, 1}, dtype::QuantizedS32(100.f));
auto y3 = opr::ConvBias::make(y2, w3, b3, param, {},
OperatorNodeConfig{dtype::QuantizedS4(40.f)});
using ElemMultiMode = opr::ElemwiseMultiType::Param::Mode;
auto y4 = opr::ElemwiseMultiType::make(
{y1, y3}, {ElemMultiMode::QFUSE_ADD_RELU},
OperatorNodeConfig{dtype::QuantizedS4{40.f}});
y4 = opr::TypeCvt::make(y4, dtype::Float32());
auto y5 = opr::ConvBias::make(y2, w3, b3, y1, param, {},
OperatorNodeConfig{dtype::QuantizedS4(40.f)});
y5 = opr::TypeCvt::make(y5, dtype::Float32());
SymbolVar y4_pad;
auto opt = gopt::OptimizeForInferenceOptions{};
opt.enable_nchw64();
unpack_vector(gopt::optimize_for_inference({y4}, opt), y4_pad);
EXPECT_TRUE(y4.node()->shape().eq_shape(y4_pad.node()->shape()));
size_t nr_elem_mult_type = find_opr_num<opr::ElemwiseMultiType>(y4_pad);
ASSERT_EQ(nr_elem_mult_type, 0);
auto func = graph->compile({make_callback_copy(y4_pad, t1)});
func->execute();
{
opr::ConvBias::Param param;
param.format = opr::ConvBias::Param::Format::NCHW;
param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY;
param.stride_h = param.stride_w = 1;
param.pad_h = param.pad_w = 1;
auto y = opr::ConvBias::make(
x, w, b, param, {},
OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
auto y1 = opr::ConvBias::make(
y, w1, b1, param, {},
OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
y1 = opr::TypeCvt::make(y1, dtype::QuantizedS4{40.f});
auto y2 = opr::ConvBias::make(
y1, w2, b2, param, {},
OperatorNodeConfig{dtype::QuantizedS4{40.f}});
param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
auto y3 = opr::ConvBias::make(
y2, w3, b3, y1, param, {},
OperatorNodeConfig{dtype::QuantizedS4(40.f)});
y3 = opr::TypeCvt::make(y3, dtype::Float32());
auto func = graph->compile({make_callback_copy(y3, t2)});
func->execute();
}
MGB_ASSERT_TENSOR_EQ(t1, t2);
}
#endif
@@ -2604,174 +2604,6 @@ TEST_F(TestNoWeightPreprocess, NoPreprocess) {
#endif
namespace {
TEST(TestOprDNN, ConvBiasInt4NCHW) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
cn.activate();
auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
auto sm_ver = prop.major * 10 + prop.minor;
if (sm_ver != 75) {
printf("This testcast ignored due to insufficient cuda cap(got: %d, "
"expected: %d)\n",
sm_ver, 75);
return;
}
auto run = [&cn](size_t N, size_t C, size_t H, size_t W, size_t F, size_t S,
size_t P) {
auto graph = ComputingGraph::make();
HostTensorGenerator<dtype::Int8> gen;
auto mkvar = [&gen](const char* name, const TensorShape& shp,
const DType& dtype,
std::shared_ptr<ComputingGraph> graph,
const CompNode& cn) {
return opr::TypeCvt::make(
opr::Host2DeviceCopy::make(*graph, gen(shp, cn))
.rename(name),
dtype);
};
auto mkcvar = [&gen](const char* name, const TensorShape& shp,
const DType& dtype,
std::shared_ptr<ComputingGraph> graph,
const CompNode& cn) {
return opr::TypeCvt::make(
opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
.rename(name),
dtype);
};
using Policy = opr::ConvBias::ExecutionPolicy;
using Strategy = Policy::Strategy;
auto x = mkvar("x", {N, C * 4, H, W}, dtype::QuantizedS4(1.19960327f),
graph, cn),
w = mkcvar("w1", {C, C * 4, F, F}, dtype::QuantizedS4(1.19970327f),
graph, cn),
b = mkcvar("b1", {1, C, 1, 1},
dtype::QuantizedS32(1.19960327f * 1.19970327f), graph,
cn);
opr::ConvBias::Param param;
param.format = opr::ConvBias::Param::Format::NCHW;
param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
param.stride_h = param.stride_w = S;
param.pad_h = param.pad_w = P;
Policy policy;
policy.strategy = Strategy::PROFILE;
auto y = opr::ConvBias::make(
x, w, b, param, policy,
OperatorNodeConfig{dtype::QuantizedS4(11.9960501f)});
y = opr::TypeCvt::make(y, dtype::Float32());
auto x_f32 = opr::TypeCvt::make(x, dtype::Float32()),
w_f32 = opr::TypeCvt::make(w, dtype::Float32()),
b_f32 = opr::TypeCvt::make(b, dtype::Float32());
auto y_f32 = opr::ConvBias::make(x_f32, w_f32, b_f32, param, policy);
auto y_q4 = opr::TypeCvt::make(y_f32, dtype::QuantizedS4{11.9960501f});
y_q4 = opr::TypeCvt::make(y_q4, dtype::Float32());
HostTensorND host_y, host_y_q4;
auto func = graph->compile({make_callback_copy(y, host_y),
make_callback_copy(y_q4, host_y_q4)});
func->execute();
MGB_ASSERT_TENSOR_NEAR(host_y, host_y_q4, 1e-3);
};
run(2, 64, 14, 14, 3, 2, 1);
run(2, 64, 7, 7, 3, 1, 1);
run(2, 64, 14, 14, 1, 2, 0);
run(2, 64, 7, 7, 1, 1, 0);
}
TEST(TestOprDNN, ConvBiasInt4NCHW64) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
cn.activate();
auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
auto sm_ver = prop.major * 10 + prop.minor;
if (sm_ver != 75) {
printf("This testcast ignored due to insufficient cuda cap(got: %d, "
"expected: %d)\n",
sm_ver, 75);
return;
}
auto nchw2nchw64 = [](SymbolVar x) {
auto y = opr::RelayoutFormat::make(
x, opr::RelayoutFormat::Param::Mode::NCHW_NCHW64);
return y;
};
auto nchw642nchw = [](SymbolVar x) {
auto y = opr::RelayoutFormat::make(
x, opr::RelayoutFormat::Param::Mode::NCHW64_NCHW);
return y;
};
auto run = [&](size_t N, size_t C, size_t H, size_t W, size_t F, size_t S,
size_t P) {
auto graph = ComputingGraph::make();
HostTensorGenerator<dtype::Int8> gen;
auto mkvar = [&gen](const char* name, const TensorShape& shp,
const DType& dtype,
std::shared_ptr<ComputingGraph> graph,
const CompNode& cn) {
return opr::TypeCvt::make(
opr::Host2DeviceCopy::make(*graph, gen(shp, cn))
.rename(name),
dtype);
};
auto mkcvar = [&gen](const char* name, const TensorShape& shp,
const DType& dtype,
std::shared_ptr<ComputingGraph> graph,
const CompNode& cn) {
return opr::TypeCvt::make(
opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
.rename(name),
dtype);
};
using Policy = opr::ConvBias::ExecutionPolicy;
using Strategy = Policy::Strategy;
auto x = mkvar("x", {N, C / 16, H, W, 64},
dtype::QuantizedS4(1.19960327f), graph, cn),
w = mkcvar("w1", {C, C / 16, F, F, 64},
dtype::QuantizedS4(1.19970327f), graph, cn),
b = mkcvar("b1", {1, C / 64, 1, 1, 64},
dtype::QuantizedS32(1.19960327f * 1.19970327f), graph,
cn);
opr::ConvBias::Param param;
param.format = opr::ConvBias::Param::Format::NCHW64;
param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
param.stride_h = param.stride_w = S;
param.pad_h = param.pad_w = P;
Policy policy;
policy.strategy = Strategy::PROFILE;
auto y = opr::ConvBias::make(
x, w, b, param, policy,
OperatorNodeConfig{dtype::QuantizedS4(11.9960501f)});
y = opr::TypeCvt::make(y, dtype::Float32());
x = nchw642nchw(x);
w = nchw642nchw(w);
b = nchw642nchw(b);
auto x_f32 = opr::TypeCvt::make(x, dtype::Float32()),
w_f32 = opr::TypeCvt::make(w, dtype::Float32()),
b_f32 = opr::TypeCvt::make(b, dtype::Float32());
param.format = opr::ConvBias::Param::Format::NCHW;
auto y_f32 = opr::ConvBias::make(x_f32, w_f32, b_f32, param, policy);
auto y_q4 = opr::TypeCvt::make(y_f32, dtype::QuantizedS4{11.9960501f});
y_q4 = opr::TypeCvt::make(y_q4, dtype::Float32());
y_q4 = nchw2nchw64(y_q4);
HostTensorND host_y, host_y_q4;
auto func = graph->compile({make_callback_copy(y, host_y),
make_callback_copy(y_q4, host_y_q4)});
func->execute();
MGB_ASSERT_TENSOR_NEAR(host_y, host_y_q4, 1e-3);
};
run(2, 64, 14, 14, 3, 2, 1);
run(2, 64, 7, 7, 3, 1, 1);
run(2, 64, 14, 14, 1, 2, 0);
run(2, 64, 7, 7, 1, 1, 0);
}
TEST(TestOprDNN, ConvBiasInt4Serialize) {
using namespace serialization;
@@ -2783,7 +2615,7 @@ TEST(TestOprDNN, ConvBiasInt4Serialize) {
HostTensorGenerator<dtype::Int8> gen;
std::shared_ptr<HostTensorND> xv;
auto mkvar = [&gen](const char* name, const DType& dtype,
auto mkvar = [](const char* name, const DType& dtype,
std::shared_ptr<ComputingGraph> graph,
std::shared_ptr<HostTensorND> val) {
return opr::TypeCvt::make(
@@ -2856,9 +2688,9 @@ TEST(TestOprDNN, ConvBiasInt4SerializeWithParamFuse) {
HostTensorGenerator<dtype::Int8> gen;
std::shared_ptr<HostTensorND> xv;
auto mkvar = [&gen](const char* name, const DType& dtype,
std::shared_ptr<ComputingGraph> graph,
std::shared_ptr<HostTensorND> val) {
auto mkvar = [](const char* name, const DType& dtype,
std::shared_ptr<ComputingGraph> graph,
std::shared_ptr<HostTensorND> val) {
return opr::TypeCvt::make(
opr::Host2DeviceCopy::make(*graph, val).rename(name), dtype);
};
@@ -62,7 +62,12 @@ bool contains_any_in_set(const SmallVector<T>& list,
void check_tensor_value_valid(const std::string& name,
const HostTensorND& tensor) {
mgb_assert(tensor.layout().is_physical_contiguous(),
bool cond_normal = tensor.layout().format.is_default() &&
tensor.layout().is_physical_contiguous();
bool cond_lowbit = tensor.layout().dtype.is_quantized_lowbit() &&
tensor.layout().format.is_lowbit_aligned() &&
tensor.layout().is_contiguous();
mgb_assert(cond_normal || cond_lowbit,
"non-contiguous tensor: name=%s layout=%s", name.c_str(),
tensor.layout().to_string().c_str());
if (tensor.dtype() == dtype::Float32()) {
@@ -585,11 +590,12 @@ TensorLayout load_tensor_layout(const fbs::Tensor* tensor) {
layout.ndim = tensor->shape()->size();
std::copy(tensor->shape()->begin(), tensor->shape()->end(),
layout.shape);
layout.init_contiguous_stride();
}
if (tensor->dtype()) {
layout.dtype = fbs::intl::load_dtype(tensor->dtype());
// modify data type inplace for TensorLayout
layout.modify_dtype_inplace(fbs::intl::load_dtype(tensor->dtype()));
}
layout.init_contiguous_stride();
return layout;
} | |||
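As with the pooling change, the ordering here is the point: the dtype (and hence the format) must be fixed before strides are computed, so a deserialized low-bit tensor ends up with byte-aligned strides. A condensed sketch of the intended flow (the flatbuffers accessors are elided and `loaded_dtype` is a placeholder for fbs::intl::load_dtype(tensor->dtype())):

    megdnn::TensorLayout layout;
    layout.ndim = 4;                            // copied from the serialized shape
    // ... copy shape values into layout.shape ...
    layout.modify_dtype_inplace(loaded_dtype);  // also resets layout.format from the dtype
    layout.init_contiguous_stride();            // strides now follow the (possibly low-bit) format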