@@ -281,6 +281,13 @@ struct TensorLayout : public TensorShape {
         add_axis_inplace(axis, 1, stride[axis] * shape[axis]);
     }
+    /*!
+     * \brief modify data type of the layout inplace
+     *
+     * Note that this also modifies the format according to the new data type.
+     */
+    void modify_dtype_inplace(DType dtype);
     /* =================== generate new layout =================== */
     /**
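A minimal usage sketch of the new helper (not part of the patch; shape, dtype and scale are hypothetical, and it assumes the TensorLayout/DType API shown in this diff): switching a layout to a quantized 4-bit dtype also swaps its format to the byte-aligned low-bit format, so contiguous strides should be recomputed afterwards.

    // sketch only, assuming the megdnn headers
    megdnn::TensorLayout ly({16, 32, 7, 7}, megdnn::dtype::Int8());
    ly.modify_dtype_inplace(megdnn::dtype::QuantizedS4(1.f));  // dtype and format both change
    ly.init_contiguous_stride();  // recompute strides under the low-bit aligned format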
@@ -513,6 +513,15 @@ class DType {
     bool is_low_bit() const { return low_bit() != 0; }
+    bool is_quantized_lowbit() const {
+        return low_bit() != 0 &&
+#if MEGDNN_CC_HOST
+               category() == DTypeCategory::QUANTIZED;
+#else
+               category().ev == DTypeCategory::Ev::QUANTIZED;
+#endif
+    }
     /*!
      * \brief size of this data type, in bytes
      */
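For reference, the distinction the new predicate draws, using dtypes that appear in the tests later in this diff (a sketch of the expected behaviour, not part of the patch):

    // assumes the megdnn dtype headers
    megdnn::DType a = megdnn::dtype::IntB2{};           // low-bit, but not quantized
    megdnn::DType b = megdnn::dtype::QuantizedS4(2.f);  // low-bit and quantized
    // a.is_low_bit() == true,  a.is_quantized_lowbit() == false
    // b.is_low_bit() == true,  b.is_quantized_lowbit() == true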
@@ -226,7 +226,7 @@ public:
     std::string to_string() const override;
     //! raise exception if given layout is illegal
-    void assert_valid(const TensorLayout& layout) const;
+    void assert_valid(const TensorLayout& layout) const override;
     void serialize_append(std::string& result) const override;
@@ -282,6 +282,11 @@ void TensorLayout::add_axis_inplace(size_t axis, size_t shape,
     this->stride[axis] = stride;
 }
+void TensorLayout::modify_dtype_inplace(DType dtype_) {
+    dtype = dtype_;
+    format = Format(dtype);
+}
 bool TensorLayout::is_contiguous() const {
     return format.impl()->is_contiguous_spec(*this);
 }
@@ -952,7 +952,12 @@ ConvolutionBase<Parameter>::deduce_layout_fwd(const TensorLayout& src,
         megdnn_assert(src[4] == 4);
         dst[4] = 4;
     }
-    dst.format = src.format;
+    if (!src.format.is_default() &&
+        !src.format.is_lowbit_aligned()) {  // propagate
+        dst.format = src.format;
+    } else {  // determined by dtype
+        dst.format = TensorFormat(dst.dtype);
+    }
     dst.init_contiguous_stride();
     return cflt;
 }
@@ -46,14 +46,15 @@ TensorFormat TensorFormat::deserialize(const std::string& bin,
 TensorFormat::Format() : m_impl{DefaultTensorFormat::make().m_impl} {}
 TensorFormat::Format(DType dtype) {
-    megdnn_assert(dtype.valid());
-    if (dtype.is_low_bit()) {
+    if (dtype.valid() &&
+        dtype.is_quantized_lowbit()) {  // quantized lowbit, by default
+                                        // aligned to bytes
         size_t size_nbits = dtype.low_bit();
         megdnn_assert(size_nbits == 1 || size_nbits == 2 || size_nbits == 4,
                       "unsupported lowbits data type(%s, size in bits: %zu)",
                       dtype.name(), size_nbits);
         m_impl = LowbitsAlignedToBytesTensorFormat::make(size_nbits).m_impl;
-    } else {
+    } else {  // non parameterized lowbit, default format
         m_impl = DefaultTensorFormat::make().m_impl;
     }
 }
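A small sketch of what the revised constructor now yields (my own example, assuming the format classes referenced above): only quantized low-bit dtypes map to the byte-aligned low-bit format, while everything else, including non-quantized low-bit types such as IntB2, keeps the default format.

    // sketch only
    auto f1 = megdnn::TensorFormat(megdnn::dtype::QuantizedS4(2.f));  // LowbitsAlignedToBytes, 4 bits
    auto f2 = megdnn::TensorFormat(megdnn::dtype::IntB2{});           // default format
    auto f3 = megdnn::TensorFormat(megdnn::dtype::Float32());         // default format
    // f1.is_lowbit_aligned() == true; f2.is_default() == true; f3.is_default() == true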
@@ -89,8 +90,8 @@ bool TensorFormat::is_lowbit_aligned() const {
 /* ===================== DefaultFormat ===================== */
 void DefaultTensorFormat::assert_valid(const TensorLayout& layout) const {
     megdnn_assert(
-            !layout.dtype.valid() || !layout.dtype.is_low_bit(),
-            "DefaultTensorFormat does not support low-bits tensor(dtype:%s)",
+            !layout.dtype.valid() || !layout.dtype.is_quantized_lowbit(),
+            "DefaultTensorFormat does not support quantized lowbit tensor(dtype:%s)",
             layout.dtype.name());
 }
@@ -271,7 +272,8 @@ void Image2DPackedTensorFormatBase<PIXEL_SIZE>::assert_valid(
     auto m_align_axis = align_axis();
     megdnn_assert(!(layout.shape[layout.ndim - 1] % PIXEL_SIZE),
                   "bad shape: %zu", layout.shape[layout.ndim - 1]);
-    megdnn_assert(layout.dtype.valid() && layout.ndim > m_align_axis);
+    megdnn_assert(layout.dtype.valid() && !layout.dtype.is_quantized_lowbit() &&
+                  layout.ndim > m_align_axis);
     ptrdiff_t first_non_zero_stride = 0;
     for (int i = layout.ndim - 1; i >= 0; --i) {
         megdnn_assert(layout.shape[i] && layout.stride[i] >= 0);
@@ -478,6 +480,7 @@ void LowbitsAlignedTensorFormatBase::assert_valid(
     megdnn_assert(layout.dtype.valid() && layout.dtype.is_low_bit() &&
                   layout.dtype.low_bit() == m_size_nbits);
     bool has_dim_unity_stride = false;
+    bool has_dim_aligned_stride = false;
     for (int i = layout.ndim - 1; i >= 0; --i) {
         if (!has_dim_unity_stride && layout.stride[i] == 1)
             has_dim_unity_stride = true;
@@ -485,15 +488,16 @@ void LowbitsAlignedTensorFormatBase::assert_valid(
                 layout.stride[i] >= 0 &&
                         (layout.stride[i] % m_align_size_in_elements == 0 ||
                          layout.stride[i] == 1),
-                "bad stride:%s, %zu", layout.to_string().c_str(),
-                layout.stride[i]);
+                "bad stride:%s, %ld", layout.to_string().c_str(),
+                static_cast<long>(layout.stride[i]));
+        if (!has_dim_aligned_stride &&
+            static_cast<size_t>(layout.stride[i]) == m_align_size_in_elements)
+            has_dim_aligned_stride = true;
     }
-    if (!has_dim_unity_stride &&
-        (int)layout.stride[layout.ndim - 1] ==
-                round_up(1, (int)m_align_size_in_elements))
-        has_dim_unity_stride = true;
-    megdnn_assert(layout.ndim == 0 || has_dim_unity_stride,
-                  "innermost dim not contiguous");
+    megdnn_assert(
+            layout.ndim == 0 || has_dim_unity_stride || has_dim_aligned_stride,
+            "innermost dim not contiguous");
 }
 void LowbitsAlignedTensorFormatBase::serialize_append(
@@ -542,6 +546,7 @@ size_t LowbitsAlignedTensorFormatBase::init_contiguous_stride(
         multiplier = round_up(multiplier, m_align_size_in_elements);
         accum = mul(accum, multiplier);
     }
+    assert_valid(layout);
     return accum;
 }
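A worked example of the padding this enforces (my own numbers, assuming the 4-bit aligned-to-bytes format, i.e. m_align_size_in_elements = 8 / 4 = 2, and that only the innermost extent is rounded up in the loop above): a QuantizedS4 layout of shape {1, 32, 1, 1} gets contiguous strides {64, 2, 2, 1}; broadcasting it to {16, 32, 7, 7} zeroes the broadcast axes to {0, 2, 0, 0}, and collapse_contiguous() then yields shape {16, 32, 49} with strides {0, 2, 0}, which is exactly what the updated EXPECT_EQ later in this diff checks.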
@@ -12,6 +12,7 @@
 #include "./algo.h"
 #include "src/cuda/utils.h"
+#include "src/common/conv_bias.h"
 using namespace megdnn;
 using namespace cuda;
@@ -27,8 +28,7 @@ bool ConvBiasForwardImpl::AlgoFallbackNCHWQS4::is_available(
     bool available = true;
     auto&& param = args.opr->param();
     auto&& fm = args.filter_meta;
-    if (!conv_bias::check_bias_share_in_channel(*(args.bias_layout),
-                                                param.format))
+    if (!check_bias_share_in_channel(*(args.bias_layout), param.format))
         return false;
     if (param.format != Format::NCHW)
         return false;
@@ -128,7 +128,7 @@ void ConvBiasForwardImpl::AlgoFallbackNCHWQS4::exec(
     conv_op->param() = args.opr->param();
     using Format = param::ConvBias::Format;
     conv_op->param().format = Format::NCHW64;
-    ExecArgs args_{dynamic_cast<ConvBiasForwardImpl*>(conv_op.get()),
+    ExecArgs args_{reinterpret_cast<ConvBiasForwardImpl*>(conv_op.get()),
                    src_,
                    filter_,
                    bias_,
@@ -190,7 +190,7 @@ WorkspaceBundle ConvBiasForwardImpl::AlgoFallbackNCHWQS4::get_workspace_bundle(
     conv_op->param() = args.opr->param();
     using Format = param::ConvBias::Format;
     conv_op->param().format = Format::NCHW64;
-    SizeArgs args_{dynamic_cast<ConvBiasForwardImpl*>(conv_op.get()),
+    SizeArgs args_{reinterpret_cast<ConvBiasForwardImpl*>(conv_op.get()),
                    layouts[0],
                    layouts[1],
                    layouts[2],
@@ -64,7 +64,6 @@ public:
     class AlgoInt8CHWN4IMMAImplicitGemmReorderFilter;
     class AlgoInt8CHWN4IMMAImplicitGemmUnrollWidth;
     class AlgoInt8NCHW32IMMAImplicitGemm;
-    class AlgoFallbackNCHWQS4;
     class AlgoBFloat16;
     class AlgoPack;
@@ -151,9 +151,9 @@ void exec_matrix_mul_quint4x4x32_helper(
     MEGDNN_MARK_USED_VAR(format);
     MEGDNN_MARK_USED_VAR(compute_mode);
     auto convert_layout = [](const TensorLayout& layout) {
-        auto ret = layout;
         auto param = layout.dtype.param<dtype::Quantized4Asymm>();
-        ret.dtype = dtype::Quantized8Asymm(param.scale, param.zero_point);
+        TensorLayout ret(layout,
+                         dtype::Quantized8Asymm(param.scale, param.zero_point));
         return ret;
     };
     TensorLayout A_layout, B_layout;
@@ -205,9 +205,8 @@ void exec_matrix_mul_qint4x4x16_helper(
     MEGDNN_MARK_USED_VAR(format);
     MEGDNN_MARK_USED_VAR(compute_mode);
     auto convert_layout = [](const TensorLayout& layout) {
-        auto ret = layout;
         auto param = layout.dtype.param<dtype::QuantizedS4>();
-        ret.dtype = dtype::QuantizedS8(param.scale);
+        TensorLayout ret(layout, dtype::QuantizedS8(param.scale));
         return ret;
     };
     TensorLayout A_layout, B_layout;
@@ -406,8 +406,7 @@ size_t PoolingForwardImpl::get_workspace_in_bytes(const TensorLayout& src,
 }
 namespace {
-void post_process(const TensorND& dst, TensorND& comp_dst, Handle* handle,
-                  WorkspaceBundle& workspace_bundle) {
+void post_process(const TensorND& dst, TensorND& comp_dst) {
     if (dst.layout.dtype.enumv() == DTypeEnum::QuantizedS4) {
         int8_to_int4(comp_dst, dst);
     } else if (dst.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) {
@@ -427,8 +426,8 @@ void PoolingForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
     if (src.layout.dtype.enumv() == DTypeEnum::QuantizedS4) {
         float scale = src.layout.dtype.param<dtype::QuantizedS4>().scale;
         comp_src.layout.dtype = dtype::QuantizedS8(scale);
-        comp_src.layout.init_contiguous_stride();
         comp_src.layout.format = TensorLayout::Format(comp_src.layout.dtype);
+        comp_src.layout.init_contiguous_stride();
         comp_src.raw_ptr = wsb.get(0);
         comp_dst.layout.dtype = dtype::QuantizedS8(scale);
         comp_dst.layout.format = TensorLayout::Format(comp_dst.layout.dtype);
@@ -571,7 +570,7 @@ void PoolingForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
             default:                                  \
                 megdnn_assert(0, "not support mode"); \
         }                                             \
-        post_process(dst, comp_dst, handle(), wsb);   \
+        post_process(dst, comp_dst);                  \
         return;                                       \
     }
@@ -132,7 +132,6 @@ public:
                                : dtype::Float32());
         if (m_fmt.find(i) == m_fmt.end()) {
             layouts[i] = TensorLayout(shapes[i], dt);
-            layouts[i].init_contiguous_stride();
         } else
             layouts[i] = TensorLayout(shapes[i], dt, m_fmt[i]);
     }
@@ -325,13 +325,8 @@ TEST(BASIC_TYPES, TENSOR_LAYOUT_FMT_LOW_BITS) {
     layout = TensorLayout{{1, 32, 1, 1}, dtype::QuantizedS4{1.2f}};
     layout = layout.broadcast({16, 32, 7, 7});
-    EXPECT_EQ(make_layout({16, 32, 49}, {0, 1, 0}, dtype::QuantizedS4{1.2}),
+    EXPECT_EQ(make_layout({16, 32, 49}, {0, 2, 0}, dtype::QuantizedS4{1.2}),
              layout.collapse_contiguous());
-    layout = TensorLayout{{1, 32, 1, 1}, dtype::QuantizedS4{1.2f}};
-    layout.init_contiguous_stride();
-    layout = layout.broadcast({16, 32, 7, 7});
-    ASSERT_THROW(layout.span(), MegDNNError);
 }
 TEST(BASIC_TYPES, TENSOR_LAYOUT_FMT_LOW_BITS_VALID) {
@@ -342,7 +337,7 @@ TEST(BASIC_TYPES, TENSOR_LAYOUT_FMT_LOW_BITS_VALID) {
                              LowbitsAlignedToBytesTensorFormat::make(4_z)),
                  MegDNNError);
     ASSERT_THROW(TensorLayout({16, 32, 7, 7}, dtype::IntB2{},
-                              LowbitsAlignedToBytesTensorFormat::make(2_z)),
+                              LowbitsAlignedToBytesTensorFormat::make(4_z)),
                  MegDNNError);
 }
@@ -343,6 +343,14 @@ static inline bool good_float(dt_qint32) {
     return true;
 }
+static inline bool good_float(dt_qint4) {
+    return true;
+}
+static inline bool good_float(dt_quint4) {
+    return true;
+}
 // A hack for the (x+0) promote to int trick on dt_quint8.
 static inline int operator+(dt_quint8 lhs, int rhs) {
     megdnn_assert(rhs == 0, "unexpected rhs");
@@ -545,12 +545,12 @@ TEST_F(NAIVE, WARP_PERSPECTIVE_NCHW64) {
     using Param = WarpPerspective::Param;
     auto convert_true_format = [](const TensorLayout& layout) {
-        if (layout.ndim == 4)
-            return layout
-                    .reshape({layout[0], layout[1] / 64, layout[2], layout[3],
-                              64})
-                    .dimshuffle({0, 1, 4, 2, 3});
-        else
+        if (layout.ndim == 4) {
+            TensorLayout ret{
+                    {layout[0], layout[1] / 64, layout[2], layout[3], 64},
+                    layout.dtype};
+            return ret.dimshuffle({0, 1, 4, 2, 3});
+        } else
             return layout;
     };
@@ -563,15 +563,16 @@ TEST_F(NAIVE, WARP_PERSPECTIVE_NCHW64) {
     TensorNDArray nchw_tensors;
     for (size_t i = 0; i < tensors.size(); ++i) {
+        TensorLayout ly;
         auto layout = tensors[i].layout;
-        if (layout.dtype.enumv() == DTypeEnum::QuantizedS4)
-            layout.dtype = dtype::QuantizedS4();
-        if (layout.ndim == 5) {
-            layout = layout.reshape({layout[0], layout[1] * layout[4],
-                                     layout[2], layout[3]});
+        if (tensors[i].layout.ndim == 5) {
+            ly = TensorLayout{{layout[0], layout[1] * layout[4], layout[2],
+                               layout[3]},
+                              layout.dtype};
+        } else {
+            ly = layout;
         }
-        nchw_tensors.emplace_back(malloc(layout.span().dist_byte()),
-                                  layout);
+        nchw_tensors.emplace_back(malloc(ly.span().dist_byte()), ly);
     }
     TensorNDArray nchw64_tensors;
     for (size_t i = 0; i < tensors.size(); ++i) {
@@ -617,13 +618,11 @@ TEST_F(NAIVE, WARP_PERSPECTIVE_NCHW64) {
         checker.set_param(param);
         checker.execs({{2, 1, 10, 10, 64}, {2, 3, 3}, {2, 1, 10, 12, 64}});
         checker.execs(
-                {{20, 30, 10, 12, 64}, {20, 3, 3}, {20, 30, 11, 12, 64}});
-        checker.execs(
-                {{220, 3, 10, 10, 64}, {220, 3, 3}, {220, 3, 10, 12, 64}});
-        checker.execs({{1, 25, 25, 24, 64}, {1, 3, 3}, {1, 25, 25, 510, 64}});
-        checker.execs({{1, 25, 25, 510, 64}, {1, 3, 3}, {1, 25, 25, 24, 64}});
-        checker.execs({{1, 25, 25, 24, 64}, {1, 3, 3}, {1, 25, 51, 50, 64}});
-        checker.execs({{1, 25, 51, 50, 64}, {1, 3, 3}, {1, 25, 25, 24, 64}});
+                {{20, 3, 10, 12, 64}, {20, 3, 3}, {20, 3, 11, 12, 64}});
+        checker.execs({{1, 3, 25, 24, 64}, {1, 3, 3}, {1, 3, 25, 51, 64}});
+        checker.execs({{1, 3, 25, 51, 64}, {1, 3, 3}, {1, 3, 25, 24, 64}});
+        checker.execs({{1, 3, 25, 24, 64}, {1, 3, 3}, {1, 3, 51, 50, 64}});
+        checker.execs({{1, 3, 51, 50, 64}, {1, 3, 3}, {1, 3, 25, 24, 64}});
     }
 }
 // vim: syntax=cpp.doxygen
@@ -18,6 +18,7 @@
 #include "megbrain/graph/cg.h"
 #include "megbrain/tensor.h"
 #include "megbrain/utils/mempool.h"
 #include "./numpy_dtypes.h"
 namespace py = pybind11;
@@ -390,16 +391,24 @@ HostTensorND lowbit_ndarray_to_host_tensor(
     } else {
         mgb_assert(layout.ndim && layout.ndim <= TensorShape::MAX_NDIM,
                    "unsupported ndim %zu", layout.ndim);
-        for (size_t i = 0; i < layout.ndim; ++ i) {
-            layout.shape[i] = PyArray_SHAPE(input)[i];
-            layout.stride[i] = PyArray_STRIDE(input, i);
+        TensorLayout ly;
+        ly.ndim = layout.ndim;
+        for (size_t i = 0; i < layout.ndim; ++i) {
+            ly.shape[i] = layout.shape[i] = PyArray_SHAPE(input)[i];
+            ly.stride[i] = PyArray_STRIDE(input, i);
             mgb_assert(layout.shape[i], "zero shape not supported");
         }
-        mgb_assert(layout.is_contiguous());
+        mgb_assert(ly.is_physical_contiguous());
+        layout.init_contiguous_stride();
     }
     HostTensorND ret{comp_node, layout};
-    lowbit_memcpy_byte2compact(layout.dtype, ret.raw_ptr(), src_ptr,
-                               layout.total_nr_elems());
+    if (layout.format.is_lowbit_aligned()) {
+        mgb_assert(layout.is_contiguous());
+        lowbit_memcpy_byte2aligned(ret.raw_ptr(), src_ptr, layout);
+    } else {
+        lowbit_memcpy_byte2compact(layout.dtype, ret.raw_ptr(), src_ptr,
+                                   layout.total_nr_elems());
+    }
     return ret;
 }
@@ -423,10 +432,8 @@ std::pair<HostTensorND, bool> np2tensor_try_borrow(
     }
     // make result from PyArrayObject; its reference may be stolen
-    auto make_from_arr = [&](PyArrayObject *input, bool allow_borrow) {
-        TensorLayout layout;
-        layout.dtype = dtype_np2mgb_descr(PyArray_DESCR(input));
+    auto make_from_arr = [&](PyArrayObject* input, bool allow_borrow) {
+        TensorLayout layout{{}, dtype_np2mgb_descr(PyArray_DESCR(input))};
         if (dtype.valid())
             mgb_assert(dtype == layout.dtype);
         layout.ndim = PyArray_NDIM(input);
@@ -605,8 +612,15 @@ PyObject* ndarray_from_tensor(
     if (val.dtype().is_low_bit()) {
         mgb_assert(share_type != ShareType::MUST_SHARE,
                    "can not share memory for lowbit dtype");
-        lowbit_memcpy_compact2byte(val.dtype(), alloc_new_ret(), val.raw_ptr(),
-                                   val.layout().total_nr_elems());
+        const auto& layout = val.layout();
+        if (layout.format.is_lowbit_aligned()) {
+            lowbit_memcpy_aligned2byte(alloc_new_ret(), val.raw_ptr(),
+                                       val.layout());
+        } else {
+            lowbit_memcpy_compact2byte(val.dtype(), alloc_new_ret(),
+                                       val.raw_ptr(),
+                                       val.layout().total_nr_elems());
+        }
     } else if (share_type == ShareType::MUST_UNSHARE) {
         memcpy(alloc_new_ret(), val.raw_ptr(), val.layout().span().dist_byte());
     } else {
@@ -290,7 +290,7 @@ Tensor::Tensor(const DeviceTensorND &dv, const HostTensorND& hv) {
 }
 Tensor::Tensor(const TensorLayout& layout, const CompNode& cn)
-        : m_layout{layout}, m_blob{Blob::make(cn, layout.dtype.size(layout.total_nr_elems()))},
+        : m_layout{layout}, m_blob{Blob::make(cn, layout.span().dist_byte())},
          m_offset{0} {}
 Tensor::Tensor(const BlobPtr blob, const size_t offset, const TensorLayout& layout)
@@ -359,19 +359,6 @@ struct LowbitMemcpy<bits, true> {
     }
 };
-template<typename DT>
-struct QuantizedLowbitTrait;
-template<>
-struct QuantizedLowbitTrait<dtype::Quantized4Asymm> {
-    static constexpr int8_t SHIFT = 0;
-};
-template<>
-struct QuantizedLowbitTrait<dtype::QuantizedS4> {
-    static constexpr int8_t SHIFT = 8;
-};
 template <typename DT, bool div_byte = (DTypeTrait<DT>::category ==
                                         DTypeCategory::QUANTIZED) &&
                                        (8 % DTypeTrait<DT>::low_bit == 0)>
@@ -452,4 +439,44 @@
     mgb_throw(MegBrainError, "bad dtype for lowbit: %s", dtype.name());
 }
+void mgb::lowbit_memcpy_byte2aligned(void* dest, const void* src,
+                                     const ::megdnn::TensorLayout& layout) {
+    size_t low_bit = layout.dtype.low_bit();
+    size_t dim = layout.shape[layout.ndim - 1];
+    if ((dim * low_bit) % 8) {  // padding
+        size_t n = layout.total_nr_elems();
+        size_t stride = divup<size_t>(dim * low_bit, 8);
+        dt_byte* dest_ptr = reinterpret_cast<dt_byte*>(dest);
+        const dt_byte* src_ptr = reinterpret_cast<const dt_byte*>(src);
+        for (size_t i = 0; i < n / dim; ++i) {
+            lowbit_memcpy_byte2compact(layout.dtype, dest_ptr, src_ptr, dim);
+            dest_ptr += stride;
+            src_ptr += dim;
+        }
+    } else {
+        lowbit_memcpy_byte2compact(layout.dtype, dest, src,
+                                   layout.total_nr_elems());
+    }
+}
+void mgb::lowbit_memcpy_aligned2byte(void* dest, const void* src,
+                                     const ::megdnn::TensorLayout& layout) {
+    size_t low_bit = layout.dtype.low_bit();
+    size_t dim = layout.shape[layout.ndim - 1];
+    if ((dim * low_bit) % 8) {  // padding
+        size_t n = layout.total_nr_elems();
+        size_t stride = divup<size_t>(dim * low_bit, 8);
+        dt_byte* dest_ptr = reinterpret_cast<dt_byte*>(dest);
+        const dt_byte* src_ptr = reinterpret_cast<const dt_byte*>(src);
+        for (size_t i = 0; i < n / dim; ++i) {
+            lowbit_memcpy_compact2byte(layout.dtype, dest_ptr, src_ptr, dim);
+            dest_ptr += dim;
+            src_ptr += stride;
+        }
+    } else {
+        lowbit_memcpy_compact2byte(layout.dtype, dest, src,
+                                   layout.total_nr_elems());
+    }
+}
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
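A hedged usage sketch of the two new helpers (sizes are my own example; the declarations appear in the header hunk below). For a QuantizedS4 tensor of shape {2, 7} under the dtype-derived byte-aligned format introduced in this patch, the byte representation stores one value per byte (14 bytes), while the aligned representation packs each row of 7 nibbles into divup(7 * 4, 8) = 4 bytes, 8 bytes in total:

    #include <cstdint>
    #include <vector>
    // assumes the megbrain/megdnn headers that declare the helpers above
    megdnn::TensorLayout ly({2, 7}, megdnn::dtype::QuantizedS4(1.f));
    ly.init_contiguous_stride();                         // strides {8, 1} under byte alignment
    std::vector<uint8_t> bytes(ly.total_nr_elems());     // 14 bytes, one int4 value per byte
    std::vector<uint8_t> packed(ly.span().dist_byte());  // 8 bytes, rows padded to a byte boundary
    mgb::lowbit_memcpy_byte2aligned(packed.data(), bytes.data(), ly);
    mgb::lowbit_memcpy_aligned2byte(bytes.data(), packed.data(), ly);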
@@ -1340,15 +1340,19 @@ void VarNodeMemManager::make_dev_tensor_from_mem_plan_single(
 void VarNodeMemManager::var_alloc_with_shape(VarNode* var,
                                              const TensorShape& shape,
                                              size_t size_req) {
-    mgb_assert(var->format().is_default(),
+    bool cond_default = var->format().is_default();
+    bool cond_lowbit = var->dtype().is_quantized_lowbit() &&
+                       var->format().is_lowbit_aligned();
+    mgb_assert(cond_default || cond_lowbit,
               "dynamic shape is currently only supported for var with "
               "default format; got %s",
               var->format().to_string().c_str());
     var->shape(shape);
+    TensorLayout ly{shape, var->dtype()};
     if (size_req != 0) {
-        mgb_assert(var->dtype().size(shape.total_nr_elems()) <= size_req);
+        mgb_assert(ly.span().dist_byte() <= size_req);
     } else {
-        size_req = var->dtype().size(shape.total_nr_elems());
+        size_req = ly.span().dist_byte();
     }
     auto&& mplan = var->m_mem_plan;
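The switch to ly.span().dist_byte() matters precisely for the padded low-bit case (numbers are my own example): for shape {1, 32, 7, 7} with QuantizedS4 and the byte-aligned format, var->dtype().size(total_nr_elems()) gives 1568 / 2 = 784 bytes, whereas the aligned layout pads the innermost extent from 7 to 8 nibbles and spans 896 bytes, so the old computation would under-allocate the blob.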
@@ -202,6 +202,17 @@ void lowbit_memcpy_byte2compact(
 void lowbit_memcpy_compact2byte(
         DType dtype, void *dest, const void *src, size_t n);
+/*!
+ * \brief copy from byte representation to an aligned tensor for lowbit types
+ */
+void lowbit_memcpy_byte2aligned(void* dest, const void* src,
+                                const ::megdnn::TensorLayout& ly);
+/*!
+ * \brief copy from an aligned tensor to byte representation for lowbit types
+ */
+void lowbit_memcpy_aligned2byte(void* dest, const void* src,
+                                const ::megdnn::TensorLayout& ly);
 }  // namespace mgb
@@ -4454,314 +4454,6 @@ TEST(TestGoptInference, PaddingChannelsWithWarpPerspective) {
     MGB_ASSERT_TENSOR_EQ(t1, t2);
 }
-TEST(TestGoptInference, EnableNCHW64Basic) {
-    REQUIRE_GPU(1);
-    auto cn = CompNode::load("gpu0");
-    cn.activate();
-    REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
-    HostTensorGenerator<dtype::Int8> gen;
-    auto graph = ComputingGraph::make();
-    graph->options().graph_opt_level = 0;
-    auto mkvar = [&](const char* name, const TensorShape& shp,
-                     const DType& dtype) {
-        return opr::TypeCvt::make(
-                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
-                dtype);
-    };
-    auto mkcvar = [&](const char* name, const TensorShape& shp,
-                      const DType& dtype) {
-        return opr::TypeCvt::make(
-                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
-                        .rename(name),
-                dtype);
-    };
-    auto x = mkvar("x", {16, 4, 14, 14}, dtype::QuantizedS8(2.5f)),
-         w = mkcvar("w", {16, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
-         b = mkcvar("b", {1, 16, 1, 1}, dtype::QuantizedS32(6.25f));
-    opr::ConvBias::Param param;
-    param.format = opr::ConvBias::Param::Format::NCHW;
-    param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY;
-    param.stride_h = param.stride_w = 1;
-    param.pad_h = param.pad_w = 1;
-    auto y = opr::ConvBias::make(x, w, b, param, {},
-                                 OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
-    auto w1 = mkcvar("w1", {32, 16, 3, 3}, dtype::QuantizedS8(2.5f)),
-         b1 = mkcvar("b1", {1, 32, 1, 1}, dtype::QuantizedS32(6.25f));
-    auto y1 = opr::ConvBias::make(y, w1, b1, param, {},
-                                  OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
-    auto w2 = mkcvar("w2", {64, 32, 3, 3}, dtype::QuantizedS8(2.5f)),
-         b2 = mkcvar("b2", {1, 64, 1, 1}, dtype::QuantizedS32(6.25f));
-    auto y2 = opr::ConvBias::make(y1, w2, b2, param, {},
-                                  OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
-    y2 = opr::TypeCvt::make(y2, dtype::QuantizedS4{40.f});
-    auto w3 = mkcvar("w3", {64, 64, 3, 3}, dtype::QuantizedS4(2.5f)),
-         b3 = mkcvar("b3", {1, 64, 1, 1}, dtype::QuantizedS32(100.f));
-    auto y3 = opr::ConvBias::make(y2, w3, b3, param, {},
-                                  OperatorNodeConfig{dtype::QuantizedS4{40.f}});
-    y3 = opr::TypeCvt::make(y3, dtype::QuantizedS8{2.5f});
-    auto w4 = mkcvar("w4", {16, 64, 3, 3}, dtype::QuantizedS8(2.5f)),
-         b4 = mkcvar("b4", {1, 16, 1, 1}, dtype::QuantizedS32(6.25f));
-    auto y4 = opr::ConvBias::make(y3, w4, b4, param, {},
-                                  OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
-    using ElemMultiMode = opr::ElemwiseMultiType::Param::Mode;
-    auto y5 = opr::ElemwiseMultiType::make(
-            {y, y4}, {ElemMultiMode::QFUSE_ADD_RELU},
-            OperatorNodeConfig{dtype::QuantizedS8{1.3f}});
-    y5 = opr::TypeCvt::make(y5, dtype::Float32());
-    SymbolVar y5_pad;
-    unpack_vector(
-            gopt::GraphOptimizer{}
-                    .add_pass(gopt::EnableNCHW64Pass::make_nchw64_converter())
-                    .apply({{y5}})
-                    .endpoint_vars(),
-            y5_pad);
-    EXPECT_TRUE(y5.node()->shape().eq_shape(y5_pad.node()->shape()));
-    SmallVector<cg::OperatorNodeBase*> oprs;
-    auto cb = [&oprs](cg::OperatorNodeBase* opr) {
-        if (opr->same_type<opr::ConvBias>()) {
-            oprs.push_back(opr);
-        }
-    };
-    cg::DepOprIter{cb}.add(y5_pad.node()->owner_opr());
-    ASSERT_EQ(oprs.size(), 5);
-    using Format = opr::ConvBiasForward::Param::Format;
-#define CHECK(_i, _fmt)                                        \
-    {                                                          \
-        const auto& o = oprs[_i]->cast_final<opr::ConvBias>(); \
-        ASSERT_EQ(o.param().format, Format::_fmt);             \
-    }
-    CHECK(0, NCHW4);
-    CHECK(1, NCHW4);
-    CHECK(2, NCHW32);
-    CHECK(3, NCHW64);
-    CHECK(4, NCHW4);
-#undef CHECK
-    HostTensorND t1, t2;
-    auto func1 = graph->compile({make_callback_copy(y5, t1)});
-    func1->execute();
-    auto func2 = graph->compile({make_callback_copy(y5_pad, t2)});
-    func2->execute();
-    MGB_ASSERT_TENSOR_EQ(t1, t2);
-}
-TEST(TestGoptInference, EnableNCHW64PaddingChannel) {
-    REQUIRE_GPU(1);
-    auto cn = CompNode::load("gpu0");
-    cn.activate();
-    REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
-    HostTensorGenerator<dtype::Int8> gen;
-    auto graph = ComputingGraph::make();
-    graph->options().graph_opt_level = 0;
-    auto mkvar = [&](const char* name, const TensorShape& shp,
-                     const DType& dtype) {
-        return opr::TypeCvt::make(
-                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
-                dtype);
-    };
-    auto mkcvar = [&](const char* name, const TensorShape& shp,
-                      const DType& dtype) {
-        return opr::TypeCvt::make(
-                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
-                        .rename(name),
-                dtype);
-    };
-    auto x = mkvar("x", {16, 4, 14, 14}, dtype::QuantizedS8(2.5f)),
-         w = mkcvar("w", {20, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
-         b = mkcvar("b", {1, 20, 1, 1}, dtype::QuantizedS32(6.25f));
-    opr::ConvBias::Param param;
-    param.format = opr::ConvBias::Param::Format::NCHW;
-    param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY;
-    param.stride_h = param.stride_w = 1;
-    param.pad_h = param.pad_w = 1;
-    auto y = opr::ConvBias::make(x, w, b, param, {},
-                                 OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
-    opr::Pooling::Param pool;
-    pool.format = opr::Pooling::Param::Format::NCHW;
-    y = opr::Pooling::make(y, pool);
-    auto w1 = mkcvar("w1", {24, 20, 3, 3}, dtype::QuantizedS8(2.5f)),
-         b1 = mkcvar("b1", {1, 24, 1, 1}, dtype::QuantizedS32(6.25f));
-    auto y1 = opr::ConvBias::make(y, w1, b1, param, {},
-                                  OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
-    auto w2 = mkcvar("w2", {20, 24, 3, 3}, dtype::QuantizedS8(2.5f)),
-         b2 = mkcvar("b2", {1, 20, 1, 1}, dtype::QuantizedS32(6.25f));
-    auto y2 = opr::ConvBias::make(y1, w2, b2, param, {},
-                                  OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
-    y2 = opr::TypeCvt::make(y2, dtype::QuantizedS4{40.f});
-    auto w3 = mkcvar("w3", {64, 20, 3, 3}, dtype::QuantizedS4(2.5f)),
-         b3 = mkcvar("b3", {1, 64, 1, 1}, dtype::QuantizedS32(100.f));
-    auto y3 = opr::ConvBias::make(y2, w3, b3, param, {},
-                                  OperatorNodeConfig{dtype::QuantizedS4{40.f}});
-    auto w4 = mkcvar("w4", {20, 64, 3, 3}, dtype::QuantizedS4(2.5f)),
-         b4 = mkcvar("b4", {1, 20, 1, 1}, dtype::QuantizedS32(100.f));
-    auto y4 = opr::ConvBias::make(y3, w4, b4, param, {},
-                                  OperatorNodeConfig{dtype::QuantizedS4{40.f}});
-    y4 = opr::TypeCvt::make(y4, dtype::QuantizedS8{2.5f});
-    using ElemMultiMode = opr::ElemwiseMultiType::Param::Mode;
-    auto y5 = opr::ElemwiseMultiType::make(
-            {y, y4}, {ElemMultiMode::QFUSE_ADD_RELU},
-            OperatorNodeConfig{dtype::QuantizedS8{1.2f}});
-    opr::ConvolutionBackwardData::Param deconv;
-    deconv.format = opr::ConvolutionBackwardData::Param::Format::NCHW;
-    deconv.stride_h = deconv.stride_w = 2;
-    deconv.pad_h = deconv.pad_w = 1;
-    auto w6 = mkcvar("w6", {20, 64, 4, 4}, dtype::QuantizedS8{2.5f});
-    auto y6 = opr::ConvolutionBackwardData::make(
-            w6, y5, deconv, {},
-            OperatorNodeConfig{dtype::QuantizedS8(2.0f)});
-    y6 = opr::TypeCvt::make(y6, dtype::QuantizedS4{32.f});
-    std::shared_ptr<HostTensorND> mat = std::make_shared<HostTensorND>(
-            cn, TensorShape{16, 3, 3}, dtype::Float32());
-    warp_perspective_mat_gen(*mat, 16, 14, 14);
-    auto mat_var = opr::Host2DeviceCopy::make(*graph, mat).rename("mat");
-    opr::WarpPerspective::Param warp_param;
-    warp_param.format = opr::WarpPerspective::Param::Format::NCHW;
-    auto y7 = opr::WarpPerspective::make(y6, mat_var, TensorShape{14, 14},
-                                         warp_param);
-    y7 = opr::TypeCvt::make(y7, dtype::Float32());
-    SymbolVar y7_pad;
-    auto opt = gopt::OptimizeForInferenceOptions{};
-    opt.enable_nchw64();
-    unpack_vector(gopt::optimize_for_inference({y7}, opt), y7_pad);
-    EXPECT_TRUE(y7.node()->shape().eq_shape(y7_pad.node()->shape()));
-    HostTensorND t1, t2;
-    auto func1 = graph->compile({make_callback_copy(y7, t1)});
-    func1->execute();
-    auto func2 = graph->compile({make_callback_copy(y7_pad, t2)});
-    func2->execute();
-    MGB_ASSERT_TENSOR_EQ(t1, t2);
-    using Format = opr::ConvBiasForward::Param::Format;
-    SmallVector<cg::OperatorNodeBase*> oprs;
-    auto cb = [&oprs](cg::OperatorNodeBase* opr) {
-        if (opr->same_type<opr::ConvBias>()) {
-            oprs.push_back(opr);
-        }
-    };
-    cg::DepOprIter{cb}.add(y7_pad.node()->owner_opr());
-    ASSERT_EQ(oprs.size(), 5);
-#define CHECK(_i, _fmt)                                        \
-    {                                                          \
-        const auto& o = oprs[_i]->cast_final<opr::ConvBias>(); \
-        ASSERT_EQ(o.param().format, Format::_fmt);             \
-    }
-    CHECK(0, NCHW4);
-    CHECK(1, NCHW32);
-    CHECK(2, NCHW32);
-    CHECK(3, NCHW64);
-    CHECK(4, NCHW64);
-#undef CHECK
-    {
-        const auto& deconv = find_opr<opr::ConvolutionBackwardData>(y7_pad);
-        ASSERT_EQ(deconv.param().format, Format::NCHW4);
-        const auto& pool = find_opr<opr::PoolingForward>(y7_pad);
-        ASSERT_EQ(pool.param().format, Format::NCHW4);
-        const auto& warp = find_opr<opr::WarpPerspectiveForward>(y7_pad);
-        ASSERT_EQ(warp.param().format, Format::NCHW64);
-    }
-    size_t nr_dimshuffle = find_opr_num<opr::Dimshuffle>(y7_pad);
-    ASSERT_EQ(nr_dimshuffle, 8);
-}
-TEST(TestGoptInference, EnableNCHW64FuseConvBiasZ) {
-    REQUIRE_GPU(1);
-    auto cn = CompNode::load("gpu0");
-    cn.activate();
-    REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
-    HostTensorND t1, t2;
-    HostTensorGenerator<dtype::Int8> gen;
-    auto graph = ComputingGraph::make();
-    graph->options().graph_opt_level = 0;
-    auto mkvar = [&](const char* name, const TensorShape& shp,
-                     const DType& dtype) {
-        return opr::TypeCvt::make(
-                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
-                dtype);
-    };
-    auto mkcvar = [&](const char* name, const TensorShape& shp,
-                      const DType& dtype) {
-        return opr::TypeCvt::make(
-                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
-                        .rename(name),
-                dtype);
-    };
-    auto x = mkvar("x", {16, 4, 14, 14}, dtype::QuantizedS8(2.5f)),
-         w = mkcvar("w", {32, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
-         b = mkcvar("b", {1, 32, 1, 1}, dtype::QuantizedS32(6.25f));
-    opr::ConvBias::Param param;
-    param.format = opr::ConvBias::Param::Format::NCHW;
-    param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY;
-    param.stride_h = param.stride_w = 1;
-    param.pad_h = param.pad_w = 1;
-    auto y = opr::ConvBias::make(x, w, b, param, {},
-                                 OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
-    auto w1 = mkcvar("w1", {64, 32, 3, 3}, dtype::QuantizedS8(2.5f)),
-         b1 = mkcvar("b1", {1, 64, 1, 1}, dtype::QuantizedS32(6.25f));
-    auto y1 = opr::ConvBias::make(y, w1, b1, param, {},
-                                  OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
-    y1 = opr::TypeCvt::make(y1, dtype::QuantizedS4{40.f});
-    auto w2 = mkcvar("w2", {64, 64, 3, 3}, dtype::QuantizedS4(2.5f)),
-         b2 = mkcvar("b2", {1, 64, 1, 1}, dtype::QuantizedS32(100.f));
-    auto y2 = opr::ConvBias::make(y1, w2, b2, param, {},
-                                  OperatorNodeConfig{dtype::QuantizedS4{40.f}});
-    auto w3 = mkcvar("w3", {64, 64, 3, 3}, dtype::QuantizedS4(2.5f)),
-         b3 = mkcvar("b3", {1, 64, 1, 1}, dtype::QuantizedS32(100.f));
-    auto y3 = opr::ConvBias::make(y2, w3, b3, param, {},
-                                  OperatorNodeConfig{dtype::QuantizedS4(40.f)});
-    using ElemMultiMode = opr::ElemwiseMultiType::Param::Mode;
-    auto y4 = opr::ElemwiseMultiType::make(
-            {y1, y3}, {ElemMultiMode::QFUSE_ADD_RELU},
-            OperatorNodeConfig{dtype::QuantizedS4{40.f}});
-    y4 = opr::TypeCvt::make(y4, dtype::Float32());
-    auto y5 = opr::ConvBias::make(y2, w3, b3, y1, param, {},
-                                  OperatorNodeConfig{dtype::QuantizedS4(40.f)});
-    y5 = opr::TypeCvt::make(y5, dtype::Float32());
-    SymbolVar y4_pad;
-    auto opt = gopt::OptimizeForInferenceOptions{};
-    opt.enable_nchw64();
-    unpack_vector(gopt::optimize_for_inference({y4}, opt), y4_pad);
-    EXPECT_TRUE(y4.node()->shape().eq_shape(y4_pad.node()->shape()));
-    size_t nr_elem_mult_type = find_opr_num<opr::ElemwiseMultiType>(y4_pad);
-    ASSERT_EQ(nr_elem_mult_type, 0);
-    auto func = graph->compile({make_callback_copy(y4_pad, t1)});
-    func->execute();
-    {
-        opr::ConvBias::Param param;
-        param.format = opr::ConvBias::Param::Format::NCHW;
-        param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY;
-        param.stride_h = param.stride_w = 1;
-        param.pad_h = param.pad_w = 1;
-        auto y = opr::ConvBias::make(
-                x, w, b, param, {},
-                OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
-        auto y1 = opr::ConvBias::make(
-                y, w1, b1, param, {},
-                OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
-        y1 = opr::TypeCvt::make(y1, dtype::QuantizedS4{40.f});
-        auto y2 = opr::ConvBias::make(
-                y1, w2, b2, param, {},
-                OperatorNodeConfig{dtype::QuantizedS4{40.f}});
-        param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
-        auto y3 = opr::ConvBias::make(
-                y2, w3, b3, y1, param, {},
-                OperatorNodeConfig{dtype::QuantizedS4(40.f)});
-        y3 = opr::TypeCvt::make(y3, dtype::Float32());
-        auto func = graph->compile({make_callback_copy(y3, t2)});
-        func->execute();
-    }
-    MGB_ASSERT_TENSOR_EQ(t1, t2);
-}
 #endif
@@ -2604,174 +2604,6 @@ TEST_F(TestNoWeightPreprocess, NoPreprocess) {
 #endif
 namespace {
-TEST(TestOprDNN, ConvBiasInt4NCHW) {
-    REQUIRE_GPU(1);
-    auto cn = CompNode::load("gpu0");
-    cn.activate();
-    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
-    auto sm_ver = prop.major * 10 + prop.minor;
-    if (sm_ver != 75) {
-        printf("This testcast ignored due to insufficient cuda cap(got: %d, "
-               "expected: %d)\n",
-               sm_ver, 75);
-        return;
-    }
-    auto run = [&cn](size_t N, size_t C, size_t H, size_t W, size_t F, size_t S,
-                     size_t P) {
-        auto graph = ComputingGraph::make();
-        HostTensorGenerator<dtype::Int8> gen;
-        auto mkvar = [&gen](const char* name, const TensorShape& shp,
-                            const DType& dtype,
-                            std::shared_ptr<ComputingGraph> graph,
-                            const CompNode& cn) {
-            return opr::TypeCvt::make(
-                    opr::Host2DeviceCopy::make(*graph, gen(shp, cn))
-                            .rename(name),
-                    dtype);
-        };
-        auto mkcvar = [&gen](const char* name, const TensorShape& shp,
-                             const DType& dtype,
-                             std::shared_ptr<ComputingGraph> graph,
-                             const CompNode& cn) {
-            return opr::TypeCvt::make(
-                    opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
-                            .rename(name),
-                    dtype);
-        };
-        using Policy = opr::ConvBias::ExecutionPolicy;
-        using Strategy = Policy::Strategy;
-        auto x = mkvar("x", {N, C * 4, H, W}, dtype::QuantizedS4(1.19960327f),
-                       graph, cn),
-             w = mkcvar("w1", {C, C * 4, F, F}, dtype::QuantizedS4(1.19970327f),
-                        graph, cn),
-             b = mkcvar("b1", {1, C, 1, 1},
-                        dtype::QuantizedS32(1.19960327f * 1.19970327f), graph,
-                        cn);
-        opr::ConvBias::Param param;
-        param.format = opr::ConvBias::Param::Format::NCHW;
-        param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
-        param.stride_h = param.stride_w = S;
-        param.pad_h = param.pad_w = P;
-        Policy policy;
-        policy.strategy = Strategy::PROFILE;
-        auto y = opr::ConvBias::make(
-                x, w, b, param, policy,
-                OperatorNodeConfig{dtype::QuantizedS4(11.9960501f)});
-        y = opr::TypeCvt::make(y, dtype::Float32());
-        auto x_f32 = opr::TypeCvt::make(x, dtype::Float32()),
-             w_f32 = opr::TypeCvt::make(w, dtype::Float32()),
-             b_f32 = opr::TypeCvt::make(b, dtype::Float32());
-        auto y_f32 = opr::ConvBias::make(x_f32, w_f32, b_f32, param, policy);
-        auto y_q4 = opr::TypeCvt::make(y_f32, dtype::QuantizedS4{11.9960501f});
-        y_q4 = opr::TypeCvt::make(y_q4, dtype::Float32());
-        HostTensorND host_y, host_y_q4;
-        auto func = graph->compile({make_callback_copy(y, host_y),
-                                    make_callback_copy(y_q4, host_y_q4)});
-        func->execute();
-        MGB_ASSERT_TENSOR_NEAR(host_y, host_y_q4, 1e-3);
-    };
-    run(2, 64, 14, 14, 3, 2, 1);
-    run(2, 64, 7, 7, 3, 1, 1);
-    run(2, 64, 14, 14, 1, 2, 0);
-    run(2, 64, 7, 7, 1, 1, 0);
-}
-TEST(TestOprDNN, ConvBiasInt4NCHW64) {
-    REQUIRE_GPU(1);
-    auto cn = CompNode::load("gpu0");
-    cn.activate();
-    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
-    auto sm_ver = prop.major * 10 + prop.minor;
-    if (sm_ver != 75) {
-        printf("This testcast ignored due to insufficient cuda cap(got: %d, "
-               "expected: %d)\n",
-               sm_ver, 75);
-        return;
-    }
-    auto nchw2nchw64 = [](SymbolVar x) {
-        auto y = opr::RelayoutFormat::make(
-                x, opr::RelayoutFormat::Param::Mode::NCHW_NCHW64);
-        return y;
-    };
-    auto nchw642nchw = [](SymbolVar x) {
-        auto y = opr::RelayoutFormat::make(
-                x, opr::RelayoutFormat::Param::Mode::NCHW64_NCHW);
-        return y;
-    };
-    auto run = [&](size_t N, size_t C, size_t H, size_t W, size_t F, size_t S,
-                   size_t P) {
-        auto graph = ComputingGraph::make();
-        HostTensorGenerator<dtype::Int8> gen;
-        auto mkvar = [&gen](const char* name, const TensorShape& shp,
-                            const DType& dtype,
-                            std::shared_ptr<ComputingGraph> graph,
-                            const CompNode& cn) {
-            return opr::TypeCvt::make(
-                    opr::Host2DeviceCopy::make(*graph, gen(shp, cn))
-                            .rename(name),
-                    dtype);
-        };
-        auto mkcvar = [&gen](const char* name, const TensorShape& shp,
-                             const DType& dtype,
-                             std::shared_ptr<ComputingGraph> graph,
-                             const CompNode& cn) {
-            return opr::TypeCvt::make(
-                    opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
-                            .rename(name),
-                    dtype);
-        };
-        using Policy = opr::ConvBias::ExecutionPolicy;
-        using Strategy = Policy::Strategy;
-        auto x = mkvar("x", {N, C / 16, H, W, 64},
-                       dtype::QuantizedS4(1.19960327f), graph, cn),
-             w = mkcvar("w1", {C, C / 16, F, F, 64},
-                        dtype::QuantizedS4(1.19970327f), graph, cn),
-             b = mkcvar("b1", {1, C / 64, 1, 1, 64},
-                        dtype::QuantizedS32(1.19960327f * 1.19970327f), graph,
-                        cn);
-        opr::ConvBias::Param param;
-        param.format = opr::ConvBias::Param::Format::NCHW64;
-        param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
-        param.stride_h = param.stride_w = S;
-        param.pad_h = param.pad_w = P;
-        Policy policy;
-        policy.strategy = Strategy::PROFILE;
-        auto y = opr::ConvBias::make(
-                x, w, b, param, policy,
-                OperatorNodeConfig{dtype::QuantizedS4(11.9960501f)});
-        y = opr::TypeCvt::make(y, dtype::Float32());
-        x = nchw642nchw(x);
-        w = nchw642nchw(w);
-        b = nchw642nchw(b);
-        auto x_f32 = opr::TypeCvt::make(x, dtype::Float32()),
-             w_f32 = opr::TypeCvt::make(w, dtype::Float32()),
-             b_f32 = opr::TypeCvt::make(b, dtype::Float32());
-        param.format = opr::ConvBias::Param::Format::NCHW;
-        auto y_f32 = opr::ConvBias::make(x_f32, w_f32, b_f32, param, policy);
-        auto y_q4 = opr::TypeCvt::make(y_f32, dtype::QuantizedS4{11.9960501f});
-        y_q4 = opr::TypeCvt::make(y_q4, dtype::Float32());
-        y_q4 = nchw2nchw64(y_q4);
-        HostTensorND host_y, host_y_q4;
-        auto func = graph->compile({make_callback_copy(y, host_y),
-                                    make_callback_copy(y_q4, host_y_q4)});
-        func->execute();
-        MGB_ASSERT_TENSOR_NEAR(host_y, host_y_q4, 1e-3);
-    };
-    run(2, 64, 14, 14, 3, 2, 1);
-    run(2, 64, 7, 7, 3, 1, 1);
-    run(2, 64, 14, 14, 1, 2, 0);
-    run(2, 64, 7, 7, 1, 1, 0);
-}
 TEST(TestOprDNN, ConvBiasInt4Serialize) {
     using namespace serialization;
@@ -2783,7 +2615,7 @@ TEST(TestOprDNN, ConvBiasInt4Serialize) {
     HostTensorGenerator<dtype::Int8> gen;
     std::shared_ptr<HostTensorND> xv;
-    auto mkvar = [&gen](const char* name, const DType& dtype,
+    auto mkvar = [](const char* name, const DType& dtype,
                     std::shared_ptr<ComputingGraph> graph,
                     std::shared_ptr<HostTensorND> val) {
         return opr::TypeCvt::make(
@@ -2856,9 +2688,9 @@ TEST(TestOprDNN, ConvBiasInt4SerializeWithParamFuse) {
     HostTensorGenerator<dtype::Int8> gen;
     std::shared_ptr<HostTensorND> xv;
-    auto mkvar = [&gen](const char* name, const DType& dtype,
-                        std::shared_ptr<ComputingGraph> graph,
-                        std::shared_ptr<HostTensorND> val) {
+    auto mkvar = [](const char* name, const DType& dtype,
+                    std::shared_ptr<ComputingGraph> graph,
+                    std::shared_ptr<HostTensorND> val) {
         return opr::TypeCvt::make(
                 opr::Host2DeviceCopy::make(*graph, val).rename(name), dtype);
     };
@@ -62,7 +62,12 @@ bool contains_any_in_set(const SmallVector<T>& list,
 void check_tensor_value_valid(const std::string& name,
                               const HostTensorND& tensor) {
-    mgb_assert(tensor.layout().is_physical_contiguous(),
+    bool cond_normal = tensor.layout().format.is_default() &&
+                       tensor.layout().is_physical_contiguous();
+    bool cond_lowbit = tensor.layout().dtype.is_quantized_lowbit() &&
+                       tensor.layout().format.is_lowbit_aligned() &&
+                       tensor.layout().is_contiguous();
+    mgb_assert(cond_normal || cond_lowbit,
               "non-contiguous tensor: name=%s layout=%s", name.c_str(),
               tensor.layout().to_string().c_str());
     if (tensor.dtype() == dtype::Float32()) {
@@ -585,11 +590,12 @@ TensorLayout load_tensor_layout(const fbs::Tensor* tensor) {
         layout.ndim = tensor->shape()->size();
         std::copy(tensor->shape()->begin(), tensor->shape()->end(),
                   layout.shape);
-        layout.init_contiguous_stride();
     }
     if (tensor->dtype()) {
-        layout.dtype = fbs::intl::load_dtype(tensor->dtype());
+        // modify data type inplace for TensorLayout
+        layout.modify_dtype_inplace(fbs::intl::load_dtype(tensor->dtype()));
     }
+    layout.init_contiguous_stride();
     return layout;
 }