GitOrigin-RevId: 771968f9ac
tags/v1.3.0
@@ -11,3 +11,4 @@ imperative/python/test/integration/data/*.mge filter=lfs diff=lfs merge=lfs -tex | |||||
ci/resource/models/float/mobilenet_v2.pkl filter=lfs diff=lfs merge=lfs -text | ci/resource/models/float/mobilenet_v2.pkl filter=lfs diff=lfs merge=lfs -text | ||||
ci/resource/models/float/shufflenet_v2.pkl filter=lfs diff=lfs merge=lfs -text | ci/resource/models/float/shufflenet_v2.pkl filter=lfs diff=lfs merge=lfs -text | ||||
ci/resource/dump/roi_align_backward_8.8.0.mdl filter=lfs diff=lfs merge=lfs -text | ci/resource/dump/roi_align_backward_8.8.0.mdl filter=lfs diff=lfs merge=lfs -text | ||||
ci/resource/dump/relayout_format_8.10.0.mdl filter=lfs diff=lfs merge=lfs -text |
@@ -1233,6 +1233,7 @@ protected: | |||||
void check_exec(const TensorLayout& src, const TensorLayout& dst, | void check_exec(const TensorLayout& src, const TensorLayout& dst, | ||||
size_t workspace_in_bytes); | size_t workspace_in_bytes); | ||||
void deduce_exec_layout(const TensorLayout& src, const TensorLayout& dst, | void deduce_exec_layout(const TensorLayout& src, const TensorLayout& dst, | ||||
TensorLayout& exec_workspace, | |||||
TensorLayout& exec_src, TensorLayout& exec_dst); | TensorLayout& exec_src, TensorLayout& exec_dst); | ||||
}; | }; | ||||
} // namespace megdnn | } // namespace megdnn | ||||
@@ -935,8 +935,9 @@ Relayout mode. | |||||
Note: the axis column means the corresponding ``align_axis`` for image format | Note: the axis column means the corresponding ``align_axis`` for image format | ||||
when the ``I`` suffix is present. | when the ``I`` suffix is present. | ||||
Note: NCHW_NCHW4_WEIGHT automatically pads oc and ic; the padded oc should be removed in a later opr by setting the ``group`` and ``oc`` params of NCHW4_NCHW
""" | """ | ||||
(pdef('RelayoutFormat', 'Change the tensor layout format'). | |||||
(pdef('RelayoutFormat', 'Change the tensor layout format', version=0, is_legacy=True). | |||||
add_enum( | add_enum( | ||||
Doc('Mode', RELAYOUT_FORMAT_MODE_DOC), | Doc('Mode', RELAYOUT_FORMAT_MODE_DOC), | ||||
'NHWC_NHWCD4', | 'NHWC_NHWCD4', | ||||
@@ -964,9 +965,16 @@ when the ``I`` suffix is present. | |||||
'NCHW_NCHW4_IC_SMALL', | 'NCHW_NCHW4_IC_SMALL', | ||||
'NCHW_NCHW4_IC_SMALL_CONV_DENSE_WEIGHT', | 'NCHW_NCHW4_IC_SMALL_CONV_DENSE_WEIGHT', | ||||
'NCHW_NCHW4', | 'NCHW_NCHW4', | ||||
'NCHW4_NCHW', | |||||
'NCHW_NCHW4_WEIGHT', | |||||
) | ) | ||||
) | ) | ||||
(pdef('RelayoutFormat', 'Change the tensor layout format', version=1). | |||||
add_enum_alias('Mode', 'RelayoutFormatV0'). | |||||
add_fields('uint32', 'oc', '0'). | |||||
add_fields('uint32', 'group', '1') | |||||
) | |||||
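The two new fields feed the grouped shape deduction added further below: ``group`` controls per-group channel padding in NCHW_NCHW4, and ``oc`` lets NCHW4_NCHW drop the padded output channels again. A minimal standalone sketch of those shape rules (plain C++; the helper names are illustrative and not part of MegDNN):

#include <cstdio>
#include <vector>

// NCHW -> NCHW4: channels are padded per group to a multiple of 4.
std::vector<size_t> nchw_to_nchw4_shape(const std::vector<size_t>& src, size_t group) {
    size_t icpg = src[1] / group;            // channels per group
    size_t icpg_blocks = (icpg + 3) / 4;     // div_ceil(icpg, 4)
    return {src[0], group * icpg_blocks, src[2], src[3], 4};
}

// NCHW4 -> NCHW: oc == 0 keeps every channel, otherwise truncates the padding.
std::vector<size_t> nchw4_to_nchw_shape(const std::vector<size_t>& src, size_t oc) {
    return {src[0], oc == 0 ? src[1] * 4 : oc, src[2], src[3]};
}

int main() {
    for (size_t v : nchw_to_nchw4_shape({2, 6, 8, 8}, 2)) printf("%zu ", v);
    printf("\n");   // 2 2 8 8 4  (two groups of 3 channels, each padded to one block)
    for (size_t v : nchw4_to_nchw_shape({2, 2, 8, 8}, 6)) printf("%zu ", v);
    printf("\n");   // 2 6 8 8
}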
(pdef('SeparableFilter', version=0, is_legacy=True). | (pdef('SeparableFilter', version=0, is_legacy=True). | ||||
add_enum_alias('Format', 'ConvolutionV0'). | add_enum_alias('Format', 'ConvolutionV0'). | ||||
@@ -208,14 +208,48 @@ void RelayoutFormat::deduce_layout_fwd(const TensorLayout& src, | |||||
dst[3] = src[2]; | dst[3] = src[2]; | ||||
dst[4] = src[4]; | dst[4] = src[4]; | ||||
break; | break; | ||||
case Param::Mode::NCHW_NCHW4: | |||||
case Param::Mode::NCHW_NCHW4: { | |||||
megdnn_assert(src.ndim == 4); | megdnn_assert(src.ndim == 4); | ||||
const size_t group = param().group; | |||||
megdnn_assert(src[1] % group == 0); | |||||
const size_t icpg = src[1] / group; | |||||
dst.ndim = 5; | dst.ndim = 5; | ||||
dst[0] = src[0]; | dst[0] = src[0]; | ||||
dst[1] = div_ceil<size_t>(src[1], 4); | |||||
dst[1] = group * div_ceil<size_t>(icpg, 4); | |||||
dst[2] = src[2]; | dst[2] = src[2]; | ||||
dst[3] = src[3]; | dst[3] = src[3]; | ||||
dst[4] = 4; | dst[4] = 4; | ||||
}; break; | |||||
case Param::Mode::NCHW_NCHW4_WEIGHT:
{ | |||||
if (src.ndim == 4) { | |||||
//! dense case | |||||
dst.ndim = 5; | |||||
dst[0] = div_ceil<size_t>(src[0], 4) * 4; | |||||
dst[1] = div_ceil<size_t>(src[1], 4); | |||||
dst[2] = src[2]; | |||||
dst[3] = src[3]; | |||||
dst[4] = 4; | |||||
} else if (src.ndim == 5) { | |||||
//! group case | |||||
dst.ndim = 6; | |||||
dst[0] = src[0]; | |||||
dst[1] = div_ceil<size_t>(src[1], 4) * 4; | |||||
dst[2] = div_ceil<size_t>(src[2], 4); | |||||
dst[3] = src[3]; | |||||
dst[4] = src[4]; | |||||
dst[5] = 4; | |||||
} | |||||
}; | |||||
break; | |||||
case Param::Mode::NCHW4_NCHW: | |||||
megdnn_assert(src.ndim == 5); | |||||
dst.ndim = 4; | |||||
dst[0] = src[0]; | |||||
dst[1] = param().oc == 0 ? src[1] * 4 : param().oc; | |||||
dst[2] = src[2]; | |||||
dst[3] = src[3]; | |||||
megdnn_assert(dst[1] % param().group == 0); | |||||
break; | break; | ||||
default: | default: | ||||
megdnn_assert(0, "Invalid RelayoutFormat Mode"); | megdnn_assert(0, "Invalid RelayoutFormat Mode"); | ||||
@@ -258,16 +292,13 @@ void RelayoutFormat::deduce_format(TensorFormat src, TensorFormat& dst) { | |||||
CHECK_SRC(DefaultTensorFormat::make()); | CHECK_SRC(DefaultTensorFormat::make()); | ||||
dst = src; | dst = src; | ||||
break; | break; | ||||
case Param::Mode::NCHW_NCHW4: | |||||
CHECK_SRC(DefaultTensorFormat::make()); | |||||
dst = src; | |||||
break; | |||||
case Param::Mode::NCHW_NHWCD4I: | case Param::Mode::NCHW_NHWCD4I: | ||||
CHECK_SRC(DefaultTensorFormat::make()); | CHECK_SRC(DefaultTensorFormat::make()); | ||||
dst = Image2DPack4TensorFormat::make_raw(2, align, vendor_type); | dst = Image2DPack4TensorFormat::make_raw(2, align, vendor_type); | ||||
break; | break; | ||||
case Param::Mode::NHWCD4I_NCHW: | case Param::Mode::NHWCD4I_NCHW: | ||||
CHECK_SRC(Image2DPack4TensorFormat::make_raw(2, align, vendor_type)); | |||||
CHECK_SRC( | |||||
Image2DPack4TensorFormat::make_raw(2, align, vendor_type)); | |||||
dst = DefaultTensorFormat::make(); | dst = DefaultTensorFormat::make(); | ||||
break; | break; | ||||
case Param::Mode::NHWCD4_NCHW: | case Param::Mode::NHWCD4_NCHW: | ||||
@@ -308,6 +339,9 @@ void RelayoutFormat::deduce_format(TensorFormat src, TensorFormat& dst) { | |||||
CHECK_SRC(DefaultTensorFormat::make()); | CHECK_SRC(DefaultTensorFormat::make()); | ||||
dst = src; | dst = src; | ||||
break; | break; | ||||
case Param::Mode::NCHW4_NCHW: | |||||
case Param::Mode::NCHW_NCHW4: | |||||
case Param::Mode::NCHW_NCHW4_WEIGHT: | |||||
case Param::Mode::NCHW_NCHW88: | case Param::Mode::NCHW_NCHW88: | ||||
case Param::Mode::NCHW88_NCHW: | case Param::Mode::NCHW88_NCHW: | ||||
case Param::Mode::NCHW_NCHW88_CONV_DENSE_WEIGHT: | case Param::Mode::NCHW_NCHW88_CONV_DENSE_WEIGHT: | ||||
@@ -354,6 +388,7 @@ void RelayoutFormat::check_exec(const TensorLayout& src, | |||||
void RelayoutFormat::deduce_exec_layout(const TensorLayout& src, | void RelayoutFormat::deduce_exec_layout(const TensorLayout& src, | ||||
const TensorLayout& dst, | const TensorLayout& dst, | ||||
TensorLayout& exec_workspace, | |||||
TensorLayout& exec_src, | TensorLayout& exec_src, | ||||
TensorLayout& exec_dst) { | TensorLayout& exec_dst) { | ||||
check_layout_fwd(src, dst); | check_layout_fwd(src, dst); | ||||
@@ -362,10 +397,10 @@ void RelayoutFormat::deduce_exec_layout(const TensorLayout& src, | |||||
case Param::Mode::NCHW_NCHW88: | case Param::Mode::NCHW_NCHW88: | ||||
// nchw to nchw8c | // nchw to nchw8c | ||||
{ | { | ||||
TensorLayout work_space_layout( | |||||
exec_workspace = TensorLayout( | |||||
{src[0], round_up(src[1], 8_z), src[2], src[3]}, | {src[0], round_up(src[1], 8_z), src[2], src[3]}, | ||||
src.dtype, src.format); | src.dtype, src.format); | ||||
exec_src = work_space_layout | |||||
exec_src = exec_workspace | |||||
.reshape({src[0], div_ceil(src[1], 8_z), 8, | .reshape({src[0], div_ceil(src[1], 8_z), 8, | ||||
src[2], src[3]}) | src[2], src[3]}) | ||||
.dimshuffle({0, 1, 3, 4, 2}); | .dimshuffle({0, 1, 3, 4, 2}); | ||||
@@ -375,13 +410,56 @@ void RelayoutFormat::deduce_exec_layout(const TensorLayout& src, | |||||
case Param::Mode::NCHW_NCHW4: | case Param::Mode::NCHW_NCHW4: | ||||
// nchw to nchw4 | // nchw to nchw4 | ||||
{ | { | ||||
TensorLayout work_space_layout( | |||||
{src[0], round_up(src[1], 4_z), src[2], src[3]}, | |||||
const size_t group = param().group; | |||||
const size_t icpg = src[1] / group; | |||||
exec_workspace = TensorLayout( | |||||
{src[0], group * round_up(icpg, 4_z), src[2], src[3]}, | |||||
src.dtype, src.format); | src.dtype, src.format); | ||||
exec_src = work_space_layout | |||||
.reshape({src[0], div_ceil(src[1], 4_z), 4, | |||||
src[2], src[3]}) | |||||
.dimshuffle({0, 1, 3, 4, 2}); | |||||
exec_src = | |||||
exec_workspace | |||||
.reshape({src[0], group * div_ceil(icpg, 4_z), | |||||
4, src[2], src[3]}) | |||||
.dimshuffle({0, 1, 3, 4, 2}); | |||||
exec_dst = dst; | |||||
} | |||||
break; | |||||
case Param::Mode::NCHW_NCHW4_WEIGHT: | |||||
// oihw to oihw4i (dense) or goihw to goihw4i (group), padding oc and ic
{ | |||||
if (src.ndim == 4) { | |||||
exec_workspace = TensorLayout( | |||||
{round_up(src[0], 4_z), round_up(src[1], 4_z), | |||||
src[2], src[3]}, | |||||
src.dtype, src.format); | |||||
exec_src = exec_workspace | |||||
.reshape({round_up(src[0], 4_z), | |||||
div_ceil(src[1], 4_z), 4, | |||||
src[2], src[3]}) | |||||
.dimshuffle({0, 1, 3, 4, 2}); | |||||
exec_dst = dst; | |||||
} else if (src.ndim == 5) { | |||||
exec_workspace = TensorLayout( | |||||
{src[0], round_up(src[1], 4_z), | |||||
round_up(src[2], 4_z), src[3], src[4]}, | |||||
src.dtype, src.format); | |||||
exec_src = exec_workspace | |||||
.reshape({src[0], round_up(src[1], 4_z), | |||||
div_ceil(src[2], 4_z), 4, | |||||
src[3], src[4]}) | |||||
.dimshuffle({0, 1, 2, 4, 5, 3}); | |||||
exec_dst = dst; | |||||
} | |||||
} | |||||
break; | |||||
case Param::Mode::NCHW4_NCHW: | |||||
// nchw4 to nchw
{ | |||||
exec_workspace = | |||||
TensorLayout({src[0], src[1] * 4, src[2], src[3]}, | |||||
src.dtype, src.format) | |||||
.reshape({src[0], src[1], 4, src[2], src[3]}) | |||||
.dimshuffle({0, 1, 3, 4, 2}); | |||||
exec_src = src; | |||||
exec_dst = dst; | exec_dst = dst; | ||||
} | } | ||||
break; | break; | ||||
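The exec_src layouts built above with reshape + dimshuffle are non-contiguous views of the padded workspace, so a generic RelayoutForward from that view into the contiguous dst performs the actual packing. A standalone sketch of the strides such a view ends up with for the NCHW_NCHW4 case (small example sizes, not MegDNN code):

#include <cstdio>

int main() {
    // Contiguous padded workspace: {N, C4 * 4, H, W}
    const long N = 2, C4 = 2, H = 8, W = 8;
    // reshape({N, C4, 4, H, W}): contiguous strides of the 5-d view
    const long s[5] = {C4 * 4 * H * W, 4 * H * W, H * W, W, 1};
    // dimshuffle({0, 1, 3, 4, 2}): dims become {N, C4, H, W, 4}
    const long dims[5]    = {N, C4, H, W, 4};
    const long strides[5] = {s[0], s[1], s[3], s[4], s[2]};
    for (int i = 0; i < 5; ++i)
        printf("dim %d: extent %ld stride %ld\n", i, dims[i], strides[i]);
    // Relayouting this strided view into a contiguous {N, C4, H, W, 4} buffer
    // gathers every 4 consecutive input channels into the last axis (NCHW -> NCHW4).
}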
@@ -396,11 +474,11 @@ void RelayoutFormat::deduce_exec_layout(const TensorLayout& src, | |||||
{ | { | ||||
megdnn_assert(src.ndim == 4); | megdnn_assert(src.ndim == 4); | ||||
megdnn_assert(src[0] % 8 == 0); | megdnn_assert(src[0] % 8 == 0); | ||||
TensorLayout work_space_layout( | |||||
exec_workspace = TensorLayout( | |||||
{src[0], round_up(src[1], 8_z), src[2], src[3]}, | {src[0], round_up(src[1], 8_z), src[2], src[3]}, | ||||
src.dtype, src.format); | src.dtype, src.format); | ||||
exec_src = | exec_src = | ||||
work_space_layout | |||||
exec_workspace | |||||
.reshape({src[0] / 8, 8, div_ceil(src[1], 8_z), | .reshape({src[0] / 8, 8, div_ceil(src[1], 8_z), | ||||
8, src[2], src[3]}) | 8, src[2], src[3]}) | ||||
.dimshuffle({0, 2, 4, 5, 3, 1}); | .dimshuffle({0, 2, 4, 5, 3, 1}); | ||||
@@ -411,10 +489,10 @@ void RelayoutFormat::deduce_exec_layout(const TensorLayout& src, | |||||
// goihw to goihw8g | // goihw to goihw8g | ||||
{ | { | ||||
megdnn_assert(src.ndim == 5); | megdnn_assert(src.ndim == 5); | ||||
TensorLayout work_space_layout( | |||||
exec_workspace = TensorLayout( | |||||
{round_up(src[0], 8_z), src[1], src[2], src[3], src[4]}, | {round_up(src[0], 8_z), src[1], src[2], src[3], src[4]}, | ||||
src.dtype, src.format); | src.dtype, src.format); | ||||
exec_src = work_space_layout | |||||
exec_src = exec_workspace | |||||
.reshape({div_ceil(src[0], 8_z), 8, src[1], | .reshape({div_ceil(src[0], 8_z), 8, src[1], | ||||
src[2], src[3], src[4]}) | src[2], src[3], src[4]}) | ||||
.dimshuffle({0, 2, 3, 4, 5, 1}); | .dimshuffle({0, 2, 3, 4, 5, 1}); | ||||
@@ -426,10 +504,10 @@ void RelayoutFormat::deduce_exec_layout(const TensorLayout& src, | |||||
{ | { | ||||
megdnn_assert(src.ndim == 5); | megdnn_assert(src.ndim == 5); | ||||
megdnn_assert(src[1] % 8 == 0); | megdnn_assert(src[1] % 8 == 0); | ||||
TensorLayout work_space_layout( | |||||
exec_workspace = TensorLayout( | |||||
{src[0], src[1], round_up(src[2], 8_z), src[3], src[4]}, | {src[0], src[1], round_up(src[2], 8_z), src[3], src[4]}, | ||||
src.dtype, src.format); | src.dtype, src.format); | ||||
exec_src = work_space_layout | |||||
exec_src = exec_workspace | |||||
.reshape({src[0], src[1] / 8, 8, | .reshape({src[0], src[1] / 8, 8, | ||||
div_ceil(src[2], 8_z), 8, src[3], | div_ceil(src[2], 8_z), 8, src[3], | ||||
src[4]}) | src[4]}) | ||||
@@ -442,10 +520,10 @@ void RelayoutFormat::deduce_exec_layout(const TensorLayout& src, | |||||
case Param::Mode::NCHW_NCHW4_IC_SMALL_CONV_DENSE_WEIGHT: | case Param::Mode::NCHW_NCHW4_IC_SMALL_CONV_DENSE_WEIGHT: | ||||
// nchw to nchw4c or oihw to oihw4i | // nchw to nchw4c or oihw to oihw4i | ||||
{ | { | ||||
TensorLayout work_space_layout( | |||||
exec_workspace = TensorLayout( | |||||
{src[0], round_up(src[1], 4_z), src[2], src[3]}, | {src[0], round_up(src[1], 4_z), src[2], src[3]}, | ||||
src.dtype, src.format); | src.dtype, src.format); | ||||
exec_src = work_space_layout | |||||
exec_src = exec_workspace | |||||
.reshape({src[0], div_ceil(src[1], 4_z), 4, | .reshape({src[0], div_ceil(src[1], 4_z), 4, | ||||
src[2], src[3]}) | src[2], src[3]}) | ||||
.dimshuffle({0, 1, 3, 4, 2}); | .dimshuffle({0, 1, 3, 4, 2}); | ||||
@@ -91,6 +91,7 @@ ConvBiasForwardImpl::AlgoPack::AlgoPack() { | |||||
all_algos.push_back(&algo); | all_algos.push_back(&algo); | ||||
} | } | ||||
all_algos.push_back(&int8_chwn4_dotprod); | all_algos.push_back(&int8_chwn4_dotprod); | ||||
all_algos.push_back(&fallback_nchw_qs8); | |||||
for (size_t i = all_algo_size; i < all_algos.size(); ++i) { | for (size_t i = all_algo_size; i < all_algos.size(); ++i) { | ||||
non_cudnn_algos.push_back(all_algos[i]); | non_cudnn_algos.push_back(all_algos[i]); | ||||
} | } | ||||
@@ -15,14 +15,14 @@ | |||||
#include "megdnn/oprs.h" | #include "megdnn/oprs.h" | ||||
#include "src/common/algo_base.h" | #include "src/common/algo_base.h" | ||||
#include "src/common/utils.h" | |||||
#include "src/common/metahelper.h" | #include "src/common/metahelper.h" | ||||
#include "src/common/utils.h" | |||||
#include "src/cuda/conv_bias/conv_bias_int8.cuh" | #include "src/cuda/conv_bias/conv_bias_int8.cuh" | ||||
#include "src/cuda/conv_bias/helper.h" | #include "src/cuda/conv_bias/helper.h" | ||||
#include "src/cuda/conv_bias/opr_impl.h" | #include "src/cuda/conv_bias/opr_impl.h" | ||||
#include "src/cuda/convolution_helper/parameter.cuh" | #include "src/cuda/convolution_helper/parameter.cuh" | ||||
#include "src/cuda/handle.h" | |||||
#include "src/cuda/cudnn_wrapper.h" | #include "src/cuda/cudnn_wrapper.h" | ||||
#include "src/cuda/handle.h" | |||||
#include <cuda.h> | #include <cuda.h> | ||||
#include <memory> | #include <memory> | ||||
@@ -547,6 +547,28 @@ private: | |||||
std::string m_name; | std::string m_name; | ||||
}; | }; | ||||
class ConvBiasForwardImpl::AlgoFallbackNCHWQS8 final : public AlgoBase { | |||||
public: | |||||
bool is_available(const SizeArgs& args) const override; | |||||
size_t get_workspace_in_bytes(const SizeArgs& args) const override; | |||||
void exec(const ExecArgs& args) const override; | |||||
const char* name() const override { | |||||
return "FALLBACK_CONV_NCHW_QS8"; | |||||
} | |||||
AlgoAttribute attribute() const override { | |||||
return AlgoAttribute::REPRODUCIBLE; | |||||
} | |||||
MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_NCHW4_DOTPROD_INT8) | |||||
private: | |||||
void make_inner_layout(const SizeArgs& args, TensorLayout& inner_src_layout, | |||||
TensorLayout& inner_weight_layout, | |||||
TensorLayout& inner_dst_layout, | |||||
TensorLayout& inner_bias_layout, | |||||
TensorLayout& inner_z_layout) const; | |||||
WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const; | |||||
}; | |||||
#if CUDA_VERSION >= 10000 | #if CUDA_VERSION >= 10000 | ||||
class ConvBiasForwardImpl::AlgoInt8CHWN4IMMAImplicitGemm final | class ConvBiasForwardImpl::AlgoInt8CHWN4IMMAImplicitGemm final | ||||
: public AlgoBase { | : public AlgoBase { | ||||
@@ -763,6 +785,7 @@ public: | |||||
non_cudnn_algos, bfloat16_algos; | non_cudnn_algos, bfloat16_algos; | ||||
std::vector<AlgoCUDNNConvBiasActivation> cudnn_conv_bias_activations; | std::vector<AlgoCUDNNConvBiasActivation> cudnn_conv_bias_activations; | ||||
std::vector<AlgoCUDNNConv> cudnn_convs; | std::vector<AlgoCUDNNConv> cudnn_convs; | ||||
AlgoFallbackNCHWQS8 fallback_nchw_qs8; | |||||
AlgoChanwise chanwise; | AlgoChanwise chanwise; | ||||
AlgoChanwiseSmall chanwise_small; | AlgoChanwiseSmall chanwise_small; | ||||
AlgoChanwise8x8x32 chanwise8x8x32; | AlgoChanwise8x8x32 chanwise8x8x32; | ||||
@@ -0,0 +1,162 @@ | |||||
/** | |||||
* \file dnn/src/cuda/conv_bias/conv_nchwqs8.cpp | |||||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
* | |||||
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||||
* | |||||
* Unless required by applicable law or agreed to in writing, | |||||
* software distributed under the License is distributed on an | |||||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
* implied. | |||||
*/ | |||||
#include "src/common/conv_bias.h" | |||||
#include "src/cuda/conv_bias/algo.h" | |||||
#include "src/cuda/cudnn_wrapper.h" | |||||
#include "src/cuda/relayout_format/opr_impl.h" | |||||
#include "src/cuda/utils.h" | |||||
using namespace megdnn; | |||||
using namespace cuda; | |||||
using namespace conv_bias; | |||||
namespace { | |||||
inline void deduce_reformat_layout(std::unique_ptr<RelayoutFormat>& relayout, | |||||
const TensorLayout& src_layout, | |||||
TensorLayout& dst_layout, | |||||
RelayoutFormat::Param::Mode mode, | |||||
const int oc = 0, const int group = 1) { | |||||
if (src_layout.ndim > 0) { | |||||
RelayoutFormat::Param trans_param; | |||||
trans_param.mode = mode; | |||||
trans_param.oc = oc; | |||||
trans_param.group = group; | |||||
relayout->param() = trans_param; | |||||
relayout->deduce_layout(src_layout, dst_layout); | |||||
} else { | |||||
dst_layout = src_layout; | |||||
} | |||||
} | |||||
} // namespace | |||||
void ConvBiasForwardImpl::AlgoFallbackNCHWQS8::make_inner_layout( | |||||
const SizeArgs& args, TensorLayout& inner_src_layout, | |||||
TensorLayout& inner_weight_layout, TensorLayout& inner_dst_layout, | |||||
TensorLayout& inner_bias_layout, TensorLayout& inner_z_layout) const { | |||||
auto relayout_src = args.handle->create_operator<RelayoutFormat>(); | |||||
deduce_reformat_layout(relayout_src, *args.src_layout, inner_src_layout, | |||||
RelayoutFormat::Param::Mode::NCHW_NCHW4, 0, | |||||
args.filter_meta.group); | |||||
deduce_reformat_layout(relayout_src, *args.filter_layout, | |||||
inner_weight_layout, | |||||
RelayoutFormat::Param::Mode::NCHW_NCHW4_WEIGHT); | |||||
deduce_reformat_layout(relayout_src, *args.dst_layout, inner_dst_layout, | |||||
RelayoutFormat::Param::Mode::NCHW_NCHW4, 0, | |||||
args.filter_meta.group); | |||||
deduce_reformat_layout(relayout_src, *args.bias_layout, inner_bias_layout, | |||||
RelayoutFormat::Param::Mode::NCHW_NCHW4, 0, | |||||
args.filter_meta.group); | |||||
deduce_reformat_layout(relayout_src, *args.z_layout, inner_z_layout, | |||||
RelayoutFormat::Param::Mode::NCHW_NCHW4, 0, | |||||
args.filter_meta.group); | |||||
}; | |||||
bool ConvBiasForwardImpl::AlgoFallbackNCHWQS8::is_available( | |||||
const SizeArgs& args) const { | |||||
auto&& param = args.opr->param(); | |||||
bool is_format_ok = param.format == param::ConvBias::Format::NCHW; | |||||
bool is_version_ok = CUDNN_VERSION >= 7500; | |||||
bool is_dtype_ok = | |||||
args.src_layout->dtype.enumv() == DTypeEnum::QuantizedS8; | |||||
bool is_bias_ok = | |||||
args.bias_layout->ndim == 0 || | |||||
(args.bias_layout->ndim == 4 && args.bias_layout->shape[0] == 1 && | |||||
args.bias_layout->shape[2] == 1 && | |||||
args.bias_layout->shape[3] == 1); | |||||
bool is_ok = is_format_ok && is_version_ok && is_dtype_ok && is_bias_ok; | |||||
return is_ok; | |||||
} | |||||
WorkspaceBundle ConvBiasForwardImpl::AlgoFallbackNCHWQS8::get_workspace_bundle( | |||||
void* ptr, const SizeArgs& args) const { | |||||
TensorLayout inner_src_layout; | |||||
TensorLayout inner_weight_layout; | |||||
TensorLayout inner_dst_layout; | |||||
TensorLayout inner_bias_layout; | |||||
TensorLayout inner_z_layout; | |||||
make_inner_layout(args, inner_src_layout, inner_weight_layout, | |||||
inner_dst_layout, inner_bias_layout, inner_z_layout); | |||||
auto opr = args.handle->create_operator<ConvBiasForward>(); | |||||
Param inner_conv_param = args.opr->param(); | |||||
inner_conv_param.format = Param::Format::NCHW4; | |||||
opr->param() = inner_conv_param; | |||||
return WorkspaceBundle(ptr, {inner_src_layout.span().dist_byte(), | |||||
inner_weight_layout.span().dist_byte(), | |||||
inner_dst_layout.span().dist_byte(), | |||||
inner_bias_layout.span().dist_byte(), | |||||
inner_z_layout.span().dist_byte(), | |||||
opr->get_workspace_in_bytes( | |||||
inner_src_layout, inner_weight_layout, | |||||
inner_bias_layout, inner_z_layout, | |||||
inner_dst_layout, nullptr)}); | |||||
} | |||||
size_t ConvBiasForwardImpl::AlgoFallbackNCHWQS8::get_workspace_in_bytes( | |||||
const SizeArgs& args) const { | |||||
auto trans_bundle = get_workspace_bundle(nullptr, args); | |||||
return trans_bundle.total_size_in_bytes(); | |||||
} | |||||
void ConvBiasForwardImpl::AlgoFallbackNCHWQS8::exec( | |||||
const ExecArgs& args) const { | |||||
auto relayout_nchw_nchw4 = args.handle->create_operator<RelayoutFormat>(); | |||||
RelayoutFormat::Param in_trans; | |||||
in_trans.mode = RelayoutFormat::Param::Mode::NCHW_NCHW4; | |||||
in_trans.group = args.filter_meta.group; | |||||
relayout_nchw_nchw4->param() = in_trans; | |||||
auto relayout_weight = args.handle->create_operator<RelayoutFormat>(); | |||||
RelayoutFormat::Param weight_trans; | |||||
weight_trans.mode = RelayoutFormat::Param::Mode::NCHW_NCHW4_WEIGHT; | |||||
relayout_weight->param() = weight_trans; | |||||
auto relayout_nchw4_nchw = args.handle->create_operator<RelayoutFormat>(); | |||||
RelayoutFormat::Param nchw4_nchw_trans; | |||||
nchw4_nchw_trans.mode = RelayoutFormat::Param::Mode::NCHW4_NCHW; | |||||
nchw4_nchw_trans.oc = args.dst_layout->shape[1]; | |||||
nchw4_nchw_trans.group = args.filter_meta.group; | |||||
relayout_nchw4_nchw->param() = nchw4_nchw_trans; | |||||
auto bundle = get_workspace_bundle(args.workspace.raw_ptr, args); | |||||
TensorLayout inner_src_layout; | |||||
TensorLayout inner_weight_layout; | |||||
TensorLayout inner_dst_layout; | |||||
TensorLayout inner_bias_layout; | |||||
TensorLayout inner_z_layout; | |||||
make_inner_layout(args, inner_src_layout, inner_weight_layout, | |||||
inner_dst_layout, inner_bias_layout, inner_z_layout); | |||||
TensorND inner_src(bundle.get(0), inner_src_layout); | |||||
TensorND inner_weight(bundle.get(1), inner_weight_layout); | |||||
TensorND inner_dst(bundle.get(2), inner_dst_layout); | |||||
TensorND inner_bias(bundle.get(3), inner_bias_layout); | |||||
TensorND inner_z(bundle.get(4), inner_z_layout); | |||||
Param inner_conv_param = args.opr->param(); | |||||
inner_conv_param.format = Param::Format::NCHW4; | |||||
auto inner_opr = args.handle->create_operator<ConvBiasForward>(); | |||||
inner_opr->param() = inner_conv_param; | |||||
relayout_nchw_nchw4->exec(*args.src_tensor, inner_src, {}); | |||||
relayout_weight->exec(*args.filter_tensor, inner_weight, {}); | |||||
if (inner_bias_layout.ndim > 0) { | |||||
relayout_nchw_nchw4->exec(*args.bias_tensor, inner_bias, {}); | |||||
} | |||||
if (inner_z_layout.ndim > 0) { | |||||
relayout_nchw_nchw4->exec(*args.z_tensor, inner_z, {}); | |||||
} | |||||
inner_opr->exec(inner_src, inner_weight, inner_bias, inner_z, inner_dst, | |||||
nullptr, Workspace((dt_byte*)bundle.get(5), bundle.get_size(5))); | |||||
relayout_nchw4_nchw->exec(inner_dst, *args.dst_tensor, {}); | |||||
} | |||||
// vim: syntax=cpp.doxygen |
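One note on the workspace handling in this new file: get_workspace_bundle lays the five inner tensors and the inner conv workspace out in a single linear buffer, in the order they are passed to WorkspaceBundle. A minimal sketch of that carving, assuming no per-chunk alignment (the real WorkspaceBundle may additionally align each chunk):

#include <cstdio>
#include <vector>

// Byte offsets of each chunk inside one linear workspace buffer; the last
// entry is the total size. No alignment is applied in this sketch.
std::vector<size_t> carve_workspace(const std::vector<size_t>& chunk_sizes) {
    std::vector<size_t> offsets;
    size_t cur = 0;
    for (size_t sz : chunk_sizes) {
        offsets.push_back(cur);
        cur += sz;
    }
    offsets.push_back(cur);
    return offsets;
}

int main() {
    // inner_src, inner_weight, inner_dst, inner_bias, inner_z, inner conv workspace
    auto off = carve_workspace({4096, 1024, 4096, 64, 0, 8192});
    for (size_t i = 0; i + 1 < off.size(); ++i)
        printf("chunk %zu starts at byte %zu\n", i, off[i]);
    printf("total workspace: %zu bytes\n", off.back());
}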
@@ -201,6 +201,11 @@ ConvBiasForward::Algorithm* ConvBiasForwardImpl::get_algorithm_heuristic( | |||||
return algo; | return algo; | ||||
} | } | ||||
if (sm_algo_pack.fallback_nchw_qs8.is_available_reproducible( | |||||
args, reproducible, workspace_limit_in_bytes)) { | |||||
return &sm_algo_pack.fallback_nchw_qs8; | |||||
} | |||||
if (args.src_layout->dtype.enumv() != DTypeTrait<dtype::BFloat16>::enumv) { | if (args.src_layout->dtype.enumv() != DTypeTrait<dtype::BFloat16>::enumv) { | ||||
if (reproducible) { | if (reproducible) { | ||||
return megdnn::get_reproducible_algo<ConvBiasForwardImpl>( | return megdnn::get_reproducible_algo<ConvBiasForwardImpl>( | ||||
@@ -49,6 +49,7 @@ public: | |||||
class AlgoChanwiseSmall; | class AlgoChanwiseSmall; | ||||
class AlgoChanwise8x8x32; | class AlgoChanwise8x8x32; | ||||
class AlgoCUDNNConv; | class AlgoCUDNNConv; | ||||
class AlgoFallbackNCHWQS8; | |||||
class AlgoInplaceMatmul; | class AlgoInplaceMatmul; | ||||
class AlgoMatmul; | class AlgoMatmul; | ||||
class AlgoMatmul8x8x32; | class AlgoMatmul8x8x32; | ||||
@@ -24,6 +24,9 @@ void RelayoutFormatImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, | |||||
megdnn_assert( | megdnn_assert( | ||||
param().mode == param::RelayoutFormat::Mode::NCHW4_CHWN4 || | param().mode == param::RelayoutFormat::Mode::NCHW4_CHWN4 || | ||||
param().mode == param::RelayoutFormat::Mode::NCHW_NCHW4 || | param().mode == param::RelayoutFormat::Mode::NCHW_NCHW4 || | ||||
param().mode == | |||||
param::RelayoutFormat::Mode::NCHW_NCHW4_WEIGHT || | |||||
param().mode == param::RelayoutFormat::Mode::NCHW4_NCHW || | |||||
param().mode == param::RelayoutFormat::Mode::CHWN4_NCHW4 || | param().mode == param::RelayoutFormat::Mode::CHWN4_NCHW4 || | ||||
param().mode == Param::Mode::NCHW_NCHW4_IC_SMALL || | param().mode == Param::Mode::NCHW_NCHW4_IC_SMALL || | ||||
param().mode == | param().mode == | ||||
@@ -76,7 +79,9 @@ void RelayoutFormatImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, | |||||
{src.raw_ptr, exec_src_layout}, {dst.raw_ptr, exec_dst_layout}); | {src.raw_ptr, exec_src_layout}, {dst.raw_ptr, exec_dst_layout}); | ||||
} | } | ||||
if (param().mode == Param::Mode::NCHW_NCHW4) { | |||||
if (param().mode == Param::Mode::NCHW_NCHW4 || | |||||
param().mode == Param::Mode::NCHW4_NCHW || | |||||
param().mode == Param::Mode::NCHW_NCHW4_WEIGHT) { | |||||
bool is_usable = relayout_format::RelayoutFormatFast::usable( | bool is_usable = relayout_format::RelayoutFormatFast::usable( | ||||
src.layout, dst.layout); | src.layout, dst.layout); | ||||
megdnn_assert(is_usable, | megdnn_assert(is_usable, | ||||
@@ -85,10 +90,12 @@ void RelayoutFormatImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, | |||||
src.layout.to_string().c_str(), src.layout.dtype.name(), | src.layout.to_string().c_str(), src.layout.dtype.name(), | ||||
dst.layout.to_string().c_str(), dst.layout.dtype.name()); | dst.layout.to_string().c_str(), dst.layout.dtype.name()); | ||||
relayout_format::RelayoutFormatFast::exec(src, dst, | relayout_format::RelayoutFormatFast::exec(src, dst, | ||||
cuda_stream(this->handle())); | |||||
cuda_stream(this->handle()), | |||||
param().mode, param().group); | |||||
} else { | } else { | ||||
TensorLayout exec_src, exec_dst; | |||||
deduce_exec_layout(src.layout, dst.layout, exec_src, exec_dst); | |||||
TensorLayout exec_src, exec_dst, exec_workspace; | |||||
deduce_exec_layout(src.layout, dst.layout, exec_workspace, exec_src, | |||||
exec_dst); | |||||
TensorND exec_src_nd{src.raw_ptr, exec_src}; | TensorND exec_src_nd{src.raw_ptr, exec_src}; | ||||
TensorND exec_dst_nd{dst.raw_ptr, exec_dst}; | TensorND exec_dst_nd{dst.raw_ptr, exec_dst}; | ||||
handle()->create_operator<RelayoutForward>()->exec(exec_src_nd, | handle()->create_operator<RelayoutForward>()->exec(exec_src_nd, | ||||
@@ -36,7 +36,9 @@ bool relayout_format::RelayoutFormatFast::usable( | |||||
void relayout_format::RelayoutFormatFast::exec(const TensorND& src, | void relayout_format::RelayoutFormatFast::exec(const TensorND& src, | ||||
const TensorND& dst, | const TensorND& dst, | ||||
cudaStream_t stream) { | |||||
cudaStream_t stream, | |||||
RelayoutFormat::Param::Mode mode, | |||||
int group) { | |||||
size_t ih = src.layout[2]; | size_t ih = src.layout[2]; | ||||
size_t iw = src.layout[3]; | size_t iw = src.layout[3]; | ||||
size_t hw = ih * iw; | size_t hw = ih * iw; | ||||
@@ -49,11 +51,22 @@ void relayout_format::RelayoutFormatFast::exec(const TensorND& src, | |||||
if (src.layout.dtype.enumv() == DTypeEnum::Uint8) { | if (src.layout.dtype.enumv() == DTypeEnum::Uint8) { | ||||
src_zero_point = 128; | src_zero_point = 128; | ||||
} | } | ||||
if (hw % 4 == 0) { | |||||
relayout_format_cuda_exec<4>(src, dst, stream, src_scale, dst_scale, | |||||
src_zero_point, dst_zero_point); | |||||
if (mode == RelayoutFormat::Param::Mode::NCHW_NCHW4) { | |||||
if (hw % 4 == 0) { | |||||
relayout_format_cuda_nchw_nchw4<4>(src, dst, stream, src_scale, | |||||
dst_scale, src_zero_point, | |||||
dst_zero_point, group); | |||||
} else { | |||||
relayout_format_cuda_nchw_nchw4<1>(src, dst, stream, src_scale, | |||||
dst_scale, src_zero_point, | |||||
dst_zero_point, group); | |||||
} | |||||
} else if (mode == RelayoutFormat::Param::Mode::NCHW_NCHW4_WEIGHT) { | |||||
relayout_format_cuda_nchw_nchw4_weight(src, dst, stream); | |||||
} else if (mode == RelayoutFormat::Param::Mode::NCHW4_NCHW) { | |||||
relayout_format_cuda_nchw4_nchw(src, dst, stream, group); | |||||
} else { | } else { | ||||
relayout_format_cuda_exec<1>(src, dst, stream, src_scale, dst_scale, | |||||
src_zero_point, dst_zero_point); | |||||
megdnn_throw("only support nchw_nchw4 nchw4_nchw layout_format"); | |||||
} | } | ||||
} | } |
@@ -80,10 +80,30 @@ struct CudaPostProcess<dtype::QuantizedS8, dtype::QuantizedS8, false> { | |||||
template <> | template <> | ||||
struct CudaPostProcess<dtype::QuantizedS8, dtype::QuantizedS8, true> { | struct CudaPostProcess<dtype::QuantizedS8, dtype::QuantizedS8, true> { | ||||
CudaPostProcess(){}; | |||||
CudaPostProcess(float, uint8_t, float, uint8_t){}; | CudaPostProcess(float, uint8_t, float, uint8_t){}; | ||||
inline __device__ int8_t operator()(int8_t val) { return val; } | inline __device__ int8_t operator()(int8_t val) { return val; } | ||||
}; | }; | ||||
template <> | |||||
struct CudaPostProcess<dtype::QuantizedS32, dtype::QuantizedS32, false> { | |||||
CudaDTypeParamImpl<dt_qint32> m_dst_type_cvt; | |||||
CudaDTypeParamImpl<dt_qint32> m_src_type_cvt; | |||||
CudaPostProcess(float src_scale, int, float dst_scale, int) { | |||||
m_dst_type_cvt = CudaDTypeParamImpl<dt_qint32>(dst_scale); | |||||
m_src_type_cvt = CudaDTypeParamImpl<dt_qint32>(src_scale); | |||||
}; | |||||
inline __device__ int operator()(int val) { | |||||
float med_var = m_src_type_cvt.dequantize(dt_qint32(val)); | |||||
return m_dst_type_cvt.quantize(med_var).as_int32(); | |||||
} | |||||
}; | |||||
template <> | |||||
struct CudaPostProcess<dtype::QuantizedS32, dtype::QuantizedS32, true> { | |||||
CudaPostProcess(float, int, float, int){}; | |||||
inline __device__ int operator()(int val) { return val; } | |||||
}; | |||||
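The non-same-scale QuantizedS32 post-process goes through float: dequantize with the source scale, then re-quantize with the destination scale; the same-scale specialization is the identity on the raw value. A host-side sketch of that arithmetic, assuming the usual scale-only quantization rule and omitting the saturation the CUDA dtype params apply:

#include <cmath>
#include <cstdint>
#include <cstdio>

// Dequantize with the source scale, re-quantize with the destination scale.
int32_t requantize_s32(int32_t val, float src_scale, float dst_scale) {
    float dequantized = val * src_scale;
    return static_cast<int32_t>(std::round(dequantized / dst_scale));
}

int main() {
    printf("%d\n", requantize_s32(100, 0.5f, 0.25f));   // prints 200
}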
template <typename SrcType, int pack_w> | template <typename SrcType, int pack_w> | ||||
struct DTypeRWHelper; | struct DTypeRWHelper; | ||||
template <> | template <> | ||||
@@ -98,6 +118,18 @@ struct DTypeRWHelper<char, 4> { | |||||
using DstDtype = char4; | using DstDtype = char4; | ||||
}; | }; | ||||
template <> | |||||
struct DTypeRWHelper<int, 1> { | |||||
using InnerDtype = int; | |||||
using DstDtype = int4; | |||||
}; | |||||
template <> | |||||
struct DTypeRWHelper<int, 4> { | |||||
using InnerDtype = int4; | |||||
using DstDtype = int4; | |||||
}; | |||||
template <int pack_w, int pack_c, typename SrcType, typename DnnSrcType, | template <int pack_w, int pack_c, typename SrcType, typename DnnSrcType, | ||||
typename DnnDstType, bool same_scale> | typename DnnDstType, bool same_scale> | ||||
struct Translayout { | struct Translayout { | ||||
@@ -165,6 +197,11 @@ inline __device__ char4 make_zero_pad<char4>(const char zero_point) { | |||||
return {zero_point, zero_point, zero_point, zero_point}; | return {zero_point, zero_point, zero_point, zero_point}; | ||||
} | } | ||||
template <> | |||||
inline __device__ int4 make_zero_pad<int4>(const char zero_point) { | |||||
return {zero_point, zero_point, zero_point, zero_point}; | |||||
} | |||||
template <typename DstDtype> | template <typename DstDtype> | ||||
inline __device__ void write_helper(DstDtype* ptr, DstDtype val) { | inline __device__ void write_helper(DstDtype* ptr, DstDtype val) { | ||||
*ptr = val; | *ptr = val; | ||||
@@ -176,14 +213,14 @@ inline __device__ void write_helper<char4>(char4* ptr, char4 val) { | |||||
*rel_ptr = *(int32_t*)(&val); | *rel_ptr = *(int32_t*)(&val); | ||||
} | } | ||||
template <bool with_pad, int pack_w, int pack_c, bool same_scale, | |||||
template <bool with_pad, int pack_w, int pack_c, bool same_scale, bool all_pad, | |||||
typename SrcType, typename DstType, typename DnnSrcType, | typename SrcType, typename DstType, typename DnnSrcType, | ||||
typename DnnDstType> | typename DnnDstType> | ||||
struct RelayoutKern { | struct RelayoutKern { | ||||
using InnerDtype = typename DTypeRWHelper<SrcType, pack_w>::InnerDtype; | using InnerDtype = typename DTypeRWHelper<SrcType, pack_w>::InnerDtype; | ||||
using DstDtype = typename DTypeRWHelper<SrcType, pack_w>::DstDtype; | using DstDtype = typename DTypeRWHelper<SrcType, pack_w>::DstDtype; | ||||
static inline __device__ void write(DstType* dst_ptr, | static inline __device__ void write(DstType* dst_ptr, | ||||
char4 (&dst_width)[pack_w]) { | |||||
DstDtype (&dst_width)[pack_w]) { | |||||
DstDtype* dst_inner_ptr = (DstDtype*)dst_ptr; | DstDtype* dst_inner_ptr = (DstDtype*)dst_ptr; | ||||
#pragma unroll | #pragma unroll | ||||
for (int iw_idx = 0; iw_idx < pack_w; ++iw_idx) { | for (int iw_idx = 0; iw_idx < pack_w; ++iw_idx) { | ||||
@@ -213,6 +250,17 @@ struct RelayoutKern { | |||||
} | } | ||||
} | } | ||||
static inline __device__ void fake_read(const SrcType* src_ptr, | |||||
InnerDtype (&read_channel)[pack_c], | |||||
const int ic_stride, | |||||
const int remain_ic, | |||||
const InnerDtype zero_point) { | |||||
#pragma unroll | |||||
for (int ic_idx = 0; ic_idx < pack_c; ++ic_idx) { | |||||
read_channel[ic_idx] = zero_point; | |||||
} | |||||
} | |||||
static inline __device__ void core_relayout_kern( | static inline __device__ void core_relayout_kern( | ||||
const SrcType* src, DstType* dst, const int src_offset_base, | const SrcType* src, DstType* dst, const int src_offset_base, | ||||
const int dst_offset_base, const int ic_offset, const int ic_stride, | const int dst_offset_base, const int ic_offset, const int ic_stride, | ||||
@@ -220,12 +268,20 @@ struct RelayoutKern { | |||||
CudaPostProcess<DnnSrcType, DnnDstType, same_scale>& post_process, | CudaPostProcess<DnnSrcType, DnnDstType, same_scale>& post_process, | ||||
const char zero_point) { | const char zero_point) { | ||||
InnerDtype read_channel[pack_c]; | InnerDtype read_channel[pack_c]; | ||||
if (with_pad) { | |||||
if (all_pad) { | |||||
const InnerDtype zero_pad = make_zero_pad<InnerDtype>(zero_point); | const InnerDtype zero_pad = make_zero_pad<InnerDtype>(zero_point); | ||||
read_with_pad(src + ic_offset + src_offset_base, read_channel, | |||||
ic_stride, remain_ic, zero_pad); | |||||
fake_read(src + ic_offset + src_offset_base, read_channel, | |||||
ic_stride, remain_ic, zero_pad); | |||||
} else { | } else { | ||||
read(src + ic_offset + src_offset_base, read_channel, ic_stride); | |||||
if (with_pad) { | |||||
const InnerDtype zero_pad = | |||||
make_zero_pad<InnerDtype>(zero_point); | |||||
read_with_pad(src + ic_offset + src_offset_base, read_channel, | |||||
ic_stride, remain_ic, zero_pad); | |||||
} else { | |||||
read(src + ic_offset + src_offset_base, read_channel, | |||||
ic_stride); | |||||
} | |||||
} | } | ||||
DstDtype dst_width[pack_w]; | DstDtype dst_width[pack_w]; | ||||
Translayout<pack_w, pack_c, SrcType, DnnSrcType, DnnDstType, | Translayout<pack_w, pack_c, SrcType, DnnSrcType, DnnDstType, | ||||
@@ -238,45 +294,195 @@ struct RelayoutKern { | |||||
template <int pack_w, bool same_scale, typename SrcType, typename DstType, | template <int pack_w, bool same_scale, typename SrcType, typename DstType, | ||||
typename DnnSrcType, typename DnnDstType> | typename DnnSrcType, typename DnnDstType> | ||||
__global__ void kern_nchw_nchw4( | __global__ void kern_nchw_nchw4( | ||||
const SrcType* src, DstType* dst, int ic, int ihw, int n_stride_src, | |||||
int ic_stride, int n_stride_dst, | |||||
const SrcType* src, DstType* dst, int in_n, int ic, int ihw, | |||||
int n_stride_src, int ic_stride, int n_stride_dst, | |||||
CudaPostProcess<DnnSrcType, DnnDstType, same_scale> post_process, | CudaPostProcess<DnnSrcType, DnnDstType, same_scale> post_process, | ||||
const char zero_point) { | |||||
const char zero_point, const int group, const int ocpg) { | |||||
constexpr int pack_c = 4; | constexpr int pack_c = 4; | ||||
const int n_idx = blockIdx.y; | const int n_idx = blockIdx.y; | ||||
const int ihw_block_idx = blockIdx.x * blockDim.x + threadIdx.x; | const int ihw_block_idx = blockIdx.x * blockDim.x + threadIdx.x; | ||||
const int ihw_offset = ihw_block_idx * pack_w; | const int ihw_offset = ihw_block_idx * pack_w; | ||||
if (ihw_offset < ihw) { | if (ihw_offset < ihw) { | ||||
const int ic_block = ic / pack_c; | |||||
const int remain_ic = ic % pack_c; | |||||
const int src_offset_base = n_idx * n_stride_src + ihw_offset; | const int src_offset_base = n_idx * n_stride_src + ihw_offset; | ||||
const int dst_offset_base = n_idx * n_stride_dst + ihw_offset * pack_c; | const int dst_offset_base = n_idx * n_stride_dst + ihw_offset * pack_c; | ||||
if (n_idx < in_n) { | |||||
const int icpg = ic / group; | |||||
const int ic_block = icpg / pack_c; | |||||
const int remain_ic = icpg % pack_c; | |||||
const int src_group_stride = icpg * ic_stride; | |||||
const int dst_group_stride = ocpg * ic_stride; | |||||
for (int g_idx = 0; g_idx < group; ++g_idx) { | |||||
const int src_offset = | |||||
src_offset_base + g_idx * src_group_stride; | |||||
const int dst_offset = | |||||
dst_offset_base + g_idx * dst_group_stride; | |||||
for (int ic_blk_idx = 0; ic_blk_idx < ic_block; ++ic_blk_idx) { | |||||
const int ic_offset = ic_blk_idx * pack_c * ic_stride; | |||||
RelayoutKern<false, pack_w, pack_c, same_scale, false, | |||||
SrcType, DstType, DnnSrcType, | |||||
DnnDstType>::core_relayout_kern(src, dst, | |||||
src_offset, | |||||
dst_offset, | |||||
ic_offset, | |||||
ic_stride, | |||||
remain_ic, | |||||
post_process, | |||||
zero_point); | |||||
} | |||||
if (remain_ic > 0) { | |||||
const int ic_offset = ic_block * pack_c * ic_stride; | |||||
RelayoutKern<true, pack_w, pack_c, same_scale, false, | |||||
SrcType, DstType, DnnSrcType, | |||||
DnnDstType>::core_relayout_kern(src, dst, | |||||
src_offset, | |||||
dst_offset, | |||||
ic_offset, | |||||
ic_stride, | |||||
remain_ic, | |||||
post_process, | |||||
zero_point); | |||||
} | |||||
} | |||||
} else { | |||||
//! pad n | |||||
const int ic_full_block = group * ocpg / pack_c; | |||||
for (int ic_blk_idx = 0; ic_blk_idx < ic_full_block; ++ic_blk_idx) { | |||||
RelayoutKern<false, pack_w, pack_c, same_scale, true, SrcType, | |||||
DstType, DnnSrcType, | |||||
DnnDstType>::core_relayout_kern(src, dst, | |||||
src_offset_base, | |||||
dst_offset_base, 0, | |||||
ic_stride, 0, | |||||
post_process, | |||||
zero_point); | |||||
} | |||||
} | |||||
} | |||||
} | |||||
for (int ic_blk_idx = 0; ic_blk_idx < ic_block; ++ic_blk_idx) { | |||||
const int ic_offset = ic_blk_idx * pack_c * ic_stride; | |||||
RelayoutKern<false, pack_w, pack_c, same_scale, SrcType, DstType, | |||||
DnnSrcType, | |||||
DnnDstType>::core_relayout_kern(src, dst, | |||||
src_offset_base, | |||||
dst_offset_base, | |||||
ic_offset, ic_stride, | |||||
remain_ic, | |||||
post_process, | |||||
zero_point); | |||||
__global__ void kern_nchw4_nchw(const int8_t* src, int8_t* dst, int n, int ic, | |||||
int oc, int oh, int ow, int group) { | |||||
constexpr int pack_w = 1; | |||||
constexpr int pack_ic = 4; | |||||
const int n_idx = blockIdx.y; | |||||
const int hw_block_idx = blockIdx.x * blockDim.x + threadIdx.x; | |||||
const int hw_offset = hw_block_idx * pack_w; | |||||
const int hw = oh * ow; | |||||
const int n_stride_src = ic * hw; | |||||
const int n_stride_dst = oc * hw; | |||||
const int c_stride = hw; | |||||
if (hw_offset < hw) { | |||||
const int icpg = ic / group; | |||||
const int ocpg = oc / group; | |||||
const int src_group_stride = icpg * c_stride; | |||||
const int dst_group_stride = ocpg * c_stride; | |||||
for (int g_idx = 0; g_idx < group; ++g_idx) { | |||||
const int oc_block = ocpg / pack_ic; | |||||
const int remain_oc = ocpg % pack_ic; | |||||
const int src_offset_base = n_idx * n_stride_src + | |||||
g_idx * src_group_stride + | |||||
hw_offset * pack_ic; | |||||
const int dst_offset_base = | |||||
n_idx * n_stride_dst + g_idx * dst_group_stride + hw_offset; | |||||
for (int ic_blk_idx = 0; ic_blk_idx < oc_block; ++ic_blk_idx) { | |||||
const int oc_offset = ic_blk_idx * pack_ic * c_stride; | |||||
char4 temp = *(char4*)(src + src_offset_base + oc_offset); | |||||
dst[dst_offset_base + oc_offset + 0 * c_stride] = temp.x; | |||||
dst[dst_offset_base + oc_offset + 1 * c_stride] = temp.y; | |||||
dst[dst_offset_base + oc_offset + 2 * c_stride] = temp.z; | |||||
dst[dst_offset_base + oc_offset + 3 * c_stride] = temp.w; | |||||
} | |||||
if (remain_oc > 0) { | |||||
const int oc_offset = oc_block * pack_ic * c_stride; | |||||
char4 temp = *(char4*)(src + src_offset_base + oc_offset); | |||||
dst[dst_offset_base + oc_offset + 0 * c_stride] = temp.x; | |||||
if (remain_oc > 1) { | |||||
dst[dst_offset_base + oc_offset + 1 * c_stride] = temp.y; | |||||
} | |||||
if (remain_oc > 2) { | |||||
dst[dst_offset_base + oc_offset + 2 * c_stride] = temp.z; | |||||
} | |||||
} | |||||
} | } | ||||
} | |||||
} | |||||
if (remain_ic > 0) { | |||||
const int ic_offset = ic_block * pack_c * ic_stride; | |||||
RelayoutKern<true, pack_w, pack_c, same_scale, SrcType, DstType, | |||||
DnnSrcType, | |||||
DnnDstType>::core_relayout_kern(src, dst, | |||||
src_offset_base, | |||||
dst_offset_base, | |||||
ic_offset, ic_stride, | |||||
remain_ic, | |||||
post_process, | |||||
zero_point); | |||||
__global__ void kern_nchw_nchw4_weight( | |||||
const char* src, char* dst, int in_oc, int ic, int ihw, | |||||
int oc_stride_src, int ic_stride, int oc_stride_dst, | |||||
int group_stride_src, int group_stride_dst, const char zero_point, | |||||
CudaPostProcess<dtype::QuantizedS8, dtype::QuantizedS8, true> | |||||
post_process) { | |||||
typedef char SrcType; | |||||
typedef char DstType; | |||||
typedef dtype::QuantizedS8 DnnSrcType; | |||||
typedef dtype::QuantizedS8 DnnDstType; | |||||
constexpr int pack_c = 4; | |||||
constexpr int pack_w = 1; | |||||
constexpr bool same_scale = true; | |||||
const int group_idx = blockIdx.z; | |||||
const int oc_idx = blockIdx.y; | |||||
const int ihw_block_idx = blockIdx.x * blockDim.x + threadIdx.x; | |||||
const int ihw_offset = ihw_block_idx * pack_w; | |||||
if (ihw_offset < ihw) { | |||||
const int ic_block = ic / pack_c; | |||||
const int remain_ic = ic % pack_c; | |||||
const int src_offset_base = group_idx * group_stride_src + | |||||
oc_idx * oc_stride_src + ihw_offset; | |||||
const int dst_offset_base = group_idx * group_stride_dst + | |||||
oc_idx * oc_stride_dst + | |||||
ihw_offset * pack_c; | |||||
if (oc_idx < in_oc) { | |||||
for (int ic_blk_idx = 0; ic_blk_idx < ic_block; ++ic_blk_idx) { | |||||
const int ic_offset = ic_blk_idx * pack_c * ic_stride; | |||||
RelayoutKern<false, pack_w, pack_c, same_scale, false, SrcType, | |||||
DstType, DnnSrcType, | |||||
DnnDstType>::core_relayout_kern(src, dst, | |||||
src_offset_base, | |||||
dst_offset_base, | |||||
ic_offset, | |||||
ic_stride, | |||||
remain_ic, | |||||
post_process, | |||||
zero_point); | |||||
} | |||||
if (remain_ic > 0) { | |||||
const int ic_offset = ic_block * pack_c * ic_stride; | |||||
RelayoutKern<true, pack_w, pack_c, same_scale, false, SrcType, | |||||
DstType, DnnSrcType, | |||||
DnnDstType>::core_relayout_kern(src, dst, | |||||
src_offset_base, | |||||
dst_offset_base, | |||||
ic_offset, | |||||
ic_stride, | |||||
remain_ic, | |||||
post_process, | |||||
zero_point); | |||||
} | |||||
} else { | |||||
//! pad oc per group | |||||
const int ic_full_block = (ic + pack_c - 1) / pack_c; | |||||
for (int ic_blk_idx = 0; ic_blk_idx < ic_full_block; ++ic_blk_idx) { | |||||
const int ic_offset = ic_blk_idx * pack_c * ic_stride; | |||||
RelayoutKern<false, pack_w, pack_c, same_scale, true, SrcType, | |||||
DstType, DnnSrcType, | |||||
DnnDstType>::core_relayout_kern(src, dst, | |||||
src_offset_base, | |||||
dst_offset_base, | |||||
ic_offset, | |||||
ic_stride, | |||||
remain_ic, | |||||
post_process, | |||||
zero_point); | |||||
} | |||||
} | } | ||||
} | } | ||||
} | } | ||||
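kern_nchw4_nchw in this hunk reads one char4 (four packed channels at one spatial position) and scatters it back to planar order, skipping lanes that are only padding when ocpg is not a multiple of 4. A host-side reference of the same index math for a single image with group = 1 (illustrative only):

#include <cstdint>
#include <cstdio>
#include <vector>

// Unpack one NCHW4 image (blocks of 4 channels) back to planar NCHW,
// keeping only the first `oc` channels and dropping padded lanes.
std::vector<int8_t> unpack_nchw4_to_nchw(const std::vector<int8_t>& src, int oc,
                                         int h, int w) {
    const int hw = h * w;
    const int blocks = (oc + 3) / 4;
    std::vector<int8_t> dst(oc * hw);
    for (int blk = 0; blk < blocks; ++blk)
        for (int pos = 0; pos < hw; ++pos)
            for (int lane = 0; lane < 4; ++lane) {
                int c = blk * 4 + lane;
                if (c < oc)   // padded lanes of the last block are discarded
                    dst[c * hw + pos] = src[(blk * hw + pos) * 4 + lane];
            }
    return dst;
}

int main() {
    // 6 channels of a 1x1 image packed into two blocks of 4 (lanes 6, 7 are padding)
    std::vector<int8_t> packed = {0, 1, 2, 3, 4, 5, 0, 0};
    for (int8_t v : unpack_nchw4_to_nchw(packed, 6, 1, 1)) printf("%d ", v);
    printf("\n");   // 0 1 2 3 4 5
}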
@@ -284,20 +490,23 @@ __global__ void kern_nchw_nchw4( | |||||
} // namespace | } // namespace | ||||
template <int pack_w = 1> | template <int pack_w = 1> | ||||
void relayout_format::relayout_format_cuda_exec( | |||||
void relayout_format::relayout_format_cuda_nchw_nchw4( | |||||
const TensorND& src, const TensorND& dst, const cudaStream_t& stream, | const TensorND& src, const TensorND& dst, const cudaStream_t& stream, | ||||
const float src_scale, const float dst_scale, | const float src_scale, const float dst_scale, | ||||
const uint8_t src_zero_point, const uint8_t dst_zero_point) { | |||||
const uint8_t src_zero_point, const uint8_t dst_zero_point, | |||||
const int group) { | |||||
constexpr int pack_oc = 4; | constexpr int pack_oc = 4; | ||||
const int n = src.layout[0]; | |||||
const int c = src.layout[1]; | |||||
const int in_n = src.layout[0]; | |||||
const int out_n = dst.layout[0]; | |||||
const int ic = src.layout[1]; | |||||
const int h = src.layout[2]; | const int h = src.layout[2]; | ||||
const int w = src.layout[3]; | const int w = src.layout[3]; | ||||
const int oc = dst.layout[1] * pack_oc; | |||||
const int hw = h * w; | const int hw = h * w; | ||||
const int oc_block = DIVUP(c, pack_oc); | |||||
const int n_stride_src = c * hw; | |||||
const int ocpg = oc / group; | |||||
const int n_stride_src = ic * hw; | |||||
const int ic_stride = hw; | const int ic_stride = hw; | ||||
const int n_stride_dst = oc_block * pack_oc * h * w; | |||||
const int n_stride_dst = oc * hw; | |||||
auto& src_layout = src.layout; | auto& src_layout = src.layout; | ||||
auto& dst_layout = dst.layout; | auto& dst_layout = dst.layout; | ||||
@@ -307,26 +516,26 @@ void relayout_format::relayout_format_cuda_exec( | |||||
int nr_threads = query_blocksize_for_kernel( \ | int nr_threads = query_blocksize_for_kernel( \ | ||||
kern_nchw_nchw4<pack_w, true, SRC_C_TYPE, DST_C_TYPE, \ | kern_nchw_nchw4<pack_w, true, SRC_C_TYPE, DST_C_TYPE, \ | ||||
SRC_TYPE, DST_TYPE>); \ | SRC_TYPE, DST_TYPE>); \ | ||||
const dim3 block_dim(DIVUP(hw, nr_threads* pack_w), n); \ | |||||
const dim3 block_dim(DIVUP(hw, nr_threads* pack_w), out_n); \ | |||||
const dim3 thread_dim(nr_threads); \ | const dim3 thread_dim(nr_threads); \ | ||||
kern_nchw_nchw4<pack_w, true><<<block_dim, thread_dim, 0, stream>>>( \ | kern_nchw_nchw4<pack_w, true><<<block_dim, thread_dim, 0, stream>>>( \ | ||||
(SRC_C_TYPE*)src.raw_ptr, (DST_C_TYPE*)dst.raw_ptr, c, hw, \ | |||||
n_stride_src, ic_stride, n_stride_dst, \ | |||||
(SRC_C_TYPE*)src.raw_ptr, (DST_C_TYPE*)dst.raw_ptr, in_n, ic, \ | |||||
hw, n_stride_src, ic_stride, n_stride_dst, \ | |||||
CudaPostProcess<SRC_TYPE, DST_TYPE, true>( \ | CudaPostProcess<SRC_TYPE, DST_TYPE, true>( \ | ||||
src_scale, src_zero_point, dst_scale, dst_zero_point), \ | src_scale, src_zero_point, dst_scale, dst_zero_point), \ | ||||
src_zero_point); \ | |||||
src_zero_point, group, ocpg); \ | |||||
} else { \ | } else { \ | ||||
int nr_threads = query_blocksize_for_kernel( \ | int nr_threads = query_blocksize_for_kernel( \ | ||||
kern_nchw_nchw4<pack_w, false, SRC_C_TYPE, DST_C_TYPE, \ | kern_nchw_nchw4<pack_w, false, SRC_C_TYPE, DST_C_TYPE, \ | ||||
SRC_TYPE, DST_TYPE>); \ | SRC_TYPE, DST_TYPE>); \ | ||||
const dim3 block_dim(DIVUP(hw, nr_threads* pack_w), n); \ | |||||
const dim3 block_dim(DIVUP(hw, nr_threads* pack_w), out_n); \ | |||||
const dim3 thread_dim(nr_threads); \ | const dim3 thread_dim(nr_threads); \ | ||||
kern_nchw_nchw4<pack_w, false><<<block_dim, thread_dim, 0, stream>>>( \ | kern_nchw_nchw4<pack_w, false><<<block_dim, thread_dim, 0, stream>>>( \ | ||||
(SRC_C_TYPE*)src.raw_ptr, (DST_C_TYPE*)dst.raw_ptr, c, hw, \ | |||||
n_stride_src, ic_stride, n_stride_dst, \ | |||||
(SRC_C_TYPE*)src.raw_ptr, (DST_C_TYPE*)dst.raw_ptr, in_n, ic, \ | |||||
hw, n_stride_src, ic_stride, n_stride_dst, \ | |||||
CudaPostProcess<SRC_TYPE, DST_TYPE, false>( \ | CudaPostProcess<SRC_TYPE, DST_TYPE, false>( \ | ||||
src_scale, src_zero_point, dst_scale, dst_zero_point), \ | src_scale, src_zero_point, dst_scale, dst_zero_point), \ | ||||
src_zero_point); \ | |||||
src_zero_point, group, ocpg); \ | |||||
} | } | ||||
if (src_layout.dtype.enumv().ev == DTypeEnum::Ev::Uint8 && | if (src_layout.dtype.enumv().ev == DTypeEnum::Ev::Uint8 && | ||||
@@ -340,6 +549,10 @@ void relayout_format::relayout_format_cuda_exec( | |||||
dst_layout.dtype.enumv().ev == DTypeEnum::Ev::QuantizedS8) { | dst_layout.dtype.enumv().ev == DTypeEnum::Ev::QuantizedS8) { | ||||
RUN_KERNEL(same_scale, dtype::QuantizedS8, dtype::QuantizedS8, char, | RUN_KERNEL(same_scale, dtype::QuantizedS8, dtype::QuantizedS8, char, | ||||
char); | char); | ||||
} else if (src_layout.dtype.enumv().ev == DTypeEnum::Ev::QuantizedS32 && | |||||
dst_layout.dtype.enumv().ev == DTypeEnum::Ev::QuantizedS32) { | |||||
RUN_KERNEL(same_scale, dtype::QuantizedS32, dtype::QuantizedS32, int, | |||||
int); | |||||
} else { | } else { | ||||
megdnn_assert(0, "not support dtype %s %s", src_layout.dtype.name(), | megdnn_assert(0, "not support dtype %s %s", src_layout.dtype.name(), | ||||
dst_layout.dtype.name()); | dst_layout.dtype.name()); | ||||
@@ -356,16 +569,65 @@ bool relayout_format::relayout_format_cuda_usable( | |||||
(src_layout.dtype.enumv().ev == DTypeEnum::Ev::Quantized8Asymm && | (src_layout.dtype.enumv().ev == DTypeEnum::Ev::Quantized8Asymm && | ||||
dst_layout.dtype.enumv().ev == DTypeEnum::Ev::QuantizedS8) || | dst_layout.dtype.enumv().ev == DTypeEnum::Ev::QuantizedS8) || | ||||
(src_layout.dtype.enumv().ev == DTypeEnum::Ev::QuantizedS8 && | (src_layout.dtype.enumv().ev == DTypeEnum::Ev::QuantizedS8 && | ||||
dst_layout.dtype.enumv().ev == DTypeEnum::Ev::QuantizedS8); | |||||
dst_layout.dtype.enumv().ev == DTypeEnum::Ev::QuantizedS8) || | |||||
(src_layout.dtype.enumv().ev == DTypeEnum::Ev::QuantizedS32 && | |||||
dst_layout.dtype.enumv().ev == DTypeEnum::Ev::QuantizedS32); | |||||
return is_all_continue && is_all_int8; | return is_all_continue && is_all_int8; | ||||
} | } | ||||
template void relayout_format::relayout_format_cuda_exec<1>( | |||||
void relayout_format::relayout_format_cuda_nchw4_nchw( | |||||
const TensorND& src, const TensorND& dst, const cudaStream_t& stream, | |||||
const int group) { | |||||
constexpr int pack_w = 1; | |||||
const int n = src.layout[0]; | |||||
const int ic = src.layout[1] * 4; | |||||
const int h = src.layout[2]; | |||||
const int w = src.layout[3]; | |||||
const int oc = dst.layout[1]; | |||||
const int hw = h * w; | |||||
int nr_threads = query_blocksize_for_kernel(kern_nchw4_nchw); | |||||
const dim3 block_dim(DIVUP(hw, nr_threads * pack_w), n); | |||||
const dim3 thread_dim(nr_threads); | |||||
kern_nchw4_nchw<<<block_dim, thread_dim, 0, stream>>>( | |||||
(int8_t*)src.raw_ptr, (int8_t*)dst.raw_ptr, n, ic, oc, h, w, group); | |||||
} | |||||
void relayout_format::relayout_format_cuda_nchw_nchw4_weight( | |||||
const TensorND& src, const TensorND& dst, const cudaStream_t& stream) { | |||||
constexpr int pack_c = 4; | |||||
const bool is_group = src.layout.ndim == 5; | |||||
const int group = is_group ? src.layout[0] : 1; | |||||
const int oc = is_group ? src.layout[1] : src.layout[0]; | |||||
const int ic = is_group ? src.layout[2] : src.layout[1]; | |||||
const int kh = is_group ? src.layout[3] : src.layout[2]; | |||||
const int kw = is_group ? src.layout[4] : src.layout[3]; | |||||
const int hw = kh * kw; | |||||
const int oc_round = ROUNDUP(oc, pack_c); | |||||
const int ic_round = ROUNDUP(ic, pack_c); | |||||
const int ic_stride = hw; | |||||
const int oc_stride_src = ic * ic_stride; | |||||
const int oc_stride_dst = ic_round * ic_stride; | |||||
const int group_stride_src = oc * oc_stride_src; | |||||
const int group_stride_dst = oc_round * oc_stride_dst; | |||||
int nr_threads = 32; | |||||
const dim3 block_dim(DIVUP(hw, nr_threads), oc_round, group); | |||||
const dim3 thread_dim(nr_threads); | |||||
kern_nchw_nchw4_weight<<<block_dim, thread_dim, 0, stream>>>( | |||||
(char*)src.raw_ptr, (char*)dst.raw_ptr, oc, ic, hw, oc_stride_src, | |||||
ic_stride, oc_stride_dst, group_stride_src, group_stride_dst, 0, | |||||
{}); | |||||
} | |||||
template void relayout_format::relayout_format_cuda_nchw_nchw4<1>( | |||||
const TensorND& src, const TensorND& dst, const cudaStream_t& stream, | const TensorND& src, const TensorND& dst, const cudaStream_t& stream, | ||||
const float src_scale, const float dst_scale, | const float src_scale, const float dst_scale, | ||||
const uint8_t src_zero_point, const uint8_t dst_zero_point); | |||||
const uint8_t src_zero_point, const uint8_t dst_zero_point, | |||||
const int group); | |||||
template void relayout_format::relayout_format_cuda_exec<4>( | |||||
template void relayout_format::relayout_format_cuda_nchw_nchw4<4>( | |||||
const TensorND& src, const TensorND& dst, const cudaStream_t& stream, | const TensorND& src, const TensorND& dst, const cudaStream_t& stream, | ||||
const float src_scale, const float dst_scale, | const float src_scale, const float dst_scale, | ||||
const uint8_t src_zero_point, const uint8_t dst_zero_point); | |||||
const uint8_t src_zero_point, const uint8_t dst_zero_point, | |||||
const int group); |
@@ -20,16 +20,25 @@ namespace cuda { | |||||
namespace relayout_format { | namespace relayout_format { | ||||
template <int pack_w = 1> | template <int pack_w = 1> | ||||
void relayout_format_cuda_exec(const TensorND& src, const TensorND& dst, | |||||
const cudaStream_t& stream, | |||||
const float src_scale = 1.f, | |||||
const float dst_scale = 1.f, | |||||
const uint8_t src_zero_point = 0, | |||||
const uint8_t dst_zero_point = 0); | |||||
void relayout_format_cuda_nchw_nchw4(const TensorND& src, const TensorND& dst, | |||||
const cudaStream_t& stream, | |||||
const float src_scale = 1.f, | |||||
const float dst_scale = 1.f, | |||||
const uint8_t src_zero_point = 0, | |||||
const uint8_t dst_zero_point = 0, | |||||
const int group = 1); | |||||
bool relayout_format_cuda_usable(const TensorLayout& src_layout, | bool relayout_format_cuda_usable(const TensorLayout& src_layout, | ||||
const TensorLayout& dst_layout); | const TensorLayout& dst_layout); | ||||
void relayout_format_cuda_nchw4_nchw(const TensorND& src, const TensorND& dst, | |||||
const cudaStream_t& stream, | |||||
const int group); | |||||
void relayout_format_cuda_nchw_nchw4_weight(const TensorND& src, | |||||
const TensorND& dst, | |||||
const cudaStream_t& stream); | |||||
} // namespace relayout_format | } // namespace relayout_format | ||||
} // namespace cuda | } // namespace cuda | ||||
} // namespace megdnn | } // namespace megdnn | ||||
@@ -13,6 +13,7 @@ | |||||
#pragma once | #pragma once | ||||
#include "megdnn/basic_types.h" | #include "megdnn/basic_types.h" | ||||
#include "megdnn/oprs.h" | |||||
#include "src/cuda/utils.cuh" | #include "src/cuda/utils.cuh" | ||||
namespace megdnn { | namespace megdnn { | ||||
@@ -23,7 +24,8 @@ struct RelayoutFormatFast { | |||||
static bool usable(const TensorLayout& src_layout, | static bool usable(const TensorLayout& src_layout, | ||||
const TensorLayout& dst_layout); | const TensorLayout& dst_layout); | ||||
static void exec(const TensorND& src, const TensorND& dst, | static void exec(const TensorND& src, const TensorND& dst, | ||||
cudaStream_t stream); | |||||
cudaStream_t stream, RelayoutFormat::Param::Mode mode, | |||||
int group); | |||||
}; | }; | ||||
} // namespace relayout_format | } // namespace relayout_format | ||||
@@ -86,6 +86,7 @@ | |||||
#endif | #endif | ||||
#define DIVUP(x, y) (((x) + (y)-1) / (y)) | #define DIVUP(x, y) (((x) + (y)-1) / (y)) | ||||
#define ROUNDUP(x, y) (DIVUP(x, y) * (y)) | |||||
#define KERN_FOR(i, n) \ | #define KERN_FOR(i, n) \ | ||||
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ | for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ | ||||
@@ -23,6 +23,71 @@ using namespace megdnn; | |||||
using namespace naive; | using namespace naive; | ||||
namespace { | namespace { | ||||
template <typename ctype> | |||||
void recursive_cp(const TensorND& dst, const TensorND& src, size_t idx = 0, | |||||
size_t src_offset = 0, size_t dst_offset = 0) { | |||||
if (idx < (src.layout.ndim - 1)) { | |||||
for (size_t i = 0; i < src.layout[idx]; ++i) { | |||||
recursive_cp<ctype>(dst, src, idx + 1, | |||||
src_offset + i * src.layout.stride[idx], | |||||
dst_offset + i * dst.layout.stride[idx]); | |||||
} | |||||
} else { | |||||
for (size_t i = 0; i < src.layout[idx]; ++i) { | |||||
((ctype*)dst.raw_ptr)[dst_offset + i * dst.layout.stride[idx]] = | |||||
((ctype*)src | |||||
.raw_ptr)[src_offset + i * src.layout.stride[idx]]; | |||||
} | |||||
} | |||||
} | |||||
void padding_to_workspace(_megdnn_tensor_out dst, _megdnn_tensor_in src) { | |||||
switch (src.layout.dtype.enumv()) { | |||||
#define cb(name, ctype) \ | |||||
case (DTypeEnum::name): { \ | |||||
recursive_cp<ctype>(dst, src); \ | |||||
break; \ | |||||
} | |||||
cb(Float32, dt_float32); | |||||
cb(Int32, dt_int32); | |||||
cb(QuantizedS32, dt_int32); | |||||
cb(QuantizedS8, dt_qint8); | |||||
default: | |||||
megdnn_assert(0, "unsupported dtype %s", src.layout.dtype.name()); | |||||
#undef cb | |||||
} | |||||
} | |||||
void extract_from_workspace(_megdnn_tensor_out dst, _megdnn_tensor_in src, | |||||
size_t group) { | |||||
megdnn_assert(dst.layout.is_contiguous() && src.layout.is_contiguous(), | |||||
"dst %s, src %s", dst.layout.to_string().c_str(), | |||||
src.layout.to_string().c_str()); | |||||
const size_t type_size = dst.layout.dtype.size(); | |||||
const size_t n = dst.layout[0]; | |||||
const size_t n_stride_dst = dst.layout.stride[0]; | |||||
const size_t n_stride_src = src.layout.stride[0]; | |||||
const size_t ocpg = dst.layout[1] / group; | |||||
const size_t icpg = src.layout[1] / group; | |||||
const size_t dst_hw = dst.layout[2] * dst.layout[3]; | |||||
const size_t src_hw = src.layout[2] * src.layout[3]; | |||||
megdnn_assert(dst_hw == src_hw); | |||||
for (size_t nid = 0; nid < n; ++nid) { | |||||
const size_t n_offset_dst = nid * n_stride_dst * type_size; | |||||
const size_t n_offset_src = nid * n_stride_src * type_size; | |||||
for (size_t gid = 0; gid < group; ++gid) { | |||||
memcpy((char*)dst.raw_ptr + n_offset_dst + | |||||
gid * ocpg * dst_hw * type_size, | |||||
(char*)src.raw_ptr + n_offset_src + | |||||
gid * icpg * src_hw * type_size, | |||||
ocpg * dst_hw * type_size); | |||||
} | |||||
} | |||||
} | |||||
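Editor's note to make the intent of padding_to_workspace and extract_from_workspace concrete: with group > 1, each group's channels are zero-padded up to a multiple of 4 in the workspace before the relayout, and the reverse direction copies only the valid channels of each group back out. A minimal sketch of the padding step in plain C++; the contiguous-NCHW assumption and the function name are mine, not the patch's:

#include <cstddef>
#include <vector>
using std::size_t;

// Pad each group's channel block from cpg to round_up(cpg, 4), filling the
// extra channels with zeros (src is a contiguous N x C x H x W buffer).
std::vector<float> pad_channels_per_group(const std::vector<float>& src,
                                          size_t n, size_t c, size_t hw,
                                          size_t group) {
    size_t cpg = c / group;              // channels per group
    size_t cpg_pad = (cpg + 3) / 4 * 4;  // padded to a multiple of 4
    std::vector<float> dst(n * group * cpg_pad * hw, 0.f);
    for (size_t ni = 0; ni < n; ++ni)
        for (size_t g = 0; g < group; ++g)
            for (size_t ci = 0; ci < cpg; ++ci)
                for (size_t i = 0; i < hw; ++i)
                    dst[((ni * group + g) * cpg_pad + ci) * hw + i] =
                            src[((ni * group + g) * cpg + ci) * hw + i];
    return dst;
}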
template <typename dtype> | template <typename dtype> | ||||
void padding_src_to_workspace(dtype* dptr, const dtype* sptr, size_t N, | void padding_src_to_workspace(dtype* dptr, const dtype* sptr, size_t N, | ||||
size_t IC, size_t IH, size_t IW) { | size_t IC, size_t IH, size_t IW) { | ||||
@@ -86,6 +151,8 @@ void padding_to_workspace(_megdnn_tensor_out dst, _megdnn_tensor_in src, | |||||
} | } | ||||
cb(Float32, dt_float32); | cb(Float32, dt_float32); | ||||
cb(Int32, dt_int32); | |||||
cb(QuantizedS32, dt_int32); | |||||
cb(QuantizedS8, dt_qint8); | cb(QuantizedS8, dt_qint8); | ||||
case (DTypeEnum::Quantized8Asymm): { | case (DTypeEnum::Quantized8Asymm): { | ||||
@@ -159,6 +226,19 @@ void do_copy_diff_q8_q8(const TensorND& dst, const TensorND& src) { | |||||
++isrc; | ++isrc; | ||||
} | } | ||||
} | } | ||||
void do_copy_diff_q32_q32(const TensorND& dst, const TensorND& src) { | |||||
auto isrc = tensor_iter_valonly<DTypeTrait<dtype::QuantizedS32>::ctype>(src) | |||||
.begin(); | |||||
auto idst = tensor_iter_valonly<DTypeTrait<dtype::QuantizedS32>::ctype>(dst) | |||||
.begin(); | |||||
auto src_dt_parm = src.layout.dtype.param<dtype::QuantizedS32>(); | |||||
auto dst_dt_parm = dst.layout.dtype.param<dtype::QuantizedS32>(); | |||||
for (size_t i = 0, it = dst.layout.total_nr_elems(); i < it; ++i) { | |||||
*idst = dst_dt_parm.quantize(src_dt_parm.dequantize(*isrc)); | |||||
++idst; | |||||
++isrc; | |||||
} | |||||
} | |||||
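Editor's note: do_copy_diff_q32_q32 mirrors the existing q8 variants, dequantizing every element with the source scale and requantizing it with the destination scale. A scalar sketch of that round trip, assuming plain symmetric QuantizedS32 parameters (no zero point, no saturation handling):

#include <cmath>
#include <cstdint>

int32_t requantize_s32(int32_t src_q, float src_scale, float dst_scale) {
    float real = src_q * src_scale;                             // dequantize
    return static_cast<int32_t>(std::round(real / dst_scale));  // quantize
}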
void do_copy_diff_u8_q8(const TensorND& dst, const TensorND& src) { | void do_copy_diff_u8_q8(const TensorND& dst, const TensorND& src) { | ||||
auto isrc = | auto isrc = | ||||
@@ -263,12 +343,34 @@ size_t RelayoutFormatImpl::get_workspace_in_bytes(const TensorLayout& src, | |||||
return n * c * h * w * src.dtype.size(); | return n * c * h * w * src.dtype.size(); | ||||
} | } | ||||
case Param::Mode::NCHW_NCHW4: { | case Param::Mode::NCHW_NCHW4: { | ||||
size_t group = param().group; | |||||
size_t n = src[0]; | size_t n = src[0]; | ||||
size_t c = round_up(src[1], 4_z); | |||||
size_t c = group * round_up(src[1] / group, 4_z); | |||||
size_t h = src[2]; | size_t h = src[2]; | ||||
size_t w = src[3]; | size_t w = src[3]; | ||||
return n * c * h * w * src.dtype.size(); | return n * c * h * w * src.dtype.size(); | ||||
} | } | ||||
case Param::Mode::NCHW4_NCHW: { | |||||
return src.total_nr_elems() * src.dtype.size(); | |||||
} | |||||
case Param::Mode::NCHW_NCHW4_WEIGHT: { | |||||
if (src.ndim == 4) { | |||||
size_t oc = round_up(src[0], 4_z); | |||||
size_t ic = round_up(src[1], 4_z); | |||||
size_t h = src[2]; | |||||
size_t w = src[3]; | |||||
return oc * ic * h * w * src.dtype.size(); | |||||
} else if (src.ndim == 5) { | |||||
size_t group = src[0]; | |||||
size_t oc = round_up(src[1], 4_z); | |||||
size_t ic = round_up(src[2], 4_z); | |||||
size_t h = src[3]; | |||||
size_t w = src[4]; | |||||
return group * oc * ic * h * w * src.dtype.size(); | |||||
} else { | |||||
megdnn_throw("unsupported src ndim for NCHW_NCHW4_WEIGHT"); | |||||
} | |||||
} | |||||
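Editor's note: the workspace sizes above follow directly from the padded shapes: NCHW_NCHW4 pads the per-group channel count to a multiple of 4, NCHW4_NCHW only needs room for a contiguous copy of the source, and NCHW_NCHW4_WEIGHT pads both oc and ic. The same arithmetic in element counts (multiply by the dtype size for bytes; the helper names are illustrative):

#include <cstddef>
using std::size_t;

constexpr size_t round_up4(size_t x) { return (x + 3) / 4 * 4; }

size_t nchw_nchw4_ws_elems(size_t n, size_t c, size_t h, size_t w, size_t group) {
    return n * group * round_up4(c / group) * h * w;
}

size_t nchw_nchw4_weight_ws_elems(size_t oc, size_t ic, size_t h, size_t w,
                                  size_t group = 1) {
    return group * round_up4(oc) * round_up4(ic) * h * w;
}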
case Param::Mode::NCHW_NCHW4_IC_SMALL_CONV_DENSE_WEIGHT: { | case Param::Mode::NCHW_NCHW4_IC_SMALL_CONV_DENSE_WEIGHT: { | ||||
megdnn_assert(src.ndim == 4, "src must be oihw, ndim == 5"); | megdnn_assert(src.ndim == 4, "src must be oihw, ndim == 5"); | ||||
if (src[1] % 4 == 0) | if (src[1] % 4 == 0) | ||||
@@ -288,13 +390,15 @@ size_t RelayoutFormatImpl::get_workspace_in_bytes(const TensorLayout& src, | |||||
void RelayoutFormatImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, | void RelayoutFormatImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, | ||||
_megdnn_workspace workspace) { | _megdnn_workspace workspace) { | ||||
megdnn_assert(src.layout.dtype.category() == DTypeCategory::FLOAT || | megdnn_assert(src.layout.dtype.category() == DTypeCategory::FLOAT || | ||||
src.layout.dtype.enumv() == DTypeEnum::Int32 || | |||||
(src.layout.dtype.enumv() == DTypeEnum::Uint8 && | (src.layout.dtype.enumv() == DTypeEnum::Uint8 && | ||||
dst.layout.dtype.enumv() == DTypeEnum::QuantizedS8) || | dst.layout.dtype.enumv() == DTypeEnum::QuantizedS8) || | ||||
src.layout.dtype.category() == DTypeCategory::QUANTIZED); | src.layout.dtype.category() == DTypeCategory::QUANTIZED); | ||||
check_exec(src.layout, dst.layout, workspace.size); | check_exec(src.layout, dst.layout, workspace.size); | ||||
HandleImpl* m_handle = static_cast<HandleImpl*>(handle()); | HandleImpl* m_handle = static_cast<HandleImpl*>(handle()); | ||||
TensorLayout exec_src, exec_dst; | |||||
deduce_exec_layout(src.layout, dst.layout, exec_src, exec_dst); | |||||
TensorLayout exec_src, exec_dst, exec_workspace; | |||||
deduce_exec_layout(src.layout, dst.layout, exec_workspace, exec_src, | |||||
exec_dst); | |||||
TensorND exec_src_nd{src.raw_ptr, exec_src}; | TensorND exec_src_nd{src.raw_ptr, exec_src}; | ||||
TensorND exec_dst_nd{dst.raw_ptr, exec_dst}; | TensorND exec_dst_nd{dst.raw_ptr, exec_dst}; | ||||
// clean dst | // clean dst | ||||
@@ -371,6 +475,19 @@ void RelayoutFormatImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, | |||||
} \ | } \ | ||||
} \ | } \ | ||||
MIDOUT_END(); | MIDOUT_END(); | ||||
#define cb2(_idx, _pack_size, _mode, _src_layout, _workspace_layout) \ | |||||
MIDOUT_BEGIN(megdnn_naive_relayout_format, \ | |||||
midout_iv(Param::Mode::_mode)) { \ | |||||
size_t val = _src_layout[_idx]; \ | |||||
if (val % _pack_size != 0) { \ | |||||
memset(workspace.raw_ptr, 0, exec_src.span().dist_byte()); \ | |||||
padding_to_workspace({workspace.raw_ptr, _workspace_layout}, \ | |||||
{src.raw_ptr, _src_layout}); \ | |||||
exec_src_nd.raw_ptr = workspace.raw_ptr; \ | |||||
} \ | |||||
} \ | |||||
MIDOUT_END(); | |||||
cb(1, 8, NCHW_NCHW88); | cb(1, 8, NCHW_NCHW88); | ||||
} else if (param().mode == Param::Mode::NCHW_NCHW88_CONV_DENSE_WEIGHT) { | } else if (param().mode == Param::Mode::NCHW_NCHW88_CONV_DENSE_WEIGHT) { | ||||
@@ -384,10 +501,62 @@ void RelayoutFormatImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, | |||||
} else if (param().mode == Param::Mode::NCHW_NCHW4_IC_SMALL) { | } else if (param().mode == Param::Mode::NCHW_NCHW4_IC_SMALL) { | ||||
cb(1, 4, NCHW_NCHW4_IC_SMALL); | cb(1, 4, NCHW_NCHW4_IC_SMALL); | ||||
} else if (param().mode == Param::Mode::NCHW_NCHW4) { | } else if (param().mode == Param::Mode::NCHW_NCHW4) { | ||||
cb(1, 4, NCHW_NCHW4); | |||||
if (param().group == 1) { | |||||
cb(1, 4, NCHW_NCHW4); | |||||
} else { | |||||
TensorLayout group_src_layout{{src.layout[0], param().group, | |||||
src.layout[1] / param().group, | |||||
src.layout[2], src.layout[3]}, | |||||
src.layout.dtype, | |||||
src.layout.format}; | |||||
TensorLayout workspace_layout{ | |||||
{src.layout[0], param().group, | |||||
div_ceil(src.layout[1] / param().group, 4_z) * 4_z, | |||||
src.layout[2], src.layout[3]}, | |||||
src.layout.dtype, | |||||
src.layout.format}; | |||||
cb2(2, 4, NCHW_NCHW4, group_src_layout, workspace_layout); | |||||
} | |||||
} else if (param().mode == | } else if (param().mode == | ||||
Param::Mode::NCHW_NCHW4_IC_SMALL_CONV_DENSE_WEIGHT) { | Param::Mode::NCHW_NCHW4_IC_SMALL_CONV_DENSE_WEIGHT) { | ||||
cb(1, 4, NCHW_NCHW4_IC_SMALL_CONV_DENSE_WEIGHT); | cb(1, 4, NCHW_NCHW4_IC_SMALL_CONV_DENSE_WEIGHT); | ||||
} else if (param().mode == Param::Mode::NCHW_NCHW4_WEIGHT) { | |||||
#undef cb | |||||
#define cb(_idx0, _idx1, _pack_size, _mode) \ | |||||
MIDOUT_BEGIN(megdnn_naive_relayout_format, \ | |||||
midout_iv(Param::Mode::_mode)) { \ | |||||
size_t val0 = src.layout[_idx0]; \ | |||||
size_t val1 = src.layout[_idx1]; \ | |||||
if (val0 % _pack_size != 0 || val1 % _pack_size != 0) { \ | |||||
memset(workspace.raw_ptr, 0, exec_src.span().dist_byte()); \ | |||||
padding_to_workspace({workspace.raw_ptr, exec_workspace}, src); \ | |||||
exec_src_nd.raw_ptr = workspace.raw_ptr; \ | |||||
} \ | |||||
} \ | |||||
MIDOUT_END(); | |||||
if (src.layout.ndim == 4) { | |||||
cb(0, 1, 4, NCHW_NCHW4_WEIGHT); | |||||
} else if (src.layout.ndim == 5) { | |||||
cb(1, 2, 4, NCHW_NCHW4_WEIGHT); | |||||
} | |||||
} else if (param().mode == Param::Mode::NCHW4_NCHW) { | |||||
if (exec_workspace.total_nr_elems() == dst.layout.total_nr_elems()) { | |||||
m_handle->relayout_opr()->exec( | |||||
exec_src_nd, {dst.raw_ptr, exec_workspace}, handle()); | |||||
return; | |||||
} else { | |||||
m_handle->relayout_opr()->exec( | |||||
exec_src_nd, {workspace.raw_ptr, exec_workspace}, handle()); | |||||
TensorLayout workspace_layout{{src.layout[0], src.layout[1] * 4, | |||||
src.layout[2], src.layout[3]}, | |||||
src.layout.dtype, | |||||
src.layout.format}; | |||||
extract_from_workspace(exec_dst_nd, | |||||
{workspace.raw_ptr, workspace_layout}, | |||||
param().group); | |||||
return; | |||||
} | |||||
} | } | ||||
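Editor's note: the NCHW4_NCHW branch above works in two steps when the padded channel count differs from the requested output: a plain relayout into the workspace with all padded channels, then extract_from_workspace keeps only the leading ocpg channels of each group. The relayout itself maps channel index c_outer * 4 + c_inner; a standalone sketch of that mapping, ignoring the oc/group trimming:

#include <cstddef>
#include <vector>
using std::size_t;

// src is a contiguous {n, c_outer, h, w, 4} buffer; dst is {n, c_outer*4, h, w}.
std::vector<float> nchw4_to_nchw(const std::vector<float>& src, size_t n,
                                 size_t c_outer, size_t h, size_t w) {
    std::vector<float> dst(n * c_outer * 4 * h * w);
    for (size_t ni = 0; ni < n; ++ni)
        for (size_t co = 0; co < c_outer; ++co)
            for (size_t hi = 0; hi < h; ++hi)
                for (size_t wi = 0; wi < w; ++wi)
                    for (size_t ci = 0; ci < 4; ++ci)
                        dst[((ni * c_outer * 4 + co * 4 + ci) * h + hi) * w + wi] =
                                src[(((ni * c_outer + co) * h + hi) * w + wi) * 4 + ci];
    return dst;
}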
if (src.layout.dtype.enumv() == DTypeEnum::Quantized8Asymm && | if (src.layout.dtype.enumv() == DTypeEnum::Quantized8Asymm && | ||||
@@ -417,6 +586,15 @@ void RelayoutFormatImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, | |||||
}; | }; | ||||
MEGDNN_DISPATCH_CPU_KERN_OPR(func(dst0, src0)); | MEGDNN_DISPATCH_CPU_KERN_OPR(func(dst0, src0)); | ||||
return; | return; | ||||
} else if (src.layout.dtype.enumv() == DTypeEnum::QuantizedS32 && | |||||
dst.layout.dtype.enumv() == DTypeEnum::QuantizedS32) { | |||||
TensorND src0 = exec_src_nd, dst0 = exec_dst_nd; | |||||
check_layout_and_canonize(src0.layout, src0.layout); | |||||
auto func = [](const TensorND& dst, const TensorND& src) { | |||||
do_copy_diff_q32_q32(dst, src); | |||||
}; | |||||
MEGDNN_DISPATCH_CPU_KERN_OPR(func(dst0, src0)); | |||||
return; | |||||
} else { | } else { | ||||
m_handle->relayout_opr()->exec(exec_src_nd, exec_dst_nd, handle()); | m_handle->relayout_opr()->exec(exec_src_nd, exec_dst_nd, handle()); | ||||
} | } | ||||
@@ -215,8 +215,7 @@ TEST_F(CUDA, CONV_BIAS_FORWARD_QS8) { | |||||
.execs({src_shape, filter_shape, bias_shape, {}, {}}); | .execs({src_shape, filter_shape, bias_shape, {}, {}}); | ||||
} | } | ||||
} | } | ||||
//! close for cu111 ci, reopen it when bug fixed | |||||
#if CUDA_VERSION < 11000 | |||||
TEST_F(CUDA, CONV_BIAS_NCHW_QS8) { | TEST_F(CUDA, CONV_BIAS_NCHW_QS8) { | ||||
//! not support NonlineMode::SIGMOID and NonlineMode::H_SWISH | //! not support NonlineMode::SIGMOID and NonlineMode::H_SWISH | ||||
require_compute_capability(6, 1); | require_compute_capability(6, 1); | ||||
@@ -274,8 +273,97 @@ TEST_F(CUDA, CONV_BIAS_NCHW_QS8) { | |||||
} | } | ||||
} | } | ||||
} | } | ||||
for (NonlineMode mode : {NonlineMode::RELU, | |||||
NonlineMode::IDENTITY, NonlineMode::H_SWISH}) { | |||||
for (size_t g : {13}) { | |||||
for (size_t b : {1, 2}) { | |||||
for (size_t ic : {13}) { | |||||
for (size_t oc : {13}) { | |||||
for (size_t fh : {1, 3}) { | |||||
for (int ph : {static_cast<int>(fh / 2)}) { | |||||
for (int sh : {1, 2}) { | |||||
size_t ih = 16, iw = 16; | |||||
param.nonlineMode = mode; | |||||
param.stride_h = param.stride_w = sh; | |||||
param.pad_h = param.pad_w = ph; | |||||
param.sparse = | |||||
ConvBias::Param::Sparse::GROUP; | |||||
checker.set_param(param) | |||||
.execs({{b, ic, ih, iw}, | |||||
{g, oc/g, ic/g, fh, fh}, | |||||
{1, oc, 1, 1}, | |||||
{}, | |||||
{}}); | |||||
} | |||||
} | |||||
} | |||||
} | |||||
} | |||||
} | |||||
} | |||||
} | |||||
{ | |||||
size_t ih = 16, iw = 16, b = 1, oc = 14, ic = 14; | |||||
size_t fh = 3, sh = 1, ph = 1; | |||||
param.nonlineMode = NonlineMode::IDENTITY; | |||||
param.stride_h = param.stride_w = sh; | |||||
param.pad_h = param.pad_w = ph; | |||||
param.sparse = ConvBias::Param::Sparse::DENSE; | |||||
checker.set_param(param).execs( | |||||
{{b, ic, ih, iw}, {oc, ic, fh, fh}, {}, {}, {}}); | |||||
} | |||||
} | } | ||||
#endif | |||||
TEST_F(CUDA, CONV_BIAS_NCHW_QS8_FUSE_Z) { | |||||
require_compute_capability(6, 1); | |||||
Checker<ConvBiasForward> checker(handle_cuda()); | |||||
UniformIntRNG int_rng{-128, 127}; | |||||
using NonlineMode = ConvBias::Param::NonlineMode; | |||||
ConvBias::Param param; | |||||
param.format = ConvBias::Param::Format::NCHW; | |||||
checker.set_dtype(0, dtype::QuantizedS8(2.5f)) | |||||
.set_dtype(1, dtype::QuantizedS8(2.5f)) | |||||
.set_dtype(2, dtype::QuantizedS32(6.25f)) | |||||
.set_dtype(3, dtype::QuantizedS8(0.25f)) | |||||
.set_dtype(4, dtype::QuantizedS8(0.25f)) | |||||
.set_rng(0, &int_rng) | |||||
.set_rng(1, &int_rng) | |||||
.set_rng(2, &int_rng) | |||||
.set_rng(3, &int_rng); | |||||
for (NonlineMode mode : | |||||
{NonlineMode::RELU, NonlineMode::IDENTITY, NonlineMode::H_SWISH}) { | |||||
for (size_t b : {2}) { | |||||
for (size_t ic : {6, 16}) { | |||||
for (size_t oc : {4}) { | |||||
for (size_t fh : {1, 3}) { | |||||
for (int ph : {static_cast<int>(fh / 2)}) { | |||||
for (int sh : {1, 2}) { | |||||
size_t ih = 16, iw = 16; | |||||
param.nonlineMode = mode; | |||||
param.stride_h = param.stride_w = sh; | |||||
param.pad_h = param.pad_w = ph; | |||||
param.sparse = ConvBias::Param::Sparse::DENSE; | |||||
const size_t oh = (ih - fh + 2 * ph) / sh + 1; | |||||
const size_t ow = (iw - fh + 2 * ph) / sh + 1; | |||||
checker.set_param(param).execs( | |||||
{{b, ic, ih, iw}, | |||||
{oc, ic, fh, fh}, | |||||
{1, oc, 1, 1}, | |||||
{b, oc, oh, ow}, | |||||
{}}); | |||||
} | |||||
} | |||||
} | |||||
} | |||||
} | |||||
} | |||||
} | |||||
} | |||||
#if MEGDNN_WITH_BENCHMARK | #if MEGDNN_WITH_BENCHMARK | ||||
TEST_F(CUDA, BENCHMARK_CONV_BIAS_NCHW4_INT8) { | TEST_F(CUDA, BENCHMARK_CONV_BIAS_NCHW4_INT8) { | ||||
require_compute_capability(6, 1); | require_compute_capability(6, 1); | ||||
@@ -34,9 +34,43 @@ TEST_F(CUDA, RELAYOUT_FORMAT) { | |||||
checker.execs({{22, 23, 24, 25, 4}, {}}); | checker.execs({{22, 23, 24, 25, 4}, {}}); | ||||
} | } | ||||
TEST_F(CUDA, RELAYOUT_FORMAT_NCHW4_NCHW) { | |||||
Checker<RelayoutFormat> checker(handle_cuda()); | |||||
UniformIntRNG rng{-50, 50}; | |||||
param::RelayoutFormat param; | |||||
param.mode = param::RelayoutFormat::Mode::NCHW4_NCHW; | |||||
checker.set_dtype(0, dtype::QuantizedS8{0.1f}) | |||||
.set_dtype(1, dtype::QuantizedS8{0.1f}) | |||||
.set_rng(0, &rng) | |||||
.set_param(param) | |||||
.execs({{1, 1, 2, 2, 4}, {}}); | |||||
checker.set_dtype(0, dtype::QuantizedS8{0.1f}) | |||||
.set_dtype(1, dtype::QuantizedS8{0.1f}) | |||||
.set_rng(0, &rng) | |||||
.set_param(param) | |||||
.execs({{22, 23, 24, 25, 4}, {}}); | |||||
param.oc = 90; | |||||
checker.set_dtype(0, dtype::QuantizedS8{0.1f}) | |||||
.set_dtype(1, dtype::QuantizedS8{0.1f}) | |||||
.set_rng(0, &rng) | |||||
.set_param(param) | |||||
.execs({{22, 23, 24, 25, 4}, {}}); | |||||
param.oc = 16; | |||||
param.group = 8; | |||||
checker.set_dtype(0, dtype::QuantizedS8{0.1f}) | |||||
.set_dtype(1, dtype::QuantizedS8{0.1f}) | |||||
.set_rng(0, &rng) | |||||
.set_param(param) | |||||
.execs({{11, 16, 22, 33, 4}, {}}); | |||||
} | |||||
TEST_F(CUDA, RELAYOUT_FORMAT_NCHW_NCHW4) { | TEST_F(CUDA, RELAYOUT_FORMAT_NCHW_NCHW4) { | ||||
Checker<RelayoutFormat> checker(handle_cuda()); | Checker<RelayoutFormat> checker(handle_cuda()); | ||||
UniformIntRNG rng{0, 50}; | |||||
UniformIntRNG rng{-50, 50}; | |||||
param::RelayoutFormat param; | param::RelayoutFormat param; | ||||
param.mode = param::RelayoutFormat::Mode::NCHW_NCHW4; | param.mode = param::RelayoutFormat::Mode::NCHW_NCHW4; | ||||
@@ -55,6 +89,12 @@ TEST_F(CUDA, RELAYOUT_FORMAT_NCHW_NCHW4) { | |||||
.set_rng(0, &rng) | .set_rng(0, &rng) | ||||
.set_param(param) | .set_param(param) | ||||
.execs({{n, c, h, w}, {}}); | .execs({{n, c, h, w}, {}}); | ||||
checker.set_dtype(0, dtype::QuantizedS32{1.f}) | |||||
.set_dtype(1, dtype::QuantizedS32{1.f}) | |||||
.set_rng(0, &rng) | |||||
.set_param(param) | |||||
.execs({{n, c, h, w}, {}}); | |||||
} | } | ||||
} | } | ||||
} | } | ||||
@@ -77,6 +117,59 @@ TEST_F(CUDA, RELAYOUT_FORMAT_NCHW_NCHW4) { | |||||
.set_rng(0, &rng) | .set_rng(0, &rng) | ||||
.set_param(param) | .set_param(param) | ||||
.execs({{1, 6, 768, 1280}, {}}); | .execs({{1, 6, 768, 1280}, {}}); | ||||
param.group = 2; | |||||
checker.set_dtype(0, dtype::QuantizedS8{1.f}) | |||||
.set_dtype(1, dtype::QuantizedS8{1.f}) | |||||
.set_rng(0, &rng) | |||||
.set_param(param) | |||||
.execs({{8, 6, 300, 300}, {}}); | |||||
param.group = 3; | |||||
checker.set_dtype(0, dtype::QuantizedS8{1.f}) | |||||
.set_dtype(1, dtype::QuantizedS8{1.f}) | |||||
.set_rng(0, &rng) | |||||
.set_param(param) | |||||
.execs({{8, 6, 300, 300}, {}}); | |||||
} | |||||
TEST_F(CUDA, RELAYOUT_FORMAT_NCHW_NCHW4_WEIGHT) { | |||||
Checker<RelayoutFormat> checker(handle_cuda()); | |||||
UniformIntRNG rng{-50, 50}; | |||||
param::RelayoutFormat param; | |||||
param.mode = param::RelayoutFormat::Mode::NCHW_NCHW4_WEIGHT; | |||||
for (size_t oc : {1, 3, 4, 16, 33}) { | |||||
for (size_t ic : {1, 2, 3, 4, 8, 9, 11, 16, 33}) { | |||||
for (size_t h : {3, 5, 7}) { | |||||
for (size_t w : {3, 5, 7}) { | |||||
checker.set_dtype(0, dtype::QuantizedS8{1.f}) | |||||
.set_dtype(1, dtype::QuantizedS8{1.f}) | |||||
.set_rng(0, &rng) | |||||
.set_param(param) | |||||
.execs({{oc, ic, h, w}, {}}); | |||||
} | |||||
} | |||||
} | |||||
} | |||||
checker.set_dtype(0, dtype::QuantizedS8{1.f}) | |||||
.set_dtype(1, dtype::QuantizedS8{1.f}) | |||||
.set_rng(0, &rng) | |||||
.set_param(param) | |||||
.execs({{13, 13, 5, 5}, {}}); | |||||
checker.set_dtype(0, dtype::QuantizedS8{1.f}) | |||||
.set_dtype(1, dtype::QuantizedS8{1.f}) | |||||
.set_rng(0, &rng) | |||||
.set_param(param) | |||||
.execs({{4, 16, 16, 3, 3}, {}}); | |||||
checker.set_dtype(0, dtype::QuantizedS8{1.f}) | |||||
.set_dtype(1, dtype::QuantizedS8{1.f}) | |||||
.set_rng(0, &rng) | |||||
.set_param(param) | |||||
.execs({{4, 13, 11, 3, 3}, {}}); | |||||
} | } | ||||
TEST_F(CUDA, RELAYOUT_FORMAT_NCHW_NCHW4_DEFAULT) { | TEST_F(CUDA, RELAYOUT_FORMAT_NCHW_NCHW4_DEFAULT) { | ||||
@@ -6,7 +6,8 @@ | |||||
* | * | ||||
* Unless required by applicable law or agreed to in writing, | * Unless required by applicable law or agreed to in writing, | ||||
* software distributed under the License is distributed on an | * software distributed under the License is distributed on an | ||||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
* implied. | |||||
*/ | */ | ||||
#include "test/naive/fixture.h" | #include "test/naive/fixture.h" | ||||
@@ -17,6 +18,136 @@ | |||||
using namespace megdnn; | using namespace megdnn; | ||||
using namespace test; | using namespace test; | ||||
TEST_F(NAIVE, RELAYOUT_FORMAT_NCHW4_NCHW) { | |||||
Checker<RelayoutFormat> checker(handle(), /* check_dispatch */ false); | |||||
{ | |||||
auto tensor_nchw4 = TensorValue( | |||||
{1, 2, 1, 2, 4}, dtype::Float32(), | |||||
{1, 3, 5, 7, 2, 4, 6, 8, 9, 11, 13, 15, 10, 12, 14, 16}); | |||||
auto tensor_nchw = TensorValue( | |||||
{1, 8, 1, 2}, dtype::Float32(), | |||||
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); | |||||
RelayoutFormat::Param param{RelayoutFormat::Param::Mode::NCHW4_NCHW}; | |||||
checker.set_param(param).exect(Testcase{tensor_nchw4, {}}, | |||||
Testcase{{}, tensor_nchw}); | |||||
} | |||||
{ | |||||
auto tensor_nchw4 = TensorValue( | |||||
{1, 2, 1, 2, 4}, dtype::Float32(), | |||||
{1, 3, 5, 7, 2, 4, 6, 8, 9, 11, 13, 15, 10, 12, 14, 16}); | |||||
auto tensor_nchw = | |||||
TensorValue({1, 7, 1, 2}, dtype::Float32(), | |||||
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}); | |||||
RelayoutFormat::Param param{RelayoutFormat::Param::Mode::NCHW4_NCHW}; | |||||
param.oc = 7; | |||||
checker.set_param(param).exect(Testcase{tensor_nchw4, {}}, | |||||
Testcase{{}, tensor_nchw}); | |||||
} | |||||
{ | |||||
auto tensor_nchw4 = TensorValue( | |||||
{1, 2, 1, 2, 4}, dtype::Float32(), | |||||
{1, 3, 5, 7, 2, 4, 6, 8, 9, 11, 13, 15, 10, 12, 14, 16}); | |||||
auto tensor_nchw = | |||||
TensorValue({1, 6, 1, 2}, dtype::Float32(), | |||||
{1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 13, 14}); | |||||
RelayoutFormat::Param param{RelayoutFormat::Param::Mode::NCHW4_NCHW}; | |||||
param.oc = 6; | |||||
param.group = 2; | |||||
checker.set_param(param).exect(Testcase{tensor_nchw4, {}}, | |||||
Testcase{{}, tensor_nchw}); | |||||
} | |||||
} | |||||
TEST_F(NAIVE, RELAYOUT_FORMAT_NCHW_NCHW4_WEIGHT) { | |||||
Checker<RelayoutFormat> checker(handle(), /* check_dispatch */ false); | |||||
{ | |||||
auto tensor_nchw = TensorValue({2, 2, 2, 2}, dtype::Float32(), | |||||
{1, 2, 3, 4, 5, 6, 7, 8, | |||||
9, 10, 11, 12, 13, 14, 15, 16}); | |||||
auto tensor_nchw4 = TensorValue( | |||||
{4, 1, 2, 2, 4}, dtype::Float32(), | |||||
{1, 5, 0, 0, 2, 6, 0, 0, 3, 7, 0, 0, 4, 8, 0, 0, | |||||
9, 13, 0, 0, 10, 14, 0, 0, 11, 15, 0, 0, 12, 16, 0, 0, | |||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}); | |||||
RelayoutFormat::Param param{ | |||||
RelayoutFormat::Param::Mode::NCHW_NCHW4_WEIGHT}; | |||||
checker.set_param(param).exect(Testcase{tensor_nchw, {}}, | |||||
Testcase{{}, tensor_nchw4}); | |||||
} | |||||
{ | |||||
auto tensor_nchw = TensorValue({2, 2, 1, 2, 2}, dtype::Float32(), | |||||
{1, 2, 3, 4, 5, 6, 7, 8, | |||||
9, 10, 11, 12, 13, 14, 15, 16}); | |||||
auto tensor_nchw4 = TensorValue( | |||||
{2, 4, 1, 2, 2, 4}, dtype::Float32(), | |||||
{1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, | |||||
0, 6, 0, 0, 0, 7, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||||
0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 10, 0, 0, 0, 11, 0, 0, 0, | |||||
12, 0, 0, 0, 13, 0, 0, 0, 14, 0, 0, 0, 15, 0, 0, 0, 16, 0, 0, | |||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}); | |||||
RelayoutFormat::Param param{ | |||||
RelayoutFormat::Param::Mode::NCHW_NCHW4_WEIGHT}; | |||||
checker.set_param(param).exect(Testcase{tensor_nchw, {}}, | |||||
Testcase{{}, tensor_nchw4}); | |||||
} | |||||
} | |||||
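Editor's note: the expected tensors in the weight test above encode the mapping dst[o][ic / 4][h][w][ic % 4] = src[o][ic][h][w], with both oc and ic zero-padded up to multiples of 4 (the 5-dim case adds a leading group dimension). A minimal sketch of the dense 4-dim case over a contiguous float buffer; the function name is illustrative:

#include <cstddef>
#include <vector>
using std::size_t;

std::vector<float> oihw_to_nchw4_weight(const std::vector<float>& src, size_t OC,
                                        size_t IC, size_t H, size_t W) {
    size_t oc_pad = (OC + 3) / 4 * 4, ic_pad = (IC + 3) / 4 * 4;
    std::vector<float> dst(oc_pad * (ic_pad / 4) * H * W * 4, 0.f);  // zero padding
    for (size_t o = 0; o < OC; ++o)
        for (size_t ic = 0; ic < IC; ++ic)
            for (size_t h = 0; h < H; ++h)
                for (size_t w = 0; w < W; ++w)
                    dst[(((o * (ic_pad / 4) + ic / 4) * H + h) * W + w) * 4 + ic % 4] =
                            src[((o * IC + ic) * H + h) * W + w];
    return dst;
}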
TEST_F(NAIVE, RELAYOUT_FORMAT_NCHW_NCHW4) { | |||||
Checker<RelayoutFormat> checker(handle(), /* check_dispatch */ false); | |||||
{ | |||||
auto tensor_nchw = TensorValue( | |||||
{1, 8, 1, 2}, dtype::Float32(), | |||||
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); | |||||
auto tensor_nchw4 = TensorValue( | |||||
{1, 2, 1, 2, 4}, dtype::Float32(), | |||||
{1, 3, 5, 7, 2, 4, 6, 8, 9, 11, 13, 15, 10, 12, 14, 16}); | |||||
RelayoutFormat::Param param{RelayoutFormat::Param::Mode::NCHW_NCHW4}; | |||||
checker.set_param(param).exect(Testcase{tensor_nchw, {}}, | |||||
Testcase{{}, tensor_nchw4}); | |||||
} | |||||
{ | |||||
auto tensor_nchw = TensorValue( | |||||
{1, 8, 1, 2}, dtype::Float32(), | |||||
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); | |||||
auto tensor_nchw4 = TensorValue( | |||||
{1, 4, 1, 2, 4}, dtype::Float32(), | |||||
{1, 3, 0, 0, 2, 4, 0, 0, 5, 7, 0, 0, 6, 8, 0, 0, | |||||
9, 11, 0, 0, 10, 12, 0, 0, 13, 15, 0, 0, 14, 16, 0, 0}); | |||||
RelayoutFormat::Param param{RelayoutFormat::Param::Mode::NCHW_NCHW4}; | |||||
param.group = 4; | |||||
checker.set_param(param).exect(Testcase{tensor_nchw, {}}, | |||||
Testcase{{}, tensor_nchw4}); | |||||
} | |||||
{ | |||||
auto tensor_nchw = TensorValue({1, 6, 1, 2}, dtype::Float32(), | |||||
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); | |||||
auto tensor_nchw4 = TensorValue( | |||||
{1, 2, 1, 2, 4}, dtype::Float32(), | |||||
{1, 3, 5, 0, 2, 4, 6, 0, 7, 9, 11, 0, 8, 10, 12, 0}); | |||||
RelayoutFormat::Param param{RelayoutFormat::Param::Mode::NCHW_NCHW4}; | |||||
param.group = 2; | |||||
checker.set_param(param).exect(Testcase{tensor_nchw, {}}, | |||||
Testcase{{}, tensor_nchw4}); | |||||
} | |||||
} | |||||
TEST_F(NAIVE, RELAYOUT_FORMAT_NCHW88) { | TEST_F(NAIVE, RELAYOUT_FORMAT_NCHW88) { | ||||
Checker<RelayoutFormat> checker(handle(), /* check_dispatch */ false); | Checker<RelayoutFormat> checker(handle(), /* check_dispatch */ false); | ||||
@@ -1914,8 +1914,17 @@ TEST(TestEnableTensorCore, Nchw4Nchw) { | |||||
unpack_vector(gopt::optimize_for_inference({y}, options), y_no_tc); | unpack_vector(gopt::optimize_for_inference({y}, options), y_no_tc); | ||||
} | } | ||||
auto nr_dimshuffle = find_opr_num<mgb::opr::Dimshuffle>(y_opt); | auto nr_dimshuffle = find_opr_num<mgb::opr::Dimshuffle>(y_opt); | ||||
if (format == opr::ConvBias::Param::Format::NCHW4) { | |||||
#if CUDA_VERSION >= 10020 | |||||
//! try_conv_reformat_nchw322nchw4 used when cuda_version >= 10020 | |||||
ASSERT_EQ(1u, nr_dimshuffle); | |||||
#else | |||||
ASSERT_EQ(2u, nr_dimshuffle); | |||||
#endif | |||||
} else { | |||||
ASSERT_EQ(2u, nr_dimshuffle); | |||||
} | |||||
std::string json_name; | std::string json_name; | ||||
ASSERT_EQ(2u, nr_dimshuffle); | |||||
if (format == opr::ConvBias::Param::Format::NCHW4) { | if (format == opr::ConvBias::Param::Format::NCHW4) { | ||||
json_name = "TestGoptInference.Nchw4Nchw.NCHW4.json"; | json_name = "TestGoptInference.Nchw4Nchw.NCHW4.json"; | ||||
} else { | } else { | ||||
@@ -2856,8 +2865,6 @@ TEST(TestGoptInference, EnableCHWN4ShuffleRemove) { | |||||
} | } | ||||
#endif | #endif | ||||
//! close for cu111 ci, reopen it when bug fixed | |||||
#if CUDA_VERSION < 11000 | |||||
TEST(TestGoptInference, ConvertFormatNCHW4GPU) { | TEST(TestGoptInference, ConvertFormatNCHW4GPU) { | ||||
REQUIRE_GPU(1); | REQUIRE_GPU(1); | ||||
auto cn = CompNode::load("gpu0"); | auto cn = CompNode::load("gpu0"); | ||||
@@ -2936,7 +2943,6 @@ TEST(TestGoptInference, ConvertFormatNCHW4GPU) { | |||||
func->execute(); | func->execute(); | ||||
MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt); | MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt); | ||||
} | } | ||||
#endif | |||||
#endif | #endif | ||||
@@ -3076,8 +3082,6 @@ TEST(TestGoptInference, ConvertFormatNCHW4) { | |||||
MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3); | MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3); | ||||
} | } | ||||
//! close for cu111 ci, reopen it when bug fixed | |||||
#if CUDA_VERSION < 11000 | |||||
TEST(TestGoptInference, ConvertFormatNCHW4Ic3) { | TEST(TestGoptInference, ConvertFormatNCHW4Ic3) { | ||||
REQUIRE_GPU(1); | REQUIRE_GPU(1); | ||||
auto cn = CompNode::load("gpu0"); | auto cn = CompNode::load("gpu0"); | ||||
@@ -3139,7 +3143,6 @@ TEST(TestGoptInference, ConvertFormatNCHW4Ic3) { | |||||
func->execute(); | func->execute(); | ||||
MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3); | MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3); | ||||
} | } | ||||
#endif | |||||
TEST(TestGoptInference, ConvertFormatNCHW88) { | TEST(TestGoptInference, ConvertFormatNCHW88) { | ||||
HostTensorGenerator<> gen; | HostTensorGenerator<> gen; | ||||
@@ -183,7 +183,9 @@ namespace opr { | |||||
} | } | ||||
MGB_REG_OPR_SHALLOW_COPY(ParamPackConcat, opr_shallow_copy_param_pack_concat); | MGB_REG_OPR_SHALLOW_COPY(ParamPackConcat, opr_shallow_copy_param_pack_concat); | ||||
MGB_SEREG_OPR(RelayoutFormat, 1); | |||||
using RelayoutFormatV1 = opr::RelayoutFormat; | |||||
MGB_SEREG_OPR(RelayoutFormatV1, 1); | |||||
} // namespace opr | } // namespace opr | ||||
} // namespace mgb | } // namespace mgb | ||||
@@ -1977,8 +1977,7 @@ TEST(TestTensorRTReplace, FuseConvAdd) { | |||||
MGB_ASSERT_TENSOR_NEAR(outputs[0], outputs[2], 1e-3); | MGB_ASSERT_TENSOR_NEAR(outputs[0], outputs[2], 1e-3); | ||||
MGB_ASSERT_TENSOR_NEAR(outputs[1], outputs[3], 1e-3); | MGB_ASSERT_TENSOR_NEAR(outputs[1], outputs[3], 1e-3); | ||||
} | } | ||||
//! close for cu111 ci, reopen it when bug fixed | |||||
#if CUDA_VERSION < 11000 | |||||
TEST(TestTensorRTReplace, FuseConvAddNchw2nchw4) { | TEST(TestTensorRTReplace, FuseConvAddNchw2nchw4) { | ||||
REQUIRE_GPU(1); | REQUIRE_GPU(1); | ||||
auto cn = CompNode::load("gpu0"); | auto cn = CompNode::load("gpu0"); | ||||
@@ -2044,7 +2043,7 @@ TEST(TestTensorRTReplace, FuseConvAddNchw2nchw4) { | |||||
MGB_ASSERT_TENSOR_NEAR(outputs[0], outputs[1], 1e-3); | MGB_ASSERT_TENSOR_NEAR(outputs[0], outputs[1], 1e-3); | ||||
} | } | ||||
#endif | |||||
#endif // MGB_ENABLE_TENSOR_RT | #endif // MGB_ENABLE_TENSOR_RT | ||||