stride1 2x2 3x3 5x5 stride2 2x2 3x3 5x5
GitOrigin-RevId: 43d76311c2
tags/v1.0.0-rc1
@@ -33,7 +33,7 @@ KERN(stride2, 5) | |||||
#undef KERN | #undef KERN | ||||
} // namespace conv_bias | |||||
} // namespace channel_wise_nchw44 | |||||
} // namespace arm_common | } // namespace arm_common | ||||
} // namespace megdnn | } // namespace megdnn | ||||
@@ -10,16 +10,15 @@ | |||||
*/ | */ | ||||
#include "src/arm_common/conv_bias/int8x8x16/algos.h" | #include "src/arm_common/conv_bias/int8x8x16/algos.h" | ||||
#include "src/arm_common/conv_bias/int8x8x16/channel_wise_nchw44.h" | |||||
#include "src/arm_common/conv_bias/int8x8x16/channel_wise_nchw44_8x8x16.h" | |||||
#include "src/arm_common/conv_bias/int8x8x16/conv_direct.h" | #include "src/arm_common/conv_bias/int8x8x16/conv_direct.h" | ||||
#include "src/arm_common/conv_bias/int8x8x16/conv_stride2.h" | #include "src/arm_common/conv_bias/int8x8x16/conv_stride2.h" | ||||
#include "midout.h" | #include "midout.h" | ||||
#include "src/common/opr_delegate.h" | |||||
MIDOUT_DECL(megdnn_arm_common_conv_bias_int8816_kimpl) | MIDOUT_DECL(megdnn_arm_common_conv_bias_int8816_kimpl) | ||||
#include <atomic> | |||||
#include <cstring> | |||||
#include <mutex> | |||||
using namespace megdnn; | using namespace megdnn; | ||||
using namespace arm_common; | using namespace arm_common; | ||||
@@ -550,4 +549,70 @@ ConvBiasImpl::AlgoI8x8x16Stride2Filter2::dispatch_kerns( | |||||
return {{kern, {group, 1_z, 1_z}}}; | return {{kern, {group, 1_z, 1_z}}}; | ||||
} | } | ||||
/* ===================== int8x8x16 channel_wise_nchw44 stride1 stride2 algo ===================== */ | |||||
bool ConvBiasImpl::AlgoS8x8x16ChanWiseStride1Stride2NCHW44::usable( | |||||
const NCBKernSizeParam& param, AlgoSelectionStrategy) const { | |||||
auto&& fm = param.filter_meta; | |||||
auto FH = fm.spatial[0]; | |||||
bool avaible = | |||||
//! src and filter are int8, dst is int16 | |||||
(param.src_type.enumv() == DTypeEnum::Int8 && | |||||
param.filter_type.enumv() == DTypeEnum::Int8 && | |||||
param.dst_type.enumv() == DTypeEnum::Int16) && | |||||
fm.format == param::Convolution::Format::NCHW44 && | |||||
param.bias_mode != megdnn::BiasMode::BIAS && | |||||
param.nonlineMode == megdnn::NonlineMode::IDENTITY && | |||||
!fm.should_flip && fm.spatial_ndim == 2 && fm.dilation[0] == 1 && | |||||
fm.dilation[1] == 1 && | |||||
(fm.stride[0] == fm.stride[1] && | |||||
(fm.stride[0] == 1 || fm.stride[0] == 2)) && | |||||
FH == fm.spatial[1] && (FH == 2 || FH == 3 || FH == 5) && | |||||
fm.icpg == 1 && fm.ocpg == 1 && fm.group % 4 == 0; | |||||
return avaible; | |||||
} | |||||
size_t ConvBiasImpl::AlgoS8x8x16ChanWiseStride1Stride2NCHW44::get_workspace( | |||||
const NCBKernSizeParam& param) const { | |||||
size_t stride_h = param.filter_meta.stride[0]; | |||||
size_t stride_w = param.filter_meta.stride[1]; | |||||
megdnn_assert(stride_h == stride_w); | |||||
if (stride_h == 1) { | |||||
return channel_wise_nchw44_8x8x16::stride1::get_bundle(param) | |||||
.total_size_in_bytes(); | |||||
} else if (stride_h == 2) { | |||||
return channel_wise_nchw44_8x8x16::stride2::get_bundle(param) | |||||
.total_size_in_bytes(); | |||||
} else { | |||||
return 0; | |||||
} | |||||
} | |||||
/**
 * \brief Build the kernel list for the stride1 or stride2 implementation.
 *
 * \param param size/type description of the convolution
 * \return kernels produced by the matching get_kimpls(); an empty list when
 *         the strides differ, are neither 1 nor 2, or the midout region is
 *         not taken
 */
SmallVector<ConvBiasImpl::NCBKern>
ConvBiasImpl::AlgoS8x8x16ChanWiseStride1Stride2NCHW44::dispatch_kerns(
        const NCBKernSizeParam& param) const {
    const size_t stride_h = param.filter_meta.stride[0];
    const size_t stride_w = param.filter_meta.stride[1];
    if (stride_h != stride_w) {
        return {};
    }

    switch (stride_h) {
        case 1: {
            MIDOUT_BEGIN(
                    megdnn_arm_common_conv_bias_int8816_kimpl,
                    midout_iv(
                            "AlgoS8x8x16ChanWiseStride1Stride2NCHW44_dispatch_kerns"_hash)) {
                return channel_wise_nchw44_8x8x16::stride1::get_kimpls(param);
            }
            MIDOUT_END();
            break;
        }
        case 2: {
            MIDOUT_BEGIN(
                    megdnn_arm_common_conv_bias_int8816_kimpl,
                    midout_iv(
                            "AlgoS8x8x16ChanWiseStride2NCHW44_dispatch_kerns"_hash)) {
                return channel_wise_nchw44_8x8x16::stride2::get_kimpls(param);
            }
            MIDOUT_END();
            break;
        }
        default:
            break;
    }
    return {};
}
// vim: syntax=cpp.doxygen | // vim: syntax=cpp.doxygen |
@@ -72,6 +72,18 @@ public: | |||||
const NCBKernSizeParam& param) const override; | const NCBKernSizeParam& param) const override; | ||||
}; | }; | ||||
/**
 * \brief Channel-wise NCHW44 convolution algorithm with int8 src/filter and
 *        int16 dst, covering stride 1 and stride 2.
 */
class ConvBiasImpl::AlgoS8x8x16ChanWiseStride1Stride2NCHW44 final
        : public AlgoBase {
public:
    //! this algorithm always reports itself as reproducible
    bool is_reproducible() const override { return true; }

    //! name used to select this algorithm
    const char* name() const override {
        return "S8x8x16_CHAN_WISE_STRD1_STRD2_NCHW44";
    }

    //! whether the algorithm supports \p param
    bool usable(const NCBKernSizeParam& param,
                AlgoSelectionStrategy algo_selection_strategy) const override;

    //! workspace size in bytes needed for \p param
    size_t get_workspace(const NCBKernSizeParam& param) const override;

    //! kernels to run for \p param
    SmallVector<NCBKern> dispatch_kerns(
            const NCBKernSizeParam& param) const override;
};
} // namespace arm_common | } // namespace arm_common | ||||
} // namespace megdnn | } // namespace megdnn | ||||
@@ -0,0 +1,40 @@ | |||||
/** | |||||
* \file dnn/src/arm_common/conv_bias/int8x8x16/channel_wise_kernel.h | |||||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
* | |||||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||||
* | |||||
* Unless required by applicable law or agreed to in writing, | |||||
* software distributed under the License is distributed on an | |||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
*/ | |||||
#pragma once

#include "src/arm_common/conv_bias/opr_impl.h"
#include "src/fallback/conv_bias/common.h"

namespace megdnn {
namespace arm_common {
namespace channel_wise_nchw44_8x8x16 {

/**
 * Declare one direct channel-wise kernel named
 * direct_<stride>_<i>x<i>_int8x8x16, templated on the bias mode.
 *
 * Each kernel receives the int8 source and filter pointers, an int16 bias
 * pointer, the destination pointer and the input/output spatial sizes.
 * NOTE(review): the definitions live outside this header; the expected memory
 * layout of the pointers should be confirmed against the kernel source.
 */
#define KERN(stride, i)                                                \
    template <BiasMode bias_mode>                                      \
    void direct_##stride##_##i##x##i##_int8x8x16(                      \
            const int8_t* src, const int8_t* filter, const int16_t* bias, \
            void* dst, const size_t IH, const size_t IW, const size_t OH, \
            const size_t OW);

//! stride 1 kernels for 2x2, 3x3 and 5x5 filters
KERN(stride1, 2)
KERN(stride1, 3)
KERN(stride1, 5)

//! stride 2 kernels for 2x2, 3x3 and 5x5 filters
KERN(stride2, 2)
KERN(stride2, 3)
KERN(stride2, 5)

#undef KERN

}  // namespace channel_wise_nchw44_8x8x16
}  // namespace arm_common
}  // namespace megdnn
// vim: syntax=cpp.doxygen |
@@ -0,0 +1,57 @@ | |||||
/** | |||||
* \file dnn/src/arm_common/conv_bias/int8x8x16/channel_wise_nchw44.h | |||||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
* | |||||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||||
* | |||||
* Unless required by applicable law or agreed to in writing, | |||||
* software distributed under the License is distributed on an | |||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
*/ | |||||
#pragma once | |||||
#include "src/arm_common/conv_bias/opr_impl.h" | |||||
namespace megdnn { | |||||
namespace arm_common { | |||||
namespace channel_wise_nchw44 { | |||||
using NCBKernSizeParam = fallback::ConvBiasImpl::NCBKernSizeParam; | |||||
using NCBKernParam = fallback::ConvBiasImpl::NCBKernParam; | |||||
using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex; | |||||
using conv_fun = std::function<void(const WorkspaceBundle& bundle, | |||||
const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index)>; | |||||
namespace stride1 { | |||||
bool is_available(const NCBKernSizeParam& param); | |||||
WorkspaceBundle get_bundle(const NCBKernSizeParam& param); | |||||
template <bool quantized, size_t filter, BiasMode bias_mode, typename Op> | |||||
void do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index); | |||||
SmallVector<ConvBiasImpl::NCBKern> get_kimpls(const NCBKernSizeParam& param); | |||||
} // namespace stride1 | |||||
namespace stride2 { | |||||
bool is_available(const NCBKernSizeParam& param); | |||||
WorkspaceBundle get_bundle(const NCBKernSizeParam& param); | |||||
template <bool quantized, size_t filter, BiasMode bias_mode, typename Op> | |||||
void do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index); | |||||
SmallVector<ConvBiasImpl::NCBKern> get_kimpls(const NCBKernSizeParam& param); | |||||
} // namespace stride2 | |||||
} // namespace direct_int8_stride1 | |||||
} // namespace arm_common | |||||
} // namespace megdnn | |||||
// vim: syntax=cpp.doxygen |
@@ -0,0 +1,259 @@ | |||||
/** | |||||
* \file dnn/src/arm_common/conv_bias/int8x8x16/channel_wise_nchw44.cpp | |||||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
* | |||||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||||
* | |||||
* Unless required by applicable law or agreed to in writing, | |||||
* software distributed under the License is distributed on an | |||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
*/ | |||||
#include "src/arm_common/conv_bias/int8x8x16/channel_wise_nchw44_8x8x16.h"

#include <algorithm>
#include <cstring>

#include "src/arm_common/conv_bias/int8x8x16/channel_wise_kernel.h"
#include "src/common/opr_delegate.h"
#include "midout.h"
#include "src/fallback/conv_bias/common.h"
using namespace megdnn; | |||||
using namespace arm_common; | |||||
using namespace channel_wise_nchw44_8x8x16; | |||||
namespace { | |||||
void get_rectified_size( | |||||
const megdnn::fallback::ConvBiasImpl::NCBKernSizeParam& param, | |||||
size_t& IH2, size_t& IW2) { | |||||
auto&& fm = param.filter_meta; | |||||
auto SW = fm.stride[1]; | |||||
auto OH = param.osz[0]; | |||||
auto OW = param.osz[1]; | |||||
auto FH = fm.spatial[0]; | |||||
auto FW = fm.spatial[1]; | |||||
size_t OW2 = (OW + 3) & ~3; | |||||
IH2 = SW * OH + FH - SW; | |||||
IW2 = SW * OW2 + FW - SW; | |||||
} | |||||
} // namespace | |||||
MIDOUT_DECL(megdnn_arm_common_conv_bias_int8x8x16_nchw44_stride1) | |||||
MIDOUT_DECL(megdnn_arm_common_conv_bias_int8x8x16_nchw44_stride2) | |||||
/**
 * \brief Workspace layout for the stride 1 kernels.
 *
 * One padded int8 source buffer is reserved per thread.
 *
 * \param param convolution description
 * \return bundle with param.nr_threads equally sized entries and no base
 *         pointer set
 */
WorkspaceBundle stride1::get_bundle(
        const ConvBiasImpl::NCBKernSizeParam& param) {
    size_t padded_h, padded_w;
    get_rectified_size(param, padded_h, padded_w);

    constexpr size_t pack_ic_size = 4_z;
    //! 16 extra bytes keep the kernel from reading past the buffer
    constexpr size_t guard_bytes = 16;
    const size_t per_thread_bytes =
            padded_h * padded_w * pack_ic_size * sizeof(int8_t) + guard_bytes;

    SmallVector<size_t> sizes(param.nr_threads, per_thread_bytes);
    return {nullptr, sizes};
}
//! compute one output channel | |||||
template <size_t filter, BiasMode bias_mode> | |||||
void stride1::do_conv_kern(const WorkspaceBundle& bundle, | |||||
const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index) { | |||||
size_t PH = kern_param.filter_meta.padding[0]; | |||||
size_t PW = kern_param.filter_meta.padding[1]; | |||||
size_t OH = kern_param.osz[0]; | |||||
size_t OW = kern_param.osz[1]; | |||||
size_t IH = kern_param.isz[0]; | |||||
size_t IW = kern_param.isz[1]; | |||||
size_t IH2, IW2; | |||||
get_rectified_size(kern_param, IH2, IW2); | |||||
constexpr size_t pack_group_size = 4_z; | |||||
constexpr size_t pack_ic_size = 4_z; | |||||
size_t thread_id = ncb_index.thread_id, batch_id = ncb_index.ndrange_id[0]; | |||||
size_t group_id = ncb_index.ndrange_id[1]; | |||||
int8_t* padding_src = static_cast<int8_t*>(bundle.get(thread_id)); | |||||
const int8_t* sptr = | |||||
kern_param.src<dt_int8>(batch_id, group_id, 0, pack_group_size); | |||||
const int8_t* fptr = kern_param.filter<dt_int8>(group_id, pack_group_size); | |||||
void* dst = kern_param.dst<void>(batch_id, group_id, 0, pack_group_size); | |||||
const int16_t* bptr = | |||||
kern_param.bias<dt_int16>(batch_id, group_id, 0, pack_group_size); | |||||
//! copy in case of illegal read src when padding is zero | |||||
std::memset(padding_src, 0, sizeof(int8_t) * IH2 * IW2 * pack_ic_size); | |||||
rep(ih, IH) { | |||||
std::memcpy(padding_src + ((ih + PH) * IW2 + PW) * pack_ic_size, | |||||
sptr + ih * IW * pack_ic_size, | |||||
sizeof(int8_t) * IW * pack_ic_size); | |||||
} | |||||
sptr = padding_src; | |||||
#define KERN(_size) \ | |||||
direct_stride1_##_size##x##_size##_int8x8x16<bias_mode>( \ | |||||
sptr, fptr, bptr, dst, IH2, IW2, OH, OW); | |||||
DISPATCH_FILTER_CHANNEL_WISE(filter, KERN); | |||||
#undef KERN | |||||
} | |||||
/**
 * \brief Select the stride 1 kernel instance for \p param and wrap it in a
 *        dispatchable kernel.
 *
 * The instance is chosen by filter size, bias mode and non-linearity through
 * the macros defined below. GET_OP_PARAM, GET_BIAS_MODE_PARAM and
 * DISPATCH_CONV_KERN deliberately stay defined after this function because
 * stride2::get_kimpls reuses them.
 *
 * \param param convolution description
 * \return a single kernel whose ndrange is {batch, group / 4}
 */
SmallVector<ConvBiasImpl::NCBKern> stride1::get_kimpls(
        const NCBKernSizeParam& param) {
    auto fm = param.filter_meta;
    megdnn_assert(fm.group % 4 == 0,
                  "nchw44 channel wise conv with group is not times of 4");
    const size_t batch = param.n;
    //! four groups are packed into one work item
    const size_t group_blocks = fm.group / 4;

    WorkspaceBundle wbundle = get_bundle(param);
    conv_fun do_conv_fun = nullptr;

#define DO_CONV_KERN_FUN(filter, bias_mode)                            \
    MIDOUT_BEGIN(megdnn_arm_common_conv_bias_int8x8x16_nchw44_stride1, \
                 midout_iv(#filter #bias_mode##_hash)) {               \
        do_conv_fun = do_conv_kern<filter, bias_mode>;                 \
    }                                                                  \
    MIDOUT_END();

#define GET_OP_PARAM(i, bias_mode)                                  \
    switch (param.nonlineMode) {                                    \
        case param::ConvBias::NonlineMode::IDENTITY:                \
            DO_CONV_KERN_FUN(i, bias_mode)                          \
            break;                                                  \
        default:                                                    \
            megdnn_assert(0, "only support NonlineMode::IDENTITY"); \
            break;                                                  \
    }

#define GET_BIAS_MODE_PARAM(i)                                 \
    switch (param.bias_mode) {                                 \
        case BiasMode::NO_BIAS:                                \
            GET_OP_PARAM(i, BiasMode::NO_BIAS)                 \
            break;                                             \
        case BiasMode::BROADCAST_CHANNEL_BIAS:                 \
            GET_OP_PARAM(i, BiasMode::BROADCAST_CHANNEL_BIAS)  \
            break;                                             \
        default:                                               \
            megdnn_assert(0,                                   \
                          "only support BiasMode::NO_BIAS and " \
                          "BiasMode::BROADCAST_CHANNEL_BIAS"); \
            break;                                             \
    }

#define DISPATCH_CONV_KERN()                                         \
    switch (param.filter_meta.spatial[0]) {                          \
        case 2:                                                      \
            GET_BIAS_MODE_PARAM(2)                                   \
            break;                                                   \
        case 3:                                                      \
            GET_BIAS_MODE_PARAM(3)                                   \
            break;                                                   \
        case 5:                                                      \
            GET_BIAS_MODE_PARAM(5)                                   \
            break;                                                   \
        default:                                                     \
            megdnn_assert(0, "only support filtersize 2x2 3x3 5x5"); \
            break;                                                   \
    }

    DISPATCH_CONV_KERN();
    //! only the stride1-specific selector is local to this function
#undef DO_CONV_KERN_FUN

    megdnn_assert(do_conv_fun);

    //! the bundle copy owned by the closure is rebased onto the workspace
    //! pointer supplied with every kernel invocation
    auto exec_one_group = [wbundle, do_conv_fun](
                                  const NCBKernParam& kern_param,
                                  const NCBKernIndex& ncb_index) mutable {
        wbundle.set(kern_param.workspace_ptr);
        do_conv_fun(wbundle, kern_param, ncb_index);
    };

    SmallVector<ConvBiasImpl::NCBKern> ret_kerns;
    ret_kerns.push_back({exec_one_group, {batch, group_blocks}});
    return ret_kerns;
}
/**
 * \brief Workspace layout for the stride 2 kernels.
 *
 * Every thread gets its own padded int8 source buffer of the rectified size.
 *
 * \param param convolution description
 * \return bundle with one entry per thread and no base pointer set
 */
WorkspaceBundle stride2::get_bundle(
        const ConvBiasImpl::NCBKernSizeParam& param) {
    constexpr size_t pack_ic_size = 4_z;

    size_t rect_h, rect_w;
    get_rectified_size(param, rect_h, rect_w);

    //! the trailing 16 bytes avoid invalid reads in the kernel
    const size_t buffer_bytes =
            rect_h * rect_w * pack_ic_size * sizeof(int8_t) + 16;

    const size_t thread_count = param.nr_threads;
    SmallVector<size_t> sizes(thread_count, buffer_bytes);
    return {nullptr, sizes};
}
//! compute one output channel | |||||
template <size_t filter, BiasMode bias_mode> | |||||
void stride2::do_conv_kern(const WorkspaceBundle& bundle, | |||||
const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index) { | |||||
size_t PH = kern_param.filter_meta.padding[0]; | |||||
size_t PW = kern_param.filter_meta.padding[1]; | |||||
size_t OH = kern_param.osz[0]; | |||||
size_t OW = kern_param.osz[1]; | |||||
size_t IH = kern_param.isz[0]; | |||||
size_t IW = kern_param.isz[1]; | |||||
size_t IH2, IW2; | |||||
get_rectified_size(kern_param, IH2, IW2); | |||||
constexpr size_t pack_group_size = 4_z; | |||||
constexpr size_t pack_ic_size = 4_z; | |||||
size_t thread_id = ncb_index.thread_id, batch_id = ncb_index.ndrange_id[0]; | |||||
size_t group_id = ncb_index.ndrange_id[1]; | |||||
int8_t* padding_src = static_cast<int8_t*>(bundle.get(thread_id)); | |||||
const int8_t* sptr = | |||||
kern_param.src<dt_int8>(batch_id, group_id, 0, pack_group_size); | |||||
const int8_t* fptr = kern_param.filter<dt_int8>(group_id, pack_group_size); | |||||
void* dst = kern_param.dst<void>(batch_id, group_id, 0, pack_group_size); | |||||
const int16_t* bptr = | |||||
kern_param.bias<dt_int16>(batch_id, group_id, 0, pack_group_size); | |||||
//! copy in case of illegal read src when padding is zero | |||||
std::memset(padding_src, 0, sizeof(int8_t) * IH2 * IW2 * pack_ic_size); | |||||
rep(ih, IH) { | |||||
std::memcpy(padding_src + ((ih + PH) * IW2 + PW) * pack_ic_size, | |||||
sptr + ih * IW * pack_ic_size, | |||||
sizeof(int8_t) * IW * pack_ic_size); | |||||
} | |||||
sptr = padding_src; | |||||
#define KERN(_size) \ | |||||
direct_stride2_##_size##x##_size##_int8x8x16<bias_mode>( \ | |||||
sptr, fptr, bptr, dst, IH2, IW2, OH, OW); | |||||
DISPATCH_FILTER_CHANNEL_WISE(filter, KERN); | |||||
#undef KERN | |||||
} | |||||
/**
 * \brief Select the stride 2 kernel instance for \p param and wrap it in a
 *        dispatchable kernel.
 *
 * Relies on the DISPATCH_CONV_KERN / GET_BIAS_MODE_PARAM / GET_OP_PARAM macros
 * that are still defined from stride1::get_kimpls earlier in this file; only
 * DO_CONV_KERN_FUN is redefined here so that the stride 2 midout tag and
 * do_conv_kern are used. All four macros are undefined at the end.
 *
 * \param param convolution description
 * \return a single kernel whose ndrange is {batch, group / 4}
 */
SmallVector<ConvBiasImpl::NCBKern> stride2::get_kimpls(
        const NCBKernSizeParam& param) {
    auto fm = param.filter_meta;
    size_t N = param.n;
    //! four groups are packed into one work item
    size_t group = fm.group / 4;
    megdnn_assert(fm.group % 4 == 0,
                  "nchw44 channel wise conv with group is not times of 4");
    WorkspaceBundle wbundle = get_bundle(param);
    conv_fun do_conv_fun = nullptr;

#define DO_CONV_KERN_FUN(filter, bias_mode)                            \
    MIDOUT_BEGIN(megdnn_arm_common_conv_bias_int8x8x16_nchw44_stride2, \
                 midout_iv(#filter #bias_mode##_hash)) {               \
        do_conv_fun = do_conv_kern<filter, bias_mode>;                 \
    }                                                                  \
    MIDOUT_END();

    DISPATCH_CONV_KERN();
    megdnn_assert(do_conv_fun);

    SmallVector<ConvBiasImpl::NCBKern> ret_kerns;
    //! the bundle copy owned by the closure is rebased onto the workspace
    //! pointer supplied with every kernel invocation
    auto exec_one_group = [wbundle, do_conv_fun](
                                  const NCBKernParam& kern_param,
                                  const NCBKernIndex& ncb_index) mutable {
        wbundle.set(kern_param.workspace_ptr);
        do_conv_fun(wbundle, kern_param, ncb_index);
    };
    ret_kerns.push_back({exec_one_group, {N, group}});
    return ret_kerns;

#undef DO_CONV_KERN_FUN
#undef DISPATCH_CONV_KERN
#undef GET_BIAS_MODE_PARAM
#undef GET_OP_PARAM
}
// vim: syntax=cpp.doxygen |
@@ -0,0 +1,57 @@ | |||||
/** | |||||
* \file dnn/src/arm_common/conv_bias/int8x8x16/channel_wise_nchw44.h | |||||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
* | |||||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||||
* | |||||
* Unless required by applicable law or agreed to in writing, | |||||
* software distributed under the License is distributed on an | |||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
*/ | |||||
#pragma once | |||||
#include "src/arm_common/conv_bias/opr_impl.h" | |||||
namespace megdnn { | |||||
namespace arm_common { | |||||
namespace channel_wise_nchw44_8x8x16 { | |||||
using NCBKernSizeParam = fallback::ConvBiasImpl::NCBKernSizeParam; | |||||
using NCBKernParam = fallback::ConvBiasImpl::NCBKernParam; | |||||
using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex; | |||||
using conv_fun = std::function<void(const WorkspaceBundle& bundle, | |||||
const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index)>; | |||||
namespace stride1 { | |||||
bool is_available(const NCBKernSizeParam& param); | |||||
WorkspaceBundle get_bundle(const NCBKernSizeParam& param); | |||||
template <size_t filter, BiasMode bias_mode> | |||||
void do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index); | |||||
SmallVector<ConvBiasImpl::NCBKern> get_kimpls(const NCBKernSizeParam& param); | |||||
} // namespace stride1 | |||||
namespace stride2 { | |||||
bool is_available(const NCBKernSizeParam& param); | |||||
WorkspaceBundle get_bundle(const NCBKernSizeParam& param); | |||||
template <size_t filter, BiasMode bias_mode> | |||||
void do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index); | |||||
SmallVector<ConvBiasImpl::NCBKern> get_kimpls(const NCBKernSizeParam& param); | |||||
} // namespace stride2 | |||||
} // namespace direct_int8_stride1 | |||||
} // namespace arm_common | |||||
} // namespace megdnn | |||||
// vim: syntax=cpp.doxygen |
@@ -48,6 +48,7 @@ class ConvBiasImpl::AlgoPack : NonCopyableObj { | |||||
AlgoS8DirectStride1 s8_direct_stride1; | AlgoS8DirectStride1 s8_direct_stride1; | ||||
AlgoS8ChanWiseStride1NCHW44 s8_channel_wise_stride1_nchw44; | AlgoS8ChanWiseStride1NCHW44 s8_channel_wise_stride1_nchw44; | ||||
AlgoS8ChanWiseStride2NCHW44 s8_channel_wise_stride2_nchw44; | AlgoS8ChanWiseStride2NCHW44 s8_channel_wise_stride2_nchw44; | ||||
AlgoS8x8x16ChanWiseStride1Stride2NCHW44 s8x8x16_channel_wise_stride1_stride2_nchw44; | |||||
#if __ARM_FEATURE_DOTPROD | #if __ARM_FEATURE_DOTPROD | ||||
AlgoDotS8DirectStride1 ds8_direct_stride1; | AlgoDotS8DirectStride1 ds8_direct_stride1; | ||||
@@ -95,6 +96,7 @@ public: | |||||
direct_algos.emplace_back(&s8_direct_nchw_nchw44); | direct_algos.emplace_back(&s8_direct_nchw_nchw44); | ||||
direct_algos.emplace_back(&s8_direct_stride1); | direct_algos.emplace_back(&s8_direct_stride1); | ||||
direct_algos.emplace_back(&s8x8x16_channel_wise_stride1_stride2_nchw44); | |||||
direct_algos.emplace_back(&s8_channel_wise_stride1_nchw44); | direct_algos.emplace_back(&s8_channel_wise_stride1_nchw44); | ||||
direct_algos.emplace_back(&s8_channel_wise_stride2_nchw44); | direct_algos.emplace_back(&s8_channel_wise_stride2_nchw44); | ||||
@@ -54,6 +54,7 @@ private: | |||||
class AlgoS8ChanWiseStride1NCHW44; | class AlgoS8ChanWiseStride1NCHW44; | ||||
class AlgoS8ChanWiseStride2NCHW44; | class AlgoS8ChanWiseStride2NCHW44; | ||||
class AlgoS8x8x16ChanWiseStride1Stride2NCHW44; | |||||
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | ||||
class AlgoFP16WinogradF23; | class AlgoFP16WinogradF23; | ||||
@@ -558,6 +558,142 @@ void BENCHMARK_IM2COL_NCHW44_VS_NCHW(const char* algo_name, | |||||
} | } | ||||
} | } | ||||
std::vector<conv_bias::TestArg> get_nchw44_channel_wise_benchmark_args( | |||||
std::vector<size_t> kernel, size_t stride, bool no_bias, | |||||
bool no_nonlinemode, bool no_full_bias) { | |||||
using namespace conv_bias; | |||||
using Param = param::ConvBias; | |||||
using NLMode = param::ConvBias::NonlineMode; | |||||
std::vector<TestArg> args; | |||||
auto pack = [&](size_t n, size_t group, size_t w, size_t h, size_t kernel, | |||||
size_t stride, NLMode nlmode, bool pad) { | |||||
Param param; | |||||
param.stride_h = stride; | |||||
param.stride_w = stride; | |||||
if (pad) { | |||||
param.pad_h = kernel / 2; | |||||
param.pad_w = kernel / 2; | |||||
} else { | |||||
param.pad_h = 0; | |||||
param.pad_w = 0; | |||||
} | |||||
param.nonlineMode = nlmode; | |||||
param.format = param::ConvBias::Format::NCHW44; | |||||
param.sparse = param::ConvBias::Sparse::GROUP; | |||||
args.emplace_back(param, TensorShape{n, group, h, w, 4}, | |||||
TensorShape{group, 1, 1, kernel, kernel, 4}, | |||||
TensorShape{}); | |||||
if (!no_bias) { | |||||
args.emplace_back(param, TensorShape{n, group, h, w, 4}, | |||||
TensorShape{group, 1, 1, kernel, kernel, 4}, | |||||
TensorShape{1, group, 1, 1, 4}); | |||||
} | |||||
if (!no_full_bias) { | |||||
args.emplace_back( | |||||
param, TensorShape{n, group, h, w, 4}, | |||||
TensorShape{group, 1, 1, kernel, kernel, 4}, | |||||
TensorShape{n, group, | |||||
(h + 2 * param.pad_w - kernel) / stride + 1, | |||||
(w + 2 * param.pad_w - kernel) / stride + 1, | |||||
4}); | |||||
} | |||||
}; | |||||
std::vector<NLMode> nonlinemode = {NLMode::IDENTITY}; | |||||
if (!no_nonlinemode) { | |||||
nonlinemode.emplace_back(NLMode::RELU); | |||||
nonlinemode.emplace_back(NLMode::H_SWISH); | |||||
} | |||||
for (size_t n : {1}) { | |||||
for (auto nlmode : nonlinemode) { | |||||
for (bool pad : {true}) { | |||||
for (size_t group : {1, 2, 4, 128}) { | |||||
for (size_t size : {40,89,100,200}) { | |||||
for (size_t kern : kernel) { | |||||
pack(n, group, size, size, kern, stride, nlmode, | |||||
pad); | |||||
} | |||||
} | |||||
} | |||||
} | |||||
for (bool pad : {false}) { | |||||
for (size_t group : {1, 2, 4, 8, 16, 32, 64, 128}) { | |||||
for (size_t size : {40, 89, 100}) { | |||||
for (size_t kern : kernel) { | |||||
pack(n, group, size, size, kern, stride, nlmode, | |||||
pad); | |||||
} | |||||
} | |||||
} | |||||
} | |||||
} | |||||
} | |||||
return args; | |||||
} | |||||
void BENCHMARK_GROUPCONV_NCHW44_int8x8x16VS_int8x8x32(const char* algo_name0, | |||||
const char* algo_name1, Handle* handle, | |||||
size_t kernel,size_t stride = 1, size_t pack_size = 1) { | |||||
auto args = get_nchw44_channel_wise_benchmark_args({2, 3, 5}, stride, false, true, true); | |||||
using namespace conv_bias; | |||||
constexpr size_t RUN = 10; | |||||
Benchmarker<ConvBias> benchmark(handle); | |||||
benchmark.set_display(false); | |||||
benchmark.set_times(RUN); | |||||
benchmark.set_dtype(0, dtype::Int8()); | |||||
benchmark.set_dtype(1, dtype::Int8()); | |||||
benchmark.set_dtype(2, dtype::Int32()); | |||||
benchmark.set_dtype(4, dtype::Int32()); | |||||
Benchmarker<ConvBias> benchmark_algo1(handle); | |||||
benchmark_algo1.set_display(false); | |||||
benchmark_algo1.set_times(RUN); | |||||
benchmark_algo1.set_dtype(0, dtype::Int8()); | |||||
benchmark_algo1.set_dtype(1, dtype::Int8()); | |||||
benchmark_algo1.set_dtype(2, dtype::Int16()); | |||||
benchmark_algo1.set_dtype(4, dtype::Int16()); | |||||
for (auto&& arg : args) { | |||||
TensorLayout dst_layout; | |||||
auto opr = handle->create_operator<ConvBias>(); | |||||
opr->param() = arg.param; | |||||
opr->deduce_layout({arg.src, dtype::Float32()}, | |||||
{arg.filter, dtype::Float32()}, | |||||
{arg.bias, dtype::Float32()}, {}, dst_layout); | |||||
//! dst.nr_elems * IC * FH * FW * 2 | |||||
float computations = dst_layout.total_nr_elems() * arg.filter[1] * | |||||
arg.filter[2] * arg.filter[3] * 2.0 * pack_size/ | |||||
(1024 * 1024 * 1024) * 1e3; | |||||
benchmark.set_param(arg.param); | |||||
auto used = algo_benchmark<ConvBias>(benchmark, | |||||
{arg.src, arg.filter, {}, {}, {}}, | |||||
algo_name0) / | |||||
RUN; | |||||
arg.param.nonlineMode = param::ConvBias::NonlineMode::IDENTITY; | |||||
arg.param.format = param::ConvBias::Format::NCHW44; | |||||
benchmark_algo1.set_param(arg.param); | |||||
auto used_algo1 = | |||||
algo_benchmark<ConvBias>( | |||||
benchmark_algo1, | |||||
{arg.src, arg.filter, {}, {}, {}}, | |||||
algo_name1) / | |||||
RUN; | |||||
printf("%s %s: normal: %f ms %f Gflops 8x8x16: %f ms %f GFlops " | |||||
"speedup: " | |||||
"%f\n", | |||||
arg.src.to_string().c_str(), arg.filter.to_string().c_str(), | |||||
used, computations / used, used_algo1, | |||||
computations / used_algo1, used / used_algo1); | |||||
} | |||||
} | |||||
#if MEGDNN_AARCH64 | #if MEGDNN_AARCH64 | ||||
TEST_F(ARM_COMMON, BENCHMARK_NCHW_VS_NCHW44_INT8x8x32) { | TEST_F(ARM_COMMON, BENCHMARK_NCHW_VS_NCHW44_INT8x8x32) { | ||||
printf("=========================compare " | printf("=========================compare " | ||||
@@ -579,6 +715,17 @@ TEST_F(ARM_COMMON, BENCHMARK_NCHW_VS_NCHW44_INT8x8x16) { | |||||
} | } | ||||
#endif | #endif | ||||
//! compare the stride 1 channel-wise algorithm against the int8x8x16 one
TEST_F(ARM_COMMON, BENCHMARK_GROUP_CONV_NCHW44_INT8x8x32_VS_INT8x8x16_STRIDE1) {
    constexpr size_t kernel = 3;
    constexpr size_t stride = 1;
    constexpr size_t pack_size = 4;
    BENCHMARK_GROUPCONV_NCHW44_int8x8x16VS_int8x8x32(
            "S8_CHAN_WISE_STRD1_NCHW44",
            "S8x8x16_CHAN_WISE_STRD1_STRD2_NCHW44", handle(), kernel, stride,
            pack_size);
}
//! compare the stride 2 channel-wise algorithm against the int8x8x16 one
TEST_F(ARM_COMMON, BENCHMARK_GROUP_CONV_NCHW44_INT8x8x32_VS_INT8x8x16_STRIDE2) {
    constexpr size_t kernel = 3;
    constexpr size_t stride = 2;
    constexpr size_t pack_size = 4;
    BENCHMARK_GROUPCONV_NCHW44_int8x8x16VS_int8x8x32(
            "S8_CHAN_WISE_STRD2_NCHW44",
            "S8x8x16_CHAN_WISE_STRD1_STRD2_NCHW44", handle(), kernel, stride,
            pack_size);
}
TEST_F(ARM_COMMON, BENCHMARK_GROUP_CONVBIAS_QUANTIZED) { | TEST_F(ARM_COMMON, BENCHMARK_GROUP_CONVBIAS_QUANTIZED) { | ||||
constexpr size_t RUNS = 50; | constexpr size_t RUNS = 50; | ||||
param::ConvBias param; | param::ConvBias param; | ||||
@@ -9,6 +9,7 @@ | |||||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | ||||
* implied. | * implied. | ||||
*/ | */ | ||||
#include "megdnn/dtype.h" | |||||
#include "test/arm_common/fixture.h" | #include "test/arm_common/fixture.h" | ||||
#include "test/common/benchmarker.h" | #include "test/common/benchmarker.h" | ||||
#include "test/common/conv_bias.h" | #include "test/common/conv_bias.h" | ||||
@@ -475,6 +476,36 @@ TEST_F(ARM_COMMON_MULTI_THREADS, | |||||
handle(), "S8_CHAN_WISE_STRD2_NCHW44"); | handle(), "S8_CHAN_WISE_STRD2_NCHW44"); | ||||
} | } | ||||
//! correctness check of the int8x8x16 channel-wise algorithm with stride 1
TEST_F(ARM_COMMON, CONV_BIAS_INT8_INT8_INT16_CHANNEL_WISE_DIRECT1_NCHW44) {
    const char* algo_name = "S8x8x16_CHAN_WISE_STRD1_STRD2_NCHW44";

    Checker<ConvBias> checker(handle());
    //! make sure the algorithm under test is the one that gets executed
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name));

    //! int8 inputs, int16 for the tensors at index 2 and 4
    checker.set_dtype(0, dtype::Int8());
    checker.set_dtype(1, dtype::Int8());
    checker.set_dtype(2, dtype::Int16());
    checker.set_dtype(4, dtype::Int16());

    //! filter sizes 2, 3, 5 with stride 1
    for (auto&& arg :
         get_nchw44_channel_wise_args({2, 3, 5}, 1, false, true, true)) {
        checker.set_param(arg.param).execs({arg.src, arg.filter, {}, {}, {}});
    }
}
//! correctness check of the int8x8x16 channel-wise algorithm with stride 2
TEST_F(ARM_COMMON_MULTI_THREADS,
       CONV_BIAS_INT8_INT8_INT16_CHANNEL_WISE_DIRECT2_NCHW44) {
    const char* algo_name = "S8x8x16_CHAN_WISE_STRD1_STRD2_NCHW44";

    Checker<ConvBias> checker(handle());
    //! make sure the algorithm under test is the one that gets executed
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name));

    //! int8 inputs, int16 for the tensors at index 2 and 4
    checker.set_dtype(0, dtype::Int8());
    checker.set_dtype(1, dtype::Int8());
    checker.set_dtype(2, dtype::Int16());
    checker.set_dtype(4, dtype::Int16());

    //! filter sizes 2, 3, 5 with stride 2
    for (auto&& arg :
         get_nchw44_channel_wise_args({2, 3, 5}, 2, false, true, true)) {
        checker.set_param(arg.param).execs({arg.src, arg.filter, {}, {}, {}});
    }
}
/********************************qint8 direct******************************/ | /********************************qint8 direct******************************/ | ||||
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_INT8_STRIDE1) { | TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_INT8_STRIDE1) { | ||||
checker_conv_bias_qint8x8x8(get_int8_quint8_conv_bias_args( | checker_conv_bias_qint8x8x8(get_int8_quint8_conv_bias_args( | ||||
@@ -1707,6 +1707,77 @@ TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, | |||||
} | } | ||||
//! multi-thread benchmark of the int8x8x16 channel-wise algorithm: 3x3
//! filter, stride 1, padding 1, for several channel counts and image sizes
TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS,
       BENCHMARK_CHANNEL_WISE_INT8_INT8_INT16_STRIDE1) {
    constexpr size_t RUNS = 50;

    param::ConvBias param;
    param.nonlineMode = param::ConvBias::NonlineMode::IDENTITY;
    param.pad_h = 1;
    param.pad_w = 1;
    param.stride_h = 1;
    param.stride_w = 1;
    param.sparse = param::ConvBias::Sparse::GROUP;
    param.format = param::ConvBias::Format::NCHW44;

    std::vector<std::pair<SmallVector<TensorShape>, float>>
            shapes_and_computation;

    //! register one case together with its computation amount (scaled by 1e-6)
    auto bench_case = [&](size_t N, size_t IC, size_t H, size_t W, size_t FS,
                          size_t P) {
        const size_t group = IC;
        const size_t OC = IC;
        const size_t S = 1;
        const size_t OH = (H + 2 * P - FS) / S + 1;
        const size_t OW = (W + 2 * P - FS) / S + 1;

        //! src, filter, bias, (empty), dst
        SmallVector<TensorShape> shapes{{N, IC, H, W, 4},
                                        {group, 1, 1, FS, FS, 4},
                                        {1, OC, 1, 1, 4},
                                        {},
                                        {N, OC, OH, OW, 4}};
        TensorShape dst{N, OC, OH, OW, 4};
        float computations =
                ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
                 dst.total_nr_elems()) *
                1e-6;
        shapes_and_computation.emplace_back(shapes, computations);
    };

    //! same case order as listing every combination by hand: channels
    //! outermost, image sizes from large to small
    for (size_t channels : {128, 64, 32}) {
        for (size_t hw : {200, 128, 100, 80, 56, 28, 14}) {
            bench_case(1, channels, hw, hw, 3, 1);
        }
    }

    std::string algo_name = "S8x8x16_CHAN_WISE_STRD1_STRD2_NCHW44";
    printf("Benchmarker S8x8x16_CHAN_WISE_STRD1_STRD2_NCHW44 algo\n");
    std::vector<DType> data_type = {dtype::Int8(), dtype::Int8(),
                                    dtype::Int16(), dtype::Int16()};

    //! NOTE(review): the two brace pairs are presumably the multi-thread and
    //! single-thread configurations (thread count plus core ids) — confirm
    //! against benchmark_impl
    benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
                   {4, {4, 5, 6, 7}}, {1, {4}}, data_type);
    benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
                   {4, {4, 5, 6, 7}}, {1, {7}}, data_type);
    benchmark_impl(param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}},
                   {1, {4}}, data_type);
}
TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, | |||||
BENCHMARK_IM2COL_NCHW44_INT8x8x32_STRIDE1) { | BENCHMARK_IM2COL_NCHW44_INT8x8x32_STRIDE1) { | ||||
constexpr size_t RUNS = 50; | constexpr size_t RUNS = 50; | ||||