GitOrigin-RevId: 8f310c3d13
tags/v0.3.2
@@ -40,6 +40,15 @@
     UNROLL_RAW16(cb, v0, ##a) \
     cb(16, ##a) cb(17, ##a) cb(18, ##a) cb(19, ##a) cb(20, ##a) cb(21, ##a) \
     cb(22, ##a) cb(23, ##a)
+#define UNROLL_RAW25(cb, v0, a...) \
+    UNROLL_RAW24(cb, v0, ##a)      \
+    cb(24, ##a)
+#define UNROLL_RAW49(cb, v0, a...)                                          \
+    UNROLL_RAW25(cb, v0, ##a)                                               \
+    cb(25, ##a) cb(26, ##a) cb(27, ##a) cb(28, ##a) cb(29, ##a) cb(30, ##a) \
+    cb(31, ##a) cb(32, ##a) cb(33, ##a) cb(34, ##a) cb(35, ##a) cb(36, ##a) \
+    cb(37, ##a) cb(38, ##a) cb(39, ##a) cb(40, ##a) cb(41, ##a) cb(42, ##a) \
+    cb(43, ##a) cb(44, ##a) cb(45, ##a) cb(46, ##a) cb(47, ##a) cb(48, ##a)
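+// For reference: UNROLL_CALL0(25, cb) expands through UNROLL_RAW25 into
+// cb(0) cb(1) ... cb(24). The new RAW25/RAW49 step sizes give fully
+// unrolled loops for the 5x5 and 7x7 filters added below.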
 #define UNROLL_CALL0(step, cb, v...) UNROLL_RAW##step(cb, 0, ##v)
 #define UNROLL_CALL1(step, cb, v...) UNROLL_CALL0(step, cb, ##v)
@@ -15,6 +15,7 @@
 #include "src/fallback/convolution/img2col_helper.h"
 #include "src/x86/conv_bias/int8/avx2_direct_conv_stride1.h"
 #include "src/x86/conv_bias/int8/avx2_direct_conv_stride2.h"
+#include "src/x86/conv_bias/int8/avx2_chanwise_stride1.h"
 #include "src/x86/conv_bias/opr_impl.h"
 #include "src/x86/conv_bias/postprocess_helper.h"
 #include "src/x86/handle.h"
@@ -31,6 +32,65 @@ using namespace dnnl;
 using namespace megdnn;
 using namespace x86;
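+
+// Usable only for channel-wise convolution (icpg == ocpg == 1) in NCHW
+// layout with 2x2/3x3/5x5/7x7 filters, unit stride, and no dilation, on
+// AVX2-capable CPUs; dtypes may be plain int8 or quantized int8.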
+bool ConvBiasImpl::AlgoChanWiseAvx2Stride1Qint8::usable(
+        FallbackConvBiasImpl* /*opr*/, const NCBKernSizeParam& param,
+        AlgoSelectionStrategy /*algo_selection_strategy*/) const {
+    auto&& fm = param.filter_meta;
+    auto FH = fm.spatial[0];
+    bool available =
+            ((param.src_type.enumv() == DTypeEnum::QuantizedS8 &&
+              param.filter_type.enumv() == DTypeEnum::QuantizedS8 &&
+              param.dst_type.enumv() == DTypeEnum::QuantizedS8) ||
+             (param.src_type.enumv() == DTypeEnum::Int8 &&
+              param.filter_type.enumv() == DTypeEnum::Int8 &&
+              param.dst_type.enumv() == DTypeEnum::Int32) ||
+             (param.src_type.enumv() == DTypeEnum::QuantizedS8 &&
+              param.filter_type.enumv() == DTypeEnum::QuantizedS8 &&
+              param.dst_type.enumv() == DTypeEnum::QuantizedS32)) &&
+            fm.format == Param::Format::NCHW && fm.spatial_ndim == 2 &&
+            fm.dilation[0] == 1 && fm.dilation[1] == 1 &&
+            (FH == 2 || FH == 3 || FH == 5 || FH == 7) && fm.stride[0] == 1 &&
+            fm.stride[1] == 1 && (fm.icpg == 1) && (fm.ocpg == 1) &&
+            is_supported(SIMDType::AVX2);
+    return available;
+}
+
+WorkspaceBundle ConvBiasImpl::AlgoChanWiseAvx2Stride1Qint8::get_bundle(
+        const NCBKernSizeParam& param) {
+    size_t nr_threads = param.nr_threads;
+    size_t IH2, IW2, OH2, OW2;
+    size_t src_size = 0, dst_size = 0, int32_temp = 0;
+    avx2_chanwise_stride1::get_rectified_size(param, IH2, IW2, OH2, OW2);
+    if (avx2_chanwise_stride1::need_src_copy(param)) {
+        src_size = IH2 * IW2 * sizeof(int8_t) * nr_threads;
+    }
+    if (avx2_chanwise_stride1::need_dst_copy(param)) {
+        dst_size = OH2 * OW2 * param.dst_type.size() * nr_threads;
+    }
+    bool dst_need_convert = param.dst_type.enumv() == DTypeEnum::QuantizedS8;
+    if (dst_need_convert) {
+        int32_temp = OH2 * OW2 * sizeof(int32_t) * nr_threads;
+    }
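+    // workspace layout: slot 0 holds per-thread padded src copies, slot 1
+    // per-thread padded dst copies, and slot 2 (present only when dst is
+    // QuantizedS8) a per-thread int32 staging buffer that is requantized
+    // to int8 after the convolution.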
+    return dst_need_convert
+                   ? WorkspaceBundle(nullptr, {src_size, dst_size, int32_temp})
+                   : WorkspaceBundle(nullptr, {src_size, dst_size});
+}
+
+size_t ConvBiasImpl::AlgoChanWiseAvx2Stride1Qint8::get_workspace(
+        FallbackConvBiasImpl*, const NCBKernSizeParam& param) const {
+    return get_bundle(param).total_size_in_bytes();
+}
+
+SmallVector<fallback::ConvBiasImpl::NCBKern>
+ConvBiasImpl::AlgoChanWiseAvx2Stride1Qint8::get_kimpls(
+        const NCBKernSizeParam& param) const {
+    auto bundle = get_bundle(param);
+    return avx2_chanwise_stride1::get_kimpls(param, bundle);
+}
+
 bool ConvBiasImpl::AlgoDirectAvx2Stride1Int8::usable(
         FallbackConvBiasImpl* /*opr*/, const NCBKernSizeParam& param,
         AlgoSelectionStrategy /*algo_selection_strategy*/) const {
@@ -13,6 +13,29 @@
 namespace megdnn {
 namespace x86 {
+/* ===================== avx2 stride1 chanwise algo ===================== */
+class ConvBiasImpl::AlgoChanWiseAvx2Stride1Qint8 final : public AlgoBase {
+    SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param) const;
+    static WorkspaceBundle get_bundle(const NCBKernSizeParam& param);
+
+public:
+    bool is_reproducible() const override { return true; }
+    const char* name() const override {
+        return "X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1";
+    }
+    bool usable(FallbackConvBiasImpl* opr, const NCBKernSizeParam& param,
+                AlgoSelectionStrategy algo_selection_strategy) const override;
+    size_t get_workspace(FallbackConvBiasImpl* opr,
+                         const NCBKernSizeParam& param) const override;
+    virtual SmallVector<NCBKern> dispatch_kerns(
+            fallback::ConvBiasImpl*,
+            const NCBKernSizeParam& param) const override {
+        return get_kimpls(param);
+    }
+    void* type() const override;
+};
+
 /* ===================== avx2 stride1 direct algo ===================== */
 class ConvBiasImpl::AlgoDirectAvx2Stride1Int8 final : public AlgoBase {
     SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param) const;
@@ -0,0 +1,39 @@
+/**
+ * \file src/x86/conv_bias/int8/avx2_chanwise_kern.h
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied.
+ */
+
+#pragma once
+
+#include "src/x86/conv_bias/opr_impl.h"
+
+namespace megdnn {
+namespace x86 {
+namespace avx2_chanwise_stride1 {
+
+#define KERN(stride, i)                                                   \
+    template <BiasMode bias_mode, bool is_quantized, typename Op>         \
+    MEGDNN_ATTRIBUTE_TARGET("avx2")                                       \
+    void avx2_chanwise_direct_##stride##_##i##x##i##_int8(                \
+            const int8_t* src, const int8_t* filter, const int32_t* bias, \
+            int32_t* temp, int8_t* dst, const size_t IH, const size_t IW, \
+            const size_t OH, const size_t OW, const Op& op);
+
+KERN(stride1, 2)
+KERN(stride1, 3)
+KERN(stride1, 5)
+KERN(stride1, 7)
+
+#undef KERN
+
+}  // namespace avx2_chanwise_stride1
+}  // namespace x86
+}  // namespace megdnn
+
+// vim: syntax=cpp.doxygen
@@ -0,0 +1,251 @@
+/**
+ * \file src/x86/conv_bias/int8/avx2_chanwise_stride1.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied.
+ */
+
+#include "src/x86/conv_bias/int8/avx2_chanwise_stride1.h"
+#include "src/x86/conv_bias/int8/avx2_chanwise_kern.h"
+#include "src/x86/elemwise_op.h"
+
+namespace megdnn {
+namespace x86 {
+namespace avx2_chanwise_stride1 {
+
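+// a staging copy of dst is needed whenever OW is not a multiple of 16,
+// because the kernels store whole 16-byte int8 vectors per output row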
+bool need_dst_copy(const NCBKernSizeParam& param) {
+    return param.osz[1] % 16;
+}
+
+bool need_src_copy(const NCBKernSizeParam& param) {
+    auto&& fm = param.filter_meta;
+    return (fm.padding[0] != 0 || fm.padding[1] != 0) ? true
+                                                      : need_dst_copy(param);
+}
+
+void get_rectified_size(const NCBKernSizeParam& param, size_t& IH2, size_t& IW2,
+                        size_t& OH2, size_t& OW2) {
+    auto&& fm = param.filter_meta;
+    auto SW = fm.stride[1];
+    auto OH = param.osz[0];
+    auto OW = param.osz[1];
+    auto FH = fm.spatial[0];
+    auto FW = fm.spatial[1];
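+    // round OW up to a multiple of 16 for whole-vector stores; IH2/IW2 are
+    // the input extents a stride-SW convolution needs to produce an
+    // OH2 x OW2 output with an FH x FW filter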
+    OH2 = OH;
+    OW2 = (OW + 15) & ~15;
+    IH2 = SW * OH + FH - SW;
+    IW2 = SW * OW2 + FW - SW;
+}
+
+void copy_padding_kern(WorkspaceBundle bundle,
+                       const ConvBiasImpl::NCBKernParam& kern_param,
+                       const ConvBiasImpl::NCBKernIndex& ncb_index) {
+    size_t IH = kern_param.isz[0];
+    size_t IW = kern_param.isz[1];
+    size_t PH = kern_param.filter_meta.padding[0];
+    size_t PW = kern_param.filter_meta.padding[1];
+    size_t IH2, IW2, OH2, OW2;
+    get_rectified_size(kern_param, IH2, IW2, OH2, OW2);
+    bool need_src_copy_var = need_src_copy(kern_param);
+    size_t padding_group_size = IH2 * IW2;
+    bundle.set(kern_param.workspace_ptr);
+
+    size_t group_id = ncb_index.ndrange_id[0],
+           batch_id = ncb_index.ndrange_id[1],
+           channel_id = ncb_index.ndrange_id[2];
+    size_t workspace_group_id = ncb_index.thread_id;
+    const int8_t* sptr = kern_param.src<int8_t>(batch_id, group_id, channel_id);
+    if (need_src_copy_var) {
+        int8_t* sptr_base = static_cast<int8_t*>(bundle.get(0)) +
+                            workspace_group_id * padding_group_size;
+        std::memset(sptr_base, 0, sizeof(int8_t) * IH2 * IW2);
+        rep(ih, IH) {
+            std::memcpy(sptr_base + (ih + PH) * IW2 + PW, sptr + ih * IW,
+                        sizeof(int8_t) * IW);
+        }
+    }
+}
+
+template <size_t filter, BiasMode bias_mode, bool is_quantized, typename Op>
+void conv_kimpl(WorkspaceBundle bundle, const NCBKernParam& kern_param,
+                const NCBKernIndex& ncb_index) {
+    size_t OH = kern_param.osz[0];
+    size_t OW = kern_param.osz[1];
+    size_t IH2, IW2, OH2, OW2;
+    get_rectified_size(kern_param, IH2, IW2, OH2, OW2);
+    bool need_src_copy_var = need_src_copy(kern_param);
+    bool need_dst_copy_var = need_dst_copy(kern_param);
+    bool need_post_process =
+            kern_param.dst_type.enumv() == DTypeEnum::QuantizedS8;
+    Op op = Op(1.0f, 4.0f);
+    if (need_post_process) {
+        float scale_bias =
+                kern_param.bias_type.param<dtype::QuantizedS32>().scale;
+        float scale_dst = kern_param.dst_type.param<dtype::QuantizedS8>().scale;
+        op = Op(scale_bias, scale_dst);
+    }
+    size_t padding_group_size = IH2 * IW2;
+    bundle.set(kern_param.workspace_ptr);
+
+    size_t workspace_group_id = ncb_index.thread_id;
+    size_t group_id = ncb_index.ndrange_id[0],
+           batch_id = ncb_index.ndrange_id[1];
+    const int8_t* sptr = kern_param.src<dt_int8>(batch_id, group_id);
+    const int8_t* fptr = kern_param.filter<dt_int8>(group_id);
+    void* dst = kern_param.dst<void>(batch_id, group_id);
+    const int32_t* bptr = kern_param.bias<dt_int32>(batch_id, group_id);
+    if (need_src_copy_var) {
+        sptr = static_cast<int8_t*>(bundle.get(0)) +
+               workspace_group_id * padding_group_size;
+    }
+    void* dptr = nullptr;
+    int32_t* tptr = nullptr;
+    if (need_dst_copy_var) {
+        dptr = reinterpret_cast<void*>(
+                reinterpret_cast<ptrdiff_t>(bundle.get(1)) +
+                ncb_index.thread_id * OH2 * OW2 * kern_param.dst_type.size());
+    } else {
+        dptr = dst;
+    }
+
+#define KERN_NEED_POST_PROCESS(filter)                                    \
+    avx2_chanwise_direct_stride1_##filter##x##filter##_int8<bias_mode,    \
+                                                            true, Op>(    \
+            sptr, fptr, bptr, tptr, static_cast<int8_t*>(dptr), IH2, IW2, \
+            OH2, OW2, op)
+
+#define KERN_NO_POST_PROCESS(filter)                                      \
+    avx2_chanwise_direct_stride1_##filter##x##filter##_int8<bias_mode,    \
+                                                            false, Op>(   \
+            sptr, fptr, bptr, static_cast<int32_t*>(dptr), nullptr, IH2,  \
+            IW2, OH2, OW2, op)
+
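+    // with post-processing, the kernel accumulates int32 partial sums in
+    // tptr and writes requantized int8 to dptr; without it, dptr itself is
+    // the int32 destination and the int8 slot stays unused (nullptr)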
+    if (need_post_process) {
+        tptr = static_cast<int32_t*>(bundle.get(2)) +
+               ncb_index.thread_id * OH2 * OW2 * kern_param.dst_type.size();
+        DISPATCH_FILTER(filter, KERN_NEED_POST_PROCESS)
+    } else {
+        DISPATCH_FILTER(filter, KERN_NO_POST_PROCESS)
+    }
+#undef KERN_NEED_POST_PROCESS
+#undef KERN_NO_POST_PROCESS
+
+    if (need_dst_copy_var) {
+        rep(oh, OH) {
+            std::memcpy(reinterpret_cast<void*>(
+                                reinterpret_cast<ptrdiff_t>(dst) +
+                                oh * OW * kern_param.dst_type.size()),
+                        reinterpret_cast<void*>(
+                                reinterpret_cast<ptrdiff_t>(dptr) +
+                                oh * OW2 * kern_param.dst_type.size()),
+                        kern_param.dst_type.size() * OW);
+        }
+    }
+}
+
+SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& kern_param,
+                                WorkspaceBundle bundle) {
+    MEGDNN_MARK_USED_VAR(kern_param);
+    auto fm = kern_param.filter_meta;
+    size_t group = fm.group;
+    size_t n = kern_param.n;
+    SmallVector<NCBKern> ncb_kerns;
+    conv_fun do_conv_fun = nullptr;
+
+#define DO_CONV_KERN_FUN(filter, bias_mode, is_quantized, op) \
+    do_conv_fun = conv_kimpl<filter, bias_mode, is_quantized, op>;
+
+#define GET_OP_PARAM(i, bias_mode, is_quantized)                    \
+    switch (kern_param.nonlineMode) {                               \
+        case param::ConvBias::NonlineMode::IDENTITY:                \
+            DO_CONV_KERN_FUN(i, bias_mode, is_quantized,            \
+                             TypeCvtOp<SIMDType::AVX2 MEGDNN_COMMA  \
+                                       dt_qint32 MEGDNN_COMMA       \
+                                       dt_qint8>)                   \
+            break;                                                  \
+        case param::ConvBias::NonlineMode::RELU:                    \
+            DO_CONV_KERN_FUN(i, bias_mode, is_quantized,            \
+                             ReluOp<SIMDType::AVX2 MEGDNN_COMMA     \
+                                    dt_qint32 MEGDNN_COMMA          \
+                                    dt_qint8>)                      \
+            break;                                                  \
+        case param::ConvBias::NonlineMode::H_SWISH:                 \
+            DO_CONV_KERN_FUN(i, bias_mode, is_quantized,            \
+                             HSwishOp<SIMDType::AVX2 MEGDNN_COMMA   \
+                                      dt_qint32 MEGDNN_COMMA        \
+                                      dt_qint8>)                    \
+            break;                                                  \
+        default:                                                    \
+            megdnn_assert(0);                                       \
+            break;                                                  \
+    }
+
+#define GET_BIAS_MODE_PARAM(i, is_quantized)                             \
+    switch (kern_param.bias_mode) {                                      \
+        case BiasMode::NO_BIAS:                                          \
+            GET_OP_PARAM(i, BiasMode::NO_BIAS, is_quantized)             \
+            break;                                                       \
+        case BiasMode::BROADCAST_CHANNEL_BIAS:                           \
+            GET_OP_PARAM(i, BiasMode::BROADCAST_CHANNEL_BIAS,            \
+                         is_quantized)                                   \
+            break;                                                       \
+        default:                                                         \
+            megdnn_assert(0);                                            \
+            break;                                                       \
+    }
+
+#define GET_QUANTIZED(i)                        \
+    switch (kern_param.dst_type.enumv()) {      \
+        case DTypeEnum::QuantizedS8:            \
+            GET_BIAS_MODE_PARAM(i, true)        \
+            break;                              \
+        case DTypeEnum::QuantizedS32:           \
+            GET_BIAS_MODE_PARAM(i, false)       \
+            break;                              \
+        case DTypeEnum::Int32:                  \
+            GET_BIAS_MODE_PARAM(i, false)       \
+            break;                              \
+        default:                                \
+            megdnn_assert(0);                   \
+            break;                              \
+    }
+
+#define DISPATCH_CONV_KERN()                     \
+    switch (kern_param.filter_meta.spatial[0]) { \
+        case 2:                                  \
+            GET_QUANTIZED(2)                     \
+            break;                               \
+        case 3:                                  \
+            GET_QUANTIZED(3)                     \
+            break;                               \
+        case 5:                                  \
+            GET_QUANTIZED(5)                     \
+            break;                               \
+        case 7:                                  \
+            GET_QUANTIZED(7)                     \
+            break;                               \
+        default:                                 \
+            megdnn_assert(0);                    \
+            break;                               \
+    }
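+    // the nested macros above resolve one conv_kimpl instantiation in four
+    // steps: filter size -> dst dtype (requantized or raw int32) -> bias
+    // mode -> nonlinearity; unsupported combinations hit megdnn_assert(0)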
+    DISPATCH_CONV_KERN();
+
+    auto exec_one_group = [bundle, do_conv_fun](const NCBKernParam& kern_param,
+                                                const NCBKernIndex& ncb_index) {
+        copy_padding_kern(bundle, kern_param, ncb_index);
+        do_conv_fun(bundle, kern_param, ncb_index);
+    };
+    ncb_kerns.push_back({exec_one_group, {group, n, 1_z}});
+    return ncb_kerns;
+}
+
+}  // namespace avx2_chanwise_stride1
+}  // namespace x86
+}  // namespace megdnn
+
+// vim: syntax=cpp.doxygen
@@ -0,0 +1,42 @@
+/**
+ * \file src/x86/conv_bias/int8/avx2_chanwise_stride1.h
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied.
+ */
+
+#pragma once
+
+#include "src/x86/conv_bias/opr_impl.h"
+
+namespace megdnn {
+namespace x86 {
+namespace avx2_chanwise_stride1 {
+
+using NCBKern = fallback::ConvBiasImpl::NCBKern;
+using NCBKernSizeParam = fallback::ConvBiasImpl::NCBKernSizeParam;
+using NCBKernParam = fallback::ConvBiasImpl::NCBKernParam;
+using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex;
+
+using conv_fun = std::function<void(WorkspaceBundle bundle,
+                                    const NCBKernParam& kern_param,
+                                    const NCBKernIndex& ncb_index)>;
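+// conv_fun is the common signature behind which get_kimpls stores whichever
+// conv_kimpl instantiation the dispatch in avx2_chanwise_stride1.cpp selects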
+
+bool need_dst_copy(const NCBKernSizeParam& param);
+
+bool need_src_copy(const NCBKernSizeParam& param);
+
+void get_rectified_size(const NCBKernSizeParam& param, size_t& IH2, size_t& IW2,
+                        size_t& OH2, size_t& OW2);
+
+SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param,
+                                WorkspaceBundle bundle);
+
+}  // namespace avx2_chanwise_stride1
+}  // namespace x86
+}  // namespace megdnn
+
+// vim: syntax=cpp.doxygen
@@ -10,7 +10,6 @@
  */
 #include "src/x86/conv_bias/int8/avx2_direct_conv_stride1.h"
-#include "src/common/unroll_macro.h"
 #include "src/x86/conv_bias/int8/common_helper.h"
 #include "src/x86/conv_bias/postprocess_helper.h"
@@ -10,7 +10,6 @@
  */
 #include "src/x86/conv_bias/int8/avx2_direct_conv_stride2.h"
-#include "src/common/unroll_macro.h"
 #include "src/x86/conv_bias/int8/common_helper.h"
 #include "src/x86/conv_bias/postprocess_helper.h"
@@ -11,6 +11,7 @@
 #pragma once
 #include <immintrin.h>
+#include "src/common/unroll_macro.h"
 #include "megdnn/arch.h"
 #ifdef WIN32CMAKE
 #include <smmintrin.h>
@@ -65,6 +65,10 @@ void* ConvBiasImpl::AlgoAVX2DirectConvStride2::type() const {
     return x86_algo_type;
 }
 
+void* ConvBiasImpl::AlgoChanWiseAvx2Stride1Qint8::type() const {
+    return x86_algo_type;
+}
+
 class ConvBiasImpl::AlgoPack : NonCopyableObj {
     AlgoDirect stride1_direct_large_group{true};
     AlgoDirect stride1_direct_small_group{false};
@@ -72,6 +76,7 @@ class ConvBiasImpl::AlgoPack : NonCopyableObj {
     AlgoDirectStride2 stride2_direct_small_group{false};
     AlgoDirectAvx2Stride1Int8 avx2_stride1_direct_int8;
     AlgoAVX2DirectConvStride2 avx2_stride2_direct;
+    AlgoChanWiseAvx2Stride1Qint8 avx2_stride1_chanwise_qint8;
     AlgoMatrixMul matmul;
 #if defined(MEGDNN_X86_WITH_MKL_DNN)
     AlgoMkldnnMatmulQint8 mkldnn_matmul_qint8;
@@ -94,6 +99,7 @@ public:
         all_algos.emplace_back(&stride2_direct_small_group);
         all_algos.emplace_back(&avx2_stride1_direct_int8);
        all_algos.emplace_back(&avx2_stride2_direct);
+        all_algos.emplace_back(&avx2_stride1_chanwise_qint8);
         all_algos.emplace_back(&matmul);
 
         static CpuOprDelegationStorage<> storage;
@@ -31,6 +31,7 @@ public:
     class AlgoMatrixMul;
     class AlgoDirectAvx2Stride1Int8;
     class AlgoAVX2DirectConvStride2;
+    class AlgoChanWiseAvx2Stride1Qint8;
 #if defined(MEGDNN_X86_WITH_MKL_DNN)
     class AlgoMkldnnConv;
     class AlgoMkldnnQint8;
@@ -258,6 +258,32 @@ struct TypeCvtOp<SIMDType::SSE4_2, dt_qint32, dt_qint8>
 };
 
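+//! converts 16 qint32 values held in two AVX2 vectors into 16 saturated
+//! qint8 values packed into one SSE register; scale is applied in float
+//! before the narrowing conversion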
 template <>
+struct TypeCvtOp<SIMDType::AVX2, dt_qint32, dt_qint8>
+        : UnaryOpBase<SIMDType::AVX2, dt_qint32, dt_qint8> {
+    using UnaryOpBase::UnaryOpBase;
+    constexpr static size_t SIMD_WIDTH = 8;
+    MEGDNN_ATTRIBUTE_TARGET("avx2")
+    void operator()(const __m256ix2& vsrc, dt_qint8* dst) const {
+        // dst is not guaranteed to be 16-byte aligned, so use the
+        // unaligned store
+        _mm_storeu_si128((__m128i*)(dst), operator()(vsrc));
+    }
+    MEGDNN_ATTRIBUTE_TARGET("avx2")
+    __m128i operator()(const __m256ix2& vsrc) const {
+        auto cvtps_src0 = _mm256_cvtepi32_ps(vsrc.val[0]);
+        auto cvtps_src1 = _mm256_cvtepi32_ps(vsrc.val[1]);
+        auto vitem0 = _mm256_mul_ps(cvtps_src0, _mm256_set1_ps(this->scale));
+        auto vitem1 = _mm256_mul_ps(cvtps_src1, _mm256_set1_ps(this->scale));
+        return QConverter::convert<__m128i, __m256x2>({{vitem0, vitem1}});
+    }
+    void operator()(src_ctype src, dst_ctype* dst) const {
+        *reinterpret_cast<int8_t*>(dst) = saturate<int8_t, float>(
+                std::round(src.as_int32() * scale), -128, 127);
+    }
+};
+
+template <>
 struct TypeCvtOp<SIMDType::SSE4_2, dt_float32, dt_qint8>
         : UnaryOpBase<SIMDType::SSE4_2, dt_float32, dt_qint8> {
     using UnaryOpBase::UnaryOpBase;
@@ -40,6 +40,165 @@ TEST_F(X86, CONV_BIAS_FORWARD) {
                 .execs({arg.src, arg.filter, arg.bias, {}, {}});
     }
 }
+
+TEST_F(X86_MULTI_THREADS, AVX2_CHANWISE_DIRECT_STRIDE1_INT8x8x32) {
+    using namespace conv_bias;
+    std::vector<TestArg> args;
+    auto run = [&](size_t ic, size_t w, size_t h, size_t kernel, size_t p,
+                   NonlineMode nonline_mode) {
+        if (w + 2 * p < kernel || h + 2 * p < kernel)
+            return;
+        param::ConvBias param;
+        param.stride_h = 1;
+        param.stride_w = 1;
+        param.pad_h = p;
+        param.pad_w = p;
+        param.nonlineMode = nonline_mode;
+        param.sparse = param::ConvBias::Sparse::GROUP;
+        //! no bias
+        args.emplace_back(param, TensorShape{2, ic, h, w},
+                          TensorShape{ic, 1, 1, kernel, kernel}, TensorShape{});
+        //! bias channel
+        args.emplace_back(param, TensorShape{2, ic, h, w},
+                          TensorShape{ic, 1, 1, kernel, kernel},
+                          TensorShape{1, ic, 1, 1});
+    };
+
+    for (size_t kernel : {2, 3, 5, 7})
+        for (size_t pad : {0, 1})
+            for (size_t ic : {1, 5, 17, 20})
+                for (size_t h : {7, 16, 38, 40})
+                    for (size_t w : {16, 25, 40, 55})
+                        for (NonlineMode nonline_mode : {NonlineMode::IDENTITY})
+                            run(ic, w, h, kernel, pad, nonline_mode);
+
+    Checker<ConvBias> checker(handle());
+    UniformIntRNG rng{-50, 50};
+    checker.set_dtype(0, dtype::Int8())
+            .set_dtype(1, dtype::Int8())
+            .set_dtype(2, dtype::Int32())
+            .set_dtype(4, dtype::Int32())
+            .set_rng(0, &rng)
+            .set_rng(1, &rng)
+            .set_rng(2, &rng)
+            .set_epsilon(1e-3);
+    checker.set_before_exec_callback(
+            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
+                    "X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1"));
+    for (auto&& arg : args) {
+        checker.set_param(arg.param).exec(
+                {arg.src, arg.filter, arg.bias, {}, {}});
+    }
+}
+
+TEST_F(X86_MULTI_THREADS, AVX2_CHANWISE_DIRECT_STRIDE1_QuantizedS32) {
+    using namespace conv_bias;
+    std::vector<TestArg> args;
+    auto run = [&](size_t ic, size_t w, size_t h, size_t kernel, size_t p,
+                   NonlineMode nonline_mode) {
+        if (w + 2 * p < kernel || h + 2 * p < kernel)
+            return;
+        param::ConvBias param;
+        param.stride_h = 1;
+        param.stride_w = 1;
+        param.pad_h = p;
+        param.pad_w = p;
+        param.nonlineMode = nonline_mode;
+        param.sparse = param::ConvBias::Sparse::GROUP;
+        //! no bias
+        args.emplace_back(param, TensorShape{2, ic, h, w},
+                          TensorShape{ic, 1, 1, kernel, kernel}, TensorShape{});
+        //! bias channel
+        args.emplace_back(param, TensorShape{2, ic, h, w},
+                          TensorShape{ic, 1, 1, kernel, kernel},
+                          TensorShape{1, ic, 1, 1});
+    };
+
+    for (size_t kernel : {2, 3, 5, 7})
+        for (size_t pad : {0, 1})
+            for (size_t ic : {1, 3, 5, 7, 17})
+                for (size_t h : {10, 17, 25, 30})
+                    for (size_t w : {19, 28, 58, 168})
+                        for (NonlineMode nonline_mode : {NonlineMode::IDENTITY})
+                            run(ic, w, h, kernel, pad, nonline_mode);
+
+    Checker<ConvBias> checker(handle());
+    UniformIntRNG rng{-50, 50};
+    checker.set_dtype(0, dtype::QuantizedS8(2.5f))
+            .set_dtype(1, dtype::QuantizedS8(2.5f))
+            .set_dtype(2, dtype::QuantizedS32(6.25f))
+            .set_dtype(4, {})
+            .set_rng(0, &rng)
+            .set_rng(1, &rng)
+            .set_rng(2, &rng)
+            .set_epsilon(1e-3);
+    checker.set_before_exec_callback(
+            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
+                    "X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1"));
+    for (auto&& arg : args) {
+        checker.set_param(arg.param).exec(
+                {arg.src, arg.filter, arg.bias, {}, {}});
+    }
+}
+
+TEST_F(X86_MULTI_THREADS, AVX2_CHANWISE_DIRECT_STRIDE1_QuantizedS8x8x8) {
+    using namespace conv_bias;
+    std::vector<TestArg> args;
+    auto run = [&](size_t ic, size_t w, size_t h, size_t kernel, size_t p,
+                   NonlineMode nonline_mode) {
+        if (w + 2 * p < kernel || h + 2 * p < kernel)
+            return;
+        param::ConvBias param;
+        param.stride_h = 1;
+        param.stride_w = 1;
+        param.pad_h = p;
+        param.pad_w = p;
+        param.nonlineMode = nonline_mode;
+        param.sparse = param::ConvBias::Sparse::GROUP;
+        //! no bias
+        args.emplace_back(param, TensorShape{2, ic, h, w},
+                          TensorShape{ic, 1, 1, kernel, kernel}, TensorShape{});
+        //! bias channel
+        args.emplace_back(param, TensorShape{2, ic, h, w},
+                          TensorShape{ic, 1, 1, kernel, kernel},
+                          TensorShape{1, ic, 1, 1});
+    };
+
+    for (size_t kernel : {2, 3, 5, 7})
+        for (size_t pad : {0, 1})
+            for (size_t ic : {1, 3, 5, 7, 17})
+                for (size_t h : {10, 15, 17, 30})
+                    for (size_t w : {19, 28, 58, 168})
+                        for (NonlineMode nonline_mode :
+                             {NonlineMode::IDENTITY, NonlineMode::H_SWISH,
+                              NonlineMode::RELU})
+                            run(ic, w, h, kernel, pad, nonline_mode);
+
+    Checker<ConvBias> checker(handle());
+    UniformIntRNG rng{-50, 50};
+    checker.set_dtype(0, dtype::QuantizedS8(2.5f))
+            .set_dtype(1, dtype::QuantizedS8(2.5f))
+            .set_dtype(2, dtype::QuantizedS32(6.25f))
+            .set_dtype(4, dtype::QuantizedS8(60.25f))
+            .set_rng(0, &rng)
+            .set_rng(1, &rng)
+            .set_rng(2, &rng)
+            .set_epsilon(1e-3);
+    checker.set_before_exec_callback(
+            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
+                    "X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1"));
+    for (auto&& arg : args) {
+        checker.set_param(arg.param).exec(
+                {arg.src, arg.filter, arg.bias, {}, {}});
+    }
+}
+
 TEST_F(X86_MULTI_THREADS, AVX2_CONV_BIAS_DIRECT_STRIDE1_INT8x8x32) {
     using namespace conv_bias;
     std::vector<TestArg> args;
@@ -1556,6 +1715,67 @@ void benchmark_impl_comp(const param::ConvBias param,
 }
 }  // namespace
 
+TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_CHANWISE_AVX2_INT8) {
+    constexpr size_t RUNS = 50;
+    param::ConvBias param;
+    param.stride_h = 1;
+    param.stride_w = 1;
+    param.sparse = param::ConvBias::Sparse::GROUP;
+    std::vector<DType> data_type = {dtype::Int8(), dtype::Int8(),
+                                    dtype::Int32(), dtype::Int32()};
+    std::vector<std::pair<SmallVector<TensorShape>, float>>
+            shapes_and_computation;
+    auto bench_case = [&](size_t N, size_t IC, size_t H, size_t W, size_t FS) {
+        param.pad_h = FS / 2;
+        param.pad_w = FS / 2;
+        SmallVector<TensorShape> shapes{
+                {N, IC, H, W}, {IC, 1, 1, FS, FS}, {}, {}, {}};
+        TensorShape dst{N, IC, (H + 2 * param.pad_h - FS) + 1,
+                        (W + 2 * param.pad_w - FS) + 1};
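+        // FS * FS multiply-accumulates per output element, counted as two
+        // ops each; the 1e-6 factor reports the total in millions of ops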
+        float computations = (FS * FS * dst.total_nr_elems() * 2) * 1e-6;
+        shapes_and_computation.push_back(std::make_pair(shapes, computations));
+    };
+
+    bench_case(1, 32, 112, 112, 7);
+    bench_case(1, 144, 56, 56, 7);
+    bench_case(1, 192, 28, 28, 7);
+    bench_case(1, 384, 28, 28, 7);
+    bench_case(1, 576, 14, 14, 7);
+    bench_case(1, 960, 7, 7, 7);
+
+    bench_case(1, 32, 112, 112, 5);
+    bench_case(1, 144, 56, 56, 5);
+    bench_case(1, 192, 28, 28, 5);
+    bench_case(1, 384, 28, 28, 5);
+    bench_case(1, 576, 14, 14, 5);
+    bench_case(1, 960, 7, 7, 5);
+
+    bench_case(1, 32, 112, 112, 3);
+    bench_case(1, 144, 56, 56, 3);
+    bench_case(1, 192, 28, 28, 3);
+    bench_case(1, 384, 28, 28, 3);
+    bench_case(1, 576, 14, 14, 3);
+    bench_case(1, 960, 7, 7, 3);
+
+    bench_case(1, 32, 112, 112, 2);
+    bench_case(1, 144, 56, 56, 2);
+    bench_case(1, 192, 28, 28, 2);
+    bench_case(1, 384, 28, 28, 2);
+    bench_case(1, 576, 14, 14, 2);
+    bench_case(1, 960, 7, 7, 2);
+
+    std::string algo_name = "X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1";
+    printf("Benchmark X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1\n");
+    benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
+                   {4, {4, 5, 6, 7}}, {1, {4}}, data_type);
+    benchmark_impl(param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}},
+                   {1, {4}}, data_type);
+    shapes_and_computation.clear();
+}
+
 TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_DIRECT_AVX2_INT8) {
     constexpr size_t RUNS = 50;
     param::ConvBias param;