GitOrigin-RevId: 90aa75d51e
tags/v1.0.0-rc1
@@ -612,7 +612,7 @@ bool PoolingImpl::AlgoFilter3ModexStridexNCHW44::usable( | |||||
(param.mode == Mode::MAX || param.mode == Mode::AVERAGE) && | (param.mode == Mode::MAX || param.mode == Mode::AVERAGE) && | ||||
FH == 3 && FW == 3 && SW == SH && (SH == 1 || SW == 2); | FH == 3 && FW == 3 && SW == SH && (SH == 1 || SW == 2); | ||||
//! Int8 not support average, because its round mode is different form | //! Int8 not support average, because its round mode is different form | ||||
//! quint8 | |||||
//! qint8 | |||||
avaible &= !(param.src_type.enumv() == DTypeEnum::Int8 && | avaible &= !(param.src_type.enumv() == DTypeEnum::Int8 && | ||||
param.mode == Mode::AVERAGE); | param.mode == Mode::AVERAGE); | ||||
return avaible; | return avaible; | ||||
@@ -705,7 +705,7 @@ bool PoolingImpl::AlgoFilter2ModexStridexNCHW44::usable( | |||||
(param.mode == Mode::MAX || param.mode == Mode::AVERAGE) && | (param.mode == Mode::MAX || param.mode == Mode::AVERAGE) && | ||||
FH == 2 && FW == 2 && SH == SW && (SW == 1 || SW == 2); | FH == 2 && FW == 2 && SH == SW && (SW == 1 || SW == 2); | ||||
//! Int8 not support average, because its round mode is different form | //! Int8 not support average, because its round mode is different form | ||||
//! quint8 | |||||
//! qint8 | |||||
avaible &= !(param.src_type.enumv() == DTypeEnum::Int8 && | avaible &= !(param.src_type.enumv() == DTypeEnum::Int8 && | ||||
param.mode == Mode::AVERAGE); | param.mode == Mode::AVERAGE); | ||||
return avaible; | return avaible; | ||||
@@ -799,7 +799,7 @@ bool PoolingImpl::AlgoFilter4ModexStridexNCHW44::usable( | |||||
FH == 4 && FW == 4 && SH == SW && (SW == 1 || SW == 2); | FH == 4 && FW == 4 && SH == SW && (SW == 1 || SW == 2); | ||||
//! Int8 not support average, because its round mode is different form | //! Int8 not support average, because its round mode is different form | ||||
//! quint8 | |||||
//! qint8 | |||||
avaible &= !(param.src_type.enumv() == DTypeEnum::Int8 && | avaible &= !(param.src_type.enumv() == DTypeEnum::Int8 && | ||||
param.mode == Mode::AVERAGE); | param.mode == Mode::AVERAGE); | ||||
return avaible; | return avaible; | ||||
@@ -892,7 +892,7 @@ bool PoolingImpl::AlgoFilter5ModexStridexNCHW44::usable( | |||||
(param.mode == Mode::MAX || param.mode == Mode::AVERAGE) && | (param.mode == Mode::MAX || param.mode == Mode::AVERAGE) && | ||||
FH == 5 && FW == 5 && SH == SW && (SW == 1 || SW == 2); | FH == 5 && FW == 5 && SH == SW && (SW == 1 || SW == 2); | ||||
//! Int8 not support average, because its round mode is different form | //! Int8 not support average, because its round mode is different form | ||||
//! quint8 | |||||
//! qint8 | |||||
avaible &= !(param.src_type.enumv() == DTypeEnum::Int8 && | avaible &= !(param.src_type.enumv() == DTypeEnum::Int8 && | ||||
param.mode == Mode::AVERAGE); | param.mode == Mode::AVERAGE); | ||||
return avaible; | return avaible; | ||||
@@ -47,7 +47,7 @@ size_t ConvBiasImpl::AlgoConv1x1::get_oc_tile_size_heuristic( | |||||
return round_up<size_t>(oc_block_size_one_thread, 24); | return round_up<size_t>(oc_block_size_one_thread, 24); | ||||
} | } | ||||
size_t ConvBiasImpl::AlgoConv1x1::get_workspace( | |||||
WorkspaceBundle ConvBiasImpl::AlgoConv1x1::get_bundle_according_packmode( | |||||
const NCBKernSizeParam& param) const { | const NCBKernSizeParam& param) const { | ||||
size_t OH = param.osz[0]; | size_t OH = param.osz[0]; | ||||
size_t OW = param.osz[1]; | size_t OW = param.osz[1]; | ||||
@@ -58,168 +58,195 @@ size_t ConvBiasImpl::AlgoConv1x1::get_workspace( | |||||
auto pack_mode = m_matmul_algo->packmode(); | auto pack_mode = m_matmul_algo->packmode(); | ||||
if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT) { | if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT) { | ||||
MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 0, 0) { | |||||
Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::DEFAULT> dispatcher; | |||||
return dispatcher | |||||
MIDOUT_BEGIN(megdnn_fallback_conv1x1, | |||||
midout_iv("get_bundle_default"_hash)) { | |||||
return Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::DEFAULT>() | |||||
.get_bundle(param, matmul_param, m_matmul_algo, | .get_bundle(param, matmul_param, m_matmul_algo, | ||||
compt_oc_block_size) | |||||
.total_size_in_bytes(); | |||||
compt_oc_block_size); | |||||
} | } | ||||
MIDOUT_END(); | MIDOUT_END(); | ||||
} else if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) { | } else if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) { | ||||
MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 0, 1) { | |||||
Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA> | |||||
dispatcher; | |||||
return dispatcher | |||||
MIDOUT_BEGIN(megdnn_fallback_conv1x1, | |||||
midout_iv("get_bundle_only_packa"_hash)) { | |||||
return Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA>() | |||||
.get_bundle(param, matmul_param, m_matmul_algo, | .get_bundle(param, matmul_param, m_matmul_algo, | ||||
compt_oc_block_size) | |||||
.total_size_in_bytes(); | |||||
compt_oc_block_size); | |||||
} | } | ||||
MIDOUT_END(); | MIDOUT_END(); | ||||
} else { | } else { | ||||
MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 0, 2) { | |||||
Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::NO_PACK> dispatcher; | |||||
return dispatcher | |||||
MIDOUT_BEGIN(megdnn_fallback_conv1x1, | |||||
midout_iv("get_bundle_no_pack"_hash)) { | |||||
return Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::NO_PACK>() | |||||
.get_bundle(param, matmul_param, m_matmul_algo, | .get_bundle(param, matmul_param, m_matmul_algo, | ||||
compt_oc_block_size) | |||||
.total_size_in_bytes(); | |||||
compt_oc_block_size); | |||||
} | } | ||||
MIDOUT_END(); | MIDOUT_END(); | ||||
} | } | ||||
return 0; | |||||
return {nullptr, {}}; | |||||
} | } | ||||
SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoConv1x1::dispatch_kerns( | |||||
size_t ConvBiasImpl::AlgoConv1x1::get_workspace( | |||||
const NCBKernSizeParam& param) const { | const NCBKernSizeParam& param) const { | ||||
SmallVector<ConvBiasImpl::NCBKern> ret_kern; | |||||
return get_bundle_according_packmode(param).total_size_in_bytes(); | |||||
} | |||||
SmallVector<ConvBiasImpl::NCBKern> | |||||
ConvBiasImpl::AlgoConv1x1::get_kerns_according_packmode( | |||||
const NCBKernSizeParam& param, bool weight_preprocess) const { | |||||
size_t OH = param.osz[0]; | size_t OH = param.osz[0]; | ||||
size_t OW = param.osz[1]; | size_t OW = param.osz[1]; | ||||
size_t OC = param.filter_meta.ocpg; | |||||
size_t compt_oc_block_size = get_oc_tile_size_heuristic(param); | size_t compt_oc_block_size = get_oc_tile_size_heuristic(param); | ||||
size_t GROUP = param.filter_meta.group; | |||||
size_t BATCH = param.n; | |||||
size_t oc_blocks_per_group = div_ceil(OC, compt_oc_block_size); | |||||
auto pack_mode = m_matmul_algo->packmode(); | |||||
Conv1x1StrategyBase* conv1x1_strategy = | |||||
Conv1x1Factory::make_conv1x1_strategy(param, pack_mode, | |||||
param.filter_meta.format); | |||||
auto matmul_param = | auto matmul_param = | ||||
utils::get_matmul_kern_param(param, OH * OW, compt_oc_block_size); | utils::get_matmul_kern_param(param, OH * OW, compt_oc_block_size); | ||||
WorkspaceBundle whole_bundle = {nullptr, {}}; | |||||
WorkspaceBundle thread_bundle = {nullptr, {}}; | |||||
WorkspaceBundle matmul_bundle = {nullptr, {}}; | |||||
auto pack_mode = m_matmul_algo->packmode(); | |||||
WorkspaceBundle whole_bundle = get_bundle_according_packmode(param); | |||||
//! NO_PACK not implement get_bundle | |||||
WorkspaceBundle matmul_bundle ={nullptr,{}}; | |||||
if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::NO_PACK) { | |||||
matmul_bundle = {nullptr, | |||||
{0, 0, m_matmul_algo->get_workspace(matmul_param)}}; | |||||
} else { | |||||
matmul_bundle = m_matmul_algo->get_bundle(matmul_param); | |||||
} | |||||
WorkspaceBundle thread_bundle = utils::get_thread_bundle( | |||||
param, matmul_bundle.get_size(2), compt_oc_block_size); | |||||
if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT) { | if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT) { | ||||
MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 1, 0) { | |||||
Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::DEFAULT> dispatcher; | |||||
whole_bundle = dispatcher.get_bundle( | |||||
param, matmul_param, m_matmul_algo, compt_oc_block_size); | |||||
matmul_bundle = m_matmul_algo->get_bundle(matmul_param); | |||||
MIDOUT_BEGIN(megdnn_fallback_conv1x1, | |||||
midout_iv("get_kern_default"_hash)) { | |||||
if (!weight_preprocess) { | |||||
return Conv1x1Kerns< | |||||
MatrixMulImpl::AlgoBase::PackMode::DEFAULT>() | |||||
.get_kern(param, whole_bundle, matmul_bundle, | |||||
thread_bundle, conv1x1_strategy, | |||||
m_matmul_algo, compt_oc_block_size); | |||||
} else { | |||||
return Conv1x1Kerns< | |||||
MatrixMulImpl::AlgoBase::PackMode::DEFAULT>() | |||||
.get_kern_preprocess(param, whole_bundle, matmul_bundle, | |||||
conv1x1_strategy, m_matmul_algo, | |||||
compt_oc_block_size); | |||||
} | |||||
} | } | ||||
MIDOUT_END(); | MIDOUT_END(); | ||||
} else if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) { | } else if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) { | ||||
MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 1, 1) { | |||||
Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA> | |||||
dispatcher; | |||||
whole_bundle = dispatcher.get_bundle( | |||||
param, matmul_param, m_matmul_algo, compt_oc_block_size); | |||||
matmul_bundle = m_matmul_algo->get_bundle(matmul_param); | |||||
MIDOUT_BEGIN(megdnn_fallback_conv1x1, | |||||
midout_iv("get_kern_only_packa"_hash)) { | |||||
if (!weight_preprocess) { | |||||
return Conv1x1Kerns< | |||||
MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA>() | |||||
.get_kern(param, whole_bundle, matmul_bundle, | |||||
thread_bundle, conv1x1_strategy, | |||||
m_matmul_algo, compt_oc_block_size); | |||||
} else { | |||||
return Conv1x1Kerns< | |||||
MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA>() | |||||
.get_kern_preprocess(param, whole_bundle, matmul_bundle, | |||||
conv1x1_strategy, m_matmul_algo, | |||||
compt_oc_block_size); | |||||
} | |||||
} | } | ||||
MIDOUT_END(); | MIDOUT_END(); | ||||
} else { | } else { | ||||
MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 1, 2) { | |||||
Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::NO_PACK> dispatcher; | |||||
whole_bundle = dispatcher.get_bundle( | |||||
param, matmul_param, m_matmul_algo, compt_oc_block_size); | |||||
matmul_bundle = { | |||||
nullptr, | |||||
{0, 0, m_matmul_algo->get_workspace(matmul_param)}}; | |||||
MIDOUT_BEGIN(megdnn_fallback_conv1x1, | |||||
midout_iv("get_kern_no_pack"_hash)) { | |||||
if (!weight_preprocess) { | |||||
return Conv1x1Kerns< | |||||
MatrixMulImpl::AlgoBase::PackMode::NO_PACK>() | |||||
.get_kern(param, whole_bundle, matmul_bundle, | |||||
thread_bundle, conv1x1_strategy, | |||||
m_matmul_algo, compt_oc_block_size); | |||||
} else { | |||||
return Conv1x1Kerns< | |||||
MatrixMulImpl::AlgoBase::PackMode::NO_PACK>() | |||||
.get_kern_preprocess(param, whole_bundle, matmul_bundle, | |||||
conv1x1_strategy, m_matmul_algo, | |||||
compt_oc_block_size); | |||||
} | |||||
} | } | ||||
MIDOUT_END(); | MIDOUT_END(); | ||||
} | } | ||||
} | |||||
//! get thread bundle | |||||
thread_bundle = utils::get_thread_bundle(param, matmul_bundle.get_size(2), | |||||
compt_oc_block_size); | |||||
Conv1x1StrategyBase* conv1x1_strategy = | |||||
Conv1x1Factory::make_conv1x1_strategy(param, pack_mode, | |||||
param.filter_meta.format); | |||||
SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoConv1x1::dispatch_kerns( | |||||
const NCBKernSizeParam& param) const { | |||||
return get_kerns_according_packmode(param, false); | |||||
} | |||||
auto kern_packA = [this, whole_bundle, matmul_bundle, param, | |||||
compt_oc_block_size, conv1x1_strategy]( | |||||
const NCBKernParam& ncb_param, | |||||
const NCBKernIndex& ncb_index) mutable { | |||||
conv1x1_strategy->packA(whole_bundle, matmul_bundle, | |||||
compt_oc_block_size, this->m_matmul_algo, param, | |||||
ncb_param, std::move(ncb_index)); | |||||
}; | |||||
auto kern_packB = [this, whole_bundle, matmul_bundle, param, | |||||
conv1x1_strategy]( | |||||
const NCBKernParam& ncb_param, | |||||
const NCBKernIndex& ncb_index) mutable { | |||||
conv1x1_strategy->packB(whole_bundle, matmul_bundle, | |||||
this->m_matmul_algo, param, ncb_param, | |||||
std::move(ncb_index)); | |||||
}; | |||||
auto kern_compt = [this, whole_bundle, matmul_bundle, thread_bundle, param, | |||||
compt_oc_block_size, conv1x1_strategy]( | |||||
const NCBKernParam& ncb_param, | |||||
const NCBKernIndex& ncb_index) mutable { | |||||
conv1x1_strategy->exec(whole_bundle, matmul_bundle, thread_bundle, | |||||
compt_oc_block_size, this->m_matmul_algo, param, | |||||
ncb_param, std::move(ncb_index)); | |||||
}; | |||||
SmallVector<TensorLayout> | |||||
ConvBiasImpl::AlgoConv1x1::deduce_preprocessed_filter_layout( | |||||
const NCBKernSizeParam& param) const { | |||||
MIDOUT_BEGIN(megdnn_fallback_conv1x1, | |||||
midout_iv("deduce_preprocessed_filter_layout"_hash)) { | |||||
WorkspaceBundle wb = get_bundle_according_packmode(param); | |||||
if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT || | |||||
pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) { | |||||
//! if enable filter preprocess kern_packA should not dispatch | |||||
if (!is_enable_filter_preprocess(param)) { | |||||
ret_kern.push_back({kern_packA, {GROUP, oc_blocks_per_group}}); | |||||
} | |||||
if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT) { | |||||
ret_kern.push_back({kern_packB, {1}}); | |||||
} | |||||
size_t GROUP = param.filter_meta.group; | |||||
SmallVector<TensorLayout> preprocessed_layouts; | |||||
preprocessed_layouts.push_back( | |||||
{{GROUP, wb.get_size(0)}, dtype::Int8()}); | |||||
return preprocessed_layouts; | |||||
} | } | ||||
ret_kern.push_back({kern_compt, {BATCH, GROUP, oc_blocks_per_group}}); | |||||
return ret_kern; | |||||
MIDOUT_END(); | |||||
return {}; | |||||
} | |||||
SmallVector<ConvBiasImpl::NCBKern> | |||||
ConvBiasImpl::AlgoConv1x1::dispatch_preprocess_kerns( | |||||
const NCBKernSizeParam& param) const { | |||||
return get_kerns_according_packmode(param, true); | |||||
} | } | ||||
bool ConvBiasImpl::AlgoConv1x1::usable(const NCBKernSizeParam& param, | bool ConvBiasImpl::AlgoConv1x1::usable(const NCBKernSizeParam& param, | ||||
AlgoSelectionStrategy) const { | AlgoSelectionStrategy) const { | ||||
MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 2) { | MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 2) { | ||||
if (param.filter_meta.format != param::ConvBias::Format::NCHW && | |||||
param.filter_meta.format != param::ConvBias::Format::NCHW44 && | |||||
param.filter_meta.format != param::ConvBias::Format::NCHW44_DOT) | |||||
return false; | |||||
size_t FH = param.filter_meta.spatial[0], | size_t FH = param.filter_meta.spatial[0], | ||||
FW = param.filter_meta.spatial[1]; | FW = param.filter_meta.spatial[1]; | ||||
size_t PH = param.filter_meta.padding[0], | size_t PH = param.filter_meta.padding[0], | ||||
PW = param.filter_meta.padding[1]; | PW = param.filter_meta.padding[1]; | ||||
size_t SH = param.filter_meta.stride[0], | size_t SH = param.filter_meta.stride[0], | ||||
SW = param.filter_meta.stride[1]; | SW = param.filter_meta.stride[1]; | ||||
if (FH != 1 || FW != 1 || PH || PW || SH != 1 || SW != 1) | |||||
auto format = param.filter_meta.format; | |||||
size_t OH = param.osz[0]; | |||||
size_t OW = param.osz[1]; | |||||
#if MEGDNN_AARCH64 || MEGDNN_ARMV7 | |||||
if (format != param::ConvBias::Format::NCHW && | |||||
format != param::ConvBias::Format::NCHW44 && | |||||
format != param::ConvBias::Format::NCHW44_DOT) { | |||||
return false; | return false; | ||||
if (param.src_type.enumv() != param.filter_type.enumv()) { | |||||
} | |||||
//! hybird mode is not support | |||||
if (param.filter_meta.format == param::ConvBias::Format::NCHW44 || | |||||
param.filter_meta.format == param::ConvBias::Format::NCHW44_DOT) { | |||||
if (param.filter_meta.icpg < 4_z || param.filter_meta.icpg == 1 || | |||||
param.filter_meta.ocpg == 1) { | |||||
return false; | |||||
} | |||||
} | |||||
#else | |||||
if (format != param::ConvBias::Format::NCHW) { | |||||
return false; | return false; | ||||
} | } | ||||
//! only matmul's packmode is packa or default support weight preprocess | |||||
if (is_enable_filter_preprocess(param) && | |||||
(m_matmul_algo->packmode() == | |||||
fallback::MatrixMulImpl::AlgoBase::PackMode::NO_PACK)) { | |||||
#endif | |||||
//! param | |||||
if (FH != 1 || FW != 1 || PH || PW || SH != 1 || SW != 1) { | |||||
return false; | return false; | ||||
} | } | ||||
if (param.src_type.enumv() != DTypeEnum::Int8 && | |||||
param.src_type.enumv() != DTypeEnum::QuantizedS8 && | |||||
param.src_type.enumv() != DTypeEnum::Quantized8Asymm && | |||||
//! data type | |||||
if (param.src_type.enumv() != param.filter_type.enumv() || | |||||
(param.src_type.enumv() != DTypeEnum::Int8 && | |||||
param.src_type.enumv() != DTypeEnum::QuantizedS8 && | |||||
param.src_type.enumv() != DTypeEnum::Quantized8Asymm && | |||||
#if !MEGDNN_DISABLE_FLOAT16 | #if !MEGDNN_DISABLE_FLOAT16 | ||||
param.src_type.enumv() != DTypeEnum::Float16 && | |||||
param.src_type.enumv() != DTypeEnum::Float16 && | |||||
#endif | #endif | ||||
param.src_type.enumv() != DTypeEnum::Float32) { | |||||
param.src_type.enumv() != DTypeEnum::Float32)) { | |||||
return false; | return false; | ||||
} | } | ||||
//! make sure 8x8x16 and 8x8x32 biasmode is nobias and nonlineMode | //! make sure 8x8x16 and 8x8x32 biasmode is nobias and nonlineMode | ||||
//! is identity otherwise return false mean that 8x8x32 and 8x8x16 | //! is identity otherwise return false mean that 8x8x32 and 8x8x16 | ||||
//! not support PostProcess | //! not support PostProcess | ||||
@@ -231,27 +258,13 @@ bool ConvBiasImpl::AlgoConv1x1::usable(const NCBKernSizeParam& param, | |||||
return false; | return false; | ||||
} | } | ||||
} | } | ||||
if (param.filter_meta.format == param::ConvBias::Format::NCHW44 || | |||||
param.filter_meta.format == param::ConvBias::Format::NCHW44_DOT) { | |||||
if (param.filter_meta.icpg < 4_z || param.filter_meta.icpg == 1 || | |||||
param.filter_meta.ocpg == 1) { | |||||
return false; | |||||
} | |||||
} | |||||
size_t OH = param.osz[0]; | |||||
size_t OW = param.osz[1]; | |||||
MatrixMulImpl::KernSizeParam matmul_param = | MatrixMulImpl::KernSizeParam matmul_param = | ||||
utils::get_matmul_kern_param(param, OH * OW, | utils::get_matmul_kern_param(param, OH * OW, | ||||
get_oc_tile_size_heuristic(param)); | get_oc_tile_size_heuristic(param)); | ||||
bool matmul_usable = m_matmul_algo->usable(matmul_param); | bool matmul_usable = m_matmul_algo->usable(matmul_param); | ||||
auto pack_mode = m_matmul_algo->packmode(); | auto pack_mode = m_matmul_algo->packmode(); | ||||
bool strategy_usable = Conv1x1Factory::can_make_conv1x1_strategy( | bool strategy_usable = Conv1x1Factory::can_make_conv1x1_strategy( | ||||
param, pack_mode, param.filter_meta.format); | param, pack_mode, param.filter_meta.format); | ||||
return matmul_usable && strategy_usable && | return matmul_usable && strategy_usable && | ||||
(param.filter_meta.dilation[0] == | (param.filter_meta.dilation[0] == | ||||
param.filter_meta.dilation[1] && | param.filter_meta.dilation[1] && | ||||
@@ -262,121 +275,6 @@ bool ConvBiasImpl::AlgoConv1x1::usable(const NCBKernSizeParam& param, | |||||
return false; | return false; | ||||
} | } | ||||
SmallVector<TensorLayout> | |||||
ConvBiasImpl::AlgoConv1x1::deduce_preprocessed_filter_layout( | |||||
const NCBKernSizeParam& param) const { | |||||
MIDOUT_BEGIN( | |||||
megdnn_fallback_conv1x1, | |||||
midout_iv( | |||||
"ConvBiasImpl::AlgoConv1x1::deduce_preprocessed_filter_layout"_hash)) { | |||||
fallback::MatrixMulImpl::AlgoBase::MatmulDescription matmul_desc = | |||||
m_matmul_algo->matmul_description(); | |||||
bool default_pack = matmul_desc.packmode == | |||||
MatrixMulImpl::AlgoBase::PackMode::DEFAULT; | |||||
bool only_packA = matmul_desc.packmode == | |||||
MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA; | |||||
//! only support default_pack and only_packa mode | |||||
if (matmul_desc.packmode == | |||||
MatrixMulImpl::AlgoBase::PackMode::NO_PACK) { | |||||
return {}; | |||||
} | |||||
size_t OH = param.osz[0]; | |||||
size_t OW = param.osz[1]; | |||||
size_t compt_oc_block_size = get_oc_tile_size_heuristic(param); | |||||
auto matmul_param = utils::get_matmul_kern_param(param, OH * OW, | |||||
compt_oc_block_size); | |||||
WorkspaceBundle wb(nullptr, {}); | |||||
if (default_pack) { | |||||
Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::DEFAULT> dispatcher; | |||||
wb = dispatcher.get_bundle(param, matmul_param, m_matmul_algo, | |||||
compt_oc_block_size); | |||||
} else if (only_packA) { | |||||
Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA> | |||||
dispatcher; | |||||
wb = dispatcher.get_bundle(param, matmul_param, m_matmul_algo, | |||||
compt_oc_block_size); | |||||
} | |||||
size_t GROUP = param.filter_meta.group; | |||||
SmallVector<TensorLayout> preprocessed_layouts; | |||||
preprocessed_layouts.push_back( | |||||
{{GROUP, wb.get_size(0)}, dtype::Int8()}); | |||||
return preprocessed_layouts; | |||||
} | |||||
MIDOUT_END(); | |||||
return {}; | |||||
} | |||||
SmallVector<ConvBiasImpl::NCBKern> | |||||
ConvBiasImpl::AlgoConv1x1::dispatch_preprocess_kerns( | |||||
const NCBKernSizeParam& param) const { | |||||
MIDOUT_BEGIN( | |||||
megdnn_fallback_conv1x1, | |||||
midout_iv( | |||||
"ConvBiasImpl::AlgoConv1x1::dispatch_preprocess_kerns"_hash)) { | |||||
SmallVector<ConvBiasImpl::NCBKern> ret_kern; | |||||
size_t OH = param.osz[0]; | |||||
size_t OW = param.osz[1]; | |||||
size_t OC = param.filter_meta.ocpg; | |||||
size_t compt_oc_block_size = get_oc_tile_size_heuristic(param); | |||||
size_t GROUP = param.filter_meta.group; | |||||
size_t oc_blocks_per_group = div_ceil(OC, compt_oc_block_size); | |||||
auto matmul_param = utils::get_matmul_kern_param(param, OH * OW, | |||||
compt_oc_block_size); | |||||
WorkspaceBundle whole_bundle = {nullptr, {}}; | |||||
WorkspaceBundle matmul_bundle = {nullptr, {}}; | |||||
auto pack_mode = m_matmul_algo->packmode(); | |||||
if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT) { | |||||
MIDOUT_BEGIN(megdnn_fallback_conv1x1, | |||||
midout_iv("get_defaul_matmul_packmode_bundle"_hash)) { | |||||
Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::DEFAULT> | |||||
dispatcher; | |||||
whole_bundle = dispatcher.get_bundle(param, matmul_param, | |||||
m_matmul_algo, | |||||
compt_oc_block_size); | |||||
matmul_bundle = m_matmul_algo->get_bundle(matmul_param); | |||||
} | |||||
MIDOUT_END(); | |||||
} else if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) { | |||||
MIDOUT_BEGIN( | |||||
megdnn_fallback_conv1x1, | |||||
midout_iv("get_onlypacka_matmul_packmode_bundle"_hash)) { | |||||
Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA> | |||||
dispatcher; | |||||
whole_bundle = dispatcher.get_bundle(param, matmul_param, | |||||
m_matmul_algo, | |||||
compt_oc_block_size); | |||||
matmul_bundle = m_matmul_algo->get_bundle(matmul_param); | |||||
} | |||||
MIDOUT_END(); | |||||
} else { | |||||
//! if nopack return null so that OprWeightPreprocessProxy can run | |||||
//! with nopack mode | |||||
return {}; | |||||
} | |||||
Conv1x1StrategyBase* conv1x1_strategy = | |||||
Conv1x1Factory::make_conv1x1_strategy(param, pack_mode, | |||||
param.filter_meta.format); | |||||
auto kern_packA = [this, whole_bundle, matmul_bundle, param, | |||||
compt_oc_block_size, conv1x1_strategy]( | |||||
const NCBKernParam& ncb_param, | |||||
const NCBKernIndex& ncb_index) mutable { | |||||
conv1x1_strategy->packA(whole_bundle, matmul_bundle, | |||||
compt_oc_block_size, this->m_matmul_algo, | |||||
param, ncb_param, std::move(ncb_index)); | |||||
}; | |||||
ret_kern.push_back({kern_packA, {GROUP, oc_blocks_per_group}}); | |||||
return ret_kern; | |||||
} | |||||
MIDOUT_END(); | |||||
return {}; | |||||
} | |||||
bool ConvBiasImpl::AlgoConv1x1::is_preferred( | bool ConvBiasImpl::AlgoConv1x1::is_preferred( | ||||
const NCBKernSizeParam& param) const { | const NCBKernSizeParam& param) const { | ||||
@@ -20,6 +20,11 @@ namespace megdnn { | |||||
namespace fallback { | namespace fallback { | ||||
class ConvBiasImpl::AlgoConv1x1 final : public AlgoBase { | class ConvBiasImpl::AlgoConv1x1 final : public AlgoBase { | ||||
WorkspaceBundle get_bundle_according_packmode( | |||||
const NCBKernSizeParam& param) const; | |||||
SmallVector<NCBKern> get_kerns_according_packmode( | |||||
const NCBKernSizeParam& param, bool weight_preprocess) const; | |||||
public: | public: | ||||
AlgoConv1x1(MatrixMulImpl::AlgoBase* matmul_algo, size_t oc_block_size) | AlgoConv1x1(MatrixMulImpl::AlgoBase* matmul_algo, size_t oc_block_size) | ||||
: m_matmul_algo(matmul_algo), m_oc_block_size(oc_block_size) {} | : m_matmul_algo(matmul_algo), m_oc_block_size(oc_block_size) {} | ||||
@@ -41,7 +46,7 @@ public: | |||||
const NCBKernSizeParam& param) const override; | const NCBKernSizeParam& param) const override; | ||||
bool is_preferred(const NCBKernSizeParam&) const override; | bool is_preferred(const NCBKernSizeParam&) const override; | ||||
SmallVector<TensorLayout> deduce_preprocessed_filter_layout( | SmallVector<TensorLayout> deduce_preprocessed_filter_layout( | ||||
const NCBKernSizeParam& param) const override; | const NCBKernSizeParam& param) const override; | ||||
size_t get_preprocess_workspace( | size_t get_preprocess_workspace( | ||||
@@ -360,23 +360,23 @@ ConvBiasImpl::AlgoConv1x1Gemv::dispatch_kerns( | |||||
dt_uint8, PostprocessMode::QUANTIZED, | dt_uint8, PostprocessMode::QUANTIZED, | ||||
"NCHW::GEMV::QUINT8x8x32_QUINT8"_hash); | "NCHW::GEMV::QUINT8x8x32_QUINT8"_hash); | ||||
break; | break; | ||||
//!no support nchw44 8x8x16 | |||||
case param::ConvBias::Format::NCHW44: | case param::ConvBias::Format::NCHW44: | ||||
cb1(param::ConvBias::Format::NCHW44, dt_float32, dt_float32, | cb1(param::ConvBias::Format::NCHW44, dt_float32, dt_float32, | ||||
PostprocessMode::FLOAT, "NCHW44::GEMV::FLOAT"_hash); | PostprocessMode::FLOAT, "NCHW44::GEMV::FLOAT"_hash); | ||||
cb2(param::ConvBias::Format::NCHW44, dt_int8, dt_int32, dt_int32, | |||||
dt_int8, dt_int32, dt_int32, PostprocessMode::NO_PROCESS, | |||||
cb3(param::ConvBias::Format::NCHW44, dt_int8, dt_int32, dt_int32, | |||||
dt_int8, dt_int32, dt_int32, PostprocessMode::ADD_BIAS, | |||||
"NCHW44::GEMV::INT8x8x32_INT32"_hash); | "NCHW44::GEMV::INT8x8x32_INT32"_hash); | ||||
cb2(param::ConvBias::Format::NCHW44, dtype::QuantizedS8, | |||||
cb3(param::ConvBias::Format::NCHW44, dtype::QuantizedS8, | |||||
dtype::QuantizedS32, dtype::QuantizedS32, dt_int8, dt_int32, | dtype::QuantizedS32, dtype::QuantizedS32, dt_int8, dt_int32, | ||||
dt_int32, PostprocessMode::NO_PROCESS, | |||||
dt_int32, PostprocessMode::ADD_BIAS, | |||||
"NCHW44::GEMV::QINT8x8x32_QINT32"_hash); | "NCHW44::GEMV::QINT8x8x32_QINT32"_hash); | ||||
cb2(param::ConvBias::Format::NCHW44, dtype::QuantizedS8, | cb2(param::ConvBias::Format::NCHW44, dtype::QuantizedS8, | ||||
dtype::QuantizedS32, dtype::QuantizedS8, dt_int8, dt_int32, | dtype::QuantizedS32, dtype::QuantizedS8, dt_int8, dt_int32, | ||||
dt_int8, PostprocessMode::QUANTIZED, | dt_int8, PostprocessMode::QUANTIZED, | ||||
"NCHW44::GEMV::QINT8x8x32_QINT8"_hash); | "NCHW44::GEMV::QINT8x8x32_QINT8"_hash); | ||||
break; | break; | ||||
//!no support nchw44-dot 8x8x16 | |||||
case param::ConvBias::Format::NCHW44_DOT: | case param::ConvBias::Format::NCHW44_DOT: | ||||
cb3(param::ConvBias::Format::NCHW44_DOT, dt_int8, dt_int32, | cb3(param::ConvBias::Format::NCHW44_DOT, dt_int8, dt_int32, | ||||
dt_int32, dt_int8, dt_int32, dt_int32, | dt_int32, dt_int8, dt_int32, dt_int32, | ||||
@@ -420,81 +420,74 @@ bool ConvBiasImpl::AlgoConv1x1Gemv::usable(const NCBKernSizeParam& param, | |||||
MIDOUT_BEGIN(megdnn_fallback_conv1x1_gemv, | MIDOUT_BEGIN(megdnn_fallback_conv1x1_gemv, | ||||
midout_iv("AlgoConv1x1Gemv::usable"_hash)) { | midout_iv("AlgoConv1x1Gemv::usable"_hash)) { | ||||
auto format = param.filter_meta.format; | auto format = param.filter_meta.format; | ||||
#if MEGDNN_X86 | |||||
if (format != param::ConvBias::Format::NCHW) | |||||
return false; | |||||
#elif MEGDNN_AARCH64 || MEGDNN_ARMV7 | |||||
if (format != param::ConvBias::Format::NCHW && | |||||
format != param::ConvBias::Format::NCHW44 && | |||||
format != param::ConvBias::Format::NCHW44_DOT) | |||||
return false; | |||||
#endif | |||||
//! whether 1x1 | |||||
size_t FH = param.filter_meta.spatial[0], | size_t FH = param.filter_meta.spatial[0], | ||||
FW = param.filter_meta.spatial[1]; | FW = param.filter_meta.spatial[1]; | ||||
size_t PH = param.filter_meta.padding[0], | size_t PH = param.filter_meta.padding[0], | ||||
PW = param.filter_meta.padding[1]; | PW = param.filter_meta.padding[1]; | ||||
size_t SH = param.filter_meta.stride[0], | size_t SH = param.filter_meta.stride[0], | ||||
SW = param.filter_meta.stride[1]; | SW = param.filter_meta.stride[1]; | ||||
if (FH != 1 || FW != 1 || PH || PW || SH != 1 || SW != 1) { | |||||
return false; | |||||
} | |||||
//! whether gemv | |||||
size_t OH = param.osz[0]; | size_t OH = param.osz[0]; | ||||
size_t OW = param.osz[1]; | size_t OW = param.osz[1]; | ||||
if (OH * OW != 1) { | |||||
//! whether gemv and 1x1 | |||||
if (OH * OW != 1 || FH != 1 || FW != 1 || PH || PW || SH != 1 || | |||||
SW != 1) { | |||||
return false; | return false; | ||||
} | } | ||||
//! even no naive support in gemv | |||||
if ((param.src_type.enumv() == param.filter_type.enumv() && | |||||
param.src_type.enumv() == DTypeEnum::Int16) && | |||||
param.dst_type.enumv() == DTypeEnum::Int32) { | |||||
#if MEGDNN_AARCH64 || MEGDNN_ARMV7 | |||||
if (format != param::ConvBias::Format::NCHW && | |||||
format != param::ConvBias::Format::NCHW44 && | |||||
format != param::ConvBias::Format::NCHW44_DOT) { | |||||
return false; | return false; | ||||
} | } | ||||
//! make sure 8x8x16 and 8x8x32 biasmode is nobias and nonlineMode | |||||
//! is identity otherwise return false mean that 8x8x32 and 8x8x16 | |||||
//! not support PostProcess | |||||
if (param.dst_type.enumv() == DTypeEnum::Int16 || | |||||
param.dst_type.enumv() == DTypeEnum::Int32 || | |||||
param.dst_type.enumv() == DTypeEnum::QuantizedS32) { | |||||
if (param.nonlineMode != megdnn::NonlineMode::IDENTITY) { | |||||
return false; | |||||
} | |||||
} | |||||
//! supports a few dtypes | |||||
if (param.src_type.enumv() != param.filter_type.enumv()) { | |||||
#else | |||||
if (format != param::ConvBias::Format::NCHW) { | |||||
return false; | return false; | ||||
} | } | ||||
if (param.src_type.enumv() != DTypeEnum::Int8 && | |||||
param.src_type.enumv() != DTypeEnum::QuantizedS8 && | |||||
param.src_type.enumv() != DTypeEnum::Quantized8Asymm && | |||||
#endif | |||||
//! supports a few dtypes | |||||
if (param.src_type.enumv() != param.filter_type.enumv() || | |||||
(param.src_type.enumv() != DTypeEnum::Int8 && | |||||
param.src_type.enumv() != DTypeEnum::QuantizedS8 && | |||||
param.src_type.enumv() != DTypeEnum::Quantized8Asymm && | |||||
#if !MEGDNN_DISABLE_FLOAT16 | #if !MEGDNN_DISABLE_FLOAT16 | ||||
param.src_type.enumv() != DTypeEnum::Float16 && | |||||
param.src_type.enumv() != DTypeEnum::Float16 && | |||||
#endif | #endif | ||||
param.src_type.enumv() != DTypeEnum::Float32) { | |||||
param.src_type.enumv() != DTypeEnum::Float32)) { | |||||
return false; | return false; | ||||
} | } | ||||
#if MEGDNN_AARCH64 || MEGDNN_ARMV7 | |||||
if (format == param::ConvBias::Format::NCHW44) { | if (format == param::ConvBias::Format::NCHW44) { | ||||
if (param.src_type.enumv() != DTypeEnum::Float32 && | if (param.src_type.enumv() != DTypeEnum::Float32 && | ||||
param.src_type.enumv() != DTypeEnum::Int8 && | param.src_type.enumv() != DTypeEnum::Int8 && | ||||
param.src_type.enumv() != DTypeEnum::QuantizedS8) { | param.src_type.enumv() != DTypeEnum::QuantizedS8) { | ||||
return false; | return false; | ||||
} | } | ||||
//! 8x8x16 is not supported in nchw44 | |||||
if (param.src_type.enumv() == DTypeEnum::Int8 && | |||||
param.dst_type.enumv() == DTypeEnum::Int16) { | |||||
return false; | |||||
} | |||||
} else if (format == param::ConvBias::Format::NCHW44_DOT) { | } else if (format == param::ConvBias::Format::NCHW44_DOT) { | ||||
if (param.src_type.enumv() != DTypeEnum::Int8 && | |||||
param.src_type.enumv() != DTypeEnum::QuantizedS8) { | |||||
if ((param.src_type.enumv() != DTypeEnum::Int8 && | |||||
param.src_type.enumv() != DTypeEnum::QuantizedS8) || | |||||
param.dst_type.enumv() == DTypeEnum::Int16) { | |||||
return false; | return false; | ||||
} | } | ||||
} | } | ||||
#endif | |||||
//! make sure the nonlineMode of 8x8x16 and 8x8x32 is identity, | |||||
//! otherwise return false | |||||
if (param.dst_type.enumv() == DTypeEnum::Int16 || | |||||
param.dst_type.enumv() == DTypeEnum::Int32 || | |||||
param.dst_type.enumv() == DTypeEnum::QuantizedS32) { | |||||
if (param.nonlineMode != megdnn::NonlineMode::IDENTITY) { | |||||
return false; | |||||
} | |||||
} | |||||
//! even no naive support in gemv | |||||
if ((param.src_type.enumv() == param.filter_type.enumv() && | |||||
param.src_type.enumv() == DTypeEnum::Int16) && | |||||
param.dst_type.enumv() == DTypeEnum::Int32) { | |||||
return false; | |||||
} | |||||
return (param.filter_meta.dilation[0] == | return (param.filter_meta.dilation[0] == | ||||
param.filter_meta.dilation[1] && | param.filter_meta.dilation[1] && | ||||
param.filter_meta.dilation[0] == 1) && | param.filter_meta.dilation[0] == 1) && | ||||
@@ -11,14 +11,19 @@ | |||||
#pragma once | #pragma once | ||||
#include "src/fallback/conv_bias/conv1x1/conv1x1_strategy.h" | |||||
#include "src/fallback/conv_bias/conv1x1/conv1x1_utils.h" | #include "src/fallback/conv_bias/conv1x1/conv1x1_utils.h" | ||||
#include "src/fallback/conv_bias/opr_impl.h" | |||||
namespace megdnn { | namespace megdnn { | ||||
namespace fallback { | namespace fallback { | ||||
namespace conv1x1 { | namespace conv1x1 { | ||||
template <MatrixMulImpl::AlgoBase::PackMode pack_mode> | template <MatrixMulImpl::AlgoBase::PackMode pack_mode> | ||||
class Conv1x1Kerns { | |||||
class Conv1x1Kerns; | |||||
template <> | |||||
class Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::DEFAULT> { | |||||
public: | public: | ||||
//! get_bundle | //! get_bundle | ||||
WorkspaceBundle get_bundle(const ConvBiasImpl::NCBKernSizeParam& param, | WorkspaceBundle get_bundle(const ConvBiasImpl::NCBKernSizeParam& param, | ||||
@@ -28,13 +33,12 @@ public: | |||||
size_t GROUP = param.filter_meta.group; | size_t GROUP = param.filter_meta.group; | ||||
size_t OC = param.filter_meta.ocpg; | size_t OC = param.filter_meta.ocpg; | ||||
size_t BATCH = param.n; | size_t BATCH = param.n; | ||||
//! bundle per thread | //! bundle per thread | ||||
//! matmul_param records a matmul with M = oc_tile_size, K = IC, N = OH | //! matmul_param records a matmul with M = oc_tile_size, K = IC, N = OH | ||||
//! * OW this does not bother packb bytes | //! * OW this does not bother packb bytes | ||||
auto matmul_bundle = matmul_algo->get_bundle(matmul_param); | auto matmul_bundle = matmul_algo->get_bundle(matmul_param); | ||||
auto thread_bundle = utils::get_thread_bundle(param, matmul_bundle.get_size(2), | |||||
oc_tile_size); | |||||
auto thread_bundle = utils::get_thread_bundle( | |||||
param, matmul_bundle.get_size(2), oc_tile_size); | |||||
//! size per thread | //! size per thread | ||||
size_t all_threads_bytes = | size_t all_threads_bytes = | ||||
thread_bundle.total_size_in_bytes() * param.nr_threads; | thread_bundle.total_size_in_bytes() * param.nr_threads; | ||||
@@ -46,11 +50,6 @@ public: | |||||
is_enable_filter_preprocess(param) | is_enable_filter_preprocess(param) | ||||
? 0 | ? 0 | ||||
: packa_bytes_per_oc_tile * oc_tiles_per_group * GROUP; | : packa_bytes_per_oc_tile * oc_tiles_per_group * GROUP; | ||||
if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) | |||||
return WorkspaceBundle{nullptr, | |||||
{all_packa_bytes, 0, all_threads_bytes}}; | |||||
//! packb size = N * GROUP * packb_size_per_group | //! packb size = N * GROUP * packb_size_per_group | ||||
size_t packb_bytes_per_group = matmul_bundle.get_size(1); | size_t packb_bytes_per_group = matmul_bundle.get_size(1); | ||||
size_t all_packb_bytes = packb_bytes_per_group * GROUP * BATCH; | size_t all_packb_bytes = packb_bytes_per_group * GROUP * BATCH; | ||||
@@ -58,6 +57,165 @@ public: | |||||
return WorkspaceBundle{ | return WorkspaceBundle{ | ||||
nullptr, {all_packa_bytes, all_packb_bytes, all_threads_bytes}}; | nullptr, {all_packa_bytes, all_packb_bytes, all_threads_bytes}}; | ||||
} | } | ||||
//! Assemble the kernels for the DEFAULT pack mode.
//! The returned list holds, in order: a packA kernel over
//! {GROUP, oc_blocks_per_group} (only when filter preprocess is not
//! enabled), a packB kernel over {BATCH} and the compute kernel over
//! {BATCH, GROUP, oc_blocks_per_group}. Every kernel captures the bundles,
//! \p param and \p oc_block_size by value and forwards them to the matching
//! method of \p conv1x1_strategy.
SmallVector<ConvBiasImpl::NCBKern> get_kern(
        const ConvBiasImpl::NCBKernSizeParam& param,
        WorkspaceBundle& whole_bundle, WorkspaceBundle& matmul_bundle,
        WorkspaceBundle& thread_bundle,
        Conv1x1StrategyBase* conv1x1_strategy,
        const MatrixMulImpl::AlgoBase* matmul_algo, size_t oc_block_size) {
    //! iteration ranges of the kernels
    size_t group_count = param.filter_meta.group;
    size_t batch_count = param.n;
    size_t oc_per_group = param.filter_meta.ocpg;
    size_t oc_block_count = div_ceil(oc_per_group, oc_block_size);

    //! pack the filter (matrix A) of one oc block
    auto pack_a_kern =
            [conv1x1_strategy, matmul_algo, param, oc_block_size,
             whole_bundle, matmul_bundle](
                    const ConvBiasImpl::NCBKernParam& kern_param,
                    const ConvBiasImpl::NCBKernIndex& kern_index) mutable {
                conv1x1_strategy->packA(whole_bundle, matmul_bundle,
                                        oc_block_size, matmul_algo, param,
                                        kern_param, std::move(kern_index));
            };
    //! pack the source (matrix B)
    auto pack_b_kern =
            [conv1x1_strategy, matmul_algo, param, whole_bundle,
             matmul_bundle](
                    const ConvBiasImpl::NCBKernParam& kern_param,
                    const ConvBiasImpl::NCBKernIndex& kern_index) mutable {
                conv1x1_strategy->packB(whole_bundle, matmul_bundle,
                                        matmul_algo, param, kern_param,
                                        std::move(kern_index));
            };
    //! run the strategy's exec step for one (batch, group, oc block)
    auto compute_kern =
            [conv1x1_strategy, matmul_algo, param, oc_block_size,
             whole_bundle, matmul_bundle, thread_bundle](
                    const ConvBiasImpl::NCBKernParam& kern_param,
                    const ConvBiasImpl::NCBKernIndex& kern_index) mutable {
                conv1x1_strategy->exec(whole_bundle, matmul_bundle,
                                       thread_bundle, oc_block_size,
                                       matmul_algo, param, kern_param,
                                       std::move(kern_index));
            };

    SmallVector<ConvBiasImpl::NCBKern> kerns;
    //! packA is not scheduled here when filter preprocess is enabled
    if (!is_enable_filter_preprocess(param)) {
        kerns.push_back({pack_a_kern, {group_count, oc_block_count}});
    }
    kerns.push_back({pack_b_kern, {batch_count}});
    kerns.push_back(
            {compute_kern, {batch_count, group_count, oc_block_count}});
    return kerns;
}
//! Assemble the preprocess kernels for the DEFAULT pack mode.
//! The result is a single packA kernel over {GROUP, oc_blocks_per_group}
//! that forwards to conv1x1_strategy->packA with by-value copies of the
//! bundles, \p param and \p oc_block_size.
SmallVector<ConvBiasImpl::NCBKern> get_kern_preprocess(
        const ConvBiasImpl::NCBKernSizeParam& param,
        WorkspaceBundle& whole_bundle, WorkspaceBundle& matmul_bundle,
        Conv1x1StrategyBase* conv1x1_strategy,
        const MatrixMulImpl::AlgoBase* matmul_algo, size_t oc_block_size) {
    //! iteration range of the packA kernel
    size_t group_count = param.filter_meta.group;
    size_t oc_per_group = param.filter_meta.ocpg;
    size_t oc_block_count = div_ceil(oc_per_group, oc_block_size);

    auto pack_a_kern =
            [conv1x1_strategy, matmul_algo, param, oc_block_size,
             whole_bundle, matmul_bundle](
                    const ConvBiasImpl::NCBKernParam& kern_param,
                    const ConvBiasImpl::NCBKernIndex& kern_index) mutable {
                conv1x1_strategy->packA(whole_bundle, matmul_bundle,
                                        oc_block_size, matmul_algo, param,
                                        kern_param, std::move(kern_index));
            };

    SmallVector<ConvBiasImpl::NCBKern> kerns;
    kerns.push_back({pack_a_kern, {group_count, oc_block_count}});
    return kerns;
}
}; | |||||
template<> | |||||
class Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA> { | |||||
public: | |||||
//! get_bundle | |||||
WorkspaceBundle get_bundle(const ConvBiasImpl::NCBKernSizeParam& param, | |||||
const MatrixMulImpl::KernSizeParam& matmul_param, | |||||
const MatrixMulImpl::AlgoBase* matmul_algo, | |||||
size_t oc_tile_size) { | |||||
size_t GROUP = param.filter_meta.group; | |||||
size_t OC = param.filter_meta.ocpg; | |||||
//! bundle per thread | |||||
//! matmul_param records a matmul with M = oc_tile_size, K = IC, N = OH | |||||
//! * OW this does not bother packb bytes | |||||
auto matmul_bundle = matmul_algo->get_bundle(matmul_param); | |||||
auto thread_bundle = utils::get_thread_bundle( | |||||
param, matmul_bundle.get_size(2), oc_tile_size); | |||||
//! size per thread | |||||
size_t all_threads_bytes = | |||||
thread_bundle.total_size_in_bytes() * param.nr_threads; | |||||
//! packa size = GROUP * packa_size_each_group | |||||
size_t packa_bytes_per_oc_tile = matmul_bundle.get_size(0); | |||||
size_t oc_tiles_per_group = div_ceil(OC, oc_tile_size); | |||||
size_t all_packa_bytes = | |||||
is_enable_filter_preprocess(param) | |||||
? 0 | |||||
: packa_bytes_per_oc_tile * oc_tiles_per_group * GROUP; | |||||
return WorkspaceBundle{nullptr, | |||||
{all_packa_bytes, 0, all_threads_bytes}}; | |||||
} | |||||
SmallVector<ConvBiasImpl::NCBKern> get_kern( | |||||
const ConvBiasImpl::NCBKernSizeParam& param, | |||||
WorkspaceBundle& whole_bundle, WorkspaceBundle& matmul_bundle, | |||||
WorkspaceBundle& thread_bundle, | |||||
Conv1x1StrategyBase* conv1x1_strategy, | |||||
const MatrixMulImpl::AlgoBase* matmul_algo, size_t oc_block_size) { | |||||
auto kern_packA = | |||||
[whole_bundle, matmul_bundle, param, matmul_algo, oc_block_size, | |||||
conv1x1_strategy]( | |||||
const ConvBiasImpl::NCBKernParam& ncb_param, | |||||
const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { | |||||
conv1x1_strategy->packA(whole_bundle, matmul_bundle, | |||||
oc_block_size, matmul_algo, param, | |||||
ncb_param, std::move(ncb_index)); | |||||
}; | |||||
auto kern_compt = | |||||
[whole_bundle, matmul_bundle, thread_bundle, matmul_algo, param, | |||||
oc_block_size, conv1x1_strategy]( | |||||
const ConvBiasImpl::NCBKernParam& ncb_param, | |||||
const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { | |||||
conv1x1_strategy->exec(whole_bundle, matmul_bundle, | |||||
thread_bundle, oc_block_size, | |||||
matmul_algo, param, ncb_param, | |||||
std::move(ncb_index)); | |||||
}; | |||||
size_t GROUP = param.filter_meta.group; | |||||
size_t BATCH = param.n; | |||||
size_t OC = param.filter_meta.ocpg; | |||||
size_t oc_blocks_per_group = div_ceil(OC, oc_block_size); | |||||
SmallVector<ConvBiasImpl::NCBKern> ret_kern; | |||||
if (!is_enable_filter_preprocess(param)) { | |||||
ret_kern.push_back({kern_packA, {GROUP, oc_blocks_per_group}}); | |||||
} | |||||
ret_kern.push_back({kern_compt, {BATCH, GROUP, oc_blocks_per_group}}); | |||||
return ret_kern; | |||||
} | |||||
SmallVector<ConvBiasImpl::NCBKern> get_kern_preprocess( | |||||
const ConvBiasImpl::NCBKernSizeParam& param, | |||||
WorkspaceBundle& whole_bundle, WorkspaceBundle& matmul_bundle, | |||||
Conv1x1StrategyBase* conv1x1_strategy, | |||||
const MatrixMulImpl::AlgoBase* matmul_algo, size_t oc_block_size) { | |||||
auto kern_packA = | |||||
[whole_bundle, matmul_bundle, param, matmul_algo, oc_block_size, | |||||
conv1x1_strategy]( | |||||
const ConvBiasImpl::NCBKernParam& ncb_param, | |||||
const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { | |||||
conv1x1_strategy->packA(whole_bundle, matmul_bundle, | |||||
oc_block_size, matmul_algo, param, | |||||
ncb_param, std::move(ncb_index)); | |||||
}; | |||||
size_t GROUP = param.filter_meta.group; | |||||
size_t OC = param.filter_meta.ocpg; | |||||
size_t oc_blocks_per_group = div_ceil(OC, oc_block_size); | |||||
SmallVector<ConvBiasImpl::NCBKern> ret_kern; | |||||
ret_kern.push_back({kern_packA, {GROUP, oc_blocks_per_group}}); | |||||
return ret_kern; | |||||
} | |||||
}; | }; | ||||
template<> | template<> | ||||
@@ -69,14 +227,47 @@ public: | |||||
const MatrixMulImpl::AlgoBase* matmul_algo, | const MatrixMulImpl::AlgoBase* matmul_algo, | ||||
size_t oc_tile_size) { | size_t oc_tile_size) { | ||||
size_t matmul_size = matmul_algo->get_workspace(matmul_param); | size_t matmul_size = matmul_algo->get_workspace(matmul_param); | ||||
auto thread_bundle = utils::get_thread_bundle(param, matmul_size, oc_tile_size); | |||||
auto thread_bundle = | |||||
utils::get_thread_bundle(param, matmul_size, oc_tile_size); | |||||
//! size per thread | //! size per thread | ||||
size_t all_threads_bytes = | size_t all_threads_bytes = | ||||
thread_bundle.total_size_in_bytes() * param.nr_threads; | thread_bundle.total_size_in_bytes() * param.nr_threads; | ||||
return WorkspaceBundle{nullptr, {0, 0, all_threads_bytes}}; | return WorkspaceBundle{nullptr, {0, 0, all_threads_bytes}}; | ||||
} | } | ||||
//! Assemble the kernels for the no-pack case: only the compute kernel over
//! {BATCH, GROUP, oc_blocks_per_group} is returned. It captures the bundles,
//! \p param and \p oc_block_size by value and forwards them to
//! conv1x1_strategy->exec.
SmallVector<ConvBiasImpl::NCBKern> get_kern(
        const ConvBiasImpl::NCBKernSizeParam& param,
        WorkspaceBundle& whole_bundle, WorkspaceBundle& matmul_bundle,
        WorkspaceBundle& thread_bundle,
        Conv1x1StrategyBase* conv1x1_strategy,
        const MatrixMulImpl::AlgoBase* matmul_algo, size_t oc_block_size) {
    //! iteration range of the compute kernel
    size_t group_count = param.filter_meta.group;
    size_t batch_count = param.n;
    size_t oc_per_group = param.filter_meta.ocpg;
    size_t oc_block_count = div_ceil(oc_per_group, oc_block_size);

    auto compute_kern =
            [conv1x1_strategy, matmul_algo, param, oc_block_size,
             whole_bundle, matmul_bundle, thread_bundle](
                    const ConvBiasImpl::NCBKernParam& kern_param,
                    const ConvBiasImpl::NCBKernIndex& kern_index) mutable {
                conv1x1_strategy->exec(whole_bundle, matmul_bundle,
                                       thread_bundle, oc_block_size,
                                       matmul_algo, param, kern_param,
                                       std::move(kern_index));
            };

    SmallVector<ConvBiasImpl::NCBKern> kerns;
    kerns.push_back(
            {compute_kern, {batch_count, group_count, oc_block_count}});
    return kerns;
}
//! This specialization has no preprocess kernels: all arguments are ignored
//! and an empty kernel list is returned.
SmallVector<ConvBiasImpl::NCBKern> get_kern_preprocess(
        const ConvBiasImpl::NCBKernSizeParam&, WorkspaceBundle&,
        WorkspaceBundle&, Conv1x1StrategyBase*,
        const MatrixMulImpl::AlgoBase*, size_t) {
    SmallVector<ConvBiasImpl::NCBKern> no_kerns;
    return no_kerns;
}
}; | }; | ||||
} // namespace conv1x1 | } // namespace conv1x1 | ||||
} // namespace fallback | } // namespace fallback | ||||
} // namespace megdnn | } // namespace megdnn | ||||
// vim: syntax=cpp.doxygen |
@@ -59,7 +59,8 @@ public: | |||||
template <typename src_ctype, typename bias_ctype, typename dst_ctype, | template <typename src_ctype, typename bias_ctype, typename dst_ctype, | ||||
typename op_ctype, typename op_dtype, | typename op_ctype, typename op_dtype, | ||||
megdnn::PostprocessMode postprocess_mode, MatrixMulImpl::AlgoBase::PackMode pack_mode> | |||||
megdnn::PostprocessMode postprocess_mode, | |||||
MatrixMulImpl::AlgoBase::PackMode pack_mode> | |||||
class Conv1x1Strategy : public Conv1x1StrategyBase { | class Conv1x1Strategy : public Conv1x1StrategyBase { | ||||
public: | public: | ||||
explicit Conv1x1Strategy(size_t pack_size = 1) : m_pack_size(pack_size) {} | explicit Conv1x1Strategy(size_t pack_size = 1) : m_pack_size(pack_size) {} | ||||
@@ -136,32 +137,30 @@ public: | |||||
size_t packb_bytes_per_group = matmul_bundle.get_size(1); | size_t packb_bytes_per_group = matmul_bundle.get_size(1); | ||||
size_t GROUP = param.filter_meta.group; | size_t GROUP = param.filter_meta.group; | ||||
size_t BATCH = param.n; | |||||
size_t SH = param.filter_meta.stride[0]; | size_t SH = param.filter_meta.stride[0]; | ||||
size_t SW = param.filter_meta.stride[1]; | size_t SW = param.filter_meta.stride[1]; | ||||
size_t OH = param.osz[0]; | size_t OH = param.osz[0]; | ||||
size_t OW = param.osz[1]; | size_t OW = param.osz[1]; | ||||
size_t OC = param.filter_meta.ocpg; | size_t OC = param.filter_meta.ocpg; | ||||
size_t batch = ncb_index.ndrange_id[0]; | |||||
MatrixMulImpl::KernParam matmul_kern_param; | MatrixMulImpl::KernParam matmul_kern_param; | ||||
static_cast<MatrixMulImpl::KernSizeParam&>(matmul_kern_param) = | static_cast<MatrixMulImpl::KernSizeParam&>(matmul_kern_param) = | ||||
utils::get_matmul_kern_param(param, OH * OW, OC); | utils::get_matmul_kern_param(param, OH * OW, OC); | ||||
rep(batch, BATCH) { | |||||
rep(g, GROUP) { | |||||
if (SH == 2 && SW == 2) | |||||
megdnn_throw("no support for stride = 2"); | |||||
size_t bytes_offset_of_b_panel = | |||||
batch * packb_bytes_per_group * GROUP + | |||||
g * packb_bytes_per_group; | |||||
src_ctype* b_panel = reinterpret_cast<src_ctype*>( | |||||
reinterpret_cast<int8_t*>(whole_bundle.get(1)) + | |||||
bytes_offset_of_b_panel); | |||||
matmul_kern_param.B_ptr = const_cast<src_ctype*>( | |||||
ncb_param.src<src_ctype>(batch, g)); | |||||
matmul_algo->pack_B(matmul_kern_param, b_panel, 0, OH * OW); | |||||
} | |||||
rep(g, GROUP) { | |||||
if (SH == 2 && SW == 2) | |||||
megdnn_throw("no support for stride = 2"); | |||||
size_t bytes_offset_of_b_panel = | |||||
batch * packb_bytes_per_group * GROUP + | |||||
g * packb_bytes_per_group; | |||||
src_ctype* b_panel = reinterpret_cast<src_ctype*>( | |||||
reinterpret_cast<int8_t*>(whole_bundle.get(1)) + | |||||
bytes_offset_of_b_panel); | |||||
matmul_kern_param.B_ptr = const_cast<src_ctype*>( | |||||
ncb_param.src<src_ctype>(batch, g)); | |||||
matmul_algo->pack_B(matmul_kern_param, b_panel, 0, OH * OW); | |||||
} | } | ||||
} else { | } else { | ||||
megdnn_log_error("OnlyPackA mode and NoPack mode has no packB kernel"); | megdnn_log_error("OnlyPackA mode and NoPack mode has no packB kernel"); | ||||