GitOrigin-RevId: 90aa75d51e
tags/v1.0.0-rc1
@@ -612,7 +612,7 @@ bool PoolingImpl::AlgoFilter3ModexStridexNCHW44::usable(
            (param.mode == Mode::MAX || param.mode == Mode::AVERAGE) &&
            FH == 3 && FW == 3 && SW == SH && (SH == 1 || SW == 2);
    //! Int8 does not support AVERAGE, because its round mode is different from
    //! quint8
    //! qint8
    avaible &= !(param.src_type.enumv() == DTypeEnum::Int8 &&
                 param.mode == Mode::AVERAGE);
    return avaible;
@@ -705,7 +705,7 @@ bool PoolingImpl::AlgoFilter2ModexStridexNCHW44::usable(
            (param.mode == Mode::MAX || param.mode == Mode::AVERAGE) &&
            FH == 2 && FW == 2 && SH == SW && (SW == 1 || SW == 2);
    //! Int8 does not support AVERAGE, because its round mode is different from
    //! quint8
    //! qint8
    avaible &= !(param.src_type.enumv() == DTypeEnum::Int8 &&
                 param.mode == Mode::AVERAGE);
    return avaible;
@@ -799,7 +799,7 @@ bool PoolingImpl::AlgoFilter4ModexStridexNCHW44::usable(
            FH == 4 && FW == 4 && SH == SW && (SW == 1 || SW == 2);
    //! Int8 does not support AVERAGE, because its round mode is different from
    //! quint8
    //! qint8
    avaible &= !(param.src_type.enumv() == DTypeEnum::Int8 &&
                 param.mode == Mode::AVERAGE);
    return avaible;
@@ -892,7 +892,7 @@ bool PoolingImpl::AlgoFilter5ModexStridexNCHW44::usable(
            (param.mode == Mode::MAX || param.mode == Mode::AVERAGE) &&
            FH == 5 && FW == 5 && SH == SW && (SW == 1 || SW == 2);
    //! Int8 does not support AVERAGE, because its round mode is different from
    //! quint8
    //! qint8
    avaible &= !(param.src_type.enumv() == DTypeEnum::Int8 &&
                 param.mode == Mode::AVERAGE);
    return avaible;
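// Why the guard above exists, as a minimal standalone sketch (illustrative,
// not the actual pooling kernels): a plain Int8 average truncates toward zero
// via C++ integer division, while a quantized qint8 average typically rounds
// half away from zero, so the two paths can disagree by one.
#include <cstdio>

static int avg_int8_truncate(int a, int b) {
    return (a + b) / 2;  // integer division truncates toward zero
}

static int avg_qint8_round(int a, int b) {
    int s = a + b;
    // round half away from zero (assumed rounding mode for the quantized path)
    return s >= 0 ? (s + 1) / 2 : (s - 1) / 2;
}

int main() {
    // sum -7: truncation gives -3, round-half-away-from-zero gives -4
    std::printf("%d vs %d\n", avg_int8_truncate(-3, -4), avg_qint8_round(-3, -4));
    return 0;
}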
@@ -47,7 +47,7 @@ size_t ConvBiasImpl::AlgoConv1x1::get_oc_tile_size_heuristic(
    return round_up<size_t>(oc_block_size_one_thread, 24);
}
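// get_oc_tile_size_heuristic rounds the per-thread OC block up to a multiple
// of 24. A sketch of the helpers it relies on, assuming the usual definitions
// (the real round_up / div_ceil live elsewhere in the codebase):
template <typename T>
T sketch_div_ceil(T x, T d) {
    return (x + d - 1) / d;  // smallest q with q * d >= x
}
template <typename T>
T sketch_round_up(T x, T m) {
    return sketch_div_ceil(x, m) * m;  // next multiple of m, e.g. 50 -> 72 for m = 24
}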
size_t ConvBiasImpl::AlgoConv1x1::get_workspace(
WorkspaceBundle ConvBiasImpl::AlgoConv1x1::get_bundle_according_packmode(
        const NCBKernSizeParam& param) const {
    size_t OH = param.osz[0];
    size_t OW = param.osz[1];
@@ -58,168 +58,195 @@ size_t ConvBiasImpl::AlgoConv1x1::get_workspace(
    auto pack_mode = m_matmul_algo->packmode();
    if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT) {
        MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 0, 0) {
            Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::DEFAULT> dispatcher;
            return dispatcher
        MIDOUT_BEGIN(megdnn_fallback_conv1x1,
                     midout_iv("get_bundle_default"_hash)) {
            return Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::DEFAULT>()
                    .get_bundle(param, matmul_param, m_matmul_algo,
                                compt_oc_block_size)
                    .total_size_in_bytes();
                                compt_oc_block_size);
        }
        MIDOUT_END();
    } else if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) {
        MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 0, 1) {
            Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA>
                    dispatcher;
            return dispatcher
        MIDOUT_BEGIN(megdnn_fallback_conv1x1,
                     midout_iv("get_bundle_only_packa"_hash)) {
            return Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA>()
                    .get_bundle(param, matmul_param, m_matmul_algo,
                                compt_oc_block_size)
                    .total_size_in_bytes();
                                compt_oc_block_size);
        }
        MIDOUT_END();
    } else {
        MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 0, 2) {
            Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::NO_PACK> dispatcher;
            return dispatcher
        MIDOUT_BEGIN(megdnn_fallback_conv1x1,
                     midout_iv("get_bundle_no_pack"_hash)) {
            return Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::NO_PACK>()
                    .get_bundle(param, matmul_param, m_matmul_algo,
                                compt_oc_block_size)
                    .total_size_in_bytes();
                                compt_oc_block_size);
        }
        MIDOUT_END();
    }
    return 0;
    return {nullptr, {}};
}

SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoConv1x1::dispatch_kerns(
size_t ConvBiasImpl::AlgoConv1x1::get_workspace(
        const NCBKernSizeParam& param) const {
    SmallVector<ConvBiasImpl::NCBKern> ret_kern;
    return get_bundle_according_packmode(param).total_size_in_bytes();
}
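// get_workspace now just sums the bundle returned by
// get_bundle_according_packmode. A reduced model of the WorkspaceBundle idea
// (class name and the 16-byte alignment are assumptions of this sketch): one
// raw allocation split into sub-buffers -- packa, packb, per-thread scratch.
#include <cstddef>
#include <cstdint>
#include <vector>

class SketchBundle {
    void* m_base;
    std::vector<size_t> m_sizes;
    static size_t align16(size_t x) { return (x + 15) & ~size_t(15); }

public:
    SketchBundle(void* base, std::vector<size_t> sizes)
            : m_base(base), m_sizes(std::move(sizes)) {}

    //! total bytes to request from the framework
    size_t total_size_in_bytes() const {
        size_t total = 0;
        for (size_t s : m_sizes)
            total += align16(s);
        return total;
    }

    //! start of the idx-th sub-buffer inside the single allocation
    void* get(size_t idx) const {
        size_t offset = 0;
        for (size_t i = 0; i < idx; ++i)
            offset += align16(m_sizes[i]);
        return static_cast<uint8_t*>(m_base) + offset;
    }
};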
SmallVector<ConvBiasImpl::NCBKern>
ConvBiasImpl::AlgoConv1x1::get_kerns_according_packmode(
        const NCBKernSizeParam& param, bool weight_preprocess) const {
    size_t OH = param.osz[0];
    size_t OW = param.osz[1];
    size_t OC = param.filter_meta.ocpg;
    size_t compt_oc_block_size = get_oc_tile_size_heuristic(param);
    size_t GROUP = param.filter_meta.group;
    size_t BATCH = param.n;
    size_t oc_blocks_per_group = div_ceil(OC, compt_oc_block_size);
    auto pack_mode = m_matmul_algo->packmode();
    Conv1x1StrategyBase* conv1x1_strategy =
            Conv1x1Factory::make_conv1x1_strategy(param, pack_mode,
                                                  param.filter_meta.format);
    auto matmul_param =
            utils::get_matmul_kern_param(param, OH * OW, compt_oc_block_size);
    WorkspaceBundle whole_bundle = {nullptr, {}};
    WorkspaceBundle thread_bundle = {nullptr, {}};
    WorkspaceBundle matmul_bundle = {nullptr, {}};
    auto pack_mode = m_matmul_algo->packmode();
    WorkspaceBundle whole_bundle = get_bundle_according_packmode(param);
    //! NO_PACK does not implement get_bundle
    WorkspaceBundle matmul_bundle = {nullptr, {}};
    if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::NO_PACK) {
        matmul_bundle = {nullptr,
                         {0, 0, m_matmul_algo->get_workspace(matmul_param)}};
    } else {
        matmul_bundle = m_matmul_algo->get_bundle(matmul_param);
    }
    WorkspaceBundle thread_bundle = utils::get_thread_bundle(
            param, matmul_bundle.get_size(2), compt_oc_block_size);
    if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT) {
        MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 1, 0) {
            Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::DEFAULT> dispatcher;
            whole_bundle = dispatcher.get_bundle(
                    param, matmul_param, m_matmul_algo, compt_oc_block_size);
            matmul_bundle = m_matmul_algo->get_bundle(matmul_param);
        MIDOUT_BEGIN(megdnn_fallback_conv1x1,
                     midout_iv("get_kern_default"_hash)) {
            if (!weight_preprocess) {
                return Conv1x1Kerns<
                               MatrixMulImpl::AlgoBase::PackMode::DEFAULT>()
                        .get_kern(param, whole_bundle, matmul_bundle,
                                  thread_bundle, conv1x1_strategy,
                                  m_matmul_algo, compt_oc_block_size);
            } else {
                return Conv1x1Kerns<
                               MatrixMulImpl::AlgoBase::PackMode::DEFAULT>()
                        .get_kern_preprocess(param, whole_bundle, matmul_bundle,
                                             conv1x1_strategy, m_matmul_algo,
                                             compt_oc_block_size);
            }
        }
        MIDOUT_END();
    } else if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) {
        MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 1, 1) {
            Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA>
                    dispatcher;
            whole_bundle = dispatcher.get_bundle(
                    param, matmul_param, m_matmul_algo, compt_oc_block_size);
            matmul_bundle = m_matmul_algo->get_bundle(matmul_param);
        MIDOUT_BEGIN(megdnn_fallback_conv1x1,
                     midout_iv("get_kern_only_packa"_hash)) {
            if (!weight_preprocess) {
                return Conv1x1Kerns<
                               MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA>()
                        .get_kern(param, whole_bundle, matmul_bundle,
                                  thread_bundle, conv1x1_strategy,
                                  m_matmul_algo, compt_oc_block_size);
            } else {
                return Conv1x1Kerns<
                               MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA>()
                        .get_kern_preprocess(param, whole_bundle, matmul_bundle,
                                             conv1x1_strategy, m_matmul_algo,
                                             compt_oc_block_size);
            }
        }
        MIDOUT_END();
    } else {
        MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 1, 2) {
            Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::NO_PACK> dispatcher;
            whole_bundle = dispatcher.get_bundle(
                    param, matmul_param, m_matmul_algo, compt_oc_block_size);
            matmul_bundle = {
                    nullptr,
                    {0, 0, m_matmul_algo->get_workspace(matmul_param)}};
        MIDOUT_BEGIN(megdnn_fallback_conv1x1,
                     midout_iv("get_kern_no_pack"_hash)) {
            if (!weight_preprocess) {
                return Conv1x1Kerns<
                               MatrixMulImpl::AlgoBase::PackMode::NO_PACK>()
                        .get_kern(param, whole_bundle, matmul_bundle,
                                  thread_bundle, conv1x1_strategy,
                                  m_matmul_algo, compt_oc_block_size);
            } else {
                return Conv1x1Kerns<
                               MatrixMulImpl::AlgoBase::PackMode::NO_PACK>()
                        .get_kern_preprocess(param, whole_bundle, matmul_bundle,
                                             conv1x1_strategy, m_matmul_algo,
                                             compt_oc_block_size);
            }
        }
        MIDOUT_END();
    }
}
    //! get thread bundle
    thread_bundle = utils::get_thread_bundle(param, matmul_bundle.get_size(2),
                                             compt_oc_block_size);
    Conv1x1StrategyBase* conv1x1_strategy =
            Conv1x1Factory::make_conv1x1_strategy(param, pack_mode,
                                                  param.filter_meta.format);

SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoConv1x1::dispatch_kerns(
        const NCBKernSizeParam& param) const {
    return get_kerns_according_packmode(param, false);
}
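// The refactor above funnels all three pack modes through one entry point
// that picks a Conv1x1Kerns<PackMode> specialization. The pattern, reduced to
// a standalone sketch with placeholder workspace counts:
#include <cstddef>

namespace sketch {

enum class PackMode { DEFAULT, ONLY_PACKA, NO_PACK };

template <PackMode mode>
struct Kerns;  // primary template: declared only, one specialization per mode

template <>
struct Kerns<PackMode::DEFAULT> {
    size_t workspace_parts() const { return 3; }  // packa + packb + threads
};
template <>
struct Kerns<PackMode::ONLY_PACKA> {
    size_t workspace_parts() const { return 2; }  // packa + threads
};
template <>
struct Kerns<PackMode::NO_PACK> {
    size_t workspace_parts() const { return 1; }  // threads only
};

// runtime pack mode -> compile-time specialization, mirroring what
// get_bundle_according_packmode / get_kerns_according_packmode do
inline size_t parts_according_packmode(PackMode mode) {
    switch (mode) {
        case PackMode::DEFAULT:
            return Kerns<PackMode::DEFAULT>().workspace_parts();
        case PackMode::ONLY_PACKA:
            return Kerns<PackMode::ONLY_PACKA>().workspace_parts();
        default:
            return Kerns<PackMode::NO_PACK>().workspace_parts();
    }
}

}  // namespace sketch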
    auto kern_packA = [this, whole_bundle, matmul_bundle, param,
                       compt_oc_block_size, conv1x1_strategy](
                              const NCBKernParam& ncb_param,
                              const NCBKernIndex& ncb_index) mutable {
        conv1x1_strategy->packA(whole_bundle, matmul_bundle,
                                compt_oc_block_size, this->m_matmul_algo, param,
                                ncb_param, std::move(ncb_index));
    };
    auto kern_packB = [this, whole_bundle, matmul_bundle, param,
                       conv1x1_strategy](
                              const NCBKernParam& ncb_param,
                              const NCBKernIndex& ncb_index) mutable {
        conv1x1_strategy->packB(whole_bundle, matmul_bundle,
                                this->m_matmul_algo, param, ncb_param,
                                std::move(ncb_index));
    };
    auto kern_compt = [this, whole_bundle, matmul_bundle, thread_bundle, param,
                       compt_oc_block_size, conv1x1_strategy](
                              const NCBKernParam& ncb_param,
                              const NCBKernIndex& ncb_index) mutable {
        conv1x1_strategy->exec(whole_bundle, matmul_bundle, thread_bundle,
                               compt_oc_block_size, this->m_matmul_algo, param,
                               ncb_param, std::move(ncb_index));
    };

SmallVector<TensorLayout>
ConvBiasImpl::AlgoConv1x1::deduce_preprocessed_filter_layout(
        const NCBKernSizeParam& param) const {
    MIDOUT_BEGIN(megdnn_fallback_conv1x1,
                 midout_iv("deduce_preprocessed_filter_layout"_hash)) {
        WorkspaceBundle wb = get_bundle_according_packmode(param);
    if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT ||
        pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) {
        //! if filter preprocess is enabled, kern_packA should not be
        //! dispatched
        if (!is_enable_filter_preprocess(param)) {
            ret_kern.push_back({kern_packA, {GROUP, oc_blocks_per_group}});
        }
        if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT) {
            ret_kern.push_back({kern_packB, {1}});
        }
        size_t GROUP = param.filter_meta.group;
        SmallVector<TensorLayout> preprocessed_layouts;
        preprocessed_layouts.push_back(
                {{GROUP, wb.get_size(0)}, dtype::Int8()});
        return preprocessed_layouts;
    }
    ret_kern.push_back({kern_compt, {BATCH, GROUP, oc_blocks_per_group}});
    return ret_kern;
    MIDOUT_END();
    return {};
}
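// deduce_preprocessed_filter_layout reports the packed weights as an Int8
// tensor of shape {GROUP, packa_bytes}: the pack buffer is opaque, so its
// byte count is simply exposed through a 1-byte dtype. Sketch with a
// hypothetical stand-in for TensorLayout:
#include <cstddef>
#include <vector>

struct SketchLayout {
    std::vector<size_t> shape;
    size_t dtype_bytes;  // Int8 -> 1 byte per element
};

inline SketchLayout sketch_preprocessed_filter_layout(
        size_t group, size_t packa_bytes_per_group) {
    // shape {GROUP, bytes} with a 1-byte dtype spans exactly the pack storage
    return {{group, packa_bytes_per_group}, 1};
}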
SmallVector<ConvBiasImpl::NCBKern>
ConvBiasImpl::AlgoConv1x1::dispatch_preprocess_kerns(
        const NCBKernSizeParam& param) const {
    return get_kerns_according_packmode(param, true);
}

bool ConvBiasImpl::AlgoConv1x1::usable(const NCBKernSizeParam& param,
                                       AlgoSelectionStrategy) const {
    MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 2) {
        if (param.filter_meta.format != param::ConvBias::Format::NCHW &&
            param.filter_meta.format != param::ConvBias::Format::NCHW44 &&
            param.filter_meta.format != param::ConvBias::Format::NCHW44_DOT)
            return false;
        size_t FH = param.filter_meta.spatial[0],
               FW = param.filter_meta.spatial[1];
        size_t PH = param.filter_meta.padding[0],
               PW = param.filter_meta.padding[1];
        size_t SH = param.filter_meta.stride[0],
               SW = param.filter_meta.stride[1];
        if (FH != 1 || FW != 1 || PH || PW || SH != 1 || SW != 1)
        auto format = param.filter_meta.format;
        size_t OH = param.osz[0];
        size_t OW = param.osz[1];
#if MEGDNN_AARCH64 || MEGDNN_ARMV7
        if (format != param::ConvBias::Format::NCHW &&
            format != param::ConvBias::Format::NCHW44 &&
            format != param::ConvBias::Format::NCHW44_DOT) {
            return false;
        if (param.src_type.enumv() != param.filter_type.enumv()) {
        }
        //! hybrid mode is not supported
        if (param.filter_meta.format == param::ConvBias::Format::NCHW44 ||
            param.filter_meta.format == param::ConvBias::Format::NCHW44_DOT) {
            if (param.filter_meta.icpg < 4_z || param.filter_meta.icpg == 1 ||
                param.filter_meta.ocpg == 1) {
                return false;
            }
        }
#else
        if (format != param::ConvBias::Format::NCHW) {
            return false;
        }
        //! only matmul whose packmode is PACKA or DEFAULT supports weight
        //! preprocess
        if (is_enable_filter_preprocess(param) &&
            (m_matmul_algo->packmode() ==
             fallback::MatrixMulImpl::AlgoBase::PackMode::NO_PACK)) {
#endif
        //! param
        if (FH != 1 || FW != 1 || PH || PW || SH != 1 || SW != 1) {
            return false;
        }
        if (param.src_type.enumv() != DTypeEnum::Int8 &&
            param.src_type.enumv() != DTypeEnum::QuantizedS8 &&
            param.src_type.enumv() != DTypeEnum::Quantized8Asymm &&
        //! data type
        if (param.src_type.enumv() != param.filter_type.enumv() ||
            (param.src_type.enumv() != DTypeEnum::Int8 &&
             param.src_type.enumv() != DTypeEnum::QuantizedS8 &&
             param.src_type.enumv() != DTypeEnum::Quantized8Asymm &&
#if !MEGDNN_DISABLE_FLOAT16
             param.src_type.enumv() != DTypeEnum::Float16 &&
             param.src_type.enumv() != DTypeEnum::Float16 &&
#endif
             param.src_type.enumv() != DTypeEnum::Float32) {
             param.src_type.enumv() != DTypeEnum::Float32)) {
            return false;
        }
        //! make sure that for 8x8x16 and 8x8x32, bias mode is NO_BIAS and
        //! nonlineMode is IDENTITY; otherwise return false, which means that
        //! 8x8x32 and 8x8x16 do not support PostProcess
@@ -231,27 +258,13 @@ bool ConvBiasImpl::AlgoConv1x1::usable(const NCBKernSizeParam& param,
                return false;
            }
        }
        if (param.filter_meta.format == param::ConvBias::Format::NCHW44 ||
            param.filter_meta.format == param::ConvBias::Format::NCHW44_DOT) {
            if (param.filter_meta.icpg < 4_z || param.filter_meta.icpg == 1 ||
                param.filter_meta.ocpg == 1) {
                return false;
            }
        }
        size_t OH = param.osz[0];
        size_t OW = param.osz[1];
        MatrixMulImpl::KernSizeParam matmul_param =
                utils::get_matmul_kern_param(param, OH * OW,
                                             get_oc_tile_size_heuristic(param));
        bool matmul_usable = m_matmul_algo->usable(matmul_param);
        auto pack_mode = m_matmul_algo->packmode();
        bool strategy_usable = Conv1x1Factory::can_make_conv1x1_strategy(
                param, pack_mode, param.filter_meta.format);
        return matmul_usable && strategy_usable &&
               (param.filter_meta.dilation[0] ==
                        param.filter_meta.dilation[1] &&
@@ -262,121 +275,6 @@ bool ConvBiasImpl::AlgoConv1x1::usable(const NCBKernSizeParam& param,
    return false;
}
SmallVector<TensorLayout>
ConvBiasImpl::AlgoConv1x1::deduce_preprocessed_filter_layout(
        const NCBKernSizeParam& param) const {
    MIDOUT_BEGIN(
            megdnn_fallback_conv1x1,
            midout_iv(
                    "ConvBiasImpl::AlgoConv1x1::deduce_preprocessed_filter_layout"_hash)) {
        fallback::MatrixMulImpl::AlgoBase::MatmulDescription matmul_desc =
                m_matmul_algo->matmul_description();
        bool default_pack = matmul_desc.packmode ==
                            MatrixMulImpl::AlgoBase::PackMode::DEFAULT;
        bool only_packA = matmul_desc.packmode ==
                          MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA;
        //! only support default_pack and only_packa mode
        if (matmul_desc.packmode ==
            MatrixMulImpl::AlgoBase::PackMode::NO_PACK) {
            return {};
        }
        size_t OH = param.osz[0];
        size_t OW = param.osz[1];
        size_t compt_oc_block_size = get_oc_tile_size_heuristic(param);
        auto matmul_param = utils::get_matmul_kern_param(param, OH * OW,
                                                         compt_oc_block_size);
        WorkspaceBundle wb(nullptr, {});
        if (default_pack) {
            Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::DEFAULT> dispatcher;
            wb = dispatcher.get_bundle(param, matmul_param, m_matmul_algo,
                                       compt_oc_block_size);
        } else if (only_packA) {
            Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA>
                    dispatcher;
            wb = dispatcher.get_bundle(param, matmul_param, m_matmul_algo,
                                       compt_oc_block_size);
        }
        size_t GROUP = param.filter_meta.group;
        SmallVector<TensorLayout> preprocessed_layouts;
        preprocessed_layouts.push_back(
                {{GROUP, wb.get_size(0)}, dtype::Int8()});
        return preprocessed_layouts;
    }
    MIDOUT_END();
    return {};
}

SmallVector<ConvBiasImpl::NCBKern>
ConvBiasImpl::AlgoConv1x1::dispatch_preprocess_kerns(
        const NCBKernSizeParam& param) const {
    MIDOUT_BEGIN(
            megdnn_fallback_conv1x1,
            midout_iv(
                    "ConvBiasImpl::AlgoConv1x1::dispatch_preprocess_kerns"_hash)) {
        SmallVector<ConvBiasImpl::NCBKern> ret_kern;
        size_t OH = param.osz[0];
        size_t OW = param.osz[1];
        size_t OC = param.filter_meta.ocpg;
        size_t compt_oc_block_size = get_oc_tile_size_heuristic(param);
        size_t GROUP = param.filter_meta.group;
        size_t oc_blocks_per_group = div_ceil(OC, compt_oc_block_size);
        auto matmul_param = utils::get_matmul_kern_param(param, OH * OW,
                                                         compt_oc_block_size);
        WorkspaceBundle whole_bundle = {nullptr, {}};
        WorkspaceBundle matmul_bundle = {nullptr, {}};
        auto pack_mode = m_matmul_algo->packmode();
        if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT) {
            MIDOUT_BEGIN(megdnn_fallback_conv1x1,
                         midout_iv("get_defaul_matmul_packmode_bundle"_hash)) {
                Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::DEFAULT>
                        dispatcher;
                whole_bundle = dispatcher.get_bundle(param, matmul_param,
                                                     m_matmul_algo,
                                                     compt_oc_block_size);
                matmul_bundle = m_matmul_algo->get_bundle(matmul_param);
            }
            MIDOUT_END();
        } else if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) {
            MIDOUT_BEGIN(
                    megdnn_fallback_conv1x1,
                    midout_iv("get_onlypacka_matmul_packmode_bundle"_hash)) {
                Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA>
                        dispatcher;
                whole_bundle = dispatcher.get_bundle(param, matmul_param,
                                                     m_matmul_algo,
                                                     compt_oc_block_size);
                matmul_bundle = m_matmul_algo->get_bundle(matmul_param);
            }
            MIDOUT_END();
        } else {
            //! if NO_PACK, return an empty kern list so that
            //! OprWeightPreprocessProxy can run in no-pack mode
            return {};
        }
        Conv1x1StrategyBase* conv1x1_strategy =
                Conv1x1Factory::make_conv1x1_strategy(param, pack_mode,
                                                      param.filter_meta.format);
        auto kern_packA = [this, whole_bundle, matmul_bundle, param,
                           compt_oc_block_size, conv1x1_strategy](
                                  const NCBKernParam& ncb_param,
                                  const NCBKernIndex& ncb_index) mutable {
            conv1x1_strategy->packA(whole_bundle, matmul_bundle,
                                    compt_oc_block_size, this->m_matmul_algo,
                                    param, ncb_param, std::move(ncb_index));
        };
        ret_kern.push_back({kern_packA, {GROUP, oc_blocks_per_group}});
        return ret_kern;
    }
    MIDOUT_END();
    return {};
}
bool ConvBiasImpl::AlgoConv1x1::is_preferred(
        const NCBKernSizeParam& param) const {
@@ -20,6 +20,11 @@ namespace megdnn {
namespace fallback {

class ConvBiasImpl::AlgoConv1x1 final : public AlgoBase {
    WorkspaceBundle get_bundle_according_packmode(
            const NCBKernSizeParam& param) const;
    SmallVector<NCBKern> get_kerns_according_packmode(
            const NCBKernSizeParam& param, bool weight_preprocess) const;

public:
    AlgoConv1x1(MatrixMulImpl::AlgoBase* matmul_algo, size_t oc_block_size)
            : m_matmul_algo(matmul_algo), m_oc_block_size(oc_block_size) {}
@@ -41,7 +46,7 @@ public:
            const NCBKernSizeParam& param) const override;
    bool is_preferred(const NCBKernSizeParam&) const override;
    SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
            const NCBKernSizeParam& param) const override;
    size_t get_preprocess_workspace(
@@ -360,23 +360,23 @@ ConvBiasImpl::AlgoConv1x1Gemv::dispatch_kerns(
                dt_uint8, PostprocessMode::QUANTIZED,
                "NCHW::GEMV::QUINT8x8x32_QUINT8"_hash);
            break;
        //! nchw44 does not support 8x8x16
        case param::ConvBias::Format::NCHW44:
            cb1(param::ConvBias::Format::NCHW44, dt_float32, dt_float32,
                PostprocessMode::FLOAT, "NCHW44::GEMV::FLOAT"_hash);
            cb2(param::ConvBias::Format::NCHW44, dt_int8, dt_int32, dt_int32,
                dt_int8, dt_int32, dt_int32, PostprocessMode::NO_PROCESS,
            cb3(param::ConvBias::Format::NCHW44, dt_int8, dt_int32, dt_int32,
                dt_int8, dt_int32, dt_int32, PostprocessMode::ADD_BIAS,
                "NCHW44::GEMV::INT8x8x32_INT32"_hash);
            cb2(param::ConvBias::Format::NCHW44, dtype::QuantizedS8,
            cb3(param::ConvBias::Format::NCHW44, dtype::QuantizedS8,
                dtype::QuantizedS32, dtype::QuantizedS32, dt_int8, dt_int32,
                dt_int32, PostprocessMode::NO_PROCESS,
                dt_int32, PostprocessMode::ADD_BIAS,
                "NCHW44::GEMV::QINT8x8x32_QINT32"_hash);
            cb2(param::ConvBias::Format::NCHW44, dtype::QuantizedS8,
                dtype::QuantizedS32, dtype::QuantizedS8, dt_int8, dt_int32,
                dt_int8, PostprocessMode::QUANTIZED,
                "NCHW44::GEMV::QINT8x8x32_QINT8"_hash);
            break;
        //! nchw44-dot does not support 8x8x16
        case param::ConvBias::Format::NCHW44_DOT:
            cb3(param::ConvBias::Format::NCHW44_DOT, dt_int8, dt_int32,
                dt_int32, dt_int8, dt_int32, dt_int32,
@@ -420,81 +420,74 @@ bool ConvBiasImpl::AlgoConv1x1Gemv::usable(const NCBKernSizeParam& param,
    MIDOUT_BEGIN(megdnn_fallback_conv1x1_gemv,
                 midout_iv("AlgoConv1x1Gemv::usable"_hash)) {
        auto format = param.filter_meta.format;
#if MEGDNN_X86
        if (format != param::ConvBias::Format::NCHW)
            return false;
#elif MEGDNN_AARCH64 || MEGDNN_ARMV7
        if (format != param::ConvBias::Format::NCHW &&
            format != param::ConvBias::Format::NCHW44 &&
            format != param::ConvBias::Format::NCHW44_DOT)
            return false;
#endif
        //! whether 1x1
        size_t FH = param.filter_meta.spatial[0],
               FW = param.filter_meta.spatial[1];
        size_t PH = param.filter_meta.padding[0],
               PW = param.filter_meta.padding[1];
        size_t SH = param.filter_meta.stride[0],
               SW = param.filter_meta.stride[1];
        if (FH != 1 || FW != 1 || PH || PW || SH != 1 || SW != 1) {
            return false;
        }
        //! whether gemv
        size_t OH = param.osz[0];
        size_t OW = param.osz[1];
        if (OH * OW != 1) {
        //! whether gemv and 1x1
        if (OH * OW != 1 || FH != 1 || FW != 1 || PH || PW || SH != 1 ||
            SW != 1) {
            return false;
        }
        //! int16 x int16 -> int32 has no gemv support at all, not even naive
        if ((param.src_type.enumv() == param.filter_type.enumv() &&
             param.src_type.enumv() == DTypeEnum::Int16) &&
            param.dst_type.enumv() == DTypeEnum::Int32) {
#if MEGDNN_AARCH64 || MEGDNN_ARMV7
        if (format != param::ConvBias::Format::NCHW &&
            format != param::ConvBias::Format::NCHW44 &&
            format != param::ConvBias::Format::NCHW44_DOT) {
            return false;
        }
        //! make sure that for 8x8x16 and 8x8x32, bias mode is NO_BIAS and
        //! nonlineMode is IDENTITY; otherwise return false, which means that
        //! 8x8x32 and 8x8x16 do not support PostProcess
        if (param.dst_type.enumv() == DTypeEnum::Int16 ||
            param.dst_type.enumv() == DTypeEnum::Int32 ||
            param.dst_type.enumv() == DTypeEnum::QuantizedS32) {
            if (param.nonlineMode != megdnn::NonlineMode::IDENTITY) {
                return false;
            }
        }
        //! only a few dtypes are supported
        if (param.src_type.enumv() != param.filter_type.enumv()) {
#else
        if (format != param::ConvBias::Format::NCHW) {
            return false;
        }
        if (param.src_type.enumv() != DTypeEnum::Int8 &&
            param.src_type.enumv() != DTypeEnum::QuantizedS8 &&
            param.src_type.enumv() != DTypeEnum::Quantized8Asymm &&
#endif
        //! only a few dtypes are supported
        if (param.src_type.enumv() != param.filter_type.enumv() ||
            (param.src_type.enumv() != DTypeEnum::Int8 &&
             param.src_type.enumv() != DTypeEnum::QuantizedS8 &&
             param.src_type.enumv() != DTypeEnum::Quantized8Asymm &&
#if !MEGDNN_DISABLE_FLOAT16
             param.src_type.enumv() != DTypeEnum::Float16 &&
             param.src_type.enumv() != DTypeEnum::Float16 &&
#endif
             param.src_type.enumv() != DTypeEnum::Float32) {
             param.src_type.enumv() != DTypeEnum::Float32)) {
            return false;
        }
#if MEGDNN_AARCH64 || MEGDNN_ARMV7
        if (format == param::ConvBias::Format::NCHW44) {
            if (param.src_type.enumv() != DTypeEnum::Float32 &&
                param.src_type.enumv() != DTypeEnum::Int8 &&
                param.src_type.enumv() != DTypeEnum::QuantizedS8) {
                return false;
            }
            //! 8x8x16 is not supported for nchw44
            if (param.src_type.enumv() == DTypeEnum::Int8 &&
                param.dst_type.enumv() == DTypeEnum::Int16) {
                return false;
            }
        } else if (format == param::ConvBias::Format::NCHW44_DOT) {
            if (param.src_type.enumv() != DTypeEnum::Int8 &&
                param.src_type.enumv() != DTypeEnum::QuantizedS8) {
            if ((param.src_type.enumv() != DTypeEnum::Int8 &&
                 param.src_type.enumv() != DTypeEnum::QuantizedS8) ||
                param.dst_type.enumv() == DTypeEnum::Int16) {
                return false;
            }
        }
#endif
        //! make sure that for 8x8x16 and 8x8x32, nonlineMode is IDENTITY;
        //! otherwise return false
        if (param.dst_type.enumv() == DTypeEnum::Int16 ||
            param.dst_type.enumv() == DTypeEnum::Int32 ||
            param.dst_type.enumv() == DTypeEnum::QuantizedS32) {
            if (param.nonlineMode != megdnn::NonlineMode::IDENTITY) {
                return false;
            }
        }
        //! int16 x int16 -> int32 has no gemv support at all, not even naive
        if ((param.src_type.enumv() == param.filter_type.enumv() &&
             param.src_type.enumv() == DTypeEnum::Int16) &&
            param.dst_type.enumv() == DTypeEnum::Int32) {
            return false;
        }
        return (param.filter_meta.dilation[0] ==
                        param.filter_meta.dilation[1] &&
                param.filter_meta.dilation[0] == 1) &&
@@ -11,14 +11,19 @@
#pragma once

#include "src/fallback/conv_bias/conv1x1/conv1x1_strategy.h"
#include "src/fallback/conv_bias/conv1x1/conv1x1_utils.h"
#include "src/fallback/conv_bias/opr_impl.h"

namespace megdnn {
namespace fallback {
namespace conv1x1 {

template <MatrixMulImpl::AlgoBase::PackMode pack_mode>
class Conv1x1Kerns {
class Conv1x1Kerns;

template <>
class Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::DEFAULT> {
public:
    //! get_bundle
    WorkspaceBundle get_bundle(const ConvBiasImpl::NCBKernSizeParam& param,
@@ -28,13 +33,12 @@ public:
        size_t GROUP = param.filter_meta.group;
        size_t OC = param.filter_meta.ocpg;
        size_t BATCH = param.n;
        //! bundle per thread
        //! matmul_param describes a matmul with M = oc_tile_size, K = IC,
        //! N = OH * OW; this does not affect the packb bytes
        auto matmul_bundle = matmul_algo->get_bundle(matmul_param);
        auto thread_bundle = utils::get_thread_bundle(param, matmul_bundle.get_size(2),
                                                      oc_tile_size);
        auto thread_bundle = utils::get_thread_bundle(
                param, matmul_bundle.get_size(2), oc_tile_size);
        //! size per thread
        size_t all_threads_bytes =
                thread_bundle.total_size_in_bytes() * param.nr_threads;
@@ -46,11 +50,6 @@ public:
                is_enable_filter_preprocess(param)
                        ? 0
                        : packa_bytes_per_oc_tile * oc_tiles_per_group * GROUP;
        if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA)
            return WorkspaceBundle{nullptr,
                                   {all_packa_bytes, 0, all_threads_bytes}};
        //! packb size = N * GROUP * packb_size_per_group
        size_t packb_bytes_per_group = matmul_bundle.get_size(1);
        size_t all_packb_bytes = packb_bytes_per_group * GROUP * BATCH;
@@ -58,6 +57,165 @@ public:
        return WorkspaceBundle{
                nullptr, {all_packa_bytes, all_packb_bytes, all_threads_bytes}};
    }
    SmallVector<ConvBiasImpl::NCBKern> get_kern(
            const ConvBiasImpl::NCBKernSizeParam& param,
            WorkspaceBundle& whole_bundle, WorkspaceBundle& matmul_bundle,
            WorkspaceBundle& thread_bundle,
            Conv1x1StrategyBase* conv1x1_strategy,
            const MatrixMulImpl::AlgoBase* matmul_algo, size_t oc_block_size) {
        auto kern_packA =
                [whole_bundle, matmul_bundle, param, matmul_algo, oc_block_size,
                 conv1x1_strategy](
                        const ConvBiasImpl::NCBKernParam& ncb_param,
                        const ConvBiasImpl::NCBKernIndex& ncb_index) mutable {
                    conv1x1_strategy->packA(whole_bundle, matmul_bundle,
                                            oc_block_size, matmul_algo, param,
                                            ncb_param, std::move(ncb_index));
                };
        auto kern_packB =
                [whole_bundle, matmul_bundle, param, matmul_algo,
                 conv1x1_strategy](
                        const ConvBiasImpl::NCBKernParam& ncb_param,
                        const ConvBiasImpl::NCBKernIndex& ncb_index) mutable {
                    conv1x1_strategy->packB(whole_bundle, matmul_bundle,
                                            matmul_algo, param, ncb_param,
                                            std::move(ncb_index));
                };
        auto kern_compt =
                [whole_bundle, matmul_bundle, thread_bundle, matmul_algo, param,
                 oc_block_size, conv1x1_strategy](
                        const ConvBiasImpl::NCBKernParam& ncb_param,
                        const ConvBiasImpl::NCBKernIndex& ncb_index) mutable {
                    conv1x1_strategy->exec(whole_bundle, matmul_bundle,
                                           thread_bundle, oc_block_size,
                                           matmul_algo, param, ncb_param,
                                           std::move(ncb_index));
                };
        size_t GROUP = param.filter_meta.group;
        size_t BATCH = param.n;
        size_t OC = param.filter_meta.ocpg;
        size_t oc_blocks_per_group = div_ceil(OC, oc_block_size);
        SmallVector<ConvBiasImpl::NCBKern> ret_kern;
        if (!is_enable_filter_preprocess(param)) {
            ret_kern.push_back({kern_packA, {GROUP, oc_blocks_per_group}});
        }
        ret_kern.push_back({kern_packB, {BATCH}});
        ret_kern.push_back({kern_compt, {BATCH, GROUP, oc_blocks_per_group}});
        return ret_kern;
    }
    SmallVector<ConvBiasImpl::NCBKern> get_kern_preprocess(
            const ConvBiasImpl::NCBKernSizeParam& param,
            WorkspaceBundle& whole_bundle, WorkspaceBundle& matmul_bundle,
            Conv1x1StrategyBase* conv1x1_strategy,
            const MatrixMulImpl::AlgoBase* matmul_algo, size_t oc_block_size) {
        auto kern_packA =
                [whole_bundle, matmul_bundle, param, matmul_algo, oc_block_size,
                 conv1x1_strategy](
                        const ConvBiasImpl::NCBKernParam& ncb_param,
                        const ConvBiasImpl::NCBKernIndex& ncb_index) mutable {
                    conv1x1_strategy->packA(whole_bundle, matmul_bundle,
                                            oc_block_size, matmul_algo, param,
                                            ncb_param, std::move(ncb_index));
                };
        size_t GROUP = param.filter_meta.group;
        size_t OC = param.filter_meta.ocpg;
        size_t oc_blocks_per_group = div_ceil(OC, oc_block_size);
        SmallVector<ConvBiasImpl::NCBKern> ret_kern;
        ret_kern.push_back({kern_packA, {GROUP, oc_blocks_per_group}});
        return ret_kern;
    }
};
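// get_kern / get_kern_preprocess above build NCBKern entries from mutable
// lambdas that capture the bundles and params by value, so each dispatched
// kernel owns its inputs. The shape of that pattern, as a standalone sketch
// (types simplified; not the real NCBKern):
#include <cstddef>
#include <functional>
#include <vector>

struct SketchKernIndex {
    std::vector<size_t> ndrange_id;
};
struct SketchKern {
    std::function<void(const SketchKernIndex&)> kern;
    std::vector<size_t> ndrange;  // e.g. {GROUP, oc_blocks_per_group}
};

inline std::vector<SketchKern> sketch_build_kerns(size_t group,
                                                  size_t oc_blocks,
                                                  size_t batch) {
    // capture by value: the copies stay valid after this function returns
    auto kern_packA = [=](const SketchKernIndex&) { /* pack one weight tile */ };
    auto kern_compt = [=](const SketchKernIndex&) { /* matmul + postprocess */ };
    std::vector<SketchKern> ret;
    ret.push_back({kern_packA, {group, oc_blocks}});
    ret.push_back({kern_compt, {batch, group, oc_blocks}});
    return ret;
}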
template <>
class Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA> {
public:
    //! get_bundle
    WorkspaceBundle get_bundle(const ConvBiasImpl::NCBKernSizeParam& param,
                               const MatrixMulImpl::KernSizeParam& matmul_param,
                               const MatrixMulImpl::AlgoBase* matmul_algo,
                               size_t oc_tile_size) {
        size_t GROUP = param.filter_meta.group;
        size_t OC = param.filter_meta.ocpg;
        //! bundle per thread
        //! matmul_param describes a matmul with M = oc_tile_size, K = IC,
        //! N = OH * OW; this does not affect the packb bytes
        auto matmul_bundle = matmul_algo->get_bundle(matmul_param);
        auto thread_bundle = utils::get_thread_bundle(
                param, matmul_bundle.get_size(2), oc_tile_size);
        //! size per thread
        size_t all_threads_bytes =
                thread_bundle.total_size_in_bytes() * param.nr_threads;
        //! packa size = GROUP * packa_size_each_group
        size_t packa_bytes_per_oc_tile = matmul_bundle.get_size(0);
        size_t oc_tiles_per_group = div_ceil(OC, oc_tile_size);
        size_t all_packa_bytes =
                is_enable_filter_preprocess(param)
                        ? 0
                        : packa_bytes_per_oc_tile * oc_tiles_per_group * GROUP;
        return WorkspaceBundle{nullptr,
                               {all_packa_bytes, 0, all_threads_bytes}};
    }
    SmallVector<ConvBiasImpl::NCBKern> get_kern(
            const ConvBiasImpl::NCBKernSizeParam& param,
            WorkspaceBundle& whole_bundle, WorkspaceBundle& matmul_bundle,
            WorkspaceBundle& thread_bundle,
            Conv1x1StrategyBase* conv1x1_strategy,
            const MatrixMulImpl::AlgoBase* matmul_algo, size_t oc_block_size) {
        auto kern_packA =
                [whole_bundle, matmul_bundle, param, matmul_algo, oc_block_size,
                 conv1x1_strategy](
                        const ConvBiasImpl::NCBKernParam& ncb_param,
                        const ConvBiasImpl::NCBKernIndex& ncb_index) mutable {
                    conv1x1_strategy->packA(whole_bundle, matmul_bundle,
                                            oc_block_size, matmul_algo, param,
                                            ncb_param, std::move(ncb_index));
                };
        auto kern_compt =
                [whole_bundle, matmul_bundle, thread_bundle, matmul_algo, param,
                 oc_block_size, conv1x1_strategy](
                        const ConvBiasImpl::NCBKernParam& ncb_param,
                        const ConvBiasImpl::NCBKernIndex& ncb_index) mutable {
                    conv1x1_strategy->exec(whole_bundle, matmul_bundle,
                                           thread_bundle, oc_block_size,
                                           matmul_algo, param, ncb_param,
                                           std::move(ncb_index));
                };
        size_t GROUP = param.filter_meta.group;
        size_t BATCH = param.n;
        size_t OC = param.filter_meta.ocpg;
        size_t oc_blocks_per_group = div_ceil(OC, oc_block_size);
        SmallVector<ConvBiasImpl::NCBKern> ret_kern;
        if (!is_enable_filter_preprocess(param)) {
            ret_kern.push_back({kern_packA, {GROUP, oc_blocks_per_group}});
        }
        ret_kern.push_back({kern_compt, {BATCH, GROUP, oc_blocks_per_group}});
        return ret_kern;
    }
    SmallVector<ConvBiasImpl::NCBKern> get_kern_preprocess(
            const ConvBiasImpl::NCBKernSizeParam& param,
            WorkspaceBundle& whole_bundle, WorkspaceBundle& matmul_bundle,
            Conv1x1StrategyBase* conv1x1_strategy,
            const MatrixMulImpl::AlgoBase* matmul_algo, size_t oc_block_size) {
        auto kern_packA =
                [whole_bundle, matmul_bundle, param, matmul_algo, oc_block_size,
                 conv1x1_strategy](
                        const ConvBiasImpl::NCBKernParam& ncb_param,
                        const ConvBiasImpl::NCBKernIndex& ncb_index) mutable {
                    conv1x1_strategy->packA(whole_bundle, matmul_bundle,
                                            oc_block_size, matmul_algo, param,
                                            ncb_param, std::move(ncb_index));
                };
        size_t GROUP = param.filter_meta.group;
        size_t OC = param.filter_meta.ocpg;
        size_t oc_blocks_per_group = div_ceil(OC, oc_block_size);
        SmallVector<ConvBiasImpl::NCBKern> ret_kern;
        ret_kern.push_back({kern_packA, {GROUP, oc_blocks_per_group}});
        return ret_kern;
    }
};
template <>
@@ -69,14 +227,47 @@ public:
                               const MatrixMulImpl::AlgoBase* matmul_algo,
                               size_t oc_tile_size) {
        size_t matmul_size = matmul_algo->get_workspace(matmul_param);
        auto thread_bundle = utils::get_thread_bundle(param, matmul_size, oc_tile_size);
        auto thread_bundle =
                utils::get_thread_bundle(param, matmul_size, oc_tile_size);
        //! size per thread
        size_t all_threads_bytes =
                thread_bundle.total_size_in_bytes() * param.nr_threads;
        return WorkspaceBundle{nullptr, {0, 0, all_threads_bytes}};
    }
    SmallVector<ConvBiasImpl::NCBKern> get_kern(
            const ConvBiasImpl::NCBKernSizeParam& param,
            WorkspaceBundle& whole_bundle, WorkspaceBundle& matmul_bundle,
            WorkspaceBundle& thread_bundle,
            Conv1x1StrategyBase* conv1x1_strategy,
            const MatrixMulImpl::AlgoBase* matmul_algo, size_t oc_block_size) {
        auto kern_compt =
                [whole_bundle, matmul_bundle, thread_bundle, matmul_algo, param,
                 oc_block_size, conv1x1_strategy](
                        const ConvBiasImpl::NCBKernParam& ncb_param,
                        const ConvBiasImpl::NCBKernIndex& ncb_index) mutable {
                    conv1x1_strategy->exec(whole_bundle, matmul_bundle,
                                           thread_bundle, oc_block_size,
                                           matmul_algo, param, ncb_param,
                                           std::move(ncb_index));
                };
        size_t GROUP = param.filter_meta.group;
        size_t BATCH = param.n;
        size_t OC = param.filter_meta.ocpg;
        size_t oc_blocks_per_group = div_ceil(OC, oc_block_size);
        SmallVector<ConvBiasImpl::NCBKern> ret_kern;
        ret_kern.push_back({kern_compt, {BATCH, GROUP, oc_blocks_per_group}});
        return ret_kern;
    }
    SmallVector<ConvBiasImpl::NCBKern> get_kern_preprocess(
            const ConvBiasImpl::NCBKernSizeParam&, WorkspaceBundle&,
            WorkspaceBundle&, Conv1x1StrategyBase*,
            const MatrixMulImpl::AlgoBase*, size_t) {
        return {};
    }
};

} // namespace conv1x1
} // namespace fallback
} // namespace megdnn

// vim: syntax=cpp.doxygen
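// The patch replaces numeric midout tags (e.g. 0, 0, 1) with "..."_hash
// string tags. midout's real mechanism is not shown in this diff; a
// user-defined literal over a constexpr hash like the FNV-1a sketch below is
// one common way such tags become compile-time constants:
#include <cstddef>
#include <cstdint>

constexpr uint64_t sketch_fnv1a(const char* s, size_t n) {
    uint64_t h = 1469598103934665603ull;  // FNV offset basis
    for (size_t i = 0; i < n; ++i) {
        h ^= static_cast<uint8_t>(s[i]);
        h *= 1099511628211ull;  // FNV prime
    }
    return h;
}

constexpr uint64_t operator""_sketch_hash(const char* s, size_t n) {
    return sketch_fnv1a(s, n);
}

// every tag becomes a distinct compile-time constant usable as a case id
static_assert("get_bundle_default"_sketch_hash != "get_kern_default"_sketch_hash,
              "tags must differ");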
@@ -59,7 +59,8 @@ public:
template <typename src_ctype, typename bias_ctype, typename dst_ctype,
          typename op_ctype, typename op_dtype,
          megdnn::PostprocessMode postprocess_mode, MatrixMulImpl::AlgoBase::PackMode pack_mode>
          megdnn::PostprocessMode postprocess_mode,
          MatrixMulImpl::AlgoBase::PackMode pack_mode>
class Conv1x1Strategy : public Conv1x1StrategyBase {
public:
    explicit Conv1x1Strategy(size_t pack_size = 1) : m_pack_size(pack_size) {}
@@ -136,32 +137,30 @@ public:
            size_t packb_bytes_per_group = matmul_bundle.get_size(1);
            size_t GROUP = param.filter_meta.group;
            size_t BATCH = param.n;
            size_t SH = param.filter_meta.stride[0];
            size_t SW = param.filter_meta.stride[1];
            size_t OH = param.osz[0];
            size_t OW = param.osz[1];
            size_t OC = param.filter_meta.ocpg;
            size_t batch = ncb_index.ndrange_id[0];
            MatrixMulImpl::KernParam matmul_kern_param;
            static_cast<MatrixMulImpl::KernSizeParam&>(matmul_kern_param) =
                    utils::get_matmul_kern_param(param, OH * OW, OC);
            rep(batch, BATCH) {
                rep(g, GROUP) {
                    if (SH == 2 && SW == 2)
                        megdnn_throw("no support for stride = 2");
                    size_t bytes_offset_of_b_panel =
                            batch * packb_bytes_per_group * GROUP +
                            g * packb_bytes_per_group;
                    src_ctype* b_panel = reinterpret_cast<src_ctype*>(
                            reinterpret_cast<int8_t*>(whole_bundle.get(1)) +
                            bytes_offset_of_b_panel);
                    matmul_kern_param.B_ptr = const_cast<src_ctype*>(
                            ncb_param.src<src_ctype>(batch, g));
                    matmul_algo->pack_B(matmul_kern_param, b_panel, 0, OH * OW);
                }
            rep(g, GROUP) {
                if (SH == 2 && SW == 2)
                    megdnn_throw("no support for stride = 2");
                size_t bytes_offset_of_b_panel =
                        batch * packb_bytes_per_group * GROUP +
                        g * packb_bytes_per_group;
                src_ctype* b_panel = reinterpret_cast<src_ctype*>(
                        reinterpret_cast<int8_t*>(whole_bundle.get(1)) +
                        bytes_offset_of_b_panel);
                matmul_kern_param.B_ptr = const_cast<src_ctype*>(
                        ncb_param.src<src_ctype>(batch, g));
                matmul_algo->pack_B(matmul_kern_param, b_panel, 0, OH * OW);
            }
        } else {
            megdnn_log_error("OnlyPackA mode and NoPack mode have no packB kernel");
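// The packB offset arithmetic used above, isolated as a standalone sketch of
// the indexing only: pack buffers are laid out batch-major then group-major,
// each group occupying packb_bytes_per_group bytes.
#include <cstddef>
#include <cstdint>

inline uint8_t* sketch_packb_panel(uint8_t* base, size_t batch,
                                   size_t group_id, size_t num_groups,
                                   size_t packb_bytes_per_group) {
    size_t offset = batch * packb_bytes_per_group * num_groups +
                    group_id * packb_bytes_per_group;
    return base + offset;
}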