diff --git a/dnn/src/arm_common/pooling/algo.cpp b/dnn/src/arm_common/pooling/algo.cpp index 64c9b767..62419758 100644 --- a/dnn/src/arm_common/pooling/algo.cpp +++ b/dnn/src/arm_common/pooling/algo.cpp @@ -612,7 +612,7 @@ bool PoolingImpl::AlgoFilter3ModexStridexNCHW44::usable( (param.mode == Mode::MAX || param.mode == Mode::AVERAGE) && FH == 3 && FW == 3 && SW == SH && (SH == 1 || SW == 2); //! Int8 not support average, because its round mode is different form - //! quint8 + //! qint8 avaible &= !(param.src_type.enumv() == DTypeEnum::Int8 && param.mode == Mode::AVERAGE); return avaible; @@ -705,7 +705,7 @@ bool PoolingImpl::AlgoFilter2ModexStridexNCHW44::usable( (param.mode == Mode::MAX || param.mode == Mode::AVERAGE) && FH == 2 && FW == 2 && SH == SW && (SW == 1 || SW == 2); //! Int8 not support average, because its round mode is different form - //! quint8 + //! qint8 avaible &= !(param.src_type.enumv() == DTypeEnum::Int8 && param.mode == Mode::AVERAGE); return avaible; @@ -799,7 +799,7 @@ bool PoolingImpl::AlgoFilter4ModexStridexNCHW44::usable( FH == 4 && FW == 4 && SH == SW && (SW == 1 || SW == 2); //! Int8 not support average, because its round mode is different form - //! quint8 + //! qint8 avaible &= !(param.src_type.enumv() == DTypeEnum::Int8 && param.mode == Mode::AVERAGE); return avaible; @@ -892,7 +892,7 @@ bool PoolingImpl::AlgoFilter5ModexStridexNCHW44::usable( (param.mode == Mode::MAX || param.mode == Mode::AVERAGE) && FH == 5 && FW == 5 && SH == SW && (SW == 1 || SW == 2); //! Int8 not support average, because its round mode is different form - //! quint8 + //! qint8 avaible &= !(param.src_type.enumv() == DTypeEnum::Int8 && param.mode == Mode::AVERAGE); return avaible; diff --git a/dnn/src/fallback/conv_bias/conv1x1/algos.cpp b/dnn/src/fallback/conv_bias/conv1x1/algos.cpp index 0fadedbe..8eb504b2 100644 --- a/dnn/src/fallback/conv_bias/conv1x1/algos.cpp +++ b/dnn/src/fallback/conv_bias/conv1x1/algos.cpp @@ -47,7 +47,7 @@ size_t ConvBiasImpl::AlgoConv1x1::get_oc_tile_size_heuristic( return round_up(oc_block_size_one_thread, 24); } -size_t ConvBiasImpl::AlgoConv1x1::get_workspace( +WorkspaceBundle ConvBiasImpl::AlgoConv1x1::get_bundle_according_packmode( const NCBKernSizeParam& param) const { size_t OH = param.osz[0]; size_t OW = param.osz[1]; @@ -58,168 +58,195 @@ size_t ConvBiasImpl::AlgoConv1x1::get_workspace( auto pack_mode = m_matmul_algo->packmode(); if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT) { - MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 0, 0) { - Conv1x1Kerns dispatcher; - return dispatcher + MIDOUT_BEGIN(megdnn_fallback_conv1x1, + midout_iv("get_bundle_default"_hash)) { + return Conv1x1Kerns() .get_bundle(param, matmul_param, m_matmul_algo, - compt_oc_block_size) - .total_size_in_bytes(); + compt_oc_block_size); } MIDOUT_END(); } else if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) { - MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 0, 1) { - Conv1x1Kerns - dispatcher; - return dispatcher + MIDOUT_BEGIN(megdnn_fallback_conv1x1, + midout_iv("get_bundle_only_packa"_hash)) { + return Conv1x1Kerns() .get_bundle(param, matmul_param, m_matmul_algo, - compt_oc_block_size) - .total_size_in_bytes(); + compt_oc_block_size); } MIDOUT_END(); } else { - MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 0, 2) { - Conv1x1Kerns dispatcher; - return dispatcher + MIDOUT_BEGIN(megdnn_fallback_conv1x1, + midout_iv("get_bundle_no_pack"_hash)) { + return Conv1x1Kerns() .get_bundle(param, matmul_param, m_matmul_algo, - compt_oc_block_size) - .total_size_in_bytes(); + compt_oc_block_size); } MIDOUT_END(); } - return 0; + return {nullptr, {}}; } -SmallVector ConvBiasImpl::AlgoConv1x1::dispatch_kerns( +size_t ConvBiasImpl::AlgoConv1x1::get_workspace( const NCBKernSizeParam& param) const { - SmallVector ret_kern; + return get_bundle_according_packmode(param).total_size_in_bytes(); +} + +SmallVector +ConvBiasImpl::AlgoConv1x1::get_kerns_according_packmode( + const NCBKernSizeParam& param, bool weight_preprocess) const { size_t OH = param.osz[0]; size_t OW = param.osz[1]; - size_t OC = param.filter_meta.ocpg; size_t compt_oc_block_size = get_oc_tile_size_heuristic(param); - size_t GROUP = param.filter_meta.group; - size_t BATCH = param.n; - size_t oc_blocks_per_group = div_ceil(OC, compt_oc_block_size); + auto pack_mode = m_matmul_algo->packmode(); + Conv1x1StrategyBase* conv1x1_strategy = + Conv1x1Factory::make_conv1x1_strategy(param, pack_mode, + param.filter_meta.format); auto matmul_param = utils::get_matmul_kern_param(param, OH * OW, compt_oc_block_size); - WorkspaceBundle whole_bundle = {nullptr, {}}; - WorkspaceBundle thread_bundle = {nullptr, {}}; - WorkspaceBundle matmul_bundle = {nullptr, {}}; - auto pack_mode = m_matmul_algo->packmode(); + WorkspaceBundle whole_bundle = get_bundle_according_packmode(param); + //! NO_PACK not implement get_bundle + WorkspaceBundle matmul_bundle ={nullptr,{}}; + if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::NO_PACK) { + matmul_bundle = {nullptr, + {0, 0, m_matmul_algo->get_workspace(matmul_param)}}; + } else { + matmul_bundle = m_matmul_algo->get_bundle(matmul_param); + } + WorkspaceBundle thread_bundle = utils::get_thread_bundle( + param, matmul_bundle.get_size(2), compt_oc_block_size); + if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT) { - MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 1, 0) { - Conv1x1Kerns dispatcher; - whole_bundle = dispatcher.get_bundle( - param, matmul_param, m_matmul_algo, compt_oc_block_size); - matmul_bundle = m_matmul_algo->get_bundle(matmul_param); + MIDOUT_BEGIN(megdnn_fallback_conv1x1, + midout_iv("get_kern_default"_hash)) { + if (!weight_preprocess) { + return Conv1x1Kerns< + MatrixMulImpl::AlgoBase::PackMode::DEFAULT>() + .get_kern(param, whole_bundle, matmul_bundle, + thread_bundle, conv1x1_strategy, + m_matmul_algo, compt_oc_block_size); + } else { + return Conv1x1Kerns< + MatrixMulImpl::AlgoBase::PackMode::DEFAULT>() + .get_kern_preprocess(param, whole_bundle, matmul_bundle, + conv1x1_strategy, m_matmul_algo, + compt_oc_block_size); + } } MIDOUT_END(); } else if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) { - MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 1, 1) { - Conv1x1Kerns - dispatcher; - whole_bundle = dispatcher.get_bundle( - param, matmul_param, m_matmul_algo, compt_oc_block_size); - matmul_bundle = m_matmul_algo->get_bundle(matmul_param); + MIDOUT_BEGIN(megdnn_fallback_conv1x1, + midout_iv("get_kern_only_packa"_hash)) { + if (!weight_preprocess) { + return Conv1x1Kerns< + MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA>() + .get_kern(param, whole_bundle, matmul_bundle, + thread_bundle, conv1x1_strategy, + m_matmul_algo, compt_oc_block_size); + } else { + return Conv1x1Kerns< + MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA>() + .get_kern_preprocess(param, whole_bundle, matmul_bundle, + conv1x1_strategy, m_matmul_algo, + compt_oc_block_size); + } } MIDOUT_END(); } else { - MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 1, 2) { - Conv1x1Kerns dispatcher; - whole_bundle = dispatcher.get_bundle( - param, matmul_param, m_matmul_algo, compt_oc_block_size); - matmul_bundle = { - nullptr, - {0, 0, m_matmul_algo->get_workspace(matmul_param)}}; + MIDOUT_BEGIN(megdnn_fallback_conv1x1, + midout_iv("get_kern_no_pack"_hash)) { + if (!weight_preprocess) { + return Conv1x1Kerns< + MatrixMulImpl::AlgoBase::PackMode::NO_PACK>() + .get_kern(param, whole_bundle, matmul_bundle, + thread_bundle, conv1x1_strategy, + m_matmul_algo, compt_oc_block_size); + } else { + return Conv1x1Kerns< + MatrixMulImpl::AlgoBase::PackMode::NO_PACK>() + .get_kern_preprocess(param, whole_bundle, matmul_bundle, + conv1x1_strategy, m_matmul_algo, + compt_oc_block_size); + } } MIDOUT_END(); } +} - //! get thread bundle - thread_bundle = utils::get_thread_bundle(param, matmul_bundle.get_size(2), - compt_oc_block_size); - - Conv1x1StrategyBase* conv1x1_strategy = - Conv1x1Factory::make_conv1x1_strategy(param, pack_mode, - param.filter_meta.format); +SmallVector ConvBiasImpl::AlgoConv1x1::dispatch_kerns( + const NCBKernSizeParam& param) const { + return get_kerns_according_packmode(param, false); +} - auto kern_packA = [this, whole_bundle, matmul_bundle, param, - compt_oc_block_size, conv1x1_strategy]( - const NCBKernParam& ncb_param, - const NCBKernIndex& ncb_index) mutable { - conv1x1_strategy->packA(whole_bundle, matmul_bundle, - compt_oc_block_size, this->m_matmul_algo, param, - ncb_param, std::move(ncb_index)); - }; - auto kern_packB = [this, whole_bundle, matmul_bundle, param, - conv1x1_strategy]( - const NCBKernParam& ncb_param, - const NCBKernIndex& ncb_index) mutable { - conv1x1_strategy->packB(whole_bundle, matmul_bundle, - this->m_matmul_algo, param, ncb_param, - std::move(ncb_index)); - }; - auto kern_compt = [this, whole_bundle, matmul_bundle, thread_bundle, param, - compt_oc_block_size, conv1x1_strategy]( - const NCBKernParam& ncb_param, - const NCBKernIndex& ncb_index) mutable { - conv1x1_strategy->exec(whole_bundle, matmul_bundle, thread_bundle, - compt_oc_block_size, this->m_matmul_algo, param, - ncb_param, std::move(ncb_index)); - }; +SmallVector +ConvBiasImpl::AlgoConv1x1::deduce_preprocessed_filter_layout( + const NCBKernSizeParam& param) const { + MIDOUT_BEGIN(megdnn_fallback_conv1x1, + midout_iv("deduce_preprocessed_filter_layout"_hash)) { + WorkspaceBundle wb = get_bundle_according_packmode(param); - if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT || - pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) { - //! if enable filter preprocess kern_packA should not dispatch - if (!is_enable_filter_preprocess(param)) { - ret_kern.push_back({kern_packA, {GROUP, oc_blocks_per_group}}); - } - if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT) { - ret_kern.push_back({kern_packB, {1}}); - } + size_t GROUP = param.filter_meta.group; + SmallVector preprocessed_layouts; + preprocessed_layouts.push_back( + {{GROUP, wb.get_size(0)}, dtype::Int8()}); + return preprocessed_layouts; } - ret_kern.push_back({kern_compt, {BATCH, GROUP, oc_blocks_per_group}}); - return ret_kern; + MIDOUT_END(); + return {}; +} + +SmallVector +ConvBiasImpl::AlgoConv1x1::dispatch_preprocess_kerns( + const NCBKernSizeParam& param) const { + return get_kerns_according_packmode(param, true); } bool ConvBiasImpl::AlgoConv1x1::usable(const NCBKernSizeParam& param, AlgoSelectionStrategy) const { MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 2) { - if (param.filter_meta.format != param::ConvBias::Format::NCHW && - param.filter_meta.format != param::ConvBias::Format::NCHW44 && - param.filter_meta.format != param::ConvBias::Format::NCHW44_DOT) - return false; - size_t FH = param.filter_meta.spatial[0], FW = param.filter_meta.spatial[1]; size_t PH = param.filter_meta.padding[0], PW = param.filter_meta.padding[1]; size_t SH = param.filter_meta.stride[0], SW = param.filter_meta.stride[1]; - if (FH != 1 || FW != 1 || PH || PW || SH != 1 || SW != 1) + auto format = param.filter_meta.format; + size_t OH = param.osz[0]; + size_t OW = param.osz[1]; +#if MEGDNN_AARCH64 || MEGDNN_ARMV7 + if (format != param::ConvBias::Format::NCHW && + format != param::ConvBias::Format::NCHW44 && + format != param::ConvBias::Format::NCHW44_DOT) { return false; - if (param.src_type.enumv() != param.filter_type.enumv()) { + } + //! hybird mode is not support + if (param.filter_meta.format == param::ConvBias::Format::NCHW44 || + param.filter_meta.format == param::ConvBias::Format::NCHW44_DOT) { + if (param.filter_meta.icpg < 4_z || param.filter_meta.icpg == 1 || + param.filter_meta.ocpg == 1) { + return false; + } + } +#else + if (format != param::ConvBias::Format::NCHW) { return false; } - - //! only matmul's packmode is packa or default support weight preprocess - if (is_enable_filter_preprocess(param) && - (m_matmul_algo->packmode() == - fallback::MatrixMulImpl::AlgoBase::PackMode::NO_PACK)) { +#endif + //! param + if (FH != 1 || FW != 1 || PH || PW || SH != 1 || SW != 1) { return false; } - - if (param.src_type.enumv() != DTypeEnum::Int8 && - param.src_type.enumv() != DTypeEnum::QuantizedS8 && - param.src_type.enumv() != DTypeEnum::Quantized8Asymm && + //! data type + if (param.src_type.enumv() != param.filter_type.enumv() || + (param.src_type.enumv() != DTypeEnum::Int8 && + param.src_type.enumv() != DTypeEnum::QuantizedS8 && + param.src_type.enumv() != DTypeEnum::Quantized8Asymm && #if !MEGDNN_DISABLE_FLOAT16 - param.src_type.enumv() != DTypeEnum::Float16 && + param.src_type.enumv() != DTypeEnum::Float16 && #endif - param.src_type.enumv() != DTypeEnum::Float32) { + param.src_type.enumv() != DTypeEnum::Float32)) { return false; } - //! make sure 8x8x16 and 8x8x32 biasmode is nobias and nonlineMode //! is identity otherwise return false mean that 8x8x32 and 8x8x16 //! not support PostProcess @@ -231,27 +258,13 @@ bool ConvBiasImpl::AlgoConv1x1::usable(const NCBKernSizeParam& param, return false; } } - - if (param.filter_meta.format == param::ConvBias::Format::NCHW44 || - param.filter_meta.format == param::ConvBias::Format::NCHW44_DOT) { - if (param.filter_meta.icpg < 4_z || param.filter_meta.icpg == 1 || - param.filter_meta.ocpg == 1) { - return false; - } - } - - size_t OH = param.osz[0]; - size_t OW = param.osz[1]; - MatrixMulImpl::KernSizeParam matmul_param = utils::get_matmul_kern_param(param, OH * OW, get_oc_tile_size_heuristic(param)); bool matmul_usable = m_matmul_algo->usable(matmul_param); - auto pack_mode = m_matmul_algo->packmode(); bool strategy_usable = Conv1x1Factory::can_make_conv1x1_strategy( param, pack_mode, param.filter_meta.format); - return matmul_usable && strategy_usable && (param.filter_meta.dilation[0] == param.filter_meta.dilation[1] && @@ -262,121 +275,6 @@ bool ConvBiasImpl::AlgoConv1x1::usable(const NCBKernSizeParam& param, return false; } -SmallVector -ConvBiasImpl::AlgoConv1x1::deduce_preprocessed_filter_layout( - const NCBKernSizeParam& param) const { - MIDOUT_BEGIN( - megdnn_fallback_conv1x1, - midout_iv( - "ConvBiasImpl::AlgoConv1x1::deduce_preprocessed_filter_layout"_hash)) { - fallback::MatrixMulImpl::AlgoBase::MatmulDescription matmul_desc = - m_matmul_algo->matmul_description(); - bool default_pack = matmul_desc.packmode == - MatrixMulImpl::AlgoBase::PackMode::DEFAULT; - bool only_packA = matmul_desc.packmode == - MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA; - //! only support default_pack and only_packa mode - if (matmul_desc.packmode == - MatrixMulImpl::AlgoBase::PackMode::NO_PACK) { - return {}; - } - size_t OH = param.osz[0]; - size_t OW = param.osz[1]; - size_t compt_oc_block_size = get_oc_tile_size_heuristic(param); - - auto matmul_param = utils::get_matmul_kern_param(param, OH * OW, - compt_oc_block_size); - - WorkspaceBundle wb(nullptr, {}); - if (default_pack) { - Conv1x1Kerns dispatcher; - wb = dispatcher.get_bundle(param, matmul_param, m_matmul_algo, - compt_oc_block_size); - } else if (only_packA) { - Conv1x1Kerns - dispatcher; - wb = dispatcher.get_bundle(param, matmul_param, m_matmul_algo, - compt_oc_block_size); - } - - size_t GROUP = param.filter_meta.group; - SmallVector preprocessed_layouts; - preprocessed_layouts.push_back( - {{GROUP, wb.get_size(0)}, dtype::Int8()}); - return preprocessed_layouts; - } - MIDOUT_END(); - return {}; -} - -SmallVector -ConvBiasImpl::AlgoConv1x1::dispatch_preprocess_kerns( - const NCBKernSizeParam& param) const { - MIDOUT_BEGIN( - megdnn_fallback_conv1x1, - midout_iv( - "ConvBiasImpl::AlgoConv1x1::dispatch_preprocess_kerns"_hash)) { - SmallVector ret_kern; - size_t OH = param.osz[0]; - size_t OW = param.osz[1]; - size_t OC = param.filter_meta.ocpg; - size_t compt_oc_block_size = get_oc_tile_size_heuristic(param); - size_t GROUP = param.filter_meta.group; - size_t oc_blocks_per_group = div_ceil(OC, compt_oc_block_size); - - auto matmul_param = utils::get_matmul_kern_param(param, OH * OW, - compt_oc_block_size); - WorkspaceBundle whole_bundle = {nullptr, {}}; - WorkspaceBundle matmul_bundle = {nullptr, {}}; - auto pack_mode = m_matmul_algo->packmode(); - if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT) { - MIDOUT_BEGIN(megdnn_fallback_conv1x1, - midout_iv("get_defaul_matmul_packmode_bundle"_hash)) { - Conv1x1Kerns - dispatcher; - whole_bundle = dispatcher.get_bundle(param, matmul_param, - m_matmul_algo, - compt_oc_block_size); - matmul_bundle = m_matmul_algo->get_bundle(matmul_param); - } - MIDOUT_END(); - } else if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) { - MIDOUT_BEGIN( - megdnn_fallback_conv1x1, - midout_iv("get_onlypacka_matmul_packmode_bundle"_hash)) { - Conv1x1Kerns - dispatcher; - whole_bundle = dispatcher.get_bundle(param, matmul_param, - m_matmul_algo, - compt_oc_block_size); - matmul_bundle = m_matmul_algo->get_bundle(matmul_param); - } - MIDOUT_END(); - } else { - //! if nopack return null so that OprWeightPreprocessProxy can run - //! with nopack mode - return {}; - } - - Conv1x1StrategyBase* conv1x1_strategy = - Conv1x1Factory::make_conv1x1_strategy(param, pack_mode, - param.filter_meta.format); - - auto kern_packA = [this, whole_bundle, matmul_bundle, param, - compt_oc_block_size, conv1x1_strategy]( - const NCBKernParam& ncb_param, - const NCBKernIndex& ncb_index) mutable { - conv1x1_strategy->packA(whole_bundle, matmul_bundle, - compt_oc_block_size, this->m_matmul_algo, - param, ncb_param, std::move(ncb_index)); - }; - - ret_kern.push_back({kern_packA, {GROUP, oc_blocks_per_group}}); - return ret_kern; - } - MIDOUT_END(); - return {}; -} bool ConvBiasImpl::AlgoConv1x1::is_preferred( const NCBKernSizeParam& param) const { diff --git a/dnn/src/fallback/conv_bias/conv1x1/algos.h b/dnn/src/fallback/conv_bias/conv1x1/algos.h index b4267100..6c7f5bf0 100644 --- a/dnn/src/fallback/conv_bias/conv1x1/algos.h +++ b/dnn/src/fallback/conv_bias/conv1x1/algos.h @@ -20,6 +20,11 @@ namespace megdnn { namespace fallback { class ConvBiasImpl::AlgoConv1x1 final : public AlgoBase { + WorkspaceBundle get_bundle_according_packmode( + const NCBKernSizeParam& param) const; + SmallVector get_kerns_according_packmode( + const NCBKernSizeParam& param, bool weight_preprocess) const; + public: AlgoConv1x1(MatrixMulImpl::AlgoBase* matmul_algo, size_t oc_block_size) : m_matmul_algo(matmul_algo), m_oc_block_size(oc_block_size) {} @@ -41,7 +46,7 @@ public: const NCBKernSizeParam& param) const override; bool is_preferred(const NCBKernSizeParam&) const override; - + SmallVector deduce_preprocessed_filter_layout( const NCBKernSizeParam& param) const override; size_t get_preprocess_workspace( diff --git a/dnn/src/fallback/conv_bias/conv1x1/algos_conv1x1_gemv.cpp b/dnn/src/fallback/conv_bias/conv1x1/algos_conv1x1_gemv.cpp index ef6a599e..83a3f024 100644 --- a/dnn/src/fallback/conv_bias/conv1x1/algos_conv1x1_gemv.cpp +++ b/dnn/src/fallback/conv_bias/conv1x1/algos_conv1x1_gemv.cpp @@ -360,23 +360,23 @@ ConvBiasImpl::AlgoConv1x1Gemv::dispatch_kerns( dt_uint8, PostprocessMode::QUANTIZED, "NCHW::GEMV::QUINT8x8x32_QUINT8"_hash); break; - + //!no support nchw44 8x8x16 case param::ConvBias::Format::NCHW44: cb1(param::ConvBias::Format::NCHW44, dt_float32, dt_float32, PostprocessMode::FLOAT, "NCHW44::GEMV::FLOAT"_hash); - cb2(param::ConvBias::Format::NCHW44, dt_int8, dt_int32, dt_int32, - dt_int8, dt_int32, dt_int32, PostprocessMode::NO_PROCESS, + cb3(param::ConvBias::Format::NCHW44, dt_int8, dt_int32, dt_int32, + dt_int8, dt_int32, dt_int32, PostprocessMode::ADD_BIAS, "NCHW44::GEMV::INT8x8x32_INT32"_hash); - cb2(param::ConvBias::Format::NCHW44, dtype::QuantizedS8, + cb3(param::ConvBias::Format::NCHW44, dtype::QuantizedS8, dtype::QuantizedS32, dtype::QuantizedS32, dt_int8, dt_int32, - dt_int32, PostprocessMode::NO_PROCESS, + dt_int32, PostprocessMode::ADD_BIAS, "NCHW44::GEMV::QINT8x8x32_QINT32"_hash); cb2(param::ConvBias::Format::NCHW44, dtype::QuantizedS8, dtype::QuantizedS32, dtype::QuantizedS8, dt_int8, dt_int32, dt_int8, PostprocessMode::QUANTIZED, "NCHW44::GEMV::QINT8x8x32_QINT8"_hash); break; - + //!no support nchw44-dot 8x8x16 case param::ConvBias::Format::NCHW44_DOT: cb3(param::ConvBias::Format::NCHW44_DOT, dt_int8, dt_int32, dt_int32, dt_int8, dt_int32, dt_int32, @@ -420,81 +420,74 @@ bool ConvBiasImpl::AlgoConv1x1Gemv::usable(const NCBKernSizeParam& param, MIDOUT_BEGIN(megdnn_fallback_conv1x1_gemv, midout_iv("AlgoConv1x1Gemv::usable"_hash)) { auto format = param.filter_meta.format; -#if MEGDNN_X86 - if (format != param::ConvBias::Format::NCHW) - return false; -#elif MEGDNN_AARCH64 || MEGDNN_ARMV7 - if (format != param::ConvBias::Format::NCHW && - format != param::ConvBias::Format::NCHW44 && - format != param::ConvBias::Format::NCHW44_DOT) - return false; -#endif - - //! whether 1x1 size_t FH = param.filter_meta.spatial[0], FW = param.filter_meta.spatial[1]; size_t PH = param.filter_meta.padding[0], PW = param.filter_meta.padding[1]; size_t SH = param.filter_meta.stride[0], SW = param.filter_meta.stride[1]; - - if (FH != 1 || FW != 1 || PH || PW || SH != 1 || SW != 1) { - return false; - } - - //! whether gemv size_t OH = param.osz[0]; size_t OW = param.osz[1]; - if (OH * OW != 1) { + //! whether gemv and 1x1 + if (OH * OW != 1 || FH != 1 || FW != 1 || PH || PW || SH != 1 || + SW != 1) { return false; } - - //! even no naive support in gemv - if ((param.src_type.enumv() == param.filter_type.enumv() && - param.src_type.enumv() == DTypeEnum::Int16) && - param.dst_type.enumv() == DTypeEnum::Int32) { +#if MEGDNN_AARCH64 || MEGDNN_ARMV7 + if (format != param::ConvBias::Format::NCHW && + format != param::ConvBias::Format::NCHW44 && + format != param::ConvBias::Format::NCHW44_DOT) { return false; } - - //! make sure 8x8x16 and 8x8x32 biasmode is nobias and nonlineMode - //! is identity otherwise return false mean that 8x8x32 and 8x8x16 - //! not support PostProcess - if (param.dst_type.enumv() == DTypeEnum::Int16 || - param.dst_type.enumv() == DTypeEnum::Int32 || - param.dst_type.enumv() == DTypeEnum::QuantizedS32) { - if (param.nonlineMode != megdnn::NonlineMode::IDENTITY) { - return false; - } - } - - //! supports a few dtypes - if (param.src_type.enumv() != param.filter_type.enumv()) { +#else + if (format != param::ConvBias::Format::NCHW) { return false; } - - if (param.src_type.enumv() != DTypeEnum::Int8 && - param.src_type.enumv() != DTypeEnum::QuantizedS8 && - param.src_type.enumv() != DTypeEnum::Quantized8Asymm && +#endif + //! supports a few dtypes + if (param.src_type.enumv() != param.filter_type.enumv() || + (param.src_type.enumv() != DTypeEnum::Int8 && + param.src_type.enumv() != DTypeEnum::QuantizedS8 && + param.src_type.enumv() != DTypeEnum::Quantized8Asymm && #if !MEGDNN_DISABLE_FLOAT16 - param.src_type.enumv() != DTypeEnum::Float16 && + param.src_type.enumv() != DTypeEnum::Float16 && #endif - param.src_type.enumv() != DTypeEnum::Float32) { + param.src_type.enumv() != DTypeEnum::Float32)) { return false; } -#if MEGDNN_AARCH64 || MEGDNN_ARMV7 if (format == param::ConvBias::Format::NCHW44) { if (param.src_type.enumv() != DTypeEnum::Float32 && param.src_type.enumv() != DTypeEnum::Int8 && param.src_type.enumv() != DTypeEnum::QuantizedS8) { return false; } + //! 8x8x16 is not support nchw44 + if (param.src_type.enumv() == DTypeEnum::Int8 && + param.dst_type.enumv() == DTypeEnum::Int16) { + return false; + } } else if (format == param::ConvBias::Format::NCHW44_DOT) { - if (param.src_type.enumv() != DTypeEnum::Int8 && - param.src_type.enumv() != DTypeEnum::QuantizedS8) { + if ((param.src_type.enumv() != DTypeEnum::Int8 && + param.src_type.enumv() != DTypeEnum::QuantizedS8) || + param.dst_type.enumv() == DTypeEnum::Int16) { return false; } } -#endif + //! make sure 8x8x16 and 8x8x32 biasmode nonlineMode is identity + //! otherwise return false + if (param.dst_type.enumv() == DTypeEnum::Int16 || + param.dst_type.enumv() == DTypeEnum::Int32 || + param.dst_type.enumv() == DTypeEnum::QuantizedS32) { + if (param.nonlineMode != megdnn::NonlineMode::IDENTITY) { + return false; + } + } + //! even no naive support in gemv + if ((param.src_type.enumv() == param.filter_type.enumv() && + param.src_type.enumv() == DTypeEnum::Int16) && + param.dst_type.enumv() == DTypeEnum::Int32) { + return false; + } return (param.filter_meta.dilation[0] == param.filter_meta.dilation[1] && param.filter_meta.dilation[0] == 1) && diff --git a/dnn/src/fallback/conv_bias/conv1x1/conv1x1_dispatcher.h b/dnn/src/fallback/conv_bias/conv1x1/conv1x1_dispatcher.h index 30bb6147..357fa6ad 100644 --- a/dnn/src/fallback/conv_bias/conv1x1/conv1x1_dispatcher.h +++ b/dnn/src/fallback/conv_bias/conv1x1/conv1x1_dispatcher.h @@ -11,14 +11,19 @@ #pragma once +#include "src/fallback/conv_bias/conv1x1/conv1x1_strategy.h" #include "src/fallback/conv_bias/conv1x1/conv1x1_utils.h" +#include "src/fallback/conv_bias/opr_impl.h" namespace megdnn { namespace fallback { namespace conv1x1 { template -class Conv1x1Kerns { +class Conv1x1Kerns; + +template <> +class Conv1x1Kerns { public: //! get_bundle WorkspaceBundle get_bundle(const ConvBiasImpl::NCBKernSizeParam& param, @@ -28,13 +33,12 @@ public: size_t GROUP = param.filter_meta.group; size_t OC = param.filter_meta.ocpg; size_t BATCH = param.n; - //! bundle per thread //! matmul_param records a matmul with M = oc_tile_size, K = IC, N = OH //! * OW this does not bother packb bytes auto matmul_bundle = matmul_algo->get_bundle(matmul_param); - auto thread_bundle = utils::get_thread_bundle(param, matmul_bundle.get_size(2), - oc_tile_size); + auto thread_bundle = utils::get_thread_bundle( + param, matmul_bundle.get_size(2), oc_tile_size); //! size per thread size_t all_threads_bytes = thread_bundle.total_size_in_bytes() * param.nr_threads; @@ -46,11 +50,6 @@ public: is_enable_filter_preprocess(param) ? 0 : packa_bytes_per_oc_tile * oc_tiles_per_group * GROUP; - - if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) - return WorkspaceBundle{nullptr, - {all_packa_bytes, 0, all_threads_bytes}}; - //! packb size = N * GROUP * packb_size_per_group size_t packb_bytes_per_group = matmul_bundle.get_size(1); size_t all_packb_bytes = packb_bytes_per_group * GROUP * BATCH; @@ -58,6 +57,165 @@ public: return WorkspaceBundle{ nullptr, {all_packa_bytes, all_packb_bytes, all_threads_bytes}}; } + + SmallVector get_kern( + const ConvBiasImpl::NCBKernSizeParam& param, + WorkspaceBundle& whole_bundle, WorkspaceBundle& matmul_bundle, + WorkspaceBundle& thread_bundle, + Conv1x1StrategyBase* conv1x1_strategy, + const MatrixMulImpl::AlgoBase* matmul_algo, size_t oc_block_size) { + auto kern_packA = + [whole_bundle, matmul_bundle, param, matmul_algo, oc_block_size, + conv1x1_strategy]( + const ConvBiasImpl::NCBKernParam& ncb_param, + const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { + conv1x1_strategy->packA(whole_bundle, matmul_bundle, + oc_block_size, matmul_algo, param, + ncb_param, std::move(ncb_index)); + }; + auto kern_packB = + [whole_bundle, matmul_bundle, param, matmul_algo, + conv1x1_strategy]( + const ConvBiasImpl::NCBKernParam& ncb_param, + const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { + conv1x1_strategy->packB(whole_bundle, matmul_bundle, + matmul_algo, param, ncb_param, + std::move(ncb_index)); + }; + auto kern_compt = + [whole_bundle, matmul_bundle, thread_bundle, matmul_algo, param, + oc_block_size, conv1x1_strategy]( + const ConvBiasImpl::NCBKernParam& ncb_param, + const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { + conv1x1_strategy->exec(whole_bundle, matmul_bundle, + thread_bundle, oc_block_size, + matmul_algo, param, ncb_param, + std::move(ncb_index)); + }; + size_t GROUP = param.filter_meta.group; + size_t BATCH = param.n; + size_t OC = param.filter_meta.ocpg; + size_t oc_blocks_per_group = div_ceil(OC, oc_block_size); + SmallVector ret_kern; + if (!is_enable_filter_preprocess(param)) { + ret_kern.push_back({kern_packA, {GROUP, oc_blocks_per_group}}); + } + ret_kern.push_back({kern_packB, {BATCH}}); + ret_kern.push_back({kern_compt, {BATCH, GROUP, oc_blocks_per_group}}); + return ret_kern; + } + SmallVector get_kern_preprocess( + const ConvBiasImpl::NCBKernSizeParam& param, + WorkspaceBundle& whole_bundle, WorkspaceBundle& matmul_bundle, + Conv1x1StrategyBase* conv1x1_strategy, + const MatrixMulImpl::AlgoBase* matmul_algo, size_t oc_block_size) { + auto kern_packA = + [whole_bundle, matmul_bundle, param, matmul_algo, oc_block_size, + conv1x1_strategy]( + const ConvBiasImpl::NCBKernParam& ncb_param, + const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { + conv1x1_strategy->packA(whole_bundle, matmul_bundle, + oc_block_size, matmul_algo, param, + ncb_param, std::move(ncb_index)); + }; + size_t GROUP = param.filter_meta.group; + size_t OC = param.filter_meta.ocpg; + size_t oc_blocks_per_group = div_ceil(OC, oc_block_size); + SmallVector ret_kern; + ret_kern.push_back({kern_packA, {GROUP, oc_blocks_per_group}}); + return ret_kern; + } + +}; + +template<> +class Conv1x1Kerns { +public: + //! get_bundle + WorkspaceBundle get_bundle(const ConvBiasImpl::NCBKernSizeParam& param, + const MatrixMulImpl::KernSizeParam& matmul_param, + const MatrixMulImpl::AlgoBase* matmul_algo, + size_t oc_tile_size) { + size_t GROUP = param.filter_meta.group; + size_t OC = param.filter_meta.ocpg; + //! bundle per thread + //! matmul_param records a matmul with M = oc_tile_size, K = IC, N = OH + //! * OW this does not bother packb bytes + auto matmul_bundle = matmul_algo->get_bundle(matmul_param); + auto thread_bundle = utils::get_thread_bundle( + param, matmul_bundle.get_size(2), oc_tile_size); + //! size per thread + size_t all_threads_bytes = + thread_bundle.total_size_in_bytes() * param.nr_threads; + + //! packa size = GROUP * packa_size_each_group + size_t packa_bytes_per_oc_tile = matmul_bundle.get_size(0); + size_t oc_tiles_per_group = div_ceil(OC, oc_tile_size); + size_t all_packa_bytes = + is_enable_filter_preprocess(param) + ? 0 + : packa_bytes_per_oc_tile * oc_tiles_per_group * GROUP; + + return WorkspaceBundle{nullptr, + {all_packa_bytes, 0, all_threads_bytes}}; + } + SmallVector get_kern( + const ConvBiasImpl::NCBKernSizeParam& param, + WorkspaceBundle& whole_bundle, WorkspaceBundle& matmul_bundle, + WorkspaceBundle& thread_bundle, + Conv1x1StrategyBase* conv1x1_strategy, + const MatrixMulImpl::AlgoBase* matmul_algo, size_t oc_block_size) { + auto kern_packA = + [whole_bundle, matmul_bundle, param, matmul_algo, oc_block_size, + conv1x1_strategy]( + const ConvBiasImpl::NCBKernParam& ncb_param, + const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { + conv1x1_strategy->packA(whole_bundle, matmul_bundle, + oc_block_size, matmul_algo, param, + ncb_param, std::move(ncb_index)); + }; + auto kern_compt = + [whole_bundle, matmul_bundle, thread_bundle, matmul_algo, param, + oc_block_size, conv1x1_strategy]( + const ConvBiasImpl::NCBKernParam& ncb_param, + const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { + conv1x1_strategy->exec(whole_bundle, matmul_bundle, + thread_bundle, oc_block_size, + matmul_algo, param, ncb_param, + std::move(ncb_index)); + }; + size_t GROUP = param.filter_meta.group; + size_t BATCH = param.n; + size_t OC = param.filter_meta.ocpg; + size_t oc_blocks_per_group = div_ceil(OC, oc_block_size); + SmallVector ret_kern; + if (!is_enable_filter_preprocess(param)) { + ret_kern.push_back({kern_packA, {GROUP, oc_blocks_per_group}}); + } + ret_kern.push_back({kern_compt, {BATCH, GROUP, oc_blocks_per_group}}); + return ret_kern; + } + SmallVector get_kern_preprocess( + const ConvBiasImpl::NCBKernSizeParam& param, + WorkspaceBundle& whole_bundle, WorkspaceBundle& matmul_bundle, + Conv1x1StrategyBase* conv1x1_strategy, + const MatrixMulImpl::AlgoBase* matmul_algo, size_t oc_block_size) { + auto kern_packA = + [whole_bundle, matmul_bundle, param, matmul_algo, oc_block_size, + conv1x1_strategy]( + const ConvBiasImpl::NCBKernParam& ncb_param, + const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { + conv1x1_strategy->packA(whole_bundle, matmul_bundle, + oc_block_size, matmul_algo, param, + ncb_param, std::move(ncb_index)); + }; + size_t GROUP = param.filter_meta.group; + size_t OC = param.filter_meta.ocpg; + size_t oc_blocks_per_group = div_ceil(OC, oc_block_size); + SmallVector ret_kern; + ret_kern.push_back({kern_packA, {GROUP, oc_blocks_per_group}}); + return ret_kern; + } }; template<> @@ -69,14 +227,47 @@ public: const MatrixMulImpl::AlgoBase* matmul_algo, size_t oc_tile_size) { size_t matmul_size = matmul_algo->get_workspace(matmul_param); - auto thread_bundle = utils::get_thread_bundle(param, matmul_size, oc_tile_size); + auto thread_bundle = + utils::get_thread_bundle(param, matmul_size, oc_tile_size); //! size per thread size_t all_threads_bytes = thread_bundle.total_size_in_bytes() * param.nr_threads; return WorkspaceBundle{nullptr, {0, 0, all_threads_bytes}}; } + SmallVector get_kern( + const ConvBiasImpl::NCBKernSizeParam& param, + WorkspaceBundle& whole_bundle, WorkspaceBundle& matmul_bundle, + WorkspaceBundle& thread_bundle, + Conv1x1StrategyBase* conv1x1_strategy, + const MatrixMulImpl::AlgoBase* matmul_algo, size_t oc_block_size) { + auto kern_compt = + [whole_bundle, matmul_bundle, thread_bundle, matmul_algo, param, + oc_block_size, conv1x1_strategy]( + const ConvBiasImpl::NCBKernParam& ncb_param, + const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { + conv1x1_strategy->exec(whole_bundle, matmul_bundle, + thread_bundle, oc_block_size, + matmul_algo, param, ncb_param, + std::move(ncb_index)); + }; + size_t GROUP = param.filter_meta.group; + size_t BATCH = param.n; + size_t OC = param.filter_meta.ocpg; + size_t oc_blocks_per_group = div_ceil(OC, oc_block_size); + SmallVector ret_kern; + ret_kern.push_back({kern_compt, {BATCH, GROUP, oc_blocks_per_group}}); + return ret_kern; + } + SmallVector get_kern_preprocess( + const ConvBiasImpl::NCBKernSizeParam&, WorkspaceBundle&, + WorkspaceBundle&, Conv1x1StrategyBase*, + const MatrixMulImpl::AlgoBase*, size_t) { + return {}; + } }; } // namespace conv1x1 } // namespace fallback } // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.h b/dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.h index cc173cf5..eaf62551 100644 --- a/dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.h +++ b/dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.h @@ -59,7 +59,8 @@ public: template + megdnn::PostprocessMode postprocess_mode, + MatrixMulImpl::AlgoBase::PackMode pack_mode> class Conv1x1Strategy : public Conv1x1StrategyBase { public: explicit Conv1x1Strategy(size_t pack_size = 1) : m_pack_size(pack_size) {} @@ -136,32 +137,30 @@ public: size_t packb_bytes_per_group = matmul_bundle.get_size(1); size_t GROUP = param.filter_meta.group; - size_t BATCH = param.n; size_t SH = param.filter_meta.stride[0]; size_t SW = param.filter_meta.stride[1]; size_t OH = param.osz[0]; size_t OW = param.osz[1]; size_t OC = param.filter_meta.ocpg; + size_t batch = ncb_index.ndrange_id[0]; MatrixMulImpl::KernParam matmul_kern_param; static_cast(matmul_kern_param) = utils::get_matmul_kern_param(param, OH * OW, OC); - rep(batch, BATCH) { - rep(g, GROUP) { - if (SH == 2 && SW == 2) - megdnn_throw("no support for stride = 2"); - - size_t bytes_offset_of_b_panel = - batch * packb_bytes_per_group * GROUP + - g * packb_bytes_per_group; - src_ctype* b_panel = reinterpret_cast( - reinterpret_cast(whole_bundle.get(1)) + - bytes_offset_of_b_panel); - matmul_kern_param.B_ptr = const_cast( - ncb_param.src(batch, g)); - matmul_algo->pack_B(matmul_kern_param, b_panel, 0, OH * OW); - } + rep(g, GROUP) { + if (SH == 2 && SW == 2) + megdnn_throw("no support for stride = 2"); + + size_t bytes_offset_of_b_panel = + batch * packb_bytes_per_group * GROUP + + g * packb_bytes_per_group; + src_ctype* b_panel = reinterpret_cast( + reinterpret_cast(whole_bundle.get(1)) + + bytes_offset_of_b_panel); + matmul_kern_param.B_ptr = const_cast( + ncb_param.src(batch, g)); + matmul_algo->pack_B(matmul_kern_param, b_panel, 0, OH * OW); } } else { megdnn_log_error("OnlyPackA mode and NoPack mode has no packB kernel");