diff --git a/dnn/src/aarch64/conv_bias/fp16/algos.cpp b/dnn/src/aarch64/conv_bias/fp16/algos.cpp index 4bce406b..cab6df54 100644 --- a/dnn/src/aarch64/conv_bias/fp16/algos.cpp +++ b/dnn/src/aarch64/conv_bias/fp16/algos.cpp @@ -89,19 +89,20 @@ ConvBiasImpl::AlgoF16DirectStride2::get_kimpls( conv = fp16::conv_stride2::do_conv_7x7_stride2; } - WorkspaceBundle wbundle = arm_common::MultithreadDirectConvCommon< + WorkspaceBundle bundle = arm_common::MultithreadDirectConvCommon< dt_float16, __fp16>::get_bundle_stride(param, m_large_group); SmallVector ret_kerns; //! Dense conv and small group if (m_large_group) { //! Channel wise conv and big groups - auto exec_one_group = [wbundle, conv](const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + auto exec_one_group = [bundle, conv]( + const NCBKernParam& kern_param, + const NCBKernIndex& ncb_index) mutable { auto fm = kern_param.filter_meta; size_t IC = fm.icpg; size_t OC = fm.ocpg; - WorkspaceBundle bundle = wbundle; + bundle.set(kern_param.workspace_ptr); for (size_t ic = 0; ic < IC; ic++) { arm_common::MultithreadDirectConvCommon:: copy_padding_kern_stride(bundle, kern_param, ncb_index, @@ -115,16 +116,17 @@ ConvBiasImpl::AlgoF16DirectStride2::get_kimpls( }; ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); } else { - WorkspaceBundle bundle = wbundle; auto copy_padding = [bundle](const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + const NCBKernIndex& ncb_index) mutable { + bundle.set(kern_param.workspace_ptr); arm_common::MultithreadDirectConvCommon:: copy_padding_kern_stride(bundle, kern_param, ncb_index, ncb_index.ndrange_id); }; ret_kerns.push_back({copy_padding, {group, N, IC}}); auto do_conv = [bundle, conv](const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + const NCBKernIndex& ncb_index) mutable { + bundle.set(kern_param.workspace_ptr); arm_common::MultithreadDirectConvCommon:: do_conv_kern_stride(bundle, kern_param, ncb_index, conv, ncb_index.ndrange_id); diff --git a/dnn/src/aarch64/conv_bias/fp32/algos.cpp b/dnn/src/aarch64/conv_bias/fp32/algos.cpp index 300e0c2b..38848b46 100644 --- a/dnn/src/aarch64/conv_bias/fp32/algos.cpp +++ b/dnn/src/aarch64/conv_bias/fp32/algos.cpp @@ -88,19 +88,20 @@ ConvBiasImpl::AlgoF32DirectStride2::get_kimpls( conv = fp32::conv_stride2::do_conv_7x7_stride2; } - WorkspaceBundle wbundle = arm_common::MultithreadDirectConvCommon< + WorkspaceBundle bundle = arm_common::MultithreadDirectConvCommon< float, float>::get_bundle_stride(param, m_large_group); SmallVector ret_kerns; //! Dense conv and small group if (m_large_group) { //! Channel wise conv and big groups - auto exec_one_group = [wbundle, conv](const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + auto exec_one_group = [bundle, conv]( + const NCBKernParam& kern_param, + const NCBKernIndex& ncb_index) mutable { auto fm = kern_param.filter_meta; size_t IC = fm.icpg; size_t OC = fm.ocpg; - WorkspaceBundle bundle = wbundle; + bundle.set(kern_param.workspace_ptr); for (size_t ic = 0; ic < IC; ic++) { arm_common::MultithreadDirectConvCommon:: copy_padding_kern_stride(bundle, kern_param, ncb_index, @@ -116,16 +117,17 @@ ConvBiasImpl::AlgoF32DirectStride2::get_kimpls( }; ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); } else { - WorkspaceBundle bundle = wbundle; auto copy_padding = [bundle](const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + const NCBKernIndex& ncb_index) mutable { + bundle.set(kern_param.workspace_ptr); arm_common::MultithreadDirectConvCommon:: copy_padding_kern_stride(bundle, kern_param, ncb_index, ncb_index.ndrange_id); }; ret_kerns.push_back({copy_padding, {group, N, IC}}); auto do_conv = [bundle, conv](const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + const NCBKernIndex& ncb_index) mutable { + bundle.set(kern_param.workspace_ptr); arm_common::MultithreadDirectConvCommon< float, float>::do_conv_kern_stride(bundle, kern_param, ncb_index, conv, diff --git a/dnn/src/arm_common/conv_bias/direct/multi_thread_common.cpp b/dnn/src/arm_common/conv_bias/direct/multi_thread_common.cpp index e833b7fb..7304e4d4 100644 --- a/dnn/src/arm_common/conv_bias/direct/multi_thread_common.cpp +++ b/dnn/src/arm_common/conv_bias/direct/multi_thread_common.cpp @@ -119,7 +119,8 @@ MultithreadDirectConvCommon::get_bundle_stride( //! Process one output channel weight flip template void MultithreadDirectConvCommon::weight_flip_kern( - WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param, + const WorkspaceBundle& bundle, + const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { size_t FH = kern_param.filter_meta.spatial[0]; @@ -131,7 +132,6 @@ void MultithreadDirectConvCommon::weight_flip_kern( group_id = ncb_index.ndrange_id[0]; const io_ctype* filter = kern_param.filter(group_id) + channel_id * FH * FW * IC; - bundle.set(kern_param.workspace_ptr); io_ctype* filter_flip = static_cast(bundle.get(1)) + (workspace_group_id * IC * OC + channel_id * IC) * FH * FW; @@ -148,7 +148,8 @@ void MultithreadDirectConvCommon::weight_flip_kern( //! Process one input channel copy padding template void MultithreadDirectConvCommon::copy_padding_kern( - WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param, + const WorkspaceBundle& bundle, + const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { size_t IH = kern_param.isz[0]; @@ -161,7 +162,6 @@ void MultithreadDirectConvCommon::copy_padding_kern( size_t padding_group_size = IH2 * IW2 * IC; size_t N = kern_param.n; size_t GROUP = kern_param.filter_meta.group; - bundle.set(kern_param.workspace_ptr); //! Used for get the workspace offset size_t workspace_group_id = workspace_ids[0], @@ -191,7 +191,7 @@ void MultithreadDirectConvCommon::copy_padding_kern( //! Process one input channel copy padding template void MultithreadDirectConvCommon:: - copy_padding_kern_stride(WorkspaceBundle bundle, + copy_padding_kern_stride(const WorkspaceBundle& bundle, const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { @@ -208,7 +208,6 @@ void MultithreadDirectConvCommon:: size_t GROUP = kern_param.filter_meta.group; get_rectified_size(kern_param, IH, IW, OH, OW, FH, FW, PH, PW, IH2, IW2, OW2); size_t padding_group_size = IH2 * IW2 * IC; - bundle.set(kern_param.workspace_ptr); //! Used for get the workspace offset size_t workspace_group_id = workspace_ids[0], @@ -235,7 +234,8 @@ void MultithreadDirectConvCommon:: //! compute one output channel template void MultithreadDirectConvCommon::do_conv_kern( - WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param, + const WorkspaceBundle& bundle, + const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernIndex& ncb_index, const kern_direct_conv_f32& fun, const CpuNDRange& workspace_ids) { size_t OH = kern_param.osz[0]; @@ -251,7 +251,6 @@ void MultithreadDirectConvCommon::do_conv_kern( size_t padding_group_size = IH2 * IW2 * IC; size_t N = kern_param.n; size_t GROUP = kern_param.filter_meta.group; - bundle.set(kern_param.workspace_ptr); size_t group_id = ncb_index.ndrange_id[0], batch_id = ncb_index.ndrange_id[1]; @@ -305,7 +304,8 @@ void MultithreadDirectConvCommon::do_conv_kern( //! compute one output channel template void MultithreadDirectConvCommon::do_conv_kern_stride( - WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param, + const WorkspaceBundle& bundle, + const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernIndex& ncb_index, const kern_direct_conv_f32_stride& fun, const CpuNDRange& workspace_ids) { @@ -323,7 +323,6 @@ void MultithreadDirectConvCommon::do_conv_kern_stride( size_t padding_group_size = IH2 * IW2 * IC; size_t GROUP = kern_param.filter_meta.group; - bundle.set(kern_param.workspace_ptr); //! Used for get the workspace offset size_t group_id = ncb_index.ndrange_id[0], diff --git a/dnn/src/arm_common/conv_bias/direct/multi_thread_common.h b/dnn/src/arm_common/conv_bias/direct/multi_thread_common.h index 55c1ea91..81480972 100644 --- a/dnn/src/arm_common/conv_bias/direct/multi_thread_common.h +++ b/dnn/src/arm_common/conv_bias/direct/multi_thread_common.h @@ -35,24 +35,24 @@ public: bool m_large_group); static WorkspaceBundle get_bundle_stride(const NCBKernSizeParam& param, bool m_large_group); - static void weight_flip_kern(WorkspaceBundle bundle, + static void weight_flip_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param, const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids); - static void copy_padding_kern(WorkspaceBundle bundle, + static void copy_padding_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param, const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids); - static void copy_padding_kern_stride(WorkspaceBundle bundle, + static void copy_padding_kern_stride(const WorkspaceBundle& bundle, const NCBKernParam& kern_param, const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids); - static void do_conv_kern(WorkspaceBundle bundle, + static void do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param, const NCBKernIndex& ncb_index, const kern_direct_conv_f32& fun, const CpuNDRange& workspace_ids); - static void do_conv_kern_stride(WorkspaceBundle bundle, + static void do_conv_kern_stride(const WorkspaceBundle& bundle, const NCBKernParam& kern_param, const NCBKernIndex& ncb_index, const kern_direct_conv_f32_stride& fun, diff --git a/dnn/src/arm_common/conv_bias/f16/algos.cpp b/dnn/src/arm_common/conv_bias/f16/algos.cpp index 7183c524..47af1e38 100644 --- a/dnn/src/arm_common/conv_bias/f16/algos.cpp +++ b/dnn/src/arm_common/conv_bias/f16/algos.cpp @@ -362,7 +362,7 @@ SmallVector ConvBiasImpl::AlgoF16Direct::get_kimpls( size_t IC = param.filter_meta.icpg; size_t OC = param.filter_meta.ocpg; size_t group = fm.group; - WorkspaceBundle wbundle = + WorkspaceBundle bundle = MultithreadDirectConvCommon::get_bundle( param, m_large_group); SmallVector ret_kerns; @@ -370,12 +370,12 @@ SmallVector ConvBiasImpl::AlgoF16Direct::get_kimpls( //! one group for better performance if (m_large_group) { //! Channel wise conv and big groups - auto exec_one_group = [wbundle](const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + auto exec_one_group = [bundle](const NCBKernParam& kern_param, + const NCBKernIndex& ncb_index) mutable { auto fm = kern_param.filter_meta; size_t IC = fm.icpg; size_t OC = fm.ocpg; - WorkspaceBundle bundle = wbundle; + bundle.set(kern_param.workspace_ptr); if (fm.should_flip) { for (size_t oc = 0; oc < OC; oc++) { MultithreadDirectConvCommon:: @@ -397,10 +397,10 @@ SmallVector ConvBiasImpl::AlgoF16Direct::get_kimpls( }; ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); } else { - WorkspaceBundle bundle = wbundle; if (fm.should_flip) { auto weight_flip = [bundle](const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + const NCBKernIndex& ncb_index) mutable { + bundle.set(kern_param.workspace_ptr); MultithreadDirectConvCommon:: weight_flip_kern(bundle, kern_param, ncb_index, ncb_index.ndrange_id); @@ -408,13 +408,15 @@ SmallVector ConvBiasImpl::AlgoF16Direct::get_kimpls( ret_kerns.push_back({weight_flip, {group, 1_z, OC}}); } auto copy_padding = [bundle](const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + const NCBKernIndex& ncb_index) mutable { + bundle.set(kern_param.workspace_ptr); MultithreadDirectConvCommon::copy_padding_kern( bundle, kern_param, ncb_index, ncb_index.ndrange_id); }; ret_kerns.push_back({copy_padding, {group, N, IC}}); auto do_conv = [bundle](const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + const NCBKernIndex& ncb_index) mutable { + bundle.set(kern_param.workspace_ptr); MultithreadDirectConvCommon::do_conv_kern( bundle, kern_param, ncb_index, fp16::conv_bias::kern_direct_f16, ncb_index.ndrange_id); @@ -488,7 +490,7 @@ ConvBiasImpl::AlgoF16DirectStride1::get_kimpls( } SWITCH_KERN(); - WorkspaceBundle wbundle = + WorkspaceBundle bundle = MultithreadDirectConvCommon::get_bundle_stride( param, m_large_group); SmallVector ret_kerns; @@ -496,13 +498,13 @@ ConvBiasImpl::AlgoF16DirectStride1::get_kimpls( //! one group for better performance if (m_large_group) { //! Channel wise conv and big groups - auto exec_one_group = [wbundle, conv_kern_function]( + auto exec_one_group = [bundle, conv_kern_function]( const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + const NCBKernIndex& ncb_index) mutable { auto fm = kern_param.filter_meta; size_t IC = fm.icpg; size_t OC = fm.ocpg; - WorkspaceBundle bundle = wbundle; + bundle.set(kern_param.workspace_ptr); for (size_t ic = 0; ic < IC; ic++) { MultithreadDirectConvCommon:: copy_padding_kern_stride(bundle, kern_param, ncb_index, @@ -517,9 +519,9 @@ ConvBiasImpl::AlgoF16DirectStride1::get_kimpls( }; ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); } else { - WorkspaceBundle bundle = wbundle; auto copy_padding = [bundle](const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + const NCBKernIndex& ncb_index) mutable { + bundle.set(kern_param.workspace_ptr); MultithreadDirectConvCommon:: copy_padding_kern_stride(bundle, kern_param, ncb_index, ncb_index.ndrange_id); @@ -527,7 +529,8 @@ ConvBiasImpl::AlgoF16DirectStride1::get_kimpls( ret_kerns.push_back({copy_padding, {group, N, IC}}); auto do_conv = [bundle, conv_kern_function]( const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + const NCBKernIndex& ncb_index) mutable { + bundle.set(kern_param.workspace_ptr); MultithreadDirectConvCommon:: do_conv_kern_stride(bundle, kern_param, ncb_index, conv_kern_function, diff --git a/dnn/src/arm_common/conv_bias/fp32/algos.cpp b/dnn/src/arm_common/conv_bias/fp32/algos.cpp index a142b9a6..63087636 100644 --- a/dnn/src/arm_common/conv_bias/fp32/algos.cpp +++ b/dnn/src/arm_common/conv_bias/fp32/algos.cpp @@ -597,7 +597,7 @@ SmallVector ConvBiasImpl::AlgoF32Direct::get_kimpls( size_t IC = param.filter_meta.icpg; size_t OC = param.filter_meta.ocpg; size_t group = fm.group; - WorkspaceBundle wbundle = + WorkspaceBundle bundle = MultithreadDirectConvCommon::get_bundle( param, m_large_group); SmallVector ret_kerns; @@ -605,12 +605,12 @@ SmallVector ConvBiasImpl::AlgoF32Direct::get_kimpls( //! one group for better performance if (m_large_group) { //! Channel wise conv and big groups - auto exec_one_group = [wbundle](const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + auto exec_one_group = [bundle](const NCBKernParam& kern_param, + const NCBKernIndex& ncb_index) mutable { auto fm = kern_param.filter_meta; size_t IC = fm.icpg; size_t OC = fm.ocpg; - WorkspaceBundle bundle = wbundle; + bundle.set(kern_param.workspace_ptr); if (fm.should_flip) { for (size_t oc = 0; oc < OC; oc++) { MultithreadDirectConvCommon::weight_flip_kern( @@ -631,23 +631,25 @@ SmallVector ConvBiasImpl::AlgoF32Direct::get_kimpls( }; ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); } else { - WorkspaceBundle bundle = wbundle; if (fm.should_flip) { auto weight_flip = [bundle](const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + const NCBKernIndex& ncb_index) mutable { + bundle.set(kern_param.workspace_ptr); MultithreadDirectConvCommon::weight_flip_kern( bundle, kern_param, ncb_index, ncb_index.ndrange_id); }; ret_kerns.push_back({weight_flip, {group, 1_z, OC}}); } auto copy_padding = [bundle](const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + const NCBKernIndex& ncb_index) mutable { + bundle.set(kern_param.workspace_ptr); MultithreadDirectConvCommon::copy_padding_kern( bundle, kern_param, ncb_index, ncb_index.ndrange_id); }; ret_kerns.push_back({copy_padding, {group, N, IC}}); auto do_conv = [bundle](const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + const NCBKernIndex& ncb_index) mutable { + bundle.set(kern_param.workspace_ptr); MultithreadDirectConvCommon::do_conv_kern( bundle, kern_param, ncb_index, fp32::conv_bias::kern_direct, ncb_index.ndrange_id); @@ -734,7 +736,7 @@ ConvBiasImpl::AlgoF32DirectStride1::get_kimpls( } SWITCH_KERN_STR1(); - WorkspaceBundle wbundle = + WorkspaceBundle bundle = MultithreadDirectConvCommon::get_bundle_stride( param, m_large_group); SmallVector ret_kerns; @@ -742,13 +744,13 @@ ConvBiasImpl::AlgoF32DirectStride1::get_kimpls( //! one group for better performance if (m_large_group) { //! Channel wise conv and big groups - auto exec_one_group = [wbundle, conv_kern_function]( + auto exec_one_group = [bundle, conv_kern_function]( const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + const NCBKernIndex& ncb_index) mutable { auto fm = kern_param.filter_meta; size_t IC = fm.icpg; size_t OC = fm.ocpg; - WorkspaceBundle bundle = wbundle; + bundle.set(kern_param.workspace_ptr); for (size_t ic = 0; ic < IC; ic++) { MultithreadDirectConvCommon:: copy_padding_kern_stride(bundle, kern_param, ncb_index, @@ -762,16 +764,17 @@ ConvBiasImpl::AlgoF32DirectStride1::get_kimpls( }; ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); } else { - WorkspaceBundle bundle = wbundle; auto copy_padding = [bundle](const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + const NCBKernIndex& ncb_index) mutable { + bundle.set(kern_param.workspace_ptr); MultithreadDirectConvCommon::copy_padding_kern_stride( bundle, kern_param, ncb_index, ncb_index.ndrange_id); }; ret_kerns.push_back({copy_padding, {group, N, IC}}); auto do_conv = [bundle, conv_kern_function]( const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + const NCBKernIndex& ncb_index) mutable { + bundle.set(kern_param.workspace_ptr); MultithreadDirectConvCommon::do_conv_kern_stride( bundle, kern_param, ncb_index, conv_kern_function, ncb_index.ndrange_id); @@ -859,7 +862,7 @@ ConvBiasImpl::AlgoF32DirectStride2::get_kimpls( } SWITCH_KERN_STR2(); - WorkspaceBundle wbundle = + WorkspaceBundle bundle = MultithreadDirectConvCommon::get_bundle_stride( param, m_large_group); SmallVector ret_kerns; @@ -867,13 +870,13 @@ ConvBiasImpl::AlgoF32DirectStride2::get_kimpls( //! one group for better performance if (m_large_group) { //! Channel wise conv and big groups - auto exec_one_group = [wbundle, conv_kern_function]( + auto exec_one_group = [bundle, conv_kern_function]( const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + const NCBKernIndex& ncb_index) mutable { auto fm = kern_param.filter_meta; size_t IC = fm.icpg; size_t OC = fm.ocpg; - WorkspaceBundle bundle = wbundle; + bundle.set(kern_param.workspace_ptr); for (size_t ic = 0; ic < IC; ic++) { MultithreadDirectConvCommon:: copy_padding_kern_stride(bundle, kern_param, ncb_index, @@ -887,16 +890,17 @@ ConvBiasImpl::AlgoF32DirectStride2::get_kimpls( }; ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); } else { - WorkspaceBundle bundle = wbundle; auto copy_padding = [bundle](const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + const NCBKernIndex& ncb_index) mutable { + bundle.set(kern_param.workspace_ptr); MultithreadDirectConvCommon::copy_padding_kern_stride( bundle, kern_param, ncb_index, ncb_index.ndrange_id); }; ret_kerns.push_back({copy_padding, {group, N, IC}}); auto do_conv = [bundle, conv_kern_function]( const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + const NCBKernIndex& ncb_index) mutable { + bundle.set(kern_param.workspace_ptr); MultithreadDirectConvCommon::do_conv_kern_stride( bundle, kern_param, ncb_index, conv_kern_function, ncb_index.ndrange_id); diff --git a/dnn/src/arm_common/conv_bias/fp32/f32_direct_nchw44_algo.cpp b/dnn/src/arm_common/conv_bias/fp32/f32_direct_nchw44_algo.cpp index fcc2aeb9..920aa183 100644 --- a/dnn/src/arm_common/conv_bias/fp32/f32_direct_nchw44_algo.cpp +++ b/dnn/src/arm_common/conv_bias/fp32/f32_direct_nchw44_algo.cpp @@ -22,7 +22,8 @@ using namespace megdnn; using namespace arm_common; using conv_fun = std::function; MIDOUT_DECL(megdnn_arm_common_conv_bias_fp32_nchw44_stride1) @@ -67,7 +68,7 @@ static WorkspaceBundle get_bundle(const ConvBiasImpl::NCBKernSizeParam& param) { }; template -static void do_conv_kern(WorkspaceBundle bundle, +static void do_conv_kern(const WorkspaceBundle& bundle, const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernIndex& ncb_index, const CpuNDRange&, const CpuNDRange&) { @@ -87,7 +88,6 @@ static void do_conv_kern(WorkspaceBundle bundle, int oh2 = 0; int ow2 = 0; get_rectified_size(kern_param, ih2, iw2, oh2, ow2); - bundle.set(kern_param.workspace_ptr); constexpr int pack_c = 4; const int batch_id = ncb_index.ndrange_id[0]; @@ -281,7 +281,6 @@ ConvBiasImpl::AlgoF32DirectNCHW44::dispatch_kerns( megdnn_assert(do_conv_fun); SmallVector ret_kerns; - WorkspaceBundle bundle = wbundle; int oh = param.osz[0]; int ic = param.filter_meta.icpg; int iw = param.isz[1]; @@ -291,10 +290,11 @@ ConvBiasImpl::AlgoF32DirectNCHW44::dispatch_kerns( CpuNDRange ncb_range = {static_cast(batch), static_cast(group), static_cast(div_ceil(oh, oh_block))}; - auto do_conv = [bundle, do_conv_fun, ncb_range]( + auto do_conv = [wbundle, do_conv_fun, ncb_range]( const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { - do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id, + const NCBKernIndex& ncb_index) mutable { + wbundle.set(kern_param.workspace_ptr); + do_conv_fun(wbundle, kern_param, ncb_index, ncb_index.ndrange_id, ncb_range); }; ret_kerns.push_back({do_conv, ncb_range}); diff --git a/dnn/src/arm_common/conv_bias/fp32/f32_direct_nchw_nchw44_algo.cpp b/dnn/src/arm_common/conv_bias/fp32/f32_direct_nchw_nchw44_algo.cpp index 8fc0962d..82dd3231 100644 --- a/dnn/src/arm_common/conv_bias/fp32/f32_direct_nchw_nchw44_algo.cpp +++ b/dnn/src/arm_common/conv_bias/fp32/f32_direct_nchw_nchw44_algo.cpp @@ -23,7 +23,8 @@ using namespace megdnn; using namespace arm_common; using conv_fun = std::function; MIDOUT_DECL(megdnn_arm_common_conv_bias_fp32_nchw_nchw44) @@ -105,10 +106,9 @@ static inline void copy_pad_src(float* sptr_base, const float* sptr_origin, sptr_base += iw2 * pad_bottom; } } -static void pack_weight(WorkspaceBundle bundle, +static void pack_weight(const WorkspaceBundle& bundle, const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernIndex& ncb_index) { - bundle.set(kern_param.workspace_ptr); const int group_id = ncb_index.ndrange_id[0]; int fh = kern_param.filter_meta.spatial[0]; int fw = kern_param.filter_meta.spatial[1]; @@ -124,7 +124,7 @@ static void pack_weight(WorkspaceBundle bundle, } template -static void do_conv_kern(WorkspaceBundle bundle, +static void do_conv_kern(const WorkspaceBundle& bundle, const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernIndex& ncb_index, const CpuNDRange&, const CpuNDRange&) { @@ -144,7 +144,6 @@ static void do_conv_kern(WorkspaceBundle bundle, int oh2 = 0; int ow2 = 0; get_rectified_size(kern_param, ih2, iw2, oh2, ow2); - bundle.set(kern_param.workspace_ptr); constexpr int pack_c = 4; const int batch_id = ncb_index.ndrange_id[0]; @@ -220,7 +219,7 @@ ConvBiasImpl::AlgoF32DirectNCHWNCHW44::dispatch_kerns( auto fm = param.filter_meta; const int batch = param.n; const int group = fm.group; - WorkspaceBundle wbundle = get_bundle(param); + WorkspaceBundle bundle = get_bundle(param); conv_fun do_conv_fun = nullptr; // NOTE: remain_w is not used to gen hash of midout for compatible with // shape runtime @@ -301,11 +300,11 @@ ConvBiasImpl::AlgoF32DirectNCHWNCHW44::dispatch_kerns( megdnn_assert(do_conv_fun); SmallVector ret_kerns; - WorkspaceBundle bundle = wbundle; int oh = param.osz[0]; int oh_block = block_helper(param.nr_threads, oh, 0); auto do_pack_weight = [bundle](const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + const NCBKernIndex& ncb_index) mutable { + bundle.set(kern_param.workspace_ptr); pack_weight(bundle, kern_param, ncb_index); }; ret_kerns.push_back({do_pack_weight, {static_cast(group)}}); @@ -314,7 +313,8 @@ ConvBiasImpl::AlgoF32DirectNCHWNCHW44::dispatch_kerns( static_cast(div_ceil(oh, oh_block))}; auto do_conv = [bundle, do_conv_fun, ncb_range]( const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + const NCBKernIndex& ncb_index) mutable { + bundle.set(kern_param.workspace_ptr); do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id, ncb_range); }; diff --git a/dnn/src/arm_common/conv_bias/int8/channel_wise_nchw44.cpp b/dnn/src/arm_common/conv_bias/int8/channel_wise_nchw44.cpp index dbe44780..c0790f10 100644 --- a/dnn/src/arm_common/conv_bias/int8/channel_wise_nchw44.cpp +++ b/dnn/src/arm_common/conv_bias/int8/channel_wise_nchw44.cpp @@ -76,7 +76,7 @@ WorkspaceBundle stride1::get_bundle( //! compute one output channel template -void stride1::do_conv_kern(WorkspaceBundle bundle, +void stride1::do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param, const NCBKernIndex& ncb_index) { size_t PH = kern_param.filter_meta.padding[0]; @@ -100,7 +100,6 @@ void stride1::do_conv_kern(WorkspaceBundle bundle, size_t thread_id = ncb_index.thread_id, batch_id = ncb_index.ndrange_id[0]; size_t group_id = ncb_index.ndrange_id[1]; - bundle.set(kern_param.workspace_ptr); int8_t* padding_src = static_cast(bundle.get(thread_id)); const int8_t* sptr = kern_param.src(batch_id, group_id, 0, pack_group_size); @@ -210,7 +209,8 @@ SmallVector stride1::get_kimpls( SmallVector ret_kerns; auto exec_one_group = [wbundle, do_conv_fun]( const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + const NCBKernIndex& ncb_index) mutable { + wbundle.set(kern_param.workspace_ptr); do_conv_fun(wbundle, kern_param, ncb_index); }; ret_kerns.push_back({exec_one_group, {N, group}}); @@ -253,7 +253,7 @@ WorkspaceBundle stride2::get_bundle( //! compute one output channel template -void stride2::do_conv_kern(WorkspaceBundle bundle, +void stride2::do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param, const NCBKernIndex& ncb_index) { size_t PH = kern_param.filter_meta.padding[0]; @@ -277,7 +277,6 @@ void stride2::do_conv_kern(WorkspaceBundle bundle, size_t thread_id = ncb_index.thread_id, batch_id = ncb_index.ndrange_id[0]; size_t group_id = ncb_index.ndrange_id[1]; - bundle.set(kern_param.workspace_ptr); int8_t* padding_src = static_cast(bundle.get(thread_id)); const int8_t* sptr = kern_param.src(batch_id, group_id, 0, pack_group_size); @@ -325,7 +324,8 @@ SmallVector stride2::get_kimpls( SmallVector ret_kerns; auto exec_one_group = [wbundle, do_conv_fun]( const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + const NCBKernIndex& ncb_index) mutable { + wbundle.set(kern_param.workspace_ptr); do_conv_fun(wbundle, kern_param, ncb_index); }; ret_kerns.push_back({exec_one_group, {N, group}}); diff --git a/dnn/src/arm_common/conv_bias/int8/channel_wise_nchw44.h b/dnn/src/arm_common/conv_bias/int8/channel_wise_nchw44.h index 46efaaa4..d869a6d1 100644 --- a/dnn/src/arm_common/conv_bias/int8/channel_wise_nchw44.h +++ b/dnn/src/arm_common/conv_bias/int8/channel_wise_nchw44.h @@ -21,7 +21,7 @@ using NCBKernSizeParam = fallback::ConvBiasImpl::NCBKernSizeParam; using NCBKernParam = fallback::ConvBiasImpl::NCBKernParam; using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex; -using conv_fun = std::function; @@ -32,7 +32,7 @@ bool is_available(const NCBKernSizeParam& param); WorkspaceBundle get_bundle(const NCBKernSizeParam& param); template -void do_conv_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param, +void do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param, const NCBKernIndex& ncb_index); SmallVector get_kimpls(const NCBKernSizeParam& param); @@ -44,7 +44,7 @@ bool is_available(const NCBKernSizeParam& param); WorkspaceBundle get_bundle(const NCBKernSizeParam& param); template -void do_conv_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param, +void do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param, const NCBKernIndex& ncb_index); SmallVector get_kimpls(const NCBKernSizeParam& param); diff --git a/dnn/src/arm_common/conv_bias/int8/direct_dotprod_nchw44_algo.cpp b/dnn/src/arm_common/conv_bias/int8/direct_dotprod_nchw44_algo.cpp index 27fbde62..f7b08374 100644 --- a/dnn/src/arm_common/conv_bias/int8/direct_dotprod_nchw44_algo.cpp +++ b/dnn/src/arm_common/conv_bias/int8/direct_dotprod_nchw44_algo.cpp @@ -24,9 +24,10 @@ using namespace arm_common; MIDOUT_DECL(megdnn_arm_common_conv_bias_int8) -using direct_fun = std::function; +using direct_fun = + std::function; namespace { @@ -71,7 +72,7 @@ static WorkspaceBundle get_bundle(const ConvBiasImpl::NCBKernSizeParam& param) { template -static void conv_kern(WorkspaceBundle bundle, +static void conv_kern(const WorkspaceBundle& bundle, const ConvBiasImpl::NCBKernParam& ncb_param, const ConvBiasImpl::NCBKernIndex& ncb_index) { const int OH = ncb_param.osz[0]; @@ -93,7 +94,6 @@ static void conv_kern(WorkspaceBundle bundle, constexpr int IC_PACK_SIZE = 4; constexpr int OC_PACK_SIZE = 4; - bundle.set(ncb_param.workspace_ptr); const int batch_id = ncb_index.ndrange_id[0]; const int group_id = ncb_index.ndrange_id[1]; @@ -326,8 +326,10 @@ ConvBiasImpl::AlgoDotS8Direct_NCHW44::dispatch_kerns( IC * IW * sizeof(int8_t) * 2); size_t oh_tiles = static_cast(div_ceil(OH, oh_tile_size)); - auto do_conv = [wbundle, kernel](const NCBKernParam& ncb_param, - const NCBKernIndex& ncb_index) { + auto do_conv = [wbundle, kernel]( + const NCBKernParam& ncb_param, + const NCBKernIndex& ncb_index) mutable { + wbundle.set(ncb_param.workspace_ptr); kernel(wbundle, ncb_param, std::move(ncb_index)); }; diff --git a/dnn/src/arm_common/conv_bias/int8/direct_nchw44_algo.cpp b/dnn/src/arm_common/conv_bias/int8/direct_nchw44_algo.cpp index 0d82c2bf..3209d428 100644 --- a/dnn/src/arm_common/conv_bias/int8/direct_nchw44_algo.cpp +++ b/dnn/src/arm_common/conv_bias/int8/direct_nchw44_algo.cpp @@ -23,7 +23,8 @@ using namespace megdnn; using namespace arm_common; using conv_fun = std::function; MIDOUT_DECL(megdnn_arm_common_conv_bias_int8_nchw44) @@ -64,7 +65,7 @@ static WorkspaceBundle get_bundle(const ConvBiasImpl::NCBKernSizeParam& param) { } }; -static void copy_padding_kern(WorkspaceBundle bundle, +static void copy_padding_kern(const WorkspaceBundle& bundle, const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { @@ -78,7 +79,6 @@ static void copy_padding_kern(WorkspaceBundle bundle, int IH2, IW2; get_rectified_size(kern_param, IH2, IW2); int padding_group_size = IH2 * IW2 * IC; - bundle.set(kern_param.workspace_ptr); //! Used for get the workspace offset constexpr int pack_ic = 4; constexpr int expend_element = 4; @@ -128,7 +128,7 @@ static void copy_padding_kern(WorkspaceBundle bundle, template -static void do_conv_kern(WorkspaceBundle bundle, +static void do_conv_kern(const WorkspaceBundle& bundle, const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids, @@ -153,7 +153,6 @@ static void do_conv_kern(WorkspaceBundle bundle, op = Op(scale_bias, scale_dst); } size_t padding_group_size = IH2 * IW2 * IC; - bundle.set(kern_param.workspace_ptr); constexpr size_t pack_c = 4; constexpr size_t src_expand_size = 4; @@ -375,7 +374,6 @@ ConvBiasImpl::AlgoS8DirectNCHW44::dispatch_kerns( megdnn_assert(do_conv_fun); SmallVector ret_kerns; - WorkspaceBundle bundle = wbundle; constexpr size_t pack_oc = 4; size_t oc_step = pack_oc; @@ -384,28 +382,31 @@ ConvBiasImpl::AlgoS8DirectNCHW44::dispatch_kerns( } if (group == 1) { CpuNDRange ncb_range = {N, group, div_ceil(OC, oc_step)}; - auto copy_padding = [bundle](const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { - copy_padding_kern(bundle, kern_param, ncb_index, + auto copy_padding = [wbundle](const NCBKernParam& kern_param, + const NCBKernIndex& ncb_index) mutable { + wbundle.set(kern_param.workspace_ptr); + copy_padding_kern(wbundle, kern_param, ncb_index, ncb_index.ndrange_id); }; constexpr size_t pack_ic = 4; ret_kerns.push_back({copy_padding, {N, group, div_ceil(IC, pack_ic)}}); - auto do_conv = [bundle, do_conv_fun, ncb_range]( + auto do_conv = [wbundle, do_conv_fun, ncb_range]( const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { - do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id, + const NCBKernIndex& ncb_index) mutable { + wbundle.set(kern_param.workspace_ptr); + do_conv_fun(wbundle, kern_param, ncb_index, ncb_index.ndrange_id, ncb_range); }; ret_kerns.push_back({do_conv, ncb_range}); } else { CpuNDRange ncb_range = {N, group, 1}; - auto do_conv = [bundle, do_conv_fun, ncb_range]( + auto do_conv = [wbundle, do_conv_fun, ncb_range]( const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { - copy_padding_kern(bundle, kern_param, ncb_index, + const NCBKernIndex& ncb_index) mutable { + wbundle.set(kern_param.workspace_ptr); + copy_padding_kern(wbundle, kern_param, ncb_index, {0, ncb_index.thread_id, 0}); - do_conv_fun(bundle, kern_param, ncb_index, + do_conv_fun(wbundle, kern_param, ncb_index, {0, ncb_index.thread_id, 0}, ncb_range); }; ret_kerns.push_back({do_conv, ncb_range}); diff --git a/dnn/src/arm_common/conv_bias/int8/direct_nchw_nchw44_algo.cpp b/dnn/src/arm_common/conv_bias/int8/direct_nchw_nchw44_algo.cpp index 4fa25bfe..93999a61 100644 --- a/dnn/src/arm_common/conv_bias/int8/direct_nchw_nchw44_algo.cpp +++ b/dnn/src/arm_common/conv_bias/int8/direct_nchw_nchw44_algo.cpp @@ -22,7 +22,8 @@ using namespace megdnn; using namespace arm_common; using conv_fun = std::function; MIDOUT_DECL(megdnn_arm_common_conv_bias_int8_nchw_nchw44) @@ -77,7 +78,7 @@ static WorkspaceBundle get_bundle(const ConvBiasImpl::NCBKernSizeParam& param) { return {nullptr, {src_size, weight_size, tmp_size * param.nr_threads}}; }; -static void copy_padding_kern(WorkspaceBundle bundle, +static void copy_padding_kern(const WorkspaceBundle& bundle, const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { @@ -92,7 +93,6 @@ static void copy_padding_kern(WorkspaceBundle bundle, int ih2, iw2, oh2, ow2; get_rectified_size(kern_param, ih2, iw2, oh2, ow2); int padding_group_size = ih2 * iw2 * ic; - bundle.set(kern_param.workspace_ptr); //! Used for get the workspace offset const int src_expand = stride_h == 2 ? 4 : 16; @@ -124,10 +124,9 @@ static void copy_padding_kern(WorkspaceBundle bundle, iw, iw2, pw, nullptr); } } -static void pack_weight(WorkspaceBundle bundle, +static void pack_weight(const WorkspaceBundle& bundle, const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernIndex& ncb_index) { - bundle.set(kern_param.workspace_ptr); const int group_id = ncb_index.ndrange_id[0]; int fh = kern_param.filter_meta.spatial[0]; int fw = kern_param.filter_meta.spatial[1]; @@ -151,7 +150,7 @@ static void pack_weight(WorkspaceBundle bundle, } } template -static void do_conv_kern(WorkspaceBundle bundle, +static void do_conv_kern(const WorkspaceBundle& bundle, const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids, @@ -177,7 +176,6 @@ static void do_conv_kern(WorkspaceBundle bundle, op = Op(scale_bias, scale_dst); } int padding_group_size = ih2 * iw2 * ic; - bundle.set(kern_param.workspace_ptr); constexpr int pack_c = 4; constexpr int src_expand_size = stride == 2 ? 4 : 16; @@ -258,7 +256,7 @@ ConvBiasImpl::AlgoS8DirectNCHWNCHW44::dispatch_kerns( size_t N = param.n; size_t OC = fm.ocpg; size_t group = fm.group; - WorkspaceBundle wbundle = get_bundle(param); + WorkspaceBundle bundle = get_bundle(param); conv_fun do_conv_fun = nullptr; // NOTE: remain_w is not used to gen hash of midout for compatible with changing // shape runtime @@ -342,18 +340,19 @@ ConvBiasImpl::AlgoS8DirectNCHWNCHW44::dispatch_kerns( megdnn_assert(do_conv_fun); SmallVector ret_kerns; - WorkspaceBundle bundle = wbundle; constexpr size_t pack_oc = 8; size_t oc_step = pack_oc; auto copy_padding = [bundle](const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + const NCBKernIndex& ncb_index) mutable { + bundle.set(kern_param.workspace_ptr); copy_padding_kern(bundle, kern_param, ncb_index, ncb_index.ndrange_id); }; ret_kerns.push_back({copy_padding, {N, group, fm.icpg}}); auto do_pack_weight = [bundle](const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + const NCBKernIndex& ncb_index) mutable { + bundle.set(kern_param.workspace_ptr); pack_weight(bundle, kern_param, ncb_index); }; ret_kerns.push_back({do_pack_weight, {static_cast(group)}}); @@ -361,7 +360,8 @@ ConvBiasImpl::AlgoS8DirectNCHWNCHW44::dispatch_kerns( CpuNDRange ncb_range = {N, group, div_ceil(OC, oc_step)}; auto do_conv = [bundle, do_conv_fun, ncb_range]( const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + const NCBKernIndex& ncb_index) mutable { + bundle.set(kern_param.workspace_ptr); do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id, ncb_range); }; diff --git a/dnn/src/arm_common/conv_bias/int8/dot_direct_nchw_nchw44_algo.cpp b/dnn/src/arm_common/conv_bias/int8/dot_direct_nchw_nchw44_algo.cpp index fb86977d..798dc967 100644 --- a/dnn/src/arm_common/conv_bias/int8/dot_direct_nchw_nchw44_algo.cpp +++ b/dnn/src/arm_common/conv_bias/int8/dot_direct_nchw_nchw44_algo.cpp @@ -22,7 +22,8 @@ using namespace megdnn; using namespace arm_common; using conv_fun = std::function; MIDOUT_DECL(megdnn_arm_common_conv_bias_int8_nchw44_dot) @@ -82,7 +83,7 @@ static WorkspaceBundle get_bundle(const ConvBiasImpl::NCBKernSizeParam& param) { temp_size * param.nr_threads}}; }; -void do_weight_trans(WorkspaceBundle bundle, +void do_weight_trans(const WorkspaceBundle& bundle, const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernIndex&, const CpuNDRange&) { const int ic = kern_param.filter_meta.icpg; @@ -90,7 +91,6 @@ void do_weight_trans(WorkspaceBundle bundle, const int fh = kern_param.filter_meta.spatial[0]; const int fw = kern_param.filter_meta.spatial[1]; const int fw2 = round_up(fw, 4); - bundle.set(kern_param.workspace_ptr); auto packed_weight = reinterpret_cast(bundle.get(1)); auto origin_weight = kern_param.filter(); pack_weight_int8_nchw_nchw44_dot(packed_weight, origin_weight, oc, ic, fh, @@ -98,7 +98,7 @@ void do_weight_trans(WorkspaceBundle bundle, } template -static void do_conv_kern(WorkspaceBundle bundle, +static void do_conv_kern(const WorkspaceBundle& bundle, const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernIndex& ncb_index, const CpuNDRange&, const CpuNDRange&) { @@ -117,7 +117,6 @@ static void do_conv_kern(WorkspaceBundle bundle, int ih2 = 0; int iw2 = 0; get_rectified_size(kern_param, ih2, iw2); - bundle.set(kern_param.workspace_ptr); constexpr int pack_c = 4; const int batch_id = ncb_index.ndrange_id[0]; @@ -205,7 +204,7 @@ ConvBiasImpl::AlgoDotS8DirectNCHWNCHW44::dispatch_kerns( auto fm = param.filter_meta; const int batch = param.n; const int group = fm.group; - WorkspaceBundle wbundle = get_bundle(param); + WorkspaceBundle bundle = get_bundle(param); conv_fun do_conv_fun = nullptr; // NOTE: remain_w is not used to gen hash of midout for compatible with // shape runtime @@ -288,7 +287,6 @@ ConvBiasImpl::AlgoDotS8DirectNCHWNCHW44::dispatch_kerns( megdnn_assert(do_conv_fun); SmallVector ret_kerns; - WorkspaceBundle bundle = wbundle; int oh = param.osz[0]; int ic = param.filter_meta.icpg; int iw = param.isz[1]; @@ -302,14 +300,16 @@ ConvBiasImpl::AlgoDotS8DirectNCHWNCHW44::dispatch_kerns( static_cast(div_ceil(oh, oh_block))}; auto do_trans_weight = [bundle](const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + const NCBKernIndex& ncb_index) mutable { + bundle.set(kern_param.workspace_ptr); do_weight_trans(bundle, kern_param, ncb_index, ncb_index.ndrange_id); }; ret_kerns.push_back({do_trans_weight, {1}}); auto do_conv = [bundle, do_conv_fun, ncb_range]( const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + const NCBKernIndex& ncb_index) mutable { + bundle.set(kern_param.workspace_ptr); do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id, ncb_range); }; diff --git a/dnn/src/arm_common/conv_bias/int8/stride1.cpp b/dnn/src/arm_common/conv_bias/int8/stride1.cpp index 43ab1ff4..a8a0c6f9 100644 --- a/dnn/src/arm_common/conv_bias/int8/stride1.cpp +++ b/dnn/src/arm_common/conv_bias/int8/stride1.cpp @@ -107,7 +107,8 @@ WorkspaceBundle direct_int8_stride1::get_bundle( } //! Process one input channel copy padding void direct_int8_stride1::copy_padding_kern( - WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param, + const WorkspaceBundle& bundle, + const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { size_t IH = kern_param.isz[0]; @@ -121,7 +122,6 @@ void direct_int8_stride1::copy_padding_kern( get_rectified_size(kern_param, IH2, IW2, OH2, OW2); bool need_src_copy_var = need_src_copy(kern_param); size_t padding_group_size = IH2 * IW2 * IC; - bundle.set(kern_param.workspace_ptr); //! Used for get the workspace offset size_t workspace_group_id = workspace_ids[0], workspace_batch_id = workspace_ids[1], channel_id = workspace_ids[2], @@ -145,7 +145,7 @@ void direct_int8_stride1::copy_padding_kern( }; //! compute one output channel template -void direct_int8_stride1::do_conv_kern(WorkspaceBundle bundle, +void direct_int8_stride1::do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param, const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { @@ -170,7 +170,6 @@ void direct_int8_stride1::do_conv_kern(WorkspaceBundle bundle, op = Op(scale_bias, scale_dst); } size_t padding_group_size = IH2 * IW2 * IC; - bundle.set(kern_param.workspace_ptr); //! Used for get the workspace offset size_t workspace_group_id = workspace_ids[0], workspace_batch_id = workspace_ids[1], oc = workspace_ids[2]; @@ -263,7 +262,7 @@ SmallVector direct_int8_stride1::get_kimpls( size_t IC = param.filter_meta.icpg; size_t OC = param.filter_meta.ocpg; size_t group = fm.group; - WorkspaceBundle wbundle = get_bundle(param, m_large_group); + WorkspaceBundle bundle = get_bundle(param, m_large_group); conv_fun do_conv_fun = nullptr; #define DO_CONV_KERN_FUN(filter, bias_mode, op) \ @@ -324,13 +323,13 @@ SmallVector direct_int8_stride1::get_kimpls( SmallVector ret_kerns; if (m_large_group) { - auto exec_one_group = [wbundle, do_conv_fun]( + auto exec_one_group = [bundle, do_conv_fun]( const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + const NCBKernIndex& ncb_index) mutable { auto fm = kern_param.filter_meta; size_t IC = fm.icpg; size_t OC = fm.ocpg; - WorkspaceBundle bundle = wbundle; + bundle.set(kern_param.workspace_ptr); for (size_t ic = 0; ic < IC; ic++) { copy_padding_kern(bundle, kern_param, ncb_index, {ncb_index.thread_id, 0, ic}); @@ -342,15 +341,17 @@ SmallVector direct_int8_stride1::get_kimpls( }; ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); } else { - WorkspaceBundle bundle = wbundle; auto copy_padding = [bundle](const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + const NCBKernIndex& ncb_index) mutable { + bundle.set(kern_param.workspace_ptr); copy_padding_kern(bundle, kern_param, ncb_index, ncb_index.ndrange_id); }; ret_kerns.push_back({copy_padding, {group, N, IC}}); - auto do_conv = [bundle, do_conv_fun](const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + auto do_conv = [bundle, do_conv_fun]( + const NCBKernParam& kern_param, + const NCBKernIndex& ncb_index) mutable { + bundle.set(kern_param.workspace_ptr); do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id); }; ret_kerns.push_back({do_conv, {group, N, OC}}); diff --git a/dnn/src/arm_common/conv_bias/int8/stride1.h b/dnn/src/arm_common/conv_bias/int8/stride1.h index a56db1ed..b78157bb 100644 --- a/dnn/src/arm_common/conv_bias/int8/stride1.h +++ b/dnn/src/arm_common/conv_bias/int8/stride1.h @@ -21,19 +21,19 @@ using NCBKernParam = fallback::ConvBiasImpl::NCBKernParam; using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex; using conv_fun = std::function; bool can_conv_direct_stride1_int8(const NCBKernSizeParam& param); WorkspaceBundle get_bundle(const NCBKernSizeParam& param, bool m_large_group); -void copy_padding_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param, +void copy_padding_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param, const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids); template -void do_conv_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param, +void do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param, const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids); diff --git a/dnn/src/arm_common/conv_bias/int8/stride1_dotprod.cpp b/dnn/src/arm_common/conv_bias/int8/stride1_dotprod.cpp index 051cd95a..603fedbb 100644 --- a/dnn/src/arm_common/conv_bias/int8/stride1_dotprod.cpp +++ b/dnn/src/arm_common/conv_bias/int8/stride1_dotprod.cpp @@ -109,7 +109,8 @@ WorkspaceBundle direct_dotprod_int8_stride1::get_bundle( } //! Process one input channel copy padding void direct_dotprod_int8_stride1::copy_padding_kern( - WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param, + const WorkspaceBundle& bundle, + const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { size_t IH = kern_param.isz[0]; @@ -123,7 +124,6 @@ void direct_dotprod_int8_stride1::copy_padding_kern( get_rectified_size(kern_param, IH2, IW2, OH2, OW2); bool need_src_copy_var = need_src_copy(kern_param); size_t padding_group_size = IH2 * IW2 * IC; - bundle.set(kern_param.workspace_ptr); //! Used for get the workspace offset size_t group_id = ncb_index.ndrange_id[0], @@ -148,7 +148,7 @@ void direct_dotprod_int8_stride1::copy_padding_kern( //! compute one output channel template void direct_dotprod_int8_stride1::do_conv_kern( - WorkspaceBundle bundle, const NCBKernParam& kern_param, + const WorkspaceBundle& bundle, const NCBKernParam& kern_param, const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { size_t OH = kern_param.osz[0]; size_t OW = kern_param.osz[1]; @@ -172,7 +172,6 @@ void direct_dotprod_int8_stride1::do_conv_kern( } size_t padding_group_size = IH2 * IW2 * IC; - bundle.set(kern_param.workspace_ptr); //! Used for get the workspace offset size_t workspace_group_id = workspace_ids[0], workspace_batch_id = workspace_ids[1], oc = workspace_ids[2]; @@ -264,7 +263,7 @@ SmallVector direct_dotprod_int8_stride1::get_kimpls( size_t IC = param.filter_meta.icpg; size_t OC = param.filter_meta.ocpg; size_t group = fm.group; - WorkspaceBundle wbundle = get_bundle(param, m_large_group); + WorkspaceBundle bundle = get_bundle(param, m_large_group); conv_fun do_conv_fun = nullptr; #define DO_CONV_KERN_FUN(filter, bias_mode, op) \ @@ -325,13 +324,13 @@ SmallVector direct_dotprod_int8_stride1::get_kimpls( SmallVector ret_kerns; if (m_large_group) { - auto exec_one_group = [wbundle, do_conv_fun]( + auto exec_one_group = [bundle, do_conv_fun]( const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + const NCBKernIndex& ncb_index) mutable { auto fm = kern_param.filter_meta; size_t IC = fm.icpg; size_t OC = fm.ocpg; - WorkspaceBundle bundle = wbundle; + bundle.set(kern_param.workspace_ptr); for (size_t ic = 0; ic < IC; ic++) { copy_padding_kern(bundle, kern_param, ncb_index, {ncb_index.thread_id, 0, ic}); @@ -343,15 +342,17 @@ SmallVector direct_dotprod_int8_stride1::get_kimpls( }; ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); } else { - WorkspaceBundle bundle = wbundle; auto copy_padding = [bundle](const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + const NCBKernIndex& ncb_index) mutable { + bundle.set(kern_param.workspace_ptr); copy_padding_kern(bundle, kern_param, ncb_index, ncb_index.ndrange_id); }; ret_kerns.push_back({copy_padding, {group, N, IC}}); - auto do_conv = [bundle, do_conv_fun](const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + auto do_conv = [bundle, do_conv_fun]( + const NCBKernParam& kern_param, + const NCBKernIndex& ncb_index) mutable { + bundle.set(kern_param.workspace_ptr); do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id); }; ret_kerns.push_back({do_conv, {group, N, OC}}); diff --git a/dnn/src/arm_common/conv_bias/int8/stride1_dotprod.h b/dnn/src/arm_common/conv_bias/int8/stride1_dotprod.h index 7c32328e..cf436726 100644 --- a/dnn/src/arm_common/conv_bias/int8/stride1_dotprod.h +++ b/dnn/src/arm_common/conv_bias/int8/stride1_dotprod.h @@ -20,19 +20,20 @@ using NCBKernParam = fallback::ConvBiasImpl::NCBKernParam; using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex; using conv_fun = std::function; bool can_conv_direct_stride1_int8(const NCBKernSizeParam& param); WorkspaceBundle get_bundle(const NCBKernSizeParam& param, bool m_large_group); -void copy_padding_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param, +void copy_padding_kern(const WorkspaceBundle& bundle, + const NCBKernParam& kern_param, const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids); template -void do_conv_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param, +void do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param, const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids); diff --git a/dnn/src/arm_common/conv_bias/int8/stride2.cpp b/dnn/src/arm_common/conv_bias/int8/stride2.cpp index 22db808d..1b6bd6ad 100644 --- a/dnn/src/arm_common/conv_bias/int8/stride2.cpp +++ b/dnn/src/arm_common/conv_bias/int8/stride2.cpp @@ -115,7 +115,8 @@ WorkspaceBundle direct_int8_stride2::get_bundle( } //! Process one input channel copy padding void direct_int8_stride2::copy_padding_kern( - WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param, + const WorkspaceBundle& bundle, + const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { size_t IH = kern_param.isz[0]; @@ -129,7 +130,6 @@ void direct_int8_stride2::copy_padding_kern( get_rectified_size(kern_param, IH2, IW2, OH2, OW2); bool need_src_copy_var = need_src_copy(kern_param); size_t padding_group_size = IH2 * IW2 * IC; - bundle.set(kern_param.workspace_ptr); //! Used for get the workspace offset size_t group_id = ncb_index.ndrange_id[0], @@ -153,7 +153,7 @@ void direct_int8_stride2::copy_padding_kern( }; //! compute one output channel template -void direct_int8_stride2::do_conv_kern(WorkspaceBundle bundle, +void direct_int8_stride2::do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param, const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { @@ -178,7 +178,6 @@ void direct_int8_stride2::do_conv_kern(WorkspaceBundle bundle, op = Op(scale_bias, scale_dst); } size_t padding_group_size = IH2 * IW2 * IC; - bundle.set(kern_param.workspace_ptr); //! Used for get the workspace offset size_t workspace_group_id = workspace_ids[0], workspace_batch_id = workspace_ids[1], oc = workspace_ids[2]; @@ -270,7 +269,7 @@ SmallVector direct_int8_stride2::get_kimpls( size_t IC = param.filter_meta.icpg; size_t OC = param.filter_meta.ocpg; size_t group = fm.group; - WorkspaceBundle wbundle = get_bundle(param, m_large_group); + WorkspaceBundle bundle = get_bundle(param, m_large_group); conv_fun do_conv_fun = nullptr; #define DO_CONV_KERN_FUN(filter, bias_mode, op) \ @@ -331,13 +330,13 @@ SmallVector direct_int8_stride2::get_kimpls( SmallVector ret_kerns; if (m_large_group) { - auto exec_one_group = [wbundle, do_conv_fun]( + auto exec_one_group = [bundle, do_conv_fun]( const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + const NCBKernIndex& ncb_index) mutable { auto fm = kern_param.filter_meta; size_t IC = fm.icpg; size_t OC = fm.ocpg; - WorkspaceBundle bundle = wbundle; + bundle.set(kern_param.workspace_ptr); for (size_t ic = 0; ic < IC; ic++) { copy_padding_kern(bundle, kern_param, ncb_index, {ncb_index.thread_id, 0, ic}); @@ -349,15 +348,17 @@ SmallVector direct_int8_stride2::get_kimpls( }; ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); } else { - WorkspaceBundle bundle = wbundle; auto copy_padding = [bundle](const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + const NCBKernIndex& ncb_index) mutable { + bundle.set(kern_param.workspace_ptr); copy_padding_kern(bundle, kern_param, ncb_index, ncb_index.ndrange_id); }; ret_kerns.push_back({copy_padding, {group, N, IC}}); - auto do_conv = [bundle, do_conv_fun](const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + auto do_conv = [bundle, do_conv_fun]( + const NCBKernParam& kern_param, + const NCBKernIndex& ncb_index) mutable { + bundle.set(kern_param.workspace_ptr); do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id); }; ret_kerns.push_back({do_conv, {group, N, OC}}); diff --git a/dnn/src/arm_common/conv_bias/int8/stride2.h b/dnn/src/arm_common/conv_bias/int8/stride2.h index 7509b425..4112c7d4 100644 --- a/dnn/src/arm_common/conv_bias/int8/stride2.h +++ b/dnn/src/arm_common/conv_bias/int8/stride2.h @@ -21,18 +21,19 @@ using NCBKernParam = fallback::ConvBiasImpl::NCBKernParam; using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex; using conv_fun = std::function; bool can_conv_direct_stride2_int8(const NCBKernSizeParam& param); WorkspaceBundle get_bundle(const NCBKernSizeParam& param, bool m_large_group); -void copy_padding_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param, +void copy_padding_kern(const WorkspaceBundle& bundle, + const NCBKernParam& kern_param, const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids); template -void do_conv_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param, +void do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param, const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids); diff --git a/dnn/src/arm_common/conv_bias/int8/stride2_dotprod.cpp b/dnn/src/arm_common/conv_bias/int8/stride2_dotprod.cpp index 90344fc9..459afc7c 100644 --- a/dnn/src/arm_common/conv_bias/int8/stride2_dotprod.cpp +++ b/dnn/src/arm_common/conv_bias/int8/stride2_dotprod.cpp @@ -116,7 +116,8 @@ WorkspaceBundle direct_dotprod_int8_stride2::get_bundle( } //! Process one input channel copy padding void direct_dotprod_int8_stride2::copy_padding_kern( - WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param, + const WorkspaceBundle& bundle, + const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { size_t IH = kern_param.isz[0]; @@ -130,7 +131,6 @@ void direct_dotprod_int8_stride2::copy_padding_kern( get_rectified_size(kern_param, IH2, IW2, OH2, OW2); bool need_src_copy_var = need_src_copy(kern_param); size_t padding_group_size = IH2 * IW2 * IC; - bundle.set(kern_param.workspace_ptr); //! Used for get the workspace offset size_t workspace_group_id = workspace_ids[0], @@ -154,7 +154,7 @@ void direct_dotprod_int8_stride2::copy_padding_kern( //! compute one output channel template void direct_dotprod_int8_stride2::do_conv_kern( - WorkspaceBundle bundle, const NCBKernParam& kern_param, + const WorkspaceBundle& bundle, const NCBKernParam& kern_param, const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { size_t OH = kern_param.osz[0]; size_t OW = kern_param.osz[1]; @@ -177,7 +177,6 @@ void direct_dotprod_int8_stride2::do_conv_kern( op = Op(scale_bias, scale_dst); } size_t padding_group_size = IH2 * IW2 * IC; - bundle.set(kern_param.workspace_ptr); //! Used for get the workspace offset size_t workspace_group_id = workspace_ids[0], workspace_batch_id = workspace_ids[1], oc = workspace_ids[2]; @@ -270,7 +269,7 @@ SmallVector direct_dotprod_int8_stride2::get_kimpls( size_t IC = param.filter_meta.icpg; size_t OC = param.filter_meta.ocpg; size_t group = fm.group; - WorkspaceBundle wbundle = get_bundle(param, m_large_group); + WorkspaceBundle bundle = get_bundle(param, m_large_group); conv_fun do_conv_fun = nullptr; #define DO_CONV_KERN_FUN(filter, bias_mode, op) \ @@ -331,13 +330,13 @@ SmallVector direct_dotprod_int8_stride2::get_kimpls( SmallVector ret_kerns; if (m_large_group) { - auto exec_one_group = [wbundle, do_conv_fun]( + auto exec_one_group = [bundle, do_conv_fun]( const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + const NCBKernIndex& ncb_index) mutable { auto fm = kern_param.filter_meta; size_t IC = fm.icpg; size_t OC = fm.ocpg; - WorkspaceBundle bundle = wbundle; + bundle.set(kern_param.workspace_ptr); for (size_t ic = 0; ic < IC; ic++) { copy_padding_kern(bundle, kern_param, ncb_index, {ncb_index.thread_id, 0, ic}); @@ -349,15 +348,17 @@ SmallVector direct_dotprod_int8_stride2::get_kimpls( }; ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); } else { - WorkspaceBundle bundle = wbundle; auto copy_padding = [bundle](const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + const NCBKernIndex& ncb_index) mutable { + bundle.set(kern_param.workspace_ptr); copy_padding_kern(bundle, kern_param, ncb_index, ncb_index.ndrange_id); }; ret_kerns.push_back({copy_padding, {group, N, IC}}); - auto do_conv = [bundle, do_conv_fun](const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + auto do_conv = [bundle, do_conv_fun]( + const NCBKernParam& kern_param, + const NCBKernIndex& ncb_index) mutable { + bundle.set(kern_param.workspace_ptr); do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id); }; ret_kerns.push_back({do_conv, {group, N, OC}}); diff --git a/dnn/src/arm_common/conv_bias/int8/stride2_dotprod.h b/dnn/src/arm_common/conv_bias/int8/stride2_dotprod.h index 639cb224..d36e0843 100644 --- a/dnn/src/arm_common/conv_bias/int8/stride2_dotprod.h +++ b/dnn/src/arm_common/conv_bias/int8/stride2_dotprod.h @@ -21,19 +21,20 @@ using NCBKernParam = fallback::ConvBiasImpl::NCBKernParam; using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex; using conv_fun = std::function; bool can_conv_direct_stride2_int8(const NCBKernSizeParam& param); WorkspaceBundle get_bundle(const NCBKernSizeParam& param, bool m_large_group); -void copy_padding_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param, +void copy_padding_kern(const WorkspaceBundle& bundle, + const NCBKernParam& kern_param, const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids); template -void do_conv_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param, +void do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param, const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids); diff --git a/dnn/src/arm_common/conv_bias/int8x8x16/algos.cpp b/dnn/src/arm_common/conv_bias/int8x8x16/algos.cpp index 6b99902f..c8ba37e2 100644 --- a/dnn/src/arm_common/conv_bias/int8x8x16/algos.cpp +++ b/dnn/src/arm_common/conv_bias/int8x8x16/algos.cpp @@ -139,7 +139,8 @@ size_t ConvBiasImpl::AlgoI8x8x16Direct::get_workspace( } //! Process one input channel copy padding void ConvBiasImpl::AlgoI8x8x16Direct::copy_padding_kern( - WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param, + const WorkspaceBundle& bundle, + const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { size_t IH = kern_param.isz[0]; @@ -154,7 +155,6 @@ void ConvBiasImpl::AlgoI8x8x16Direct::copy_padding_kern( get_rectified_size_str1(IH, IW, OH, OW, PH, PW, IH2, IW2, OH2, OW2); bool need_src_copy_var = need_src_copy_str1(kern_param); size_t padding_group_size = IH2 * IW2 * IC; - bundle.set(kern_param.workspace_ptr); //! Used for get the workspace offset size_t workspace_group_id = workspace_ids[0], @@ -178,7 +178,7 @@ void ConvBiasImpl::AlgoI8x8x16Direct::copy_padding_kern( }; //! compute one output channel void ConvBiasImpl::AlgoI8x8x16Direct::do_conv_kern( - WorkspaceBundle bundle, const NCBKernParam& kern_param, + const WorkspaceBundle& bundle, const NCBKernParam& kern_param, const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { size_t OH = kern_param.osz[0]; size_t OW = kern_param.osz[1]; @@ -214,7 +214,6 @@ void ConvBiasImpl::AlgoI8x8x16Direct::do_conv_kern( fun_add_to_dst = conv_bias::conv_direct_5x5_sc_int8_int8_int16; } - bundle.set(kern_param.workspace_ptr); //! Used for get the workspace offset size_t workspace_group_id = workspace_ids[0], workspace_batch_id = workspace_ids[1], oc = workspace_ids[2]; @@ -256,15 +255,15 @@ SmallVector ConvBiasImpl::AlgoI8x8x16Direct::get_kimpls( size_t IC = param.filter_meta.icpg; size_t OC = param.filter_meta.ocpg; size_t group = fm.group; - WorkspaceBundle wbundle = get_bundle(param); + WorkspaceBundle bundle = get_bundle(param); SmallVector ret_kerns; if (m_large_group) { - auto exec_one_group = [wbundle](const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + auto exec_one_group = [bundle](const NCBKernParam& kern_param, + const NCBKernIndex& ncb_index) mutable { auto fm = kern_param.filter_meta; size_t IC = fm.icpg; size_t OC = fm.ocpg; - WorkspaceBundle bundle = wbundle; + bundle.set(kern_param.workspace_ptr); for (size_t ic = 0; ic < IC; ic++) { copy_padding_kern(bundle, kern_param, ncb_index, {ncb_index.thread_id, 0, ic}); @@ -276,15 +275,16 @@ SmallVector ConvBiasImpl::AlgoI8x8x16Direct::get_kimpls( }; ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); } else { - WorkspaceBundle bundle = wbundle; auto copy_padding = [bundle](const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + const NCBKernIndex& ncb_index) mutable { + bundle.set(kern_param.workspace_ptr); copy_padding_kern(bundle, kern_param, ncb_index, ncb_index.ndrange_id); }; ret_kerns.push_back({copy_padding, {group, N, IC}}); auto do_conv = [bundle](const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + const NCBKernIndex& ncb_index) mutable { + bundle.set(kern_param.workspace_ptr); do_conv_kern(bundle, kern_param, ncb_index, ncb_index.ndrange_id); }; ret_kerns.push_back({do_conv, {group, N, OC}}); @@ -360,7 +360,8 @@ size_t ConvBiasImpl::AlgoI8x8x16Stride2::get_workspace( } //! Process one input channel copy padding void ConvBiasImpl::AlgoI8x8x16Stride2::copy_padding_kern( - WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param, + const WorkspaceBundle& bundle, + const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { size_t IH = kern_param.isz[0]; @@ -378,7 +379,6 @@ void ConvBiasImpl::AlgoI8x8x16Stride2::copy_padding_kern( bool need_src_copy_var = need_src_copy_str2(kern_param); size_t padding_group_size = IH2 * IW2 * IC; - bundle.set(kern_param.workspace_ptr); size_t workspace_group_id = workspace_ids[0], workspace_batch_id = workspace_ids[1], channel_id = workspace_ids[2]; @@ -400,7 +400,7 @@ void ConvBiasImpl::AlgoI8x8x16Stride2::copy_padding_kern( }; //! compute one output channel void ConvBiasImpl::AlgoI8x8x16Stride2::do_conv_kern( - WorkspaceBundle bundle, const NCBKernParam& kern_param, + const WorkspaceBundle& bundle, const NCBKernParam& kern_param, const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { size_t OH = kern_param.osz[0]; size_t OW = kern_param.osz[1]; @@ -436,7 +436,6 @@ void ConvBiasImpl::AlgoI8x8x16Stride2::do_conv_kern( fun_add_to_dst = conv_bias::conv_stride2_5x5_sc_int8_int8_int16; } - bundle.set(kern_param.workspace_ptr); //! Used for get the workspace offset size_t workspace_group_id = workspace_ids[0], workspace_batch_id = workspace_ids[1], oc = workspace_ids[2]; @@ -476,15 +475,15 @@ SmallVector ConvBiasImpl::AlgoI8x8x16Stride2::get_kimpls( size_t IC = param.filter_meta.icpg; size_t OC = param.filter_meta.ocpg; size_t group = fm.group; - WorkspaceBundle wbundle = get_bundle(param); + WorkspaceBundle bundle = get_bundle(param); SmallVector ret_kerns; if (m_large_group) { - auto exec_one_group = [wbundle](const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + auto exec_one_group = [bundle](const NCBKernParam& kern_param, + const NCBKernIndex& ncb_index) mutable { auto fm = kern_param.filter_meta; size_t IC = fm.icpg; size_t OC = fm.ocpg; - WorkspaceBundle bundle = wbundle; + bundle.set(kern_param.workspace_ptr); for (size_t ic = 0; ic < IC; ic++) { copy_padding_kern(bundle, kern_param, ncb_index, {ncb_index.thread_id, 0, ic}); @@ -496,15 +495,16 @@ SmallVector ConvBiasImpl::AlgoI8x8x16Stride2::get_kimpls( }; ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); } else { - WorkspaceBundle bundle = wbundle; auto copy_padding = [bundle](const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + const NCBKernIndex& ncb_index) mutable { + bundle.set(kern_param.workspace_ptr); copy_padding_kern(bundle, kern_param, ncb_index, ncb_index.ndrange_id); }; ret_kerns.push_back({copy_padding, {group, N, IC}}); auto do_conv = [bundle](const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + const NCBKernIndex& ncb_index) mutable { + bundle.set(kern_param.workspace_ptr); do_conv_kern(bundle, kern_param, ncb_index, ncb_index.ndrange_id); }; ret_kerns.push_back({do_conv, {group, N, OC}}); diff --git a/dnn/src/arm_common/conv_bias/int8x8x16/algos.h b/dnn/src/arm_common/conv_bias/int8x8x16/algos.h index c89e0a6a..cec50258 100644 --- a/dnn/src/arm_common/conv_bias/int8x8x16/algos.h +++ b/dnn/src/arm_common/conv_bias/int8x8x16/algos.h @@ -18,11 +18,11 @@ namespace arm_common { class ConvBiasImpl::AlgoI8x8x16Direct final : public AlgoBase { SmallVector get_kimpls(const NCBKernSizeParam& param) const; WorkspaceBundle get_bundle(const NCBKernSizeParam& param) const; - static void copy_padding_kern(WorkspaceBundle bundle, + static void copy_padding_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param, const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids); - static void do_conv_kern(WorkspaceBundle bundle, + static void do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param, const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids); @@ -47,11 +47,11 @@ public: class ConvBiasImpl::AlgoI8x8x16Stride2 final : public AlgoBase { SmallVector get_kimpls(const NCBKernSizeParam& param) const; WorkspaceBundle get_bundle(const NCBKernSizeParam& param) const; - static void copy_padding_kern(WorkspaceBundle bundle, + static void copy_padding_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param, const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids); - static void do_conv_kern(WorkspaceBundle bundle, + static void do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param, const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids); diff --git a/dnn/src/arm_common/conv_bias/quint8/stride1.cpp b/dnn/src/arm_common/conv_bias/quint8/stride1.cpp index 71477490..d07748e7 100644 --- a/dnn/src/arm_common/conv_bias/quint8/stride1.cpp +++ b/dnn/src/arm_common/conv_bias/quint8/stride1.cpp @@ -99,7 +99,8 @@ WorkspaceBundle direct_quint8_stride1::get_bundle( } //! Process one input channel copy padding void direct_quint8_stride1::copy_padding_kern( - WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param, + const WorkspaceBundle& bundle, + const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { size_t IH = kern_param.isz[0]; @@ -114,8 +115,6 @@ void direct_quint8_stride1::copy_padding_kern( bool need_src_copy_var = need_src_copy(kern_param); size_t padding_group_size = IH2 * IW2 * IC; - bundle.set(kern_param.workspace_ptr); - //! Used for get the workspace offset size_t workspace_group_id = workspace_ids[0], workspace_batch_id = workspace_ids[1], @@ -142,7 +141,7 @@ void direct_quint8_stride1::copy_padding_kern( }; //! compute one output channel template -void direct_quint8_stride1::do_conv_kern(WorkspaceBundle bundle, +void direct_quint8_stride1::do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param, const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { @@ -180,7 +179,6 @@ void direct_quint8_stride1::do_conv_kern(WorkspaceBundle bundle, } size_t padding_group_size = IH2 * IW2 * IC; - bundle.set(kern_param.workspace_ptr); //! Used for get the workspace offset size_t workspace_group_id = workspace_ids[0], workspace_batch_id = workspace_ids[1], oc = workspace_ids[2], @@ -272,7 +270,7 @@ SmallVector direct_quint8_stride1::get_kimpls( size_t IC = param.filter_meta.icpg; size_t OC = param.filter_meta.ocpg; size_t group = fm.group; - WorkspaceBundle wbundle = get_bundle(param, m_large_group); + WorkspaceBundle bundle = get_bundle(param, m_large_group); conv_fun do_conv_fun = nullptr; #define DO_CONV_KERN_FUN(filter, bias_mode, op) \ @@ -333,13 +331,13 @@ SmallVector direct_quint8_stride1::get_kimpls( SmallVector ret_kerns; if (m_large_group) { - auto exec_one_group = [wbundle, do_conv_fun]( + auto exec_one_group = [bundle, do_conv_fun]( const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + const NCBKernIndex& ncb_index) mutable { auto fm = kern_param.filter_meta; size_t IC = fm.icpg; size_t OC = fm.ocpg; - WorkspaceBundle bundle = wbundle; + bundle.set(kern_param.workspace_ptr); for (size_t ic = 0; ic < IC; ic++) { copy_padding_kern(bundle, kern_param, ncb_index, {ncb_index.thread_id, 0, ic}); @@ -351,15 +349,17 @@ SmallVector direct_quint8_stride1::get_kimpls( }; ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); } else { - WorkspaceBundle bundle = wbundle; auto copy_padding = [bundle](const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + const NCBKernIndex& ncb_index) mutable { + bundle.set(kern_param.workspace_ptr); copy_padding_kern(bundle, kern_param, ncb_index, ncb_index.ndrange_id); }; ret_kerns.push_back({copy_padding, {group, N, IC}}); - auto do_conv = [bundle, do_conv_fun](const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + auto do_conv = [bundle, do_conv_fun]( + const NCBKernParam& kern_param, + const NCBKernIndex& ncb_index) mutable { + bundle.set(kern_param.workspace_ptr); do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id); }; ret_kerns.push_back({do_conv, {group, N, OC}}); diff --git a/dnn/src/arm_common/conv_bias/quint8/stride1.h b/dnn/src/arm_common/conv_bias/quint8/stride1.h index b0de5c91..a6553716 100644 --- a/dnn/src/arm_common/conv_bias/quint8/stride1.h +++ b/dnn/src/arm_common/conv_bias/quint8/stride1.h @@ -21,19 +21,20 @@ using NCBKernParam = fallback::ConvBiasImpl::NCBKernParam; using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex; using conv_fun = std::function; bool can_conv_direct_stride1_quint8(const NCBKernSizeParam& param); WorkspaceBundle get_bundle(const NCBKernSizeParam& param, bool m_large_group); -void copy_padding_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param, +void copy_padding_kern(const WorkspaceBundle& bundle, + const NCBKernParam& kern_param, const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids); template -void do_conv_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param, +void do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param, const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids); diff --git a/dnn/src/arm_common/conv_bias/quint8/stride1_dotprod.cpp b/dnn/src/arm_common/conv_bias/quint8/stride1_dotprod.cpp index d72805ad..ae5e2357 100644 --- a/dnn/src/arm_common/conv_bias/quint8/stride1_dotprod.cpp +++ b/dnn/src/arm_common/conv_bias/quint8/stride1_dotprod.cpp @@ -101,7 +101,8 @@ WorkspaceBundle direct_dotprod_quint8_stride1::get_bundle( } //! Process one input channel copy padding void direct_dotprod_quint8_stride1::copy_padding_kern( - WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param, + const WorkspaceBundle& bundle, + const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { size_t IH = kern_param.isz[0]; @@ -115,7 +116,6 @@ void direct_dotprod_quint8_stride1::copy_padding_kern( get_rectified_size(kern_param, IH2, IW2, OH2, OW2); bool need_src_copy_var = need_src_copy(kern_param); size_t padding_group_size = IH2 * IW2 * IC; - bundle.set(kern_param.workspace_ptr); //! Used for get the workspace offset size_t workspace_group_id = workspace_ids[0], @@ -144,7 +144,7 @@ void direct_dotprod_quint8_stride1::copy_padding_kern( //! compute one output channel template void direct_dotprod_quint8_stride1::do_conv_kern( - WorkspaceBundle bundle, const NCBKernParam& kern_param, + const WorkspaceBundle& bundle, const NCBKernParam& kern_param, const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { size_t OH = kern_param.osz[0]; size_t OW = kern_param.osz[1]; @@ -177,7 +177,6 @@ void direct_dotprod_quint8_stride1::do_conv_kern( } size_t padding_group_size = IH2 * IW2 * IC; - bundle.set(kern_param.workspace_ptr); //! Used for get the workspace offset size_t workspace_group_id = workspace_ids[0], workspace_batch_id = workspace_ids[1], oc = workspace_ids[2], @@ -271,7 +270,7 @@ SmallVector direct_dotprod_quint8_stride1::get_kimpls( size_t IC = param.filter_meta.icpg; size_t OC = param.filter_meta.ocpg; size_t group = fm.group; - WorkspaceBundle wbundle = get_bundle(param, m_large_group); + WorkspaceBundle bundle = get_bundle(param, m_large_group); conv_fun do_conv_fun = nullptr; #define DO_CONV_KERN_FUN(filter, bias_mode, op) \ @@ -332,13 +331,13 @@ SmallVector direct_dotprod_quint8_stride1::get_kimpls( SmallVector ret_kerns; if (m_large_group) { - auto exec_one_group = [wbundle, do_conv_fun]( + auto exec_one_group = [bundle, do_conv_fun]( const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + const NCBKernIndex& ncb_index) mutable { auto fm = kern_param.filter_meta; size_t IC = fm.icpg; size_t OC = fm.ocpg; - WorkspaceBundle bundle = wbundle; + bundle.set(kern_param.workspace_ptr); for (size_t ic = 0; ic < IC; ic++) { copy_padding_kern(bundle, kern_param, ncb_index, {ncb_index.thread_id, 0, ic}); @@ -350,15 +349,17 @@ SmallVector direct_dotprod_quint8_stride1::get_kimpls( }; ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); }else { - WorkspaceBundle bundle = wbundle; auto copy_padding = [bundle](const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + const NCBKernIndex& ncb_index) mutable { + bundle.set(kern_param.workspace_ptr); copy_padding_kern(bundle, kern_param, ncb_index, ncb_index.ndrange_id); }; ret_kerns.push_back({copy_padding, {group, N, IC}}); - auto do_conv = [bundle, do_conv_fun](const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + auto do_conv = [bundle, do_conv_fun]( + const NCBKernParam& kern_param, + const NCBKernIndex& ncb_index) mutable { + bundle.set(kern_param.workspace_ptr); do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id); }; ret_kerns.push_back({do_conv, {group, N, OC}}); diff --git a/dnn/src/arm_common/conv_bias/quint8/stride1_dotprod.h b/dnn/src/arm_common/conv_bias/quint8/stride1_dotprod.h index d79a8095..3e5efe8e 100644 --- a/dnn/src/arm_common/conv_bias/quint8/stride1_dotprod.h +++ b/dnn/src/arm_common/conv_bias/quint8/stride1_dotprod.h @@ -21,19 +21,20 @@ using NCBKernParam = fallback::ConvBiasImpl::NCBKernParam; using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex; using conv_fun = std::function; bool can_conv_direct_stride1_quint8(const NCBKernSizeParam& param); WorkspaceBundle get_bundle(const NCBKernSizeParam& param, bool m_large_group); -void copy_padding_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param, +void copy_padding_kern(const WorkspaceBundle& bundle, + const NCBKernParam& kern_param, const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids); template -void do_conv_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param, +void do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param, const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids); diff --git a/dnn/src/arm_common/conv_bias/quint8/stride2.cpp b/dnn/src/arm_common/conv_bias/quint8/stride2.cpp index ffc366c4..5527ea6f 100644 --- a/dnn/src/arm_common/conv_bias/quint8/stride2.cpp +++ b/dnn/src/arm_common/conv_bias/quint8/stride2.cpp @@ -108,7 +108,8 @@ WorkspaceBundle direct_quint8_stride2::get_bundle( } //! Process one input channel copy padding void direct_quint8_stride2::copy_padding_kern( - WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param, + const WorkspaceBundle& bundle, + const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { size_t IH = kern_param.isz[0]; @@ -122,7 +123,6 @@ void direct_quint8_stride2::copy_padding_kern( get_rectified_size(kern_param, IH2, IW2, OH2, OW2); bool need_src_copy_var = need_src_copy(kern_param); size_t padding_group_size = IH2 * IW2 * IC; - bundle.set(kern_param.workspace_ptr); //! Used for get the workspace offset size_t workspace_group_id = workspace_ids[0], @@ -149,7 +149,7 @@ void direct_quint8_stride2::copy_padding_kern( }; //! compute one output channel template -void direct_quint8_stride2::do_conv_kern(WorkspaceBundle bundle, +void direct_quint8_stride2::do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param, const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { @@ -187,7 +187,6 @@ void direct_quint8_stride2::do_conv_kern(WorkspaceBundle bundle, } size_t padding_group_size = IH2 * IW2 * IC; - bundle.set(kern_param.workspace_ptr); //! Used for get the workspace offset size_t workspace_group_id = workspace_ids[0], workspace_batch_id = workspace_ids[1], oc = workspace_ids[2], @@ -279,7 +278,7 @@ SmallVector direct_quint8_stride2::get_kimpls( size_t IC = param.filter_meta.icpg; size_t OC = param.filter_meta.ocpg; size_t group = fm.group; - WorkspaceBundle wbundle = get_bundle(param, m_large_group); + WorkspaceBundle bundle = get_bundle(param, m_large_group); conv_fun do_conv_fun = nullptr; #define DO_CONV_KERN_FUN(filter, bias_mode, op) \ @@ -340,13 +339,13 @@ SmallVector direct_quint8_stride2::get_kimpls( SmallVector ret_kerns; if (m_large_group) { - auto exec_one_group = [wbundle, do_conv_fun]( + auto exec_one_group = [bundle, do_conv_fun]( const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + const NCBKernIndex& ncb_index) mutable { auto fm = kern_param.filter_meta; size_t IC = fm.icpg; size_t OC = fm.ocpg; - WorkspaceBundle bundle = wbundle; + bundle.set(kern_param.workspace_ptr); for (size_t ic = 0; ic < IC; ic++) { copy_padding_kern(bundle, kern_param, ncb_index, {ncb_index.thread_id, 0, ic}); @@ -358,15 +357,17 @@ SmallVector direct_quint8_stride2::get_kimpls( }; ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); }else { - WorkspaceBundle bundle = wbundle; auto copy_padding = [bundle](const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + const NCBKernIndex& ncb_index) mutable { + bundle.set(kern_param.workspace_ptr); copy_padding_kern(bundle, kern_param, ncb_index, ncb_index.ndrange_id); }; ret_kerns.push_back({copy_padding, {group, N, IC}}); - auto do_conv = [bundle, do_conv_fun](const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + auto do_conv = [bundle, do_conv_fun]( + const NCBKernParam& kern_param, + const NCBKernIndex& ncb_index) mutable { + bundle.set(kern_param.workspace_ptr); do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id); }; ret_kerns.push_back({do_conv, {group, N, OC}}); diff --git a/dnn/src/arm_common/conv_bias/quint8/stride2.h b/dnn/src/arm_common/conv_bias/quint8/stride2.h index b73d02e2..c70679e7 100644 --- a/dnn/src/arm_common/conv_bias/quint8/stride2.h +++ b/dnn/src/arm_common/conv_bias/quint8/stride2.h @@ -21,19 +21,20 @@ using NCBKernParam = fallback::ConvBiasImpl::NCBKernParam; using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex; using conv_fun = std::function; bool can_conv_direct_stride2_quint8(const NCBKernSizeParam& param); WorkspaceBundle get_bundle(const NCBKernSizeParam& param, bool m_large_group); -void copy_padding_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param, +void copy_padding_kern(const WorkspaceBundle& bundle, + const NCBKernParam& kern_param, const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids); template -void do_conv_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param, +void do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param, const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids); diff --git a/dnn/src/arm_common/conv_bias/quint8/stride2_dotprod.cpp b/dnn/src/arm_common/conv_bias/quint8/stride2_dotprod.cpp index 0ce54962..6aae1f3f 100644 --- a/dnn/src/arm_common/conv_bias/quint8/stride2_dotprod.cpp +++ b/dnn/src/arm_common/conv_bias/quint8/stride2_dotprod.cpp @@ -108,8 +108,10 @@ WorkspaceBundle direct_dotprod_quint8_stride2::get_bundle( } //! Process one input channel copy padding void direct_dotprod_quint8_stride2::copy_padding_kern( - WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param, - const ConvBiasImpl::NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { + const WorkspaceBundle& bundle, + const ConvBiasImpl::NCBKernParam& kern_param, + const ConvBiasImpl::NCBKernIndex& ncb_index, + const CpuNDRange& workspace_ids) { size_t IH = kern_param.isz[0]; size_t IW = kern_param.isz[1]; size_t IC = kern_param.filter_meta.icpg; @@ -121,7 +123,6 @@ void direct_dotprod_quint8_stride2::copy_padding_kern( get_rectified_size(kern_param, IH2, IW2, OH2, OW2); bool need_src_copy_var = need_src_copy(kern_param); size_t padding_group_size = IH2 * IW2 * IC; - bundle.set(kern_param.workspace_ptr); //! Used for get the workspace offset size_t workspace_group_id = workspace_ids[0], @@ -149,7 +150,7 @@ void direct_dotprod_quint8_stride2::copy_padding_kern( //! compute one output channel template void direct_dotprod_quint8_stride2::do_conv_kern( - WorkspaceBundle bundle, const NCBKernParam& kern_param, + const WorkspaceBundle& bundle, const NCBKernParam& kern_param, const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { size_t OH = kern_param.osz[0]; size_t OW = kern_param.osz[1]; @@ -182,7 +183,6 @@ void direct_dotprod_quint8_stride2::do_conv_kern( } size_t padding_group_size = IH2 * IW2 * IC; - bundle.set(kern_param.workspace_ptr); //! Used for get the workspace offset size_t workspace_group_id = workspace_ids[0], workspace_batch_id = workspace_ids[1], oc = workspace_ids[2], @@ -276,7 +276,7 @@ SmallVector direct_dotprod_quint8_stride2::get_kimpls( size_t IC = param.filter_meta.icpg; size_t OC = param.filter_meta.ocpg; size_t group = fm.group; - WorkspaceBundle wbundle = get_bundle(param, m_large_group); + WorkspaceBundle bundle = get_bundle(param, m_large_group); conv_fun do_conv_fun = nullptr; #define DO_CONV_KERN_FUN(filter, bias_mode, op) \ @@ -337,13 +337,13 @@ SmallVector direct_dotprod_quint8_stride2::get_kimpls( SmallVector ret_kerns; if (m_large_group) { - auto exec_one_group = [wbundle, do_conv_fun]( + auto exec_one_group = [bundle, do_conv_fun]( const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + const NCBKernIndex& ncb_index) mutable { auto fm = kern_param.filter_meta; size_t IC = fm.icpg; size_t OC = fm.ocpg; - WorkspaceBundle bundle = wbundle; + bundle.set(kern_param.workspace_ptr); for (size_t ic = 0; ic < IC; ic++) { copy_padding_kern(bundle, kern_param, ncb_index, {ncb_index.thread_id, 0, ic}); @@ -355,15 +355,17 @@ SmallVector direct_dotprod_quint8_stride2::get_kimpls( }; ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); }else { - WorkspaceBundle bundle = wbundle; auto copy_padding = [bundle](const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + const NCBKernIndex& ncb_index) mutable { + bundle.set(kern_param.workspace_ptr); copy_padding_kern(bundle, kern_param, ncb_index, ncb_index.ndrange_id); }; ret_kerns.push_back({copy_padding, {group, N, IC}}); - auto do_conv = [bundle, do_conv_fun](const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + auto do_conv = [bundle, do_conv_fun]( + const NCBKernParam& kern_param, + const NCBKernIndex& ncb_index) mutable { + bundle.set(kern_param.workspace_ptr); do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id); }; ret_kerns.push_back({do_conv, {group, N, OC}}); diff --git a/dnn/src/arm_common/conv_bias/quint8/stride2_dotprod.h b/dnn/src/arm_common/conv_bias/quint8/stride2_dotprod.h index 0c8049d9..bd8e9465 100644 --- a/dnn/src/arm_common/conv_bias/quint8/stride2_dotprod.h +++ b/dnn/src/arm_common/conv_bias/quint8/stride2_dotprod.h @@ -21,19 +21,20 @@ using NCBKernParam = fallback::ConvBiasImpl::NCBKernParam; using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex; using conv_fun = std::function; bool can_conv_direct_stride2_quint8(const NCBKernSizeParam& param); WorkspaceBundle get_bundle(const NCBKernSizeParam& param, bool m_large_group); -void copy_padding_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param, +void copy_padding_kern(const WorkspaceBundle& bundle, + const NCBKernParam& kern_param, const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids); template -void do_conv_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param, +void do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param, const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids); diff --git a/dnn/src/fallback/conv_bias/im2col/algos.cpp b/dnn/src/fallback/conv_bias/im2col/algos.cpp index 9f9f1b1f..6d94e8e5 100644 --- a/dnn/src/fallback/conv_bias/im2col/algos.cpp +++ b/dnn/src/fallback/conv_bias/im2col/algos.cpp @@ -39,7 +39,7 @@ struct Im2colBundelIndex { using Pack_Mode=fallback::MatrixMulImpl::AlgoBase::PackMode; //! Process one input channel copy padding -static void copy_padding_kern(WorkspaceBundle bundle, +static void copy_padding_kern(WorkspaceBundle& bundle, const ConvBiasImpl::NCBKernParam& param, const ConvBiasImpl::NCBKernIndex& ncb_index, StrategyBase* im2colstrategy, size_t pack_oc_size) { @@ -48,7 +48,7 @@ static void copy_padding_kern(WorkspaceBundle bundle, //! packA_kern static void packA_kern( - WorkspaceBundle bundle, + WorkspaceBundle& bundle, const fallback::ConvBiasImpl::NCBKernParam& param, fallback::MatrixMulImpl::KernSizeParam matmulparam, fallback::MatrixMulImpl::AlgoBase* matmul_algo, @@ -72,11 +72,12 @@ class Im2colKerns { public: //! conv kernel static void kerns( - WorkspaceBundle bundle, WorkspaceBundle bundle_thread, + const WorkspaceBundle& bundle, WorkspaceBundle bundle_thread, const ConvBiasImpl::NCBKernParam& param, fallback::MatrixMulImpl::KernSizeParam matmul_kernsize_param, const fallback::MatrixMulImpl::AlgoBase* matmul_algo, - const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& matmul_desc, + const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& + matmul_desc, StrategyParam strategyparam, fallback::ConvBiasImpl::NCBKernIndex ncb_index, size_t ohw_tile_size, StrategyBase* im2colstrategy) { @@ -100,7 +101,6 @@ public: strategyparam.output_block_oc_size = output_block_oc_size; strategyparam.output_block_size = output_block_size; - bundle.set(param.workspace_ptr); bundle_thread.set( static_cast( bundle.get(Im2colBundelIndex::BUNDLE_THREAD_INDEX)) + @@ -153,11 +153,12 @@ class Im2colKerns { public: //! conv kernel static void kerns( - WorkspaceBundle bundle, WorkspaceBundle bundle_thread, + const WorkspaceBundle& bundle, WorkspaceBundle bundle_thread, const ConvBiasImpl::NCBKernParam& param, fallback::MatrixMulImpl::KernSizeParam matmul_kernsize_param, const fallback::MatrixMulImpl::AlgoBase* matmul_algo, - const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& matmul_desc, + const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& + matmul_desc, StrategyParam strategyparam, fallback::ConvBiasImpl::NCBKernIndex ncb_index, size_t ohw_tile_size, StrategyBase* im2colstrategy) { @@ -169,7 +170,6 @@ public: strategyparam.oc_tile_size, OC - ncb_index.ndrange_id[3] * strategyparam.oc_tile_size); - bundle.set(param.workspace_ptr); bundle_thread.set( static_cast( bundle.get(Im2colBundelIndex::BUNDLE_THREAD_INDEX)) + @@ -236,11 +236,12 @@ class Im2colKerns { public: //! conv kernel static void kerns( - WorkspaceBundle bundle, WorkspaceBundle bundle_thread, + const WorkspaceBundle& bundle, WorkspaceBundle bundle_thread, const ConvBiasImpl::NCBKernParam& param, fallback::MatrixMulImpl::KernSizeParam matmul_kernsize_param, const fallback::MatrixMulImpl::AlgoBase* matmul_algo, - const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& matmul_desc, + const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& + matmul_desc, StrategyParam strategyparam, fallback::ConvBiasImpl::NCBKernIndex ncb_index, size_t ohw_tile_size, StrategyBase* im2colstrategy) { @@ -264,7 +265,6 @@ public: strategyparam.output_block_oc_size = output_block_oc_size; strategyparam.output_block_size = output_block_size; - bundle.set(param.workspace_ptr); bundle_thread.set( static_cast( bundle.get(Im2colBundelIndex::BUNDLE_THREAD_INDEX)) + @@ -567,16 +567,18 @@ SmallVector ConvBiasImpl::AlgoIm2col::dispatch_kerns( auto kern_padding = [bundle, im2colstrategy, pack_oc_size = pack_oc_size]( const NCBKernParam& param, - const NCBKernIndex& ncb_index) { + const NCBKernIndex& ncb_index) mutable { + bundle.set(param.workspace_ptr); copy_padding_kern(bundle, param, ncb_index, im2colstrategy, pack_oc_size); }; auto kern_packA = [bundle, matmul_algo = m_matmul_algo, matmul_param, im2colstrategy, - pack_oc_size = pack_oc_size, - mdesc = mdesc](const NCBKernParam& param, - const NCBKernIndex& ncb_index) { + pack_oc_size = pack_oc_size, mdesc = mdesc]( + const NCBKernParam& param, + const NCBKernIndex& ncb_index) mutable { + bundle.set(param.workspace_ptr); packA_kern(bundle, param, matmul_param, matmul_algo, ncb_index, im2colstrategy, mdesc, pack_oc_size); }; @@ -586,8 +588,10 @@ SmallVector ConvBiasImpl::AlgoIm2col::dispatch_kerns( matmul_algo = m_matmul_algo, ohw_tile_size = ohw_tile_size, strategyparam = strategyparam, matmul_desc = mdesc, - im2colstrategy](const NCBKernParam& param, - const NCBKernIndex& ncb_index) { + im2colstrategy]( + const NCBKernParam& param, + const NCBKernIndex& ncb_index) mutable { + bundle.set(param.workspace_ptr); Im2colKerns::kerns( bundle, bundle_thread, param, matmul_param, matmul_algo, matmul_desc, strategyparam, @@ -608,8 +612,10 @@ SmallVector ConvBiasImpl::AlgoIm2col::dispatch_kerns( matmul_algo = m_matmul_algo, strategyparam = strategyparam, ohw_tile_size = ohw_tile_size, matmul_desc = mdesc, - im2colstrategy](const NCBKernParam& param, - const NCBKernIndex& ncb_index) { + im2colstrategy]( + const NCBKernParam& param, + const NCBKernIndex& ncb_index) mutable { + bundle.set(param.workspace_ptr); Im2colKerns::kerns( bundle, bundle_thread, param, matmul_param, matmul_algo, matmul_desc, strategyparam, @@ -628,14 +634,15 @@ SmallVector ConvBiasImpl::AlgoIm2col::dispatch_kerns( matmul_algo = m_matmul_algo, strategyparam = strategyparam, ohw_tile_size = ohw_tile_size, matmul_desc = mdesc, - im2colstrategy](const NCBKernParam& param, - const NCBKernIndex& ncb_index) { + im2colstrategy]( + const NCBKernParam& param, + const NCBKernIndex& ncb_index) mutable { + bundle.set(param.workspace_ptr); Im2colKerns::kerns( bundle, bundle_thread, param, matmul_param, matmul_algo, matmul_desc, strategyparam, ncb_index, ohw_tile_size, im2colstrategy); }; - if (need_padding) { ret_kern.push_back({kern_padding, {param.n, GROUP, IC}}); } diff --git a/dnn/src/fallback/conv_bias/im2col/strategy_base.h b/dnn/src/fallback/conv_bias/im2col/strategy_base.h index 976873ca..1c3233d6 100644 --- a/dnn/src/fallback/conv_bias/im2col/strategy_base.h +++ b/dnn/src/fallback/conv_bias/im2col/strategy_base.h @@ -50,21 +50,22 @@ public: StrategyBase() = default; virtual ~StrategyBase() = default; virtual void copy_padding_kern( - WorkspaceBundle bundle, + const WorkspaceBundle& bundle, const fallback::ConvBiasImpl::NCBKernParam& param, const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, size_t pack_size) = 0; virtual void packA_kern( - WorkspaceBundle bundle, + const WorkspaceBundle& bundle, const fallback::ConvBiasImpl::NCBKernParam& param, fallback::MatrixMulImpl::KernSizeParam matmulparam, const fallback::MatrixMulImpl::AlgoBase* matmul_algo, const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, - const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& matmul_desec, + const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& + matmul_desec, size_t pack_size) = 0; virtual void exec_im2col( - WorkspaceBundle bundle, WorkspaceBundle bundle_thread, + const WorkspaceBundle& bundle, const WorkspaceBundle& bundle_thread, const StrategyParam& sparam, const fallback::ConvBiasImpl::NCBKernParam& param, fallback::MatrixMulImpl::KernParam matmul_param, @@ -72,17 +73,18 @@ public: virtual void exec_matmul( const fallback::ConvBiasImpl::NCBKernParam& param, - const StrategyParam& sparam, WorkspaceBundle bundle, - WorkspaceBundle bundle_thread, + const StrategyParam& sparam, const WorkspaceBundle& bundle, + const WorkspaceBundle& bundle_thread, fallback::MatrixMulImpl::KernParam matmul_param, const fallback::MatrixMulImpl::AlgoBase* matmul_algo, const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, - const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& matmul_desc - ) = 0; + const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& + matmul_desc) = 0; virtual void exec_postprocess( const fallback::ConvBiasImpl::NCBKernParam& param, - const StrategyParam& sparam, WorkspaceBundle bundle_thread) = 0; + const StrategyParam& sparam, + const WorkspaceBundle& bundle_thread) = 0; }; template (0); if (param.src_type.enumv() == DTypeEnum::Quantized8Asymm) { @@ -212,8 +213,8 @@ void copy_dst(const fallback::ConvBiasImpl::NCBKernParam& param, template void copy_bias(const fallback::ConvBiasImpl::NCBKernParam& param, - WorkspaceBundle bundle_thread, const StrategyParam& sparam, - size_t bias_index) { + const WorkspaceBundle& bundle_thread, + const StrategyParam& sparam, size_t bias_index) { const bias_ctype* bias_ptr = static_cast( param.bias(sparam.batch_id, sparam.group_id)); bias_ctype* bias_temp_ptr = static_cast( @@ -235,11 +236,11 @@ void copy_bias(const fallback::ConvBiasImpl::NCBKernParam& param, } } -template +template void do_postprocess(const fallback::ConvBiasImpl::NCBKernParam& param, - const StrategyParam& sparam, WorkspaceBundle bundle_thread, + const StrategyParam& sparam, + const WorkspaceBundle& bundle_thread, size_t matmul_bundle_index, size_t bias_bundle_index) { copy_bias(param, bundle_thread, sparam, bias_bundle_index); void* matmul_dst = get_matmul_dst_ptr( @@ -288,32 +289,32 @@ public: Strategy() = default; virtual void packA_kern( - WorkspaceBundle bundle, + const WorkspaceBundle& bundle, const fallback::ConvBiasImpl::NCBKernParam& param, fallback::MatrixMulImpl::KernSizeParam matmulparam, const fallback::MatrixMulImpl::AlgoBase* matmul_algo, const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, - const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& matmul_desc, + const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& + matmul_desc, size_t pack_size) override; virtual void exec_im2col( - WorkspaceBundle bundle, WorkspaceBundle bundle_thread, + const WorkspaceBundle& bundle, const WorkspaceBundle& bundle_thread, const StrategyParam& sparam, const fallback::ConvBiasImpl::NCBKernParam& param, fallback::MatrixMulImpl::KernParam matmul_param, const fallback::MatrixMulImpl::AlgoBase* matmul_algo) override; - void exec_matmul( - const fallback::ConvBiasImpl::NCBKernParam& param, - const StrategyParam& sparam, WorkspaceBundle bundle, - WorkspaceBundle bundle_thread, - fallback::MatrixMulImpl::KernParam matmul_param, - const fallback::MatrixMulImpl::AlgoBase* matmul_algo, - const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, - const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& matmul_desc - ) override; + void exec_matmul(const fallback::ConvBiasImpl::NCBKernParam& param, + const StrategyParam& sparam, const WorkspaceBundle& bundle, + const WorkspaceBundle& bundle_thread, + fallback::MatrixMulImpl::KernParam matmul_param, + const fallback::MatrixMulImpl::AlgoBase* matmul_algo, + const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, + const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& + matmul_desc) override; void exec_postprocess(const fallback::ConvBiasImpl::NCBKernParam& param, const StrategyParam& sparam, - WorkspaceBundle bundle_thread) override { + const WorkspaceBundle& bundle_thread) override { do_postprocess(param, sparam, bundle_thread, THREAD_BUNDLE_IM2COL_INDEX, @@ -341,11 +342,12 @@ public: Strategy() = default; - void exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread, - const StrategyParam& sparam, - const fallback::ConvBiasImpl::NCBKernParam& param, - fallback::MatrixMulImpl::KernParam matmul_param, - const fallback::MatrixMulImpl::AlgoBase* matmul_algo) override; + void exec_im2col( + const WorkspaceBundle& bundle, const WorkspaceBundle& bundle_thread, + const StrategyParam& sparam, + const fallback::ConvBiasImpl::NCBKernParam& param, + fallback::MatrixMulImpl::KernParam matmul_param, + const fallback::MatrixMulImpl::AlgoBase* matmul_algo) override; }; template (param, sparam, bundle_thread, THREAD_BUNDLE_MATMULDST_INDEX, @@ -423,7 +425,7 @@ public: Strategy() = default; void packA_kern( - WorkspaceBundle bundle, + const WorkspaceBundle& bundle, const fallback::ConvBiasImpl::NCBKernParam& param, fallback::MatrixMulImpl::KernSizeParam matmulparam, const fallback::MatrixMulImpl::AlgoBase* matmul_algo, @@ -431,21 +433,21 @@ public: const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& MDsec, size_t pack_size) override; - void exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread, - const StrategyParam& sparam, - const fallback::ConvBiasImpl::NCBKernParam& param, - fallback::MatrixMulImpl::KernParam matmul_param, - const fallback::MatrixMulImpl::AlgoBase* matmul_algo) override; - - void exec_matmul( + void exec_im2col( + const WorkspaceBundle& bundle, const WorkspaceBundle& bundle_thread, + const StrategyParam& sparam, const fallback::ConvBiasImpl::NCBKernParam& param, - const StrategyParam& sparam, WorkspaceBundle bundle, - WorkspaceBundle bundle_thread, fallback::MatrixMulImpl::KernParam matmul_param, - const fallback::MatrixMulImpl::AlgoBase* matmul_algo, - const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, - const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& matmul_desc - ) override; + const fallback::MatrixMulImpl::AlgoBase* matmul_algo) override; + + void exec_matmul(const fallback::ConvBiasImpl::NCBKernParam& param, + const StrategyParam& sparam, const WorkspaceBundle& bundle, + const WorkspaceBundle& bundle_thread, + fallback::MatrixMulImpl::KernParam matmul_param, + const fallback::MatrixMulImpl::AlgoBase* matmul_algo, + const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, + const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& + matmul_desc) override; void* get_matmul_dst_ptr(const fallback::ConvBiasImpl::NCBKernParam& param, const WorkspaceBundle& bundle_thread, @@ -453,7 +455,7 @@ public: void exec_postprocess(const fallback::ConvBiasImpl::NCBKernParam& param, const StrategyParam& sparam, - WorkspaceBundle bundle_thread) override { + const WorkspaceBundle& bundle_thread) override { do_postprocess(param, sparam, bundle_thread, THREAD_BUNDLE_MATMULDST_INDEX, @@ -476,11 +478,12 @@ public: constexpr static size_t THREAD_BUNDLE_IM2COL_INDEX = 1; constexpr static size_t THREAD_BUNDLE_BIAS_INDEX = 2; - void exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread, - const StrategyParam& sparam, - const fallback::ConvBiasImpl::NCBKernParam& param, - fallback::MatrixMulImpl::KernParam matmul_param, - const fallback::MatrixMulImpl::AlgoBase* matmul_algo) override; + void exec_im2col( + const WorkspaceBundle& bundle, const WorkspaceBundle& bundle_thread, + const StrategyParam& sparam, + const fallback::ConvBiasImpl::NCBKernParam& param, + fallback::MatrixMulImpl::KernParam matmul_param, + const fallback::MatrixMulImpl::AlgoBase* matmul_algo) override; }; template void Strategy:: - packA_kern(WorkspaceBundle bundle, + packA_kern(const WorkspaceBundle& bundle, const fallback::ConvBiasImpl::NCBKernParam& param, fallback::MatrixMulImpl::KernSizeParam matmulparam, const fallback::MatrixMulImpl::AlgoBase* matmul_algo, @@ -26,7 +26,6 @@ void Strategy(matmul_param) = @@ -50,7 +49,8 @@ template void Strategy:: - exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread, + exec_im2col(const WorkspaceBundle& bundle, + const WorkspaceBundle& bundle_thread, const StrategyParam& sparam, const fallback::ConvBiasImpl::NCBKernParam& param, fallback::MatrixMulImpl::KernParam matmul_param, @@ -139,8 +139,8 @@ template :: exec_matmul(const fallback::ConvBiasImpl::NCBKernParam& param, - const StrategyParam& sparam, WorkspaceBundle bundle, - WorkspaceBundle bundle_thread, + const StrategyParam& sparam, const WorkspaceBundle& bundle, + const WorkspaceBundle& bundle_thread, fallback::MatrixMulImpl::KernParam matmul_param, const fallback::MatrixMulImpl::AlgoBase* matmul_algo, const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, diff --git a/dnn/src/fallback/conv_bias/im2col/strategy_default_nchw44.cpp b/dnn/src/fallback/conv_bias/im2col/strategy_default_nchw44.cpp index b4d869f0..e2b77721 100644 --- a/dnn/src/fallback/conv_bias/im2col/strategy_default_nchw44.cpp +++ b/dnn/src/fallback/conv_bias/im2col/strategy_default_nchw44.cpp @@ -29,7 +29,8 @@ template void Strategy:: - exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread, + exec_im2col(const WorkspaceBundle& bundle, + const WorkspaceBundle& bundle_thread, const StrategyParam& sparam, const fallback::ConvBiasImpl::NCBKernParam& param, fallback::MatrixMulImpl::KernParam matmul_param, diff --git a/dnn/src/fallback/conv_bias/im2col/strategy_fuse_nchw44.cpp b/dnn/src/fallback/conv_bias/im2col/strategy_fuse_nchw44.cpp index a8d30c31..354c3812 100644 --- a/dnn/src/fallback/conv_bias/im2col/strategy_fuse_nchw44.cpp +++ b/dnn/src/fallback/conv_bias/im2col/strategy_fuse_nchw44.cpp @@ -169,7 +169,8 @@ void naive_fuse_im2col_packB(dt_int8* src, size_t ic, size_t iw, size_t ih, template void StrategyFuse4x4x16Nchw44:: - exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread, + exec_im2col(const WorkspaceBundle& bundle, + const WorkspaceBundle& bundle_thread, const StrategyParam& sparam, const fallback::ConvBiasImpl::NCBKernParam& param, fallback::MatrixMulImpl::KernParam, diff --git a/dnn/src/fallback/conv_bias/im2col/strategy_fuse_nchw44_dot.cpp b/dnn/src/fallback/conv_bias/im2col/strategy_fuse_nchw44_dot.cpp index eeb34a69..610b98e2 100644 --- a/dnn/src/fallback/conv_bias/im2col/strategy_fuse_nchw44_dot.cpp +++ b/dnn/src/fallback/conv_bias/im2col/strategy_fuse_nchw44_dot.cpp @@ -172,7 +172,8 @@ void fuse_packb(const dt_int8* __restrict src, dt_int8* __restrict dst, template void StrategyFuse8x12x4Nchw44Dot:: - exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread, + exec_im2col(const WorkspaceBundle& bundle, + const WorkspaceBundle& bundle_thread, const StrategyParam& sparam, const fallback::ConvBiasImpl::NCBKernParam& param, fallback::MatrixMulImpl::KernParam /*matmul_param*/, @@ -207,7 +208,6 @@ void StrategyFuse8x12x4Nchw44Dot:: sparam.output_block_size); } - namespace megdnn { template class StrategyFuse8x12x4Nchw44Dot void StrategyFuse8x12x1Nchw44K3x3S2:: - exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread, + exec_im2col(const WorkspaceBundle& bundle, + const WorkspaceBundle& bundle_thread, const StrategyParam& sparam, const fallback::ConvBiasImpl::NCBKernParam& param, fallback::MatrixMulImpl::KernParam /*matmul_param*/, diff --git a/dnn/src/fallback/conv_bias/im2col/strategy_nopack.cpp b/dnn/src/fallback/conv_bias/im2col/strategy_nopack.cpp index 8a26382b..57db835c 100644 --- a/dnn/src/fallback/conv_bias/im2col/strategy_nopack.cpp +++ b/dnn/src/fallback/conv_bias/im2col/strategy_nopack.cpp @@ -19,7 +19,7 @@ template void Strategy:: - packA_kern(WorkspaceBundle bundle, + packA_kern(const WorkspaceBundle& bundle, const fallback::ConvBiasImpl::NCBKernParam& param, fallback::MatrixMulImpl::KernSizeParam matmulparam, const fallback::MatrixMulImpl::AlgoBase* matmul_algo, @@ -61,8 +61,8 @@ template :: exec_matmul(const fallback::ConvBiasImpl::NCBKernParam& param, - const StrategyParam& sparam, WorkspaceBundle bundle, - WorkspaceBundle bundle_thread, + const StrategyParam& sparam, const WorkspaceBundle& bundle, + const WorkspaceBundle& bundle_thread, fallback::MatrixMulImpl::KernParam matmul_param, const fallback::MatrixMulImpl::AlgoBase* matmul_algo, const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, @@ -96,7 +96,8 @@ template void Strategy:: - exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread, + exec_im2col(const WorkspaceBundle& bundle, + const WorkspaceBundle& bundle_thread, const StrategyParam& sparam, const fallback::ConvBiasImpl::NCBKernParam& param, fallback::MatrixMulImpl::KernParam matmul_param, diff --git a/dnn/src/fallback/conv_bias/im2col/strategy_onlypacka.cpp b/dnn/src/fallback/conv_bias/im2col/strategy_onlypacka.cpp index bdf5633d..0e622597 100644 --- a/dnn/src/fallback/conv_bias/im2col/strategy_onlypacka.cpp +++ b/dnn/src/fallback/conv_bias/im2col/strategy_onlypacka.cpp @@ -19,7 +19,7 @@ template void Strategy:: - packA_kern(WorkspaceBundle bundle, + packA_kern(const WorkspaceBundle& bundle, const fallback::ConvBiasImpl::NCBKernParam& param, fallback::MatrixMulImpl::KernSizeParam matmulparam, const fallback::MatrixMulImpl::AlgoBase* matmul_algo, @@ -27,7 +27,6 @@ void Strategy(matmul_param) = matmulparam; @@ -56,8 +55,8 @@ template :: exec_matmul(const fallback::ConvBiasImpl::NCBKernParam& param, - const StrategyParam& sparam, WorkspaceBundle bundle, - WorkspaceBundle bundle_thread, + const StrategyParam& sparam, const WorkspaceBundle& bundle, + const WorkspaceBundle& bundle_thread, fallback::MatrixMulImpl::KernParam matmul_param, const fallback::MatrixMulImpl::AlgoBase* matmul_algo, const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, @@ -96,7 +95,8 @@ template void Strategy:: - exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread, + exec_im2col(const WorkspaceBundle& bundle, + const WorkspaceBundle& bundle_thread, const StrategyParam& sparam, const fallback::ConvBiasImpl::NCBKernParam& param, fallback::MatrixMulImpl::KernParam matmul_param, diff --git a/dnn/src/fallback/conv_bias/winograd/winograd.h b/dnn/src/fallback/conv_bias/winograd/winograd.h index 745e3411..022867c4 100644 --- a/dnn/src/fallback/conv_bias/winograd/winograd.h +++ b/dnn/src/fallback/conv_bias/winograd/winograd.h @@ -194,12 +194,12 @@ public: IC, 0, OC); } - static void filter_process(Strategy strategy, WorkspaceBundle bundle_top, - WorkspaceBundle bundle_compute, + static void filter_process(Strategy strategy, + const WorkspaceBundle& bundle_top, + const WorkspaceBundle& bundle_compute, const NCBKernParam& kern_param, const NCBKernIndex& ncb_index) { - bundle_top.set(kern_param.workspace_ptr); - bundle_compute.set(bundle_top.get(0)); + size_t compute_workspace_size_per_thread = bundle_compute.total_size_in_bytes(); size_t thread_id = ncb_index.thread_id; @@ -236,8 +236,8 @@ public: } static void winograd_compute( - Strategy strategy, WorkspaceBundle bundle_top, - WorkspaceBundle bundle_compute, + Strategy strategy, const WorkspaceBundle& bundle_top, + const WorkspaceBundle& bundle_compute, fallback::MatrixMulImpl::AlgoBase* matmul_algo, fallback::MatrixMulImpl::KernParam matmul_param, size_t unit_tile_size, size_t unit_oc_size, @@ -265,9 +265,6 @@ public: size_t group_id = ncb_index.ndrange_id[0]; size_t thread_id = ncb_index.thread_id; - bundle_top.set(ncb_param.workspace_ptr); - bundle_compute.set(bundle_top.get(0)); - const stype* src_ptr = ncb_param.src(batch_id, group_id); dst_type* dst_ptr = ncb_param.dst(batch_id, group_id); const output_compute_type* bias_ptr = @@ -419,14 +416,16 @@ public: param.filter_meta.format == param::ConvBias::Format::NCHW44) { //! probably a gcc bug, labmda require capturing 'this' to call //! static member function - auto filter_process_kern = [this, strategy, bundle_top, - bundle_compute]( - const NCBKernParam& ncb_param, - const NCBKernIndex& ncb_index) { - MEGDNN_MARK_USED_VAR(this); - filter_process(strategy, bundle_top, bundle_compute, ncb_param, - std::move(ncb_index)); - }; + auto filter_process_kern = + [this, strategy, bundle_top, bundle_compute]( + const NCBKernParam& ncb_param, + const NCBKernIndex& ncb_index) mutable { + MEGDNN_MARK_USED_VAR(this); + bundle_top.set(ncb_param.workspace_ptr); + bundle_compute.set(bundle_top.get(0)); + filter_process(strategy, bundle_top, bundle_compute, + ncb_param, std::move(ncb_index)); + }; size_t oc_parallelism = OC; if (param.filter_meta.format == param::ConvBias::Format::NCHW88) { megdnn_assert(OC % 8 == 0); @@ -438,18 +437,22 @@ public: } kerns.push_back({filter_process_kern, {GROUP, 1, oc_parallelism}}); } - auto winograd_compute_kern = [strategy, bundle_top, bundle_compute, - matmul_algo, matmul_param, unit_tile_size, - unit_oc_size]( - const NCBKernParam& ncb_param, - const NCBKernIndex& ncb_index) { - MIDOUT_BEGIN(megdnn_fallback_conv_bias_winograd_common, 0, 0) { - winograd_compute(strategy, bundle_top, bundle_compute, - matmul_algo, matmul_param, unit_tile_size, - unit_oc_size, ncb_param, std::move(ncb_index)); - } - MIDOUT_END(); - }; + auto winograd_compute_kern = + [strategy, bundle_top, bundle_compute, matmul_algo, + matmul_param, unit_tile_size, + unit_oc_size](const NCBKernParam& ncb_param, + const NCBKernIndex& ncb_index) mutable { + MIDOUT_BEGIN(megdnn_fallback_conv_bias_winograd_common, 0, + 0) { + bundle_top.set(ncb_param.workspace_ptr); + bundle_compute.set(bundle_top.get(0)); + winograd_compute(strategy, bundle_top, bundle_compute, + matmul_algo, matmul_param, + unit_tile_size, unit_oc_size, + ncb_param, std::move(ncb_index)); + } + MIDOUT_END(); + }; kerns.push_back( {winograd_compute_kern, {GROUP, N, nr_hw_tiles, nr_oc_tiles}}); return kerns; diff --git a/dnn/src/naive/handle.h b/dnn/src/naive/handle.h index db301b19..15d3c63e 100644 --- a/dnn/src/naive/handle.h +++ b/dnn/src/naive/handle.h @@ -186,10 +186,7 @@ public: */ #define MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN(_handle, _parallelism, _stmt) \ do { \ - auto _kern = [=](size_t index, size_t thread_id) { \ - _stmt(index, thread_id); \ - }; \ - _handle->dispatch_kern(_kern, _parallelism); \ + _handle->dispatch_kern(_stmt, _parallelism); \ } while (0) //! disptch kern on current opr diff --git a/dnn/src/x86/conv_bias/f32/algos.cpp b/dnn/src/x86/conv_bias/f32/algos.cpp index 28f80d3e..b13fc314 100644 --- a/dnn/src/x86/conv_bias/f32/algos.cpp +++ b/dnn/src/x86/conv_bias/f32/algos.cpp @@ -58,45 +58,47 @@ void get_rectified_size(size_t IH, size_t IW, size_t OH, size_t OW, size_t FH, } } // namespace -#define GET_KERN \ - auto fm = param.filter_meta; \ - size_t N = param.n; \ - size_t IC = param.filter_meta.icpg; \ - size_t OC = param.filter_meta.ocpg; \ - size_t group = fm.group; \ - WorkspaceBundle wbundle = get_bundle(param); \ - SmallVector ret_kerns; \ - if (m_large_group) { \ - auto exec_one_group = [wbundle](const NCBKernParam& kern_param, \ - const NCBKernIndex& ncb_index) { \ - auto fm = kern_param.filter_meta; \ - size_t IC = fm.icpg; \ - size_t OC = fm.ocpg; \ - WorkspaceBundle bundle = wbundle; \ - for (size_t ic = 0; ic < IC; ic++) { \ - copy_padding_kern(bundle, kern_param, ncb_index, \ - {ncb_index.thread_id, 0, ic}); \ - } \ - for (size_t oc = 0; oc < OC; oc++) { \ - do_conv_kern(bundle, kern_param, ncb_index, \ - {ncb_index.thread_id, 0, oc}); \ - } \ - }; \ - ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); \ - } else { \ - auto copy_padding = [wbundle](const NCBKernParam& kern_param, \ - const NCBKernIndex& ncb_index) { \ - copy_padding_kern(wbundle, kern_param, ncb_index, \ - ncb_index.ndrange_id); \ - }; \ - ret_kerns.push_back({copy_padding, {group, N, IC}}); \ - auto do_conv = [wbundle](const NCBKernParam& kern_param, \ - const NCBKernIndex& ncb_index) { \ - do_conv_kern(wbundle, kern_param, ncb_index, \ - ncb_index.ndrange_id); \ - }; \ - ret_kerns.push_back({do_conv, {group, N, OC}}); \ - } \ +#define GET_KERN \ + auto fm = param.filter_meta; \ + size_t N = param.n; \ + size_t IC = param.filter_meta.icpg; \ + size_t OC = param.filter_meta.ocpg; \ + size_t group = fm.group; \ + WorkspaceBundle bundle = get_bundle(param); \ + SmallVector ret_kerns; \ + if (m_large_group) { \ + auto exec_one_group = [bundle]( \ + const NCBKernParam& kern_param, \ + const NCBKernIndex& ncb_index) mutable { \ + bundle.set(kern_param.workspace_ptr); \ + auto fm = kern_param.filter_meta; \ + size_t IC = fm.icpg; \ + size_t OC = fm.ocpg; \ + for (size_t ic = 0; ic < IC; ic++) { \ + copy_padding_kern(bundle, kern_param, ncb_index, \ + {ncb_index.thread_id, 0, ic}); \ + } \ + for (size_t oc = 0; oc < OC; oc++) { \ + do_conv_kern(bundle, kern_param, ncb_index, \ + {ncb_index.thread_id, 0, oc}); \ + } \ + }; \ + ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); \ + } else { \ + auto copy_padding = [bundle](const NCBKernParam& kern_param, \ + const NCBKernIndex& ncb_index) mutable { \ + bundle.set(kern_param.workspace_ptr); \ + copy_padding_kern(bundle, kern_param, ncb_index, \ + ncb_index.ndrange_id); \ + }; \ + ret_kerns.push_back({copy_padding, {group, N, IC}}); \ + auto do_conv = [bundle](const NCBKernParam& kern_param, \ + const NCBKernIndex& ncb_index) mutable { \ + bundle.set(kern_param.workspace_ptr); \ + do_conv_kern(bundle, kern_param, ncb_index, ncb_index.ndrange_id); \ + }; \ + ret_kerns.push_back({do_conv, {group, N, OC}}); \ + } \ return ret_kerns; /* ===================== direct algo ===================== */ @@ -146,7 +148,8 @@ size_t ConvBiasImpl::AlgoDirect::get_workspace( //! Process one input channel copy padding void ConvBiasImpl::AlgoDirect::copy_padding_kern( - WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param, + const WorkspaceBundle& bundle, + const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { size_t IH = kern_param.isz[0]; @@ -169,7 +172,6 @@ void ConvBiasImpl::AlgoDirect::copy_padding_kern( const float* sptr = static_cast( kern_param.src(batch_id, group_id)) + channel_id * IH * IW; - bundle.set(kern_param.workspace_ptr); //! Used for get the workspace offset size_t workspace_group_id = workspace_ids[0], @@ -239,7 +241,7 @@ void ConvBiasImpl::AlgoDirect::copy_padding_kern( func = detail::convolution_##mode##_fh##fsize##_##simd; //! compute one output channel -void ConvBiasImpl::AlgoDirect::do_conv_kern(WorkspaceBundle bundle, +void ConvBiasImpl::AlgoDirect::do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param, const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { @@ -265,7 +267,6 @@ void ConvBiasImpl::AlgoDirect::do_conv_kern(WorkspaceBundle bundle, func = nullptr; DISPATCH; - bundle.set(kern_param.workspace_ptr); size_t bias_offset = 0; if (kern_param.bias_mode == megdnn::BiasMode::BIAS) { bias_offset = OH * OW; @@ -367,7 +368,8 @@ size_t ConvBiasImpl::AlgoDirectStride2::get_workspace( } //! Process one input channel copy padding void ConvBiasImpl::AlgoDirectStride2::copy_padding_kern( - WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param, + const WorkspaceBundle& bundle, + const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { size_t IH = kern_param.isz[0]; @@ -390,7 +392,6 @@ void ConvBiasImpl::AlgoDirectStride2::copy_padding_kern( const float* sptr = static_cast( kern_param.src(batch_id, group_id)) + channel_id * IH * IW; - bundle.set(kern_param.workspace_ptr); //! Used for get the workspace offset size_t workspace_group_id = workspace_ids[0], workspace_batch_id = workspace_ids[1], @@ -411,7 +412,7 @@ void ConvBiasImpl::AlgoDirectStride2::copy_padding_kern( //! compute one output channel void ConvBiasImpl::AlgoDirectStride2::do_conv_kern( - WorkspaceBundle bundle, const NCBKernParam& kern_param, + const WorkspaceBundle& bundle, const NCBKernParam& kern_param, const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { size_t OH = kern_param.osz[0]; size_t OW = kern_param.osz[1]; @@ -446,7 +447,6 @@ void ConvBiasImpl::AlgoDirectStride2::do_conv_kern( func_add_dst = conv_general_simd::do_conv_7x7_stride2; } - bundle.set(kern_param.workspace_ptr); size_t bias_offset = 0; if (kern_param.bias_mode == megdnn::BiasMode::BIAS) { bias_offset = OH * OW; diff --git a/dnn/src/x86/conv_bias/f32/algos.h b/dnn/src/x86/conv_bias/f32/algos.h index 8c06542b..144f5713 100644 --- a/dnn/src/x86/conv_bias/f32/algos.h +++ b/dnn/src/x86/conv_bias/f32/algos.h @@ -20,11 +20,11 @@ class ConvBiasImpl::AlgoDirect final : public AlgoBase { SmallVector get_kimpls(const NCBKernSizeParam& param) const; WorkspaceBundle get_bundle(const NCBKernSizeParam& param) const; - static void copy_padding_kern(WorkspaceBundle bundle, + static void copy_padding_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param, const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids); - static void do_conv_kern(WorkspaceBundle bundle, + static void do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param, const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids); @@ -57,11 +57,11 @@ class ConvBiasImpl::AlgoDirectStride2 final : public AlgoBase { SmallVector get_kimpls(const NCBKernSizeParam& param) const; WorkspaceBundle get_bundle(const NCBKernSizeParam& param) const; - static void copy_padding_kern(WorkspaceBundle bundle, + static void copy_padding_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param, const NCBKernIndex& ncb_index, - const CpuNDRange& workspace_ids); - static void do_conv_kern(WorkspaceBundle bundle, + const CpuNDRange& workspace_ids); + static void do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param, const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids); diff --git a/dnn/src/x86/conv_bias/int8/avx2_chanwise_stride1.cpp b/dnn/src/x86/conv_bias/int8/avx2_chanwise_stride1.cpp index 19d18ee1..deffaa86 100644 --- a/dnn/src/x86/conv_bias/int8/avx2_chanwise_stride1.cpp +++ b/dnn/src/x86/conv_bias/int8/avx2_chanwise_stride1.cpp @@ -19,7 +19,7 @@ namespace x86 { namespace avx2_chanwise_stride1 { template -void conv_kimpl(WorkspaceBundle bundle, const NCBKernParam& kern_param, +void conv_kimpl(const WorkspaceBundle& bundle, const NCBKernParam& kern_param, const NCBKernIndex& ncb_index) { size_t OH = kern_param.osz[0]; size_t OW = kern_param.osz[1]; @@ -38,9 +38,6 @@ void conv_kimpl(WorkspaceBundle bundle, const NCBKernParam& kern_param, op = Op(scale_bias, scale_dst); } size_t padding_group_size = IH2 * IW2; - - bundle.set(kern_param.workspace_ptr); - size_t workspace_group_id = ncb_index.thread_id; size_t group_id = ncb_index.ndrange_id[0], batch_id = ncb_index.ndrange_id[1]; @@ -98,7 +95,7 @@ void conv_kimpl(WorkspaceBundle bundle, const NCBKernParam& kern_param, } }; SmallVector get_kimpls(const NCBKernSizeParam& kern_param, - WorkspaceBundle bundle) { + const WorkspaceBundle& bundle) { MEGDNN_MARK_USED_VAR(kern_param); auto fm = kern_param.filter_meta; size_t group = fm.group; @@ -182,8 +179,10 @@ SmallVector get_kimpls(const NCBKernSizeParam& kern_param, DISPATCH_CONV_KERN(); - auto exec_one_group = [bundle, do_conv_fun](const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + auto exec_one_group = [bundle = bundle, do_conv_fun]( + const NCBKernParam& kern_param, + const NCBKernIndex& ncb_index) mutable { + bundle.set(kern_param.workspace_ptr); copy_padding_kern(bundle, kern_param, ncb_index); do_conv_fun(bundle, kern_param, ncb_index); }; diff --git a/dnn/src/x86/conv_bias/int8/avx2_chanwise_stride1.h b/dnn/src/x86/conv_bias/int8/avx2_chanwise_stride1.h index 518501b2..0061c900 100644 --- a/dnn/src/x86/conv_bias/int8/avx2_chanwise_stride1.h +++ b/dnn/src/x86/conv_bias/int8/avx2_chanwise_stride1.h @@ -17,11 +17,11 @@ namespace megdnn { namespace x86 { namespace avx2_chanwise_stride1 { -using conv_fun = std::function; SmallVector get_kimpls(const NCBKernSizeParam& param, - WorkspaceBundle bundle); + const WorkspaceBundle& bundle); } // namespace avx2_chanwise_stride1 } // namespace x86 diff --git a/dnn/src/x86/conv_bias/int8/avx2_chanwise_stride2.cpp b/dnn/src/x86/conv_bias/int8/avx2_chanwise_stride2.cpp index ca4c70a6..19b2df2e 100644 --- a/dnn/src/x86/conv_bias/int8/avx2_chanwise_stride2.cpp +++ b/dnn/src/x86/conv_bias/int8/avx2_chanwise_stride2.cpp @@ -19,7 +19,7 @@ namespace x86 { namespace avx2_chanwise_stride2 { template -void conv_kimpl(WorkspaceBundle bundle, const NCBKernParam& kern_param, +void conv_kimpl(const WorkspaceBundle& bundle, const NCBKernParam& kern_param, const NCBKernIndex& ncb_index) { size_t OH = kern_param.osz[0]; size_t OW = kern_param.osz[1]; @@ -38,9 +38,6 @@ void conv_kimpl(WorkspaceBundle bundle, const NCBKernParam& kern_param, op = Op(scale_bias, scale_dst); } size_t padding_group_size = IH2 * IW2; - - bundle.set(kern_param.workspace_ptr); - size_t workspace_group_id = ncb_index.thread_id; size_t group_id = ncb_index.ndrange_id[0], batch_id = ncb_index.ndrange_id[1]; @@ -98,7 +95,7 @@ void conv_kimpl(WorkspaceBundle bundle, const NCBKernParam& kern_param, } }; SmallVector get_kimpls(const NCBKernSizeParam& kern_param, - WorkspaceBundle bundle) { + const WorkspaceBundle& bundle) { MEGDNN_MARK_USED_VAR(kern_param); auto fm = kern_param.filter_meta; size_t group = fm.group; @@ -187,8 +184,10 @@ SmallVector get_kimpls(const NCBKernSizeParam& kern_param, DISPATCH_CONV_KERN(); - auto exec_one_group = [bundle, do_conv_fun](const NCBKernParam& kern_param, - const NCBKernIndex& ncb_index) { + auto exec_one_group = [bundle = bundle, do_conv_fun]( + const NCBKernParam& kern_param, + const NCBKernIndex& ncb_index) mutable { + bundle.set(kern_param.workspace_ptr); copy_padding_kern(bundle, kern_param, ncb_index); do_conv_fun(bundle, kern_param, ncb_index); }; diff --git a/dnn/src/x86/conv_bias/int8/avx2_chanwise_stride2.h b/dnn/src/x86/conv_bias/int8/avx2_chanwise_stride2.h index 63ee7df4..183c8103 100644 --- a/dnn/src/x86/conv_bias/int8/avx2_chanwise_stride2.h +++ b/dnn/src/x86/conv_bias/int8/avx2_chanwise_stride2.h @@ -17,11 +17,11 @@ namespace megdnn { namespace x86 { namespace avx2_chanwise_stride2 { -using conv_fun = std::function; SmallVector get_kimpls(const NCBKernSizeParam& param, - WorkspaceBundle bundle); + const WorkspaceBundle& bundle); } // namespace avx2_chanwise_stride2 } // namespace x86 diff --git a/dnn/src/x86/conv_bias/int8/avx2_direct_conv_stride1.cpp b/dnn/src/x86/conv_bias/int8/avx2_direct_conv_stride1.cpp index a5d5baea..d7c4e17a 100644 --- a/dnn/src/x86/conv_bias/int8/avx2_direct_conv_stride1.cpp +++ b/dnn/src/x86/conv_bias/int8/avx2_direct_conv_stride1.cpp @@ -19,7 +19,7 @@ namespace direct_conv_avx2_stride1 { //! layout:(N,IC,IH,IW)-->(N,IC/2,H,W,2) MEGDNN_ATTRIBUTE_TARGET("sse4.1") -void pack_src_conv_avx2_stride1(WorkspaceBundle bundle, +void pack_src_conv_avx2_stride1(const WorkspaceBundle& bundle, const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernIndex& ncb_index) { int32_t ih = kern_param.isz[0]; @@ -48,7 +48,6 @@ void pack_src_conv_avx2_stride1(WorkspaceBundle bundle, const int8_t* src_ptr = kern_param.src(batch_id, group_id) + ic_step * channel_id * c_stride; - bundle.set(kern_param.workspace_ptr); int8_t* packed_src = static_cast(bundle.get(0)) + batch_id * group * packed_group_size + group_id * packed_group_size + @@ -103,7 +102,7 @@ void pack_src_conv_avx2_stride1(WorkspaceBundle bundle, MEGDNN_ATTRIBUTE_TARGET("sse4.1") static inline void pack_filter_conv_avx2_stride1( - WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param, + const WorkspaceBundle& bundle, const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernIndex& ncb_index) { MEGDNN_MARK_USED_VAR(ncb_index); int32_t oc = kern_param.filter_meta.ocpg; @@ -129,7 +128,6 @@ static inline void pack_filter_conv_avx2_stride1( oc_index_id = ncb_index.ndrange_id[1]; const int8_t* pack_filter_ptr = kern_param.filter(group_id); - bundle.set(kern_param.workspace_ptr); int16_t* out_ptr = static_cast(bundle.get(1)) + group_id * round_up(oc, oc_step) * oc_out_stride; @@ -602,7 +600,7 @@ inline void AlgoAVX2DirectConvStride1S8S8S32_forward( #undef cb_switch #undef cb } -void do_conv_kern(WorkspaceBundle bundle, +void do_conv_kern(const WorkspaceBundle& bundle, const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernIndex& ncb_index) { auto&& fm = kern_param.filter_meta; @@ -635,8 +633,6 @@ void do_conv_kern(WorkspaceBundle bundle, batch_id = ncb_index.ndrange_id[1], channel_id = ncb_index.ndrange_id[2]; - bundle.set(kern_param.workspace_ptr); - int8_t* src_ptr = static_cast(bundle.get(0)) + group_id * packed_group_size + batch_id * group * packed_group_size; @@ -672,7 +668,7 @@ void do_conv_kern(WorkspaceBundle bundle, oc_stride, kern_param); } -void do_post_process(WorkspaceBundle bundle, +void do_post_process(const WorkspaceBundle& bundle, const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernIndex& ncb_index) { auto&& fm = kern_param.filter_meta; @@ -683,7 +679,6 @@ void do_post_process(WorkspaceBundle bundle, size_t group_id = ncb_index.ndrange_id[0], batch_id = ncb_index.ndrange_id[1]; - bundle.set(kern_param.workspace_ptr); bool need_post_process = kern_param.dst_type.enumv() == DTypeEnum::QuantizedS8; void* dst_tptr = nullptr; @@ -729,21 +724,22 @@ void do_post_process(WorkspaceBundle bundle, } SmallVector get_kimpls(const NCBKernSizeParam& kern_param, - WorkspaceBundle bundle) { + const WorkspaceBundle& bundle) { SmallVector ncb_kerns; auto fm = kern_param.filter_meta; size_t N = kern_param.n; size_t IC = kern_param.filter_meta.icpg; size_t OC = kern_param.filter_meta.ocpg; size_t group = fm.group; -#define cb(task) \ - auto task = [bundle, tmp_func]( \ - const ConvBiasImpl::NCBKernParam& kern_param, \ - const ConvBiasImpl::NCBKernIndex& ncb_index) { \ - tmp_func(bundle, kern_param, \ - {ncb_index.thread_id, \ - {ncb_index.ndrange_id[0], ncb_index.ndrange_id[1], \ - ncb_index.ndrange_id[2]}}); \ +#define cb(task) \ + auto task = [bundle = bundle, tmp_func]( \ + const ConvBiasImpl::NCBKernParam& kern_param, \ + const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { \ + bundle.set(kern_param.workspace_ptr); \ + tmp_func(bundle, kern_param, \ + {ncb_index.thread_id, \ + {ncb_index.ndrange_id[0], ncb_index.ndrange_id[1], \ + ncb_index.ndrange_id[2]}}); \ }; auto tmp_func = pack_src_conv_avx2_stride1; cb(pack_src_task); diff --git a/dnn/src/x86/conv_bias/int8/avx2_direct_conv_stride1.h b/dnn/src/x86/conv_bias/int8/avx2_direct_conv_stride1.h index 5c192c41..df4db7ef 100644 --- a/dnn/src/x86/conv_bias/int8/avx2_direct_conv_stride1.h +++ b/dnn/src/x86/conv_bias/int8/avx2_direct_conv_stride1.h @@ -20,7 +20,7 @@ using NCBKern = fallback::ConvBiasImpl::NCBKern; using NCBKernSizeParam = fallback::ConvBiasImpl::NCBKernSizeParam; SmallVector get_kimpls(const NCBKernSizeParam& param, - WorkspaceBundle bundle); + const WorkspaceBundle& bundle); } // namespace direct_conv_avx2_stride1 } // namespace x86 diff --git a/dnn/src/x86/conv_bias/int8/avx2_direct_conv_stride2.cpp b/dnn/src/x86/conv_bias/int8/avx2_direct_conv_stride2.cpp index aceb285a..8f581ff2 100644 --- a/dnn/src/x86/conv_bias/int8/avx2_direct_conv_stride2.cpp +++ b/dnn/src/x86/conv_bias/int8/avx2_direct_conv_stride2.cpp @@ -19,7 +19,7 @@ namespace direct_conv_avx2_stride2 { //! layout:(N,IC,IH,IW)-->(N,IC/2,H,2*W_envnW_odd) MEGDNN_ATTRIBUTE_TARGET("sse4.1") -void pack_src_conv_avx2_stride2(WorkspaceBundle bundle, +void pack_src_conv_avx2_stride2(const WorkspaceBundle& bundle, const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernIndex& ncb_index) { int32_t ih = kern_param.isz[0]; @@ -46,7 +46,6 @@ void pack_src_conv_avx2_stride2(WorkspaceBundle bundle, const int8_t* src_ptr = kern_param.src(batch_id, group_id) + ic_step * channel_id * c_stride; - bundle.set(kern_param.workspace_ptr); int8_t* packed_src = static_cast(bundle.get(0)) + batch_id * group * packed_group_size + group_id * packed_group_size + @@ -161,7 +160,7 @@ void pack_src_conv_avx2_stride2(WorkspaceBundle bundle, MEGDNN_ATTRIBUTE_TARGET("sse4.1") static inline void pack_filter_conv_avx2_stride2( - WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param, + const WorkspaceBundle& bundle, const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernIndex& ncb_index) { MEGDNN_MARK_USED_VAR(ncb_index); int32_t oc = kern_param.filter_meta.ocpg; @@ -187,7 +186,6 @@ static inline void pack_filter_conv_avx2_stride2( oc_index_id = ncb_index.ndrange_id[1]; const int8_t* pack_filter_ptr = kern_param.filter(group_id); - bundle.set(kern_param.workspace_ptr); int16_t* out_ptr = static_cast(bundle.get(1)) + group_id * round_up(oc, oc_step) * oc_out_stride; @@ -675,7 +673,7 @@ inline void kernel_handle_oh_remain( #undef cb_switch #undef cb } -void kernel_imp(WorkspaceBundle bundle, +void kernel_imp(const WorkspaceBundle& bundle, const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernIndex& ncb_index) { auto&& fm = kern_param.filter_meta; @@ -708,7 +706,6 @@ void kernel_imp(WorkspaceBundle bundle, batch_id = ncb_index.ndrange_id[1], channel_id = ncb_index.ndrange_id[2]; - bundle.set(kern_param.workspace_ptr); int8_t* src_ptr = static_cast(bundle.get(0)) + group_id * packed_group_size + batch_id * group * packed_group_size; @@ -742,7 +739,7 @@ void kernel_imp(WorkspaceBundle bundle, oc_stride, kern_param); } -void do_post_process(WorkspaceBundle bundle, +void do_post_process(const WorkspaceBundle& bundle, const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernIndex& ncb_index) { auto&& fm = kern_param.filter_meta; @@ -754,7 +751,6 @@ void do_post_process(WorkspaceBundle bundle, size_t group_id = ncb_index.ndrange_id[0], batch_id = ncb_index.ndrange_id[1]; - bundle.set(kern_param.workspace_ptr); bool need_post_process = kern_param.dst_type.enumv() == DTypeEnum::QuantizedS8; void* dst_tptr = nullptr; @@ -801,21 +797,22 @@ void do_post_process(WorkspaceBundle bundle, } SmallVector get_kimpls(const NCBKernSizeParam& kern_param, - WorkspaceBundle bundle) { + const WorkspaceBundle& bundle) { SmallVector ncb_kerns; auto fm = kern_param.filter_meta; size_t N = kern_param.n; size_t IC = kern_param.filter_meta.icpg; size_t OC = kern_param.filter_meta.ocpg; size_t group = fm.group; -#define cb(task) \ - auto task = [bundle, tmp_func]( \ - const ConvBiasImpl::NCBKernParam& kern_param, \ - const ConvBiasImpl::NCBKernIndex& ncb_index) { \ - tmp_func(bundle, kern_param, \ - {ncb_index.thread_id, \ - {ncb_index.ndrange_id[0], ncb_index.ndrange_id[1], \ - ncb_index.ndrange_id[2]}}); \ +#define cb(task) \ + auto task = [bundle = bundle, tmp_func]( \ + const ConvBiasImpl::NCBKernParam& kern_param, \ + const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { \ + bundle.set(kern_param.workspace_ptr); \ + tmp_func(bundle, kern_param, \ + {ncb_index.thread_id, \ + {ncb_index.ndrange_id[0], ncb_index.ndrange_id[1], \ + ncb_index.ndrange_id[2]}}); \ }; auto tmp_func = pack_src_conv_avx2_stride2; cb(pack_src_task); diff --git a/dnn/src/x86/conv_bias/int8/avx2_direct_conv_stride2.h b/dnn/src/x86/conv_bias/int8/avx2_direct_conv_stride2.h index 2d6f45a0..10679c4d 100644 --- a/dnn/src/x86/conv_bias/int8/avx2_direct_conv_stride2.h +++ b/dnn/src/x86/conv_bias/int8/avx2_direct_conv_stride2.h @@ -20,7 +20,7 @@ using NCBKern = fallback::ConvBiasImpl::NCBKern; using NCBKernSizeParam = fallback::ConvBiasImpl::NCBKernSizeParam; SmallVector get_kimpls(const NCBKernSizeParam& param, - WorkspaceBundle bundle); + const WorkspaceBundle& bundle); } // namespace direct_conv_avx2_stride2 } // namespace x86 diff --git a/dnn/src/x86/conv_bias/int8/chanwise_helper.h b/dnn/src/x86/conv_bias/int8/chanwise_helper.h index 362ea082..c800ea79 100644 --- a/dnn/src/x86/conv_bias/int8/chanwise_helper.h +++ b/dnn/src/x86/conv_bias/int8/chanwise_helper.h @@ -48,7 +48,7 @@ static inline void get_rectified_size(const NCBKernSizeParam& param, } static inline void copy_padding_kern( - WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param, + const WorkspaceBundle& bundle, const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernIndex& ncb_index) { size_t IW = kern_param.isz[1]; size_t IH = kern_param.isz[0]; @@ -59,7 +59,6 @@ static inline void copy_padding_kern( get_rectified_size(kern_param, IH2, IW2, OH2, OW2); bool need_src_copy_var = need_src_copy(kern_param); size_t padding_group_size = IH2 * IW2; - bundle.set(kern_param.workspace_ptr); size_t group_id = ncb_index.ndrange_id[0], batch_id = ncb_index.ndrange_id[1],