diff --git a/dnn/src/aarch64/conv_bias/fp16/algos.cpp b/dnn/src/aarch64/conv_bias/fp16/algos.cpp
index 399211f0..002b1006 100644
--- a/dnn/src/aarch64/conv_bias/fp16/algos.cpp
+++ b/dnn/src/aarch64/conv_bias/fp16/algos.cpp
@@ -12,8 +12,8 @@
 
 #include "src/aarch64/conv_bias/fp16/algos.h"
 #include "src/aarch64/conv_bias/fp16/stride2_kern.h"
-#include "src/arm_common/conv_bias/direct/multi_thread_common.h"
 #include "src/arm_common/conv_bias/postprocess_helper.h"
+#include "src/fallback/conv_bias/direct/multi_thread_common.h"
 
 using namespace megdnn;
 using namespace aarch64;
@@ -43,7 +43,7 @@ size_t ConvBiasImpl::AlgoF16DirectStride2::get_workspace(
         const NCBKernSizeParam& param) const {
     MIDOUT_BEGIN(megdnn_aarch64_conv_bias_stride2_conv2357_fp16, 0, 1) {
         bool large_group = param.filter_meta.group >= param.nr_threads;
-        auto wbundle = arm_common::MultithreadDirectConvCommon<
+        auto wbundle = fallback::MultithreadDirectConvCommon<
                 dt_float16, __fp16>::get_bundle_stride(param, large_group);
         return wbundle.total_size_in_bytes();
     }
@@ -83,7 +83,7 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF16DirectStride2::get_kimpl
         conv = fp16::conv_stride2::do_conv_7x7_stride2;
     }
 
-    WorkspaceBundle bundle = arm_common::MultithreadDirectConvCommon<
+    WorkspaceBundle bundle = fallback::MultithreadDirectConvCommon<
             dt_float16, __fp16>::get_bundle_stride(param, large_group);
     SmallVector<NCBKern> ret_kerns;
 
@@ -98,13 +98,13 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF16DirectStride2::get_kimpl
             size_t OC = fm.ocpg;
             bundle.set(kern_param.workspace_ptr);
             for (size_t ic = 0; ic < IC; ic++) {
-                arm_common::MultithreadDirectConvCommon<dt_float16, __fp16>::
+                fallback::MultithreadDirectConvCommon<dt_float16, __fp16>::
                         copy_padding_kern_stride(
                                 bundle, kern_param, ncb_index,
                                 {ncb_index.thread_id, 0, ic});
             }
             for (size_t oc = 0; oc < OC; oc++) {
-                arm_common::MultithreadDirectConvCommon<dt_float16, __fp16>::
+                fallback::MultithreadDirectConvCommon<dt_float16, __fp16>::
                         do_conv_kern_stride(
                                 bundle, kern_param, ncb_index, conv,
                                 {ncb_index.thread_id, 0, oc});
@@ -116,7 +116,7 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF16DirectStride2::get_kimpl
                                   const NCBKernParam& kern_param,
                                   const NCBKernIndex& ncb_index) mutable {
             bundle.set(kern_param.workspace_ptr);
-            arm_common::MultithreadDirectConvCommon<dt_float16, __fp16>::
+            fallback::MultithreadDirectConvCommon<dt_float16, __fp16>::
                     copy_padding_kern_stride(
                             bundle, kern_param, ncb_index, ncb_index.ndrange_id);
         };
@@ -125,7 +125,7 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF16DirectStride2::get_kimpl
                                   const NCBKernParam& kern_param,
                                   const NCBKernIndex& ncb_index) mutable {
             bundle.set(kern_param.workspace_ptr);
-            arm_common::MultithreadDirectConvCommon<dt_float16, __fp16>::
+            fallback::MultithreadDirectConvCommon<dt_float16, __fp16>::
                     do_conv_kern_stride(
                             bundle, kern_param, ncb_index, conv, ncb_index.ndrange_id);
         };
diff --git a/dnn/src/aarch64/conv_bias/fp32/algos.cpp b/dnn/src/aarch64/conv_bias/fp32/algos.cpp
index a5d3358c..ae6bf20a 100644
--- a/dnn/src/aarch64/conv_bias/fp32/algos.cpp
+++ b/dnn/src/aarch64/conv_bias/fp32/algos.cpp
@@ -11,9 +11,9 @@
 
 #include "src/aarch64/conv_bias/fp32/algos.h"
 #include "src/aarch64/conv_bias/fp32/stride2_kern.h"
-#include "src/arm_common/conv_bias/direct/multi_thread_common.h"
 #include "src/arm_common/conv_bias/postprocess_helper.h"
 #include "src/fallback/conv_bias/common.h"
+#include "src/fallback/conv_bias/direct/multi_thread_common.h"
 
 #include "midout.h"
 
@@ -42,8 +42,9 @@ size_t ConvBiasImpl::AlgoF32DirectStride2::get_workspace(
         const NCBKernSizeParam& param) const {
     MIDOUT_BEGIN(megdnn_aarch64_conv_bias_stride2_conv2357_fp32, 0, 1) {
         bool large_group = param.filter_meta.group >= param.nr_threads;
-        auto wbundle = arm_common::MultithreadDirectConvCommon<
-                float, float>::get_bundle_stride(param, large_group);
+        auto wbundle =
+                fallback::MultithreadDirectConvCommon<float, float>::get_bundle_stride(
+                        param, large_group);
         return wbundle.total_size_in_bytes();
     }
     MIDOUT_END();
@@ -82,7 +83,7 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32DirectStride2::get_kimpl
     }
 
     WorkspaceBundle bundle =
-            arm_common::MultithreadDirectConvCommon<float, float>::get_bundle_stride(
+            fallback::MultithreadDirectConvCommon<float, float>::get_bundle_stride(
                     param, large_group);
     SmallVector<NCBKern> ret_kerns;
 
@@ -97,13 +98,13 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32DirectStride2::get_kimpl
             size_t OC = fm.ocpg;
             bundle.set(kern_param.workspace_ptr);
             for (size_t ic = 0; ic < IC; ic++) {
-                arm_common::MultithreadDirectConvCommon<float, float>::
+                fallback::MultithreadDirectConvCommon<float, float>::
                         copy_padding_kern_stride(
                                 bundle, kern_param, ncb_index,
                                 {ncb_index.thread_id, 0, ic});
             }
             for (size_t oc = 0; oc < OC; oc++) {
-                arm_common::MultithreadDirectConvCommon<float, float>::
+                fallback::MultithreadDirectConvCommon<float, float>::
                         do_conv_kern_stride(
                                 bundle, kern_param, ncb_index, conv,
                                 {ncb_index.thread_id, 0, oc});
@@ -115,7 +116,7 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32DirectStride2::get_kimpl
                                   const NCBKernParam& kern_param,
                                   const NCBKernIndex& ncb_index) mutable {
             bundle.set(kern_param.workspace_ptr);
-            arm_common::MultithreadDirectConvCommon<float, float>::
+            fallback::MultithreadDirectConvCommon<float, float>::
                     copy_padding_kern_stride(
                             bundle, kern_param, ncb_index, ncb_index.ndrange_id);
         };
@@ -124,7 +125,7 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32DirectStride2::get_kimpl
                                   const NCBKernParam& kern_param,
                                   const NCBKernIndex& ncb_index) mutable {
             bundle.set(kern_param.workspace_ptr);
-            arm_common::MultithreadDirectConvCommon<float, float>::do_conv_kern_stride(
+            fallback::MultithreadDirectConvCommon<float, float>::do_conv_kern_stride(
                     bundle, kern_param, ncb_index, conv, ncb_index.ndrange_id);
         };
         ret_kerns.push_back({do_conv, {group, N, OC}});
diff --git a/dnn/src/arm_common/conv_bias/f16/algos.cpp b/dnn/src/arm_common/conv_bias/f16/algos.cpp
index d26912aa..f66c3f2b 100644
--- a/dnn/src/arm_common/conv_bias/f16/algos.cpp
+++ b/dnn/src/arm_common/conv_bias/f16/algos.cpp
@@ -10,7 +10,6 @@
  */
 
 #include "src/arm_common/conv_bias/f16/algos.h"
-#include "src/arm_common/conv_bias/direct/multi_thread_common.h"
 #include "src/arm_common/conv_bias/f16/direct.h"
 #include "src/arm_common/conv_bias/f16/do_conv_stride1.h"
 #include "src/arm_common/conv_bias/f16/strategy.h"
@@ -18,6 +17,7 @@
 #include "src/arm_common/conv_bias/postprocess_helper.h"
 #include "src/common/opr_delegate.h"
 #include "src/fallback/conv_bias/common.h"
+#include "src/fallback/conv_bias/direct/multi_thread_common.h"
 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 #include "midout.h"
 MIDOUT_DECL(megdnn_arm_common_winograd_fp16)
@@ -187,8 +187,9 @@ bool ConvBiasImpl::AlgoF16Direct::usable(
 size_t ConvBiasImpl::AlgoF16Direct::get_workspace(const NCBKernSizeParam& param) const {
     MIDOUT_BEGIN(megdnn_arm_common_conv_bias_fp16_kimpl, 0, 1) {
         bool large_group = param.filter_meta.group >= param.nr_threads;
-        auto wbundle = MultithreadDirectConvCommon<dt_float16, __fp16>::get_bundle(
-                param, large_group);
+        auto wbundle =
+                fallback::MultithreadDirectConvCommon<dt_float16, __fp16>::get_bundle(
+                        param, large_group);
         return wbundle.total_size_in_bytes();
     }
     MIDOUT_END();
@@ -204,7 +205,7 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF16Direct::get_kimpls(
     size_t group = fm.group;
     bool large_group = group >= param.nr_threads;
     WorkspaceBundle bundle =
-            MultithreadDirectConvCommon<dt_float16, __fp16>::get_bundle(
+            fallback::MultithreadDirectConvCommon<dt_float16, __fp16>::get_bundle(
                     param, large_group);
     SmallVector<NCBKern> ret_kerns;
     //! When group >= nr_threads, treat it as large_group, each thread process
@@ -220,17 +221,20 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF16Direct::get_kimpls(
             bundle.set(kern_param.workspace_ptr);
             if (fm.should_flip) {
                 for (size_t oc = 0; oc < OC; oc++) {
-                    MultithreadDirectConvCommon<dt_float16, __fp16>::weight_flip_kern(
-                            bundle, kern_param, ncb_index,
-                            {ncb_index.thread_id, 0, oc});
+                    fallback::MultithreadDirectConvCommon<dt_float16, __fp16>::
+                            weight_flip_kern(
+                                    bundle, kern_param, ncb_index,
+                                    {ncb_index.thread_id, 0, oc});
                 }
             }
             for (size_t ic = 0; ic < IC; ic++) {
-                MultithreadDirectConvCommon<dt_float16, __fp16>::copy_padding_kern(
-                        bundle, kern_param, ncb_index, {ncb_index.thread_id, 0, ic});
+                fallback::MultithreadDirectConvCommon<dt_float16, __fp16>::
+                        copy_padding_kern(
+                                bundle, kern_param, ncb_index,
+                                {ncb_index.thread_id, 0, ic});
             }
             for (size_t oc = 0; oc < OC; oc++) {
-                MultithreadDirectConvCommon<dt_float16, __fp16>::do_conv_kern(
+                fallback::MultithreadDirectConvCommon<dt_float16, __fp16>::do_conv_kern(
                         bundle, kern_param, ncb_index, fp16::conv_bias::kern_direct_f16,
                         {ncb_index.thread_id, 0, oc});
             }
@@ -242,8 +246,9 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF16Direct::get_kimpls(
                                        const NCBKernParam& kern_param,
                                        const NCBKernIndex& ncb_index) mutable {
                 bundle.set(kern_param.workspace_ptr);
-                MultithreadDirectConvCommon<dt_float16, __fp16>::weight_flip_kern(
-                        bundle, kern_param, ncb_index, ncb_index.ndrange_id);
+                fallback::MultithreadDirectConvCommon<dt_float16, __fp16>::
+                        weight_flip_kern(
+                                bundle, kern_param, ncb_index, ncb_index.ndrange_id);
             };
             ret_kerns.push_back({weight_flip, {group, 1_z, OC}});
         }
@@ -251,15 +256,16 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF16Direct::get_kimpls(
                                   const NCBKernParam& kern_param,
                                   const NCBKernIndex& ncb_index) mutable {
             bundle.set(kern_param.workspace_ptr);
-            MultithreadDirectConvCommon<dt_float16, __fp16>::copy_padding_kern(
-                    bundle, kern_param, ncb_index, ncb_index.ndrange_id);
+            fallback::MultithreadDirectConvCommon<dt_float16, __fp16>::
+                    copy_padding_kern(
+                            bundle, kern_param, ncb_index, ncb_index.ndrange_id);
         };
         ret_kerns.push_back({copy_padding, {group, N, IC}});
         auto do_conv = [bundle](
                                const NCBKernParam& kern_param,
                                const NCBKernIndex& ncb_index) mutable {
             bundle.set(kern_param.workspace_ptr);
-            MultithreadDirectConvCommon<dt_float16, __fp16>::do_conv_kern(
+            fallback::MultithreadDirectConvCommon<dt_float16, __fp16>::do_conv_kern(
                     bundle, kern_param, ncb_index, fp16::conv_bias::kern_direct_f16,
                     ncb_index.ndrange_id);
         };
@@ -324,9 +330,8 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF16DirectStride1::get_kimpl
     }
     SWITCH_KERN();
 
-    WorkspaceBundle bundle =
-            MultithreadDirectConvCommon<dt_float16, __fp16>::get_bundle_stride(
-                    param, large_group);
+    WorkspaceBundle bundle = fallback::MultithreadDirectConvCommon<
+            dt_float16, __fp16>::get_bundle_stride(param, large_group);
     SmallVector<NCBKern> ret_kerns;
     //! When group >= nr_threads, treat it as large_group, each thread process
     //! one group for better performance
@@ -340,15 +345,16 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF16DirectStride1::get_kimpl
             size_t OC = fm.ocpg;
             bundle.set(kern_param.workspace_ptr);
             for (size_t ic = 0; ic < IC; ic++) {
-                MultithreadDirectConvCommon<dt_float16, __fp16>::
+                fallback::MultithreadDirectConvCommon<dt_float16, __fp16>::
                         copy_padding_kern_stride(
                                 bundle, kern_param, ncb_index,
                                 {ncb_index.thread_id, 0, ic});
             }
             for (size_t oc = 0; oc < OC; oc++) {
-                MultithreadDirectConvCommon<dt_float16, __fp16>::do_conv_kern_stride(
-                        bundle, kern_param, ncb_index, conv_kern_function,
-                        {ncb_index.thread_id, 0, oc});
+                fallback::MultithreadDirectConvCommon<dt_float16, __fp16>::
+                        do_conv_kern_stride(
+                                bundle, kern_param, ncb_index, conv_kern_function,
+                                {ncb_index.thread_id, 0, oc});
             }
         };
         ret_kerns.push_back({exec_one_group, {group, N, 1_z}});
@@ -357,17 +363,19 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF16DirectStride1::get_kimpl
                                   const NCBKernParam& kern_param,
                                   const NCBKernIndex& ncb_index) mutable {
             bundle.set(kern_param.workspace_ptr);
-            MultithreadDirectConvCommon<dt_float16, __fp16>::copy_padding_kern_stride(
-                    bundle, kern_param, ncb_index, ncb_index.ndrange_id);
+            fallback::MultithreadDirectConvCommon<dt_float16, __fp16>::
+                    copy_padding_kern_stride(
+                            bundle, kern_param, ncb_index, ncb_index.ndrange_id);
         };
         ret_kerns.push_back({copy_padding, {group, N, IC}});
         auto do_conv = [bundle, conv_kern_function](
                                const NCBKernParam& kern_param,
                                const NCBKernIndex& ncb_index) mutable {
             bundle.set(kern_param.workspace_ptr);
-            MultithreadDirectConvCommon<dt_float16, __fp16>::do_conv_kern_stride(
-                    bundle, kern_param, ncb_index, conv_kern_function,
-                    ncb_index.ndrange_id);
+            fallback::MultithreadDirectConvCommon<dt_float16, __fp16>::
+                    do_conv_kern_stride(
+                            bundle, kern_param, ncb_index, conv_kern_function,
+                            ncb_index.ndrange_id);
         };
         ret_kerns.push_back({do_conv, {group, N, OC}});
     }
@@ -378,9 +386,8 @@ size_t ConvBiasImpl::AlgoF16DirectStride1::get_workspace(
         const NCBKernSizeParam& param) const {
     MIDOUT_BEGIN(megdnn_arm_common_conv_bias_fp16_kimpl, 1, 1) {
         bool large_group = param.filter_meta.group >= param.nr_threads;
-        auto bundle =
-                MultithreadDirectConvCommon<dt_float16, __fp16>::get_bundle_stride(
-                        param, large_group);
+        auto bundle = fallback::MultithreadDirectConvCommon<
+                dt_float16, __fp16>::get_bundle_stride(param, large_group);
         return bundle.total_size_in_bytes();
     }
     MIDOUT_END();
diff --git a/dnn/src/arm_common/conv_bias/fp32/algos.cpp b/dnn/src/arm_common/conv_bias/fp32/algos.cpp
index ce886905..e65869fa 100644
--- a/dnn/src/arm_common/conv_bias/fp32/algos.cpp
+++ b/dnn/src/arm_common/conv_bias/fp32/algos.cpp
@@ -11,7 +11,6 @@
  */
 
 #include "src/arm_common/conv_bias/fp32/algos.h"
-#include "src/arm_common/conv_bias/direct/multi_thread_common.h"
 #include "src/arm_common/conv_bias/fp32/direct.h"
 #include "src/arm_common/conv_bias/fp32/do_conv_stride1.h"
 #include "src/arm_common/conv_bias/fp32/do_conv_stride2.h"
@@ -20,6 +19,7 @@
 #include "src/arm_common/conv_bias/postprocess_helper.h"
 #include "src/common/opr_delegate.h"
 #include "src/fallback/conv_bias/common.h"
+#include "src/fallback/conv_bias/direct/multi_thread_common.h"
 
 #include "midout.h"
 
@@ -343,7 +343,7 @@ bool ConvBiasImpl::AlgoF32Direct::usable(
 size_t ConvBiasImpl::AlgoF32Direct::get_workspace(const NCBKernSizeParam& param) const {
     MIDOUT_BEGIN(megdnn_arm_common_conv_bias_f32_kimpl, 0, 1) {
         bool large_group = param.filter_meta.group >= param.nr_threads;
-        auto wbundle = MultithreadDirectConvCommon<float, float>::get_bundle(
+        auto wbundle = fallback::MultithreadDirectConvCommon<float, float>::get_bundle(
                 param, large_group);
         return wbundle.total_size_in_bytes();
     }
@@ -359,7 +359,8 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32Direct::get_kimpls(
     size_t group = fm.group;
     bool large_group = group >= param.nr_threads;
     WorkspaceBundle bundle =
-            MultithreadDirectConvCommon<float, float>::get_bundle(param, large_group);
+            fallback::MultithreadDirectConvCommon<float, float>::get_bundle(
+                    param, large_group);
     SmallVector<NCBKern> ret_kerns;
     //! When group >= nr_threads, treat it as large_group, each thread process
     //! one group for better performance
@@ -374,17 +375,18 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32Direct::get_kimpls(
             bundle.set(kern_param.workspace_ptr);
             if (fm.should_flip) {
                 for (size_t oc = 0; oc < OC; oc++) {
-                    MultithreadDirectConvCommon<float, float>::weight_flip_kern(
-                            bundle, kern_param, ncb_index,
-                            {ncb_index.thread_id, 0, oc});
+                    fallback::MultithreadDirectConvCommon<float, float>::
+                            weight_flip_kern(
+                                    bundle, kern_param, ncb_index,
+                                    {ncb_index.thread_id, 0, oc});
                 }
             }
             for (size_t ic = 0; ic < IC; ic++) {
-                MultithreadDirectConvCommon<float, float>::copy_padding_kern(
+                fallback::MultithreadDirectConvCommon<float, float>::copy_padding_kern(
                         bundle, kern_param, ncb_index, {ncb_index.thread_id, 0, ic});
             }
             for (size_t oc = 0; oc < OC; oc++) {
-                MultithreadDirectConvCommon<float, float>::do_conv_kern(
+                fallback::MultithreadDirectConvCommon<float, float>::do_conv_kern(
                         bundle, kern_param, ncb_index, fp32::conv_bias::kern_direct,
                         {ncb_index.thread_id, 0, oc});
             }
@@ -396,7 +398,7 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32Direct::get_kimpls(
                                        const NCBKernParam& kern_param,
                                        const NCBKernIndex& ncb_index) mutable {
                 bundle.set(kern_param.workspace_ptr);
-                MultithreadDirectConvCommon<float, float>::weight_flip_kern(
+                fallback::MultithreadDirectConvCommon<float, float>::weight_flip_kern(
                         bundle, kern_param, ncb_index, ncb_index.ndrange_id);
             };
             ret_kerns.push_back({weight_flip, {group, 1_z, OC}});
@@ -405,7 +407,7 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32Direct::get_kimpls(
                                   const NCBKernParam& kern_param,
                                   const NCBKernIndex& ncb_index) mutable {
             bundle.set(kern_param.workspace_ptr);
-            MultithreadDirectConvCommon<float, float>::copy_padding_kern(
+            fallback::MultithreadDirectConvCommon<float, float>::copy_padding_kern(
                     bundle, kern_param, ncb_index, ncb_index.ndrange_id);
         };
         ret_kerns.push_back({copy_padding, {group, N, IC}});
@@ -413,7 +415,7 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32Direct::get_kimpls(
                                const NCBKernParam& kern_param,
                                const NCBKernIndex& ncb_index) mutable {
             bundle.set(kern_param.workspace_ptr);
-            MultithreadDirectConvCommon<float, float>::do_conv_kern(
+            fallback::MultithreadDirectConvCommon<float, float>::do_conv_kern(
                     bundle, kern_param, ncb_index, fp32::conv_bias::kern_direct,
                     ncb_index.ndrange_id);
         };
@@ -452,8 +454,9 @@ size_t ConvBiasImpl::AlgoF32DirectStride1::get_workspace(
         const NCBKernSizeParam& param) const {
     MIDOUT_BEGIN(megdnn_arm_common_conv_bias_f32_kimpl, 1, 1) {
         bool large_group = param.filter_meta.group >= param.nr_threads;
-        auto bundle = MultithreadDirectConvCommon<float, float>::get_bundle_stride(
-                param, large_group);
+        auto bundle =
+                fallback::MultithreadDirectConvCommon<float, float>::get_bundle_stride(
+                        param, large_group);
         return bundle.total_size_in_bytes();
     }
     MIDOUT_END();
@@ -492,7 +495,7 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32DirectStride1::get_kimpl
     SWITCH_KERN_STR1();
 
     WorkspaceBundle bundle =
-            MultithreadDirectConvCommon<float, float>::get_bundle_stride(
+            fallback::MultithreadDirectConvCommon<float, float>::get_bundle_stride(
                     param, large_group);
     SmallVector<NCBKern> ret_kerns;
     //! When group >= nr_threads, treat it as large_group, each thread process
@@ -507,13 +510,16 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32DirectStride1::get_kimpl
             size_t OC = fm.ocpg;
             bundle.set(kern_param.workspace_ptr);
             for (size_t ic = 0; ic < IC; ic++) {
-                MultithreadDirectConvCommon<float, float>::copy_padding_kern_stride(
-                        bundle, kern_param, ncb_index, {ncb_index.thread_id, 0, ic});
+                fallback::MultithreadDirectConvCommon<float, float>::
+                        copy_padding_kern_stride(
+                                bundle, kern_param, ncb_index,
+                                {ncb_index.thread_id, 0, ic});
             }
             for (size_t oc = 0; oc < OC; oc++) {
-                MultithreadDirectConvCommon<float, float>::do_conv_kern_stride(
-                        bundle, kern_param, ncb_index, conv_kern_function,
-                        {ncb_index.thread_id, 0, oc});
+                fallback::MultithreadDirectConvCommon<float, float>::
+                        do_conv_kern_stride(
+                                bundle, kern_param, ncb_index, conv_kern_function,
+                                {ncb_index.thread_id, 0, oc});
             }
         };
         ret_kerns.push_back({exec_one_group, {group, N, 1_z}});
@@ -522,15 +528,16 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32DirectStride1::get_kimpl
                                   const NCBKernParam& kern_param,
                                   const NCBKernIndex& ncb_index) mutable {
             bundle.set(kern_param.workspace_ptr);
-            MultithreadDirectConvCommon<float, float>::copy_padding_kern_stride(
-                    bundle, kern_param, ncb_index, ncb_index.ndrange_id);
+            fallback::MultithreadDirectConvCommon<float, float>::
+                    copy_padding_kern_stride(
+                            bundle, kern_param, ncb_index, ncb_index.ndrange_id);
         };
         ret_kerns.push_back({copy_padding, {group, N, IC}});
         auto do_conv = [bundle, conv_kern_function](
                                const NCBKernParam& kern_param,
                                const NCBKernIndex& ncb_index) mutable {
             bundle.set(kern_param.workspace_ptr);
-            MultithreadDirectConvCommon<float, float>::do_conv_kern_stride(
+            fallback::MultithreadDirectConvCommon<float, float>::do_conv_kern_stride(
                     bundle, kern_param, ncb_index, conv_kern_function,
                     ncb_index.ndrange_id);
         };
@@ -570,8 +577,9 @@ size_t ConvBiasImpl::AlgoF32DirectStride2::get_workspace(
         const NCBKernSizeParam& param) const {
     MIDOUT_BEGIN(megdnn_arm_common_conv_bias_f32_kimpl, 2, 1) {
         bool large_group = param.filter_meta.group >= param.nr_threads;
-        auto bundle = MultithreadDirectConvCommon<float, float>::get_bundle_stride(
-                param, large_group);
+        auto bundle =
+                fallback::MultithreadDirectConvCommon<float, float>::get_bundle_stride(
+                        param, large_group);
         return bundle.total_size_in_bytes();
     }
     MIDOUT_END();
@@ -609,7 +617,7 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32DirectStride2::get_kimpl
     SWITCH_KERN_STR2();
 
     WorkspaceBundle bundle =
-            MultithreadDirectConvCommon<float, float>::get_bundle_stride(
+            fallback::MultithreadDirectConvCommon<float, float>::get_bundle_stride(
                     param, large_group);
     SmallVector<NCBKern> ret_kerns;
     //! When group >= nr_threads, treat it as large_group, each thread process
@@ -624,13 +632,16 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32DirectStride2::get_kimpl
             size_t OC = fm.ocpg;
             bundle.set(kern_param.workspace_ptr);
             for (size_t ic = 0; ic < IC; ic++) {
-                MultithreadDirectConvCommon<float, float>::copy_padding_kern_stride(
-                        bundle, kern_param, ncb_index, {ncb_index.thread_id, 0, ic});
+                fallback::MultithreadDirectConvCommon<float, float>::
+                        copy_padding_kern_stride(
+                                bundle, kern_param, ncb_index,
+                                {ncb_index.thread_id, 0, ic});
             }
             for (size_t oc = 0; oc < OC; oc++) {
-                MultithreadDirectConvCommon<float, float>::do_conv_kern_stride(
-                        bundle, kern_param, ncb_index, conv_kern_function,
-                        {ncb_index.thread_id, 0, oc});
+                fallback::MultithreadDirectConvCommon<float, float>::
+                        do_conv_kern_stride(
+                                bundle, kern_param, ncb_index, conv_kern_function,
+                                {ncb_index.thread_id, 0, oc});
             }
         };
         ret_kerns.push_back({exec_one_group, {group, N, 1_z}});
@@ -639,15 +650,16 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32DirectStride2::get_kimpl
                                   const NCBKernParam& kern_param,
                                   const NCBKernIndex& ncb_index) mutable {
             bundle.set(kern_param.workspace_ptr);
-            MultithreadDirectConvCommon<float, float>::copy_padding_kern_stride(
-                    bundle, kern_param, ncb_index, ncb_index.ndrange_id);
+            fallback::MultithreadDirectConvCommon<float, float>::
+                    copy_padding_kern_stride(
+                            bundle, kern_param, ncb_index, ncb_index.ndrange_id);
         };
         ret_kerns.push_back({copy_padding, {group, N, IC}});
         auto do_conv = [bundle, conv_kern_function](
                                const NCBKernParam& kern_param,
                                const NCBKernIndex& ncb_index) mutable {
             bundle.set(kern_param.workspace_ptr);
-            MultithreadDirectConvCommon<float, float>::do_conv_kern_stride(
+            fallback::MultithreadDirectConvCommon<float, float>::do_conv_kern_stride(
                     bundle, kern_param, ncb_index, conv_kern_function,
                     ncb_index.ndrange_id);
         };
diff --git a/dnn/src/arm_common/conv_bias/direct/multi_thread_common.cpp b/dnn/src/fallback/conv_bias/direct/multi_thread_common.cpp
similarity index 97%
rename from dnn/src/arm_common/conv_bias/direct/multi_thread_common.cpp
rename to dnn/src/fallback/conv_bias/direct/multi_thread_common.cpp
index ebf43f6e..fc51510e 100644
--- a/dnn/src/arm_common/conv_bias/direct/multi_thread_common.cpp
+++ b/dnn/src/fallback/conv_bias/direct/multi_thread_common.cpp
@@ -1,5 +1,5 @@
 /**
- * \file dnn/src/arm_common/conv_bias/direct/multi_thread_common.cpp
+ * \file dnn/src/fallback/conv_bias/direct/multi_thread_common.cpp
  * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  *
  * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
@@ -9,12 +9,14 @@
  * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  */
 
-#include "src/arm_common/conv_bias/direct/multi_thread_common.h"
-#include "src/arm_common/conv_bias/postprocess_helper.h"
+#include "multi_thread_common.h"
 #include "src/fallback/matrix_mul/opr_impl.h"
 
 using namespace megdnn;
-using namespace arm_common;
+using namespace fallback;
+#if MEGDNN_X86
+using namespace x86;
+#endif
 
 namespace {
 bool need_dst_copy(const megdnn::fallback::ConvBiasImpl::NCBKernSizeParam& param) {
@@ -354,8 +356,8 @@ void MultithreadDirectConvCommon<io_ctype, compute_ctype>::do_conv_kern_stride(
             kern_param.nonlineMode, kern_param.bias_type, kern_param.dst_type, 1_z, 1_z,
             OH, OW);
 };
-template class megdnn::arm_common::MultithreadDirectConvCommon<float, float>;
+template class megdnn::fallback::MultithreadDirectConvCommon<float, float>;
 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template class megdnn::arm_common::MultithreadDirectConvCommon<dt_float16, __fp16>;
+template class megdnn::fallback::MultithreadDirectConvCommon<dt_float16, __fp16>;
 #endif
 // vim: syntax=cpp.doxygen
diff --git a/dnn/src/arm_common/conv_bias/direct/multi_thread_common.h b/dnn/src/fallback/conv_bias/direct/multi_thread_common.h
similarity index 84%
rename from dnn/src/arm_common/conv_bias/direct/multi_thread_common.h
rename to dnn/src/fallback/conv_bias/direct/multi_thread_common.h
index 09d47fe5..4303c408 100644
--- a/dnn/src/arm_common/conv_bias/direct/multi_thread_common.h
+++ b/dnn/src/fallback/conv_bias/direct/multi_thread_common.h
@@ -1,5 +1,5 @@
 /**
- * \file dnn/src/arm_common/conv_bias/direct/multi_thread_common.h
+ * \file dnn/src/fallback/conv_bias/direct/multi_thread_common.h
  * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  *
  * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
@@ -10,11 +10,20 @@
  */
 #pragma once
 
-#include "src/arm_common/conv_bias/opr_impl.h"
+#include "src/fallback/conv_bias/opr_impl.h"
 #include "src/fallback/matrix_mul/opr_impl.h"
 
+#if MEGDNN_X86
+#include "src/x86/conv_bias/postprocess_helper.h"
+#elif (MEGDNN_ARMV7 || MEGDNN_AARCH64)
+#include "src/arm_common/conv_bias/postprocess_helper.h"
+#else
+//! TODO: optimize common postprocess_helper with general intrinsic
+#include "src/common/postprocess_helper.h"
+#endif
+
 namespace megdnn {
-namespace arm_common {
+namespace fallback {
 
 template <typename io_ctype, typename compute_ctype>
 class MultithreadDirectConvCommon {
@@ -53,7 +62,7 @@ public:
             const CpuNDRange& workspace_ids);
 };
 
-} // namespace arm_common
+} // namespace fallback
 } // namespace megdnn
 
 // vim: syntax=cpp.doxygen
diff --git a/dnn/src/fallback/general_intrinsic/gi_float.h b/dnn/src/fallback/general_intrinsic/gi_float.h
index 3dc3efe5..8af5c231 100644
--- a/dnn/src/fallback/general_intrinsic/gi_float.h
+++ b/dnn/src/fallback/general_intrinsic/gi_float.h
@@ -42,7 +42,7 @@ GI_FLOAT32_t GiReintInt32ToFloat32(GI_INT32_t Vector) {
 #elif defined(GI_SSE2_INTRINSICS)
     return _mm_castsi128_ps(Vector);
 #else
-    return (GI_FLOAT32_t)In;
+    return (GI_FLOAT32_t)Vector;
 #endif
 }
 
@@ -53,7 +53,7 @@ GI_FLOAT32_t GiReintUint32ToFloat32(GI_UINT32_t Vector) {
 #elif defined(GI_SSE2_INTRINSICS)
     return _mm_castsi128_ps(Vector);
 #else
-    return (GI_FLOAT32_t)In;
+    return (GI_FLOAT32_t)Vector;
 #endif
 }
diff --git a/dnn/src/fallback/general_intrinsic/gi_int.h b/dnn/src/fallback/general_intrinsic/gi_int.h
index b2d95b56..97181862 100644
--- a/dnn/src/fallback/general_intrinsic/gi_int.h
+++ b/dnn/src/fallback/general_intrinsic/gi_int.h
@@ -1,5 +1,5 @@
 /**
- * \file dnn/src/fallback/general_intrinsic/gi_float.h
+ * \file dnn/src/fallback/general_intrinsic/gi_int.h
  * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  *
  * Copyright (c) 2014-2022 Megvii Inc. All rights reserved.