Browse Source

fix(dnn/arm_common): fix and optimize WorkspaceBundle copying when an algo computes

GitOrigin-RevId: 801aedbd72
release-0.6
Megvii Engine Team 5 years ago
parent
commit
32d91d5e6b
54 changed files with 581 additions and 547 deletions
  1. +9
    -7
      dnn/src/aarch64/conv_bias/fp16/algos.cpp
  2. +9
    -7
      dnn/src/aarch64/conv_bias/fp32/algos.cpp
  3. +9
    -10
      dnn/src/arm_common/conv_bias/direct/multi_thread_common.cpp
  4. +5
    -5
      dnn/src/arm_common/conv_bias/direct/multi_thread_common.h
  5. +18
    -15
      dnn/src/arm_common/conv_bias/f16/algos.cpp
  6. +26
    -22
      dnn/src/arm_common/conv_bias/fp32/algos.cpp
  7. +7
    -7
      dnn/src/arm_common/conv_bias/fp32/f32_direct_nchw44_algo.cpp
  8. +9
    -9
      dnn/src/arm_common/conv_bias/fp32/f32_direct_nchw_nchw44_algo.cpp
  9. +6
    -6
      dnn/src/arm_common/conv_bias/int8/channel_wise_nchw44.cpp
  10. +3
    -3
      dnn/src/arm_common/conv_bias/int8/channel_wise_nchw44.h
  11. +9
    -7
      dnn/src/arm_common/conv_bias/int8/direct_dotprod_nchw44_algo.cpp
  12. +17
    -16
      dnn/src/arm_common/conv_bias/int8/direct_nchw44_algo.cpp
  13. +12
    -12
      dnn/src/arm_common/conv_bias/int8/direct_nchw_nchw44_algo.cpp
  14. +9
    -9
      dnn/src/arm_common/conv_bias/int8/dot_direct_nchw_nchw44_algo.cpp
  15. +13
    -12
      dnn/src/arm_common/conv_bias/int8/stride1.cpp
  16. +3
    -3
      dnn/src/arm_common/conv_bias/int8/stride1.h
  17. +13
    -12
      dnn/src/arm_common/conv_bias/int8/stride1_dotprod.cpp
  18. +4
    -3
      dnn/src/arm_common/conv_bias/int8/stride1_dotprod.h
  19. +13
    -12
      dnn/src/arm_common/conv_bias/int8/stride2.cpp
  20. +4
    -3
      dnn/src/arm_common/conv_bias/int8/stride2.h
  21. +13
    -12
      dnn/src/arm_common/conv_bias/int8/stride2_dotprod.cpp
  22. +4
    -3
      dnn/src/arm_common/conv_bias/int8/stride2_dotprod.h
  23. +22
    -22
      dnn/src/arm_common/conv_bias/int8x8x16/algos.cpp
  24. +4
    -4
      dnn/src/arm_common/conv_bias/int8x8x16/algos.h
  25. +13
    -13
      dnn/src/arm_common/conv_bias/quint8/stride1.cpp
  26. +4
    -3
      dnn/src/arm_common/conv_bias/quint8/stride1.h
  27. +13
    -12
      dnn/src/arm_common/conv_bias/quint8/stride1_dotprod.cpp
  28. +4
    -3
      dnn/src/arm_common/conv_bias/quint8/stride1_dotprod.h
  29. +13
    -12
      dnn/src/arm_common/conv_bias/quint8/stride2.cpp
  30. +4
    -3
      dnn/src/arm_common/conv_bias/quint8/stride2.h
  31. +15
    -13
      dnn/src/arm_common/conv_bias/quint8/stride2_dotprod.cpp
  32. +4
    -3
      dnn/src/arm_common/conv_bias/quint8/stride2_dotprod.h
  33. +29
    -22
      dnn/src/fallback/conv_bias/im2col/algos.cpp
  34. +86
    -81
      dnn/src/fallback/conv_bias/im2col/strategy_base.h
  35. +5
    -5
      dnn/src/fallback/conv_bias/im2col/strategy_default.cpp
  36. +2
    -1
      dnn/src/fallback/conv_bias/im2col/strategy_default_nchw44.cpp
  37. +2
    -1
      dnn/src/fallback/conv_bias/im2col/strategy_fuse_nchw44.cpp
  38. +2
    -2
      dnn/src/fallback/conv_bias/im2col/strategy_fuse_nchw44_dot.cpp
  39. +2
    -1
      dnn/src/fallback/conv_bias/im2col/strategy_fuse_nchw44_fp32_s2.cpp
  40. +5
    -4
      dnn/src/fallback/conv_bias/im2col/strategy_nopack.cpp
  41. +5
    -5
      dnn/src/fallback/conv_bias/im2col/strategy_onlypacka.cpp
  42. +32
    -29
      dnn/src/fallback/conv_bias/winograd/winograd.h
  43. +1
    -4
      dnn/src/naive/handle.h
  44. +47
    -47
      dnn/src/x86/conv_bias/f32/algos.cpp
  45. +5
    -5
      dnn/src/x86/conv_bias/f32/algos.h
  46. +6
    -7
      dnn/src/x86/conv_bias/int8/avx2_chanwise_stride1.cpp
  47. +2
    -2
      dnn/src/x86/conv_bias/int8/avx2_chanwise_stride1.h
  48. +6
    -7
      dnn/src/x86/conv_bias/int8/avx2_chanwise_stride2.cpp
  49. +2
    -2
      dnn/src/x86/conv_bias/int8/avx2_chanwise_stride2.h
  50. +14
    -18
      dnn/src/x86/conv_bias/int8/avx2_direct_conv_stride1.cpp
  51. +1
    -1
      dnn/src/x86/conv_bias/int8/avx2_direct_conv_stride1.h
  52. +14
    -17
      dnn/src/x86/conv_bias/int8/avx2_direct_conv_stride2.cpp
  53. +1
    -1
      dnn/src/x86/conv_bias/int8/avx2_direct_conv_stride2.h
  54. +1
    -2
      dnn/src/x86/conv_bias/int8/chanwise_helper.h

+ 9
- 7
dnn/src/aarch64/conv_bias/fp16/algos.cpp View File

@@ -89,19 +89,20 @@ ConvBiasImpl::AlgoF16DirectStride2::get_kimpls(
conv = fp16::conv_stride2::do_conv_7x7_stride2; conv = fp16::conv_stride2::do_conv_7x7_stride2;
} }


WorkspaceBundle wbundle = arm_common::MultithreadDirectConvCommon<
WorkspaceBundle bundle = arm_common::MultithreadDirectConvCommon<
dt_float16, __fp16>::get_bundle_stride(param, m_large_group); dt_float16, __fp16>::get_bundle_stride(param, m_large_group);
SmallVector<NCBKern> ret_kerns; SmallVector<NCBKern> ret_kerns;


//! Dense conv and small group //! Dense conv and small group
if (m_large_group) { if (m_large_group) {
//! Channel wise conv and big groups //! Channel wise conv and big groups
auto exec_one_group = [wbundle, conv](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
auto exec_one_group = [bundle, conv](
const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) mutable {
auto fm = kern_param.filter_meta; auto fm = kern_param.filter_meta;
size_t IC = fm.icpg; size_t IC = fm.icpg;
size_t OC = fm.ocpg; size_t OC = fm.ocpg;
WorkspaceBundle bundle = wbundle;
bundle.set(kern_param.workspace_ptr);
for (size_t ic = 0; ic < IC; ic++) { for (size_t ic = 0; ic < IC; ic++) {
arm_common::MultithreadDirectConvCommon<dt_float16, __fp16>:: arm_common::MultithreadDirectConvCommon<dt_float16, __fp16>::
copy_padding_kern_stride(bundle, kern_param, ncb_index, copy_padding_kern_stride(bundle, kern_param, ncb_index,
@@ -115,16 +116,17 @@ ConvBiasImpl::AlgoF16DirectStride2::get_kimpls(
}; };
ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); ret_kerns.push_back({exec_one_group, {group, N, 1_z}});
} else { } else {
WorkspaceBundle bundle = wbundle;
auto copy_padding = [bundle](const NCBKernParam& kern_param, auto copy_padding = [bundle](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
const NCBKernIndex& ncb_index) mutable {
bundle.set(kern_param.workspace_ptr);
arm_common::MultithreadDirectConvCommon<dt_float16, __fp16>:: arm_common::MultithreadDirectConvCommon<dt_float16, __fp16>::
copy_padding_kern_stride(bundle, kern_param, ncb_index, copy_padding_kern_stride(bundle, kern_param, ncb_index,
ncb_index.ndrange_id); ncb_index.ndrange_id);
}; };
ret_kerns.push_back({copy_padding, {group, N, IC}}); ret_kerns.push_back({copy_padding, {group, N, IC}});
auto do_conv = [bundle, conv](const NCBKernParam& kern_param, auto do_conv = [bundle, conv](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
const NCBKernIndex& ncb_index) mutable {
bundle.set(kern_param.workspace_ptr);
arm_common::MultithreadDirectConvCommon<dt_float16, __fp16>:: arm_common::MultithreadDirectConvCommon<dt_float16, __fp16>::
do_conv_kern_stride(bundle, kern_param, ncb_index, conv, do_conv_kern_stride(bundle, kern_param, ncb_index, conv,
ncb_index.ndrange_id); ncb_index.ndrange_id);


+ 9
- 7
dnn/src/aarch64/conv_bias/fp32/algos.cpp View File

@@ -88,19 +88,20 @@ ConvBiasImpl::AlgoF32DirectStride2::get_kimpls(
conv = fp32::conv_stride2::do_conv_7x7_stride2; conv = fp32::conv_stride2::do_conv_7x7_stride2;
} }


WorkspaceBundle wbundle = arm_common::MultithreadDirectConvCommon<
WorkspaceBundle bundle = arm_common::MultithreadDirectConvCommon<
float, float>::get_bundle_stride(param, m_large_group); float, float>::get_bundle_stride(param, m_large_group);
SmallVector<NCBKern> ret_kerns; SmallVector<NCBKern> ret_kerns;


//! Dense conv and small group //! Dense conv and small group
if (m_large_group) { if (m_large_group) {
//! Channel wise conv and big groups //! Channel wise conv and big groups
auto exec_one_group = [wbundle, conv](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
auto exec_one_group = [bundle, conv](
const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) mutable {
auto fm = kern_param.filter_meta; auto fm = kern_param.filter_meta;
size_t IC = fm.icpg; size_t IC = fm.icpg;
size_t OC = fm.ocpg; size_t OC = fm.ocpg;
WorkspaceBundle bundle = wbundle;
bundle.set(kern_param.workspace_ptr);
for (size_t ic = 0; ic < IC; ic++) { for (size_t ic = 0; ic < IC; ic++) {
arm_common::MultithreadDirectConvCommon<float, float>:: arm_common::MultithreadDirectConvCommon<float, float>::
copy_padding_kern_stride(bundle, kern_param, ncb_index, copy_padding_kern_stride(bundle, kern_param, ncb_index,
@@ -116,16 +117,17 @@ ConvBiasImpl::AlgoF32DirectStride2::get_kimpls(
}; };
ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); ret_kerns.push_back({exec_one_group, {group, N, 1_z}});
} else { } else {
WorkspaceBundle bundle = wbundle;
auto copy_padding = [bundle](const NCBKernParam& kern_param, auto copy_padding = [bundle](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
const NCBKernIndex& ncb_index) mutable {
bundle.set(kern_param.workspace_ptr);
arm_common::MultithreadDirectConvCommon<float, float>:: arm_common::MultithreadDirectConvCommon<float, float>::
copy_padding_kern_stride(bundle, kern_param, ncb_index, copy_padding_kern_stride(bundle, kern_param, ncb_index,
ncb_index.ndrange_id); ncb_index.ndrange_id);
}; };
ret_kerns.push_back({copy_padding, {group, N, IC}}); ret_kerns.push_back({copy_padding, {group, N, IC}});
auto do_conv = [bundle, conv](const NCBKernParam& kern_param, auto do_conv = [bundle, conv](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
const NCBKernIndex& ncb_index) mutable {
bundle.set(kern_param.workspace_ptr);
arm_common::MultithreadDirectConvCommon< arm_common::MultithreadDirectConvCommon<
float, float>::do_conv_kern_stride(bundle, kern_param, float, float>::do_conv_kern_stride(bundle, kern_param,
ncb_index, conv, ncb_index, conv,


+ 9
- 10
dnn/src/arm_common/conv_bias/direct/multi_thread_common.cpp View File

@@ -119,7 +119,8 @@ MultithreadDirectConvCommon<io_ctype, compute_ctype>::get_bundle_stride(
//! Process one output channel weight flip //! Process one output channel weight flip
template <typename io_ctype, typename compute_ctype> template <typename io_ctype, typename compute_ctype>
void MultithreadDirectConvCommon<io_ctype, compute_ctype>::weight_flip_kern( void MultithreadDirectConvCommon<io_ctype, compute_ctype>::weight_flip_kern(
WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param,
const WorkspaceBundle& bundle,
const ConvBiasImpl::NCBKernParam& kern_param,
const ConvBiasImpl::NCBKernIndex& ncb_index, const ConvBiasImpl::NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids) { const CpuNDRange& workspace_ids) {
size_t FH = kern_param.filter_meta.spatial[0]; size_t FH = kern_param.filter_meta.spatial[0];
@@ -131,7 +132,6 @@ void MultithreadDirectConvCommon<io_ctype, compute_ctype>::weight_flip_kern(
group_id = ncb_index.ndrange_id[0]; group_id = ncb_index.ndrange_id[0];
const io_ctype* filter = const io_ctype* filter =
kern_param.filter<io_ctype>(group_id) + channel_id * FH * FW * IC; kern_param.filter<io_ctype>(group_id) + channel_id * FH * FW * IC;
bundle.set(kern_param.workspace_ptr);
io_ctype* filter_flip = io_ctype* filter_flip =
static_cast<io_ctype*>(bundle.get(1)) + static_cast<io_ctype*>(bundle.get(1)) +
(workspace_group_id * IC * OC + channel_id * IC) * FH * FW; (workspace_group_id * IC * OC + channel_id * IC) * FH * FW;
@@ -148,7 +148,8 @@ void MultithreadDirectConvCommon<io_ctype, compute_ctype>::weight_flip_kern(
//! Process one input channel copy padding //! Process one input channel copy padding
template <typename io_ctype, typename compute_ctype> template <typename io_ctype, typename compute_ctype>
void MultithreadDirectConvCommon<io_ctype, compute_ctype>::copy_padding_kern( void MultithreadDirectConvCommon<io_ctype, compute_ctype>::copy_padding_kern(
WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param,
const WorkspaceBundle& bundle,
const ConvBiasImpl::NCBKernParam& kern_param,
const ConvBiasImpl::NCBKernIndex& ncb_index, const ConvBiasImpl::NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids) { const CpuNDRange& workspace_ids) {
size_t IH = kern_param.isz[0]; size_t IH = kern_param.isz[0];
@@ -161,7 +162,6 @@ void MultithreadDirectConvCommon<io_ctype, compute_ctype>::copy_padding_kern(
size_t padding_group_size = IH2 * IW2 * IC; size_t padding_group_size = IH2 * IW2 * IC;
size_t N = kern_param.n; size_t N = kern_param.n;
size_t GROUP = kern_param.filter_meta.group; size_t GROUP = kern_param.filter_meta.group;
bundle.set(kern_param.workspace_ptr);


//! Used for get the workspace offset //! Used for get the workspace offset
size_t workspace_group_id = workspace_ids[0], size_t workspace_group_id = workspace_ids[0],
@@ -191,7 +191,7 @@ void MultithreadDirectConvCommon<io_ctype, compute_ctype>::copy_padding_kern(
//! Process one input channel copy padding //! Process one input channel copy padding
template <typename io_ctype, typename compute_ctype> template <typename io_ctype, typename compute_ctype>
void MultithreadDirectConvCommon<io_ctype, compute_ctype>:: void MultithreadDirectConvCommon<io_ctype, compute_ctype>::
copy_padding_kern_stride(WorkspaceBundle bundle,
copy_padding_kern_stride(const WorkspaceBundle& bundle,
const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernParam& kern_param,
const ConvBiasImpl::NCBKernIndex& ncb_index, const ConvBiasImpl::NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids) { const CpuNDRange& workspace_ids) {
@@ -208,7 +208,6 @@ void MultithreadDirectConvCommon<io_ctype, compute_ctype>::
size_t GROUP = kern_param.filter_meta.group; size_t GROUP = kern_param.filter_meta.group;
get_rectified_size(kern_param, IH, IW, OH, OW, FH, FW, PH, PW, IH2, IW2, OW2); get_rectified_size(kern_param, IH, IW, OH, OW, FH, FW, PH, PW, IH2, IW2, OW2);
size_t padding_group_size = IH2 * IW2 * IC; size_t padding_group_size = IH2 * IW2 * IC;
bundle.set(kern_param.workspace_ptr);


//! Used for get the workspace offset //! Used for get the workspace offset
size_t workspace_group_id = workspace_ids[0], size_t workspace_group_id = workspace_ids[0],
@@ -235,7 +234,8 @@ void MultithreadDirectConvCommon<io_ctype, compute_ctype>::
//! compute one output channel //! compute one output channel
template <typename io_ctype, typename compute_ctype> template <typename io_ctype, typename compute_ctype>
void MultithreadDirectConvCommon<io_ctype, compute_ctype>::do_conv_kern( void MultithreadDirectConvCommon<io_ctype, compute_ctype>::do_conv_kern(
WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param,
const WorkspaceBundle& bundle,
const ConvBiasImpl::NCBKernParam& kern_param,
const ConvBiasImpl::NCBKernIndex& ncb_index, const ConvBiasImpl::NCBKernIndex& ncb_index,
const kern_direct_conv_f32& fun, const CpuNDRange& workspace_ids) { const kern_direct_conv_f32& fun, const CpuNDRange& workspace_ids) {
size_t OH = kern_param.osz[0]; size_t OH = kern_param.osz[0];
@@ -251,7 +251,6 @@ void MultithreadDirectConvCommon<io_ctype, compute_ctype>::do_conv_kern(
size_t padding_group_size = IH2 * IW2 * IC; size_t padding_group_size = IH2 * IW2 * IC;
size_t N = kern_param.n; size_t N = kern_param.n;
size_t GROUP = kern_param.filter_meta.group; size_t GROUP = kern_param.filter_meta.group;
bundle.set(kern_param.workspace_ptr);


size_t group_id = ncb_index.ndrange_id[0], size_t group_id = ncb_index.ndrange_id[0],
batch_id = ncb_index.ndrange_id[1]; batch_id = ncb_index.ndrange_id[1];
@@ -305,7 +304,8 @@ void MultithreadDirectConvCommon<io_ctype, compute_ctype>::do_conv_kern(
//! compute one output channel //! compute one output channel
template <typename io_ctype, typename compute_ctype> template <typename io_ctype, typename compute_ctype>
void MultithreadDirectConvCommon<io_ctype, compute_ctype>::do_conv_kern_stride( void MultithreadDirectConvCommon<io_ctype, compute_ctype>::do_conv_kern_stride(
WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param,
const WorkspaceBundle& bundle,
const ConvBiasImpl::NCBKernParam& kern_param,
const ConvBiasImpl::NCBKernIndex& ncb_index, const ConvBiasImpl::NCBKernIndex& ncb_index,
const kern_direct_conv_f32_stride& fun, const kern_direct_conv_f32_stride& fun,
const CpuNDRange& workspace_ids) { const CpuNDRange& workspace_ids) {
@@ -323,7 +323,6 @@ void MultithreadDirectConvCommon<io_ctype, compute_ctype>::do_conv_kern_stride(


size_t padding_group_size = IH2 * IW2 * IC; size_t padding_group_size = IH2 * IW2 * IC;
size_t GROUP = kern_param.filter_meta.group; size_t GROUP = kern_param.filter_meta.group;
bundle.set(kern_param.workspace_ptr);


//! Used for get the workspace offset //! Used for get the workspace offset
size_t group_id = ncb_index.ndrange_id[0], size_t group_id = ncb_index.ndrange_id[0],


+ 5
- 5
dnn/src/arm_common/conv_bias/direct/multi_thread_common.h View File

@@ -35,24 +35,24 @@ public:
bool m_large_group); bool m_large_group);
static WorkspaceBundle get_bundle_stride(const NCBKernSizeParam& param, static WorkspaceBundle get_bundle_stride(const NCBKernSizeParam& param,
bool m_large_group); bool m_large_group);
static void weight_flip_kern(WorkspaceBundle bundle,
static void weight_flip_kern(const WorkspaceBundle& bundle,
const NCBKernParam& kern_param, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index, const NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids); const CpuNDRange& workspace_ids);
static void copy_padding_kern(WorkspaceBundle bundle,
static void copy_padding_kern(const WorkspaceBundle& bundle,
const NCBKernParam& kern_param, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index, const NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids); const CpuNDRange& workspace_ids);
static void copy_padding_kern_stride(WorkspaceBundle bundle,
static void copy_padding_kern_stride(const WorkspaceBundle& bundle,
const NCBKernParam& kern_param, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index, const NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids); const CpuNDRange& workspace_ids);
static void do_conv_kern(WorkspaceBundle bundle,
static void do_conv_kern(const WorkspaceBundle& bundle,
const NCBKernParam& kern_param, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index, const NCBKernIndex& ncb_index,
const kern_direct_conv_f32& fun, const kern_direct_conv_f32& fun,
const CpuNDRange& workspace_ids); const CpuNDRange& workspace_ids);
static void do_conv_kern_stride(WorkspaceBundle bundle,
static void do_conv_kern_stride(const WorkspaceBundle& bundle,
const NCBKernParam& kern_param, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index, const NCBKernIndex& ncb_index,
const kern_direct_conv_f32_stride& fun, const kern_direct_conv_f32_stride& fun,


+ 18
- 15
dnn/src/arm_common/conv_bias/f16/algos.cpp View File

@@ -362,7 +362,7 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF16Direct::get_kimpls(
size_t IC = param.filter_meta.icpg; size_t IC = param.filter_meta.icpg;
size_t OC = param.filter_meta.ocpg; size_t OC = param.filter_meta.ocpg;
size_t group = fm.group; size_t group = fm.group;
WorkspaceBundle wbundle =
WorkspaceBundle bundle =
MultithreadDirectConvCommon<dt_float16, __fp16>::get_bundle( MultithreadDirectConvCommon<dt_float16, __fp16>::get_bundle(
param, m_large_group); param, m_large_group);
SmallVector<NCBKern> ret_kerns; SmallVector<NCBKern> ret_kerns;
@@ -370,12 +370,12 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF16Direct::get_kimpls(
//! one group for better performance //! one group for better performance
if (m_large_group) { if (m_large_group) {
//! Channel wise conv and big groups //! Channel wise conv and big groups
auto exec_one_group = [wbundle](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
auto exec_one_group = [bundle](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) mutable {
auto fm = kern_param.filter_meta; auto fm = kern_param.filter_meta;
size_t IC = fm.icpg; size_t IC = fm.icpg;
size_t OC = fm.ocpg; size_t OC = fm.ocpg;
WorkspaceBundle bundle = wbundle;
bundle.set(kern_param.workspace_ptr);
if (fm.should_flip) { if (fm.should_flip) {
for (size_t oc = 0; oc < OC; oc++) { for (size_t oc = 0; oc < OC; oc++) {
MultithreadDirectConvCommon<dt_float16, __fp16>:: MultithreadDirectConvCommon<dt_float16, __fp16>::
@@ -397,10 +397,10 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF16Direct::get_kimpls(
}; };
ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); ret_kerns.push_back({exec_one_group, {group, N, 1_z}});
} else { } else {
WorkspaceBundle bundle = wbundle;
if (fm.should_flip) { if (fm.should_flip) {
auto weight_flip = [bundle](const NCBKernParam& kern_param, auto weight_flip = [bundle](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
const NCBKernIndex& ncb_index) mutable {
bundle.set(kern_param.workspace_ptr);
MultithreadDirectConvCommon<dt_float16, __fp16>:: MultithreadDirectConvCommon<dt_float16, __fp16>::
weight_flip_kern(bundle, kern_param, ncb_index, weight_flip_kern(bundle, kern_param, ncb_index,
ncb_index.ndrange_id); ncb_index.ndrange_id);
@@ -408,13 +408,15 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF16Direct::get_kimpls(
ret_kerns.push_back({weight_flip, {group, 1_z, OC}}); ret_kerns.push_back({weight_flip, {group, 1_z, OC}});
} }
auto copy_padding = [bundle](const NCBKernParam& kern_param, auto copy_padding = [bundle](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
const NCBKernIndex& ncb_index) mutable {
bundle.set(kern_param.workspace_ptr);
MultithreadDirectConvCommon<dt_float16, __fp16>::copy_padding_kern( MultithreadDirectConvCommon<dt_float16, __fp16>::copy_padding_kern(
bundle, kern_param, ncb_index, ncb_index.ndrange_id); bundle, kern_param, ncb_index, ncb_index.ndrange_id);
}; };
ret_kerns.push_back({copy_padding, {group, N, IC}}); ret_kerns.push_back({copy_padding, {group, N, IC}});
auto do_conv = [bundle](const NCBKernParam& kern_param, auto do_conv = [bundle](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
const NCBKernIndex& ncb_index) mutable {
bundle.set(kern_param.workspace_ptr);
MultithreadDirectConvCommon<dt_float16, __fp16>::do_conv_kern( MultithreadDirectConvCommon<dt_float16, __fp16>::do_conv_kern(
bundle, kern_param, ncb_index, bundle, kern_param, ncb_index,
fp16::conv_bias::kern_direct_f16, ncb_index.ndrange_id); fp16::conv_bias::kern_direct_f16, ncb_index.ndrange_id);
@@ -488,7 +490,7 @@ ConvBiasImpl::AlgoF16DirectStride1::get_kimpls(
} }
SWITCH_KERN(); SWITCH_KERN();


WorkspaceBundle wbundle =
WorkspaceBundle bundle =
MultithreadDirectConvCommon<dt_float16, __fp16>::get_bundle_stride( MultithreadDirectConvCommon<dt_float16, __fp16>::get_bundle_stride(
param, m_large_group); param, m_large_group);
SmallVector<NCBKern> ret_kerns; SmallVector<NCBKern> ret_kerns;
@@ -496,13 +498,13 @@ ConvBiasImpl::AlgoF16DirectStride1::get_kimpls(
//! one group for better performance //! one group for better performance
if (m_large_group) { if (m_large_group) {
//! Channel wise conv and big groups //! Channel wise conv and big groups
auto exec_one_group = [wbundle, conv_kern_function](
auto exec_one_group = [bundle, conv_kern_function](
const NCBKernParam& kern_param, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
const NCBKernIndex& ncb_index) mutable {
auto fm = kern_param.filter_meta; auto fm = kern_param.filter_meta;
size_t IC = fm.icpg; size_t IC = fm.icpg;
size_t OC = fm.ocpg; size_t OC = fm.ocpg;
WorkspaceBundle bundle = wbundle;
bundle.set(kern_param.workspace_ptr);
for (size_t ic = 0; ic < IC; ic++) { for (size_t ic = 0; ic < IC; ic++) {
MultithreadDirectConvCommon<dt_float16, __fp16>:: MultithreadDirectConvCommon<dt_float16, __fp16>::
copy_padding_kern_stride(bundle, kern_param, ncb_index, copy_padding_kern_stride(bundle, kern_param, ncb_index,
@@ -517,9 +519,9 @@ ConvBiasImpl::AlgoF16DirectStride1::get_kimpls(
}; };
ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); ret_kerns.push_back({exec_one_group, {group, N, 1_z}});
} else { } else {
WorkspaceBundle bundle = wbundle;
auto copy_padding = [bundle](const NCBKernParam& kern_param, auto copy_padding = [bundle](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
const NCBKernIndex& ncb_index) mutable {
bundle.set(kern_param.workspace_ptr);
MultithreadDirectConvCommon<dt_float16, __fp16>:: MultithreadDirectConvCommon<dt_float16, __fp16>::
copy_padding_kern_stride(bundle, kern_param, ncb_index, copy_padding_kern_stride(bundle, kern_param, ncb_index,
ncb_index.ndrange_id); ncb_index.ndrange_id);
@@ -527,7 +529,8 @@ ConvBiasImpl::AlgoF16DirectStride1::get_kimpls(
ret_kerns.push_back({copy_padding, {group, N, IC}}); ret_kerns.push_back({copy_padding, {group, N, IC}});
auto do_conv = [bundle, conv_kern_function]( auto do_conv = [bundle, conv_kern_function](
const NCBKernParam& kern_param, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
const NCBKernIndex& ncb_index) mutable {
bundle.set(kern_param.workspace_ptr);
MultithreadDirectConvCommon<dt_float16, __fp16>:: MultithreadDirectConvCommon<dt_float16, __fp16>::
do_conv_kern_stride(bundle, kern_param, ncb_index, do_conv_kern_stride(bundle, kern_param, ncb_index,
conv_kern_function, conv_kern_function,


+ 26
- 22
dnn/src/arm_common/conv_bias/fp32/algos.cpp View File

@@ -597,7 +597,7 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32Direct::get_kimpls(
size_t IC = param.filter_meta.icpg; size_t IC = param.filter_meta.icpg;
size_t OC = param.filter_meta.ocpg; size_t OC = param.filter_meta.ocpg;
size_t group = fm.group; size_t group = fm.group;
WorkspaceBundle wbundle =
WorkspaceBundle bundle =
MultithreadDirectConvCommon<float, float>::get_bundle( MultithreadDirectConvCommon<float, float>::get_bundle(
param, m_large_group); param, m_large_group);
SmallVector<NCBKern> ret_kerns; SmallVector<NCBKern> ret_kerns;
@@ -605,12 +605,12 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32Direct::get_kimpls(
//! one group for better performance //! one group for better performance
if (m_large_group) { if (m_large_group) {
//! Channel wise conv and big groups //! Channel wise conv and big groups
auto exec_one_group = [wbundle](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
auto exec_one_group = [bundle](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) mutable {
auto fm = kern_param.filter_meta; auto fm = kern_param.filter_meta;
size_t IC = fm.icpg; size_t IC = fm.icpg;
size_t OC = fm.ocpg; size_t OC = fm.ocpg;
WorkspaceBundle bundle = wbundle;
bundle.set(kern_param.workspace_ptr);
if (fm.should_flip) { if (fm.should_flip) {
for (size_t oc = 0; oc < OC; oc++) { for (size_t oc = 0; oc < OC; oc++) {
MultithreadDirectConvCommon<float, float>::weight_flip_kern( MultithreadDirectConvCommon<float, float>::weight_flip_kern(
@@ -631,23 +631,25 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32Direct::get_kimpls(
}; };
ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); ret_kerns.push_back({exec_one_group, {group, N, 1_z}});
} else { } else {
WorkspaceBundle bundle = wbundle;
if (fm.should_flip) { if (fm.should_flip) {
auto weight_flip = [bundle](const NCBKernParam& kern_param, auto weight_flip = [bundle](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
const NCBKernIndex& ncb_index) mutable {
bundle.set(kern_param.workspace_ptr);
MultithreadDirectConvCommon<float, float>::weight_flip_kern( MultithreadDirectConvCommon<float, float>::weight_flip_kern(
bundle, kern_param, ncb_index, ncb_index.ndrange_id); bundle, kern_param, ncb_index, ncb_index.ndrange_id);
}; };
ret_kerns.push_back({weight_flip, {group, 1_z, OC}}); ret_kerns.push_back({weight_flip, {group, 1_z, OC}});
} }
auto copy_padding = [bundle](const NCBKernParam& kern_param, auto copy_padding = [bundle](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
const NCBKernIndex& ncb_index) mutable {
bundle.set(kern_param.workspace_ptr);
MultithreadDirectConvCommon<float, float>::copy_padding_kern( MultithreadDirectConvCommon<float, float>::copy_padding_kern(
bundle, kern_param, ncb_index, ncb_index.ndrange_id); bundle, kern_param, ncb_index, ncb_index.ndrange_id);
}; };
ret_kerns.push_back({copy_padding, {group, N, IC}}); ret_kerns.push_back({copy_padding, {group, N, IC}});
auto do_conv = [bundle](const NCBKernParam& kern_param, auto do_conv = [bundle](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
const NCBKernIndex& ncb_index) mutable {
bundle.set(kern_param.workspace_ptr);
MultithreadDirectConvCommon<float, float>::do_conv_kern( MultithreadDirectConvCommon<float, float>::do_conv_kern(
bundle, kern_param, ncb_index, fp32::conv_bias::kern_direct, bundle, kern_param, ncb_index, fp32::conv_bias::kern_direct,
ncb_index.ndrange_id); ncb_index.ndrange_id);
@@ -734,7 +736,7 @@ ConvBiasImpl::AlgoF32DirectStride1::get_kimpls(
} }
SWITCH_KERN_STR1(); SWITCH_KERN_STR1();


WorkspaceBundle wbundle =
WorkspaceBundle bundle =
MultithreadDirectConvCommon<float, float>::get_bundle_stride( MultithreadDirectConvCommon<float, float>::get_bundle_stride(
param, m_large_group); param, m_large_group);
SmallVector<NCBKern> ret_kerns; SmallVector<NCBKern> ret_kerns;
@@ -742,13 +744,13 @@ ConvBiasImpl::AlgoF32DirectStride1::get_kimpls(
//! one group for better performance //! one group for better performance
if (m_large_group) { if (m_large_group) {
//! Channel wise conv and big groups //! Channel wise conv and big groups
auto exec_one_group = [wbundle, conv_kern_function](
auto exec_one_group = [bundle, conv_kern_function](
const NCBKernParam& kern_param, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
const NCBKernIndex& ncb_index) mutable {
auto fm = kern_param.filter_meta; auto fm = kern_param.filter_meta;
size_t IC = fm.icpg; size_t IC = fm.icpg;
size_t OC = fm.ocpg; size_t OC = fm.ocpg;
WorkspaceBundle bundle = wbundle;
bundle.set(kern_param.workspace_ptr);
for (size_t ic = 0; ic < IC; ic++) { for (size_t ic = 0; ic < IC; ic++) {
MultithreadDirectConvCommon<float, float>:: MultithreadDirectConvCommon<float, float>::
copy_padding_kern_stride(bundle, kern_param, ncb_index, copy_padding_kern_stride(bundle, kern_param, ncb_index,
@@ -762,16 +764,17 @@ ConvBiasImpl::AlgoF32DirectStride1::get_kimpls(
}; };
ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); ret_kerns.push_back({exec_one_group, {group, N, 1_z}});
} else { } else {
WorkspaceBundle bundle = wbundle;
auto copy_padding = [bundle](const NCBKernParam& kern_param, auto copy_padding = [bundle](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
const NCBKernIndex& ncb_index) mutable {
bundle.set(kern_param.workspace_ptr);
MultithreadDirectConvCommon<float, float>::copy_padding_kern_stride( MultithreadDirectConvCommon<float, float>::copy_padding_kern_stride(
bundle, kern_param, ncb_index, ncb_index.ndrange_id); bundle, kern_param, ncb_index, ncb_index.ndrange_id);
}; };
ret_kerns.push_back({copy_padding, {group, N, IC}}); ret_kerns.push_back({copy_padding, {group, N, IC}});
auto do_conv = [bundle, conv_kern_function]( auto do_conv = [bundle, conv_kern_function](
const NCBKernParam& kern_param, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
const NCBKernIndex& ncb_index) mutable {
bundle.set(kern_param.workspace_ptr);
MultithreadDirectConvCommon<float, float>::do_conv_kern_stride( MultithreadDirectConvCommon<float, float>::do_conv_kern_stride(
bundle, kern_param, ncb_index, conv_kern_function, bundle, kern_param, ncb_index, conv_kern_function,
ncb_index.ndrange_id); ncb_index.ndrange_id);
@@ -859,7 +862,7 @@ ConvBiasImpl::AlgoF32DirectStride2::get_kimpls(
} }
SWITCH_KERN_STR2(); SWITCH_KERN_STR2();


WorkspaceBundle wbundle =
WorkspaceBundle bundle =
MultithreadDirectConvCommon<float, float>::get_bundle_stride( MultithreadDirectConvCommon<float, float>::get_bundle_stride(
param, m_large_group); param, m_large_group);
SmallVector<NCBKern> ret_kerns; SmallVector<NCBKern> ret_kerns;
@@ -867,13 +870,13 @@ ConvBiasImpl::AlgoF32DirectStride2::get_kimpls(
//! one group for better performance //! one group for better performance
if (m_large_group) { if (m_large_group) {
//! Channel wise conv and big groups //! Channel wise conv and big groups
auto exec_one_group = [wbundle, conv_kern_function](
auto exec_one_group = [bundle, conv_kern_function](
const NCBKernParam& kern_param, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
const NCBKernIndex& ncb_index) mutable {
auto fm = kern_param.filter_meta; auto fm = kern_param.filter_meta;
size_t IC = fm.icpg; size_t IC = fm.icpg;
size_t OC = fm.ocpg; size_t OC = fm.ocpg;
WorkspaceBundle bundle = wbundle;
bundle.set(kern_param.workspace_ptr);
for (size_t ic = 0; ic < IC; ic++) { for (size_t ic = 0; ic < IC; ic++) {
MultithreadDirectConvCommon<float, float>:: MultithreadDirectConvCommon<float, float>::
copy_padding_kern_stride(bundle, kern_param, ncb_index, copy_padding_kern_stride(bundle, kern_param, ncb_index,
@@ -887,16 +890,17 @@ ConvBiasImpl::AlgoF32DirectStride2::get_kimpls(
}; };
ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); ret_kerns.push_back({exec_one_group, {group, N, 1_z}});
} else { } else {
WorkspaceBundle bundle = wbundle;
auto copy_padding = [bundle](const NCBKernParam& kern_param, auto copy_padding = [bundle](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
const NCBKernIndex& ncb_index) mutable {
bundle.set(kern_param.workspace_ptr);
MultithreadDirectConvCommon<float, float>::copy_padding_kern_stride( MultithreadDirectConvCommon<float, float>::copy_padding_kern_stride(
bundle, kern_param, ncb_index, ncb_index.ndrange_id); bundle, kern_param, ncb_index, ncb_index.ndrange_id);
}; };
ret_kerns.push_back({copy_padding, {group, N, IC}}); ret_kerns.push_back({copy_padding, {group, N, IC}});
auto do_conv = [bundle, conv_kern_function]( auto do_conv = [bundle, conv_kern_function](
const NCBKernParam& kern_param, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
const NCBKernIndex& ncb_index) mutable {
bundle.set(kern_param.workspace_ptr);
MultithreadDirectConvCommon<float, float>::do_conv_kern_stride( MultithreadDirectConvCommon<float, float>::do_conv_kern_stride(
bundle, kern_param, ncb_index, conv_kern_function, bundle, kern_param, ncb_index, conv_kern_function,
ncb_index.ndrange_id); ncb_index.ndrange_id);


+ 7
- 7
dnn/src/arm_common/conv_bias/fp32/f32_direct_nchw44_algo.cpp View File

@@ -22,7 +22,8 @@
using namespace megdnn; using namespace megdnn;
using namespace arm_common; using namespace arm_common;
using conv_fun = std::function<void( using conv_fun = std::function<void(
WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param,
const WorkspaceBundle& bundle,
const ConvBiasImpl::NCBKernParam& kern_param,
const ConvBiasImpl::NCBKernIndex& ncb_index, const ConvBiasImpl::NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids, const CpuNDRange& ncb_range)>; const CpuNDRange& workspace_ids, const CpuNDRange& ncb_range)>;
MIDOUT_DECL(megdnn_arm_common_conv_bias_fp32_nchw44_stride1) MIDOUT_DECL(megdnn_arm_common_conv_bias_fp32_nchw44_stride1)
@@ -67,7 +68,7 @@ static WorkspaceBundle get_bundle(const ConvBiasImpl::NCBKernSizeParam& param) {
}; };


template <size_t filter, BiasMode bias_mode, typename Op, int stride> template <size_t filter, BiasMode bias_mode, typename Op, int stride>
static void do_conv_kern(WorkspaceBundle bundle,
static void do_conv_kern(const WorkspaceBundle& bundle,
const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernParam& kern_param,
const ConvBiasImpl::NCBKernIndex& ncb_index, const ConvBiasImpl::NCBKernIndex& ncb_index,
const CpuNDRange&, const CpuNDRange&) { const CpuNDRange&, const CpuNDRange&) {
@@ -87,7 +88,6 @@ static void do_conv_kern(WorkspaceBundle bundle,
int oh2 = 0; int oh2 = 0;
int ow2 = 0; int ow2 = 0;
get_rectified_size(kern_param, ih2, iw2, oh2, ow2); get_rectified_size(kern_param, ih2, iw2, oh2, ow2);
bundle.set(kern_param.workspace_ptr);


constexpr int pack_c = 4; constexpr int pack_c = 4;
const int batch_id = ncb_index.ndrange_id[0]; const int batch_id = ncb_index.ndrange_id[0];
@@ -281,7 +281,6 @@ ConvBiasImpl::AlgoF32DirectNCHW44::dispatch_kerns(
megdnn_assert(do_conv_fun); megdnn_assert(do_conv_fun);


SmallVector<ConvBiasImpl::NCBKern> ret_kerns; SmallVector<ConvBiasImpl::NCBKern> ret_kerns;
WorkspaceBundle bundle = wbundle;
int oh = param.osz[0]; int oh = param.osz[0];
int ic = param.filter_meta.icpg; int ic = param.filter_meta.icpg;
int iw = param.isz[1]; int iw = param.isz[1];
@@ -291,10 +290,11 @@ ConvBiasImpl::AlgoF32DirectNCHW44::dispatch_kerns(
CpuNDRange ncb_range = {static_cast<size_t>(batch), CpuNDRange ncb_range = {static_cast<size_t>(batch),
static_cast<size_t>(group), static_cast<size_t>(group),
static_cast<size_t>(div_ceil(oh, oh_block))}; static_cast<size_t>(div_ceil(oh, oh_block))};
auto do_conv = [bundle, do_conv_fun, ncb_range](
auto do_conv = [wbundle, do_conv_fun, ncb_range](
const NCBKernParam& kern_param, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id,
const NCBKernIndex& ncb_index) mutable {
wbundle.set(kern_param.workspace_ptr);
do_conv_fun(wbundle, kern_param, ncb_index, ncb_index.ndrange_id,
ncb_range); ncb_range);
}; };
ret_kerns.push_back({do_conv, ncb_range}); ret_kerns.push_back({do_conv, ncb_range});


+ 9
- 9
dnn/src/arm_common/conv_bias/fp32/f32_direct_nchw_nchw44_algo.cpp View File

@@ -23,7 +23,8 @@
using namespace megdnn; using namespace megdnn;
using namespace arm_common; using namespace arm_common;
using conv_fun = std::function<void( using conv_fun = std::function<void(
WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param,
const WorkspaceBundle& bundle,
const ConvBiasImpl::NCBKernParam& kern_param,
const ConvBiasImpl::NCBKernIndex& ncb_index, const ConvBiasImpl::NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids, const CpuNDRange& ncb_range)>; const CpuNDRange& workspace_ids, const CpuNDRange& ncb_range)>;
MIDOUT_DECL(megdnn_arm_common_conv_bias_fp32_nchw_nchw44) MIDOUT_DECL(megdnn_arm_common_conv_bias_fp32_nchw_nchw44)
@@ -105,10 +106,9 @@ static inline void copy_pad_src(float* sptr_base, const float* sptr_origin,
sptr_base += iw2 * pad_bottom; sptr_base += iw2 * pad_bottom;
} }
} }
static void pack_weight(WorkspaceBundle bundle,
static void pack_weight(const WorkspaceBundle& bundle,
const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernParam& kern_param,
const ConvBiasImpl::NCBKernIndex& ncb_index) { const ConvBiasImpl::NCBKernIndex& ncb_index) {
bundle.set(kern_param.workspace_ptr);
const int group_id = ncb_index.ndrange_id[0]; const int group_id = ncb_index.ndrange_id[0];
int fh = kern_param.filter_meta.spatial[0]; int fh = kern_param.filter_meta.spatial[0];
int fw = kern_param.filter_meta.spatial[1]; int fw = kern_param.filter_meta.spatial[1];
@@ -124,7 +124,7 @@ static void pack_weight(WorkspaceBundle bundle,
} }


template <size_t filter_size, BiasMode bias_mode, typename Op, size_t stride> template <size_t filter_size, BiasMode bias_mode, typename Op, size_t stride>
static void do_conv_kern(WorkspaceBundle bundle,
static void do_conv_kern(const WorkspaceBundle& bundle,
const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernParam& kern_param,
const ConvBiasImpl::NCBKernIndex& ncb_index, const ConvBiasImpl::NCBKernIndex& ncb_index,
const CpuNDRange&, const CpuNDRange&) { const CpuNDRange&, const CpuNDRange&) {
@@ -144,7 +144,6 @@ static void do_conv_kern(WorkspaceBundle bundle,
int oh2 = 0; int oh2 = 0;
int ow2 = 0; int ow2 = 0;
get_rectified_size(kern_param, ih2, iw2, oh2, ow2); get_rectified_size(kern_param, ih2, iw2, oh2, ow2);
bundle.set(kern_param.workspace_ptr);


constexpr int pack_c = 4; constexpr int pack_c = 4;
const int batch_id = ncb_index.ndrange_id[0]; const int batch_id = ncb_index.ndrange_id[0];
@@ -220,7 +219,7 @@ ConvBiasImpl::AlgoF32DirectNCHWNCHW44::dispatch_kerns(
auto fm = param.filter_meta; auto fm = param.filter_meta;
const int batch = param.n; const int batch = param.n;
const int group = fm.group; const int group = fm.group;
WorkspaceBundle wbundle = get_bundle(param);
WorkspaceBundle bundle = get_bundle(param);
conv_fun do_conv_fun = nullptr; conv_fun do_conv_fun = nullptr;
// NOTE: remain_w is not used to gen hash of midout for compatible with // NOTE: remain_w is not used to gen hash of midout for compatible with
// shape runtime // shape runtime
@@ -301,11 +300,11 @@ ConvBiasImpl::AlgoF32DirectNCHWNCHW44::dispatch_kerns(
megdnn_assert(do_conv_fun); megdnn_assert(do_conv_fun);


SmallVector<ConvBiasImpl::NCBKern> ret_kerns; SmallVector<ConvBiasImpl::NCBKern> ret_kerns;
WorkspaceBundle bundle = wbundle;
int oh = param.osz[0]; int oh = param.osz[0];
int oh_block = block_helper(param.nr_threads, oh, 0); int oh_block = block_helper(param.nr_threads, oh, 0);
auto do_pack_weight = [bundle](const NCBKernParam& kern_param, auto do_pack_weight = [bundle](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
const NCBKernIndex& ncb_index) mutable {
bundle.set(kern_param.workspace_ptr);
pack_weight(bundle, kern_param, ncb_index); pack_weight(bundle, kern_param, ncb_index);
}; };
ret_kerns.push_back({do_pack_weight, {static_cast<size_t>(group)}}); ret_kerns.push_back({do_pack_weight, {static_cast<size_t>(group)}});
@@ -314,7 +313,8 @@ ConvBiasImpl::AlgoF32DirectNCHWNCHW44::dispatch_kerns(
static_cast<size_t>(div_ceil(oh, oh_block))}; static_cast<size_t>(div_ceil(oh, oh_block))};
auto do_conv = [bundle, do_conv_fun, ncb_range]( auto do_conv = [bundle, do_conv_fun, ncb_range](
const NCBKernParam& kern_param, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
const NCBKernIndex& ncb_index) mutable {
bundle.set(kern_param.workspace_ptr);
do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id, do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id,
ncb_range); ncb_range);
}; };


+ 6
- 6
dnn/src/arm_common/conv_bias/int8/channel_wise_nchw44.cpp View File

@@ -76,7 +76,7 @@ WorkspaceBundle stride1::get_bundle(


//! compute one output channel //! compute one output channel
template <bool quantized, size_t filter, BiasMode bias_mode, typename Op> template <bool quantized, size_t filter, BiasMode bias_mode, typename Op>
void stride1::do_conv_kern(WorkspaceBundle bundle,
void stride1::do_conv_kern(const WorkspaceBundle& bundle,
const NCBKernParam& kern_param, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) { const NCBKernIndex& ncb_index) {
size_t PH = kern_param.filter_meta.padding[0]; size_t PH = kern_param.filter_meta.padding[0];
@@ -100,7 +100,6 @@ void stride1::do_conv_kern(WorkspaceBundle bundle,


size_t thread_id = ncb_index.thread_id, batch_id = ncb_index.ndrange_id[0]; size_t thread_id = ncb_index.thread_id, batch_id = ncb_index.ndrange_id[0];
size_t group_id = ncb_index.ndrange_id[1]; size_t group_id = ncb_index.ndrange_id[1];
bundle.set(kern_param.workspace_ptr);
int8_t* padding_src = static_cast<int8_t*>(bundle.get(thread_id)); int8_t* padding_src = static_cast<int8_t*>(bundle.get(thread_id));
const int8_t* sptr = const int8_t* sptr =
kern_param.src<dt_int8>(batch_id, group_id, 0, pack_group_size); kern_param.src<dt_int8>(batch_id, group_id, 0, pack_group_size);
@@ -210,7 +209,8 @@ SmallVector<ConvBiasImpl::NCBKern> stride1::get_kimpls(
SmallVector<ConvBiasImpl::NCBKern> ret_kerns; SmallVector<ConvBiasImpl::NCBKern> ret_kerns;
auto exec_one_group = [wbundle, do_conv_fun]( auto exec_one_group = [wbundle, do_conv_fun](
const NCBKernParam& kern_param, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
const NCBKernIndex& ncb_index) mutable {
wbundle.set(kern_param.workspace_ptr);
do_conv_fun(wbundle, kern_param, ncb_index); do_conv_fun(wbundle, kern_param, ncb_index);
}; };
ret_kerns.push_back({exec_one_group, {N, group}}); ret_kerns.push_back({exec_one_group, {N, group}});
@@ -253,7 +253,7 @@ WorkspaceBundle stride2::get_bundle(


//! compute one output channel //! compute one output channel
template <bool quantized, size_t filter, BiasMode bias_mode, typename Op> template <bool quantized, size_t filter, BiasMode bias_mode, typename Op>
void stride2::do_conv_kern(WorkspaceBundle bundle,
void stride2::do_conv_kern(const WorkspaceBundle& bundle,
const NCBKernParam& kern_param, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) { const NCBKernIndex& ncb_index) {
size_t PH = kern_param.filter_meta.padding[0]; size_t PH = kern_param.filter_meta.padding[0];
@@ -277,7 +277,6 @@ void stride2::do_conv_kern(WorkspaceBundle bundle,


size_t thread_id = ncb_index.thread_id, batch_id = ncb_index.ndrange_id[0]; size_t thread_id = ncb_index.thread_id, batch_id = ncb_index.ndrange_id[0];
size_t group_id = ncb_index.ndrange_id[1]; size_t group_id = ncb_index.ndrange_id[1];
bundle.set(kern_param.workspace_ptr);
int8_t* padding_src = static_cast<int8_t*>(bundle.get(thread_id)); int8_t* padding_src = static_cast<int8_t*>(bundle.get(thread_id));
const int8_t* sptr = const int8_t* sptr =
kern_param.src<dt_int8>(batch_id, group_id, 0, pack_group_size); kern_param.src<dt_int8>(batch_id, group_id, 0, pack_group_size);
@@ -325,7 +324,8 @@ SmallVector<ConvBiasImpl::NCBKern> stride2::get_kimpls(
SmallVector<ConvBiasImpl::NCBKern> ret_kerns; SmallVector<ConvBiasImpl::NCBKern> ret_kerns;
auto exec_one_group = [wbundle, do_conv_fun]( auto exec_one_group = [wbundle, do_conv_fun](
const NCBKernParam& kern_param, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
const NCBKernIndex& ncb_index) mutable {
wbundle.set(kern_param.workspace_ptr);
do_conv_fun(wbundle, kern_param, ncb_index); do_conv_fun(wbundle, kern_param, ncb_index);
}; };
ret_kerns.push_back({exec_one_group, {N, group}}); ret_kerns.push_back({exec_one_group, {N, group}});


+ 3
- 3
dnn/src/arm_common/conv_bias/int8/channel_wise_nchw44.h View File

@@ -21,7 +21,7 @@ using NCBKernSizeParam = fallback::ConvBiasImpl::NCBKernSizeParam;
using NCBKernParam = fallback::ConvBiasImpl::NCBKernParam; using NCBKernParam = fallback::ConvBiasImpl::NCBKernParam;
using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex; using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex;


using conv_fun = std::function<void(WorkspaceBundle bundle,
using conv_fun = std::function<void(const WorkspaceBundle& bundle,
const NCBKernParam& kern_param, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index)>; const NCBKernIndex& ncb_index)>;


@@ -32,7 +32,7 @@ bool is_available(const NCBKernSizeParam& param);
WorkspaceBundle get_bundle(const NCBKernSizeParam& param); WorkspaceBundle get_bundle(const NCBKernSizeParam& param);


template <bool quantized, size_t filter, BiasMode bias_mode, typename Op> template <bool quantized, size_t filter, BiasMode bias_mode, typename Op>
void do_conv_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param,
void do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index); const NCBKernIndex& ncb_index);


SmallVector<ConvBiasImpl::NCBKern> get_kimpls(const NCBKernSizeParam& param); SmallVector<ConvBiasImpl::NCBKern> get_kimpls(const NCBKernSizeParam& param);
@@ -44,7 +44,7 @@ bool is_available(const NCBKernSizeParam& param);
WorkspaceBundle get_bundle(const NCBKernSizeParam& param); WorkspaceBundle get_bundle(const NCBKernSizeParam& param);


template <bool quantized, size_t filter, BiasMode bias_mode, typename Op> template <bool quantized, size_t filter, BiasMode bias_mode, typename Op>
void do_conv_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param,
void do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index); const NCBKernIndex& ncb_index);


SmallVector<ConvBiasImpl::NCBKern> get_kimpls(const NCBKernSizeParam& param); SmallVector<ConvBiasImpl::NCBKern> get_kimpls(const NCBKernSizeParam& param);


+ 9
- 7
dnn/src/arm_common/conv_bias/int8/direct_dotprod_nchw44_algo.cpp View File

@@ -24,9 +24,10 @@ using namespace arm_common;


MIDOUT_DECL(megdnn_arm_common_conv_bias_int8) MIDOUT_DECL(megdnn_arm_common_conv_bias_int8)


using direct_fun = std::function<void(
WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& ncb_param,
const ConvBiasImpl::NCBKernIndex& ncb_index)>;
using direct_fun =
std::function<void(const WorkspaceBundle& bundle,
const ConvBiasImpl::NCBKernParam& ncb_param,
const ConvBiasImpl::NCBKernIndex& ncb_index)>;


namespace { namespace {


@@ -71,7 +72,7 @@ static WorkspaceBundle get_bundle(const ConvBiasImpl::NCBKernSizeParam& param) {


template <typename dst_type, size_t filter_size, BiasMode bias_mode, template <typename dst_type, size_t filter_size, BiasMode bias_mode,
typename Op, int stride> typename Op, int stride>
static void conv_kern(WorkspaceBundle bundle,
static void conv_kern(const WorkspaceBundle& bundle,
const ConvBiasImpl::NCBKernParam& ncb_param, const ConvBiasImpl::NCBKernParam& ncb_param,
const ConvBiasImpl::NCBKernIndex& ncb_index) { const ConvBiasImpl::NCBKernIndex& ncb_index) {
const int OH = ncb_param.osz[0]; const int OH = ncb_param.osz[0];
@@ -93,7 +94,6 @@ static void conv_kern(WorkspaceBundle bundle,


constexpr int IC_PACK_SIZE = 4; constexpr int IC_PACK_SIZE = 4;
constexpr int OC_PACK_SIZE = 4; constexpr int OC_PACK_SIZE = 4;
bundle.set(ncb_param.workspace_ptr);


const int batch_id = ncb_index.ndrange_id[0]; const int batch_id = ncb_index.ndrange_id[0];
const int group_id = ncb_index.ndrange_id[1]; const int group_id = ncb_index.ndrange_id[1];
@@ -326,8 +326,10 @@ ConvBiasImpl::AlgoDotS8Direct_NCHW44::dispatch_kerns(
IC * IW * sizeof(int8_t) * 2); IC * IW * sizeof(int8_t) * 2);
size_t oh_tiles = static_cast<size_t>(div_ceil(OH, oh_tile_size)); size_t oh_tiles = static_cast<size_t>(div_ceil(OH, oh_tile_size));


auto do_conv = [wbundle, kernel](const NCBKernParam& ncb_param,
const NCBKernIndex& ncb_index) {
auto do_conv = [wbundle, kernel](
const NCBKernParam& ncb_param,
const NCBKernIndex& ncb_index) mutable {
wbundle.set(ncb_param.workspace_ptr);
kernel(wbundle, ncb_param, std::move(ncb_index)); kernel(wbundle, ncb_param, std::move(ncb_index));
}; };




+ 17
- 16
dnn/src/arm_common/conv_bias/int8/direct_nchw44_algo.cpp View File

@@ -23,7 +23,8 @@
using namespace megdnn; using namespace megdnn;
using namespace arm_common; using namespace arm_common;
using conv_fun = std::function<void( using conv_fun = std::function<void(
WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param,
const WorkspaceBundle& bundle,
const ConvBiasImpl::NCBKernParam& kern_param,
const ConvBiasImpl::NCBKernIndex& ncb_index, const ConvBiasImpl::NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids, const CpuNDRange& ncb_range)>; const CpuNDRange& workspace_ids, const CpuNDRange& ncb_range)>;
MIDOUT_DECL(megdnn_arm_common_conv_bias_int8_nchw44) MIDOUT_DECL(megdnn_arm_common_conv_bias_int8_nchw44)
@@ -64,7 +65,7 @@ static WorkspaceBundle get_bundle(const ConvBiasImpl::NCBKernSizeParam& param) {
} }
}; };


static void copy_padding_kern(WorkspaceBundle bundle,
static void copy_padding_kern(const WorkspaceBundle& bundle,
const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernParam& kern_param,
const ConvBiasImpl::NCBKernIndex& ncb_index, const ConvBiasImpl::NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids) { const CpuNDRange& workspace_ids) {
@@ -78,7 +79,6 @@ static void copy_padding_kern(WorkspaceBundle bundle,
int IH2, IW2; int IH2, IW2;
get_rectified_size(kern_param, IH2, IW2); get_rectified_size(kern_param, IH2, IW2);
int padding_group_size = IH2 * IW2 * IC; int padding_group_size = IH2 * IW2 * IC;
bundle.set(kern_param.workspace_ptr);
//! Used for get the workspace offset //! Used for get the workspace offset
constexpr int pack_ic = 4; constexpr int pack_ic = 4;
constexpr int expend_element = 4; constexpr int expend_element = 4;
@@ -128,7 +128,7 @@ static void copy_padding_kern(WorkspaceBundle bundle,


template <size_t filter, BiasMode bias_mode, typename Op, int ow_remain, template <size_t filter, BiasMode bias_mode, typename Op, int ow_remain,
typename DstType, int stride> typename DstType, int stride>
static void do_conv_kern(WorkspaceBundle bundle,
static void do_conv_kern(const WorkspaceBundle& bundle,
const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernParam& kern_param,
const ConvBiasImpl::NCBKernIndex& ncb_index, const ConvBiasImpl::NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids, const CpuNDRange& workspace_ids,
@@ -153,7 +153,6 @@ static void do_conv_kern(WorkspaceBundle bundle,
op = Op(scale_bias, scale_dst); op = Op(scale_bias, scale_dst);
} }
size_t padding_group_size = IH2 * IW2 * IC; size_t padding_group_size = IH2 * IW2 * IC;
bundle.set(kern_param.workspace_ptr);


constexpr size_t pack_c = 4; constexpr size_t pack_c = 4;
constexpr size_t src_expand_size = 4; constexpr size_t src_expand_size = 4;
@@ -375,7 +374,6 @@ ConvBiasImpl::AlgoS8DirectNCHW44::dispatch_kerns(
megdnn_assert(do_conv_fun); megdnn_assert(do_conv_fun);


SmallVector<ConvBiasImpl::NCBKern> ret_kerns; SmallVector<ConvBiasImpl::NCBKern> ret_kerns;
WorkspaceBundle bundle = wbundle;


constexpr size_t pack_oc = 4; constexpr size_t pack_oc = 4;
size_t oc_step = pack_oc; size_t oc_step = pack_oc;
@@ -384,28 +382,31 @@ ConvBiasImpl::AlgoS8DirectNCHW44::dispatch_kerns(
} }
if (group == 1) { if (group == 1) {
CpuNDRange ncb_range = {N, group, div_ceil(OC, oc_step)}; CpuNDRange ncb_range = {N, group, div_ceil(OC, oc_step)};
auto copy_padding = [bundle](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
copy_padding_kern(bundle, kern_param, ncb_index,
auto copy_padding = [wbundle](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) mutable {
wbundle.set(kern_param.workspace_ptr);
copy_padding_kern(wbundle, kern_param, ncb_index,
ncb_index.ndrange_id); ncb_index.ndrange_id);
}; };
constexpr size_t pack_ic = 4; constexpr size_t pack_ic = 4;
ret_kerns.push_back({copy_padding, {N, group, div_ceil(IC, pack_ic)}}); ret_kerns.push_back({copy_padding, {N, group, div_ceil(IC, pack_ic)}});
auto do_conv = [bundle, do_conv_fun, ncb_range](
auto do_conv = [wbundle, do_conv_fun, ncb_range](
const NCBKernParam& kern_param, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id,
const NCBKernIndex& ncb_index) mutable {
wbundle.set(kern_param.workspace_ptr);
do_conv_fun(wbundle, kern_param, ncb_index, ncb_index.ndrange_id,
ncb_range); ncb_range);
}; };
ret_kerns.push_back({do_conv, ncb_range}); ret_kerns.push_back({do_conv, ncb_range});
} else { } else {
CpuNDRange ncb_range = {N, group, 1}; CpuNDRange ncb_range = {N, group, 1};
auto do_conv = [bundle, do_conv_fun, ncb_range](
auto do_conv = [wbundle, do_conv_fun, ncb_range](
const NCBKernParam& kern_param, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
copy_padding_kern(bundle, kern_param, ncb_index,
const NCBKernIndex& ncb_index) mutable {
wbundle.set(kern_param.workspace_ptr);
copy_padding_kern(wbundle, kern_param, ncb_index,
{0, ncb_index.thread_id, 0}); {0, ncb_index.thread_id, 0});
do_conv_fun(bundle, kern_param, ncb_index,
do_conv_fun(wbundle, kern_param, ncb_index,
{0, ncb_index.thread_id, 0}, ncb_range); {0, ncb_index.thread_id, 0}, ncb_range);
}; };
ret_kerns.push_back({do_conv, ncb_range}); ret_kerns.push_back({do_conv, ncb_range});


+ 12
- 12
dnn/src/arm_common/conv_bias/int8/direct_nchw_nchw44_algo.cpp View File

@@ -22,7 +22,8 @@
using namespace megdnn; using namespace megdnn;
using namespace arm_common; using namespace arm_common;
using conv_fun = std::function<void( using conv_fun = std::function<void(
WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param,
const WorkspaceBundle& bundle,
const ConvBiasImpl::NCBKernParam& kern_param,
const ConvBiasImpl::NCBKernIndex& ncb_index, const ConvBiasImpl::NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids, const CpuNDRange& ncb_range)>; const CpuNDRange& workspace_ids, const CpuNDRange& ncb_range)>;
MIDOUT_DECL(megdnn_arm_common_conv_bias_int8_nchw_nchw44) MIDOUT_DECL(megdnn_arm_common_conv_bias_int8_nchw_nchw44)
@@ -77,7 +78,7 @@ static WorkspaceBundle get_bundle(const ConvBiasImpl::NCBKernSizeParam& param) {
return {nullptr, {src_size, weight_size, tmp_size * param.nr_threads}}; return {nullptr, {src_size, weight_size, tmp_size * param.nr_threads}};
}; };


static void copy_padding_kern(WorkspaceBundle bundle,
static void copy_padding_kern(const WorkspaceBundle& bundle,
const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernParam& kern_param,
const ConvBiasImpl::NCBKernIndex& ncb_index, const ConvBiasImpl::NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids) { const CpuNDRange& workspace_ids) {
@@ -92,7 +93,6 @@ static void copy_padding_kern(WorkspaceBundle bundle,
int ih2, iw2, oh2, ow2; int ih2, iw2, oh2, ow2;
get_rectified_size(kern_param, ih2, iw2, oh2, ow2); get_rectified_size(kern_param, ih2, iw2, oh2, ow2);
int padding_group_size = ih2 * iw2 * ic; int padding_group_size = ih2 * iw2 * ic;
bundle.set(kern_param.workspace_ptr);
//! Used for get the workspace offset //! Used for get the workspace offset
const int src_expand = stride_h == 2 ? 4 : 16; const int src_expand = stride_h == 2 ? 4 : 16;


@@ -124,10 +124,9 @@ static void copy_padding_kern(WorkspaceBundle bundle,
iw, iw2, pw, nullptr); iw, iw2, pw, nullptr);
} }
} }
static void pack_weight(WorkspaceBundle bundle,
static void pack_weight(const WorkspaceBundle& bundle,
const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernParam& kern_param,
const ConvBiasImpl::NCBKernIndex& ncb_index) { const ConvBiasImpl::NCBKernIndex& ncb_index) {
bundle.set(kern_param.workspace_ptr);
const int group_id = ncb_index.ndrange_id[0]; const int group_id = ncb_index.ndrange_id[0];
int fh = kern_param.filter_meta.spatial[0]; int fh = kern_param.filter_meta.spatial[0];
int fw = kern_param.filter_meta.spatial[1]; int fw = kern_param.filter_meta.spatial[1];
@@ -151,7 +150,7 @@ static void pack_weight(WorkspaceBundle bundle,
} }
} }
template <size_t filter, BiasMode bias_mode, typename Op, int stride> template <size_t filter, BiasMode bias_mode, typename Op, int stride>
static void do_conv_kern(WorkspaceBundle bundle,
static void do_conv_kern(const WorkspaceBundle& bundle,
const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernParam& kern_param,
const ConvBiasImpl::NCBKernIndex& ncb_index, const ConvBiasImpl::NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids, const CpuNDRange& workspace_ids,
@@ -177,7 +176,6 @@ static void do_conv_kern(WorkspaceBundle bundle,
op = Op(scale_bias, scale_dst); op = Op(scale_bias, scale_dst);
} }
int padding_group_size = ih2 * iw2 * ic; int padding_group_size = ih2 * iw2 * ic;
bundle.set(kern_param.workspace_ptr);


constexpr int pack_c = 4; constexpr int pack_c = 4;
constexpr int src_expand_size = stride == 2 ? 4 : 16; constexpr int src_expand_size = stride == 2 ? 4 : 16;
@@ -258,7 +256,7 @@ ConvBiasImpl::AlgoS8DirectNCHWNCHW44::dispatch_kerns(
size_t N = param.n; size_t N = param.n;
size_t OC = fm.ocpg; size_t OC = fm.ocpg;
size_t group = fm.group; size_t group = fm.group;
WorkspaceBundle wbundle = get_bundle(param);
WorkspaceBundle bundle = get_bundle(param);
conv_fun do_conv_fun = nullptr; conv_fun do_conv_fun = nullptr;
// NOTE: remain_w is not used to gen hash of midout for compatible with changing // NOTE: remain_w is not used to gen hash of midout for compatible with changing
// shape runtime // shape runtime
@@ -342,18 +340,19 @@ ConvBiasImpl::AlgoS8DirectNCHWNCHW44::dispatch_kerns(
megdnn_assert(do_conv_fun); megdnn_assert(do_conv_fun);


SmallVector<ConvBiasImpl::NCBKern> ret_kerns; SmallVector<ConvBiasImpl::NCBKern> ret_kerns;
WorkspaceBundle bundle = wbundle;


constexpr size_t pack_oc = 8; constexpr size_t pack_oc = 8;
size_t oc_step = pack_oc; size_t oc_step = pack_oc;
auto copy_padding = [bundle](const NCBKernParam& kern_param, auto copy_padding = [bundle](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
const NCBKernIndex& ncb_index) mutable {
bundle.set(kern_param.workspace_ptr);
copy_padding_kern(bundle, kern_param, ncb_index, ncb_index.ndrange_id); copy_padding_kern(bundle, kern_param, ncb_index, ncb_index.ndrange_id);
}; };
ret_kerns.push_back({copy_padding, {N, group, fm.icpg}}); ret_kerns.push_back({copy_padding, {N, group, fm.icpg}});


auto do_pack_weight = [bundle](const NCBKernParam& kern_param, auto do_pack_weight = [bundle](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
const NCBKernIndex& ncb_index) mutable {
bundle.set(kern_param.workspace_ptr);
pack_weight(bundle, kern_param, ncb_index); pack_weight(bundle, kern_param, ncb_index);
}; };
ret_kerns.push_back({do_pack_weight, {static_cast<size_t>(group)}}); ret_kerns.push_back({do_pack_weight, {static_cast<size_t>(group)}});
@@ -361,7 +360,8 @@ ConvBiasImpl::AlgoS8DirectNCHWNCHW44::dispatch_kerns(
CpuNDRange ncb_range = {N, group, div_ceil(OC, oc_step)}; CpuNDRange ncb_range = {N, group, div_ceil(OC, oc_step)};
auto do_conv = [bundle, do_conv_fun, ncb_range]( auto do_conv = [bundle, do_conv_fun, ncb_range](
const NCBKernParam& kern_param, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
const NCBKernIndex& ncb_index) mutable {
bundle.set(kern_param.workspace_ptr);
do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id, do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id,
ncb_range); ncb_range);
}; };


+ 9
- 9
dnn/src/arm_common/conv_bias/int8/dot_direct_nchw_nchw44_algo.cpp View File

@@ -22,7 +22,8 @@
using namespace megdnn; using namespace megdnn;
using namespace arm_common; using namespace arm_common;
using conv_fun = std::function<void( using conv_fun = std::function<void(
WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param,
const WorkspaceBundle& bundle,
const ConvBiasImpl::NCBKernParam& kern_param,
const ConvBiasImpl::NCBKernIndex& ncb_index, const ConvBiasImpl::NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids, const CpuNDRange& ncb_range)>; const CpuNDRange& workspace_ids, const CpuNDRange& ncb_range)>;
MIDOUT_DECL(megdnn_arm_common_conv_bias_int8_nchw44_dot) MIDOUT_DECL(megdnn_arm_common_conv_bias_int8_nchw44_dot)
@@ -82,7 +83,7 @@ static WorkspaceBundle get_bundle(const ConvBiasImpl::NCBKernSizeParam& param) {
temp_size * param.nr_threads}}; temp_size * param.nr_threads}};
}; };


void do_weight_trans(WorkspaceBundle bundle,
void do_weight_trans(const WorkspaceBundle& bundle,
const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernParam& kern_param,
const ConvBiasImpl::NCBKernIndex&, const CpuNDRange&) { const ConvBiasImpl::NCBKernIndex&, const CpuNDRange&) {
const int ic = kern_param.filter_meta.icpg; const int ic = kern_param.filter_meta.icpg;
@@ -90,7 +91,6 @@ void do_weight_trans(WorkspaceBundle bundle,
const int fh = kern_param.filter_meta.spatial[0]; const int fh = kern_param.filter_meta.spatial[0];
const int fw = kern_param.filter_meta.spatial[1]; const int fw = kern_param.filter_meta.spatial[1];
const int fw2 = round_up(fw, 4); const int fw2 = round_up(fw, 4);
bundle.set(kern_param.workspace_ptr);
auto packed_weight = reinterpret_cast<int8_t*>(bundle.get(1)); auto packed_weight = reinterpret_cast<int8_t*>(bundle.get(1));
auto origin_weight = kern_param.filter<dt_int8>(); auto origin_weight = kern_param.filter<dt_int8>();
pack_weight_int8_nchw_nchw44_dot(packed_weight, origin_weight, oc, ic, fh, pack_weight_int8_nchw_nchw44_dot(packed_weight, origin_weight, oc, ic, fh,
@@ -98,7 +98,7 @@ void do_weight_trans(WorkspaceBundle bundle,
} }


template <size_t filter, BiasMode bias_mode, typename Op, int stride> template <size_t filter, BiasMode bias_mode, typename Op, int stride>
static void do_conv_kern(WorkspaceBundle bundle,
static void do_conv_kern(const WorkspaceBundle& bundle,
const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernParam& kern_param,
const ConvBiasImpl::NCBKernIndex& ncb_index, const ConvBiasImpl::NCBKernIndex& ncb_index,
const CpuNDRange&, const CpuNDRange&) { const CpuNDRange&, const CpuNDRange&) {
@@ -117,7 +117,6 @@ static void do_conv_kern(WorkspaceBundle bundle,
int ih2 = 0; int ih2 = 0;
int iw2 = 0; int iw2 = 0;
get_rectified_size(kern_param, ih2, iw2); get_rectified_size(kern_param, ih2, iw2);
bundle.set(kern_param.workspace_ptr);


constexpr int pack_c = 4; constexpr int pack_c = 4;
const int batch_id = ncb_index.ndrange_id[0]; const int batch_id = ncb_index.ndrange_id[0];
@@ -205,7 +204,7 @@ ConvBiasImpl::AlgoDotS8DirectNCHWNCHW44::dispatch_kerns(
auto fm = param.filter_meta; auto fm = param.filter_meta;
const int batch = param.n; const int batch = param.n;
const int group = fm.group; const int group = fm.group;
WorkspaceBundle wbundle = get_bundle(param);
WorkspaceBundle bundle = get_bundle(param);
conv_fun do_conv_fun = nullptr; conv_fun do_conv_fun = nullptr;
// NOTE: remain_w is not used to gen hash of midout for compatible with // NOTE: remain_w is not used to gen hash of midout for compatible with
// shape runtime // shape runtime
@@ -288,7 +287,6 @@ ConvBiasImpl::AlgoDotS8DirectNCHWNCHW44::dispatch_kerns(
megdnn_assert(do_conv_fun); megdnn_assert(do_conv_fun);


SmallVector<ConvBiasImpl::NCBKern> ret_kerns; SmallVector<ConvBiasImpl::NCBKern> ret_kerns;
WorkspaceBundle bundle = wbundle;
int oh = param.osz[0]; int oh = param.osz[0];
int ic = param.filter_meta.icpg; int ic = param.filter_meta.icpg;
int iw = param.isz[1]; int iw = param.isz[1];
@@ -302,14 +300,16 @@ ConvBiasImpl::AlgoDotS8DirectNCHWNCHW44::dispatch_kerns(
static_cast<size_t>(div_ceil(oh, oh_block))}; static_cast<size_t>(div_ceil(oh, oh_block))};


auto do_trans_weight = [bundle](const NCBKernParam& kern_param, auto do_trans_weight = [bundle](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
const NCBKernIndex& ncb_index) mutable {
bundle.set(kern_param.workspace_ptr);
do_weight_trans(bundle, kern_param, ncb_index, ncb_index.ndrange_id); do_weight_trans(bundle, kern_param, ncb_index, ncb_index.ndrange_id);
}; };
ret_kerns.push_back({do_trans_weight, {1}}); ret_kerns.push_back({do_trans_weight, {1}});


auto do_conv = [bundle, do_conv_fun, ncb_range]( auto do_conv = [bundle, do_conv_fun, ncb_range](
const NCBKernParam& kern_param, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
const NCBKernIndex& ncb_index) mutable {
bundle.set(kern_param.workspace_ptr);
do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id, do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id,
ncb_range); ncb_range);
}; };


+ 13
- 12
dnn/src/arm_common/conv_bias/int8/stride1.cpp View File

@@ -107,7 +107,8 @@ WorkspaceBundle direct_int8_stride1::get_bundle(
} }
//! Process one input channel copy padding //! Process one input channel copy padding
void direct_int8_stride1::copy_padding_kern( void direct_int8_stride1::copy_padding_kern(
WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param,
const WorkspaceBundle& bundle,
const ConvBiasImpl::NCBKernParam& kern_param,
const ConvBiasImpl::NCBKernIndex& ncb_index, const ConvBiasImpl::NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids) { const CpuNDRange& workspace_ids) {
size_t IH = kern_param.isz[0]; size_t IH = kern_param.isz[0];
@@ -121,7 +122,6 @@ void direct_int8_stride1::copy_padding_kern(
get_rectified_size(kern_param, IH2, IW2, OH2, OW2); get_rectified_size(kern_param, IH2, IW2, OH2, OW2);
bool need_src_copy_var = need_src_copy(kern_param); bool need_src_copy_var = need_src_copy(kern_param);
size_t padding_group_size = IH2 * IW2 * IC; size_t padding_group_size = IH2 * IW2 * IC;
bundle.set(kern_param.workspace_ptr);
//! Used for get the workspace offset //! Used for get the workspace offset
size_t workspace_group_id = workspace_ids[0], size_t workspace_group_id = workspace_ids[0],
workspace_batch_id = workspace_ids[1], channel_id = workspace_ids[2], workspace_batch_id = workspace_ids[1], channel_id = workspace_ids[2],
@@ -145,7 +145,7 @@ void direct_int8_stride1::copy_padding_kern(
}; };
//! compute one output channel //! compute one output channel
template <size_t filter, BiasMode bias_mode, typename Op> template <size_t filter, BiasMode bias_mode, typename Op>
void direct_int8_stride1::do_conv_kern(WorkspaceBundle bundle,
void direct_int8_stride1::do_conv_kern(const WorkspaceBundle& bundle,
const NCBKernParam& kern_param, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index, const NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids) { const CpuNDRange& workspace_ids) {
@@ -170,7 +170,6 @@ void direct_int8_stride1::do_conv_kern(WorkspaceBundle bundle,
op = Op(scale_bias, scale_dst); op = Op(scale_bias, scale_dst);
} }
size_t padding_group_size = IH2 * IW2 * IC; size_t padding_group_size = IH2 * IW2 * IC;
bundle.set(kern_param.workspace_ptr);
//! Used for get the workspace offset //! Used for get the workspace offset
size_t workspace_group_id = workspace_ids[0], size_t workspace_group_id = workspace_ids[0],
workspace_batch_id = workspace_ids[1], oc = workspace_ids[2]; workspace_batch_id = workspace_ids[1], oc = workspace_ids[2];
@@ -263,7 +262,7 @@ SmallVector<ConvBiasImpl::NCBKern> direct_int8_stride1::get_kimpls(
size_t IC = param.filter_meta.icpg; size_t IC = param.filter_meta.icpg;
size_t OC = param.filter_meta.ocpg; size_t OC = param.filter_meta.ocpg;
size_t group = fm.group; size_t group = fm.group;
WorkspaceBundle wbundle = get_bundle(param, m_large_group);
WorkspaceBundle bundle = get_bundle(param, m_large_group);
conv_fun do_conv_fun = nullptr; conv_fun do_conv_fun = nullptr;


#define DO_CONV_KERN_FUN(filter, bias_mode, op) \ #define DO_CONV_KERN_FUN(filter, bias_mode, op) \
@@ -324,13 +323,13 @@ SmallVector<ConvBiasImpl::NCBKern> direct_int8_stride1::get_kimpls(


SmallVector<ConvBiasImpl::NCBKern> ret_kerns; SmallVector<ConvBiasImpl::NCBKern> ret_kerns;
if (m_large_group) { if (m_large_group) {
auto exec_one_group = [wbundle, do_conv_fun](
auto exec_one_group = [bundle, do_conv_fun](
const NCBKernParam& kern_param, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
const NCBKernIndex& ncb_index) mutable {
auto fm = kern_param.filter_meta; auto fm = kern_param.filter_meta;
size_t IC = fm.icpg; size_t IC = fm.icpg;
size_t OC = fm.ocpg; size_t OC = fm.ocpg;
WorkspaceBundle bundle = wbundle;
bundle.set(kern_param.workspace_ptr);
for (size_t ic = 0; ic < IC; ic++) { for (size_t ic = 0; ic < IC; ic++) {
copy_padding_kern(bundle, kern_param, ncb_index, copy_padding_kern(bundle, kern_param, ncb_index,
{ncb_index.thread_id, 0, ic}); {ncb_index.thread_id, 0, ic});
@@ -342,15 +341,17 @@ SmallVector<ConvBiasImpl::NCBKern> direct_int8_stride1::get_kimpls(
}; };
ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); ret_kerns.push_back({exec_one_group, {group, N, 1_z}});
} else { } else {
WorkspaceBundle bundle = wbundle;
auto copy_padding = [bundle](const NCBKernParam& kern_param, auto copy_padding = [bundle](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
const NCBKernIndex& ncb_index) mutable {
bundle.set(kern_param.workspace_ptr);
copy_padding_kern(bundle, kern_param, ncb_index, copy_padding_kern(bundle, kern_param, ncb_index,
ncb_index.ndrange_id); ncb_index.ndrange_id);
}; };
ret_kerns.push_back({copy_padding, {group, N, IC}}); ret_kerns.push_back({copy_padding, {group, N, IC}});
auto do_conv = [bundle, do_conv_fun](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
auto do_conv = [bundle, do_conv_fun](
const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) mutable {
bundle.set(kern_param.workspace_ptr);
do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id); do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id);
}; };
ret_kerns.push_back({do_conv, {group, N, OC}}); ret_kerns.push_back({do_conv, {group, N, OC}});


+ 3
- 3
dnn/src/arm_common/conv_bias/int8/stride1.h View File

@@ -21,19 +21,19 @@ using NCBKernParam = fallback::ConvBiasImpl::NCBKernParam;
using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex; using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex;


using conv_fun = std::function<void( using conv_fun = std::function<void(
WorkspaceBundle bundle, const NCBKernParam& kern_param,
const WorkspaceBundle& bundle, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids)>; const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids)>;


bool can_conv_direct_stride1_int8(const NCBKernSizeParam& param); bool can_conv_direct_stride1_int8(const NCBKernSizeParam& param);


WorkspaceBundle get_bundle(const NCBKernSizeParam& param, bool m_large_group); WorkspaceBundle get_bundle(const NCBKernSizeParam& param, bool m_large_group);


void copy_padding_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param,
void copy_padding_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index, const NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids); const CpuNDRange& workspace_ids);


template <size_t filter, BiasMode bias_mode, typename Op> template <size_t filter, BiasMode bias_mode, typename Op>
void do_conv_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param,
void do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index, const NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids); const CpuNDRange& workspace_ids);




+ 13
- 12
dnn/src/arm_common/conv_bias/int8/stride1_dotprod.cpp View File

@@ -109,7 +109,8 @@ WorkspaceBundle direct_dotprod_int8_stride1::get_bundle(
} }
//! Process one input channel copy padding //! Process one input channel copy padding
void direct_dotprod_int8_stride1::copy_padding_kern( void direct_dotprod_int8_stride1::copy_padding_kern(
WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param,
const WorkspaceBundle& bundle,
const ConvBiasImpl::NCBKernParam& kern_param,
const ConvBiasImpl::NCBKernIndex& ncb_index, const ConvBiasImpl::NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids) { const CpuNDRange& workspace_ids) {
size_t IH = kern_param.isz[0]; size_t IH = kern_param.isz[0];
@@ -123,7 +124,6 @@ void direct_dotprod_int8_stride1::copy_padding_kern(
get_rectified_size(kern_param, IH2, IW2, OH2, OW2); get_rectified_size(kern_param, IH2, IW2, OH2, OW2);
bool need_src_copy_var = need_src_copy(kern_param); bool need_src_copy_var = need_src_copy(kern_param);
size_t padding_group_size = IH2 * IW2 * IC; size_t padding_group_size = IH2 * IW2 * IC;
bundle.set(kern_param.workspace_ptr);


//! Used for get the workspace offset //! Used for get the workspace offset
size_t group_id = ncb_index.ndrange_id[0], size_t group_id = ncb_index.ndrange_id[0],
@@ -148,7 +148,7 @@ void direct_dotprod_int8_stride1::copy_padding_kern(
//! compute one output channel //! compute one output channel
template <size_t filter, BiasMode bias_mode, typename Op> template <size_t filter, BiasMode bias_mode, typename Op>
void direct_dotprod_int8_stride1::do_conv_kern( void direct_dotprod_int8_stride1::do_conv_kern(
WorkspaceBundle bundle, const NCBKernParam& kern_param,
const WorkspaceBundle& bundle, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) {
size_t OH = kern_param.osz[0]; size_t OH = kern_param.osz[0];
size_t OW = kern_param.osz[1]; size_t OW = kern_param.osz[1];
@@ -172,7 +172,6 @@ void direct_dotprod_int8_stride1::do_conv_kern(
} }
size_t padding_group_size = IH2 * IW2 * IC; size_t padding_group_size = IH2 * IW2 * IC;


bundle.set(kern_param.workspace_ptr);
//! Used for get the workspace offset //! Used for get the workspace offset
size_t workspace_group_id = workspace_ids[0], size_t workspace_group_id = workspace_ids[0],
workspace_batch_id = workspace_ids[1], oc = workspace_ids[2]; workspace_batch_id = workspace_ids[1], oc = workspace_ids[2];
@@ -264,7 +263,7 @@ SmallVector<ConvBiasImpl::NCBKern> direct_dotprod_int8_stride1::get_kimpls(
size_t IC = param.filter_meta.icpg; size_t IC = param.filter_meta.icpg;
size_t OC = param.filter_meta.ocpg; size_t OC = param.filter_meta.ocpg;
size_t group = fm.group; size_t group = fm.group;
WorkspaceBundle wbundle = get_bundle(param, m_large_group);
WorkspaceBundle bundle = get_bundle(param, m_large_group);
conv_fun do_conv_fun = nullptr; conv_fun do_conv_fun = nullptr;


#define DO_CONV_KERN_FUN(filter, bias_mode, op) \ #define DO_CONV_KERN_FUN(filter, bias_mode, op) \
@@ -325,13 +324,13 @@ SmallVector<ConvBiasImpl::NCBKern> direct_dotprod_int8_stride1::get_kimpls(


SmallVector<ConvBiasImpl::NCBKern> ret_kerns; SmallVector<ConvBiasImpl::NCBKern> ret_kerns;
if (m_large_group) { if (m_large_group) {
auto exec_one_group = [wbundle, do_conv_fun](
auto exec_one_group = [bundle, do_conv_fun](
const NCBKernParam& kern_param, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
const NCBKernIndex& ncb_index) mutable {
auto fm = kern_param.filter_meta; auto fm = kern_param.filter_meta;
size_t IC = fm.icpg; size_t IC = fm.icpg;
size_t OC = fm.ocpg; size_t OC = fm.ocpg;
WorkspaceBundle bundle = wbundle;
bundle.set(kern_param.workspace_ptr);
for (size_t ic = 0; ic < IC; ic++) { for (size_t ic = 0; ic < IC; ic++) {
copy_padding_kern(bundle, kern_param, ncb_index, copy_padding_kern(bundle, kern_param, ncb_index,
{ncb_index.thread_id, 0, ic}); {ncb_index.thread_id, 0, ic});
@@ -343,15 +342,17 @@ SmallVector<ConvBiasImpl::NCBKern> direct_dotprod_int8_stride1::get_kimpls(
}; };
ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); ret_kerns.push_back({exec_one_group, {group, N, 1_z}});
} else { } else {
WorkspaceBundle bundle = wbundle;
auto copy_padding = [bundle](const NCBKernParam& kern_param, auto copy_padding = [bundle](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
const NCBKernIndex& ncb_index) mutable {
bundle.set(kern_param.workspace_ptr);
copy_padding_kern(bundle, kern_param, ncb_index, copy_padding_kern(bundle, kern_param, ncb_index,
ncb_index.ndrange_id); ncb_index.ndrange_id);
}; };
ret_kerns.push_back({copy_padding, {group, N, IC}}); ret_kerns.push_back({copy_padding, {group, N, IC}});
auto do_conv = [bundle, do_conv_fun](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
auto do_conv = [bundle, do_conv_fun](
const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) mutable {
bundle.set(kern_param.workspace_ptr);
do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id); do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id);
}; };
ret_kerns.push_back({do_conv, {group, N, OC}}); ret_kerns.push_back({do_conv, {group, N, OC}});


+ 4
- 3
dnn/src/arm_common/conv_bias/int8/stride1_dotprod.h View File

@@ -20,19 +20,20 @@ using NCBKernParam = fallback::ConvBiasImpl::NCBKernParam;
using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex; using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex;


using conv_fun = std::function<void( using conv_fun = std::function<void(
WorkspaceBundle bundle, const NCBKernParam& kern_param,
const WorkspaceBundle& bundle, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids)>; const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids)>;


bool can_conv_direct_stride1_int8(const NCBKernSizeParam& param); bool can_conv_direct_stride1_int8(const NCBKernSizeParam& param);


WorkspaceBundle get_bundle(const NCBKernSizeParam& param, bool m_large_group); WorkspaceBundle get_bundle(const NCBKernSizeParam& param, bool m_large_group);


void copy_padding_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param,
void copy_padding_kern(const WorkspaceBundle& bundle,
const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index, const NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids); const CpuNDRange& workspace_ids);


template <size_t filter, BiasMode bias_mode, typename Op> template <size_t filter, BiasMode bias_mode, typename Op>
void do_conv_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param,
void do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index, const NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids); const CpuNDRange& workspace_ids);




+ 13
- 12
dnn/src/arm_common/conv_bias/int8/stride2.cpp View File

@@ -115,7 +115,8 @@ WorkspaceBundle direct_int8_stride2::get_bundle(
} }
//! Process one input channel copy padding //! Process one input channel copy padding
void direct_int8_stride2::copy_padding_kern( void direct_int8_stride2::copy_padding_kern(
WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param,
const WorkspaceBundle& bundle,
const ConvBiasImpl::NCBKernParam& kern_param,
const ConvBiasImpl::NCBKernIndex& ncb_index, const ConvBiasImpl::NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids) { const CpuNDRange& workspace_ids) {
size_t IH = kern_param.isz[0]; size_t IH = kern_param.isz[0];
@@ -129,7 +130,6 @@ void direct_int8_stride2::copy_padding_kern(
get_rectified_size(kern_param, IH2, IW2, OH2, OW2); get_rectified_size(kern_param, IH2, IW2, OH2, OW2);
bool need_src_copy_var = need_src_copy(kern_param); bool need_src_copy_var = need_src_copy(kern_param);
size_t padding_group_size = IH2 * IW2 * IC; size_t padding_group_size = IH2 * IW2 * IC;
bundle.set(kern_param.workspace_ptr);


//! Used for get the workspace offset //! Used for get the workspace offset
size_t group_id = ncb_index.ndrange_id[0], size_t group_id = ncb_index.ndrange_id[0],
@@ -153,7 +153,7 @@ void direct_int8_stride2::copy_padding_kern(
}; };
//! compute one output channel //! compute one output channel
template <size_t filter, BiasMode bias_mode, typename Op> template <size_t filter, BiasMode bias_mode, typename Op>
void direct_int8_stride2::do_conv_kern(WorkspaceBundle bundle,
void direct_int8_stride2::do_conv_kern(const WorkspaceBundle& bundle,
const NCBKernParam& kern_param, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index, const NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids) { const CpuNDRange& workspace_ids) {
@@ -178,7 +178,6 @@ void direct_int8_stride2::do_conv_kern(WorkspaceBundle bundle,
op = Op(scale_bias, scale_dst); op = Op(scale_bias, scale_dst);
} }
size_t padding_group_size = IH2 * IW2 * IC; size_t padding_group_size = IH2 * IW2 * IC;
bundle.set(kern_param.workspace_ptr);
//! Used for get the workspace offset //! Used for get the workspace offset
size_t workspace_group_id = workspace_ids[0], size_t workspace_group_id = workspace_ids[0],
workspace_batch_id = workspace_ids[1], oc = workspace_ids[2]; workspace_batch_id = workspace_ids[1], oc = workspace_ids[2];
@@ -270,7 +269,7 @@ SmallVector<ConvBiasImpl::NCBKern> direct_int8_stride2::get_kimpls(
size_t IC = param.filter_meta.icpg; size_t IC = param.filter_meta.icpg;
size_t OC = param.filter_meta.ocpg; size_t OC = param.filter_meta.ocpg;
size_t group = fm.group; size_t group = fm.group;
WorkspaceBundle wbundle = get_bundle(param, m_large_group);
WorkspaceBundle bundle = get_bundle(param, m_large_group);
conv_fun do_conv_fun = nullptr; conv_fun do_conv_fun = nullptr;


#define DO_CONV_KERN_FUN(filter, bias_mode, op) \ #define DO_CONV_KERN_FUN(filter, bias_mode, op) \
@@ -331,13 +330,13 @@ SmallVector<ConvBiasImpl::NCBKern> direct_int8_stride2::get_kimpls(


SmallVector<ConvBiasImpl::NCBKern> ret_kerns; SmallVector<ConvBiasImpl::NCBKern> ret_kerns;
if (m_large_group) { if (m_large_group) {
auto exec_one_group = [wbundle, do_conv_fun](
auto exec_one_group = [bundle, do_conv_fun](
const NCBKernParam& kern_param, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
const NCBKernIndex& ncb_index) mutable {
auto fm = kern_param.filter_meta; auto fm = kern_param.filter_meta;
size_t IC = fm.icpg; size_t IC = fm.icpg;
size_t OC = fm.ocpg; size_t OC = fm.ocpg;
WorkspaceBundle bundle = wbundle;
bundle.set(kern_param.workspace_ptr);
for (size_t ic = 0; ic < IC; ic++) { for (size_t ic = 0; ic < IC; ic++) {
copy_padding_kern(bundle, kern_param, ncb_index, copy_padding_kern(bundle, kern_param, ncb_index,
{ncb_index.thread_id, 0, ic}); {ncb_index.thread_id, 0, ic});
@@ -349,15 +348,17 @@ SmallVector<ConvBiasImpl::NCBKern> direct_int8_stride2::get_kimpls(
}; };
ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); ret_kerns.push_back({exec_one_group, {group, N, 1_z}});
} else { } else {
WorkspaceBundle bundle = wbundle;
auto copy_padding = [bundle](const NCBKernParam& kern_param, auto copy_padding = [bundle](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
const NCBKernIndex& ncb_index) mutable {
bundle.set(kern_param.workspace_ptr);
copy_padding_kern(bundle, kern_param, ncb_index, copy_padding_kern(bundle, kern_param, ncb_index,
ncb_index.ndrange_id); ncb_index.ndrange_id);
}; };
ret_kerns.push_back({copy_padding, {group, N, IC}}); ret_kerns.push_back({copy_padding, {group, N, IC}});
auto do_conv = [bundle, do_conv_fun](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
auto do_conv = [bundle, do_conv_fun](
const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) mutable {
bundle.set(kern_param.workspace_ptr);
do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id); do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id);
}; };
ret_kerns.push_back({do_conv, {group, N, OC}}); ret_kerns.push_back({do_conv, {group, N, OC}});


+ 4
- 3
dnn/src/arm_common/conv_bias/int8/stride2.h View File

@@ -21,18 +21,19 @@ using NCBKernParam = fallback::ConvBiasImpl::NCBKernParam;
using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex; using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex;


using conv_fun = std::function<void( using conv_fun = std::function<void(
WorkspaceBundle bundle, const NCBKernParam& kern_param,
const WorkspaceBundle& bundle, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids)>; const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids)>;
bool can_conv_direct_stride2_int8(const NCBKernSizeParam& param); bool can_conv_direct_stride2_int8(const NCBKernSizeParam& param);


WorkspaceBundle get_bundle(const NCBKernSizeParam& param, bool m_large_group); WorkspaceBundle get_bundle(const NCBKernSizeParam& param, bool m_large_group);


void copy_padding_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param,
void copy_padding_kern(const WorkspaceBundle& bundle,
const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index, const NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids); const CpuNDRange& workspace_ids);


template <size_t filter, BiasMode bias_mode, typename Op> template <size_t filter, BiasMode bias_mode, typename Op>
void do_conv_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param,
void do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index, const NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids); const CpuNDRange& workspace_ids);




+ 13
- 12
dnn/src/arm_common/conv_bias/int8/stride2_dotprod.cpp View File

@@ -116,7 +116,8 @@ WorkspaceBundle direct_dotprod_int8_stride2::get_bundle(
} }
//! Process one input channel copy padding //! Process one input channel copy padding
void direct_dotprod_int8_stride2::copy_padding_kern( void direct_dotprod_int8_stride2::copy_padding_kern(
WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param,
const WorkspaceBundle& bundle,
const ConvBiasImpl::NCBKernParam& kern_param,
const ConvBiasImpl::NCBKernIndex& ncb_index, const ConvBiasImpl::NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids) { const CpuNDRange& workspace_ids) {
size_t IH = kern_param.isz[0]; size_t IH = kern_param.isz[0];
@@ -130,7 +131,6 @@ void direct_dotprod_int8_stride2::copy_padding_kern(
get_rectified_size(kern_param, IH2, IW2, OH2, OW2); get_rectified_size(kern_param, IH2, IW2, OH2, OW2);
bool need_src_copy_var = need_src_copy(kern_param); bool need_src_copy_var = need_src_copy(kern_param);
size_t padding_group_size = IH2 * IW2 * IC; size_t padding_group_size = IH2 * IW2 * IC;
bundle.set(kern_param.workspace_ptr);


//! Used for get the workspace offset //! Used for get the workspace offset
size_t workspace_group_id = workspace_ids[0], size_t workspace_group_id = workspace_ids[0],
@@ -154,7 +154,7 @@ void direct_dotprod_int8_stride2::copy_padding_kern(
//! compute one output channel //! compute one output channel
template <size_t filter, BiasMode bias_mode, typename Op> template <size_t filter, BiasMode bias_mode, typename Op>
void direct_dotprod_int8_stride2::do_conv_kern( void direct_dotprod_int8_stride2::do_conv_kern(
WorkspaceBundle bundle, const NCBKernParam& kern_param,
const WorkspaceBundle& bundle, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) {
size_t OH = kern_param.osz[0]; size_t OH = kern_param.osz[0];
size_t OW = kern_param.osz[1]; size_t OW = kern_param.osz[1];
@@ -177,7 +177,6 @@ void direct_dotprod_int8_stride2::do_conv_kern(
op = Op(scale_bias, scale_dst); op = Op(scale_bias, scale_dst);
} }
size_t padding_group_size = IH2 * IW2 * IC; size_t padding_group_size = IH2 * IW2 * IC;
bundle.set(kern_param.workspace_ptr);
//! Used for get the workspace offset //! Used for get the workspace offset
size_t workspace_group_id = workspace_ids[0], size_t workspace_group_id = workspace_ids[0],
workspace_batch_id = workspace_ids[1], oc = workspace_ids[2]; workspace_batch_id = workspace_ids[1], oc = workspace_ids[2];
@@ -270,7 +269,7 @@ SmallVector<ConvBiasImpl::NCBKern> direct_dotprod_int8_stride2::get_kimpls(
size_t IC = param.filter_meta.icpg; size_t IC = param.filter_meta.icpg;
size_t OC = param.filter_meta.ocpg; size_t OC = param.filter_meta.ocpg;
size_t group = fm.group; size_t group = fm.group;
WorkspaceBundle wbundle = get_bundle(param, m_large_group);
WorkspaceBundle bundle = get_bundle(param, m_large_group);
conv_fun do_conv_fun = nullptr; conv_fun do_conv_fun = nullptr;


#define DO_CONV_KERN_FUN(filter, bias_mode, op) \ #define DO_CONV_KERN_FUN(filter, bias_mode, op) \
@@ -331,13 +330,13 @@ SmallVector<ConvBiasImpl::NCBKern> direct_dotprod_int8_stride2::get_kimpls(


SmallVector<ConvBiasImpl::NCBKern> ret_kerns; SmallVector<ConvBiasImpl::NCBKern> ret_kerns;
if (m_large_group) { if (m_large_group) {
auto exec_one_group = [wbundle, do_conv_fun](
auto exec_one_group = [bundle, do_conv_fun](
const NCBKernParam& kern_param, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
const NCBKernIndex& ncb_index) mutable {
auto fm = kern_param.filter_meta; auto fm = kern_param.filter_meta;
size_t IC = fm.icpg; size_t IC = fm.icpg;
size_t OC = fm.ocpg; size_t OC = fm.ocpg;
WorkspaceBundle bundle = wbundle;
bundle.set(kern_param.workspace_ptr);
for (size_t ic = 0; ic < IC; ic++) { for (size_t ic = 0; ic < IC; ic++) {
copy_padding_kern(bundle, kern_param, ncb_index, copy_padding_kern(bundle, kern_param, ncb_index,
{ncb_index.thread_id, 0, ic}); {ncb_index.thread_id, 0, ic});
@@ -349,15 +348,17 @@ SmallVector<ConvBiasImpl::NCBKern> direct_dotprod_int8_stride2::get_kimpls(
}; };
ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); ret_kerns.push_back({exec_one_group, {group, N, 1_z}});
} else { } else {
WorkspaceBundle bundle = wbundle;
auto copy_padding = [bundle](const NCBKernParam& kern_param, auto copy_padding = [bundle](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
const NCBKernIndex& ncb_index) mutable {
bundle.set(kern_param.workspace_ptr);
copy_padding_kern(bundle, kern_param, ncb_index, copy_padding_kern(bundle, kern_param, ncb_index,
ncb_index.ndrange_id); ncb_index.ndrange_id);
}; };
ret_kerns.push_back({copy_padding, {group, N, IC}}); ret_kerns.push_back({copy_padding, {group, N, IC}});
auto do_conv = [bundle, do_conv_fun](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
auto do_conv = [bundle, do_conv_fun](
const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) mutable {
bundle.set(kern_param.workspace_ptr);
do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id); do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id);
}; };
ret_kerns.push_back({do_conv, {group, N, OC}}); ret_kerns.push_back({do_conv, {group, N, OC}});


+ 4
- 3
dnn/src/arm_common/conv_bias/int8/stride2_dotprod.h View File

@@ -21,19 +21,20 @@ using NCBKernParam = fallback::ConvBiasImpl::NCBKernParam;
using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex; using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex;


using conv_fun = std::function<void( using conv_fun = std::function<void(
WorkspaceBundle bundle, const NCBKernParam& kern_param,
const WorkspaceBundle& bundle, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids)>; const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids)>;


bool can_conv_direct_stride2_int8(const NCBKernSizeParam& param); bool can_conv_direct_stride2_int8(const NCBKernSizeParam& param);


WorkspaceBundle get_bundle(const NCBKernSizeParam& param, bool m_large_group); WorkspaceBundle get_bundle(const NCBKernSizeParam& param, bool m_large_group);


void copy_padding_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param,
void copy_padding_kern(const WorkspaceBundle& bundle,
const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index, const NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids); const CpuNDRange& workspace_ids);


template <size_t filter, BiasMode bias_mode, typename Op> template <size_t filter, BiasMode bias_mode, typename Op>
void do_conv_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param,
void do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index, const NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids); const CpuNDRange& workspace_ids);




+ 22
- 22
dnn/src/arm_common/conv_bias/int8x8x16/algos.cpp View File

@@ -139,7 +139,8 @@ size_t ConvBiasImpl::AlgoI8x8x16Direct::get_workspace(
} }
//! Process one input channel copy padding //! Process one input channel copy padding
void ConvBiasImpl::AlgoI8x8x16Direct::copy_padding_kern( void ConvBiasImpl::AlgoI8x8x16Direct::copy_padding_kern(
WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param,
const WorkspaceBundle& bundle,
const ConvBiasImpl::NCBKernParam& kern_param,
const ConvBiasImpl::NCBKernIndex& ncb_index, const ConvBiasImpl::NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids) { const CpuNDRange& workspace_ids) {
size_t IH = kern_param.isz[0]; size_t IH = kern_param.isz[0];
@@ -154,7 +155,6 @@ void ConvBiasImpl::AlgoI8x8x16Direct::copy_padding_kern(
get_rectified_size_str1(IH, IW, OH, OW, PH, PW, IH2, IW2, OH2, OW2); get_rectified_size_str1(IH, IW, OH, OW, PH, PW, IH2, IW2, OH2, OW2);
bool need_src_copy_var = need_src_copy_str1(kern_param); bool need_src_copy_var = need_src_copy_str1(kern_param);
size_t padding_group_size = IH2 * IW2 * IC; size_t padding_group_size = IH2 * IW2 * IC;
bundle.set(kern_param.workspace_ptr);


//! Used for get the workspace offset //! Used for get the workspace offset
size_t workspace_group_id = workspace_ids[0], size_t workspace_group_id = workspace_ids[0],
@@ -178,7 +178,7 @@ void ConvBiasImpl::AlgoI8x8x16Direct::copy_padding_kern(
}; };
//! compute one output channel //! compute one output channel
void ConvBiasImpl::AlgoI8x8x16Direct::do_conv_kern( void ConvBiasImpl::AlgoI8x8x16Direct::do_conv_kern(
WorkspaceBundle bundle, const NCBKernParam& kern_param,
const WorkspaceBundle& bundle, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) {
size_t OH = kern_param.osz[0]; size_t OH = kern_param.osz[0];
size_t OW = kern_param.osz[1]; size_t OW = kern_param.osz[1];
@@ -214,7 +214,6 @@ void ConvBiasImpl::AlgoI8x8x16Direct::do_conv_kern(
fun_add_to_dst = conv_bias::conv_direct_5x5_sc_int8_int8_int16<true>; fun_add_to_dst = conv_bias::conv_direct_5x5_sc_int8_int8_int16<true>;
} }


bundle.set(kern_param.workspace_ptr);
//! Used for get the workspace offset //! Used for get the workspace offset
size_t workspace_group_id = workspace_ids[0], size_t workspace_group_id = workspace_ids[0],
workspace_batch_id = workspace_ids[1], oc = workspace_ids[2]; workspace_batch_id = workspace_ids[1], oc = workspace_ids[2];
@@ -256,15 +255,15 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoI8x8x16Direct::get_kimpls(
size_t IC = param.filter_meta.icpg; size_t IC = param.filter_meta.icpg;
size_t OC = param.filter_meta.ocpg; size_t OC = param.filter_meta.ocpg;
size_t group = fm.group; size_t group = fm.group;
WorkspaceBundle wbundle = get_bundle(param);
WorkspaceBundle bundle = get_bundle(param);
SmallVector<NCBKern> ret_kerns; SmallVector<NCBKern> ret_kerns;
if (m_large_group) { if (m_large_group) {
auto exec_one_group = [wbundle](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
auto exec_one_group = [bundle](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) mutable {
auto fm = kern_param.filter_meta; auto fm = kern_param.filter_meta;
size_t IC = fm.icpg; size_t IC = fm.icpg;
size_t OC = fm.ocpg; size_t OC = fm.ocpg;
WorkspaceBundle bundle = wbundle;
bundle.set(kern_param.workspace_ptr);
for (size_t ic = 0; ic < IC; ic++) { for (size_t ic = 0; ic < IC; ic++) {
copy_padding_kern(bundle, kern_param, ncb_index, copy_padding_kern(bundle, kern_param, ncb_index,
{ncb_index.thread_id, 0, ic}); {ncb_index.thread_id, 0, ic});
@@ -276,15 +275,16 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoI8x8x16Direct::get_kimpls(
}; };
ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); ret_kerns.push_back({exec_one_group, {group, N, 1_z}});
} else { } else {
WorkspaceBundle bundle = wbundle;
auto copy_padding = [bundle](const NCBKernParam& kern_param, auto copy_padding = [bundle](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
const NCBKernIndex& ncb_index) mutable {
bundle.set(kern_param.workspace_ptr);
copy_padding_kern(bundle, kern_param, ncb_index, copy_padding_kern(bundle, kern_param, ncb_index,
ncb_index.ndrange_id); ncb_index.ndrange_id);
}; };
ret_kerns.push_back({copy_padding, {group, N, IC}}); ret_kerns.push_back({copy_padding, {group, N, IC}});
auto do_conv = [bundle](const NCBKernParam& kern_param, auto do_conv = [bundle](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
const NCBKernIndex& ncb_index) mutable {
bundle.set(kern_param.workspace_ptr);
do_conv_kern(bundle, kern_param, ncb_index, ncb_index.ndrange_id); do_conv_kern(bundle, kern_param, ncb_index, ncb_index.ndrange_id);
}; };
ret_kerns.push_back({do_conv, {group, N, OC}}); ret_kerns.push_back({do_conv, {group, N, OC}});
@@ -360,7 +360,8 @@ size_t ConvBiasImpl::AlgoI8x8x16Stride2::get_workspace(
} }
//! Process one input channel copy padding //! Process one input channel copy padding
void ConvBiasImpl::AlgoI8x8x16Stride2::copy_padding_kern( void ConvBiasImpl::AlgoI8x8x16Stride2::copy_padding_kern(
WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param,
const WorkspaceBundle& bundle,
const ConvBiasImpl::NCBKernParam& kern_param,
const ConvBiasImpl::NCBKernIndex& ncb_index, const ConvBiasImpl::NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids) { const CpuNDRange& workspace_ids) {
size_t IH = kern_param.isz[0]; size_t IH = kern_param.isz[0];
@@ -378,7 +379,6 @@ void ConvBiasImpl::AlgoI8x8x16Stride2::copy_padding_kern(
bool need_src_copy_var = need_src_copy_str2(kern_param); bool need_src_copy_var = need_src_copy_str2(kern_param);
size_t padding_group_size = IH2 * IW2 * IC; size_t padding_group_size = IH2 * IW2 * IC;


bundle.set(kern_param.workspace_ptr);
size_t workspace_group_id = workspace_ids[0], size_t workspace_group_id = workspace_ids[0],
workspace_batch_id = workspace_ids[1], workspace_batch_id = workspace_ids[1],
channel_id = workspace_ids[2]; channel_id = workspace_ids[2];
@@ -400,7 +400,7 @@ void ConvBiasImpl::AlgoI8x8x16Stride2::copy_padding_kern(
}; };
//! compute one output channel //! compute one output channel
void ConvBiasImpl::AlgoI8x8x16Stride2::do_conv_kern( void ConvBiasImpl::AlgoI8x8x16Stride2::do_conv_kern(
WorkspaceBundle bundle, const NCBKernParam& kern_param,
const WorkspaceBundle& bundle, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) {
size_t OH = kern_param.osz[0]; size_t OH = kern_param.osz[0];
size_t OW = kern_param.osz[1]; size_t OW = kern_param.osz[1];
@@ -436,7 +436,6 @@ void ConvBiasImpl::AlgoI8x8x16Stride2::do_conv_kern(
fun_add_to_dst = conv_bias::conv_stride2_5x5_sc_int8_int8_int16<true>; fun_add_to_dst = conv_bias::conv_stride2_5x5_sc_int8_int8_int16<true>;
} }


bundle.set(kern_param.workspace_ptr);
//! Used for get the workspace offset //! Used for get the workspace offset
size_t workspace_group_id = workspace_ids[0], size_t workspace_group_id = workspace_ids[0],
workspace_batch_id = workspace_ids[1], oc = workspace_ids[2]; workspace_batch_id = workspace_ids[1], oc = workspace_ids[2];
@@ -476,15 +475,15 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoI8x8x16Stride2::get_kimpls(
size_t IC = param.filter_meta.icpg; size_t IC = param.filter_meta.icpg;
size_t OC = param.filter_meta.ocpg; size_t OC = param.filter_meta.ocpg;
size_t group = fm.group; size_t group = fm.group;
WorkspaceBundle wbundle = get_bundle(param);
WorkspaceBundle bundle = get_bundle(param);
SmallVector<NCBKern> ret_kerns; SmallVector<NCBKern> ret_kerns;
if (m_large_group) { if (m_large_group) {
auto exec_one_group = [wbundle](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
auto exec_one_group = [bundle](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) mutable {
auto fm = kern_param.filter_meta; auto fm = kern_param.filter_meta;
size_t IC = fm.icpg; size_t IC = fm.icpg;
size_t OC = fm.ocpg; size_t OC = fm.ocpg;
WorkspaceBundle bundle = wbundle;
bundle.set(kern_param.workspace_ptr);
for (size_t ic = 0; ic < IC; ic++) { for (size_t ic = 0; ic < IC; ic++) {
copy_padding_kern(bundle, kern_param, ncb_index, copy_padding_kern(bundle, kern_param, ncb_index,
{ncb_index.thread_id, 0, ic}); {ncb_index.thread_id, 0, ic});
@@ -496,15 +495,16 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoI8x8x16Stride2::get_kimpls(
}; };
ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); ret_kerns.push_back({exec_one_group, {group, N, 1_z}});
} else { } else {
WorkspaceBundle bundle = wbundle;
auto copy_padding = [bundle](const NCBKernParam& kern_param, auto copy_padding = [bundle](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
const NCBKernIndex& ncb_index) mutable {
bundle.set(kern_param.workspace_ptr);
copy_padding_kern(bundle, kern_param, ncb_index, copy_padding_kern(bundle, kern_param, ncb_index,
ncb_index.ndrange_id); ncb_index.ndrange_id);
}; };
ret_kerns.push_back({copy_padding, {group, N, IC}}); ret_kerns.push_back({copy_padding, {group, N, IC}});
auto do_conv = [bundle](const NCBKernParam& kern_param, auto do_conv = [bundle](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
const NCBKernIndex& ncb_index) mutable {
bundle.set(kern_param.workspace_ptr);
do_conv_kern(bundle, kern_param, ncb_index, ncb_index.ndrange_id); do_conv_kern(bundle, kern_param, ncb_index, ncb_index.ndrange_id);
}; };
ret_kerns.push_back({do_conv, {group, N, OC}}); ret_kerns.push_back({do_conv, {group, N, OC}});


+ 4
- 4
dnn/src/arm_common/conv_bias/int8x8x16/algos.h View File

@@ -18,11 +18,11 @@ namespace arm_common {
class ConvBiasImpl::AlgoI8x8x16Direct final : public AlgoBase { class ConvBiasImpl::AlgoI8x8x16Direct final : public AlgoBase {
SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param) const; SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param) const;
WorkspaceBundle get_bundle(const NCBKernSizeParam& param) const; WorkspaceBundle get_bundle(const NCBKernSizeParam& param) const;
static void copy_padding_kern(WorkspaceBundle bundle,
static void copy_padding_kern(const WorkspaceBundle& bundle,
const NCBKernParam& kern_param, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index, const NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids); const CpuNDRange& workspace_ids);
static void do_conv_kern(WorkspaceBundle bundle,
static void do_conv_kern(const WorkspaceBundle& bundle,
const NCBKernParam& kern_param, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index, const NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids); const CpuNDRange& workspace_ids);
@@ -47,11 +47,11 @@ public:
class ConvBiasImpl::AlgoI8x8x16Stride2 final : public AlgoBase { class ConvBiasImpl::AlgoI8x8x16Stride2 final : public AlgoBase {
SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param) const; SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param) const;
WorkspaceBundle get_bundle(const NCBKernSizeParam& param) const; WorkspaceBundle get_bundle(const NCBKernSizeParam& param) const;
static void copy_padding_kern(WorkspaceBundle bundle,
static void copy_padding_kern(const WorkspaceBundle& bundle,
const NCBKernParam& kern_param, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index, const NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids); const CpuNDRange& workspace_ids);
static void do_conv_kern(WorkspaceBundle bundle,
static void do_conv_kern(const WorkspaceBundle& bundle,
const NCBKernParam& kern_param, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index, const NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids); const CpuNDRange& workspace_ids);


+ 13
- 13
dnn/src/arm_common/conv_bias/quint8/stride1.cpp View File

@@ -99,7 +99,8 @@ WorkspaceBundle direct_quint8_stride1::get_bundle(
} }
//! Process one input channel copy padding //! Process one input channel copy padding
void direct_quint8_stride1::copy_padding_kern( void direct_quint8_stride1::copy_padding_kern(
WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param,
const WorkspaceBundle& bundle,
const ConvBiasImpl::NCBKernParam& kern_param,
const ConvBiasImpl::NCBKernIndex& ncb_index, const ConvBiasImpl::NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids) { const CpuNDRange& workspace_ids) {
size_t IH = kern_param.isz[0]; size_t IH = kern_param.isz[0];
@@ -114,8 +115,6 @@ void direct_quint8_stride1::copy_padding_kern(
bool need_src_copy_var = need_src_copy(kern_param); bool need_src_copy_var = need_src_copy(kern_param);
size_t padding_group_size = IH2 * IW2 * IC; size_t padding_group_size = IH2 * IW2 * IC;


bundle.set(kern_param.workspace_ptr);

//! Used for get the workspace offset //! Used for get the workspace offset
size_t workspace_group_id = workspace_ids[0], size_t workspace_group_id = workspace_ids[0],
workspace_batch_id = workspace_ids[1], workspace_batch_id = workspace_ids[1],
@@ -142,7 +141,7 @@ void direct_quint8_stride1::copy_padding_kern(
}; };
//! compute one output channel //! compute one output channel
template <size_t filter, BiasMode bias_mode, typename Op> template <size_t filter, BiasMode bias_mode, typename Op>
void direct_quint8_stride1::do_conv_kern(WorkspaceBundle bundle,
void direct_quint8_stride1::do_conv_kern(const WorkspaceBundle& bundle,
const NCBKernParam& kern_param, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index, const NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids) { const CpuNDRange& workspace_ids) {
@@ -180,7 +179,6 @@ void direct_quint8_stride1::do_conv_kern(WorkspaceBundle bundle,
} }
size_t padding_group_size = IH2 * IW2 * IC; size_t padding_group_size = IH2 * IW2 * IC;


bundle.set(kern_param.workspace_ptr);
//! Used for get the workspace offset //! Used for get the workspace offset
size_t workspace_group_id = workspace_ids[0], size_t workspace_group_id = workspace_ids[0],
workspace_batch_id = workspace_ids[1], oc = workspace_ids[2], workspace_batch_id = workspace_ids[1], oc = workspace_ids[2],
@@ -272,7 +270,7 @@ SmallVector<ConvBiasImpl::NCBKern> direct_quint8_stride1::get_kimpls(
size_t IC = param.filter_meta.icpg; size_t IC = param.filter_meta.icpg;
size_t OC = param.filter_meta.ocpg; size_t OC = param.filter_meta.ocpg;
size_t group = fm.group; size_t group = fm.group;
WorkspaceBundle wbundle = get_bundle(param, m_large_group);
WorkspaceBundle bundle = get_bundle(param, m_large_group);
conv_fun do_conv_fun = nullptr; conv_fun do_conv_fun = nullptr;


#define DO_CONV_KERN_FUN(filter, bias_mode, op) \ #define DO_CONV_KERN_FUN(filter, bias_mode, op) \
@@ -333,13 +331,13 @@ SmallVector<ConvBiasImpl::NCBKern> direct_quint8_stride1::get_kimpls(


SmallVector<ConvBiasImpl::NCBKern> ret_kerns; SmallVector<ConvBiasImpl::NCBKern> ret_kerns;
if (m_large_group) { if (m_large_group) {
auto exec_one_group = [wbundle, do_conv_fun](
auto exec_one_group = [bundle, do_conv_fun](
const NCBKernParam& kern_param, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
const NCBKernIndex& ncb_index) mutable {
auto fm = kern_param.filter_meta; auto fm = kern_param.filter_meta;
size_t IC = fm.icpg; size_t IC = fm.icpg;
size_t OC = fm.ocpg; size_t OC = fm.ocpg;
WorkspaceBundle bundle = wbundle;
bundle.set(kern_param.workspace_ptr);
for (size_t ic = 0; ic < IC; ic++) { for (size_t ic = 0; ic < IC; ic++) {
copy_padding_kern(bundle, kern_param, ncb_index, copy_padding_kern(bundle, kern_param, ncb_index,
{ncb_index.thread_id, 0, ic}); {ncb_index.thread_id, 0, ic});
@@ -351,15 +349,17 @@ SmallVector<ConvBiasImpl::NCBKern> direct_quint8_stride1::get_kimpls(
}; };
ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); ret_kerns.push_back({exec_one_group, {group, N, 1_z}});
} else { } else {
WorkspaceBundle bundle = wbundle;
auto copy_padding = [bundle](const NCBKernParam& kern_param, auto copy_padding = [bundle](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
const NCBKernIndex& ncb_index) mutable {
bundle.set(kern_param.workspace_ptr);
copy_padding_kern(bundle, kern_param, ncb_index, copy_padding_kern(bundle, kern_param, ncb_index,
ncb_index.ndrange_id); ncb_index.ndrange_id);
}; };
ret_kerns.push_back({copy_padding, {group, N, IC}}); ret_kerns.push_back({copy_padding, {group, N, IC}});
auto do_conv = [bundle, do_conv_fun](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
auto do_conv = [bundle, do_conv_fun](
const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) mutable {
bundle.set(kern_param.workspace_ptr);
do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id); do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id);
}; };
ret_kerns.push_back({do_conv, {group, N, OC}}); ret_kerns.push_back({do_conv, {group, N, OC}});


+ 4
- 3
dnn/src/arm_common/conv_bias/quint8/stride1.h View File

@@ -21,19 +21,20 @@ using NCBKernParam = fallback::ConvBiasImpl::NCBKernParam;
using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex; using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex;


using conv_fun = std::function<void( using conv_fun = std::function<void(
WorkspaceBundle bundle, const NCBKernParam& kern_param,
const WorkspaceBundle& bundle, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids)>; const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids)>;


bool can_conv_direct_stride1_quint8(const NCBKernSizeParam& param); bool can_conv_direct_stride1_quint8(const NCBKernSizeParam& param);


WorkspaceBundle get_bundle(const NCBKernSizeParam& param, bool m_large_group); WorkspaceBundle get_bundle(const NCBKernSizeParam& param, bool m_large_group);


void copy_padding_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param,
void copy_padding_kern(const WorkspaceBundle& bundle,
const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index, const NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids); const CpuNDRange& workspace_ids);


template <size_t filter, BiasMode bias_mode, typename Op> template <size_t filter, BiasMode bias_mode, typename Op>
void do_conv_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param,
void do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index, const NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids); const CpuNDRange& workspace_ids);




+ 13
- 12
dnn/src/arm_common/conv_bias/quint8/stride1_dotprod.cpp View File

@@ -101,7 +101,8 @@ WorkspaceBundle direct_dotprod_quint8_stride1::get_bundle(
} }
//! Process one input channel copy padding //! Process one input channel copy padding
void direct_dotprod_quint8_stride1::copy_padding_kern( void direct_dotprod_quint8_stride1::copy_padding_kern(
WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param,
const WorkspaceBundle& bundle,
const ConvBiasImpl::NCBKernParam& kern_param,
const ConvBiasImpl::NCBKernIndex& ncb_index, const ConvBiasImpl::NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids) { const CpuNDRange& workspace_ids) {
size_t IH = kern_param.isz[0]; size_t IH = kern_param.isz[0];
@@ -115,7 +116,6 @@ void direct_dotprod_quint8_stride1::copy_padding_kern(
get_rectified_size(kern_param, IH2, IW2, OH2, OW2); get_rectified_size(kern_param, IH2, IW2, OH2, OW2);
bool need_src_copy_var = need_src_copy(kern_param); bool need_src_copy_var = need_src_copy(kern_param);
size_t padding_group_size = IH2 * IW2 * IC; size_t padding_group_size = IH2 * IW2 * IC;
bundle.set(kern_param.workspace_ptr);


//! Used for get the workspace offset //! Used for get the workspace offset
size_t workspace_group_id = workspace_ids[0], size_t workspace_group_id = workspace_ids[0],
@@ -144,7 +144,7 @@ void direct_dotprod_quint8_stride1::copy_padding_kern(
//! compute one output channel //! compute one output channel
template <size_t filter, BiasMode bias_mode, typename Op> template <size_t filter, BiasMode bias_mode, typename Op>
void direct_dotprod_quint8_stride1::do_conv_kern( void direct_dotprod_quint8_stride1::do_conv_kern(
WorkspaceBundle bundle, const NCBKernParam& kern_param,
const WorkspaceBundle& bundle, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) {
size_t OH = kern_param.osz[0]; size_t OH = kern_param.osz[0];
size_t OW = kern_param.osz[1]; size_t OW = kern_param.osz[1];
@@ -177,7 +177,6 @@ void direct_dotprod_quint8_stride1::do_conv_kern(
} }
size_t padding_group_size = IH2 * IW2 * IC; size_t padding_group_size = IH2 * IW2 * IC;


bundle.set(kern_param.workspace_ptr);
//! Used for get the workspace offset //! Used for get the workspace offset
size_t workspace_group_id = workspace_ids[0], size_t workspace_group_id = workspace_ids[0],
workspace_batch_id = workspace_ids[1], oc = workspace_ids[2], workspace_batch_id = workspace_ids[1], oc = workspace_ids[2],
@@ -271,7 +270,7 @@ SmallVector<ConvBiasImpl::NCBKern> direct_dotprod_quint8_stride1::get_kimpls(
size_t IC = param.filter_meta.icpg; size_t IC = param.filter_meta.icpg;
size_t OC = param.filter_meta.ocpg; size_t OC = param.filter_meta.ocpg;
size_t group = fm.group; size_t group = fm.group;
WorkspaceBundle wbundle = get_bundle(param, m_large_group);
WorkspaceBundle bundle = get_bundle(param, m_large_group);
conv_fun do_conv_fun = nullptr; conv_fun do_conv_fun = nullptr;


#define DO_CONV_KERN_FUN(filter, bias_mode, op) \ #define DO_CONV_KERN_FUN(filter, bias_mode, op) \
@@ -332,13 +331,13 @@ SmallVector<ConvBiasImpl::NCBKern> direct_dotprod_quint8_stride1::get_kimpls(


SmallVector<ConvBiasImpl::NCBKern> ret_kerns; SmallVector<ConvBiasImpl::NCBKern> ret_kerns;
if (m_large_group) { if (m_large_group) {
auto exec_one_group = [wbundle, do_conv_fun](
auto exec_one_group = [bundle, do_conv_fun](
const NCBKernParam& kern_param, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
const NCBKernIndex& ncb_index) mutable {
auto fm = kern_param.filter_meta; auto fm = kern_param.filter_meta;
size_t IC = fm.icpg; size_t IC = fm.icpg;
size_t OC = fm.ocpg; size_t OC = fm.ocpg;
WorkspaceBundle bundle = wbundle;
bundle.set(kern_param.workspace_ptr);
for (size_t ic = 0; ic < IC; ic++) { for (size_t ic = 0; ic < IC; ic++) {
copy_padding_kern(bundle, kern_param, ncb_index, copy_padding_kern(bundle, kern_param, ncb_index,
{ncb_index.thread_id, 0, ic}); {ncb_index.thread_id, 0, ic});
@@ -350,15 +349,17 @@ SmallVector<ConvBiasImpl::NCBKern> direct_dotprod_quint8_stride1::get_kimpls(
}; };
ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); ret_kerns.push_back({exec_one_group, {group, N, 1_z}});
}else { }else {
WorkspaceBundle bundle = wbundle;
auto copy_padding = [bundle](const NCBKernParam& kern_param, auto copy_padding = [bundle](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
const NCBKernIndex& ncb_index) mutable {
bundle.set(kern_param.workspace_ptr);
copy_padding_kern(bundle, kern_param, ncb_index, copy_padding_kern(bundle, kern_param, ncb_index,
ncb_index.ndrange_id); ncb_index.ndrange_id);
}; };
ret_kerns.push_back({copy_padding, {group, N, IC}}); ret_kerns.push_back({copy_padding, {group, N, IC}});
auto do_conv = [bundle, do_conv_fun](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
auto do_conv = [bundle, do_conv_fun](
const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) mutable {
bundle.set(kern_param.workspace_ptr);
do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id); do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id);
}; };
ret_kerns.push_back({do_conv, {group, N, OC}}); ret_kerns.push_back({do_conv, {group, N, OC}});


+ 4
- 3
dnn/src/arm_common/conv_bias/quint8/stride1_dotprod.h View File

@@ -21,19 +21,20 @@ using NCBKernParam = fallback::ConvBiasImpl::NCBKernParam;
using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex; using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex;


using conv_fun = std::function<void( using conv_fun = std::function<void(
WorkspaceBundle bundle, const NCBKernParam& kern_param,
const WorkspaceBundle& bundle, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids)>; const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids)>;


bool can_conv_direct_stride1_quint8(const NCBKernSizeParam& param); bool can_conv_direct_stride1_quint8(const NCBKernSizeParam& param);


WorkspaceBundle get_bundle(const NCBKernSizeParam& param, bool m_large_group); WorkspaceBundle get_bundle(const NCBKernSizeParam& param, bool m_large_group);


void copy_padding_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param,
void copy_padding_kern(const WorkspaceBundle& bundle,
const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index, const NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids); const CpuNDRange& workspace_ids);


template <size_t filter, BiasMode bias_mode, typename Op> template <size_t filter, BiasMode bias_mode, typename Op>
void do_conv_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param,
void do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index, const NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids); const CpuNDRange& workspace_ids);




+ 13
- 12
dnn/src/arm_common/conv_bias/quint8/stride2.cpp View File

@@ -108,7 +108,8 @@ WorkspaceBundle direct_quint8_stride2::get_bundle(
} }
//! Process one input channel copy padding //! Process one input channel copy padding
void direct_quint8_stride2::copy_padding_kern( void direct_quint8_stride2::copy_padding_kern(
WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param,
const WorkspaceBundle& bundle,
const ConvBiasImpl::NCBKernParam& kern_param,
const ConvBiasImpl::NCBKernIndex& ncb_index, const ConvBiasImpl::NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids) { const CpuNDRange& workspace_ids) {
size_t IH = kern_param.isz[0]; size_t IH = kern_param.isz[0];
@@ -122,7 +123,6 @@ void direct_quint8_stride2::copy_padding_kern(
get_rectified_size(kern_param, IH2, IW2, OH2, OW2); get_rectified_size(kern_param, IH2, IW2, OH2, OW2);
bool need_src_copy_var = need_src_copy(kern_param); bool need_src_copy_var = need_src_copy(kern_param);
size_t padding_group_size = IH2 * IW2 * IC; size_t padding_group_size = IH2 * IW2 * IC;
bundle.set(kern_param.workspace_ptr);


//! Used for get the workspace offset //! Used for get the workspace offset
size_t workspace_group_id = workspace_ids[0], size_t workspace_group_id = workspace_ids[0],
@@ -149,7 +149,7 @@ void direct_quint8_stride2::copy_padding_kern(
}; };
//! compute one output channel //! compute one output channel
template <size_t filter, BiasMode bias_mode, typename Op> template <size_t filter, BiasMode bias_mode, typename Op>
void direct_quint8_stride2::do_conv_kern(WorkspaceBundle bundle,
void direct_quint8_stride2::do_conv_kern(const WorkspaceBundle& bundle,
const NCBKernParam& kern_param, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index, const NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids) { const CpuNDRange& workspace_ids) {
@@ -187,7 +187,6 @@ void direct_quint8_stride2::do_conv_kern(WorkspaceBundle bundle,
} }
size_t padding_group_size = IH2 * IW2 * IC; size_t padding_group_size = IH2 * IW2 * IC;


bundle.set(kern_param.workspace_ptr);
//! Used for get the workspace offset //! Used for get the workspace offset
size_t workspace_group_id = workspace_ids[0], size_t workspace_group_id = workspace_ids[0],
workspace_batch_id = workspace_ids[1], oc = workspace_ids[2], workspace_batch_id = workspace_ids[1], oc = workspace_ids[2],
@@ -279,7 +278,7 @@ SmallVector<ConvBiasImpl::NCBKern> direct_quint8_stride2::get_kimpls(
size_t IC = param.filter_meta.icpg; size_t IC = param.filter_meta.icpg;
size_t OC = param.filter_meta.ocpg; size_t OC = param.filter_meta.ocpg;
size_t group = fm.group; size_t group = fm.group;
WorkspaceBundle wbundle = get_bundle(param, m_large_group);
WorkspaceBundle bundle = get_bundle(param, m_large_group);
conv_fun do_conv_fun = nullptr; conv_fun do_conv_fun = nullptr;


#define DO_CONV_KERN_FUN(filter, bias_mode, op) \ #define DO_CONV_KERN_FUN(filter, bias_mode, op) \
@@ -340,13 +339,13 @@ SmallVector<ConvBiasImpl::NCBKern> direct_quint8_stride2::get_kimpls(


SmallVector<ConvBiasImpl::NCBKern> ret_kerns; SmallVector<ConvBiasImpl::NCBKern> ret_kerns;
if (m_large_group) { if (m_large_group) {
auto exec_one_group = [wbundle, do_conv_fun](
auto exec_one_group = [bundle, do_conv_fun](
const NCBKernParam& kern_param, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
const NCBKernIndex& ncb_index) mutable {
auto fm = kern_param.filter_meta; auto fm = kern_param.filter_meta;
size_t IC = fm.icpg; size_t IC = fm.icpg;
size_t OC = fm.ocpg; size_t OC = fm.ocpg;
WorkspaceBundle bundle = wbundle;
bundle.set(kern_param.workspace_ptr);
for (size_t ic = 0; ic < IC; ic++) { for (size_t ic = 0; ic < IC; ic++) {
copy_padding_kern(bundle, kern_param, ncb_index, copy_padding_kern(bundle, kern_param, ncb_index,
{ncb_index.thread_id, 0, ic}); {ncb_index.thread_id, 0, ic});
@@ -358,15 +357,17 @@ SmallVector<ConvBiasImpl::NCBKern> direct_quint8_stride2::get_kimpls(
}; };
ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); ret_kerns.push_back({exec_one_group, {group, N, 1_z}});
}else { }else {
WorkspaceBundle bundle = wbundle;
auto copy_padding = [bundle](const NCBKernParam& kern_param, auto copy_padding = [bundle](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
const NCBKernIndex& ncb_index) mutable {
bundle.set(kern_param.workspace_ptr);
copy_padding_kern(bundle, kern_param, ncb_index, copy_padding_kern(bundle, kern_param, ncb_index,
ncb_index.ndrange_id); ncb_index.ndrange_id);
}; };
ret_kerns.push_back({copy_padding, {group, N, IC}}); ret_kerns.push_back({copy_padding, {group, N, IC}});
auto do_conv = [bundle, do_conv_fun](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
auto do_conv = [bundle, do_conv_fun](
const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) mutable {
bundle.set(kern_param.workspace_ptr);
do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id); do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id);
}; };
ret_kerns.push_back({do_conv, {group, N, OC}}); ret_kerns.push_back({do_conv, {group, N, OC}});


+ 4
- 3
dnn/src/arm_common/conv_bias/quint8/stride2.h View File

@@ -21,19 +21,20 @@ using NCBKernParam = fallback::ConvBiasImpl::NCBKernParam;
using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex; using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex;


using conv_fun = std::function<void( using conv_fun = std::function<void(
WorkspaceBundle bundle, const NCBKernParam& kern_param,
const WorkspaceBundle& bundle, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids)>; const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids)>;


bool can_conv_direct_stride2_quint8(const NCBKernSizeParam& param); bool can_conv_direct_stride2_quint8(const NCBKernSizeParam& param);


WorkspaceBundle get_bundle(const NCBKernSizeParam& param, bool m_large_group); WorkspaceBundle get_bundle(const NCBKernSizeParam& param, bool m_large_group);


void copy_padding_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param,
void copy_padding_kern(const WorkspaceBundle& bundle,
const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index, const NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids); const CpuNDRange& workspace_ids);


template <size_t filter, BiasMode bias_mode, typename Op> template <size_t filter, BiasMode bias_mode, typename Op>
void do_conv_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param,
void do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index, const NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids); const CpuNDRange& workspace_ids);




+ 15
- 13
dnn/src/arm_common/conv_bias/quint8/stride2_dotprod.cpp View File

@@ -108,8 +108,10 @@ WorkspaceBundle direct_dotprod_quint8_stride2::get_bundle(
} }
//! Process one input channel copy padding //! Process one input channel copy padding
void direct_dotprod_quint8_stride2::copy_padding_kern( void direct_dotprod_quint8_stride2::copy_padding_kern(
WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param,
const ConvBiasImpl::NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) {
const WorkspaceBundle& bundle,
const ConvBiasImpl::NCBKernParam& kern_param,
const ConvBiasImpl::NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids) {
size_t IH = kern_param.isz[0]; size_t IH = kern_param.isz[0];
size_t IW = kern_param.isz[1]; size_t IW = kern_param.isz[1];
size_t IC = kern_param.filter_meta.icpg; size_t IC = kern_param.filter_meta.icpg;
@@ -121,7 +123,6 @@ void direct_dotprod_quint8_stride2::copy_padding_kern(
get_rectified_size(kern_param, IH2, IW2, OH2, OW2); get_rectified_size(kern_param, IH2, IW2, OH2, OW2);
bool need_src_copy_var = need_src_copy(kern_param); bool need_src_copy_var = need_src_copy(kern_param);
size_t padding_group_size = IH2 * IW2 * IC; size_t padding_group_size = IH2 * IW2 * IC;
bundle.set(kern_param.workspace_ptr);


//! Used for get the workspace offset //! Used for get the workspace offset
size_t workspace_group_id = workspace_ids[0], size_t workspace_group_id = workspace_ids[0],
@@ -149,7 +150,7 @@ void direct_dotprod_quint8_stride2::copy_padding_kern(
//! compute one output channel //! compute one output channel
template <size_t filter, BiasMode bias_mode, typename Op> template <size_t filter, BiasMode bias_mode, typename Op>
void direct_dotprod_quint8_stride2::do_conv_kern( void direct_dotprod_quint8_stride2::do_conv_kern(
WorkspaceBundle bundle, const NCBKernParam& kern_param,
const WorkspaceBundle& bundle, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) {
size_t OH = kern_param.osz[0]; size_t OH = kern_param.osz[0];
size_t OW = kern_param.osz[1]; size_t OW = kern_param.osz[1];
@@ -182,7 +183,6 @@ void direct_dotprod_quint8_stride2::do_conv_kern(
} }
size_t padding_group_size = IH2 * IW2 * IC; size_t padding_group_size = IH2 * IW2 * IC;


bundle.set(kern_param.workspace_ptr);
//! Used for get the workspace offset //! Used for get the workspace offset
size_t workspace_group_id = workspace_ids[0], size_t workspace_group_id = workspace_ids[0],
workspace_batch_id = workspace_ids[1], oc = workspace_ids[2], workspace_batch_id = workspace_ids[1], oc = workspace_ids[2],
@@ -276,7 +276,7 @@ SmallVector<ConvBiasImpl::NCBKern> direct_dotprod_quint8_stride2::get_kimpls(
size_t IC = param.filter_meta.icpg; size_t IC = param.filter_meta.icpg;
size_t OC = param.filter_meta.ocpg; size_t OC = param.filter_meta.ocpg;
size_t group = fm.group; size_t group = fm.group;
WorkspaceBundle wbundle = get_bundle(param, m_large_group);
WorkspaceBundle bundle = get_bundle(param, m_large_group);
conv_fun do_conv_fun = nullptr; conv_fun do_conv_fun = nullptr;


#define DO_CONV_KERN_FUN(filter, bias_mode, op) \ #define DO_CONV_KERN_FUN(filter, bias_mode, op) \
@@ -337,13 +337,13 @@ SmallVector<ConvBiasImpl::NCBKern> direct_dotprod_quint8_stride2::get_kimpls(


SmallVector<ConvBiasImpl::NCBKern> ret_kerns; SmallVector<ConvBiasImpl::NCBKern> ret_kerns;
if (m_large_group) { if (m_large_group) {
auto exec_one_group = [wbundle, do_conv_fun](
auto exec_one_group = [bundle, do_conv_fun](
const NCBKernParam& kern_param, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
const NCBKernIndex& ncb_index) mutable {
auto fm = kern_param.filter_meta; auto fm = kern_param.filter_meta;
size_t IC = fm.icpg; size_t IC = fm.icpg;
size_t OC = fm.ocpg; size_t OC = fm.ocpg;
WorkspaceBundle bundle = wbundle;
bundle.set(kern_param.workspace_ptr);
for (size_t ic = 0; ic < IC; ic++) { for (size_t ic = 0; ic < IC; ic++) {
copy_padding_kern(bundle, kern_param, ncb_index, copy_padding_kern(bundle, kern_param, ncb_index,
{ncb_index.thread_id, 0, ic}); {ncb_index.thread_id, 0, ic});
@@ -355,15 +355,17 @@ SmallVector<ConvBiasImpl::NCBKern> direct_dotprod_quint8_stride2::get_kimpls(
}; };
ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); ret_kerns.push_back({exec_one_group, {group, N, 1_z}});
}else { }else {
WorkspaceBundle bundle = wbundle;
auto copy_padding = [bundle](const NCBKernParam& kern_param, auto copy_padding = [bundle](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
const NCBKernIndex& ncb_index) mutable {
bundle.set(kern_param.workspace_ptr);
copy_padding_kern(bundle, kern_param, ncb_index, copy_padding_kern(bundle, kern_param, ncb_index,
ncb_index.ndrange_id); ncb_index.ndrange_id);
}; };
ret_kerns.push_back({copy_padding, {group, N, IC}}); ret_kerns.push_back({copy_padding, {group, N, IC}});
auto do_conv = [bundle, do_conv_fun](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
auto do_conv = [bundle, do_conv_fun](
const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) mutable {
bundle.set(kern_param.workspace_ptr);
do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id); do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id);
}; };
ret_kerns.push_back({do_conv, {group, N, OC}}); ret_kerns.push_back({do_conv, {group, N, OC}});


+ 4
- 3
dnn/src/arm_common/conv_bias/quint8/stride2_dotprod.h View File

@@ -21,19 +21,20 @@ using NCBKernParam = fallback::ConvBiasImpl::NCBKernParam;
using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex; using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex;


using conv_fun = std::function<void( using conv_fun = std::function<void(
WorkspaceBundle bundle, const NCBKernParam& kern_param,
const WorkspaceBundle& bundle, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids)>; const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids)>;


bool can_conv_direct_stride2_quint8(const NCBKernSizeParam& param); bool can_conv_direct_stride2_quint8(const NCBKernSizeParam& param);


WorkspaceBundle get_bundle(const NCBKernSizeParam& param, bool m_large_group); WorkspaceBundle get_bundle(const NCBKernSizeParam& param, bool m_large_group);


void copy_padding_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param,
void copy_padding_kern(const WorkspaceBundle& bundle,
const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index, const NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids); const CpuNDRange& workspace_ids);


template <size_t filter, BiasMode bias_mode, typename Op> template <size_t filter, BiasMode bias_mode, typename Op>
void do_conv_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param,
void do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index, const NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids); const CpuNDRange& workspace_ids);




+ 29
- 22
dnn/src/fallback/conv_bias/im2col/algos.cpp View File

@@ -39,7 +39,7 @@ struct Im2colBundelIndex {
using Pack_Mode=fallback::MatrixMulImpl::AlgoBase::PackMode; using Pack_Mode=fallback::MatrixMulImpl::AlgoBase::PackMode;


//! Process one input channel copy padding //! Process one input channel copy padding
static void copy_padding_kern(WorkspaceBundle bundle,
static void copy_padding_kern(WorkspaceBundle& bundle,
const ConvBiasImpl::NCBKernParam& param, const ConvBiasImpl::NCBKernParam& param,
const ConvBiasImpl::NCBKernIndex& ncb_index, const ConvBiasImpl::NCBKernIndex& ncb_index,
StrategyBase* im2colstrategy, size_t pack_oc_size) { StrategyBase* im2colstrategy, size_t pack_oc_size) {
@@ -48,7 +48,7 @@ static void copy_padding_kern(WorkspaceBundle bundle,


//! packA_kern //! packA_kern
static void packA_kern( static void packA_kern(
WorkspaceBundle bundle,
WorkspaceBundle& bundle,
const fallback::ConvBiasImpl::NCBKernParam& param, const fallback::ConvBiasImpl::NCBKernParam& param,
fallback::MatrixMulImpl::KernSizeParam matmulparam, fallback::MatrixMulImpl::KernSizeParam matmulparam,
fallback::MatrixMulImpl::AlgoBase* matmul_algo, fallback::MatrixMulImpl::AlgoBase* matmul_algo,
@@ -72,11 +72,12 @@ class Im2colKerns<Pack_Mode::DEFAULT> {
public: public:
//! conv kernel //! conv kernel
static void kerns( static void kerns(
WorkspaceBundle bundle, WorkspaceBundle bundle_thread,
const WorkspaceBundle& bundle, WorkspaceBundle bundle_thread,
const ConvBiasImpl::NCBKernParam& param, const ConvBiasImpl::NCBKernParam& param,
fallback::MatrixMulImpl::KernSizeParam matmul_kernsize_param, fallback::MatrixMulImpl::KernSizeParam matmul_kernsize_param,
const fallback::MatrixMulImpl::AlgoBase* matmul_algo, const fallback::MatrixMulImpl::AlgoBase* matmul_algo,
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& matmul_desc,
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription&
matmul_desc,
StrategyParam strategyparam, StrategyParam strategyparam,
fallback::ConvBiasImpl::NCBKernIndex ncb_index, fallback::ConvBiasImpl::NCBKernIndex ncb_index,
size_t ohw_tile_size, StrategyBase* im2colstrategy) { size_t ohw_tile_size, StrategyBase* im2colstrategy) {
@@ -100,7 +101,6 @@ public:
strategyparam.output_block_oc_size = output_block_oc_size; strategyparam.output_block_oc_size = output_block_oc_size;
strategyparam.output_block_size = output_block_size; strategyparam.output_block_size = output_block_size;


bundle.set(param.workspace_ptr);
bundle_thread.set( bundle_thread.set(
static_cast<int8_t*>( static_cast<int8_t*>(
bundle.get(Im2colBundelIndex::BUNDLE_THREAD_INDEX)) + bundle.get(Im2colBundelIndex::BUNDLE_THREAD_INDEX)) +
@@ -153,11 +153,12 @@ class Im2colKerns<Pack_Mode::ONLY_PACKA> {
public: public:
//! conv kernel //! conv kernel
static void kerns( static void kerns(
WorkspaceBundle bundle, WorkspaceBundle bundle_thread,
const WorkspaceBundle& bundle, WorkspaceBundle bundle_thread,
const ConvBiasImpl::NCBKernParam& param, const ConvBiasImpl::NCBKernParam& param,
fallback::MatrixMulImpl::KernSizeParam matmul_kernsize_param, fallback::MatrixMulImpl::KernSizeParam matmul_kernsize_param,
const fallback::MatrixMulImpl::AlgoBase* matmul_algo, const fallback::MatrixMulImpl::AlgoBase* matmul_algo,
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& matmul_desc,
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription&
matmul_desc,
StrategyParam strategyparam, StrategyParam strategyparam,
fallback::ConvBiasImpl::NCBKernIndex ncb_index, fallback::ConvBiasImpl::NCBKernIndex ncb_index,
size_t ohw_tile_size, StrategyBase* im2colstrategy) { size_t ohw_tile_size, StrategyBase* im2colstrategy) {
@@ -169,7 +170,6 @@ public:
strategyparam.oc_tile_size, strategyparam.oc_tile_size,
OC - ncb_index.ndrange_id[3] * strategyparam.oc_tile_size); OC - ncb_index.ndrange_id[3] * strategyparam.oc_tile_size);


bundle.set(param.workspace_ptr);
bundle_thread.set( bundle_thread.set(
static_cast<int8_t*>( static_cast<int8_t*>(
bundle.get(Im2colBundelIndex::BUNDLE_THREAD_INDEX)) + bundle.get(Im2colBundelIndex::BUNDLE_THREAD_INDEX)) +
@@ -236,11 +236,12 @@ class Im2colKerns<Pack_Mode::NO_PACK> {
public: public:
//! conv kernel //! conv kernel
static void kerns( static void kerns(
WorkspaceBundle bundle, WorkspaceBundle bundle_thread,
const WorkspaceBundle& bundle, WorkspaceBundle bundle_thread,
const ConvBiasImpl::NCBKernParam& param, const ConvBiasImpl::NCBKernParam& param,
fallback::MatrixMulImpl::KernSizeParam matmul_kernsize_param, fallback::MatrixMulImpl::KernSizeParam matmul_kernsize_param,
const fallback::MatrixMulImpl::AlgoBase* matmul_algo, const fallback::MatrixMulImpl::AlgoBase* matmul_algo,
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& matmul_desc,
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription&
matmul_desc,
StrategyParam strategyparam, StrategyParam strategyparam,
fallback::ConvBiasImpl::NCBKernIndex ncb_index, fallback::ConvBiasImpl::NCBKernIndex ncb_index,
size_t ohw_tile_size, StrategyBase* im2colstrategy) { size_t ohw_tile_size, StrategyBase* im2colstrategy) {
@@ -264,7 +265,6 @@ public:
strategyparam.output_block_oc_size = output_block_oc_size; strategyparam.output_block_oc_size = output_block_oc_size;
strategyparam.output_block_size = output_block_size; strategyparam.output_block_size = output_block_size;


bundle.set(param.workspace_ptr);
bundle_thread.set( bundle_thread.set(
static_cast<int8_t*>( static_cast<int8_t*>(
bundle.get(Im2colBundelIndex::BUNDLE_THREAD_INDEX)) + bundle.get(Im2colBundelIndex::BUNDLE_THREAD_INDEX)) +
@@ -567,16 +567,18 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoIm2col::dispatch_kerns(
auto kern_padding = [bundle, im2colstrategy, auto kern_padding = [bundle, im2colstrategy,
pack_oc_size = pack_oc_size]( pack_oc_size = pack_oc_size](
const NCBKernParam& param, const NCBKernParam& param,
const NCBKernIndex& ncb_index) {
const NCBKernIndex& ncb_index) mutable {
bundle.set(param.workspace_ptr);
copy_padding_kern(bundle, param, ncb_index, im2colstrategy, copy_padding_kern(bundle, param, ncb_index, im2colstrategy,
pack_oc_size); pack_oc_size);
}; };


auto kern_packA = [bundle, matmul_algo = m_matmul_algo, auto kern_packA = [bundle, matmul_algo = m_matmul_algo,
matmul_param, im2colstrategy, matmul_param, im2colstrategy,
pack_oc_size = pack_oc_size,
mdesc = mdesc](const NCBKernParam& param,
const NCBKernIndex& ncb_index) {
pack_oc_size = pack_oc_size, mdesc = mdesc](
const NCBKernParam& param,
const NCBKernIndex& ncb_index) mutable {
bundle.set(param.workspace_ptr);
packA_kern(bundle, param, matmul_param, matmul_algo, ncb_index, packA_kern(bundle, param, matmul_param, matmul_algo, ncb_index,
im2colstrategy, mdesc, pack_oc_size); im2colstrategy, mdesc, pack_oc_size);
}; };
@@ -586,8 +588,10 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoIm2col::dispatch_kerns(
matmul_algo = m_matmul_algo, matmul_algo = m_matmul_algo,
ohw_tile_size = ohw_tile_size, ohw_tile_size = ohw_tile_size,
strategyparam = strategyparam, matmul_desc = mdesc, strategyparam = strategyparam, matmul_desc = mdesc,
im2colstrategy](const NCBKernParam& param,
const NCBKernIndex& ncb_index) {
im2colstrategy](
const NCBKernParam& param,
const NCBKernIndex& ncb_index) mutable {
bundle.set(param.workspace_ptr);
Im2colKerns<Pack_Mode::DEFAULT>::kerns( Im2colKerns<Pack_Mode::DEFAULT>::kerns(
bundle, bundle_thread, param, matmul_param, bundle, bundle_thread, param, matmul_param,
matmul_algo, matmul_desc, strategyparam, matmul_algo, matmul_desc, strategyparam,
@@ -608,8 +612,10 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoIm2col::dispatch_kerns(
matmul_algo = m_matmul_algo, matmul_algo = m_matmul_algo,
strategyparam = strategyparam, strategyparam = strategyparam,
ohw_tile_size = ohw_tile_size, matmul_desc = mdesc, ohw_tile_size = ohw_tile_size, matmul_desc = mdesc,
im2colstrategy](const NCBKernParam& param,
const NCBKernIndex& ncb_index) {
im2colstrategy](
const NCBKernParam& param,
const NCBKernIndex& ncb_index) mutable {
bundle.set(param.workspace_ptr);
Im2colKerns<Pack_Mode::ONLY_PACKA>::kerns( Im2colKerns<Pack_Mode::ONLY_PACKA>::kerns(
bundle, bundle_thread, param, matmul_param, bundle, bundle_thread, param, matmul_param,
matmul_algo, matmul_desc, strategyparam, matmul_algo, matmul_desc, strategyparam,
@@ -628,14 +634,15 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoIm2col::dispatch_kerns(
matmul_algo = m_matmul_algo, matmul_algo = m_matmul_algo,
strategyparam = strategyparam, strategyparam = strategyparam,
ohw_tile_size = ohw_tile_size, matmul_desc = mdesc, ohw_tile_size = ohw_tile_size, matmul_desc = mdesc,
im2colstrategy](const NCBKernParam& param,
const NCBKernIndex& ncb_index) {
im2colstrategy](
const NCBKernParam& param,
const NCBKernIndex& ncb_index) mutable {
bundle.set(param.workspace_ptr);
Im2colKerns<Pack_Mode::NO_PACK>::kerns( Im2colKerns<Pack_Mode::NO_PACK>::kerns(
bundle, bundle_thread, param, matmul_param, bundle, bundle_thread, param, matmul_param,
matmul_algo, matmul_desc, strategyparam, matmul_algo, matmul_desc, strategyparam,
ncb_index, ohw_tile_size, im2colstrategy); ncb_index, ohw_tile_size, im2colstrategy);
}; };

if (need_padding) { if (need_padding) {
ret_kern.push_back({kern_padding, {param.n, GROUP, IC}}); ret_kern.push_back({kern_padding, {param.n, GROUP, IC}});
} }


+ 86
- 81
dnn/src/fallback/conv_bias/im2col/strategy_base.h View File

@@ -50,21 +50,22 @@ public:
StrategyBase() = default; StrategyBase() = default;
virtual ~StrategyBase() = default; virtual ~StrategyBase() = default;
virtual void copy_padding_kern( virtual void copy_padding_kern(
WorkspaceBundle bundle,
const WorkspaceBundle& bundle,
const fallback::ConvBiasImpl::NCBKernParam& param, const fallback::ConvBiasImpl::NCBKernParam& param,
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, const fallback::ConvBiasImpl::NCBKernIndex& ncb_index,
size_t pack_size) = 0; size_t pack_size) = 0;
virtual void packA_kern( virtual void packA_kern(
WorkspaceBundle bundle,
const WorkspaceBundle& bundle,
const fallback::ConvBiasImpl::NCBKernParam& param, const fallback::ConvBiasImpl::NCBKernParam& param,
fallback::MatrixMulImpl::KernSizeParam matmulparam, fallback::MatrixMulImpl::KernSizeParam matmulparam,
const fallback::MatrixMulImpl::AlgoBase* matmul_algo, const fallback::MatrixMulImpl::AlgoBase* matmul_algo,
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, const fallback::ConvBiasImpl::NCBKernIndex& ncb_index,
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& matmul_desec,
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription&
matmul_desec,
size_t pack_size) = 0; size_t pack_size) = 0;


virtual void exec_im2col( virtual void exec_im2col(
WorkspaceBundle bundle, WorkspaceBundle bundle_thread,
const WorkspaceBundle& bundle, const WorkspaceBundle& bundle_thread,
const StrategyParam& sparam, const StrategyParam& sparam,
const fallback::ConvBiasImpl::NCBKernParam& param, const fallback::ConvBiasImpl::NCBKernParam& param,
fallback::MatrixMulImpl::KernParam matmul_param, fallback::MatrixMulImpl::KernParam matmul_param,
@@ -72,17 +73,18 @@ public:


virtual void exec_matmul( virtual void exec_matmul(
const fallback::ConvBiasImpl::NCBKernParam& param, const fallback::ConvBiasImpl::NCBKernParam& param,
const StrategyParam& sparam, WorkspaceBundle bundle,
WorkspaceBundle bundle_thread,
const StrategyParam& sparam, const WorkspaceBundle& bundle,
const WorkspaceBundle& bundle_thread,
fallback::MatrixMulImpl::KernParam matmul_param, fallback::MatrixMulImpl::KernParam matmul_param,
const fallback::MatrixMulImpl::AlgoBase* matmul_algo, const fallback::MatrixMulImpl::AlgoBase* matmul_algo,
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, const fallback::ConvBiasImpl::NCBKernIndex& ncb_index,
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& matmul_desc
) = 0;
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription&
matmul_desc) = 0;


virtual void exec_postprocess( virtual void exec_postprocess(
const fallback::ConvBiasImpl::NCBKernParam& param, const fallback::ConvBiasImpl::NCBKernParam& param,
const StrategyParam& sparam, WorkspaceBundle bundle_thread) = 0;
const StrategyParam& sparam,
const WorkspaceBundle& bundle_thread) = 0;
}; };


template <typename src_ctype, typename bias_ctype, typename dst_ctype, template <typename src_ctype, typename bias_ctype, typename dst_ctype,
@@ -98,7 +100,7 @@ public:
StrategyBridge() = default; StrategyBridge() = default;


virtual void copy_padding_kern( virtual void copy_padding_kern(
WorkspaceBundle bundle,
const WorkspaceBundle& bundle,
const fallback::ConvBiasImpl::NCBKernParam& param, const fallback::ConvBiasImpl::NCBKernParam& param,
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, const fallback::ConvBiasImpl::NCBKernIndex& ncb_index,
size_t pack_oc_size) override { size_t pack_oc_size) override {
@@ -126,7 +128,6 @@ public:
size_t workspace_group_offset = group_id * padding_group_size; size_t workspace_group_offset = group_id * padding_group_size;
size_t workspace_batch_offset = size_t workspace_batch_offset =
param.filter_meta.group * batch_id * padding_group_size; param.filter_meta.group * batch_id * padding_group_size;
bundle.set(param.workspace_ptr);


src_ctype src_zp = static_cast<src_ctype>(0); src_ctype src_zp = static_cast<src_ctype>(0);
if (param.src_type.enumv() == DTypeEnum::Quantized8Asymm) { if (param.src_type.enumv() == DTypeEnum::Quantized8Asymm) {
@@ -212,8 +213,8 @@ void copy_dst(const fallback::ConvBiasImpl::NCBKernParam& param,


template <typename bias_ctype> template <typename bias_ctype>
void copy_bias(const fallback::ConvBiasImpl::NCBKernParam& param, void copy_bias(const fallback::ConvBiasImpl::NCBKernParam& param,
WorkspaceBundle bundle_thread, const StrategyParam& sparam,
size_t bias_index) {
const WorkspaceBundle& bundle_thread,
const StrategyParam& sparam, size_t bias_index) {
const bias_ctype* bias_ptr = static_cast<const bias_ctype*>( const bias_ctype* bias_ptr = static_cast<const bias_ctype*>(
param.bias<bias_ctype>(sparam.batch_id, sparam.group_id)); param.bias<bias_ctype>(sparam.batch_id, sparam.group_id));
bias_ctype* bias_temp_ptr = static_cast<bias_ctype*>( bias_ctype* bias_temp_ptr = static_cast<bias_ctype*>(
@@ -235,11 +236,11 @@ void copy_bias(const fallback::ConvBiasImpl::NCBKernParam& param,
} }
} }


template <typename bias_ctype, typename dst_ctype,
typename op_ctype, typename op_dtype,
megdnn::PostprocessMode postprocess_mode>
template <typename bias_ctype, typename dst_ctype, typename op_ctype,
typename op_dtype, megdnn::PostprocessMode postprocess_mode>
void do_postprocess(const fallback::ConvBiasImpl::NCBKernParam& param, void do_postprocess(const fallback::ConvBiasImpl::NCBKernParam& param,
const StrategyParam& sparam, WorkspaceBundle bundle_thread,
const StrategyParam& sparam,
const WorkspaceBundle& bundle_thread,
size_t matmul_bundle_index, size_t bias_bundle_index) { size_t matmul_bundle_index, size_t bias_bundle_index) {
copy_bias<bias_ctype>(param, bundle_thread, sparam, bias_bundle_index); copy_bias<bias_ctype>(param, bundle_thread, sparam, bias_bundle_index);
void* matmul_dst = get_matmul_dst_ptr<bias_ctype>( void* matmul_dst = get_matmul_dst_ptr<bias_ctype>(
@@ -288,32 +289,32 @@ public:
Strategy() = default; Strategy() = default;


virtual void packA_kern( virtual void packA_kern(
WorkspaceBundle bundle,
const WorkspaceBundle& bundle,
const fallback::ConvBiasImpl::NCBKernParam& param, const fallback::ConvBiasImpl::NCBKernParam& param,
fallback::MatrixMulImpl::KernSizeParam matmulparam, fallback::MatrixMulImpl::KernSizeParam matmulparam,
const fallback::MatrixMulImpl::AlgoBase* matmul_algo, const fallback::MatrixMulImpl::AlgoBase* matmul_algo,
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, const fallback::ConvBiasImpl::NCBKernIndex& ncb_index,
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& matmul_desc,
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription&
matmul_desc,
size_t pack_size) override; size_t pack_size) override;
virtual void exec_im2col( virtual void exec_im2col(
WorkspaceBundle bundle, WorkspaceBundle bundle_thread,
const WorkspaceBundle& bundle, const WorkspaceBundle& bundle_thread,
const StrategyParam& sparam, const StrategyParam& sparam,
const fallback::ConvBiasImpl::NCBKernParam& param, const fallback::ConvBiasImpl::NCBKernParam& param,
fallback::MatrixMulImpl::KernParam matmul_param, fallback::MatrixMulImpl::KernParam matmul_param,
const fallback::MatrixMulImpl::AlgoBase* matmul_algo) override; const fallback::MatrixMulImpl::AlgoBase* matmul_algo) override;


void exec_matmul(
const fallback::ConvBiasImpl::NCBKernParam& param,
const StrategyParam& sparam, WorkspaceBundle bundle,
WorkspaceBundle bundle_thread,
fallback::MatrixMulImpl::KernParam matmul_param,
const fallback::MatrixMulImpl::AlgoBase* matmul_algo,
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index,
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& matmul_desc
) override;
void exec_matmul(const fallback::ConvBiasImpl::NCBKernParam& param,
const StrategyParam& sparam, const WorkspaceBundle& bundle,
const WorkspaceBundle& bundle_thread,
fallback::MatrixMulImpl::KernParam matmul_param,
const fallback::MatrixMulImpl::AlgoBase* matmul_algo,
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index,
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription&
matmul_desc) override;
void exec_postprocess(const fallback::ConvBiasImpl::NCBKernParam& param, void exec_postprocess(const fallback::ConvBiasImpl::NCBKernParam& param,
const StrategyParam& sparam, const StrategyParam& sparam,
WorkspaceBundle bundle_thread) override {
const WorkspaceBundle& bundle_thread) override {
do_postprocess<bias_ctype, dst_ctype, op_ctype, op_dtype, do_postprocess<bias_ctype, dst_ctype, op_ctype, op_dtype,
postprocess_mode>(param, sparam, bundle_thread, postprocess_mode>(param, sparam, bundle_thread,
THREAD_BUNDLE_IM2COL_INDEX, THREAD_BUNDLE_IM2COL_INDEX,
@@ -341,11 +342,12 @@ public:


Strategy() = default; Strategy() = default;


void exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread,
const StrategyParam& sparam,
const fallback::ConvBiasImpl::NCBKernParam& param,
fallback::MatrixMulImpl::KernParam matmul_param,
const fallback::MatrixMulImpl::AlgoBase* matmul_algo) override;
void exec_im2col(
const WorkspaceBundle& bundle, const WorkspaceBundle& bundle_thread,
const StrategyParam& sparam,
const fallback::ConvBiasImpl::NCBKernParam& param,
fallback::MatrixMulImpl::KernParam matmul_param,
const fallback::MatrixMulImpl::AlgoBase* matmul_algo) override;
}; };


template <typename src_ctype, typename bias_ctype, typename dst_ctype, template <typename src_ctype, typename bias_ctype, typename dst_ctype,
@@ -367,7 +369,7 @@ public:
Strategy() = default; Strategy() = default;


void packA_kern( void packA_kern(
WorkspaceBundle bundle,
const WorkspaceBundle& bundle,
const fallback::ConvBiasImpl::NCBKernParam& param, const fallback::ConvBiasImpl::NCBKernParam& param,
fallback::MatrixMulImpl::KernSizeParam matmulparam, fallback::MatrixMulImpl::KernSizeParam matmulparam,
const fallback::MatrixMulImpl::AlgoBase* matmul_algo, const fallback::MatrixMulImpl::AlgoBase* matmul_algo,
@@ -375,28 +377,28 @@ public:
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& MDsec, const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& MDsec,
size_t pack_size) override; size_t pack_size) override;


void exec_matmul(
const fallback::ConvBiasImpl::NCBKernParam& param,
const StrategyParam& sparam, WorkspaceBundle bundle,
WorkspaceBundle bundle_thread,
fallback::MatrixMulImpl::KernParam matmul_param,
const fallback::MatrixMulImpl::AlgoBase* matmul_algo,
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index,
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& matmul_desc
) override;
void exec_matmul(const fallback::ConvBiasImpl::NCBKernParam& param,
const StrategyParam& sparam, const WorkspaceBundle& bundle,
const WorkspaceBundle& bundle_thread,
fallback::MatrixMulImpl::KernParam matmul_param,
const fallback::MatrixMulImpl::AlgoBase* matmul_algo,
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index,
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription&
matmul_desc) override;


void* get_matmul_dst_ptr(const fallback::ConvBiasImpl::NCBKernParam& param, void* get_matmul_dst_ptr(const fallback::ConvBiasImpl::NCBKernParam& param,
const WorkspaceBundle& bundle_thread, const WorkspaceBundle& bundle_thread,
const StrategyParam& sparam); const StrategyParam& sparam);


void exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread,
const StrategyParam& sparam,
const fallback::ConvBiasImpl::NCBKernParam& param,
fallback::MatrixMulImpl::KernParam matmul_param,
const fallback::MatrixMulImpl::AlgoBase* matmul_algo) override;
void exec_im2col(
const WorkspaceBundle& bundle, const WorkspaceBundle& bundle_thread,
const StrategyParam& sparam,
const fallback::ConvBiasImpl::NCBKernParam& param,
fallback::MatrixMulImpl::KernParam matmul_param,
const fallback::MatrixMulImpl::AlgoBase* matmul_algo) override;
void exec_postprocess(const fallback::ConvBiasImpl::NCBKernParam& param, void exec_postprocess(const fallback::ConvBiasImpl::NCBKernParam& param,
const StrategyParam& sparam, const StrategyParam& sparam,
WorkspaceBundle bundle_thread) override {
const WorkspaceBundle& bundle_thread) override {
do_postprocess<bias_ctype, dst_ctype, op_ctype, op_dtype, do_postprocess<bias_ctype, dst_ctype, op_ctype, op_dtype,
postprocess_mode>(param, sparam, bundle_thread, postprocess_mode>(param, sparam, bundle_thread,
THREAD_BUNDLE_MATMULDST_INDEX, THREAD_BUNDLE_MATMULDST_INDEX,
@@ -423,7 +425,7 @@ public:
Strategy() = default; Strategy() = default;


void packA_kern( void packA_kern(
WorkspaceBundle bundle,
const WorkspaceBundle& bundle,
const fallback::ConvBiasImpl::NCBKernParam& param, const fallback::ConvBiasImpl::NCBKernParam& param,
fallback::MatrixMulImpl::KernSizeParam matmulparam, fallback::MatrixMulImpl::KernSizeParam matmulparam,
const fallback::MatrixMulImpl::AlgoBase* matmul_algo, const fallback::MatrixMulImpl::AlgoBase* matmul_algo,
@@ -431,21 +433,21 @@ public:
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& MDsec, const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& MDsec,
size_t pack_size) override; size_t pack_size) override;


void exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread,
const StrategyParam& sparam,
const fallback::ConvBiasImpl::NCBKernParam& param,
fallback::MatrixMulImpl::KernParam matmul_param,
const fallback::MatrixMulImpl::AlgoBase* matmul_algo) override;

void exec_matmul(
void exec_im2col(
const WorkspaceBundle& bundle, const WorkspaceBundle& bundle_thread,
const StrategyParam& sparam,
const fallback::ConvBiasImpl::NCBKernParam& param, const fallback::ConvBiasImpl::NCBKernParam& param,
const StrategyParam& sparam, WorkspaceBundle bundle,
WorkspaceBundle bundle_thread,
fallback::MatrixMulImpl::KernParam matmul_param, fallback::MatrixMulImpl::KernParam matmul_param,
const fallback::MatrixMulImpl::AlgoBase* matmul_algo,
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index,
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& matmul_desc
) override;
const fallback::MatrixMulImpl::AlgoBase* matmul_algo) override;

void exec_matmul(const fallback::ConvBiasImpl::NCBKernParam& param,
const StrategyParam& sparam, const WorkspaceBundle& bundle,
const WorkspaceBundle& bundle_thread,
fallback::MatrixMulImpl::KernParam matmul_param,
const fallback::MatrixMulImpl::AlgoBase* matmul_algo,
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index,
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription&
matmul_desc) override;


void* get_matmul_dst_ptr(const fallback::ConvBiasImpl::NCBKernParam& param, void* get_matmul_dst_ptr(const fallback::ConvBiasImpl::NCBKernParam& param,
const WorkspaceBundle& bundle_thread, const WorkspaceBundle& bundle_thread,
@@ -453,7 +455,7 @@ public:


void exec_postprocess(const fallback::ConvBiasImpl::NCBKernParam& param, void exec_postprocess(const fallback::ConvBiasImpl::NCBKernParam& param,
const StrategyParam& sparam, const StrategyParam& sparam,
WorkspaceBundle bundle_thread) override {
const WorkspaceBundle& bundle_thread) override {
do_postprocess<bias_ctype, dst_ctype, op_ctype, op_dtype, do_postprocess<bias_ctype, dst_ctype, op_ctype, op_dtype,
postprocess_mode>(param, sparam, bundle_thread, postprocess_mode>(param, sparam, bundle_thread,
THREAD_BUNDLE_MATMULDST_INDEX, THREAD_BUNDLE_MATMULDST_INDEX,
@@ -476,11 +478,12 @@ public:
constexpr static size_t THREAD_BUNDLE_IM2COL_INDEX = 1; constexpr static size_t THREAD_BUNDLE_IM2COL_INDEX = 1;
constexpr static size_t THREAD_BUNDLE_BIAS_INDEX = 2; constexpr static size_t THREAD_BUNDLE_BIAS_INDEX = 2;


void exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread,
const StrategyParam& sparam,
const fallback::ConvBiasImpl::NCBKernParam& param,
fallback::MatrixMulImpl::KernParam matmul_param,
const fallback::MatrixMulImpl::AlgoBase* matmul_algo) override;
void exec_im2col(
const WorkspaceBundle& bundle, const WorkspaceBundle& bundle_thread,
const StrategyParam& sparam,
const fallback::ConvBiasImpl::NCBKernParam& param,
fallback::MatrixMulImpl::KernParam matmul_param,
const fallback::MatrixMulImpl::AlgoBase* matmul_algo) override;
}; };


template <typename op_ctype, typename op_dtype, template <typename op_ctype, typename op_dtype,
@@ -498,11 +501,12 @@ public:
constexpr static size_t THREAD_BUNDLE_IM2COL_INDEX = 1; constexpr static size_t THREAD_BUNDLE_IM2COL_INDEX = 1;
constexpr static size_t THREAD_BUNDLE_BIAS_INDEX = 2; constexpr static size_t THREAD_BUNDLE_BIAS_INDEX = 2;


void exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread,
const StrategyParam& sparam,
const fallback::ConvBiasImpl::NCBKernParam& param,
fallback::MatrixMulImpl::KernParam matmul_param,
const fallback::MatrixMulImpl::AlgoBase* matmul_algo) override;
void exec_im2col(
const WorkspaceBundle& bundle, const WorkspaceBundle& bundle_thread,
const StrategyParam& sparam,
const fallback::ConvBiasImpl::NCBKernParam& param,
fallback::MatrixMulImpl::KernParam matmul_param,
const fallback::MatrixMulImpl::AlgoBase* matmul_algo) override;
}; };




@@ -521,11 +525,12 @@ public:
constexpr static size_t THREAD_BUNDLE_IM2COL_INDEX = 1; constexpr static size_t THREAD_BUNDLE_IM2COL_INDEX = 1;
constexpr static size_t THREAD_BUNDLE_BIAS_INDEX = 2; constexpr static size_t THREAD_BUNDLE_BIAS_INDEX = 2;


void exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread,
const StrategyParam& sparam,
const fallback::ConvBiasImpl::NCBKernParam& param,
fallback::MatrixMulImpl::KernParam matmul_param,
const fallback::MatrixMulImpl::AlgoBase* matmul_algo) override;
void exec_im2col(
const WorkspaceBundle& bundle, const WorkspaceBundle& bundle_thread,
const StrategyParam& sparam,
const fallback::ConvBiasImpl::NCBKernParam& param,
fallback::MatrixMulImpl::KernParam matmul_param,
const fallback::MatrixMulImpl::AlgoBase* matmul_algo) override;
}; };
#endif #endif




+ 5
- 5
dnn/src/fallback/conv_bias/im2col/strategy_default.cpp View File

@@ -18,7 +18,7 @@ template <typename src_ctype, typename bias_ctype, typename dst_ctype,
megdnn::PostprocessMode postprocess_mode> megdnn::PostprocessMode postprocess_mode>
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
postprocess_mode, PackMode::DEFAULT>:: postprocess_mode, PackMode::DEFAULT>::
packA_kern(WorkspaceBundle bundle,
packA_kern(const WorkspaceBundle& bundle,
const fallback::ConvBiasImpl::NCBKernParam& param, const fallback::ConvBiasImpl::NCBKernParam& param,
fallback::MatrixMulImpl::KernSizeParam matmulparam, fallback::MatrixMulImpl::KernSizeParam matmulparam,
const fallback::MatrixMulImpl::AlgoBase* matmul_algo, const fallback::MatrixMulImpl::AlgoBase* matmul_algo,
@@ -26,7 +26,6 @@ void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& const fallback::MatrixMulImpl::AlgoBase::MatmulDescription&
matmul_desc, matmul_desc,
size_t) { size_t) {
bundle.set(param.workspace_ptr);
fallback::MatrixMulImpl::KernParam matmul_param; fallback::MatrixMulImpl::KernParam matmul_param;
size_t group_id = ncb_index.ndrange_id[0]; size_t group_id = ncb_index.ndrange_id[0];
static_cast<fallback::MatrixMulImpl::KernSizeParam&>(matmul_param) = static_cast<fallback::MatrixMulImpl::KernSizeParam&>(matmul_param) =
@@ -50,7 +49,8 @@ template <typename src_ctype, typename bias_ctype, typename dst_ctype,
megdnn::PostprocessMode postprocess_mode> megdnn::PostprocessMode postprocess_mode>
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
postprocess_mode, PackMode::DEFAULT>:: postprocess_mode, PackMode::DEFAULT>::
exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread,
exec_im2col(const WorkspaceBundle& bundle,
const WorkspaceBundle& bundle_thread,
const StrategyParam& sparam, const StrategyParam& sparam,
const fallback::ConvBiasImpl::NCBKernParam& param, const fallback::ConvBiasImpl::NCBKernParam& param,
fallback::MatrixMulImpl::KernParam matmul_param, fallback::MatrixMulImpl::KernParam matmul_param,
@@ -139,8 +139,8 @@ template <typename src_ctype, typename bias_ctype, typename dst_ctype,
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
postprocess_mode, PackMode::DEFAULT>:: postprocess_mode, PackMode::DEFAULT>::
exec_matmul(const fallback::ConvBiasImpl::NCBKernParam& param, exec_matmul(const fallback::ConvBiasImpl::NCBKernParam& param,
const StrategyParam& sparam, WorkspaceBundle bundle,
WorkspaceBundle bundle_thread,
const StrategyParam& sparam, const WorkspaceBundle& bundle,
const WorkspaceBundle& bundle_thread,
fallback::MatrixMulImpl::KernParam matmul_param, fallback::MatrixMulImpl::KernParam matmul_param,
const fallback::MatrixMulImpl::AlgoBase* matmul_algo, const fallback::MatrixMulImpl::AlgoBase* matmul_algo,
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, const fallback::ConvBiasImpl::NCBKernIndex& ncb_index,


+ 2
- 1
dnn/src/fallback/conv_bias/im2col/strategy_default_nchw44.cpp View File

@@ -29,7 +29,8 @@ template <typename src_ctype, typename bias_ctype, typename dst_ctype,
megdnn::PostprocessMode postprocess_mode> megdnn::PostprocessMode postprocess_mode>
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
postprocess_mode, PackMode::DEFAULT, FormatMode::NCHW44>:: postprocess_mode, PackMode::DEFAULT, FormatMode::NCHW44>::
exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread,
exec_im2col(const WorkspaceBundle& bundle,
const WorkspaceBundle& bundle_thread,
const StrategyParam& sparam, const StrategyParam& sparam,
const fallback::ConvBiasImpl::NCBKernParam& param, const fallback::ConvBiasImpl::NCBKernParam& param,
fallback::MatrixMulImpl::KernParam matmul_param, fallback::MatrixMulImpl::KernParam matmul_param,


+ 2
- 1
dnn/src/fallback/conv_bias/im2col/strategy_fuse_nchw44.cpp View File

@@ -169,7 +169,8 @@ void naive_fuse_im2col_packB(dt_int8* src, size_t ic, size_t iw, size_t ih,
template <typename op_ctype, typename op_dtype, template <typename op_ctype, typename op_dtype,
megdnn::PostprocessMode postprocess_mode> megdnn::PostprocessMode postprocess_mode>
void StrategyFuse4x4x16Nchw44<op_ctype, op_dtype, postprocess_mode>:: void StrategyFuse4x4x16Nchw44<op_ctype, op_dtype, postprocess_mode>::
exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread,
exec_im2col(const WorkspaceBundle& bundle,
const WorkspaceBundle& bundle_thread,
const StrategyParam& sparam, const StrategyParam& sparam,
const fallback::ConvBiasImpl::NCBKernParam& param, const fallback::ConvBiasImpl::NCBKernParam& param,
fallback::MatrixMulImpl::KernParam, fallback::MatrixMulImpl::KernParam,


+ 2
- 2
dnn/src/fallback/conv_bias/im2col/strategy_fuse_nchw44_dot.cpp View File

@@ -172,7 +172,8 @@ void fuse_packb(const dt_int8* __restrict src, dt_int8* __restrict dst,
template <typename op_ctype, typename op_dtype, template <typename op_ctype, typename op_dtype,
megdnn::PostprocessMode postprocess_mode> megdnn::PostprocessMode postprocess_mode>
void StrategyFuse8x12x4Nchw44Dot<op_ctype, op_dtype, postprocess_mode>:: void StrategyFuse8x12x4Nchw44Dot<op_ctype, op_dtype, postprocess_mode>::
exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread,
exec_im2col(const WorkspaceBundle& bundle,
const WorkspaceBundle& bundle_thread,
const StrategyParam& sparam, const StrategyParam& sparam,
const fallback::ConvBiasImpl::NCBKernParam& param, const fallback::ConvBiasImpl::NCBKernParam& param,
fallback::MatrixMulImpl::KernParam /*matmul_param*/, fallback::MatrixMulImpl::KernParam /*matmul_param*/,
@@ -207,7 +208,6 @@ void StrategyFuse8x12x4Nchw44Dot<op_ctype, op_dtype, postprocess_mode>::
sparam.output_block_size); sparam.output_block_size);
} }



namespace megdnn { namespace megdnn {


template class StrategyFuse8x12x4Nchw44Dot<dt_qint32, dt_qint8, template class StrategyFuse8x12x4Nchw44Dot<dt_qint32, dt_qint8,


+ 2
- 1
dnn/src/fallback/conv_bias/im2col/strategy_fuse_nchw44_fp32_s2.cpp View File

@@ -164,7 +164,8 @@ void fuse_packb(const float* __restrict src, float* __restrict dst,
template <typename op_ctype, typename op_dtype, template <typename op_ctype, typename op_dtype,
megdnn::PostprocessMode postprocess_mode> megdnn::PostprocessMode postprocess_mode>
void StrategyFuse8x12x1Nchw44K3x3S2<op_ctype, op_dtype, postprocess_mode>:: void StrategyFuse8x12x1Nchw44K3x3S2<op_ctype, op_dtype, postprocess_mode>::
exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread,
exec_im2col(const WorkspaceBundle& bundle,
const WorkspaceBundle& bundle_thread,
const StrategyParam& sparam, const StrategyParam& sparam,
const fallback::ConvBiasImpl::NCBKernParam& param, const fallback::ConvBiasImpl::NCBKernParam& param,
fallback::MatrixMulImpl::KernParam /*matmul_param*/, fallback::MatrixMulImpl::KernParam /*matmul_param*/,


+ 5
- 4
dnn/src/fallback/conv_bias/im2col/strategy_nopack.cpp View File

@@ -19,7 +19,7 @@ template <typename src_ctype, typename bias_ctype, typename dst_ctype,
megdnn::PostprocessMode postprocess_mode> megdnn::PostprocessMode postprocess_mode>
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
postprocess_mode, PackMode::NO_PACK>:: postprocess_mode, PackMode::NO_PACK>::
packA_kern(WorkspaceBundle bundle,
packA_kern(const WorkspaceBundle& bundle,
const fallback::ConvBiasImpl::NCBKernParam& param, const fallback::ConvBiasImpl::NCBKernParam& param,
fallback::MatrixMulImpl::KernSizeParam matmulparam, fallback::MatrixMulImpl::KernSizeParam matmulparam,
const fallback::MatrixMulImpl::AlgoBase* matmul_algo, const fallback::MatrixMulImpl::AlgoBase* matmul_algo,
@@ -61,8 +61,8 @@ template <typename src_ctype, typename bias_ctype, typename dst_ctype,
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
postprocess_mode, PackMode::NO_PACK>:: postprocess_mode, PackMode::NO_PACK>::
exec_matmul(const fallback::ConvBiasImpl::NCBKernParam& param, exec_matmul(const fallback::ConvBiasImpl::NCBKernParam& param,
const StrategyParam& sparam, WorkspaceBundle bundle,
WorkspaceBundle bundle_thread,
const StrategyParam& sparam, const WorkspaceBundle& bundle,
const WorkspaceBundle& bundle_thread,
fallback::MatrixMulImpl::KernParam matmul_param, fallback::MatrixMulImpl::KernParam matmul_param,
const fallback::MatrixMulImpl::AlgoBase* matmul_algo, const fallback::MatrixMulImpl::AlgoBase* matmul_algo,
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, const fallback::ConvBiasImpl::NCBKernIndex& ncb_index,
@@ -96,7 +96,8 @@ template <typename src_ctype, typename bias_ctype, typename dst_ctype,
megdnn::PostprocessMode postprocess_mode> megdnn::PostprocessMode postprocess_mode>
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
postprocess_mode, PackMode::NO_PACK>:: postprocess_mode, PackMode::NO_PACK>::
exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread,
exec_im2col(const WorkspaceBundle& bundle,
const WorkspaceBundle& bundle_thread,
const StrategyParam& sparam, const StrategyParam& sparam,
const fallback::ConvBiasImpl::NCBKernParam& param, const fallback::ConvBiasImpl::NCBKernParam& param,
fallback::MatrixMulImpl::KernParam matmul_param, fallback::MatrixMulImpl::KernParam matmul_param,


+ 5
- 5
dnn/src/fallback/conv_bias/im2col/strategy_onlypacka.cpp View File

@@ -19,7 +19,7 @@ template <typename src_ctype, typename bias_ctype, typename dst_ctype,
megdnn::PostprocessMode postprocess_mode> megdnn::PostprocessMode postprocess_mode>
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
postprocess_mode, PackMode::ONLY_PACKA>:: postprocess_mode, PackMode::ONLY_PACKA>::
packA_kern(WorkspaceBundle bundle,
packA_kern(const WorkspaceBundle& bundle,
const fallback::ConvBiasImpl::NCBKernParam& param, const fallback::ConvBiasImpl::NCBKernParam& param,
fallback::MatrixMulImpl::KernSizeParam matmulparam, fallback::MatrixMulImpl::KernSizeParam matmulparam,
const fallback::MatrixMulImpl::AlgoBase* matmul_algo, const fallback::MatrixMulImpl::AlgoBase* matmul_algo,
@@ -27,7 +27,6 @@ void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
const fallback::MatrixMulImpl::AlgoBase:: const fallback::MatrixMulImpl::AlgoBase::
MatmulDescription& /*matmul_desc*/, MatmulDescription& /*matmul_desc*/,
size_t) { size_t) {
bundle.set(param.workspace_ptr);
fallback::MatrixMulImpl::KernParam matmul_param; fallback::MatrixMulImpl::KernParam matmul_param;
static_cast<fallback::MatrixMulImpl::KernSizeParam&>(matmul_param) = static_cast<fallback::MatrixMulImpl::KernSizeParam&>(matmul_param) =
matmulparam; matmulparam;
@@ -56,8 +55,8 @@ template <typename src_ctype, typename bias_ctype, typename dst_ctype,
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
postprocess_mode, PackMode::ONLY_PACKA>:: postprocess_mode, PackMode::ONLY_PACKA>::
exec_matmul(const fallback::ConvBiasImpl::NCBKernParam& param, exec_matmul(const fallback::ConvBiasImpl::NCBKernParam& param,
const StrategyParam& sparam, WorkspaceBundle bundle,
WorkspaceBundle bundle_thread,
const StrategyParam& sparam, const WorkspaceBundle& bundle,
const WorkspaceBundle& bundle_thread,
fallback::MatrixMulImpl::KernParam matmul_param, fallback::MatrixMulImpl::KernParam matmul_param,
const fallback::MatrixMulImpl::AlgoBase* matmul_algo, const fallback::MatrixMulImpl::AlgoBase* matmul_algo,
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, const fallback::ConvBiasImpl::NCBKernIndex& ncb_index,
@@ -96,7 +95,8 @@ template <typename src_ctype, typename bias_ctype, typename dst_ctype,
megdnn::PostprocessMode postprocess_mode> megdnn::PostprocessMode postprocess_mode>
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
postprocess_mode, PackMode::ONLY_PACKA>:: postprocess_mode, PackMode::ONLY_PACKA>::
exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread,
exec_im2col(const WorkspaceBundle& bundle,
const WorkspaceBundle& bundle_thread,
const StrategyParam& sparam, const StrategyParam& sparam,
const fallback::ConvBiasImpl::NCBKernParam& param, const fallback::ConvBiasImpl::NCBKernParam& param,
fallback::MatrixMulImpl::KernParam matmul_param, fallback::MatrixMulImpl::KernParam matmul_param,


+ 32
- 29
dnn/src/fallback/conv_bias/winograd/winograd.h View File

@@ -194,12 +194,12 @@ public:
IC, 0, OC); IC, 0, OC);
} }


static void filter_process(Strategy strategy, WorkspaceBundle bundle_top,
WorkspaceBundle bundle_compute,
static void filter_process(Strategy strategy,
const WorkspaceBundle& bundle_top,
const WorkspaceBundle& bundle_compute,
const NCBKernParam& kern_param, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) { const NCBKernIndex& ncb_index) {
bundle_top.set(kern_param.workspace_ptr);
bundle_compute.set(bundle_top.get(0));
size_t compute_workspace_size_per_thread = size_t compute_workspace_size_per_thread =
bundle_compute.total_size_in_bytes(); bundle_compute.total_size_in_bytes();
size_t thread_id = ncb_index.thread_id; size_t thread_id = ncb_index.thread_id;
@@ -236,8 +236,8 @@ public:
} }


static void winograd_compute( static void winograd_compute(
Strategy strategy, WorkspaceBundle bundle_top,
WorkspaceBundle bundle_compute,
Strategy strategy, const WorkspaceBundle& bundle_top,
const WorkspaceBundle& bundle_compute,
fallback::MatrixMulImpl::AlgoBase* matmul_algo, fallback::MatrixMulImpl::AlgoBase* matmul_algo,
fallback::MatrixMulImpl::KernParam matmul_param, fallback::MatrixMulImpl::KernParam matmul_param,
size_t unit_tile_size, size_t unit_oc_size, size_t unit_tile_size, size_t unit_oc_size,
@@ -265,9 +265,6 @@ public:
size_t group_id = ncb_index.ndrange_id[0]; size_t group_id = ncb_index.ndrange_id[0];
size_t thread_id = ncb_index.thread_id; size_t thread_id = ncb_index.thread_id;


bundle_top.set(ncb_param.workspace_ptr);
bundle_compute.set(bundle_top.get(0));

const stype* src_ptr = ncb_param.src<stype>(batch_id, group_id); const stype* src_ptr = ncb_param.src<stype>(batch_id, group_id);
dst_type* dst_ptr = ncb_param.dst<dst_type>(batch_id, group_id); dst_type* dst_ptr = ncb_param.dst<dst_type>(batch_id, group_id);
const output_compute_type* bias_ptr = const output_compute_type* bias_ptr =
@@ -419,14 +416,16 @@ public:
param.filter_meta.format == param::ConvBias::Format::NCHW44) { param.filter_meta.format == param::ConvBias::Format::NCHW44) {
//! probably a gcc bug, labmda require capturing 'this' to call //! probably a gcc bug, labmda require capturing 'this' to call
//! static member function //! static member function
auto filter_process_kern = [this, strategy, bundle_top,
bundle_compute](
const NCBKernParam& ncb_param,
const NCBKernIndex& ncb_index) {
MEGDNN_MARK_USED_VAR(this);
filter_process(strategy, bundle_top, bundle_compute, ncb_param,
std::move(ncb_index));
};
auto filter_process_kern =
[this, strategy, bundle_top, bundle_compute](
const NCBKernParam& ncb_param,
const NCBKernIndex& ncb_index) mutable {
MEGDNN_MARK_USED_VAR(this);
bundle_top.set(ncb_param.workspace_ptr);
bundle_compute.set(bundle_top.get(0));
filter_process(strategy, bundle_top, bundle_compute,
ncb_param, std::move(ncb_index));
};
size_t oc_parallelism = OC; size_t oc_parallelism = OC;
if (param.filter_meta.format == param::ConvBias::Format::NCHW88) { if (param.filter_meta.format == param::ConvBias::Format::NCHW88) {
megdnn_assert(OC % 8 == 0); megdnn_assert(OC % 8 == 0);
@@ -438,18 +437,22 @@ public:
} }
kerns.push_back({filter_process_kern, {GROUP, 1, oc_parallelism}}); kerns.push_back({filter_process_kern, {GROUP, 1, oc_parallelism}});
} }
auto winograd_compute_kern = [strategy, bundle_top, bundle_compute,
matmul_algo, matmul_param, unit_tile_size,
unit_oc_size](
const NCBKernParam& ncb_param,
const NCBKernIndex& ncb_index) {
MIDOUT_BEGIN(megdnn_fallback_conv_bias_winograd_common, 0, 0) {
winograd_compute(strategy, bundle_top, bundle_compute,
matmul_algo, matmul_param, unit_tile_size,
unit_oc_size, ncb_param, std::move(ncb_index));
}
MIDOUT_END();
};
auto winograd_compute_kern =
[strategy, bundle_top, bundle_compute, matmul_algo,
matmul_param, unit_tile_size,
unit_oc_size](const NCBKernParam& ncb_param,
const NCBKernIndex& ncb_index) mutable {
MIDOUT_BEGIN(megdnn_fallback_conv_bias_winograd_common, 0,
0) {
bundle_top.set(ncb_param.workspace_ptr);
bundle_compute.set(bundle_top.get(0));
winograd_compute(strategy, bundle_top, bundle_compute,
matmul_algo, matmul_param,
unit_tile_size, unit_oc_size,
ncb_param, std::move(ncb_index));
}
MIDOUT_END();
};
kerns.push_back( kerns.push_back(
{winograd_compute_kern, {GROUP, N, nr_hw_tiles, nr_oc_tiles}}); {winograd_compute_kern, {GROUP, N, nr_hw_tiles, nr_oc_tiles}});
return kerns; return kerns;


+ 1
- 4
dnn/src/naive/handle.h View File

@@ -186,10 +186,7 @@ public:
*/ */
#define MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN(_handle, _parallelism, _stmt) \ #define MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN(_handle, _parallelism, _stmt) \
do { \ do { \
auto _kern = [=](size_t index, size_t thread_id) { \
_stmt(index, thread_id); \
}; \
_handle->dispatch_kern(_kern, _parallelism); \
_handle->dispatch_kern(_stmt, _parallelism); \
} while (0) } while (0)


//! disptch kern on current opr //! disptch kern on current opr


+ 47
- 47
dnn/src/x86/conv_bias/f32/algos.cpp View File

@@ -58,45 +58,47 @@ void get_rectified_size(size_t IH, size_t IW, size_t OH, size_t OW, size_t FH,
} }
} // namespace } // namespace


#define GET_KERN \
auto fm = param.filter_meta; \
size_t N = param.n; \
size_t IC = param.filter_meta.icpg; \
size_t OC = param.filter_meta.ocpg; \
size_t group = fm.group; \
WorkspaceBundle wbundle = get_bundle(param); \
SmallVector<NCBKern> ret_kerns; \
if (m_large_group) { \
auto exec_one_group = [wbundle](const NCBKernParam& kern_param, \
const NCBKernIndex& ncb_index) { \
auto fm = kern_param.filter_meta; \
size_t IC = fm.icpg; \
size_t OC = fm.ocpg; \
WorkspaceBundle bundle = wbundle; \
for (size_t ic = 0; ic < IC; ic++) { \
copy_padding_kern(bundle, kern_param, ncb_index, \
{ncb_index.thread_id, 0, ic}); \
} \
for (size_t oc = 0; oc < OC; oc++) { \
do_conv_kern(bundle, kern_param, ncb_index, \
{ncb_index.thread_id, 0, oc}); \
} \
}; \
ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); \
} else { \
auto copy_padding = [wbundle](const NCBKernParam& kern_param, \
const NCBKernIndex& ncb_index) { \
copy_padding_kern(wbundle, kern_param, ncb_index, \
ncb_index.ndrange_id); \
}; \
ret_kerns.push_back({copy_padding, {group, N, IC}}); \
auto do_conv = [wbundle](const NCBKernParam& kern_param, \
const NCBKernIndex& ncb_index) { \
do_conv_kern(wbundle, kern_param, ncb_index, \
ncb_index.ndrange_id); \
}; \
ret_kerns.push_back({do_conv, {group, N, OC}}); \
} \
#define GET_KERN \
auto fm = param.filter_meta; \
size_t N = param.n; \
size_t IC = param.filter_meta.icpg; \
size_t OC = param.filter_meta.ocpg; \
size_t group = fm.group; \
WorkspaceBundle bundle = get_bundle(param); \
SmallVector<NCBKern> ret_kerns; \
if (m_large_group) { \
auto exec_one_group = [bundle]( \
const NCBKernParam& kern_param, \
const NCBKernIndex& ncb_index) mutable { \
bundle.set(kern_param.workspace_ptr); \
auto fm = kern_param.filter_meta; \
size_t IC = fm.icpg; \
size_t OC = fm.ocpg; \
for (size_t ic = 0; ic < IC; ic++) { \
copy_padding_kern(bundle, kern_param, ncb_index, \
{ncb_index.thread_id, 0, ic}); \
} \
for (size_t oc = 0; oc < OC; oc++) { \
do_conv_kern(bundle, kern_param, ncb_index, \
{ncb_index.thread_id, 0, oc}); \
} \
}; \
ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); \
} else { \
auto copy_padding = [bundle](const NCBKernParam& kern_param, \
const NCBKernIndex& ncb_index) mutable { \
bundle.set(kern_param.workspace_ptr); \
copy_padding_kern(bundle, kern_param, ncb_index, \
ncb_index.ndrange_id); \
}; \
ret_kerns.push_back({copy_padding, {group, N, IC}}); \
auto do_conv = [bundle](const NCBKernParam& kern_param, \
const NCBKernIndex& ncb_index) mutable { \
bundle.set(kern_param.workspace_ptr); \
do_conv_kern(bundle, kern_param, ncb_index, ncb_index.ndrange_id); \
}; \
ret_kerns.push_back({do_conv, {group, N, OC}}); \
} \
return ret_kerns; return ret_kerns;


/* ===================== direct algo ===================== */ /* ===================== direct algo ===================== */
@@ -146,7 +148,8 @@ size_t ConvBiasImpl::AlgoDirect::get_workspace(


//! Process one input channel copy padding //! Process one input channel copy padding
void ConvBiasImpl::AlgoDirect::copy_padding_kern( void ConvBiasImpl::AlgoDirect::copy_padding_kern(
WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param,
const WorkspaceBundle& bundle,
const ConvBiasImpl::NCBKernParam& kern_param,
const ConvBiasImpl::NCBKernIndex& ncb_index, const ConvBiasImpl::NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids) { const CpuNDRange& workspace_ids) {
size_t IH = kern_param.isz[0]; size_t IH = kern_param.isz[0];
@@ -169,7 +172,6 @@ void ConvBiasImpl::AlgoDirect::copy_padding_kern(
const float* sptr = static_cast<const float*>( const float* sptr = static_cast<const float*>(
kern_param.src<float>(batch_id, group_id)) + kern_param.src<float>(batch_id, group_id)) +
channel_id * IH * IW; channel_id * IH * IW;
bundle.set(kern_param.workspace_ptr);


//! Used for get the workspace offset //! Used for get the workspace offset
size_t workspace_group_id = workspace_ids[0], size_t workspace_group_id = workspace_ids[0],
@@ -239,7 +241,7 @@ void ConvBiasImpl::AlgoDirect::copy_padding_kern(
func = detail::convolution_##mode##_fh##fsize##_##simd; func = detail::convolution_##mode##_fh##fsize##_##simd;


//! compute one output channel //! compute one output channel
void ConvBiasImpl::AlgoDirect::do_conv_kern(WorkspaceBundle bundle,
void ConvBiasImpl::AlgoDirect::do_conv_kern(const WorkspaceBundle& bundle,
const NCBKernParam& kern_param, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index, const NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids) { const CpuNDRange& workspace_ids) {
@@ -265,7 +267,6 @@ void ConvBiasImpl::AlgoDirect::do_conv_kern(WorkspaceBundle bundle,
func = nullptr; func = nullptr;
DISPATCH; DISPATCH;


bundle.set(kern_param.workspace_ptr);
size_t bias_offset = 0; size_t bias_offset = 0;
if (kern_param.bias_mode == megdnn::BiasMode::BIAS) { if (kern_param.bias_mode == megdnn::BiasMode::BIAS) {
bias_offset = OH * OW; bias_offset = OH * OW;
@@ -367,7 +368,8 @@ size_t ConvBiasImpl::AlgoDirectStride2::get_workspace(
} }
//! Process one input channel copy padding //! Process one input channel copy padding
void ConvBiasImpl::AlgoDirectStride2::copy_padding_kern( void ConvBiasImpl::AlgoDirectStride2::copy_padding_kern(
WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param,
const WorkspaceBundle& bundle,
const ConvBiasImpl::NCBKernParam& kern_param,
const ConvBiasImpl::NCBKernIndex& ncb_index, const ConvBiasImpl::NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids) { const CpuNDRange& workspace_ids) {
size_t IH = kern_param.isz[0]; size_t IH = kern_param.isz[0];
@@ -390,7 +392,6 @@ void ConvBiasImpl::AlgoDirectStride2::copy_padding_kern(
const float* sptr = static_cast<const float*>( const float* sptr = static_cast<const float*>(
kern_param.src<float>(batch_id, group_id)) + kern_param.src<float>(batch_id, group_id)) +
channel_id * IH * IW; channel_id * IH * IW;
bundle.set(kern_param.workspace_ptr);
//! Used for get the workspace offset //! Used for get the workspace offset
size_t workspace_group_id = workspace_ids[0], size_t workspace_group_id = workspace_ids[0],
workspace_batch_id = workspace_ids[1], workspace_batch_id = workspace_ids[1],
@@ -411,7 +412,7 @@ void ConvBiasImpl::AlgoDirectStride2::copy_padding_kern(


//! compute one output channel //! compute one output channel
void ConvBiasImpl::AlgoDirectStride2::do_conv_kern( void ConvBiasImpl::AlgoDirectStride2::do_conv_kern(
WorkspaceBundle bundle, const NCBKernParam& kern_param,
const WorkspaceBundle& bundle, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) {
size_t OH = kern_param.osz[0]; size_t OH = kern_param.osz[0];
size_t OW = kern_param.osz[1]; size_t OW = kern_param.osz[1];
@@ -446,7 +447,6 @@ void ConvBiasImpl::AlgoDirectStride2::do_conv_kern(
func_add_dst = conv_general_simd::do_conv_7x7_stride2<true>; func_add_dst = conv_general_simd::do_conv_7x7_stride2<true>;
} }


bundle.set(kern_param.workspace_ptr);
size_t bias_offset = 0; size_t bias_offset = 0;
if (kern_param.bias_mode == megdnn::BiasMode::BIAS) { if (kern_param.bias_mode == megdnn::BiasMode::BIAS) {
bias_offset = OH * OW; bias_offset = OH * OW;


+ 5
- 5
dnn/src/x86/conv_bias/f32/algos.h View File

@@ -20,11 +20,11 @@ class ConvBiasImpl::AlgoDirect final : public AlgoBase {
SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param) const; SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param) const;
WorkspaceBundle get_bundle(const NCBKernSizeParam& param) const; WorkspaceBundle get_bundle(const NCBKernSizeParam& param) const;


static void copy_padding_kern(WorkspaceBundle bundle,
static void copy_padding_kern(const WorkspaceBundle& bundle,
const NCBKernParam& kern_param, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index, const NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids); const CpuNDRange& workspace_ids);
static void do_conv_kern(WorkspaceBundle bundle,
static void do_conv_kern(const WorkspaceBundle& bundle,
const NCBKernParam& kern_param, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index, const NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids); const CpuNDRange& workspace_ids);
@@ -57,11 +57,11 @@ class ConvBiasImpl::AlgoDirectStride2 final : public AlgoBase {
SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param) const; SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param) const;
WorkspaceBundle get_bundle(const NCBKernSizeParam& param) const; WorkspaceBundle get_bundle(const NCBKernSizeParam& param) const;


static void copy_padding_kern(WorkspaceBundle bundle,
static void copy_padding_kern(const WorkspaceBundle& bundle,
const NCBKernParam& kern_param, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index, const NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids);
static void do_conv_kern(WorkspaceBundle bundle,
const CpuNDRange& workspace_ids);
static void do_conv_kern(const WorkspaceBundle& bundle,
const NCBKernParam& kern_param, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index, const NCBKernIndex& ncb_index,
const CpuNDRange& workspace_ids); const CpuNDRange& workspace_ids);


+ 6
- 7
dnn/src/x86/conv_bias/int8/avx2_chanwise_stride1.cpp View File

@@ -19,7 +19,7 @@ namespace x86 {
namespace avx2_chanwise_stride1 { namespace avx2_chanwise_stride1 {


template <size_t filter, BiasMode bias_mode, bool is_quantized, typename Op> template <size_t filter, BiasMode bias_mode, bool is_quantized, typename Op>
void conv_kimpl(WorkspaceBundle bundle, const NCBKernParam& kern_param,
void conv_kimpl(const WorkspaceBundle& bundle, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) { const NCBKernIndex& ncb_index) {
size_t OH = kern_param.osz[0]; size_t OH = kern_param.osz[0];
size_t OW = kern_param.osz[1]; size_t OW = kern_param.osz[1];
@@ -38,9 +38,6 @@ void conv_kimpl(WorkspaceBundle bundle, const NCBKernParam& kern_param,
op = Op(scale_bias, scale_dst); op = Op(scale_bias, scale_dst);
} }
size_t padding_group_size = IH2 * IW2; size_t padding_group_size = IH2 * IW2;

bundle.set(kern_param.workspace_ptr);

size_t workspace_group_id = ncb_index.thread_id; size_t workspace_group_id = ncb_index.thread_id;
size_t group_id = ncb_index.ndrange_id[0], size_t group_id = ncb_index.ndrange_id[0],
batch_id = ncb_index.ndrange_id[1]; batch_id = ncb_index.ndrange_id[1];
@@ -98,7 +95,7 @@ void conv_kimpl(WorkspaceBundle bundle, const NCBKernParam& kern_param,
} }
}; };
SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& kern_param, SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& kern_param,
WorkspaceBundle bundle) {
const WorkspaceBundle& bundle) {
MEGDNN_MARK_USED_VAR(kern_param); MEGDNN_MARK_USED_VAR(kern_param);
auto fm = kern_param.filter_meta; auto fm = kern_param.filter_meta;
size_t group = fm.group; size_t group = fm.group;
@@ -182,8 +179,10 @@ SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& kern_param,


DISPATCH_CONV_KERN(); DISPATCH_CONV_KERN();


auto exec_one_group = [bundle, do_conv_fun](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
auto exec_one_group = [bundle = bundle, do_conv_fun](
const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) mutable {
bundle.set(kern_param.workspace_ptr);
copy_padding_kern(bundle, kern_param, ncb_index); copy_padding_kern(bundle, kern_param, ncb_index);
do_conv_fun(bundle, kern_param, ncb_index); do_conv_fun(bundle, kern_param, ncb_index);
}; };


+ 2
- 2
dnn/src/x86/conv_bias/int8/avx2_chanwise_stride1.h View File

@@ -17,11 +17,11 @@
namespace megdnn { namespace megdnn {
namespace x86 { namespace x86 {
namespace avx2_chanwise_stride1 { namespace avx2_chanwise_stride1 {
using conv_fun = std::function<void(WorkspaceBundle bundle,
using conv_fun = std::function<void(const WorkspaceBundle& bundle,
const NCBKernParam& kern_param, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index)>; const NCBKernIndex& ncb_index)>;
SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param, SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param,
WorkspaceBundle bundle);
const WorkspaceBundle& bundle);


} // namespace avx2_chanwise_stride1 } // namespace avx2_chanwise_stride1
} // namespace x86 } // namespace x86


+ 6
- 7
dnn/src/x86/conv_bias/int8/avx2_chanwise_stride2.cpp View File

@@ -19,7 +19,7 @@ namespace x86 {
namespace avx2_chanwise_stride2 { namespace avx2_chanwise_stride2 {


template <size_t filter, BiasMode bias_mode, bool is_quantized, typename Op> template <size_t filter, BiasMode bias_mode, bool is_quantized, typename Op>
void conv_kimpl(WorkspaceBundle bundle, const NCBKernParam& kern_param,
void conv_kimpl(const WorkspaceBundle& bundle, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) { const NCBKernIndex& ncb_index) {
size_t OH = kern_param.osz[0]; size_t OH = kern_param.osz[0];
size_t OW = kern_param.osz[1]; size_t OW = kern_param.osz[1];
@@ -38,9 +38,6 @@ void conv_kimpl(WorkspaceBundle bundle, const NCBKernParam& kern_param,
op = Op(scale_bias, scale_dst); op = Op(scale_bias, scale_dst);
} }
size_t padding_group_size = IH2 * IW2; size_t padding_group_size = IH2 * IW2;

bundle.set(kern_param.workspace_ptr);

size_t workspace_group_id = ncb_index.thread_id; size_t workspace_group_id = ncb_index.thread_id;
size_t group_id = ncb_index.ndrange_id[0], size_t group_id = ncb_index.ndrange_id[0],
batch_id = ncb_index.ndrange_id[1]; batch_id = ncb_index.ndrange_id[1];
@@ -98,7 +95,7 @@ void conv_kimpl(WorkspaceBundle bundle, const NCBKernParam& kern_param,
} }
}; };
SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& kern_param, SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& kern_param,
WorkspaceBundle bundle) {
const WorkspaceBundle& bundle) {
MEGDNN_MARK_USED_VAR(kern_param); MEGDNN_MARK_USED_VAR(kern_param);
auto fm = kern_param.filter_meta; auto fm = kern_param.filter_meta;
size_t group = fm.group; size_t group = fm.group;
@@ -187,8 +184,10 @@ SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& kern_param,


DISPATCH_CONV_KERN(); DISPATCH_CONV_KERN();


auto exec_one_group = [bundle, do_conv_fun](const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) {
auto exec_one_group = [bundle = bundle, do_conv_fun](
const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index) mutable {
bundle.set(kern_param.workspace_ptr);
copy_padding_kern(bundle, kern_param, ncb_index); copy_padding_kern(bundle, kern_param, ncb_index);
do_conv_fun(bundle, kern_param, ncb_index); do_conv_fun(bundle, kern_param, ncb_index);
}; };


+ 2
- 2
dnn/src/x86/conv_bias/int8/avx2_chanwise_stride2.h View File

@@ -17,11 +17,11 @@
namespace megdnn { namespace megdnn {
namespace x86 { namespace x86 {
namespace avx2_chanwise_stride2 { namespace avx2_chanwise_stride2 {
using conv_fun = std::function<void(WorkspaceBundle bundle,
using conv_fun = std::function<void(const WorkspaceBundle& bundle,
const NCBKernParam& kern_param, const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index)>; const NCBKernIndex& ncb_index)>;
SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param, SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param,
WorkspaceBundle bundle);
const WorkspaceBundle& bundle);


} // namespace avx2_chanwise_stride2 } // namespace avx2_chanwise_stride2
} // namespace x86 } // namespace x86


+ 14
- 18
dnn/src/x86/conv_bias/int8/avx2_direct_conv_stride1.cpp View File

@@ -19,7 +19,7 @@ namespace direct_conv_avx2_stride1 {


//! layout:(N,IC,IH,IW)-->(N,IC/2,H,W,2) //! layout:(N,IC,IH,IW)-->(N,IC/2,H,W,2)
MEGDNN_ATTRIBUTE_TARGET("sse4.1") MEGDNN_ATTRIBUTE_TARGET("sse4.1")
void pack_src_conv_avx2_stride1(WorkspaceBundle bundle,
void pack_src_conv_avx2_stride1(const WorkspaceBundle& bundle,
const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernParam& kern_param,
const ConvBiasImpl::NCBKernIndex& ncb_index) { const ConvBiasImpl::NCBKernIndex& ncb_index) {
int32_t ih = kern_param.isz[0]; int32_t ih = kern_param.isz[0];
@@ -48,7 +48,6 @@ void pack_src_conv_avx2_stride1(WorkspaceBundle bundle,


const int8_t* src_ptr = kern_param.src<int8_t>(batch_id, group_id) + const int8_t* src_ptr = kern_param.src<int8_t>(batch_id, group_id) +
ic_step * channel_id * c_stride; ic_step * channel_id * c_stride;
bundle.set(kern_param.workspace_ptr);
int8_t* packed_src = static_cast<int8_t*>(bundle.get(0)) + int8_t* packed_src = static_cast<int8_t*>(bundle.get(0)) +
batch_id * group * packed_group_size + batch_id * group * packed_group_size +
group_id * packed_group_size + group_id * packed_group_size +
@@ -103,7 +102,7 @@ void pack_src_conv_avx2_stride1(WorkspaceBundle bundle,


MEGDNN_ATTRIBUTE_TARGET("sse4.1") MEGDNN_ATTRIBUTE_TARGET("sse4.1")
static inline void pack_filter_conv_avx2_stride1( static inline void pack_filter_conv_avx2_stride1(
WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param,
const WorkspaceBundle& bundle, const ConvBiasImpl::NCBKernParam& kern_param,
const ConvBiasImpl::NCBKernIndex& ncb_index) { const ConvBiasImpl::NCBKernIndex& ncb_index) {
MEGDNN_MARK_USED_VAR(ncb_index); MEGDNN_MARK_USED_VAR(ncb_index);
int32_t oc = kern_param.filter_meta.ocpg; int32_t oc = kern_param.filter_meta.ocpg;
@@ -129,7 +128,6 @@ static inline void pack_filter_conv_avx2_stride1(
oc_index_id = ncb_index.ndrange_id[1]; oc_index_id = ncb_index.ndrange_id[1];


const int8_t* pack_filter_ptr = kern_param.filter<int8_t>(group_id); const int8_t* pack_filter_ptr = kern_param.filter<int8_t>(group_id);
bundle.set(kern_param.workspace_ptr);
int16_t* out_ptr = static_cast<int16_t*>(bundle.get(1)) + int16_t* out_ptr = static_cast<int16_t*>(bundle.get(1)) +
group_id * round_up(oc, oc_step) * oc_out_stride; group_id * round_up(oc, oc_step) * oc_out_stride;


@@ -602,7 +600,7 @@ inline void AlgoAVX2DirectConvStride1S8S8S32_forward(
#undef cb_switch #undef cb_switch
#undef cb #undef cb
} }
void do_conv_kern(WorkspaceBundle bundle,
void do_conv_kern(const WorkspaceBundle& bundle,
const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernParam& kern_param,
const ConvBiasImpl::NCBKernIndex& ncb_index) { const ConvBiasImpl::NCBKernIndex& ncb_index) {
auto&& fm = kern_param.filter_meta; auto&& fm = kern_param.filter_meta;
@@ -635,8 +633,6 @@ void do_conv_kern(WorkspaceBundle bundle,
batch_id = ncb_index.ndrange_id[1], batch_id = ncb_index.ndrange_id[1],
channel_id = ncb_index.ndrange_id[2]; channel_id = ncb_index.ndrange_id[2];


bundle.set(kern_param.workspace_ptr);

int8_t* src_ptr = static_cast<int8_t*>(bundle.get(0)) + int8_t* src_ptr = static_cast<int8_t*>(bundle.get(0)) +
group_id * packed_group_size + group_id * packed_group_size +
batch_id * group * packed_group_size; batch_id * group * packed_group_size;
@@ -672,7 +668,7 @@ void do_conv_kern(WorkspaceBundle bundle,
oc_stride, kern_param); oc_stride, kern_param);
} }


void do_post_process(WorkspaceBundle bundle,
void do_post_process(const WorkspaceBundle& bundle,
const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernParam& kern_param,
const ConvBiasImpl::NCBKernIndex& ncb_index) { const ConvBiasImpl::NCBKernIndex& ncb_index) {
auto&& fm = kern_param.filter_meta; auto&& fm = kern_param.filter_meta;
@@ -683,7 +679,6 @@ void do_post_process(WorkspaceBundle bundle,


size_t group_id = ncb_index.ndrange_id[0], size_t group_id = ncb_index.ndrange_id[0],
batch_id = ncb_index.ndrange_id[1]; batch_id = ncb_index.ndrange_id[1];
bundle.set(kern_param.workspace_ptr);
bool need_post_process = bool need_post_process =
kern_param.dst_type.enumv() == DTypeEnum::QuantizedS8; kern_param.dst_type.enumv() == DTypeEnum::QuantizedS8;
void* dst_tptr = nullptr; void* dst_tptr = nullptr;
@@ -729,21 +724,22 @@ void do_post_process(WorkspaceBundle bundle,
} }


SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& kern_param, SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& kern_param,
WorkspaceBundle bundle) {
const WorkspaceBundle& bundle) {
SmallVector<NCBKern> ncb_kerns; SmallVector<NCBKern> ncb_kerns;
auto fm = kern_param.filter_meta; auto fm = kern_param.filter_meta;
size_t N = kern_param.n; size_t N = kern_param.n;
size_t IC = kern_param.filter_meta.icpg; size_t IC = kern_param.filter_meta.icpg;
size_t OC = kern_param.filter_meta.ocpg; size_t OC = kern_param.filter_meta.ocpg;
size_t group = fm.group; size_t group = fm.group;
#define cb(task) \
auto task = [bundle, tmp_func]( \
const ConvBiasImpl::NCBKernParam& kern_param, \
const ConvBiasImpl::NCBKernIndex& ncb_index) { \
tmp_func(bundle, kern_param, \
{ncb_index.thread_id, \
{ncb_index.ndrange_id[0], ncb_index.ndrange_id[1], \
ncb_index.ndrange_id[2]}}); \
#define cb(task) \
auto task = [bundle = bundle, tmp_func]( \
const ConvBiasImpl::NCBKernParam& kern_param, \
const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { \
bundle.set(kern_param.workspace_ptr); \
tmp_func(bundle, kern_param, \
{ncb_index.thread_id, \
{ncb_index.ndrange_id[0], ncb_index.ndrange_id[1], \
ncb_index.ndrange_id[2]}}); \
}; };
auto tmp_func = pack_src_conv_avx2_stride1; auto tmp_func = pack_src_conv_avx2_stride1;
cb(pack_src_task); cb(pack_src_task);


+ 1
- 1
dnn/src/x86/conv_bias/int8/avx2_direct_conv_stride1.h View File

@@ -20,7 +20,7 @@ using NCBKern = fallback::ConvBiasImpl::NCBKern;
using NCBKernSizeParam = fallback::ConvBiasImpl::NCBKernSizeParam; using NCBKernSizeParam = fallback::ConvBiasImpl::NCBKernSizeParam;


SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param, SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param,
WorkspaceBundle bundle);
const WorkspaceBundle& bundle);


} // namespace direct_conv_avx2_stride1 } // namespace direct_conv_avx2_stride1
} // namespace x86 } // namespace x86


+ 14
- 17
dnn/src/x86/conv_bias/int8/avx2_direct_conv_stride2.cpp View File

@@ -19,7 +19,7 @@ namespace direct_conv_avx2_stride2 {


//! layout:(N,IC,IH,IW)-->(N,IC/2,H,2*W_envnW_odd) //! layout:(N,IC,IH,IW)-->(N,IC/2,H,2*W_envnW_odd)
MEGDNN_ATTRIBUTE_TARGET("sse4.1") MEGDNN_ATTRIBUTE_TARGET("sse4.1")
void pack_src_conv_avx2_stride2(WorkspaceBundle bundle,
void pack_src_conv_avx2_stride2(const WorkspaceBundle& bundle,
const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernParam& kern_param,
const ConvBiasImpl::NCBKernIndex& ncb_index) { const ConvBiasImpl::NCBKernIndex& ncb_index) {
int32_t ih = kern_param.isz[0]; int32_t ih = kern_param.isz[0];
@@ -46,7 +46,6 @@ void pack_src_conv_avx2_stride2(WorkspaceBundle bundle,


const int8_t* src_ptr = kern_param.src<int8_t>(batch_id, group_id) + const int8_t* src_ptr = kern_param.src<int8_t>(batch_id, group_id) +
ic_step * channel_id * c_stride; ic_step * channel_id * c_stride;
bundle.set(kern_param.workspace_ptr);
int8_t* packed_src = static_cast<int8_t*>(bundle.get(0)) + int8_t* packed_src = static_cast<int8_t*>(bundle.get(0)) +
batch_id * group * packed_group_size + batch_id * group * packed_group_size +
group_id * packed_group_size + group_id * packed_group_size +
@@ -161,7 +160,7 @@ void pack_src_conv_avx2_stride2(WorkspaceBundle bundle,


MEGDNN_ATTRIBUTE_TARGET("sse4.1") MEGDNN_ATTRIBUTE_TARGET("sse4.1")
static inline void pack_filter_conv_avx2_stride2( static inline void pack_filter_conv_avx2_stride2(
WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param,
const WorkspaceBundle& bundle, const ConvBiasImpl::NCBKernParam& kern_param,
const ConvBiasImpl::NCBKernIndex& ncb_index) { const ConvBiasImpl::NCBKernIndex& ncb_index) {
MEGDNN_MARK_USED_VAR(ncb_index); MEGDNN_MARK_USED_VAR(ncb_index);
int32_t oc = kern_param.filter_meta.ocpg; int32_t oc = kern_param.filter_meta.ocpg;
@@ -187,7 +186,6 @@ static inline void pack_filter_conv_avx2_stride2(
oc_index_id = ncb_index.ndrange_id[1]; oc_index_id = ncb_index.ndrange_id[1];


const int8_t* pack_filter_ptr = kern_param.filter<int8_t>(group_id); const int8_t* pack_filter_ptr = kern_param.filter<int8_t>(group_id);
bundle.set(kern_param.workspace_ptr);
int16_t* out_ptr = static_cast<int16_t*>(bundle.get(1)) + int16_t* out_ptr = static_cast<int16_t*>(bundle.get(1)) +
group_id * round_up(oc, oc_step) * oc_out_stride; group_id * round_up(oc, oc_step) * oc_out_stride;


@@ -675,7 +673,7 @@ inline void kernel_handle_oh_remain(
#undef cb_switch #undef cb_switch
#undef cb #undef cb
} }
void kernel_imp(WorkspaceBundle bundle,
void kernel_imp(const WorkspaceBundle& bundle,
const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernParam& kern_param,
const ConvBiasImpl::NCBKernIndex& ncb_index) { const ConvBiasImpl::NCBKernIndex& ncb_index) {
auto&& fm = kern_param.filter_meta; auto&& fm = kern_param.filter_meta;
@@ -708,7 +706,6 @@ void kernel_imp(WorkspaceBundle bundle,
batch_id = ncb_index.ndrange_id[1], batch_id = ncb_index.ndrange_id[1],
channel_id = ncb_index.ndrange_id[2]; channel_id = ncb_index.ndrange_id[2];


bundle.set(kern_param.workspace_ptr);
int8_t* src_ptr = static_cast<int8_t*>(bundle.get(0)) + int8_t* src_ptr = static_cast<int8_t*>(bundle.get(0)) +
group_id * packed_group_size + group_id * packed_group_size +
batch_id * group * packed_group_size; batch_id * group * packed_group_size;
@@ -742,7 +739,7 @@ void kernel_imp(WorkspaceBundle bundle,
oc_stride, kern_param); oc_stride, kern_param);
} }


void do_post_process(WorkspaceBundle bundle,
void do_post_process(const WorkspaceBundle& bundle,
const ConvBiasImpl::NCBKernParam& kern_param, const ConvBiasImpl::NCBKernParam& kern_param,
const ConvBiasImpl::NCBKernIndex& ncb_index) { const ConvBiasImpl::NCBKernIndex& ncb_index) {
auto&& fm = kern_param.filter_meta; auto&& fm = kern_param.filter_meta;
@@ -754,7 +751,6 @@ void do_post_process(WorkspaceBundle bundle,
size_t group_id = ncb_index.ndrange_id[0], size_t group_id = ncb_index.ndrange_id[0],
batch_id = ncb_index.ndrange_id[1]; batch_id = ncb_index.ndrange_id[1];


bundle.set(kern_param.workspace_ptr);
bool need_post_process = bool need_post_process =
kern_param.dst_type.enumv() == DTypeEnum::QuantizedS8; kern_param.dst_type.enumv() == DTypeEnum::QuantizedS8;
void* dst_tptr = nullptr; void* dst_tptr = nullptr;
@@ -801,21 +797,22 @@ void do_post_process(WorkspaceBundle bundle,
} }


SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& kern_param, SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& kern_param,
WorkspaceBundle bundle) {
const WorkspaceBundle& bundle) {
SmallVector<NCBKern> ncb_kerns; SmallVector<NCBKern> ncb_kerns;
auto fm = kern_param.filter_meta; auto fm = kern_param.filter_meta;
size_t N = kern_param.n; size_t N = kern_param.n;
size_t IC = kern_param.filter_meta.icpg; size_t IC = kern_param.filter_meta.icpg;
size_t OC = kern_param.filter_meta.ocpg; size_t OC = kern_param.filter_meta.ocpg;
size_t group = fm.group; size_t group = fm.group;
#define cb(task) \
auto task = [bundle, tmp_func]( \
const ConvBiasImpl::NCBKernParam& kern_param, \
const ConvBiasImpl::NCBKernIndex& ncb_index) { \
tmp_func(bundle, kern_param, \
{ncb_index.thread_id, \
{ncb_index.ndrange_id[0], ncb_index.ndrange_id[1], \
ncb_index.ndrange_id[2]}}); \
#define cb(task) \
auto task = [bundle = bundle, tmp_func]( \
const ConvBiasImpl::NCBKernParam& kern_param, \
const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { \
bundle.set(kern_param.workspace_ptr); \
tmp_func(bundle, kern_param, \
{ncb_index.thread_id, \
{ncb_index.ndrange_id[0], ncb_index.ndrange_id[1], \
ncb_index.ndrange_id[2]}}); \
}; };
auto tmp_func = pack_src_conv_avx2_stride2; auto tmp_func = pack_src_conv_avx2_stride2;
cb(pack_src_task); cb(pack_src_task);


+ 1
- 1
dnn/src/x86/conv_bias/int8/avx2_direct_conv_stride2.h View File

@@ -20,7 +20,7 @@ using NCBKern = fallback::ConvBiasImpl::NCBKern;
using NCBKernSizeParam = fallback::ConvBiasImpl::NCBKernSizeParam; using NCBKernSizeParam = fallback::ConvBiasImpl::NCBKernSizeParam;


SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param, SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param,
WorkspaceBundle bundle);
const WorkspaceBundle& bundle);


} // namespace direct_conv_avx2_stride2 } // namespace direct_conv_avx2_stride2
} // namespace x86 } // namespace x86


+ 1
- 2
dnn/src/x86/conv_bias/int8/chanwise_helper.h View File

@@ -48,7 +48,7 @@ static inline void get_rectified_size(const NCBKernSizeParam& param,
} }


static inline void copy_padding_kern( static inline void copy_padding_kern(
WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param,
const WorkspaceBundle& bundle, const ConvBiasImpl::NCBKernParam& kern_param,
const ConvBiasImpl::NCBKernIndex& ncb_index) { const ConvBiasImpl::NCBKernIndex& ncb_index) {
size_t IW = kern_param.isz[1]; size_t IW = kern_param.isz[1];
size_t IH = kern_param.isz[0]; size_t IH = kern_param.isz[0];
@@ -59,7 +59,6 @@ static inline void copy_padding_kern(
get_rectified_size(kern_param, IH2, IW2, OH2, OW2); get_rectified_size(kern_param, IH2, IW2, OH2, OW2);
bool need_src_copy_var = need_src_copy(kern_param); bool need_src_copy_var = need_src_copy(kern_param);
size_t padding_group_size = IH2 * IW2; size_t padding_group_size = IH2 * IW2;
bundle.set(kern_param.workspace_ptr);


size_t group_id = ncb_index.ndrange_id[0], size_t group_id = ncb_index.ndrange_id[0],
batch_id = ncb_index.ndrange_id[1], batch_id = ncb_index.ndrange_id[1],


Loading…
Cancel
Save