GitOrigin-RevId: 801aedbd72
release-0.6
@@ -89,19 +89,20 @@ ConvBiasImpl::AlgoF16DirectStride2::get_kimpls(
         conv = fp16::conv_stride2::do_conv_7x7_stride2;
     }
-    WorkspaceBundle wbundle = arm_common::MultithreadDirectConvCommon<
+    WorkspaceBundle bundle = arm_common::MultithreadDirectConvCommon<
             dt_float16, __fp16>::get_bundle_stride(param, m_large_group);
     SmallVector<NCBKern> ret_kerns;
     //! Dense conv and small group
     if (m_large_group) {
         //! Channel wise conv and big groups
-        auto exec_one_group = [wbundle, conv](const NCBKernParam& kern_param,
-                                              const NCBKernIndex& ncb_index) {
+        auto exec_one_group = [bundle, conv](
+                                      const NCBKernParam& kern_param,
+                                      const NCBKernIndex& ncb_index) mutable {
             auto fm = kern_param.filter_meta;
             size_t IC = fm.icpg;
             size_t OC = fm.ocpg;
-            WorkspaceBundle bundle = wbundle;
             bundle.set(kern_param.workspace_ptr);
             for (size_t ic = 0; ic < IC; ic++) {
                 arm_common::MultithreadDirectConvCommon<dt_float16, __fp16>::
                         copy_padding_kern_stride(bundle, kern_param, ncb_index,
@@ -115,16 +116,17 @@ ConvBiasImpl::AlgoF16DirectStride2::get_kimpls(
         };
         ret_kerns.push_back({exec_one_group, {group, N, 1_z}});
     } else {
-        WorkspaceBundle bundle = wbundle;
         auto copy_padding = [bundle](const NCBKernParam& kern_param,
-                                     const NCBKernIndex& ncb_index) {
+                                     const NCBKernIndex& ncb_index) mutable {
+            bundle.set(kern_param.workspace_ptr);
             arm_common::MultithreadDirectConvCommon<dt_float16, __fp16>::
                     copy_padding_kern_stride(bundle, kern_param, ncb_index,
                                              ncb_index.ndrange_id);
         };
         ret_kerns.push_back({copy_padding, {group, N, IC}});
         auto do_conv = [bundle, conv](const NCBKernParam& kern_param,
-                                      const NCBKernIndex& ncb_index) {
+                                      const NCBKernIndex& ncb_index) mutable {
+            bundle.set(kern_param.workspace_ptr);
             arm_common::MultithreadDirectConvCommon<dt_float16, __fp16>::
                     do_conv_kern_stride(bundle, kern_param, ncb_index, conv,
                                         ncb_index.ndrange_id);
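Note on the pattern this patch applies in every `get_kimpls` / `dispatch_kerns`: the `WorkspaceBundle` is captured by value in the kernel lambda, the lambda is marked `mutable`, and the per-invocation workspace pointer is bound with `bundle.set(kern_param.workspace_ptr)` before calling the kernel helper, which in turn can now take the bundle by `const` reference. A minimal, self-contained sketch of that idea, using simplified stand-in types (`Bundle`, `Param`) rather than the real megdnn classes:

```cpp
#include <cstddef>
#include <cstdio>
#include <vector>

// Simplified stand-ins for WorkspaceBundle / NCBKernParam (assumptions, not the real API).
struct Bundle {
    void* base = nullptr;
    void set(void* ptr) { base = ptr; }  // rebind to this call's workspace
    void* get(std::size_t idx) const { return static_cast<char*>(base) + idx * 64; }
};
struct Param {
    void* workspace_ptr = nullptr;
};

// The callee only reads the bundle, so it can take a const reference.
void do_conv_kern(const Bundle& bundle, const Param& /*param*/) {
    std::printf("conv using workspace chunk %p\n", bundle.get(0));
}

int main() {
    Bundle bundle;  // layout decided up front, no pointer bound yet
    // Capture by value + mutable: the lambda rebinds the workspace pointer per call.
    auto do_conv = [bundle](const Param& param) mutable {
        bundle.set(param.workspace_ptr);
        do_conv_kern(bundle, param);
    };

    std::vector<char> workspace(4096);
    Param param{workspace.data()};
    do_conv(param);
}
```

The callee only reads the bundle afterwards, which is why the helper signatures later in the patch change from `WorkspaceBundle` by value to `const WorkspaceBundle&`.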
@@ -88,19 +88,20 @@ ConvBiasImpl::AlgoF32DirectStride2::get_kimpls(
         conv = fp32::conv_stride2::do_conv_7x7_stride2;
     }
-    WorkspaceBundle wbundle = arm_common::MultithreadDirectConvCommon<
+    WorkspaceBundle bundle = arm_common::MultithreadDirectConvCommon<
            float, float>::get_bundle_stride(param, m_large_group);
     SmallVector<NCBKern> ret_kerns;
     //! Dense conv and small group
     if (m_large_group) {
         //! Channel wise conv and big groups
-        auto exec_one_group = [wbundle, conv](const NCBKernParam& kern_param,
-                                              const NCBKernIndex& ncb_index) {
+        auto exec_one_group = [bundle, conv](
+                                      const NCBKernParam& kern_param,
+                                      const NCBKernIndex& ncb_index) mutable {
             auto fm = kern_param.filter_meta;
             size_t IC = fm.icpg;
             size_t OC = fm.ocpg;
-            WorkspaceBundle bundle = wbundle;
             bundle.set(kern_param.workspace_ptr);
             for (size_t ic = 0; ic < IC; ic++) {
                 arm_common::MultithreadDirectConvCommon<float, float>::
                         copy_padding_kern_stride(bundle, kern_param, ncb_index,
@@ -116,16 +117,17 @@ ConvBiasImpl::AlgoF32DirectStride2::get_kimpls(
         };
         ret_kerns.push_back({exec_one_group, {group, N, 1_z}});
     } else {
-        WorkspaceBundle bundle = wbundle;
         auto copy_padding = [bundle](const NCBKernParam& kern_param,
-                                     const NCBKernIndex& ncb_index) {
+                                     const NCBKernIndex& ncb_index) mutable {
+            bundle.set(kern_param.workspace_ptr);
             arm_common::MultithreadDirectConvCommon<float, float>::
                     copy_padding_kern_stride(bundle, kern_param, ncb_index,
                                              ncb_index.ndrange_id);
         };
         ret_kerns.push_back({copy_padding, {group, N, IC}});
         auto do_conv = [bundle, conv](const NCBKernParam& kern_param,
-                                      const NCBKernIndex& ncb_index) {
+                                      const NCBKernIndex& ncb_index) mutable {
+            bundle.set(kern_param.workspace_ptr);
             arm_common::MultithreadDirectConvCommon<
                     float, float>::do_conv_kern_stride(bundle, kern_param,
                                                        ncb_index, conv,
@@ -119,7 +119,8 @@ MultithreadDirectConvCommon<io_ctype, compute_ctype>::get_bundle_stride(
 //! Process one output channel weight flip
 template <typename io_ctype, typename compute_ctype>
 void MultithreadDirectConvCommon<io_ctype, compute_ctype>::weight_flip_kern(
-        WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param,
+        const WorkspaceBundle& bundle,
+        const ConvBiasImpl::NCBKernParam& kern_param,
         const ConvBiasImpl::NCBKernIndex& ncb_index,
         const CpuNDRange& workspace_ids) {
     size_t FH = kern_param.filter_meta.spatial[0];
@@ -131,7 +132,6 @@ void MultithreadDirectConvCommon<io_ctype, compute_ctype>::weight_flip_kern(
            group_id = ncb_index.ndrange_id[0];
     const io_ctype* filter =
             kern_param.filter<io_ctype>(group_id) + channel_id * FH * FW * IC;
-    bundle.set(kern_param.workspace_ptr);
     io_ctype* filter_flip =
             static_cast<io_ctype*>(bundle.get(1)) +
             (workspace_group_id * IC * OC + channel_id * IC) * FH * FW;
@@ -148,7 +148,8 @@ void MultithreadDirectConvCommon<io_ctype, compute_ctype>::weight_flip_kern(
 //! Process one input channel copy padding
 template <typename io_ctype, typename compute_ctype>
 void MultithreadDirectConvCommon<io_ctype, compute_ctype>::copy_padding_kern(
-        WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param,
+        const WorkspaceBundle& bundle,
+        const ConvBiasImpl::NCBKernParam& kern_param,
         const ConvBiasImpl::NCBKernIndex& ncb_index,
         const CpuNDRange& workspace_ids) {
     size_t IH = kern_param.isz[0];
@@ -161,7 +162,6 @@ void MultithreadDirectConvCommon<io_ctype, compute_ctype>::copy_padding_kern(
     size_t padding_group_size = IH2 * IW2 * IC;
     size_t N = kern_param.n;
     size_t GROUP = kern_param.filter_meta.group;
-    bundle.set(kern_param.workspace_ptr);
     //! Used for get the workspace offset
     size_t workspace_group_id = workspace_ids[0],
@@ -191,7 +191,7 @@ void MultithreadDirectConvCommon<io_ctype, compute_ctype>::copy_padding_kern(
 //! Process one input channel copy padding
 template <typename io_ctype, typename compute_ctype>
 void MultithreadDirectConvCommon<io_ctype, compute_ctype>::
-        copy_padding_kern_stride(WorkspaceBundle bundle,
+        copy_padding_kern_stride(const WorkspaceBundle& bundle,
                                  const ConvBiasImpl::NCBKernParam& kern_param,
                                  const ConvBiasImpl::NCBKernIndex& ncb_index,
                                  const CpuNDRange& workspace_ids) {
@@ -208,7 +208,6 @@ void MultithreadDirectConvCommon<io_ctype, compute_ctype>::
     size_t GROUP = kern_param.filter_meta.group;
     get_rectified_size(kern_param, IH, IW, OH, OW, FH, FW, PH, PW, IH2, IW2, OW2);
     size_t padding_group_size = IH2 * IW2 * IC;
-    bundle.set(kern_param.workspace_ptr);
     //! Used for get the workspace offset
     size_t workspace_group_id = workspace_ids[0],
@@ -235,7 +234,8 @@ void MultithreadDirectConvCommon<io_ctype, compute_ctype>::
 //! compute one output channel
 template <typename io_ctype, typename compute_ctype>
 void MultithreadDirectConvCommon<io_ctype, compute_ctype>::do_conv_kern(
-        WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param,
+        const WorkspaceBundle& bundle,
+        const ConvBiasImpl::NCBKernParam& kern_param,
         const ConvBiasImpl::NCBKernIndex& ncb_index,
         const kern_direct_conv_f32& fun, const CpuNDRange& workspace_ids) {
     size_t OH = kern_param.osz[0];
@@ -251,7 +251,6 @@ void MultithreadDirectConvCommon<io_ctype, compute_ctype>::do_conv_kern(
     size_t padding_group_size = IH2 * IW2 * IC;
     size_t N = kern_param.n;
     size_t GROUP = kern_param.filter_meta.group;
-    bundle.set(kern_param.workspace_ptr);
     size_t group_id = ncb_index.ndrange_id[0],
            batch_id = ncb_index.ndrange_id[1];
@@ -305,7 +304,8 @@ void MultithreadDirectConvCommon<io_ctype, compute_ctype>::do_conv_kern(
 //! compute one output channel
 template <typename io_ctype, typename compute_ctype>
 void MultithreadDirectConvCommon<io_ctype, compute_ctype>::do_conv_kern_stride(
-        WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param,
+        const WorkspaceBundle& bundle,
+        const ConvBiasImpl::NCBKernParam& kern_param,
         const ConvBiasImpl::NCBKernIndex& ncb_index,
         const kern_direct_conv_f32_stride& fun,
         const CpuNDRange& workspace_ids) {
@@ -323,7 +323,6 @@ void MultithreadDirectConvCommon<io_ctype, compute_ctype>::do_conv_kern_stride(
     size_t padding_group_size = IH2 * IW2 * IC;
     size_t GROUP = kern_param.filter_meta.group;
-    bundle.set(kern_param.workspace_ptr);
     //! Used for get the workspace offset
     size_t group_id = ncb_index.ndrange_id[0],
@@ -35,24 +35,24 @@ public:
                                      bool m_large_group);
     static WorkspaceBundle get_bundle_stride(const NCBKernSizeParam& param,
                                              bool m_large_group);
-    static void weight_flip_kern(WorkspaceBundle bundle,
+    static void weight_flip_kern(const WorkspaceBundle& bundle,
                                  const NCBKernParam& kern_param,
                                  const NCBKernIndex& ncb_index,
                                  const CpuNDRange& workspace_ids);
-    static void copy_padding_kern(WorkspaceBundle bundle,
+    static void copy_padding_kern(const WorkspaceBundle& bundle,
                                   const NCBKernParam& kern_param,
                                   const NCBKernIndex& ncb_index,
                                   const CpuNDRange& workspace_ids);
-    static void copy_padding_kern_stride(WorkspaceBundle bundle,
+    static void copy_padding_kern_stride(const WorkspaceBundle& bundle,
                                          const NCBKernParam& kern_param,
                                          const NCBKernIndex& ncb_index,
                                          const CpuNDRange& workspace_ids);
-    static void do_conv_kern(WorkspaceBundle bundle,
+    static void do_conv_kern(const WorkspaceBundle& bundle,
                              const NCBKernParam& kern_param,
                              const NCBKernIndex& ncb_index,
                              const kern_direct_conv_f32& fun,
                              const CpuNDRange& workspace_ids);
-    static void do_conv_kern_stride(WorkspaceBundle bundle,
+    static void do_conv_kern_stride(const WorkspaceBundle& bundle,
                                     const NCBKernParam& kern_param,
                                     const NCBKernIndex& ncb_index,
                                     const kern_direct_conv_f32_stride& fun,
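The declarations above, and the matching definitions earlier in the patch, move the bundle from pass-by-value to pass-by-const-reference, with the `bundle.set(...)` call hoisted out of the kernels and into the dispatching lambdas. A hedged before/after sketch with simplified stand-in types (not the real megdnn declarations):

```cpp
#include <cstddef>

// Simplified stand-in types for the sketch; not the real megdnn classes.
struct Bundle {
    void* base = nullptr;
    void set(void* ptr) { base = ptr; }
    void* get(std::size_t i) const { return static_cast<char*>(base) + i * 64; }
};
struct KernParam {
    void* workspace_ptr = nullptr;
};

// Before: the bundle was copied into every kernel call, and the kernel rebound
// the workspace pointer on its local copy.
void copy_padding_kern_old(Bundle bundle, const KernParam& p) {
    bundle.set(p.workspace_ptr);
    void* padded_src = bundle.get(0);  // ... use the workspace ...
    (void)padded_src;
}

// After: the caller binds the pointer once (in the mutable lambda) and the
// kernel only reads the bundle, so it takes a const reference and no copy is
// made per invocation.
void copy_padding_kern_new(const Bundle& bundle, const KernParam& p) {
    void* padded_src = bundle.get(0);  // ... use the workspace ...
    (void)padded_src;
    (void)p;
}
```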
@@ -362,7 +362,7 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF16Direct::get_kimpls(
     size_t IC = param.filter_meta.icpg;
     size_t OC = param.filter_meta.ocpg;
     size_t group = fm.group;
-    WorkspaceBundle wbundle =
+    WorkspaceBundle bundle =
             MultithreadDirectConvCommon<dt_float16, __fp16>::get_bundle(
                     param, m_large_group);
     SmallVector<NCBKern> ret_kerns;
@@ -370,12 +370,12 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF16Direct::get_kimpls(
     //! one group for better performance
     if (m_large_group) {
         //! Channel wise conv and big groups
-        auto exec_one_group = [wbundle](const NCBKernParam& kern_param,
-                                        const NCBKernIndex& ncb_index) {
+        auto exec_one_group = [bundle](const NCBKernParam& kern_param,
+                                       const NCBKernIndex& ncb_index) mutable {
             auto fm = kern_param.filter_meta;
             size_t IC = fm.icpg;
             size_t OC = fm.ocpg;
-            WorkspaceBundle bundle = wbundle;
             bundle.set(kern_param.workspace_ptr);
             if (fm.should_flip) {
                 for (size_t oc = 0; oc < OC; oc++) {
                     MultithreadDirectConvCommon<dt_float16, __fp16>::
@@ -397,10 +397,10 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF16Direct::get_kimpls(
         };
         ret_kerns.push_back({exec_one_group, {group, N, 1_z}});
     } else {
-        WorkspaceBundle bundle = wbundle;
         if (fm.should_flip) {
             auto weight_flip = [bundle](const NCBKernParam& kern_param,
-                                        const NCBKernIndex& ncb_index) {
+                                        const NCBKernIndex& ncb_index) mutable {
+                bundle.set(kern_param.workspace_ptr);
                 MultithreadDirectConvCommon<dt_float16, __fp16>::
                         weight_flip_kern(bundle, kern_param, ncb_index,
                                          ncb_index.ndrange_id);
@@ -408,13 +408,15 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF16Direct::get_kimpls(
             ret_kerns.push_back({weight_flip, {group, 1_z, OC}});
         }
         auto copy_padding = [bundle](const NCBKernParam& kern_param,
-                                     const NCBKernIndex& ncb_index) {
+                                     const NCBKernIndex& ncb_index) mutable {
+            bundle.set(kern_param.workspace_ptr);
             MultithreadDirectConvCommon<dt_float16, __fp16>::copy_padding_kern(
                     bundle, kern_param, ncb_index, ncb_index.ndrange_id);
         };
         ret_kerns.push_back({copy_padding, {group, N, IC}});
         auto do_conv = [bundle](const NCBKernParam& kern_param,
-                                const NCBKernIndex& ncb_index) {
+                                const NCBKernIndex& ncb_index) mutable {
+            bundle.set(kern_param.workspace_ptr);
             MultithreadDirectConvCommon<dt_float16, __fp16>::do_conv_kern(
                     bundle, kern_param, ncb_index,
                     fp16::conv_bias::kern_direct_f16, ncb_index.ndrange_id);
@@ -488,7 +490,7 @@ ConvBiasImpl::AlgoF16DirectStride1::get_kimpls(
     }
     SWITCH_KERN();
-    WorkspaceBundle wbundle =
+    WorkspaceBundle bundle =
             MultithreadDirectConvCommon<dt_float16, __fp16>::get_bundle_stride(
                     param, m_large_group);
     SmallVector<NCBKern> ret_kerns;
@@ -496,13 +498,13 @@ ConvBiasImpl::AlgoF16DirectStride1::get_kimpls(
     //! one group for better performance
     if (m_large_group) {
         //! Channel wise conv and big groups
-        auto exec_one_group = [wbundle, conv_kern_function](
+        auto exec_one_group = [bundle, conv_kern_function](
                                       const NCBKernParam& kern_param,
-                                      const NCBKernIndex& ncb_index) {
+                                      const NCBKernIndex& ncb_index) mutable {
             auto fm = kern_param.filter_meta;
             size_t IC = fm.icpg;
             size_t OC = fm.ocpg;
-            WorkspaceBundle bundle = wbundle;
             bundle.set(kern_param.workspace_ptr);
             for (size_t ic = 0; ic < IC; ic++) {
                 MultithreadDirectConvCommon<dt_float16, __fp16>::
                         copy_padding_kern_stride(bundle, kern_param, ncb_index,
@@ -517,9 +519,9 @@ ConvBiasImpl::AlgoF16DirectStride1::get_kimpls(
         };
         ret_kerns.push_back({exec_one_group, {group, N, 1_z}});
     } else {
-        WorkspaceBundle bundle = wbundle;
         auto copy_padding = [bundle](const NCBKernParam& kern_param,
-                                     const NCBKernIndex& ncb_index) {
+                                     const NCBKernIndex& ncb_index) mutable {
+            bundle.set(kern_param.workspace_ptr);
             MultithreadDirectConvCommon<dt_float16, __fp16>::
                     copy_padding_kern_stride(bundle, kern_param, ncb_index,
                                              ncb_index.ndrange_id);
@@ -527,7 +529,8 @@ ConvBiasImpl::AlgoF16DirectStride1::get_kimpls(
         ret_kerns.push_back({copy_padding, {group, N, IC}});
         auto do_conv = [bundle, conv_kern_function](
                                const NCBKernParam& kern_param,
-                               const NCBKernIndex& ncb_index) {
+                               const NCBKernIndex& ncb_index) mutable {
+            bundle.set(kern_param.workspace_ptr);
             MultithreadDirectConvCommon<dt_float16, __fp16>::
                     do_conv_kern_stride(bundle, kern_param, ncb_index,
                                         conv_kern_function,
@@ -597,7 +597,7 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32Direct::get_kimpls(
     size_t IC = param.filter_meta.icpg;
     size_t OC = param.filter_meta.ocpg;
     size_t group = fm.group;
-    WorkspaceBundle wbundle =
+    WorkspaceBundle bundle =
             MultithreadDirectConvCommon<float, float>::get_bundle(
                     param, m_large_group);
     SmallVector<NCBKern> ret_kerns;
@@ -605,12 +605,12 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32Direct::get_kimpls(
     //! one group for better performance
     if (m_large_group) {
         //! Channel wise conv and big groups
-        auto exec_one_group = [wbundle](const NCBKernParam& kern_param,
-                                        const NCBKernIndex& ncb_index) {
+        auto exec_one_group = [bundle](const NCBKernParam& kern_param,
+                                       const NCBKernIndex& ncb_index) mutable {
             auto fm = kern_param.filter_meta;
             size_t IC = fm.icpg;
             size_t OC = fm.ocpg;
-            WorkspaceBundle bundle = wbundle;
             bundle.set(kern_param.workspace_ptr);
             if (fm.should_flip) {
                 for (size_t oc = 0; oc < OC; oc++) {
                     MultithreadDirectConvCommon<float, float>::weight_flip_kern(
@@ -631,23 +631,25 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32Direct::get_kimpls(
         };
         ret_kerns.push_back({exec_one_group, {group, N, 1_z}});
     } else {
-        WorkspaceBundle bundle = wbundle;
         if (fm.should_flip) {
             auto weight_flip = [bundle](const NCBKernParam& kern_param,
-                                        const NCBKernIndex& ncb_index) {
+                                        const NCBKernIndex& ncb_index) mutable {
+                bundle.set(kern_param.workspace_ptr);
                 MultithreadDirectConvCommon<float, float>::weight_flip_kern(
                         bundle, kern_param, ncb_index, ncb_index.ndrange_id);
             };
             ret_kerns.push_back({weight_flip, {group, 1_z, OC}});
         }
         auto copy_padding = [bundle](const NCBKernParam& kern_param,
-                                     const NCBKernIndex& ncb_index) {
+                                     const NCBKernIndex& ncb_index) mutable {
+            bundle.set(kern_param.workspace_ptr);
             MultithreadDirectConvCommon<float, float>::copy_padding_kern(
                     bundle, kern_param, ncb_index, ncb_index.ndrange_id);
         };
         ret_kerns.push_back({copy_padding, {group, N, IC}});
         auto do_conv = [bundle](const NCBKernParam& kern_param,
-                                const NCBKernIndex& ncb_index) {
+                                const NCBKernIndex& ncb_index) mutable {
+            bundle.set(kern_param.workspace_ptr);
             MultithreadDirectConvCommon<float, float>::do_conv_kern(
                     bundle, kern_param, ncb_index, fp32::conv_bias::kern_direct,
                     ncb_index.ndrange_id);
@@ -734,7 +736,7 @@ ConvBiasImpl::AlgoF32DirectStride1::get_kimpls(
     }
     SWITCH_KERN_STR1();
-    WorkspaceBundle wbundle =
+    WorkspaceBundle bundle =
             MultithreadDirectConvCommon<float, float>::get_bundle_stride(
                     param, m_large_group);
     SmallVector<NCBKern> ret_kerns;
@@ -742,13 +744,13 @@ ConvBiasImpl::AlgoF32DirectStride1::get_kimpls(
     //! one group for better performance
     if (m_large_group) {
         //! Channel wise conv and big groups
-        auto exec_one_group = [wbundle, conv_kern_function](
+        auto exec_one_group = [bundle, conv_kern_function](
                                       const NCBKernParam& kern_param,
-                                      const NCBKernIndex& ncb_index) {
+                                      const NCBKernIndex& ncb_index) mutable {
             auto fm = kern_param.filter_meta;
             size_t IC = fm.icpg;
             size_t OC = fm.ocpg;
-            WorkspaceBundle bundle = wbundle;
             bundle.set(kern_param.workspace_ptr);
             for (size_t ic = 0; ic < IC; ic++) {
                 MultithreadDirectConvCommon<float, float>::
                         copy_padding_kern_stride(bundle, kern_param, ncb_index,
@@ -762,16 +764,17 @@ ConvBiasImpl::AlgoF32DirectStride1::get_kimpls(
         };
         ret_kerns.push_back({exec_one_group, {group, N, 1_z}});
     } else {
-        WorkspaceBundle bundle = wbundle;
         auto copy_padding = [bundle](const NCBKernParam& kern_param,
-                                     const NCBKernIndex& ncb_index) {
+                                     const NCBKernIndex& ncb_index) mutable {
+            bundle.set(kern_param.workspace_ptr);
             MultithreadDirectConvCommon<float, float>::copy_padding_kern_stride(
                     bundle, kern_param, ncb_index, ncb_index.ndrange_id);
         };
         ret_kerns.push_back({copy_padding, {group, N, IC}});
         auto do_conv = [bundle, conv_kern_function](
                                const NCBKernParam& kern_param,
-                               const NCBKernIndex& ncb_index) {
+                               const NCBKernIndex& ncb_index) mutable {
+            bundle.set(kern_param.workspace_ptr);
             MultithreadDirectConvCommon<float, float>::do_conv_kern_stride(
                     bundle, kern_param, ncb_index, conv_kern_function,
                     ncb_index.ndrange_id);
@@ -859,7 +862,7 @@ ConvBiasImpl::AlgoF32DirectStride2::get_kimpls(
     }
     SWITCH_KERN_STR2();
-    WorkspaceBundle wbundle =
+    WorkspaceBundle bundle =
             MultithreadDirectConvCommon<float, float>::get_bundle_stride(
                     param, m_large_group);
     SmallVector<NCBKern> ret_kerns;
@@ -867,13 +870,13 @@ ConvBiasImpl::AlgoF32DirectStride2::get_kimpls(
     //! one group for better performance
     if (m_large_group) {
         //! Channel wise conv and big groups
-        auto exec_one_group = [wbundle, conv_kern_function](
+        auto exec_one_group = [bundle, conv_kern_function](
                                       const NCBKernParam& kern_param,
-                                      const NCBKernIndex& ncb_index) {
+                                      const NCBKernIndex& ncb_index) mutable {
             auto fm = kern_param.filter_meta;
             size_t IC = fm.icpg;
             size_t OC = fm.ocpg;
-            WorkspaceBundle bundle = wbundle;
             bundle.set(kern_param.workspace_ptr);
             for (size_t ic = 0; ic < IC; ic++) {
                 MultithreadDirectConvCommon<float, float>::
                         copy_padding_kern_stride(bundle, kern_param, ncb_index,
@@ -887,16 +890,17 @@ ConvBiasImpl::AlgoF32DirectStride2::get_kimpls(
         };
         ret_kerns.push_back({exec_one_group, {group, N, 1_z}});
     } else {
-        WorkspaceBundle bundle = wbundle;
         auto copy_padding = [bundle](const NCBKernParam& kern_param,
-                                     const NCBKernIndex& ncb_index) {
+                                     const NCBKernIndex& ncb_index) mutable {
+            bundle.set(kern_param.workspace_ptr);
             MultithreadDirectConvCommon<float, float>::copy_padding_kern_stride(
                     bundle, kern_param, ncb_index, ncb_index.ndrange_id);
         };
         ret_kerns.push_back({copy_padding, {group, N, IC}});
         auto do_conv = [bundle, conv_kern_function](
                                const NCBKernParam& kern_param,
-                               const NCBKernIndex& ncb_index) {
+                               const NCBKernIndex& ncb_index) mutable {
+            bundle.set(kern_param.workspace_ptr);
             MultithreadDirectConvCommon<float, float>::do_conv_kern_stride(
                     bundle, kern_param, ncb_index, conv_kern_function,
                     ncb_index.ndrange_id);
@@ -22,7 +22,8 @@
 using namespace megdnn;
 using namespace arm_common;
 using conv_fun = std::function<void(
-        WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param,
+        const WorkspaceBundle& bundle,
+        const ConvBiasImpl::NCBKernParam& kern_param,
         const ConvBiasImpl::NCBKernIndex& ncb_index,
         const CpuNDRange& workspace_ids, const CpuNDRange& ncb_range)>;
 MIDOUT_DECL(megdnn_arm_common_conv_bias_fp32_nchw44_stride1)
@@ -67,7 +68,7 @@ static WorkspaceBundle get_bundle(const ConvBiasImpl::NCBKernSizeParam& param) {
 };
 template <size_t filter, BiasMode bias_mode, typename Op, int stride>
-static void do_conv_kern(WorkspaceBundle bundle,
+static void do_conv_kern(const WorkspaceBundle& bundle,
                          const ConvBiasImpl::NCBKernParam& kern_param,
                          const ConvBiasImpl::NCBKernIndex& ncb_index,
                          const CpuNDRange&, const CpuNDRange&) {
@@ -87,7 +88,6 @@ static void do_conv_kern(WorkspaceBundle bundle,
     int oh2 = 0;
     int ow2 = 0;
     get_rectified_size(kern_param, ih2, iw2, oh2, ow2);
-    bundle.set(kern_param.workspace_ptr);
     constexpr int pack_c = 4;
     const int batch_id = ncb_index.ndrange_id[0];
@@ -281,7 +281,6 @@ ConvBiasImpl::AlgoF32DirectNCHW44::dispatch_kerns(
     megdnn_assert(do_conv_fun);
     SmallVector<ConvBiasImpl::NCBKern> ret_kerns;
-    WorkspaceBundle bundle = wbundle;
     int oh = param.osz[0];
     int ic = param.filter_meta.icpg;
     int iw = param.isz[1];
@@ -291,10 +290,11 @@ ConvBiasImpl::AlgoF32DirectNCHW44::dispatch_kerns(
     CpuNDRange ncb_range = {static_cast<size_t>(batch),
                             static_cast<size_t>(group),
                             static_cast<size_t>(div_ceil(oh, oh_block))};
-    auto do_conv = [bundle, do_conv_fun, ncb_range](
+    auto do_conv = [wbundle, do_conv_fun, ncb_range](
                            const NCBKernParam& kern_param,
-                           const NCBKernIndex& ncb_index) {
-        do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id,
+                           const NCBKernIndex& ncb_index) mutable {
+        wbundle.set(kern_param.workspace_ptr);
+        do_conv_fun(wbundle, kern_param, ncb_index, ncb_index.ndrange_id,
                     ncb_range);
     };
     ret_kerns.push_back({do_conv, ncb_range});
@@ -23,7 +23,8 @@
 using namespace megdnn;
 using namespace arm_common;
 using conv_fun = std::function<void(
-        WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param,
+        const WorkspaceBundle& bundle,
+        const ConvBiasImpl::NCBKernParam& kern_param,
         const ConvBiasImpl::NCBKernIndex& ncb_index,
         const CpuNDRange& workspace_ids, const CpuNDRange& ncb_range)>;
 MIDOUT_DECL(megdnn_arm_common_conv_bias_fp32_nchw_nchw44)
@@ -105,10 +106,9 @@ static inline void copy_pad_src(float* sptr_base, const float* sptr_origin,
         sptr_base += iw2 * pad_bottom;
     }
 }
-static void pack_weight(WorkspaceBundle bundle,
+static void pack_weight(const WorkspaceBundle& bundle,
                         const ConvBiasImpl::NCBKernParam& kern_param,
                         const ConvBiasImpl::NCBKernIndex& ncb_index) {
-    bundle.set(kern_param.workspace_ptr);
     const int group_id = ncb_index.ndrange_id[0];
     int fh = kern_param.filter_meta.spatial[0];
     int fw = kern_param.filter_meta.spatial[1];
@@ -124,7 +124,7 @@ static void pack_weight(WorkspaceBundle bundle,
 }
 template <size_t filter_size, BiasMode bias_mode, typename Op, size_t stride>
-static void do_conv_kern(WorkspaceBundle bundle,
+static void do_conv_kern(const WorkspaceBundle& bundle,
                          const ConvBiasImpl::NCBKernParam& kern_param,
                          const ConvBiasImpl::NCBKernIndex& ncb_index,
                          const CpuNDRange&, const CpuNDRange&) {
@@ -144,7 +144,6 @@ static void do_conv_kern(WorkspaceBundle bundle,
     int oh2 = 0;
     int ow2 = 0;
     get_rectified_size(kern_param, ih2, iw2, oh2, ow2);
-    bundle.set(kern_param.workspace_ptr);
     constexpr int pack_c = 4;
     const int batch_id = ncb_index.ndrange_id[0];
@@ -220,7 +219,7 @@ ConvBiasImpl::AlgoF32DirectNCHWNCHW44::dispatch_kerns(
     auto fm = param.filter_meta;
     const int batch = param.n;
     const int group = fm.group;
-    WorkspaceBundle wbundle = get_bundle(param);
+    WorkspaceBundle bundle = get_bundle(param);
     conv_fun do_conv_fun = nullptr;
     // NOTE: remain_w is not used to gen hash of midout for compatible with
     // shape runtime
@@ -301,11 +300,11 @@ ConvBiasImpl::AlgoF32DirectNCHWNCHW44::dispatch_kerns(
     megdnn_assert(do_conv_fun);
     SmallVector<ConvBiasImpl::NCBKern> ret_kerns;
-    WorkspaceBundle bundle = wbundle;
     int oh = param.osz[0];
     int oh_block = block_helper(param.nr_threads, oh, 0);
     auto do_pack_weight = [bundle](const NCBKernParam& kern_param,
-                                   const NCBKernIndex& ncb_index) {
+                                   const NCBKernIndex& ncb_index) mutable {
+        bundle.set(kern_param.workspace_ptr);
         pack_weight(bundle, kern_param, ncb_index);
     };
     ret_kerns.push_back({do_pack_weight, {static_cast<size_t>(group)}});
@@ -314,7 +313,8 @@ ConvBiasImpl::AlgoF32DirectNCHWNCHW44::dispatch_kerns(
                             static_cast<size_t>(div_ceil(oh, oh_block))};
     auto do_conv = [bundle, do_conv_fun, ncb_range](
                            const NCBKernParam& kern_param,
-                           const NCBKernIndex& ncb_index) {
+                           const NCBKernIndex& ncb_index) mutable {
+        bundle.set(kern_param.workspace_ptr);
         do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id,
                     ncb_range);
     };
@@ -76,7 +76,7 @@ WorkspaceBundle stride1::get_bundle(
 //! compute one output channel
 template <bool quantized, size_t filter, BiasMode bias_mode, typename Op>
-void stride1::do_conv_kern(WorkspaceBundle bundle,
+void stride1::do_conv_kern(const WorkspaceBundle& bundle,
                            const NCBKernParam& kern_param,
                            const NCBKernIndex& ncb_index) {
     size_t PH = kern_param.filter_meta.padding[0];
@@ -100,7 +100,6 @@ void stride1::do_conv_kern(WorkspaceBundle bundle,
     size_t thread_id = ncb_index.thread_id, batch_id = ncb_index.ndrange_id[0];
     size_t group_id = ncb_index.ndrange_id[1];
-    bundle.set(kern_param.workspace_ptr);
     int8_t* padding_src = static_cast<int8_t*>(bundle.get(thread_id));
     const int8_t* sptr =
             kern_param.src<dt_int8>(batch_id, group_id, 0, pack_group_size);
@@ -210,7 +209,8 @@ SmallVector<ConvBiasImpl::NCBKern> stride1::get_kimpls(
     SmallVector<ConvBiasImpl::NCBKern> ret_kerns;
     auto exec_one_group = [wbundle, do_conv_fun](
                                   const NCBKernParam& kern_param,
-                                  const NCBKernIndex& ncb_index) {
+                                  const NCBKernIndex& ncb_index) mutable {
+        wbundle.set(kern_param.workspace_ptr);
         do_conv_fun(wbundle, kern_param, ncb_index);
     };
     ret_kerns.push_back({exec_one_group, {N, group}});
@@ -253,7 +253,7 @@ WorkspaceBundle stride2::get_bundle(
 //! compute one output channel
 template <bool quantized, size_t filter, BiasMode bias_mode, typename Op>
-void stride2::do_conv_kern(WorkspaceBundle bundle,
+void stride2::do_conv_kern(const WorkspaceBundle& bundle,
                            const NCBKernParam& kern_param,
                            const NCBKernIndex& ncb_index) {
     size_t PH = kern_param.filter_meta.padding[0];
@@ -277,7 +277,6 @@ void stride2::do_conv_kern(WorkspaceBundle bundle,
     size_t thread_id = ncb_index.thread_id, batch_id = ncb_index.ndrange_id[0];
     size_t group_id = ncb_index.ndrange_id[1];
-    bundle.set(kern_param.workspace_ptr);
     int8_t* padding_src = static_cast<int8_t*>(bundle.get(thread_id));
     const int8_t* sptr =
             kern_param.src<dt_int8>(batch_id, group_id, 0, pack_group_size);
@@ -325,7 +324,8 @@ SmallVector<ConvBiasImpl::NCBKern> stride2::get_kimpls(
     SmallVector<ConvBiasImpl::NCBKern> ret_kerns;
     auto exec_one_group = [wbundle, do_conv_fun](
                                   const NCBKernParam& kern_param,
-                                  const NCBKernIndex& ncb_index) {
+                                  const NCBKernIndex& ncb_index) mutable {
+        wbundle.set(kern_param.workspace_ptr);
         do_conv_fun(wbundle, kern_param, ncb_index);
     };
     ret_kerns.push_back({exec_one_group, {N, group}});
@@ -21,7 +21,7 @@ using NCBKernSizeParam = fallback::ConvBiasImpl::NCBKernSizeParam;
 using NCBKernParam = fallback::ConvBiasImpl::NCBKernParam;
 using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex;
-using conv_fun = std::function<void(WorkspaceBundle bundle,
+using conv_fun = std::function<void(const WorkspaceBundle& bundle,
                                     const NCBKernParam& kern_param,
                                     const NCBKernIndex& ncb_index)>;
@@ -32,7 +32,7 @@ bool is_available(const NCBKernSizeParam& param);
 WorkspaceBundle get_bundle(const NCBKernSizeParam& param);
 template <bool quantized, size_t filter, BiasMode bias_mode, typename Op>
-void do_conv_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param,
+void do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param,
                   const NCBKernIndex& ncb_index);
 SmallVector<ConvBiasImpl::NCBKern> get_kimpls(const NCBKernSizeParam& param);
@@ -44,7 +44,7 @@ bool is_available(const NCBKernSizeParam& param);
 WorkspaceBundle get_bundle(const NCBKernSizeParam& param);
 template <bool quantized, size_t filter, BiasMode bias_mode, typename Op>
-void do_conv_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param,
+void do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param,
                   const NCBKernIndex& ncb_index);
 SmallVector<ConvBiasImpl::NCBKern> get_kimpls(const NCBKernSizeParam& param);
@@ -24,9 +24,10 @@ using namespace arm_common;
 MIDOUT_DECL(megdnn_arm_common_conv_bias_int8)
-using direct_fun = std::function<void(
-        WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& ncb_param,
-        const ConvBiasImpl::NCBKernIndex& ncb_index)>;
+using direct_fun =
+        std::function<void(const WorkspaceBundle& bundle,
+                           const ConvBiasImpl::NCBKernParam& ncb_param,
+                           const ConvBiasImpl::NCBKernIndex& ncb_index)>;
 namespace {
@@ -71,7 +72,7 @@ static WorkspaceBundle get_bundle(const ConvBiasImpl::NCBKernSizeParam& param) {
 template <typename dst_type, size_t filter_size, BiasMode bias_mode,
           typename Op, int stride>
-static void conv_kern(WorkspaceBundle bundle,
+static void conv_kern(const WorkspaceBundle& bundle,
                       const ConvBiasImpl::NCBKernParam& ncb_param,
                       const ConvBiasImpl::NCBKernIndex& ncb_index) {
     const int OH = ncb_param.osz[0];
@@ -93,7 +94,6 @@ static void conv_kern(WorkspaceBundle bundle,
     constexpr int IC_PACK_SIZE = 4;
     constexpr int OC_PACK_SIZE = 4;
-    bundle.set(ncb_param.workspace_ptr);
     const int batch_id = ncb_index.ndrange_id[0];
     const int group_id = ncb_index.ndrange_id[1];
@@ -326,8 +326,10 @@ ConvBiasImpl::AlgoDotS8Direct_NCHW44::dispatch_kerns(
                                     IC * IW * sizeof(int8_t) * 2);
     size_t oh_tiles = static_cast<size_t>(div_ceil(OH, oh_tile_size));
-    auto do_conv = [wbundle, kernel](const NCBKernParam& ncb_param,
-                                     const NCBKernIndex& ncb_index) {
+    auto do_conv = [wbundle, kernel](
+                           const NCBKernParam& ncb_param,
+                           const NCBKernIndex& ncb_index) mutable {
+        wbundle.set(ncb_param.workspace_ptr);
         kernel(wbundle, ncb_param, std::move(ncb_index));
     };
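`set()` mutates the captured copy, which is why every lambda that now calls `bundle.set(...)` or `wbundle.set(...)` is also marked `mutable`: without it the closure's `operator()` is `const`, and a non-const member function cannot be called on a by-value capture. A tiny stand-alone illustration (toy type, not the megdnn one):

```cpp
// Toy stand-in for a workspace bundle (assumption for illustration only).
struct Bundle {
    void* base = nullptr;
    void set(void* p) { base = p; }  // non-const member function
};

int main() {
    Bundle b;
    void* ws = nullptr;
    // auto bad = [b](void* p) { b.set(p); };        // error: operator() is const
    auto ok = [b](void* p) mutable { b.set(p); };    // mutable makes the copy writable
    ok(ws);
}
```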
@@ -23,7 +23,8 @@
 using namespace megdnn;
 using namespace arm_common;
 using conv_fun = std::function<void(
-        WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param,
+        const WorkspaceBundle& bundle,
+        const ConvBiasImpl::NCBKernParam& kern_param,
         const ConvBiasImpl::NCBKernIndex& ncb_index,
         const CpuNDRange& workspace_ids, const CpuNDRange& ncb_range)>;
 MIDOUT_DECL(megdnn_arm_common_conv_bias_int8_nchw44)
@@ -64,7 +65,7 @@ static WorkspaceBundle get_bundle(const ConvBiasImpl::NCBKernSizeParam& param) {
     }
 };
-static void copy_padding_kern(WorkspaceBundle bundle,
+static void copy_padding_kern(const WorkspaceBundle& bundle,
                               const ConvBiasImpl::NCBKernParam& kern_param,
                               const ConvBiasImpl::NCBKernIndex& ncb_index,
                               const CpuNDRange& workspace_ids) {
@@ -78,7 +79,6 @@ static void copy_padding_kern(WorkspaceBundle bundle,
     int IH2, IW2;
     get_rectified_size(kern_param, IH2, IW2);
     int padding_group_size = IH2 * IW2 * IC;
-    bundle.set(kern_param.workspace_ptr);
     //! Used for get the workspace offset
     constexpr int pack_ic = 4;
     constexpr int expend_element = 4;
@@ -128,7 +128,7 @@ static void copy_padding_kern(WorkspaceBundle bundle,
 template <size_t filter, BiasMode bias_mode, typename Op, int ow_remain,
           typename DstType, int stride>
-static void do_conv_kern(WorkspaceBundle bundle,
+static void do_conv_kern(const WorkspaceBundle& bundle,
                          const ConvBiasImpl::NCBKernParam& kern_param,
                          const ConvBiasImpl::NCBKernIndex& ncb_index,
                          const CpuNDRange& workspace_ids,
@@ -153,7 +153,6 @@ static void do_conv_kern(WorkspaceBundle bundle,
         op = Op(scale_bias, scale_dst);
     }
     size_t padding_group_size = IH2 * IW2 * IC;
-    bundle.set(kern_param.workspace_ptr);
     constexpr size_t pack_c = 4;
     constexpr size_t src_expand_size = 4;
@@ -375,7 +374,6 @@ ConvBiasImpl::AlgoS8DirectNCHW44::dispatch_kerns(
     megdnn_assert(do_conv_fun);
     SmallVector<ConvBiasImpl::NCBKern> ret_kerns;
-    WorkspaceBundle bundle = wbundle;
     constexpr size_t pack_oc = 4;
     size_t oc_step = pack_oc;
@@ -384,28 +382,31 @@ ConvBiasImpl::AlgoS8DirectNCHW44::dispatch_kerns(
     }
     if (group == 1) {
         CpuNDRange ncb_range = {N, group, div_ceil(OC, oc_step)};
-        auto copy_padding = [bundle](const NCBKernParam& kern_param,
-                                     const NCBKernIndex& ncb_index) {
-            copy_padding_kern(bundle, kern_param, ncb_index,
+        auto copy_padding = [wbundle](const NCBKernParam& kern_param,
+                                      const NCBKernIndex& ncb_index) mutable {
+            wbundle.set(kern_param.workspace_ptr);
+            copy_padding_kern(wbundle, kern_param, ncb_index,
                               ncb_index.ndrange_id);
         };
         constexpr size_t pack_ic = 4;
         ret_kerns.push_back({copy_padding, {N, group, div_ceil(IC, pack_ic)}});
-        auto do_conv = [bundle, do_conv_fun, ncb_range](
+        auto do_conv = [wbundle, do_conv_fun, ncb_range](
                                const NCBKernParam& kern_param,
-                               const NCBKernIndex& ncb_index) {
-            do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id,
+                               const NCBKernIndex& ncb_index) mutable {
+            wbundle.set(kern_param.workspace_ptr);
+            do_conv_fun(wbundle, kern_param, ncb_index, ncb_index.ndrange_id,
                         ncb_range);
         };
         ret_kerns.push_back({do_conv, ncb_range});
     } else {
         CpuNDRange ncb_range = {N, group, 1};
-        auto do_conv = [bundle, do_conv_fun, ncb_range](
+        auto do_conv = [wbundle, do_conv_fun, ncb_range](
                                const NCBKernParam& kern_param,
-                               const NCBKernIndex& ncb_index) {
-            copy_padding_kern(bundle, kern_param, ncb_index,
+                               const NCBKernIndex& ncb_index) mutable {
+            wbundle.set(kern_param.workspace_ptr);
+            copy_padding_kern(wbundle, kern_param, ncb_index,
                               {0, ncb_index.thread_id, 0});
-            do_conv_fun(bundle, kern_param, ncb_index,
+            do_conv_fun(wbundle, kern_param, ncb_index,
                         {0, ncb_index.thread_id, 0}, ncb_range);
         };
         ret_kerns.push_back({do_conv, ncb_range});
@@ -22,7 +22,8 @@ | |||||
using namespace megdnn; | using namespace megdnn; | ||||
using namespace arm_common; | using namespace arm_common; | ||||
using conv_fun = std::function<void( | using conv_fun = std::function<void( | ||||
WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param, | |||||
const WorkspaceBundle& bundle, | |||||
const ConvBiasImpl::NCBKernParam& kern_param, | |||||
const ConvBiasImpl::NCBKernIndex& ncb_index, | const ConvBiasImpl::NCBKernIndex& ncb_index, | ||||
const CpuNDRange& workspace_ids, const CpuNDRange& ncb_range)>; | const CpuNDRange& workspace_ids, const CpuNDRange& ncb_range)>; | ||||
MIDOUT_DECL(megdnn_arm_common_conv_bias_int8_nchw_nchw44) | MIDOUT_DECL(megdnn_arm_common_conv_bias_int8_nchw_nchw44) | ||||
@@ -77,7 +78,7 @@ static WorkspaceBundle get_bundle(const ConvBiasImpl::NCBKernSizeParam& param) { | |||||
return {nullptr, {src_size, weight_size, tmp_size * param.nr_threads}}; | return {nullptr, {src_size, weight_size, tmp_size * param.nr_threads}}; | ||||
}; | }; | ||||
static void copy_padding_kern(WorkspaceBundle bundle, | |||||
static void copy_padding_kern(const WorkspaceBundle& bundle, | |||||
const ConvBiasImpl::NCBKernParam& kern_param, | const ConvBiasImpl::NCBKernParam& kern_param, | ||||
const ConvBiasImpl::NCBKernIndex& ncb_index, | const ConvBiasImpl::NCBKernIndex& ncb_index, | ||||
const CpuNDRange& workspace_ids) { | const CpuNDRange& workspace_ids) { | ||||
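For context on why the dispatch lambdas must call `bundle.set(...)` at all: `get_bundle` above returns a bundle with a `nullptr` base and only the slot sizes, with the temporary slot scaled by `param.nr_threads` so each worker gets its own scratch region. A rough sketch of that lifecycle follows, using hypothetical stand-in types rather than the real WorkspaceBundle API:

```cpp
#include <cstddef>
#include <cstdio>
#include <vector>

struct Bundle {
    void* base = nullptr;
    std::vector<size_t> sizes;                   // bytes per slot
    void set(void* p) { base = p; }
    void* get(size_t slot) const {               // start address of a slot
        size_t off = 0;
        for (size_t i = 0; i < slot; ++i) off += sizes[i];
        return static_cast<char*>(base) + off;
    }
    size_t total() const {
        size_t s = 0;
        for (size_t v : sizes) s += v;
        return s;
    }
};

int main() {
    const size_t src_size = 256, weight_size = 128, tmp_size = 64, nr_threads = 4;

    // Planning time: sizes only, no memory yet (mirrors `return {nullptr, {...}}`).
    Bundle bundle;
    bundle.sizes = {src_size, weight_size, tmp_size * nr_threads};

    // Execution time: the framework hands over one workspace block.
    std::vector<char> workspace(bundle.total());
    bundle.set(workspace.data());

    // Each thread indexes its private chunk inside the shared temp slot.
    for (size_t thread_id = 0; thread_id < nr_threads; ++thread_id) {
        void* thread_tmp = static_cast<char*>(bundle.get(2)) + thread_id * tmp_size;
        std::printf("thread %zu tmp at %p\n", thread_id, thread_tmp);
    }
    return 0;
}
```

The per-thread offset here mirrors the `tmp_size * param.nr_threads` sizing in the hunk above: the slot is allocated once, and each kernel invocation carves out its own region by thread id.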
@@ -92,7 +93,6 @@ static void copy_padding_kern(WorkspaceBundle bundle, | |||||
int ih2, iw2, oh2, ow2; | int ih2, iw2, oh2, ow2; | ||||
get_rectified_size(kern_param, ih2, iw2, oh2, ow2); | get_rectified_size(kern_param, ih2, iw2, oh2, ow2); | ||||
int padding_group_size = ih2 * iw2 * ic; | int padding_group_size = ih2 * iw2 * ic; | ||||
bundle.set(kern_param.workspace_ptr); | |||||
//! Used to get the workspace offset | //! Used to get the workspace offset | ||||
const int src_expand = stride_h == 2 ? 4 : 16; | const int src_expand = stride_h == 2 ? 4 : 16; | ||||
@@ -124,10 +124,9 @@ static void copy_padding_kern(WorkspaceBundle bundle, | |||||
iw, iw2, pw, nullptr); | iw, iw2, pw, nullptr); | ||||
} | } | ||||
} | } | ||||
static void pack_weight(WorkspaceBundle bundle, | |||||
static void pack_weight(const WorkspaceBundle& bundle, | |||||
const ConvBiasImpl::NCBKernParam& kern_param, | const ConvBiasImpl::NCBKernParam& kern_param, | ||||
const ConvBiasImpl::NCBKernIndex& ncb_index) { | const ConvBiasImpl::NCBKernIndex& ncb_index) { | ||||
bundle.set(kern_param.workspace_ptr); | |||||
const int group_id = ncb_index.ndrange_id[0]; | const int group_id = ncb_index.ndrange_id[0]; | ||||
int fh = kern_param.filter_meta.spatial[0]; | int fh = kern_param.filter_meta.spatial[0]; | ||||
int fw = kern_param.filter_meta.spatial[1]; | int fw = kern_param.filter_meta.spatial[1]; | ||||
@@ -151,7 +150,7 @@ static void pack_weight(WorkspaceBundle bundle, | |||||
} | } | ||||
} | } | ||||
template <size_t filter, BiasMode bias_mode, typename Op, int stride> | template <size_t filter, BiasMode bias_mode, typename Op, int stride> | ||||
static void do_conv_kern(WorkspaceBundle bundle, | |||||
static void do_conv_kern(const WorkspaceBundle& bundle, | |||||
const ConvBiasImpl::NCBKernParam& kern_param, | const ConvBiasImpl::NCBKernParam& kern_param, | ||||
const ConvBiasImpl::NCBKernIndex& ncb_index, | const ConvBiasImpl::NCBKernIndex& ncb_index, | ||||
const CpuNDRange& workspace_ids, | const CpuNDRange& workspace_ids, | ||||
@@ -177,7 +176,6 @@ static void do_conv_kern(WorkspaceBundle bundle, | |||||
op = Op(scale_bias, scale_dst); | op = Op(scale_bias, scale_dst); | ||||
} | } | ||||
int padding_group_size = ih2 * iw2 * ic; | int padding_group_size = ih2 * iw2 * ic; | ||||
bundle.set(kern_param.workspace_ptr); | |||||
constexpr int pack_c = 4; | constexpr int pack_c = 4; | ||||
constexpr int src_expand_size = stride == 2 ? 4 : 16; | constexpr int src_expand_size = stride == 2 ? 4 : 16; | ||||
@@ -258,7 +256,7 @@ ConvBiasImpl::AlgoS8DirectNCHWNCHW44::dispatch_kerns( | |||||
size_t N = param.n; | size_t N = param.n; | ||||
size_t OC = fm.ocpg; | size_t OC = fm.ocpg; | ||||
size_t group = fm.group; | size_t group = fm.group; | ||||
WorkspaceBundle wbundle = get_bundle(param); | |||||
WorkspaceBundle bundle = get_bundle(param); | |||||
conv_fun do_conv_fun = nullptr; | conv_fun do_conv_fun = nullptr; | ||||
// NOTE: remain_w is not used to generate the midout hash, to stay | // NOTE: remain_w is not used to generate the midout hash, to stay | ||||
// compatible with shapes that change at runtime | // compatible with shapes that change at runtime | ||||
@@ -342,18 +340,19 @@ ConvBiasImpl::AlgoS8DirectNCHWNCHW44::dispatch_kerns( | |||||
megdnn_assert(do_conv_fun); | megdnn_assert(do_conv_fun); | ||||
SmallVector<ConvBiasImpl::NCBKern> ret_kerns; | SmallVector<ConvBiasImpl::NCBKern> ret_kerns; | ||||
WorkspaceBundle bundle = wbundle; | |||||
constexpr size_t pack_oc = 8; | constexpr size_t pack_oc = 8; | ||||
size_t oc_step = pack_oc; | size_t oc_step = pack_oc; | ||||
auto copy_padding = [bundle](const NCBKernParam& kern_param, | auto copy_padding = [bundle](const NCBKernParam& kern_param, | ||||
const NCBKernIndex& ncb_index) { | |||||
const NCBKernIndex& ncb_index) mutable { | |||||
bundle.set(kern_param.workspace_ptr); | |||||
copy_padding_kern(bundle, kern_param, ncb_index, ncb_index.ndrange_id); | copy_padding_kern(bundle, kern_param, ncb_index, ncb_index.ndrange_id); | ||||
}; | }; | ||||
ret_kerns.push_back({copy_padding, {N, group, fm.icpg}}); | ret_kerns.push_back({copy_padding, {N, group, fm.icpg}}); | ||||
auto do_pack_weight = [bundle](const NCBKernParam& kern_param, | auto do_pack_weight = [bundle](const NCBKernParam& kern_param, | ||||
const NCBKernIndex& ncb_index) { | |||||
const NCBKernIndex& ncb_index) mutable { | |||||
bundle.set(kern_param.workspace_ptr); | |||||
pack_weight(bundle, kern_param, ncb_index); | pack_weight(bundle, kern_param, ncb_index); | ||||
}; | }; | ||||
ret_kerns.push_back({do_pack_weight, {static_cast<size_t>(group)}}); | ret_kerns.push_back({do_pack_weight, {static_cast<size_t>(group)}}); | ||||
@@ -361,7 +360,8 @@ ConvBiasImpl::AlgoS8DirectNCHWNCHW44::dispatch_kerns( | |||||
CpuNDRange ncb_range = {N, group, div_ceil(OC, oc_step)}; | CpuNDRange ncb_range = {N, group, div_ceil(OC, oc_step)}; | ||||
auto do_conv = [bundle, do_conv_fun, ncb_range]( | auto do_conv = [bundle, do_conv_fun, ncb_range]( | ||||
const NCBKernParam& kern_param, | const NCBKernParam& kern_param, | ||||
const NCBKernIndex& ncb_index) { | |||||
const NCBKernIndex& ncb_index) mutable { | |||||
bundle.set(kern_param.workspace_ptr); | |||||
do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id, | do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id, | ||||
ncb_range); | ncb_range); | ||||
}; | }; | ||||
@@ -22,7 +22,8 @@ | |||||
using namespace megdnn; | using namespace megdnn; | ||||
using namespace arm_common; | using namespace arm_common; | ||||
using conv_fun = std::function<void( | using conv_fun = std::function<void( | ||||
WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param, | |||||
const WorkspaceBundle& bundle, | |||||
const ConvBiasImpl::NCBKernParam& kern_param, | |||||
const ConvBiasImpl::NCBKernIndex& ncb_index, | const ConvBiasImpl::NCBKernIndex& ncb_index, | ||||
const CpuNDRange& workspace_ids, const CpuNDRange& ncb_range)>; | const CpuNDRange& workspace_ids, const CpuNDRange& ncb_range)>; | ||||
MIDOUT_DECL(megdnn_arm_common_conv_bias_int8_nchw44_dot) | MIDOUT_DECL(megdnn_arm_common_conv_bias_int8_nchw44_dot) | ||||
@@ -82,7 +83,7 @@ static WorkspaceBundle get_bundle(const ConvBiasImpl::NCBKernSizeParam& param) { | |||||
temp_size * param.nr_threads}}; | temp_size * param.nr_threads}}; | ||||
}; | }; | ||||
void do_weight_trans(WorkspaceBundle bundle, | |||||
void do_weight_trans(const WorkspaceBundle& bundle, | |||||
const ConvBiasImpl::NCBKernParam& kern_param, | const ConvBiasImpl::NCBKernParam& kern_param, | ||||
const ConvBiasImpl::NCBKernIndex&, const CpuNDRange&) { | const ConvBiasImpl::NCBKernIndex&, const CpuNDRange&) { | ||||
const int ic = kern_param.filter_meta.icpg; | const int ic = kern_param.filter_meta.icpg; | ||||
@@ -90,7 +91,6 @@ void do_weight_trans(WorkspaceBundle bundle, | |||||
const int fh = kern_param.filter_meta.spatial[0]; | const int fh = kern_param.filter_meta.spatial[0]; | ||||
const int fw = kern_param.filter_meta.spatial[1]; | const int fw = kern_param.filter_meta.spatial[1]; | ||||
const int fw2 = round_up(fw, 4); | const int fw2 = round_up(fw, 4); | ||||
bundle.set(kern_param.workspace_ptr); | |||||
auto packed_weight = reinterpret_cast<int8_t*>(bundle.get(1)); | auto packed_weight = reinterpret_cast<int8_t*>(bundle.get(1)); | ||||
auto origin_weight = kern_param.filter<dt_int8>(); | auto origin_weight = kern_param.filter<dt_int8>(); | ||||
pack_weight_int8_nchw_nchw44_dot(packed_weight, origin_weight, oc, ic, fh, | pack_weight_int8_nchw_nchw44_dot(packed_weight, origin_weight, oc, ic, fh, | ||||
@@ -98,7 +98,7 @@ void do_weight_trans(WorkspaceBundle bundle, | |||||
} | } | ||||
template <size_t filter, BiasMode bias_mode, typename Op, int stride> | template <size_t filter, BiasMode bias_mode, typename Op, int stride> | ||||
static void do_conv_kern(WorkspaceBundle bundle, | |||||
static void do_conv_kern(const WorkspaceBundle& bundle, | |||||
const ConvBiasImpl::NCBKernParam& kern_param, | const ConvBiasImpl::NCBKernParam& kern_param, | ||||
const ConvBiasImpl::NCBKernIndex& ncb_index, | const ConvBiasImpl::NCBKernIndex& ncb_index, | ||||
const CpuNDRange&, const CpuNDRange&) { | const CpuNDRange&, const CpuNDRange&) { | ||||
@@ -117,7 +117,6 @@ static void do_conv_kern(WorkspaceBundle bundle, | |||||
int ih2 = 0; | int ih2 = 0; | ||||
int iw2 = 0; | int iw2 = 0; | ||||
get_rectified_size(kern_param, ih2, iw2); | get_rectified_size(kern_param, ih2, iw2); | ||||
bundle.set(kern_param.workspace_ptr); | |||||
constexpr int pack_c = 4; | constexpr int pack_c = 4; | ||||
const int batch_id = ncb_index.ndrange_id[0]; | const int batch_id = ncb_index.ndrange_id[0]; | ||||
@@ -205,7 +204,7 @@ ConvBiasImpl::AlgoDotS8DirectNCHWNCHW44::dispatch_kerns( | |||||
auto fm = param.filter_meta; | auto fm = param.filter_meta; | ||||
const int batch = param.n; | const int batch = param.n; | ||||
const int group = fm.group; | const int group = fm.group; | ||||
WorkspaceBundle wbundle = get_bundle(param); | |||||
WorkspaceBundle bundle = get_bundle(param); | |||||
conv_fun do_conv_fun = nullptr; | conv_fun do_conv_fun = nullptr; | ||||
// NOTE: remain_w is not used to generate the midout hash, to stay | // NOTE: remain_w is not used to generate the midout hash, to stay | ||||
// compatible with shapes that change at runtime | // compatible with shapes that change at runtime | ||||
@@ -288,7 +287,6 @@ ConvBiasImpl::AlgoDotS8DirectNCHWNCHW44::dispatch_kerns( | |||||
megdnn_assert(do_conv_fun); | megdnn_assert(do_conv_fun); | ||||
SmallVector<ConvBiasImpl::NCBKern> ret_kerns; | SmallVector<ConvBiasImpl::NCBKern> ret_kerns; | ||||
WorkspaceBundle bundle = wbundle; | |||||
int oh = param.osz[0]; | int oh = param.osz[0]; | ||||
int ic = param.filter_meta.icpg; | int ic = param.filter_meta.icpg; | ||||
int iw = param.isz[1]; | int iw = param.isz[1]; | ||||
@@ -302,14 +300,16 @@ ConvBiasImpl::AlgoDotS8DirectNCHWNCHW44::dispatch_kerns( | |||||
static_cast<size_t>(div_ceil(oh, oh_block))}; | static_cast<size_t>(div_ceil(oh, oh_block))}; | ||||
auto do_trans_weight = [bundle](const NCBKernParam& kern_param, | auto do_trans_weight = [bundle](const NCBKernParam& kern_param, | ||||
const NCBKernIndex& ncb_index) { | |||||
const NCBKernIndex& ncb_index) mutable { | |||||
bundle.set(kern_param.workspace_ptr); | |||||
do_weight_trans(bundle, kern_param, ncb_index, ncb_index.ndrange_id); | do_weight_trans(bundle, kern_param, ncb_index, ncb_index.ndrange_id); | ||||
}; | }; | ||||
ret_kerns.push_back({do_trans_weight, {1}}); | ret_kerns.push_back({do_trans_weight, {1}}); | ||||
auto do_conv = [bundle, do_conv_fun, ncb_range]( | auto do_conv = [bundle, do_conv_fun, ncb_range]( | ||||
const NCBKernParam& kern_param, | const NCBKernParam& kern_param, | ||||
const NCBKernIndex& ncb_index) { | |||||
const NCBKernIndex& ncb_index) mutable { | |||||
bundle.set(kern_param.workspace_ptr); | |||||
do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id, | do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id, | ||||
ncb_range); | ncb_range); | ||||
}; | }; | ||||
@@ -107,7 +107,8 @@ WorkspaceBundle direct_int8_stride1::get_bundle( | |||||
} | } | ||||
//! Process one input channel copy padding | //! Process one input channel copy padding | ||||
void direct_int8_stride1::copy_padding_kern( | void direct_int8_stride1::copy_padding_kern( | ||||
WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param, | |||||
const WorkspaceBundle& bundle, | |||||
const ConvBiasImpl::NCBKernParam& kern_param, | |||||
const ConvBiasImpl::NCBKernIndex& ncb_index, | const ConvBiasImpl::NCBKernIndex& ncb_index, | ||||
const CpuNDRange& workspace_ids) { | const CpuNDRange& workspace_ids) { | ||||
size_t IH = kern_param.isz[0]; | size_t IH = kern_param.isz[0]; | ||||
@@ -121,7 +122,6 @@ void direct_int8_stride1::copy_padding_kern( | |||||
get_rectified_size(kern_param, IH2, IW2, OH2, OW2); | get_rectified_size(kern_param, IH2, IW2, OH2, OW2); | ||||
bool need_src_copy_var = need_src_copy(kern_param); | bool need_src_copy_var = need_src_copy(kern_param); | ||||
size_t padding_group_size = IH2 * IW2 * IC; | size_t padding_group_size = IH2 * IW2 * IC; | ||||
bundle.set(kern_param.workspace_ptr); | |||||
//! Used to get the workspace offset | //! Used to get the workspace offset | ||||
size_t workspace_group_id = workspace_ids[0], | size_t workspace_group_id = workspace_ids[0], | ||||
workspace_batch_id = workspace_ids[1], channel_id = workspace_ids[2], | workspace_batch_id = workspace_ids[1], channel_id = workspace_ids[2], | ||||
@@ -145,7 +145,7 @@ void direct_int8_stride1::copy_padding_kern( | |||||
}; | }; | ||||
//! compute one output channel | //! compute one output channel | ||||
template <size_t filter, BiasMode bias_mode, typename Op> | template <size_t filter, BiasMode bias_mode, typename Op> | ||||
void direct_int8_stride1::do_conv_kern(WorkspaceBundle bundle, | |||||
void direct_int8_stride1::do_conv_kern(const WorkspaceBundle& bundle, | |||||
const NCBKernParam& kern_param, | const NCBKernParam& kern_param, | ||||
const NCBKernIndex& ncb_index, | const NCBKernIndex& ncb_index, | ||||
const CpuNDRange& workspace_ids) { | const CpuNDRange& workspace_ids) { | ||||
@@ -170,7 +170,6 @@ void direct_int8_stride1::do_conv_kern(WorkspaceBundle bundle, | |||||
op = Op(scale_bias, scale_dst); | op = Op(scale_bias, scale_dst); | ||||
} | } | ||||
size_t padding_group_size = IH2 * IW2 * IC; | size_t padding_group_size = IH2 * IW2 * IC; | ||||
bundle.set(kern_param.workspace_ptr); | |||||
//! Used to get the workspace offset | //! Used to get the workspace offset | ||||
size_t workspace_group_id = workspace_ids[0], | size_t workspace_group_id = workspace_ids[0], | ||||
workspace_batch_id = workspace_ids[1], oc = workspace_ids[2]; | workspace_batch_id = workspace_ids[1], oc = workspace_ids[2]; | ||||
@@ -263,7 +262,7 @@ SmallVector<ConvBiasImpl::NCBKern> direct_int8_stride1::get_kimpls( | |||||
size_t IC = param.filter_meta.icpg; | size_t IC = param.filter_meta.icpg; | ||||
size_t OC = param.filter_meta.ocpg; | size_t OC = param.filter_meta.ocpg; | ||||
size_t group = fm.group; | size_t group = fm.group; | ||||
WorkspaceBundle wbundle = get_bundle(param, m_large_group); | |||||
WorkspaceBundle bundle = get_bundle(param, m_large_group); | |||||
conv_fun do_conv_fun = nullptr; | conv_fun do_conv_fun = nullptr; | ||||
#define DO_CONV_KERN_FUN(filter, bias_mode, op) \ | #define DO_CONV_KERN_FUN(filter, bias_mode, op) \ | ||||
@@ -324,13 +323,13 @@ SmallVector<ConvBiasImpl::NCBKern> direct_int8_stride1::get_kimpls( | |||||
SmallVector<ConvBiasImpl::NCBKern> ret_kerns; | SmallVector<ConvBiasImpl::NCBKern> ret_kerns; | ||||
if (m_large_group) { | if (m_large_group) { | ||||
auto exec_one_group = [wbundle, do_conv_fun]( | |||||
auto exec_one_group = [bundle, do_conv_fun]( | |||||
const NCBKernParam& kern_param, | const NCBKernParam& kern_param, | ||||
const NCBKernIndex& ncb_index) { | |||||
const NCBKernIndex& ncb_index) mutable { | |||||
auto fm = kern_param.filter_meta; | auto fm = kern_param.filter_meta; | ||||
size_t IC = fm.icpg; | size_t IC = fm.icpg; | ||||
size_t OC = fm.ocpg; | size_t OC = fm.ocpg; | ||||
WorkspaceBundle bundle = wbundle; | |||||
bundle.set(kern_param.workspace_ptr); | |||||
for (size_t ic = 0; ic < IC; ic++) { | for (size_t ic = 0; ic < IC; ic++) { | ||||
copy_padding_kern(bundle, kern_param, ncb_index, | copy_padding_kern(bundle, kern_param, ncb_index, | ||||
{ncb_index.thread_id, 0, ic}); | {ncb_index.thread_id, 0, ic}); | ||||
@@ -342,15 +341,17 @@ SmallVector<ConvBiasImpl::NCBKern> direct_int8_stride1::get_kimpls( | |||||
}; | }; | ||||
ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); | ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); | ||||
} else { | } else { | ||||
WorkspaceBundle bundle = wbundle; | |||||
auto copy_padding = [bundle](const NCBKernParam& kern_param, | auto copy_padding = [bundle](const NCBKernParam& kern_param, | ||||
const NCBKernIndex& ncb_index) { | |||||
const NCBKernIndex& ncb_index) mutable { | |||||
bundle.set(kern_param.workspace_ptr); | |||||
copy_padding_kern(bundle, kern_param, ncb_index, | copy_padding_kern(bundle, kern_param, ncb_index, | ||||
ncb_index.ndrange_id); | ncb_index.ndrange_id); | ||||
}; | }; | ||||
ret_kerns.push_back({copy_padding, {group, N, IC}}); | ret_kerns.push_back({copy_padding, {group, N, IC}}); | ||||
auto do_conv = [bundle, do_conv_fun](const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index) { | |||||
auto do_conv = [bundle, do_conv_fun]( | |||||
const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index) mutable { | |||||
bundle.set(kern_param.workspace_ptr); | |||||
do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id); | do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id); | ||||
}; | }; | ||||
ret_kerns.push_back({do_conv, {group, N, OC}}); | ret_kerns.push_back({do_conv, {group, N, OC}}); | ||||
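The get_kimpls hunks switch between two scheduling shapes: with `m_large_group`, a single `exec_one_group` kernel pads over IC and convolves over OC for one (group, batch) pair, so the ND-range is `{group, N, 1}`; otherwise padding and convolution are registered as separate kernels over `{group, N, IC}` and `{group, N, OC}`. The sketch below mimics that split with simplified stand-ins (`Kern`, `Range`, `make_kerns` are assumptions, not the MegDNN interfaces):

```cpp
#include <array>
#include <cstddef>
#include <cstdio>
#include <functional>
#include <vector>

using Range = std::array<size_t, 3>;
struct Kern {
    std::function<void(const Range&)> fn;  // kernel body
    Range range;                           // ND-range it is dispatched over
};

std::vector<Kern> make_kerns(bool large_group, size_t group, size_t N,
                             size_t IC, size_t OC) {
    std::vector<Kern> kerns;
    if (large_group) {
        // One kernel per (group, batch): loop over channels internally.
        kerns.push_back({[IC, OC](const Range& id) {
                             for (size_t ic = 0; ic < IC; ++ic)
                                 std::printf("pad  g=%zu n=%zu ic=%zu\n", id[0], id[1], ic);
                             for (size_t oc = 0; oc < OC; ++oc)
                                 std::printf("conv g=%zu n=%zu oc=%zu\n", id[0], id[1], oc);
                         },
                         {group, N, 1}});
    } else {
        // Separate kernels: channels become part of the ND-range.
        kerns.push_back({[](const Range& id) {
                             std::printf("pad  g=%zu n=%zu ic=%zu\n", id[0], id[1], id[2]);
                         },
                         {group, N, IC}});
        kerns.push_back({[](const Range& id) {
                             std::printf("conv g=%zu n=%zu oc=%zu\n", id[0], id[1], id[2]);
                         },
                         {group, N, OC}});
    }
    return kerns;
}

int main() {
    for (bool large : {true, false})
        std::printf("%zu kern(s) when large_group=%d\n",
                    make_kerns(large, 2, 1, 4, 8).size(), static_cast<int>(large));
    return 0;
}
```

In the small-group path the thread pool can split work at channel granularity, whereas the large-group path keeps a whole group on one thread, as the `{group, N, 1_z}` ranges in the hunks above indicate.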
@@ -21,19 +21,19 @@ using NCBKernParam = fallback::ConvBiasImpl::NCBKernParam; | |||||
using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex; | using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex; | ||||
using conv_fun = std::function<void( | using conv_fun = std::function<void( | ||||
WorkspaceBundle bundle, const NCBKernParam& kern_param, | |||||
const WorkspaceBundle& bundle, const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids)>; | const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids)>; | ||||
bool can_conv_direct_stride1_int8(const NCBKernSizeParam& param); | bool can_conv_direct_stride1_int8(const NCBKernSizeParam& param); | ||||
WorkspaceBundle get_bundle(const NCBKernSizeParam& param, bool m_large_group); | WorkspaceBundle get_bundle(const NCBKernSizeParam& param, bool m_large_group); | ||||
void copy_padding_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param, | |||||
void copy_padding_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index, | const NCBKernIndex& ncb_index, | ||||
const CpuNDRange& workspace_ids); | const CpuNDRange& workspace_ids); | ||||
template <size_t filter, BiasMode bias_mode, typename Op> | template <size_t filter, BiasMode bias_mode, typename Op> | ||||
void do_conv_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param, | |||||
void do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index, | const NCBKernIndex& ncb_index, | ||||
const CpuNDRange& workspace_ids); | const CpuNDRange& workspace_ids); | ||||
@@ -109,7 +109,8 @@ WorkspaceBundle direct_dotprod_int8_stride1::get_bundle( | |||||
} | } | ||||
//! Process one input channel copy padding | //! Process one input channel copy padding | ||||
void direct_dotprod_int8_stride1::copy_padding_kern( | void direct_dotprod_int8_stride1::copy_padding_kern( | ||||
WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param, | |||||
const WorkspaceBundle& bundle, | |||||
const ConvBiasImpl::NCBKernParam& kern_param, | |||||
const ConvBiasImpl::NCBKernIndex& ncb_index, | const ConvBiasImpl::NCBKernIndex& ncb_index, | ||||
const CpuNDRange& workspace_ids) { | const CpuNDRange& workspace_ids) { | ||||
size_t IH = kern_param.isz[0]; | size_t IH = kern_param.isz[0]; | ||||
@@ -123,7 +124,6 @@ void direct_dotprod_int8_stride1::copy_padding_kern( | |||||
get_rectified_size(kern_param, IH2, IW2, OH2, OW2); | get_rectified_size(kern_param, IH2, IW2, OH2, OW2); | ||||
bool need_src_copy_var = need_src_copy(kern_param); | bool need_src_copy_var = need_src_copy(kern_param); | ||||
size_t padding_group_size = IH2 * IW2 * IC; | size_t padding_group_size = IH2 * IW2 * IC; | ||||
bundle.set(kern_param.workspace_ptr); | |||||
//! Used to get the workspace offset | //! Used to get the workspace offset | ||||
size_t group_id = ncb_index.ndrange_id[0], | size_t group_id = ncb_index.ndrange_id[0], | ||||
@@ -148,7 +148,7 @@ void direct_dotprod_int8_stride1::copy_padding_kern( | |||||
//! compute one output channel | //! compute one output channel | ||||
template <size_t filter, BiasMode bias_mode, typename Op> | template <size_t filter, BiasMode bias_mode, typename Op> | ||||
void direct_dotprod_int8_stride1::do_conv_kern( | void direct_dotprod_int8_stride1::do_conv_kern( | ||||
WorkspaceBundle bundle, const NCBKernParam& kern_param, | |||||
const WorkspaceBundle& bundle, const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { | const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { | ||||
size_t OH = kern_param.osz[0]; | size_t OH = kern_param.osz[0]; | ||||
size_t OW = kern_param.osz[1]; | size_t OW = kern_param.osz[1]; | ||||
@@ -172,7 +172,6 @@ void direct_dotprod_int8_stride1::do_conv_kern( | |||||
} | } | ||||
size_t padding_group_size = IH2 * IW2 * IC; | size_t padding_group_size = IH2 * IW2 * IC; | ||||
bundle.set(kern_param.workspace_ptr); | |||||
//! Used to get the workspace offset | //! Used to get the workspace offset | ||||
size_t workspace_group_id = workspace_ids[0], | size_t workspace_group_id = workspace_ids[0], | ||||
workspace_batch_id = workspace_ids[1], oc = workspace_ids[2]; | workspace_batch_id = workspace_ids[1], oc = workspace_ids[2]; | ||||
@@ -264,7 +263,7 @@ SmallVector<ConvBiasImpl::NCBKern> direct_dotprod_int8_stride1::get_kimpls( | |||||
size_t IC = param.filter_meta.icpg; | size_t IC = param.filter_meta.icpg; | ||||
size_t OC = param.filter_meta.ocpg; | size_t OC = param.filter_meta.ocpg; | ||||
size_t group = fm.group; | size_t group = fm.group; | ||||
WorkspaceBundle wbundle = get_bundle(param, m_large_group); | |||||
WorkspaceBundle bundle = get_bundle(param, m_large_group); | |||||
conv_fun do_conv_fun = nullptr; | conv_fun do_conv_fun = nullptr; | ||||
#define DO_CONV_KERN_FUN(filter, bias_mode, op) \ | #define DO_CONV_KERN_FUN(filter, bias_mode, op) \ | ||||
@@ -325,13 +324,13 @@ SmallVector<ConvBiasImpl::NCBKern> direct_dotprod_int8_stride1::get_kimpls( | |||||
SmallVector<ConvBiasImpl::NCBKern> ret_kerns; | SmallVector<ConvBiasImpl::NCBKern> ret_kerns; | ||||
if (m_large_group) { | if (m_large_group) { | ||||
auto exec_one_group = [wbundle, do_conv_fun]( | |||||
auto exec_one_group = [bundle, do_conv_fun]( | |||||
const NCBKernParam& kern_param, | const NCBKernParam& kern_param, | ||||
const NCBKernIndex& ncb_index) { | |||||
const NCBKernIndex& ncb_index) mutable { | |||||
auto fm = kern_param.filter_meta; | auto fm = kern_param.filter_meta; | ||||
size_t IC = fm.icpg; | size_t IC = fm.icpg; | ||||
size_t OC = fm.ocpg; | size_t OC = fm.ocpg; | ||||
WorkspaceBundle bundle = wbundle; | |||||
bundle.set(kern_param.workspace_ptr); | |||||
for (size_t ic = 0; ic < IC; ic++) { | for (size_t ic = 0; ic < IC; ic++) { | ||||
copy_padding_kern(bundle, kern_param, ncb_index, | copy_padding_kern(bundle, kern_param, ncb_index, | ||||
{ncb_index.thread_id, 0, ic}); | {ncb_index.thread_id, 0, ic}); | ||||
@@ -343,15 +342,17 @@ SmallVector<ConvBiasImpl::NCBKern> direct_dotprod_int8_stride1::get_kimpls( | |||||
}; | }; | ||||
ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); | ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); | ||||
} else { | } else { | ||||
WorkspaceBundle bundle = wbundle; | |||||
auto copy_padding = [bundle](const NCBKernParam& kern_param, | auto copy_padding = [bundle](const NCBKernParam& kern_param, | ||||
const NCBKernIndex& ncb_index) { | |||||
const NCBKernIndex& ncb_index) mutable { | |||||
bundle.set(kern_param.workspace_ptr); | |||||
copy_padding_kern(bundle, kern_param, ncb_index, | copy_padding_kern(bundle, kern_param, ncb_index, | ||||
ncb_index.ndrange_id); | ncb_index.ndrange_id); | ||||
}; | }; | ||||
ret_kerns.push_back({copy_padding, {group, N, IC}}); | ret_kerns.push_back({copy_padding, {group, N, IC}}); | ||||
auto do_conv = [bundle, do_conv_fun](const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index) { | |||||
auto do_conv = [bundle, do_conv_fun]( | |||||
const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index) mutable { | |||||
bundle.set(kern_param.workspace_ptr); | |||||
do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id); | do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id); | ||||
}; | }; | ||||
ret_kerns.push_back({do_conv, {group, N, OC}}); | ret_kerns.push_back({do_conv, {group, N, OC}}); | ||||
@@ -20,19 +20,20 @@ using NCBKernParam = fallback::ConvBiasImpl::NCBKernParam; | |||||
using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex; | using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex; | ||||
using conv_fun = std::function<void( | using conv_fun = std::function<void( | ||||
WorkspaceBundle bundle, const NCBKernParam& kern_param, | |||||
const WorkspaceBundle& bundle, const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids)>; | const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids)>; | ||||
bool can_conv_direct_stride1_int8(const NCBKernSizeParam& param); | bool can_conv_direct_stride1_int8(const NCBKernSizeParam& param); | ||||
WorkspaceBundle get_bundle(const NCBKernSizeParam& param, bool m_large_group); | WorkspaceBundle get_bundle(const NCBKernSizeParam& param, bool m_large_group); | ||||
void copy_padding_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param, | |||||
void copy_padding_kern(const WorkspaceBundle& bundle, | |||||
const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index, | const NCBKernIndex& ncb_index, | ||||
const CpuNDRange& workspace_ids); | const CpuNDRange& workspace_ids); | ||||
template <size_t filter, BiasMode bias_mode, typename Op> | template <size_t filter, BiasMode bias_mode, typename Op> | ||||
void do_conv_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param, | |||||
void do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index, | const NCBKernIndex& ncb_index, | ||||
const CpuNDRange& workspace_ids); | const CpuNDRange& workspace_ids); | ||||
@@ -115,7 +115,8 @@ WorkspaceBundle direct_int8_stride2::get_bundle( | |||||
} | } | ||||
//! Process one input channel copy padding | //! Process one input channel copy padding | ||||
void direct_int8_stride2::copy_padding_kern( | void direct_int8_stride2::copy_padding_kern( | ||||
WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param, | |||||
const WorkspaceBundle& bundle, | |||||
const ConvBiasImpl::NCBKernParam& kern_param, | |||||
const ConvBiasImpl::NCBKernIndex& ncb_index, | const ConvBiasImpl::NCBKernIndex& ncb_index, | ||||
const CpuNDRange& workspace_ids) { | const CpuNDRange& workspace_ids) { | ||||
size_t IH = kern_param.isz[0]; | size_t IH = kern_param.isz[0]; | ||||
@@ -129,7 +130,6 @@ void direct_int8_stride2::copy_padding_kern( | |||||
get_rectified_size(kern_param, IH2, IW2, OH2, OW2); | get_rectified_size(kern_param, IH2, IW2, OH2, OW2); | ||||
bool need_src_copy_var = need_src_copy(kern_param); | bool need_src_copy_var = need_src_copy(kern_param); | ||||
size_t padding_group_size = IH2 * IW2 * IC; | size_t padding_group_size = IH2 * IW2 * IC; | ||||
bundle.set(kern_param.workspace_ptr); | |||||
//! Used to get the workspace offset | //! Used to get the workspace offset | ||||
size_t group_id = ncb_index.ndrange_id[0], | size_t group_id = ncb_index.ndrange_id[0], | ||||
@@ -153,7 +153,7 @@ void direct_int8_stride2::copy_padding_kern( | |||||
}; | }; | ||||
//! compute one output channel | //! compute one output channel | ||||
template <size_t filter, BiasMode bias_mode, typename Op> | template <size_t filter, BiasMode bias_mode, typename Op> | ||||
void direct_int8_stride2::do_conv_kern(WorkspaceBundle bundle, | |||||
void direct_int8_stride2::do_conv_kern(const WorkspaceBundle& bundle, | |||||
const NCBKernParam& kern_param, | const NCBKernParam& kern_param, | ||||
const NCBKernIndex& ncb_index, | const NCBKernIndex& ncb_index, | ||||
const CpuNDRange& workspace_ids) { | const CpuNDRange& workspace_ids) { | ||||
@@ -178,7 +178,6 @@ void direct_int8_stride2::do_conv_kern(WorkspaceBundle bundle, | |||||
op = Op(scale_bias, scale_dst); | op = Op(scale_bias, scale_dst); | ||||
} | } | ||||
size_t padding_group_size = IH2 * IW2 * IC; | size_t padding_group_size = IH2 * IW2 * IC; | ||||
bundle.set(kern_param.workspace_ptr); | |||||
//! Used to get the workspace offset | //! Used to get the workspace offset | ||||
size_t workspace_group_id = workspace_ids[0], | size_t workspace_group_id = workspace_ids[0], | ||||
workspace_batch_id = workspace_ids[1], oc = workspace_ids[2]; | workspace_batch_id = workspace_ids[1], oc = workspace_ids[2]; | ||||
@@ -270,7 +269,7 @@ SmallVector<ConvBiasImpl::NCBKern> direct_int8_stride2::get_kimpls( | |||||
size_t IC = param.filter_meta.icpg; | size_t IC = param.filter_meta.icpg; | ||||
size_t OC = param.filter_meta.ocpg; | size_t OC = param.filter_meta.ocpg; | ||||
size_t group = fm.group; | size_t group = fm.group; | ||||
WorkspaceBundle wbundle = get_bundle(param, m_large_group); | |||||
WorkspaceBundle bundle = get_bundle(param, m_large_group); | |||||
conv_fun do_conv_fun = nullptr; | conv_fun do_conv_fun = nullptr; | ||||
#define DO_CONV_KERN_FUN(filter, bias_mode, op) \ | #define DO_CONV_KERN_FUN(filter, bias_mode, op) \ | ||||
@@ -331,13 +330,13 @@ SmallVector<ConvBiasImpl::NCBKern> direct_int8_stride2::get_kimpls( | |||||
SmallVector<ConvBiasImpl::NCBKern> ret_kerns; | SmallVector<ConvBiasImpl::NCBKern> ret_kerns; | ||||
if (m_large_group) { | if (m_large_group) { | ||||
auto exec_one_group = [wbundle, do_conv_fun]( | |||||
auto exec_one_group = [bundle, do_conv_fun]( | |||||
const NCBKernParam& kern_param, | const NCBKernParam& kern_param, | ||||
const NCBKernIndex& ncb_index) { | |||||
const NCBKernIndex& ncb_index) mutable { | |||||
auto fm = kern_param.filter_meta; | auto fm = kern_param.filter_meta; | ||||
size_t IC = fm.icpg; | size_t IC = fm.icpg; | ||||
size_t OC = fm.ocpg; | size_t OC = fm.ocpg; | ||||
WorkspaceBundle bundle = wbundle; | |||||
bundle.set(kern_param.workspace_ptr); | |||||
for (size_t ic = 0; ic < IC; ic++) { | for (size_t ic = 0; ic < IC; ic++) { | ||||
copy_padding_kern(bundle, kern_param, ncb_index, | copy_padding_kern(bundle, kern_param, ncb_index, | ||||
{ncb_index.thread_id, 0, ic}); | {ncb_index.thread_id, 0, ic}); | ||||
@@ -349,15 +348,17 @@ SmallVector<ConvBiasImpl::NCBKern> direct_int8_stride2::get_kimpls( | |||||
}; | }; | ||||
ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); | ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); | ||||
} else { | } else { | ||||
WorkspaceBundle bundle = wbundle; | |||||
auto copy_padding = [bundle](const NCBKernParam& kern_param, | auto copy_padding = [bundle](const NCBKernParam& kern_param, | ||||
const NCBKernIndex& ncb_index) { | |||||
const NCBKernIndex& ncb_index) mutable { | |||||
bundle.set(kern_param.workspace_ptr); | |||||
copy_padding_kern(bundle, kern_param, ncb_index, | copy_padding_kern(bundle, kern_param, ncb_index, | ||||
ncb_index.ndrange_id); | ncb_index.ndrange_id); | ||||
}; | }; | ||||
ret_kerns.push_back({copy_padding, {group, N, IC}}); | ret_kerns.push_back({copy_padding, {group, N, IC}}); | ||||
auto do_conv = [bundle, do_conv_fun](const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index) { | |||||
auto do_conv = [bundle, do_conv_fun]( | |||||
const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index) mutable { | |||||
bundle.set(kern_param.workspace_ptr); | |||||
do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id); | do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id); | ||||
}; | }; | ||||
ret_kerns.push_back({do_conv, {group, N, OC}}); | ret_kerns.push_back({do_conv, {group, N, OC}}); | ||||
@@ -21,18 +21,19 @@ using NCBKernParam = fallback::ConvBiasImpl::NCBKernParam; | |||||
using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex; | using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex; | ||||
using conv_fun = std::function<void( | using conv_fun = std::function<void( | ||||
WorkspaceBundle bundle, const NCBKernParam& kern_param, | |||||
const WorkspaceBundle& bundle, const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids)>; | const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids)>; | ||||
bool can_conv_direct_stride2_int8(const NCBKernSizeParam& param); | bool can_conv_direct_stride2_int8(const NCBKernSizeParam& param); | ||||
WorkspaceBundle get_bundle(const NCBKernSizeParam& param, bool m_large_group); | WorkspaceBundle get_bundle(const NCBKernSizeParam& param, bool m_large_group); | ||||
void copy_padding_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param, | |||||
void copy_padding_kern(const WorkspaceBundle& bundle, | |||||
const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index, | const NCBKernIndex& ncb_index, | ||||
const CpuNDRange& workspace_ids); | const CpuNDRange& workspace_ids); | ||||
template <size_t filter, BiasMode bias_mode, typename Op> | template <size_t filter, BiasMode bias_mode, typename Op> | ||||
void do_conv_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param, | |||||
void do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index, | const NCBKernIndex& ncb_index, | ||||
const CpuNDRange& workspace_ids); | const CpuNDRange& workspace_ids); | ||||
@@ -116,7 +116,8 @@ WorkspaceBundle direct_dotprod_int8_stride2::get_bundle( | |||||
} | } | ||||
//! Process one input channel copy padding | //! Process one input channel copy padding | ||||
void direct_dotprod_int8_stride2::copy_padding_kern( | void direct_dotprod_int8_stride2::copy_padding_kern( | ||||
WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param, | |||||
const WorkspaceBundle& bundle, | |||||
const ConvBiasImpl::NCBKernParam& kern_param, | |||||
const ConvBiasImpl::NCBKernIndex& ncb_index, | const ConvBiasImpl::NCBKernIndex& ncb_index, | ||||
const CpuNDRange& workspace_ids) { | const CpuNDRange& workspace_ids) { | ||||
size_t IH = kern_param.isz[0]; | size_t IH = kern_param.isz[0]; | ||||
@@ -130,7 +131,6 @@ void direct_dotprod_int8_stride2::copy_padding_kern( | |||||
get_rectified_size(kern_param, IH2, IW2, OH2, OW2); | get_rectified_size(kern_param, IH2, IW2, OH2, OW2); | ||||
bool need_src_copy_var = need_src_copy(kern_param); | bool need_src_copy_var = need_src_copy(kern_param); | ||||
size_t padding_group_size = IH2 * IW2 * IC; | size_t padding_group_size = IH2 * IW2 * IC; | ||||
bundle.set(kern_param.workspace_ptr); | |||||
//! Used to get the workspace offset | //! Used to get the workspace offset | ||||
size_t workspace_group_id = workspace_ids[0], | size_t workspace_group_id = workspace_ids[0], | ||||
@@ -154,7 +154,7 @@ void direct_dotprod_int8_stride2::copy_padding_kern( | |||||
//! compute one output channel | //! compute one output channel | ||||
template <size_t filter, BiasMode bias_mode, typename Op> | template <size_t filter, BiasMode bias_mode, typename Op> | ||||
void direct_dotprod_int8_stride2::do_conv_kern( | void direct_dotprod_int8_stride2::do_conv_kern( | ||||
WorkspaceBundle bundle, const NCBKernParam& kern_param, | |||||
const WorkspaceBundle& bundle, const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { | const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { | ||||
size_t OH = kern_param.osz[0]; | size_t OH = kern_param.osz[0]; | ||||
size_t OW = kern_param.osz[1]; | size_t OW = kern_param.osz[1]; | ||||
@@ -177,7 +177,6 @@ void direct_dotprod_int8_stride2::do_conv_kern( | |||||
op = Op(scale_bias, scale_dst); | op = Op(scale_bias, scale_dst); | ||||
} | } | ||||
size_t padding_group_size = IH2 * IW2 * IC; | size_t padding_group_size = IH2 * IW2 * IC; | ||||
bundle.set(kern_param.workspace_ptr); | |||||
//! Used to get the workspace offset | //! Used to get the workspace offset | ||||
size_t workspace_group_id = workspace_ids[0], | size_t workspace_group_id = workspace_ids[0], | ||||
workspace_batch_id = workspace_ids[1], oc = workspace_ids[2]; | workspace_batch_id = workspace_ids[1], oc = workspace_ids[2]; | ||||
@@ -270,7 +269,7 @@ SmallVector<ConvBiasImpl::NCBKern> direct_dotprod_int8_stride2::get_kimpls( | |||||
size_t IC = param.filter_meta.icpg; | size_t IC = param.filter_meta.icpg; | ||||
size_t OC = param.filter_meta.ocpg; | size_t OC = param.filter_meta.ocpg; | ||||
size_t group = fm.group; | size_t group = fm.group; | ||||
WorkspaceBundle wbundle = get_bundle(param, m_large_group); | |||||
WorkspaceBundle bundle = get_bundle(param, m_large_group); | |||||
conv_fun do_conv_fun = nullptr; | conv_fun do_conv_fun = nullptr; | ||||
#define DO_CONV_KERN_FUN(filter, bias_mode, op) \ | #define DO_CONV_KERN_FUN(filter, bias_mode, op) \ | ||||
@@ -331,13 +330,13 @@ SmallVector<ConvBiasImpl::NCBKern> direct_dotprod_int8_stride2::get_kimpls( | |||||
SmallVector<ConvBiasImpl::NCBKern> ret_kerns; | SmallVector<ConvBiasImpl::NCBKern> ret_kerns; | ||||
if (m_large_group) { | if (m_large_group) { | ||||
auto exec_one_group = [wbundle, do_conv_fun]( | |||||
auto exec_one_group = [bundle, do_conv_fun]( | |||||
const NCBKernParam& kern_param, | const NCBKernParam& kern_param, | ||||
const NCBKernIndex& ncb_index) { | |||||
const NCBKernIndex& ncb_index) mutable { | |||||
auto fm = kern_param.filter_meta; | auto fm = kern_param.filter_meta; | ||||
size_t IC = fm.icpg; | size_t IC = fm.icpg; | ||||
size_t OC = fm.ocpg; | size_t OC = fm.ocpg; | ||||
WorkspaceBundle bundle = wbundle; | |||||
bundle.set(kern_param.workspace_ptr); | |||||
for (size_t ic = 0; ic < IC; ic++) { | for (size_t ic = 0; ic < IC; ic++) { | ||||
copy_padding_kern(bundle, kern_param, ncb_index, | copy_padding_kern(bundle, kern_param, ncb_index, | ||||
{ncb_index.thread_id, 0, ic}); | {ncb_index.thread_id, 0, ic}); | ||||
@@ -349,15 +348,17 @@ SmallVector<ConvBiasImpl::NCBKern> direct_dotprod_int8_stride2::get_kimpls( | |||||
}; | }; | ||||
ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); | ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); | ||||
} else { | } else { | ||||
WorkspaceBundle bundle = wbundle; | |||||
auto copy_padding = [bundle](const NCBKernParam& kern_param, | auto copy_padding = [bundle](const NCBKernParam& kern_param, | ||||
const NCBKernIndex& ncb_index) { | |||||
const NCBKernIndex& ncb_index) mutable { | |||||
bundle.set(kern_param.workspace_ptr); | |||||
copy_padding_kern(bundle, kern_param, ncb_index, | copy_padding_kern(bundle, kern_param, ncb_index, | ||||
ncb_index.ndrange_id); | ncb_index.ndrange_id); | ||||
}; | }; | ||||
ret_kerns.push_back({copy_padding, {group, N, IC}}); | ret_kerns.push_back({copy_padding, {group, N, IC}}); | ||||
auto do_conv = [bundle, do_conv_fun](const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index) { | |||||
auto do_conv = [bundle, do_conv_fun]( | |||||
const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index) mutable { | |||||
bundle.set(kern_param.workspace_ptr); | |||||
do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id); | do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id); | ||||
}; | }; | ||||
ret_kerns.push_back({do_conv, {group, N, OC}}); | ret_kerns.push_back({do_conv, {group, N, OC}}); | ||||
@@ -21,19 +21,20 @@ using NCBKernParam = fallback::ConvBiasImpl::NCBKernParam; | |||||
using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex; | using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex; | ||||
using conv_fun = std::function<void( | using conv_fun = std::function<void( | ||||
WorkspaceBundle bundle, const NCBKernParam& kern_param, | |||||
const WorkspaceBundle& bundle, const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids)>; | const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids)>; | ||||
bool can_conv_direct_stride2_int8(const NCBKernSizeParam& param); | bool can_conv_direct_stride2_int8(const NCBKernSizeParam& param); | ||||
WorkspaceBundle get_bundle(const NCBKernSizeParam& param, bool m_large_group); | WorkspaceBundle get_bundle(const NCBKernSizeParam& param, bool m_large_group); | ||||
void copy_padding_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param, | |||||
void copy_padding_kern(const WorkspaceBundle& bundle, | |||||
const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index, | const NCBKernIndex& ncb_index, | ||||
const CpuNDRange& workspace_ids); | const CpuNDRange& workspace_ids); | ||||
template <size_t filter, BiasMode bias_mode, typename Op> | template <size_t filter, BiasMode bias_mode, typename Op> | ||||
void do_conv_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param, | |||||
void do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index, | const NCBKernIndex& ncb_index, | ||||
const CpuNDRange& workspace_ids); | const CpuNDRange& workspace_ids); | ||||
@@ -139,7 +139,8 @@ size_t ConvBiasImpl::AlgoI8x8x16Direct::get_workspace( | |||||
} | } | ||||
//! Process one input channel copy padding | //! Process one input channel copy padding | ||||
void ConvBiasImpl::AlgoI8x8x16Direct::copy_padding_kern( | void ConvBiasImpl::AlgoI8x8x16Direct::copy_padding_kern( | ||||
WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param, | |||||
const WorkspaceBundle& bundle, | |||||
const ConvBiasImpl::NCBKernParam& kern_param, | |||||
const ConvBiasImpl::NCBKernIndex& ncb_index, | const ConvBiasImpl::NCBKernIndex& ncb_index, | ||||
const CpuNDRange& workspace_ids) { | const CpuNDRange& workspace_ids) { | ||||
size_t IH = kern_param.isz[0]; | size_t IH = kern_param.isz[0]; | ||||
@@ -154,7 +155,6 @@ void ConvBiasImpl::AlgoI8x8x16Direct::copy_padding_kern( | |||||
get_rectified_size_str1(IH, IW, OH, OW, PH, PW, IH2, IW2, OH2, OW2); | get_rectified_size_str1(IH, IW, OH, OW, PH, PW, IH2, IW2, OH2, OW2); | ||||
bool need_src_copy_var = need_src_copy_str1(kern_param); | bool need_src_copy_var = need_src_copy_str1(kern_param); | ||||
size_t padding_group_size = IH2 * IW2 * IC; | size_t padding_group_size = IH2 * IW2 * IC; | ||||
bundle.set(kern_param.workspace_ptr); | |||||
//! Used to get the workspace offset | //! Used to get the workspace offset | ||||
size_t workspace_group_id = workspace_ids[0], | size_t workspace_group_id = workspace_ids[0], | ||||
@@ -178,7 +178,7 @@ void ConvBiasImpl::AlgoI8x8x16Direct::copy_padding_kern( | |||||
}; | }; | ||||
//! compute one output channel | //! compute one output channel | ||||
void ConvBiasImpl::AlgoI8x8x16Direct::do_conv_kern( | void ConvBiasImpl::AlgoI8x8x16Direct::do_conv_kern( | ||||
WorkspaceBundle bundle, const NCBKernParam& kern_param, | |||||
const WorkspaceBundle& bundle, const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { | const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { | ||||
size_t OH = kern_param.osz[0]; | size_t OH = kern_param.osz[0]; | ||||
size_t OW = kern_param.osz[1]; | size_t OW = kern_param.osz[1]; | ||||
@@ -214,7 +214,6 @@ void ConvBiasImpl::AlgoI8x8x16Direct::do_conv_kern( | |||||
fun_add_to_dst = conv_bias::conv_direct_5x5_sc_int8_int8_int16<true>; | fun_add_to_dst = conv_bias::conv_direct_5x5_sc_int8_int8_int16<true>; | ||||
} | } | ||||
bundle.set(kern_param.workspace_ptr); | |||||
//! Used to get the workspace offset | //! Used to get the workspace offset | ||||
size_t workspace_group_id = workspace_ids[0], | size_t workspace_group_id = workspace_ids[0], | ||||
workspace_batch_id = workspace_ids[1], oc = workspace_ids[2]; | workspace_batch_id = workspace_ids[1], oc = workspace_ids[2]; | ||||
@@ -256,15 +255,15 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoI8x8x16Direct::get_kimpls( | |||||
size_t IC = param.filter_meta.icpg; | size_t IC = param.filter_meta.icpg; | ||||
size_t OC = param.filter_meta.ocpg; | size_t OC = param.filter_meta.ocpg; | ||||
size_t group = fm.group; | size_t group = fm.group; | ||||
WorkspaceBundle wbundle = get_bundle(param); | |||||
WorkspaceBundle bundle = get_bundle(param); | |||||
SmallVector<NCBKern> ret_kerns; | SmallVector<NCBKern> ret_kerns; | ||||
if (m_large_group) { | if (m_large_group) { | ||||
auto exec_one_group = [wbundle](const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index) { | |||||
auto exec_one_group = [bundle](const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index) mutable { | |||||
auto fm = kern_param.filter_meta; | auto fm = kern_param.filter_meta; | ||||
size_t IC = fm.icpg; | size_t IC = fm.icpg; | ||||
size_t OC = fm.ocpg; | size_t OC = fm.ocpg; | ||||
WorkspaceBundle bundle = wbundle; | |||||
bundle.set(kern_param.workspace_ptr); | |||||
for (size_t ic = 0; ic < IC; ic++) { | for (size_t ic = 0; ic < IC; ic++) { | ||||
copy_padding_kern(bundle, kern_param, ncb_index, | copy_padding_kern(bundle, kern_param, ncb_index, | ||||
{ncb_index.thread_id, 0, ic}); | {ncb_index.thread_id, 0, ic}); | ||||
@@ -276,15 +275,16 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoI8x8x16Direct::get_kimpls( | |||||
}; | }; | ||||
ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); | ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); | ||||
} else { | } else { | ||||
WorkspaceBundle bundle = wbundle; | |||||
auto copy_padding = [bundle](const NCBKernParam& kern_param, | auto copy_padding = [bundle](const NCBKernParam& kern_param, | ||||
const NCBKernIndex& ncb_index) { | |||||
const NCBKernIndex& ncb_index) mutable { | |||||
bundle.set(kern_param.workspace_ptr); | |||||
copy_padding_kern(bundle, kern_param, ncb_index, | copy_padding_kern(bundle, kern_param, ncb_index, | ||||
ncb_index.ndrange_id); | ncb_index.ndrange_id); | ||||
}; | }; | ||||
ret_kerns.push_back({copy_padding, {group, N, IC}}); | ret_kerns.push_back({copy_padding, {group, N, IC}}); | ||||
auto do_conv = [bundle](const NCBKernParam& kern_param, | auto do_conv = [bundle](const NCBKernParam& kern_param, | ||||
const NCBKernIndex& ncb_index) { | |||||
const NCBKernIndex& ncb_index) mutable { | |||||
bundle.set(kern_param.workspace_ptr); | |||||
do_conv_kern(bundle, kern_param, ncb_index, ncb_index.ndrange_id); | do_conv_kern(bundle, kern_param, ncb_index, ncb_index.ndrange_id); | ||||
}; | }; | ||||
ret_kerns.push_back({do_conv, {group, N, OC}}); | ret_kerns.push_back({do_conv, {group, N, OC}}); | ||||
@@ -360,7 +360,8 @@ size_t ConvBiasImpl::AlgoI8x8x16Stride2::get_workspace( | |||||
} | } | ||||
//! Process one input channel copy padding | //! Process one input channel copy padding | ||||
void ConvBiasImpl::AlgoI8x8x16Stride2::copy_padding_kern( | void ConvBiasImpl::AlgoI8x8x16Stride2::copy_padding_kern( | ||||
WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param, | |||||
const WorkspaceBundle& bundle, | |||||
const ConvBiasImpl::NCBKernParam& kern_param, | |||||
const ConvBiasImpl::NCBKernIndex& ncb_index, | const ConvBiasImpl::NCBKernIndex& ncb_index, | ||||
const CpuNDRange& workspace_ids) { | const CpuNDRange& workspace_ids) { | ||||
size_t IH = kern_param.isz[0]; | size_t IH = kern_param.isz[0]; | ||||
@@ -378,7 +379,6 @@ void ConvBiasImpl::AlgoI8x8x16Stride2::copy_padding_kern( | |||||
bool need_src_copy_var = need_src_copy_str2(kern_param); | bool need_src_copy_var = need_src_copy_str2(kern_param); | ||||
size_t padding_group_size = IH2 * IW2 * IC; | size_t padding_group_size = IH2 * IW2 * IC; | ||||
bundle.set(kern_param.workspace_ptr); | |||||
size_t workspace_group_id = workspace_ids[0], | size_t workspace_group_id = workspace_ids[0], | ||||
workspace_batch_id = workspace_ids[1], | workspace_batch_id = workspace_ids[1], | ||||
channel_id = workspace_ids[2]; | channel_id = workspace_ids[2]; | ||||
@@ -400,7 +400,7 @@ void ConvBiasImpl::AlgoI8x8x16Stride2::copy_padding_kern( | |||||
}; | }; | ||||
//! compute one output channel | //! compute one output channel | ||||
void ConvBiasImpl::AlgoI8x8x16Stride2::do_conv_kern( | void ConvBiasImpl::AlgoI8x8x16Stride2::do_conv_kern( | ||||
WorkspaceBundle bundle, const NCBKernParam& kern_param, | |||||
const WorkspaceBundle& bundle, const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { | const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { | ||||
size_t OH = kern_param.osz[0]; | size_t OH = kern_param.osz[0]; | ||||
size_t OW = kern_param.osz[1]; | size_t OW = kern_param.osz[1]; | ||||
@@ -436,7 +436,6 @@ void ConvBiasImpl::AlgoI8x8x16Stride2::do_conv_kern( | |||||
fun_add_to_dst = conv_bias::conv_stride2_5x5_sc_int8_int8_int16<true>; | fun_add_to_dst = conv_bias::conv_stride2_5x5_sc_int8_int8_int16<true>; | ||||
} | } | ||||
bundle.set(kern_param.workspace_ptr); | |||||
//! Used to get the workspace offset | //! Used to get the workspace offset | ||||
size_t workspace_group_id = workspace_ids[0], | size_t workspace_group_id = workspace_ids[0], | ||||
workspace_batch_id = workspace_ids[1], oc = workspace_ids[2]; | workspace_batch_id = workspace_ids[1], oc = workspace_ids[2]; | ||||
@@ -476,15 +475,15 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoI8x8x16Stride2::get_kimpls( | |||||
size_t IC = param.filter_meta.icpg; | size_t IC = param.filter_meta.icpg; | ||||
size_t OC = param.filter_meta.ocpg; | size_t OC = param.filter_meta.ocpg; | ||||
size_t group = fm.group; | size_t group = fm.group; | ||||
WorkspaceBundle wbundle = get_bundle(param); | |||||
WorkspaceBundle bundle = get_bundle(param); | |||||
SmallVector<NCBKern> ret_kerns; | SmallVector<NCBKern> ret_kerns; | ||||
if (m_large_group) { | if (m_large_group) { | ||||
auto exec_one_group = [wbundle](const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index) { | |||||
auto exec_one_group = [bundle](const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index) mutable { | |||||
auto fm = kern_param.filter_meta; | auto fm = kern_param.filter_meta; | ||||
size_t IC = fm.icpg; | size_t IC = fm.icpg; | ||||
size_t OC = fm.ocpg; | size_t OC = fm.ocpg; | ||||
WorkspaceBundle bundle = wbundle; | |||||
bundle.set(kern_param.workspace_ptr); | |||||
for (size_t ic = 0; ic < IC; ic++) { | for (size_t ic = 0; ic < IC; ic++) { | ||||
copy_padding_kern(bundle, kern_param, ncb_index, | copy_padding_kern(bundle, kern_param, ncb_index, | ||||
{ncb_index.thread_id, 0, ic}); | {ncb_index.thread_id, 0, ic}); | ||||
@@ -496,15 +495,16 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoI8x8x16Stride2::get_kimpls( | |||||
}; | }; | ||||
ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); | ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); | ||||
} else { | } else { | ||||
WorkspaceBundle bundle = wbundle; | |||||
auto copy_padding = [bundle](const NCBKernParam& kern_param, | auto copy_padding = [bundle](const NCBKernParam& kern_param, | ||||
const NCBKernIndex& ncb_index) { | |||||
const NCBKernIndex& ncb_index) mutable { | |||||
bundle.set(kern_param.workspace_ptr); | |||||
copy_padding_kern(bundle, kern_param, ncb_index, | copy_padding_kern(bundle, kern_param, ncb_index, | ||||
ncb_index.ndrange_id); | ncb_index.ndrange_id); | ||||
}; | }; | ||||
ret_kerns.push_back({copy_padding, {group, N, IC}}); | ret_kerns.push_back({copy_padding, {group, N, IC}}); | ||||
auto do_conv = [bundle](const NCBKernParam& kern_param, | auto do_conv = [bundle](const NCBKernParam& kern_param, | ||||
const NCBKernIndex& ncb_index) { | |||||
const NCBKernIndex& ncb_index) mutable { | |||||
bundle.set(kern_param.workspace_ptr); | |||||
do_conv_kern(bundle, kern_param, ncb_index, ncb_index.ndrange_id); | do_conv_kern(bundle, kern_param, ncb_index, ncb_index.ndrange_id); | ||||
}; | }; | ||||
ret_kerns.push_back({do_conv, {group, N, OC}}); | ret_kerns.push_back({do_conv, {group, N, OC}}); | ||||
@@ -18,11 +18,11 @@ namespace arm_common { | |||||
class ConvBiasImpl::AlgoI8x8x16Direct final : public AlgoBase { | class ConvBiasImpl::AlgoI8x8x16Direct final : public AlgoBase { | ||||
SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param) const; | SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param) const; | ||||
WorkspaceBundle get_bundle(const NCBKernSizeParam& param) const; | WorkspaceBundle get_bundle(const NCBKernSizeParam& param) const; | ||||
static void copy_padding_kern(WorkspaceBundle bundle, | |||||
static void copy_padding_kern(const WorkspaceBundle& bundle, | |||||
const NCBKernParam& kern_param, | const NCBKernParam& kern_param, | ||||
const NCBKernIndex& ncb_index, | const NCBKernIndex& ncb_index, | ||||
const CpuNDRange& workspace_ids); | const CpuNDRange& workspace_ids); | ||||
static void do_conv_kern(WorkspaceBundle bundle, | |||||
static void do_conv_kern(const WorkspaceBundle& bundle, | |||||
const NCBKernParam& kern_param, | const NCBKernParam& kern_param, | ||||
const NCBKernIndex& ncb_index, | const NCBKernIndex& ncb_index, | ||||
const CpuNDRange& workspace_ids); | const CpuNDRange& workspace_ids); | ||||
@@ -47,11 +47,11 @@ public: | |||||
class ConvBiasImpl::AlgoI8x8x16Stride2 final : public AlgoBase { | class ConvBiasImpl::AlgoI8x8x16Stride2 final : public AlgoBase { | ||||
SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param) const; | SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param) const; | ||||
WorkspaceBundle get_bundle(const NCBKernSizeParam& param) const; | WorkspaceBundle get_bundle(const NCBKernSizeParam& param) const; | ||||
static void copy_padding_kern(WorkspaceBundle bundle, | |||||
static void copy_padding_kern(const WorkspaceBundle& bundle, | |||||
const NCBKernParam& kern_param, | const NCBKernParam& kern_param, | ||||
const NCBKernIndex& ncb_index, | const NCBKernIndex& ncb_index, | ||||
const CpuNDRange& workspace_ids); | const CpuNDRange& workspace_ids); | ||||
static void do_conv_kern(WorkspaceBundle bundle, | |||||
static void do_conv_kern(const WorkspaceBundle& bundle, | |||||
const NCBKernParam& kern_param, | const NCBKernParam& kern_param, | ||||
const NCBKernIndex& ncb_index, | const NCBKernIndex& ncb_index, | ||||
const CpuNDRange& workspace_ids); | const CpuNDRange& workspace_ids); | ||||
@@ -99,7 +99,8 @@ WorkspaceBundle direct_quint8_stride1::get_bundle( | |||||
} | } | ||||
//! Process one input channel copy padding | //! Process one input channel copy padding | ||||
void direct_quint8_stride1::copy_padding_kern( | void direct_quint8_stride1::copy_padding_kern( | ||||
WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param, | |||||
const WorkspaceBundle& bundle, | |||||
const ConvBiasImpl::NCBKernParam& kern_param, | |||||
const ConvBiasImpl::NCBKernIndex& ncb_index, | const ConvBiasImpl::NCBKernIndex& ncb_index, | ||||
const CpuNDRange& workspace_ids) { | const CpuNDRange& workspace_ids) { | ||||
size_t IH = kern_param.isz[0]; | size_t IH = kern_param.isz[0]; | ||||
@@ -114,8 +115,6 @@ void direct_quint8_stride1::copy_padding_kern( | |||||
bool need_src_copy_var = need_src_copy(kern_param); | bool need_src_copy_var = need_src_copy(kern_param); | ||||
size_t padding_group_size = IH2 * IW2 * IC; | size_t padding_group_size = IH2 * IW2 * IC; | ||||
bundle.set(kern_param.workspace_ptr); | |||||
//! Used to get the workspace offset | //! Used to get the workspace offset | ||||
size_t workspace_group_id = workspace_ids[0], | size_t workspace_group_id = workspace_ids[0], | ||||
workspace_batch_id = workspace_ids[1], | workspace_batch_id = workspace_ids[1], | ||||
@@ -142,7 +141,7 @@ void direct_quint8_stride1::copy_padding_kern( | |||||
}; | }; | ||||
//! compute one output channel | //! compute one output channel | ||||
template <size_t filter, BiasMode bias_mode, typename Op> | template <size_t filter, BiasMode bias_mode, typename Op> | ||||
void direct_quint8_stride1::do_conv_kern(WorkspaceBundle bundle, | |||||
void direct_quint8_stride1::do_conv_kern(const WorkspaceBundle& bundle, | |||||
const NCBKernParam& kern_param, | const NCBKernParam& kern_param, | ||||
const NCBKernIndex& ncb_index, | const NCBKernIndex& ncb_index, | ||||
const CpuNDRange& workspace_ids) { | const CpuNDRange& workspace_ids) { | ||||
@@ -180,7 +179,6 @@ void direct_quint8_stride1::do_conv_kern(WorkspaceBundle bundle, | |||||
} | } | ||||
size_t padding_group_size = IH2 * IW2 * IC; | size_t padding_group_size = IH2 * IW2 * IC; | ||||
bundle.set(kern_param.workspace_ptr); | |||||
//! Used for get the workspace offset | //! Used for get the workspace offset | ||||
size_t workspace_group_id = workspace_ids[0], | size_t workspace_group_id = workspace_ids[0], | ||||
workspace_batch_id = workspace_ids[1], oc = workspace_ids[2], | workspace_batch_id = workspace_ids[1], oc = workspace_ids[2], | ||||
@@ -272,7 +270,7 @@ SmallVector<ConvBiasImpl::NCBKern> direct_quint8_stride1::get_kimpls( | |||||
size_t IC = param.filter_meta.icpg; | size_t IC = param.filter_meta.icpg; | ||||
size_t OC = param.filter_meta.ocpg; | size_t OC = param.filter_meta.ocpg; | ||||
size_t group = fm.group; | size_t group = fm.group; | ||||
WorkspaceBundle wbundle = get_bundle(param, m_large_group); | |||||
WorkspaceBundle bundle = get_bundle(param, m_large_group); | |||||
conv_fun do_conv_fun = nullptr; | conv_fun do_conv_fun = nullptr; | ||||
#define DO_CONV_KERN_FUN(filter, bias_mode, op) \ | #define DO_CONV_KERN_FUN(filter, bias_mode, op) \ | ||||
@@ -333,13 +331,13 @@ SmallVector<ConvBiasImpl::NCBKern> direct_quint8_stride1::get_kimpls( | |||||
SmallVector<ConvBiasImpl::NCBKern> ret_kerns; | SmallVector<ConvBiasImpl::NCBKern> ret_kerns; | ||||
if (m_large_group) { | if (m_large_group) { | ||||
auto exec_one_group = [wbundle, do_conv_fun]( | |||||
auto exec_one_group = [bundle, do_conv_fun]( | |||||
const NCBKernParam& kern_param, | const NCBKernParam& kern_param, | ||||
const NCBKernIndex& ncb_index) { | |||||
const NCBKernIndex& ncb_index) mutable { | |||||
auto fm = kern_param.filter_meta; | auto fm = kern_param.filter_meta; | ||||
size_t IC = fm.icpg; | size_t IC = fm.icpg; | ||||
size_t OC = fm.ocpg; | size_t OC = fm.ocpg; | ||||
WorkspaceBundle bundle = wbundle; | |||||
bundle.set(kern_param.workspace_ptr); | |||||
for (size_t ic = 0; ic < IC; ic++) { | for (size_t ic = 0; ic < IC; ic++) { | ||||
copy_padding_kern(bundle, kern_param, ncb_index, | copy_padding_kern(bundle, kern_param, ncb_index, | ||||
{ncb_index.thread_id, 0, ic}); | {ncb_index.thread_id, 0, ic}); | ||||
@@ -351,15 +349,17 @@ SmallVector<ConvBiasImpl::NCBKern> direct_quint8_stride1::get_kimpls( | |||||
}; | }; | ||||
ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); | ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); | ||||
} else { | } else { | ||||
WorkspaceBundle bundle = wbundle; | |||||
auto copy_padding = [bundle](const NCBKernParam& kern_param, | auto copy_padding = [bundle](const NCBKernParam& kern_param, | ||||
const NCBKernIndex& ncb_index) { | |||||
const NCBKernIndex& ncb_index) mutable { | |||||
bundle.set(kern_param.workspace_ptr); | |||||
copy_padding_kern(bundle, kern_param, ncb_index, | copy_padding_kern(bundle, kern_param, ncb_index, | ||||
ncb_index.ndrange_id); | ncb_index.ndrange_id); | ||||
}; | }; | ||||
ret_kerns.push_back({copy_padding, {group, N, IC}}); | ret_kerns.push_back({copy_padding, {group, N, IC}}); | ||||
auto do_conv = [bundle, do_conv_fun](const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index) { | |||||
auto do_conv = [bundle, do_conv_fun]( | |||||
const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index) mutable { | |||||
bundle.set(kern_param.workspace_ptr); | |||||
do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id); | do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id); | ||||
}; | }; | ||||
ret_kerns.push_back({do_conv, {group, N, OC}}); | ret_kerns.push_back({do_conv, {group, N, OC}}); | ||||
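Every get_kimpls hunk in this patch applies the same change: instead of capturing the old wbundle and copying it into a local WorkspaceBundle inside each kernel body, the lambdas now capture bundle by value, are marked mutable, and rebase their private copy with bundle.set(kern_param.workspace_ptr) at the start of every call. The sketch below is a minimal, self-contained illustration of that capture style; Bundle, KernParam and Kern are simplified stand-ins invented for this example, not MegDNN's actual WorkspaceBundle, NCBKernParam or NCBKern types.

    // Minimal stand-in for the capture-by-value + mutable + set() pattern above.
    #include <cstddef>
    #include <cstdio>
    #include <functional>
    #include <vector>

    struct Bundle {
        std::vector<size_t> sizes;   // byte size of each sub-workspace
        void* base = nullptr;        // raw workspace pointer, rebased per call
        void set(void* ptr) { base = ptr; }
        void* get(size_t i) const {  // start of the i-th sub-workspace
            size_t off = 0;
            for (size_t j = 0; j < i; ++j) off += sizes[j];
            return static_cast<char*>(base) + off;
        }
    };

    struct KernParam { void* workspace_ptr; };
    using Kern = std::function<void(const KernParam&)>;

    int main() {
        Bundle bundle{{128, 256}};   // built once, like get_bundle(param)

        // Capture the bundle by value and mark the lambda mutable: each call
        // rebases the lambda's own copy onto the caller-provided workspace,
        // so no extra "WorkspaceBundle bundle = wbundle;" copy is needed.
        Kern do_conv = [bundle](const KernParam& kern_param) mutable {
            bundle.set(kern_param.workspace_ptr);
            std::printf("sub-workspace 1 at %p\n", bundle.get(1));
        };

        std::vector<char> workspace(128 + 256);
        KernParam param{workspace.data()};
        do_conv(param);   // each call may pass a different workspace pointer
    }

The same shape recurs in exec_one_group, copy_padding and do_conv throughout the quint8, dotprod and im2col hunks below.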
@@ -21,19 +21,20 @@ using NCBKernParam = fallback::ConvBiasImpl::NCBKernParam; | |||||
using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex; | using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex; | ||||
using conv_fun = std::function<void( | using conv_fun = std::function<void( | ||||
WorkspaceBundle bundle, const NCBKernParam& kern_param, | |||||
const WorkspaceBundle& bundle, const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids)>; | const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids)>; | ||||
bool can_conv_direct_stride1_quint8(const NCBKernSizeParam& param); | bool can_conv_direct_stride1_quint8(const NCBKernSizeParam& param); | ||||
WorkspaceBundle get_bundle(const NCBKernSizeParam& param, bool m_large_group); | WorkspaceBundle get_bundle(const NCBKernSizeParam& param, bool m_large_group); | ||||
void copy_padding_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param, | |||||
void copy_padding_kern(const WorkspaceBundle& bundle, | |||||
const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index, | const NCBKernIndex& ncb_index, | ||||
const CpuNDRange& workspace_ids); | const CpuNDRange& workspace_ids); | ||||
template <size_t filter, BiasMode bias_mode, typename Op> | template <size_t filter, BiasMode bias_mode, typename Op> | ||||
void do_conv_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param, | |||||
void do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index, | const NCBKernIndex& ncb_index, | ||||
const CpuNDRange& workspace_ids); | const CpuNDRange& workspace_ids); | ||||
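The conv_fun alias and the free copy_padding_kern/do_conv_kern declarations switch to const WorkspaceBundle& together, and that pairing matters: a std::function whose signature takes const T& will still wrap a callee that takes T by value and simply copy at every call, so changing only the alias would not remove the copies. The toy program below makes this visible; Workspace and conv_fun here are hypothetical stand-ins, not the real MegDNN declarations.

    #include <cstdio>
    #include <functional>

    struct Workspace {
        Workspace() = default;
        Workspace(const Workspace&) { std::puts("copied"); }
    };

    void kern_by_value(Workspace) {}        // old style: copies on every call
    void kern_by_cref(const Workspace&) {}  // new style: no copy

    using conv_fun = std::function<void(const Workspace&)>;

    int main() {
        Workspace ws;
        conv_fun f = kern_by_value;  // legal, but...
        f(ws);                        // prints "copied"
        f = kern_by_cref;
        f(ws);                        // no copy
    }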
@@ -101,7 +101,8 @@ WorkspaceBundle direct_dotprod_quint8_stride1::get_bundle( | |||||
} | } | ||||
//! Process one input channel copy padding | //! Process one input channel copy padding | ||||
void direct_dotprod_quint8_stride1::copy_padding_kern( | void direct_dotprod_quint8_stride1::copy_padding_kern( | ||||
WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param, | |||||
const WorkspaceBundle& bundle, | |||||
const ConvBiasImpl::NCBKernParam& kern_param, | |||||
const ConvBiasImpl::NCBKernIndex& ncb_index, | const ConvBiasImpl::NCBKernIndex& ncb_index, | ||||
const CpuNDRange& workspace_ids) { | const CpuNDRange& workspace_ids) { | ||||
size_t IH = kern_param.isz[0]; | size_t IH = kern_param.isz[0]; | ||||
@@ -115,7 +116,6 @@ void direct_dotprod_quint8_stride1::copy_padding_kern( | |||||
get_rectified_size(kern_param, IH2, IW2, OH2, OW2); | get_rectified_size(kern_param, IH2, IW2, OH2, OW2); | ||||
bool need_src_copy_var = need_src_copy(kern_param); | bool need_src_copy_var = need_src_copy(kern_param); | ||||
size_t padding_group_size = IH2 * IW2 * IC; | size_t padding_group_size = IH2 * IW2 * IC; | ||||
bundle.set(kern_param.workspace_ptr); | |||||
//! Used for get the workspace offset | //! Used for get the workspace offset | ||||
size_t workspace_group_id = workspace_ids[0], | size_t workspace_group_id = workspace_ids[0], | ||||
@@ -144,7 +144,7 @@ void direct_dotprod_quint8_stride1::copy_padding_kern( | |||||
//! compute one output channel | //! compute one output channel | ||||
template <size_t filter, BiasMode bias_mode, typename Op> | template <size_t filter, BiasMode bias_mode, typename Op> | ||||
void direct_dotprod_quint8_stride1::do_conv_kern( | void direct_dotprod_quint8_stride1::do_conv_kern( | ||||
WorkspaceBundle bundle, const NCBKernParam& kern_param, | |||||
const WorkspaceBundle& bundle, const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { | const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { | ||||
size_t OH = kern_param.osz[0]; | size_t OH = kern_param.osz[0]; | ||||
size_t OW = kern_param.osz[1]; | size_t OW = kern_param.osz[1]; | ||||
@@ -177,7 +177,6 @@ void direct_dotprod_quint8_stride1::do_conv_kern( | |||||
} | } | ||||
size_t padding_group_size = IH2 * IW2 * IC; | size_t padding_group_size = IH2 * IW2 * IC; | ||||
bundle.set(kern_param.workspace_ptr); | |||||
//! Used for get the workspace offset | //! Used for get the workspace offset | ||||
size_t workspace_group_id = workspace_ids[0], | size_t workspace_group_id = workspace_ids[0], | ||||
workspace_batch_id = workspace_ids[1], oc = workspace_ids[2], | workspace_batch_id = workspace_ids[1], oc = workspace_ids[2], | ||||
@@ -271,7 +270,7 @@ SmallVector<ConvBiasImpl::NCBKern> direct_dotprod_quint8_stride1::get_kimpls( | |||||
size_t IC = param.filter_meta.icpg; | size_t IC = param.filter_meta.icpg; | ||||
size_t OC = param.filter_meta.ocpg; | size_t OC = param.filter_meta.ocpg; | ||||
size_t group = fm.group; | size_t group = fm.group; | ||||
WorkspaceBundle wbundle = get_bundle(param, m_large_group); | |||||
WorkspaceBundle bundle = get_bundle(param, m_large_group); | |||||
conv_fun do_conv_fun = nullptr; | conv_fun do_conv_fun = nullptr; | ||||
#define DO_CONV_KERN_FUN(filter, bias_mode, op) \ | #define DO_CONV_KERN_FUN(filter, bias_mode, op) \ | ||||
@@ -332,13 +331,13 @@ SmallVector<ConvBiasImpl::NCBKern> direct_dotprod_quint8_stride1::get_kimpls( | |||||
SmallVector<ConvBiasImpl::NCBKern> ret_kerns; | SmallVector<ConvBiasImpl::NCBKern> ret_kerns; | ||||
if (m_large_group) { | if (m_large_group) { | ||||
auto exec_one_group = [wbundle, do_conv_fun]( | |||||
auto exec_one_group = [bundle, do_conv_fun]( | |||||
const NCBKernParam& kern_param, | const NCBKernParam& kern_param, | ||||
const NCBKernIndex& ncb_index) { | |||||
const NCBKernIndex& ncb_index) mutable { | |||||
auto fm = kern_param.filter_meta; | auto fm = kern_param.filter_meta; | ||||
size_t IC = fm.icpg; | size_t IC = fm.icpg; | ||||
size_t OC = fm.ocpg; | size_t OC = fm.ocpg; | ||||
WorkspaceBundle bundle = wbundle; | |||||
bundle.set(kern_param.workspace_ptr); | |||||
for (size_t ic = 0; ic < IC; ic++) { | for (size_t ic = 0; ic < IC; ic++) { | ||||
copy_padding_kern(bundle, kern_param, ncb_index, | copy_padding_kern(bundle, kern_param, ncb_index, | ||||
{ncb_index.thread_id, 0, ic}); | {ncb_index.thread_id, 0, ic}); | ||||
@@ -350,15 +349,17 @@ SmallVector<ConvBiasImpl::NCBKern> direct_dotprod_quint8_stride1::get_kimpls( | |||||
}; | }; | ||||
ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); | ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); | ||||
}else { | }else { | ||||
WorkspaceBundle bundle = wbundle; | |||||
auto copy_padding = [bundle](const NCBKernParam& kern_param, | auto copy_padding = [bundle](const NCBKernParam& kern_param, | ||||
const NCBKernIndex& ncb_index) { | |||||
const NCBKernIndex& ncb_index) mutable { | |||||
bundle.set(kern_param.workspace_ptr); | |||||
copy_padding_kern(bundle, kern_param, ncb_index, | copy_padding_kern(bundle, kern_param, ncb_index, | ||||
ncb_index.ndrange_id); | ncb_index.ndrange_id); | ||||
}; | }; | ||||
ret_kerns.push_back({copy_padding, {group, N, IC}}); | ret_kerns.push_back({copy_padding, {group, N, IC}}); | ||||
auto do_conv = [bundle, do_conv_fun](const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index) { | |||||
auto do_conv = [bundle, do_conv_fun]( | |||||
const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index) mutable { | |||||
bundle.set(kern_param.workspace_ptr); | |||||
do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id); | do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id); | ||||
}; | }; | ||||
ret_kerns.push_back({do_conv, {group, N, OC}}); | ret_kerns.push_back({do_conv, {group, N, OC}}); | ||||
@@ -21,19 +21,20 @@ using NCBKernParam = fallback::ConvBiasImpl::NCBKernParam; | |||||
using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex; | using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex; | ||||
using conv_fun = std::function<void( | using conv_fun = std::function<void( | ||||
WorkspaceBundle bundle, const NCBKernParam& kern_param, | |||||
const WorkspaceBundle& bundle, const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids)>; | const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids)>; | ||||
bool can_conv_direct_stride1_quint8(const NCBKernSizeParam& param); | bool can_conv_direct_stride1_quint8(const NCBKernSizeParam& param); | ||||
WorkspaceBundle get_bundle(const NCBKernSizeParam& param, bool m_large_group); | WorkspaceBundle get_bundle(const NCBKernSizeParam& param, bool m_large_group); | ||||
void copy_padding_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param, | |||||
void copy_padding_kern(const WorkspaceBundle& bundle, | |||||
const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index, | const NCBKernIndex& ncb_index, | ||||
const CpuNDRange& workspace_ids); | const CpuNDRange& workspace_ids); | ||||
template <size_t filter, BiasMode bias_mode, typename Op> | template <size_t filter, BiasMode bias_mode, typename Op> | ||||
void do_conv_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param, | |||||
void do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index, | const NCBKernIndex& ncb_index, | ||||
const CpuNDRange& workspace_ids); | const CpuNDRange& workspace_ids); | ||||
@@ -108,7 +108,8 @@ WorkspaceBundle direct_quint8_stride2::get_bundle( | |||||
} | } | ||||
//! Process one input channel copy padding | //! Process one input channel copy padding | ||||
void direct_quint8_stride2::copy_padding_kern( | void direct_quint8_stride2::copy_padding_kern( | ||||
WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param, | |||||
const WorkspaceBundle& bundle, | |||||
const ConvBiasImpl::NCBKernParam& kern_param, | |||||
const ConvBiasImpl::NCBKernIndex& ncb_index, | const ConvBiasImpl::NCBKernIndex& ncb_index, | ||||
const CpuNDRange& workspace_ids) { | const CpuNDRange& workspace_ids) { | ||||
size_t IH = kern_param.isz[0]; | size_t IH = kern_param.isz[0]; | ||||
@@ -122,7 +123,6 @@ void direct_quint8_stride2::copy_padding_kern( | |||||
get_rectified_size(kern_param, IH2, IW2, OH2, OW2); | get_rectified_size(kern_param, IH2, IW2, OH2, OW2); | ||||
bool need_src_copy_var = need_src_copy(kern_param); | bool need_src_copy_var = need_src_copy(kern_param); | ||||
size_t padding_group_size = IH2 * IW2 * IC; | size_t padding_group_size = IH2 * IW2 * IC; | ||||
bundle.set(kern_param.workspace_ptr); | |||||
//! Used for get the workspace offset | //! Used for get the workspace offset | ||||
size_t workspace_group_id = workspace_ids[0], | size_t workspace_group_id = workspace_ids[0], | ||||
@@ -149,7 +149,7 @@ void direct_quint8_stride2::copy_padding_kern( | |||||
}; | }; | ||||
//! compute one output channel | //! compute one output channel | ||||
template <size_t filter, BiasMode bias_mode, typename Op> | template <size_t filter, BiasMode bias_mode, typename Op> | ||||
void direct_quint8_stride2::do_conv_kern(WorkspaceBundle bundle, | |||||
void direct_quint8_stride2::do_conv_kern(const WorkspaceBundle& bundle, | |||||
const NCBKernParam& kern_param, | const NCBKernParam& kern_param, | ||||
const NCBKernIndex& ncb_index, | const NCBKernIndex& ncb_index, | ||||
const CpuNDRange& workspace_ids) { | const CpuNDRange& workspace_ids) { | ||||
@@ -187,7 +187,6 @@ void direct_quint8_stride2::do_conv_kern(WorkspaceBundle bundle, | |||||
} | } | ||||
size_t padding_group_size = IH2 * IW2 * IC; | size_t padding_group_size = IH2 * IW2 * IC; | ||||
bundle.set(kern_param.workspace_ptr); | |||||
//! Used for get the workspace offset | //! Used for get the workspace offset | ||||
size_t workspace_group_id = workspace_ids[0], | size_t workspace_group_id = workspace_ids[0], | ||||
workspace_batch_id = workspace_ids[1], oc = workspace_ids[2], | workspace_batch_id = workspace_ids[1], oc = workspace_ids[2], | ||||
@@ -279,7 +278,7 @@ SmallVector<ConvBiasImpl::NCBKern> direct_quint8_stride2::get_kimpls( | |||||
size_t IC = param.filter_meta.icpg; | size_t IC = param.filter_meta.icpg; | ||||
size_t OC = param.filter_meta.ocpg; | size_t OC = param.filter_meta.ocpg; | ||||
size_t group = fm.group; | size_t group = fm.group; | ||||
WorkspaceBundle wbundle = get_bundle(param, m_large_group); | |||||
WorkspaceBundle bundle = get_bundle(param, m_large_group); | |||||
conv_fun do_conv_fun = nullptr; | conv_fun do_conv_fun = nullptr; | ||||
#define DO_CONV_KERN_FUN(filter, bias_mode, op) \ | #define DO_CONV_KERN_FUN(filter, bias_mode, op) \ | ||||
@@ -340,13 +339,13 @@ SmallVector<ConvBiasImpl::NCBKern> direct_quint8_stride2::get_kimpls( | |||||
SmallVector<ConvBiasImpl::NCBKern> ret_kerns; | SmallVector<ConvBiasImpl::NCBKern> ret_kerns; | ||||
if (m_large_group) { | if (m_large_group) { | ||||
auto exec_one_group = [wbundle, do_conv_fun]( | |||||
auto exec_one_group = [bundle, do_conv_fun]( | |||||
const NCBKernParam& kern_param, | const NCBKernParam& kern_param, | ||||
const NCBKernIndex& ncb_index) { | |||||
const NCBKernIndex& ncb_index) mutable { | |||||
auto fm = kern_param.filter_meta; | auto fm = kern_param.filter_meta; | ||||
size_t IC = fm.icpg; | size_t IC = fm.icpg; | ||||
size_t OC = fm.ocpg; | size_t OC = fm.ocpg; | ||||
WorkspaceBundle bundle = wbundle; | |||||
bundle.set(kern_param.workspace_ptr); | |||||
for (size_t ic = 0; ic < IC; ic++) { | for (size_t ic = 0; ic < IC; ic++) { | ||||
copy_padding_kern(bundle, kern_param, ncb_index, | copy_padding_kern(bundle, kern_param, ncb_index, | ||||
{ncb_index.thread_id, 0, ic}); | {ncb_index.thread_id, 0, ic}); | ||||
@@ -358,15 +357,17 @@ SmallVector<ConvBiasImpl::NCBKern> direct_quint8_stride2::get_kimpls( | |||||
}; | }; | ||||
ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); | ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); | ||||
}else { | }else { | ||||
WorkspaceBundle bundle = wbundle; | |||||
auto copy_padding = [bundle](const NCBKernParam& kern_param, | auto copy_padding = [bundle](const NCBKernParam& kern_param, | ||||
const NCBKernIndex& ncb_index) { | |||||
const NCBKernIndex& ncb_index) mutable { | |||||
bundle.set(kern_param.workspace_ptr); | |||||
copy_padding_kern(bundle, kern_param, ncb_index, | copy_padding_kern(bundle, kern_param, ncb_index, | ||||
ncb_index.ndrange_id); | ncb_index.ndrange_id); | ||||
}; | }; | ||||
ret_kerns.push_back({copy_padding, {group, N, IC}}); | ret_kerns.push_back({copy_padding, {group, N, IC}}); | ||||
auto do_conv = [bundle, do_conv_fun](const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index) { | |||||
auto do_conv = [bundle, do_conv_fun]( | |||||
const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index) mutable { | |||||
bundle.set(kern_param.workspace_ptr); | |||||
do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id); | do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id); | ||||
}; | }; | ||||
ret_kerns.push_back({do_conv, {group, N, OC}}); | ret_kerns.push_back({do_conv, {group, N, OC}}); | ||||
@@ -21,19 +21,20 @@ using NCBKernParam = fallback::ConvBiasImpl::NCBKernParam; | |||||
using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex; | using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex; | ||||
using conv_fun = std::function<void( | using conv_fun = std::function<void( | ||||
WorkspaceBundle bundle, const NCBKernParam& kern_param, | |||||
const WorkspaceBundle& bundle, const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids)>; | const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids)>; | ||||
bool can_conv_direct_stride2_quint8(const NCBKernSizeParam& param); | bool can_conv_direct_stride2_quint8(const NCBKernSizeParam& param); | ||||
WorkspaceBundle get_bundle(const NCBKernSizeParam& param, bool m_large_group); | WorkspaceBundle get_bundle(const NCBKernSizeParam& param, bool m_large_group); | ||||
void copy_padding_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param, | |||||
void copy_padding_kern(const WorkspaceBundle& bundle, | |||||
const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index, | const NCBKernIndex& ncb_index, | ||||
const CpuNDRange& workspace_ids); | const CpuNDRange& workspace_ids); | ||||
template <size_t filter, BiasMode bias_mode, typename Op> | template <size_t filter, BiasMode bias_mode, typename Op> | ||||
void do_conv_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param, | |||||
void do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index, | const NCBKernIndex& ncb_index, | ||||
const CpuNDRange& workspace_ids); | const CpuNDRange& workspace_ids); | ||||
@@ -108,8 +108,10 @@ WorkspaceBundle direct_dotprod_quint8_stride2::get_bundle( | |||||
} | } | ||||
//! Process one input channel copy padding | //! Process one input channel copy padding | ||||
void direct_dotprod_quint8_stride2::copy_padding_kern( | void direct_dotprod_quint8_stride2::copy_padding_kern( | ||||
WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param, | |||||
const ConvBiasImpl::NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { | |||||
const WorkspaceBundle& bundle, | |||||
const ConvBiasImpl::NCBKernParam& kern_param, | |||||
const ConvBiasImpl::NCBKernIndex& ncb_index, | |||||
const CpuNDRange& workspace_ids) { | |||||
size_t IH = kern_param.isz[0]; | size_t IH = kern_param.isz[0]; | ||||
size_t IW = kern_param.isz[1]; | size_t IW = kern_param.isz[1]; | ||||
size_t IC = kern_param.filter_meta.icpg; | size_t IC = kern_param.filter_meta.icpg; | ||||
@@ -121,7 +123,6 @@ void direct_dotprod_quint8_stride2::copy_padding_kern( | |||||
get_rectified_size(kern_param, IH2, IW2, OH2, OW2); | get_rectified_size(kern_param, IH2, IW2, OH2, OW2); | ||||
bool need_src_copy_var = need_src_copy(kern_param); | bool need_src_copy_var = need_src_copy(kern_param); | ||||
size_t padding_group_size = IH2 * IW2 * IC; | size_t padding_group_size = IH2 * IW2 * IC; | ||||
bundle.set(kern_param.workspace_ptr); | |||||
//! Used for get the workspace offset | //! Used for get the workspace offset | ||||
size_t workspace_group_id = workspace_ids[0], | size_t workspace_group_id = workspace_ids[0], | ||||
@@ -149,7 +150,7 @@ void direct_dotprod_quint8_stride2::copy_padding_kern( | |||||
//! compute one output channel | //! compute one output channel | ||||
template <size_t filter, BiasMode bias_mode, typename Op> | template <size_t filter, BiasMode bias_mode, typename Op> | ||||
void direct_dotprod_quint8_stride2::do_conv_kern( | void direct_dotprod_quint8_stride2::do_conv_kern( | ||||
WorkspaceBundle bundle, const NCBKernParam& kern_param, | |||||
const WorkspaceBundle& bundle, const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { | const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { | ||||
size_t OH = kern_param.osz[0]; | size_t OH = kern_param.osz[0]; | ||||
size_t OW = kern_param.osz[1]; | size_t OW = kern_param.osz[1]; | ||||
@@ -182,7 +183,6 @@ void direct_dotprod_quint8_stride2::do_conv_kern( | |||||
} | } | ||||
size_t padding_group_size = IH2 * IW2 * IC; | size_t padding_group_size = IH2 * IW2 * IC; | ||||
bundle.set(kern_param.workspace_ptr); | |||||
//! Used for get the workspace offset | //! Used for get the workspace offset | ||||
size_t workspace_group_id = workspace_ids[0], | size_t workspace_group_id = workspace_ids[0], | ||||
workspace_batch_id = workspace_ids[1], oc = workspace_ids[2], | workspace_batch_id = workspace_ids[1], oc = workspace_ids[2], | ||||
@@ -276,7 +276,7 @@ SmallVector<ConvBiasImpl::NCBKern> direct_dotprod_quint8_stride2::get_kimpls( | |||||
size_t IC = param.filter_meta.icpg; | size_t IC = param.filter_meta.icpg; | ||||
size_t OC = param.filter_meta.ocpg; | size_t OC = param.filter_meta.ocpg; | ||||
size_t group = fm.group; | size_t group = fm.group; | ||||
WorkspaceBundle wbundle = get_bundle(param, m_large_group); | |||||
WorkspaceBundle bundle = get_bundle(param, m_large_group); | |||||
conv_fun do_conv_fun = nullptr; | conv_fun do_conv_fun = nullptr; | ||||
#define DO_CONV_KERN_FUN(filter, bias_mode, op) \ | #define DO_CONV_KERN_FUN(filter, bias_mode, op) \ | ||||
@@ -337,13 +337,13 @@ SmallVector<ConvBiasImpl::NCBKern> direct_dotprod_quint8_stride2::get_kimpls( | |||||
SmallVector<ConvBiasImpl::NCBKern> ret_kerns; | SmallVector<ConvBiasImpl::NCBKern> ret_kerns; | ||||
if (m_large_group) { | if (m_large_group) { | ||||
auto exec_one_group = [wbundle, do_conv_fun]( | |||||
auto exec_one_group = [bundle, do_conv_fun]( | |||||
const NCBKernParam& kern_param, | const NCBKernParam& kern_param, | ||||
const NCBKernIndex& ncb_index) { | |||||
const NCBKernIndex& ncb_index) mutable { | |||||
auto fm = kern_param.filter_meta; | auto fm = kern_param.filter_meta; | ||||
size_t IC = fm.icpg; | size_t IC = fm.icpg; | ||||
size_t OC = fm.ocpg; | size_t OC = fm.ocpg; | ||||
WorkspaceBundle bundle = wbundle; | |||||
bundle.set(kern_param.workspace_ptr); | |||||
for (size_t ic = 0; ic < IC; ic++) { | for (size_t ic = 0; ic < IC; ic++) { | ||||
copy_padding_kern(bundle, kern_param, ncb_index, | copy_padding_kern(bundle, kern_param, ncb_index, | ||||
{ncb_index.thread_id, 0, ic}); | {ncb_index.thread_id, 0, ic}); | ||||
@@ -355,15 +355,17 @@ SmallVector<ConvBiasImpl::NCBKern> direct_dotprod_quint8_stride2::get_kimpls( | |||||
}; | }; | ||||
ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); | ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); | ||||
}else { | }else { | ||||
WorkspaceBundle bundle = wbundle; | |||||
auto copy_padding = [bundle](const NCBKernParam& kern_param, | auto copy_padding = [bundle](const NCBKernParam& kern_param, | ||||
const NCBKernIndex& ncb_index) { | |||||
const NCBKernIndex& ncb_index) mutable { | |||||
bundle.set(kern_param.workspace_ptr); | |||||
copy_padding_kern(bundle, kern_param, ncb_index, | copy_padding_kern(bundle, kern_param, ncb_index, | ||||
ncb_index.ndrange_id); | ncb_index.ndrange_id); | ||||
}; | }; | ||||
ret_kerns.push_back({copy_padding, {group, N, IC}}); | ret_kerns.push_back({copy_padding, {group, N, IC}}); | ||||
auto do_conv = [bundle, do_conv_fun](const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index) { | |||||
auto do_conv = [bundle, do_conv_fun]( | |||||
const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index) mutable { | |||||
bundle.set(kern_param.workspace_ptr); | |||||
do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id); | do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id); | ||||
}; | }; | ||||
ret_kerns.push_back({do_conv, {group, N, OC}}); | ret_kerns.push_back({do_conv, {group, N, OC}}); | ||||
@@ -21,19 +21,20 @@ using NCBKernParam = fallback::ConvBiasImpl::NCBKernParam; | |||||
using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex; | using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex; | ||||
using conv_fun = std::function<void( | using conv_fun = std::function<void( | ||||
WorkspaceBundle bundle, const NCBKernParam& kern_param, | |||||
const WorkspaceBundle& bundle, const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids)>; | const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids)>; | ||||
bool can_conv_direct_stride2_quint8(const NCBKernSizeParam& param); | bool can_conv_direct_stride2_quint8(const NCBKernSizeParam& param); | ||||
WorkspaceBundle get_bundle(const NCBKernSizeParam& param, bool m_large_group); | WorkspaceBundle get_bundle(const NCBKernSizeParam& param, bool m_large_group); | ||||
void copy_padding_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param, | |||||
void copy_padding_kern(const WorkspaceBundle& bundle, | |||||
const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index, | const NCBKernIndex& ncb_index, | ||||
const CpuNDRange& workspace_ids); | const CpuNDRange& workspace_ids); | ||||
template <size_t filter, BiasMode bias_mode, typename Op> | template <size_t filter, BiasMode bias_mode, typename Op> | ||||
void do_conv_kern(WorkspaceBundle bundle, const NCBKernParam& kern_param, | |||||
void do_conv_kern(const WorkspaceBundle& bundle, const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index, | const NCBKernIndex& ncb_index, | ||||
const CpuNDRange& workspace_ids); | const CpuNDRange& workspace_ids); | ||||
@@ -39,7 +39,7 @@ struct Im2colBundelIndex { | |||||
using Pack_Mode=fallback::MatrixMulImpl::AlgoBase::PackMode; | using Pack_Mode=fallback::MatrixMulImpl::AlgoBase::PackMode; | ||||
//! Process one input channel copy padding | //! Process one input channel copy padding | ||||
static void copy_padding_kern(WorkspaceBundle bundle, | |||||
static void copy_padding_kern(WorkspaceBundle& bundle, | |||||
const ConvBiasImpl::NCBKernParam& param, | const ConvBiasImpl::NCBKernParam& param, | ||||
const ConvBiasImpl::NCBKernIndex& ncb_index, | const ConvBiasImpl::NCBKernIndex& ncb_index, | ||||
StrategyBase* im2colstrategy, size_t pack_oc_size) { | StrategyBase* im2colstrategy, size_t pack_oc_size) { | ||||
@@ -48,7 +48,7 @@ static void copy_padding_kern(WorkspaceBundle bundle, | |||||
//! packA_kern | //! packA_kern | ||||
static void packA_kern( | static void packA_kern( | ||||
WorkspaceBundle bundle, | |||||
WorkspaceBundle& bundle, | |||||
const fallback::ConvBiasImpl::NCBKernParam& param, | const fallback::ConvBiasImpl::NCBKernParam& param, | ||||
fallback::MatrixMulImpl::KernSizeParam matmulparam, | fallback::MatrixMulImpl::KernSizeParam matmulparam, | ||||
fallback::MatrixMulImpl::AlgoBase* matmul_algo, | fallback::MatrixMulImpl::AlgoBase* matmul_algo, | ||||
@@ -72,11 +72,12 @@ class Im2colKerns<Pack_Mode::DEFAULT> { | |||||
public: | public: | ||||
//! conv kernel | //! conv kernel | ||||
static void kerns( | static void kerns( | ||||
WorkspaceBundle bundle, WorkspaceBundle bundle_thread, | |||||
const WorkspaceBundle& bundle, WorkspaceBundle bundle_thread, | |||||
const ConvBiasImpl::NCBKernParam& param, | const ConvBiasImpl::NCBKernParam& param, | ||||
fallback::MatrixMulImpl::KernSizeParam matmul_kernsize_param, | fallback::MatrixMulImpl::KernSizeParam matmul_kernsize_param, | ||||
const fallback::MatrixMulImpl::AlgoBase* matmul_algo, | const fallback::MatrixMulImpl::AlgoBase* matmul_algo, | ||||
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& matmul_desc, | |||||
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& | |||||
matmul_desc, | |||||
StrategyParam strategyparam, | StrategyParam strategyparam, | ||||
fallback::ConvBiasImpl::NCBKernIndex ncb_index, | fallback::ConvBiasImpl::NCBKernIndex ncb_index, | ||||
size_t ohw_tile_size, StrategyBase* im2colstrategy) { | size_t ohw_tile_size, StrategyBase* im2colstrategy) { | ||||
@@ -100,7 +101,6 @@ public: | |||||
strategyparam.output_block_oc_size = output_block_oc_size; | strategyparam.output_block_oc_size = output_block_oc_size; | ||||
strategyparam.output_block_size = output_block_size; | strategyparam.output_block_size = output_block_size; | ||||
bundle.set(param.workspace_ptr); | |||||
bundle_thread.set( | bundle_thread.set( | ||||
static_cast<int8_t*>( | static_cast<int8_t*>( | ||||
bundle.get(Im2colBundelIndex::BUNDLE_THREAD_INDEX)) + | bundle.get(Im2colBundelIndex::BUNDLE_THREAD_INDEX)) + | ||||
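In the Im2colKerns hunks the shared bundle becomes a const reference while bundle_thread is still taken by value: the dispatching lambda has already rebased the shared bundle onto param.workspace_ptr, and each worker then rebases its own copy of the thread bundle onto a per-thread slice carved out of bundle.get(Im2colBundelIndex::BUNDLE_THREAD_INDEX). A rough, self-contained sketch of that layout follows; the Bundle type, slot indices and sizes are invented for illustration only.

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct Bundle {
        std::vector<size_t> sizes;
        void* base = nullptr;
        void set(void* ptr) { base = ptr; }
        void* get(size_t i) const {
            size_t off = 0;
            for (size_t j = 0; j < i; ++j) off += sizes[j];
            return static_cast<int8_t*>(base) + off;
        }
        size_t total_size_in_bytes() const {
            size_t s = 0;
            for (size_t x : sizes) s += x;
            return s;
        }
    };

    // bundle: shared, already rebased by the caller; bundle_thread: private copy.
    void kerns(const Bundle& bundle, Bundle bundle_thread, size_t thread_id) {
        bundle_thread.set(static_cast<int8_t*>(bundle.get(1)) +
                          thread_id * bundle_thread.total_size_in_bytes());
        std::printf("thread %zu buffer at %p\n", thread_id, bundle_thread.get(0));
    }

    int main() {
        Bundle bundle{{64, 2 * 48}};      // slot 1 holds two 48-byte thread slices
        Bundle bundle_thread{{16, 32}};   // 48 bytes of per-thread workspace
        std::vector<int8_t> workspace(64 + 2 * 48);
        bundle.set(workspace.data());     // done by the dispatching lambda
        for (size_t tid = 0; tid < 2; ++tid) kerns(bundle, bundle_thread, tid);
    }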
@@ -153,11 +153,12 @@ class Im2colKerns<Pack_Mode::ONLY_PACKA> { | |||||
public: | public: | ||||
//! conv kernel | //! conv kernel | ||||
static void kerns( | static void kerns( | ||||
WorkspaceBundle bundle, WorkspaceBundle bundle_thread, | |||||
const WorkspaceBundle& bundle, WorkspaceBundle bundle_thread, | |||||
const ConvBiasImpl::NCBKernParam& param, | const ConvBiasImpl::NCBKernParam& param, | ||||
fallback::MatrixMulImpl::KernSizeParam matmul_kernsize_param, | fallback::MatrixMulImpl::KernSizeParam matmul_kernsize_param, | ||||
const fallback::MatrixMulImpl::AlgoBase* matmul_algo, | const fallback::MatrixMulImpl::AlgoBase* matmul_algo, | ||||
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& matmul_desc, | |||||
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& | |||||
matmul_desc, | |||||
StrategyParam strategyparam, | StrategyParam strategyparam, | ||||
fallback::ConvBiasImpl::NCBKernIndex ncb_index, | fallback::ConvBiasImpl::NCBKernIndex ncb_index, | ||||
size_t ohw_tile_size, StrategyBase* im2colstrategy) { | size_t ohw_tile_size, StrategyBase* im2colstrategy) { | ||||
@@ -169,7 +170,6 @@ public: | |||||
strategyparam.oc_tile_size, | strategyparam.oc_tile_size, | ||||
OC - ncb_index.ndrange_id[3] * strategyparam.oc_tile_size); | OC - ncb_index.ndrange_id[3] * strategyparam.oc_tile_size); | ||||
bundle.set(param.workspace_ptr); | |||||
bundle_thread.set( | bundle_thread.set( | ||||
static_cast<int8_t*>( | static_cast<int8_t*>( | ||||
bundle.get(Im2colBundelIndex::BUNDLE_THREAD_INDEX)) + | bundle.get(Im2colBundelIndex::BUNDLE_THREAD_INDEX)) + | ||||
@@ -236,11 +236,12 @@ class Im2colKerns<Pack_Mode::NO_PACK> { | |||||
public: | public: | ||||
//! conv kernel | //! conv kernel | ||||
static void kerns( | static void kerns( | ||||
WorkspaceBundle bundle, WorkspaceBundle bundle_thread, | |||||
const WorkspaceBundle& bundle, WorkspaceBundle bundle_thread, | |||||
const ConvBiasImpl::NCBKernParam& param, | const ConvBiasImpl::NCBKernParam& param, | ||||
fallback::MatrixMulImpl::KernSizeParam matmul_kernsize_param, | fallback::MatrixMulImpl::KernSizeParam matmul_kernsize_param, | ||||
const fallback::MatrixMulImpl::AlgoBase* matmul_algo, | const fallback::MatrixMulImpl::AlgoBase* matmul_algo, | ||||
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& matmul_desc, | |||||
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& | |||||
matmul_desc, | |||||
StrategyParam strategyparam, | StrategyParam strategyparam, | ||||
fallback::ConvBiasImpl::NCBKernIndex ncb_index, | fallback::ConvBiasImpl::NCBKernIndex ncb_index, | ||||
size_t ohw_tile_size, StrategyBase* im2colstrategy) { | size_t ohw_tile_size, StrategyBase* im2colstrategy) { | ||||
@@ -264,7 +265,6 @@ public: | |||||
strategyparam.output_block_oc_size = output_block_oc_size; | strategyparam.output_block_oc_size = output_block_oc_size; | ||||
strategyparam.output_block_size = output_block_size; | strategyparam.output_block_size = output_block_size; | ||||
bundle.set(param.workspace_ptr); | |||||
bundle_thread.set( | bundle_thread.set( | ||||
static_cast<int8_t*>( | static_cast<int8_t*>( | ||||
bundle.get(Im2colBundelIndex::BUNDLE_THREAD_INDEX)) + | bundle.get(Im2colBundelIndex::BUNDLE_THREAD_INDEX)) + | ||||
@@ -567,16 +567,18 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoIm2col::dispatch_kerns( | |||||
auto kern_padding = [bundle, im2colstrategy, | auto kern_padding = [bundle, im2colstrategy, | ||||
pack_oc_size = pack_oc_size]( | pack_oc_size = pack_oc_size]( | ||||
const NCBKernParam& param, | const NCBKernParam& param, | ||||
const NCBKernIndex& ncb_index) { | |||||
const NCBKernIndex& ncb_index) mutable { | |||||
bundle.set(param.workspace_ptr); | |||||
copy_padding_kern(bundle, param, ncb_index, im2colstrategy, | copy_padding_kern(bundle, param, ncb_index, im2colstrategy, | ||||
pack_oc_size); | pack_oc_size); | ||||
}; | }; | ||||
auto kern_packA = [bundle, matmul_algo = m_matmul_algo, | auto kern_packA = [bundle, matmul_algo = m_matmul_algo, | ||||
matmul_param, im2colstrategy, | matmul_param, im2colstrategy, | ||||
pack_oc_size = pack_oc_size, | |||||
mdesc = mdesc](const NCBKernParam& param, | |||||
const NCBKernIndex& ncb_index) { | |||||
pack_oc_size = pack_oc_size, mdesc = mdesc]( | |||||
const NCBKernParam& param, | |||||
const NCBKernIndex& ncb_index) mutable { | |||||
bundle.set(param.workspace_ptr); | |||||
packA_kern(bundle, param, matmul_param, matmul_algo, ncb_index, | packA_kern(bundle, param, matmul_param, matmul_algo, ncb_index, | ||||
im2colstrategy, mdesc, pack_oc_size); | im2colstrategy, mdesc, pack_oc_size); | ||||
}; | }; | ||||
@@ -586,8 +588,10 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoIm2col::dispatch_kerns( | |||||
matmul_algo = m_matmul_algo, | matmul_algo = m_matmul_algo, | ||||
ohw_tile_size = ohw_tile_size, | ohw_tile_size = ohw_tile_size, | ||||
strategyparam = strategyparam, matmul_desc = mdesc, | strategyparam = strategyparam, matmul_desc = mdesc, | ||||
im2colstrategy](const NCBKernParam& param, | |||||
const NCBKernIndex& ncb_index) { | |||||
im2colstrategy]( | |||||
const NCBKernParam& param, | |||||
const NCBKernIndex& ncb_index) mutable { | |||||
bundle.set(param.workspace_ptr); | |||||
Im2colKerns<Pack_Mode::DEFAULT>::kerns( | Im2colKerns<Pack_Mode::DEFAULT>::kerns( | ||||
bundle, bundle_thread, param, matmul_param, | bundle, bundle_thread, param, matmul_param, | ||||
matmul_algo, matmul_desc, strategyparam, | matmul_algo, matmul_desc, strategyparam, | ||||
@@ -608,8 +612,10 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoIm2col::dispatch_kerns( | |||||
matmul_algo = m_matmul_algo, | matmul_algo = m_matmul_algo, | ||||
strategyparam = strategyparam, | strategyparam = strategyparam, | ||||
ohw_tile_size = ohw_tile_size, matmul_desc = mdesc, | ohw_tile_size = ohw_tile_size, matmul_desc = mdesc, | ||||
im2colstrategy](const NCBKernParam& param, | |||||
const NCBKernIndex& ncb_index) { | |||||
im2colstrategy]( | |||||
const NCBKernParam& param, | |||||
const NCBKernIndex& ncb_index) mutable { | |||||
bundle.set(param.workspace_ptr); | |||||
Im2colKerns<Pack_Mode::ONLY_PACKA>::kerns( | Im2colKerns<Pack_Mode::ONLY_PACKA>::kerns( | ||||
bundle, bundle_thread, param, matmul_param, | bundle, bundle_thread, param, matmul_param, | ||||
matmul_algo, matmul_desc, strategyparam, | matmul_algo, matmul_desc, strategyparam, | ||||
@@ -628,14 +634,15 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoIm2col::dispatch_kerns( | |||||
matmul_algo = m_matmul_algo, | matmul_algo = m_matmul_algo, | ||||
strategyparam = strategyparam, | strategyparam = strategyparam, | ||||
ohw_tile_size = ohw_tile_size, matmul_desc = mdesc, | ohw_tile_size = ohw_tile_size, matmul_desc = mdesc, | ||||
im2colstrategy](const NCBKernParam& param, | |||||
const NCBKernIndex& ncb_index) { | |||||
im2colstrategy]( | |||||
const NCBKernParam& param, | |||||
const NCBKernIndex& ncb_index) mutable { | |||||
bundle.set(param.workspace_ptr); | |||||
Im2colKerns<Pack_Mode::NO_PACK>::kerns( | Im2colKerns<Pack_Mode::NO_PACK>::kerns( | ||||
bundle, bundle_thread, param, matmul_param, | bundle, bundle_thread, param, matmul_param, | ||||
matmul_algo, matmul_desc, strategyparam, | matmul_algo, matmul_desc, strategyparam, | ||||
ncb_index, ohw_tile_size, im2colstrategy); | ncb_index, ohw_tile_size, im2colstrategy); | ||||
}; | }; | ||||
if (need_padding) { | if (need_padding) { | ||||
ret_kern.push_back({kern_padding, {param.n, GROUP, IC}}); | ret_kern.push_back({kern_padding, {param.n, GROUP, IC}}); | ||||
} | } | ||||
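Each ret_kern entry pairs a kernel lambda with a CpuNDRange such as {param.n, GROUP, IC}, and the threading runtime invokes the lambda once per point of that range, handing the coordinates back through ncb_index. The serial loop below only illustrates that indexing under those assumptions; it is not MegDNN's actual multithreaded dispatcher, and KernIndex/Kern are stand-in types.

    #include <array>
    #include <cstddef>
    #include <cstdio>
    #include <functional>

    struct KernIndex { std::array<size_t, 3> ndrange_id; size_t thread_id; };
    using Kern = std::function<void(const KernIndex&)>;

    void run(const Kern& kern, std::array<size_t, 3> ndrange) {
        for (size_t a = 0; a < ndrange[0]; ++a)
            for (size_t b = 0; b < ndrange[1]; ++b)
                for (size_t c = 0; c < ndrange[2]; ++c)
                    kern({{a, b, c}, /*thread_id=*/0});
    }

    int main() {
        // e.g. 2 batches, 1 group, 3 input channels -> 6 padding calls
        Kern kern_padding = [](const KernIndex& idx) {
            std::printf("pad n=%zu group=%zu ic=%zu\n", idx.ndrange_id[0],
                        idx.ndrange_id[1], idx.ndrange_id[2]);
        };
        run(kern_padding, {2, 1, 3});
    }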
@@ -50,21 +50,22 @@ public: | |||||
StrategyBase() = default; | StrategyBase() = default; | ||||
virtual ~StrategyBase() = default; | virtual ~StrategyBase() = default; | ||||
virtual void copy_padding_kern( | virtual void copy_padding_kern( | ||||
WorkspaceBundle bundle, | |||||
const WorkspaceBundle& bundle, | |||||
const fallback::ConvBiasImpl::NCBKernParam& param, | const fallback::ConvBiasImpl::NCBKernParam& param, | ||||
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, | const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, | ||||
size_t pack_size) = 0; | size_t pack_size) = 0; | ||||
virtual void packA_kern( | virtual void packA_kern( | ||||
WorkspaceBundle bundle, | |||||
const WorkspaceBundle& bundle, | |||||
const fallback::ConvBiasImpl::NCBKernParam& param, | const fallback::ConvBiasImpl::NCBKernParam& param, | ||||
fallback::MatrixMulImpl::KernSizeParam matmulparam, | fallback::MatrixMulImpl::KernSizeParam matmulparam, | ||||
const fallback::MatrixMulImpl::AlgoBase* matmul_algo, | const fallback::MatrixMulImpl::AlgoBase* matmul_algo, | ||||
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, | const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, | ||||
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& matmul_desec, | |||||
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& | |||||
matmul_desec, | |||||
size_t pack_size) = 0; | size_t pack_size) = 0; | ||||
virtual void exec_im2col( | virtual void exec_im2col( | ||||
WorkspaceBundle bundle, WorkspaceBundle bundle_thread, | |||||
const WorkspaceBundle& bundle, const WorkspaceBundle& bundle_thread, | |||||
const StrategyParam& sparam, | const StrategyParam& sparam, | ||||
const fallback::ConvBiasImpl::NCBKernParam& param, | const fallback::ConvBiasImpl::NCBKernParam& param, | ||||
fallback::MatrixMulImpl::KernParam matmul_param, | fallback::MatrixMulImpl::KernParam matmul_param, | ||||
@@ -72,17 +73,18 @@ public: | |||||
virtual void exec_matmul( | virtual void exec_matmul( | ||||
const fallback::ConvBiasImpl::NCBKernParam& param, | const fallback::ConvBiasImpl::NCBKernParam& param, | ||||
const StrategyParam& sparam, WorkspaceBundle bundle, | |||||
WorkspaceBundle bundle_thread, | |||||
const StrategyParam& sparam, const WorkspaceBundle& bundle, | |||||
const WorkspaceBundle& bundle_thread, | |||||
fallback::MatrixMulImpl::KernParam matmul_param, | fallback::MatrixMulImpl::KernParam matmul_param, | ||||
const fallback::MatrixMulImpl::AlgoBase* matmul_algo, | const fallback::MatrixMulImpl::AlgoBase* matmul_algo, | ||||
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, | const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, | ||||
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& matmul_desc | |||||
) = 0; | |||||
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& | |||||
matmul_desc) = 0; | |||||
virtual void exec_postprocess( | virtual void exec_postprocess( | ||||
const fallback::ConvBiasImpl::NCBKernParam& param, | const fallback::ConvBiasImpl::NCBKernParam& param, | ||||
const StrategyParam& sparam, WorkspaceBundle bundle_thread) = 0; | |||||
const StrategyParam& sparam, | |||||
const WorkspaceBundle& bundle_thread) = 0; | |||||
}; | }; | ||||
template <typename src_ctype, typename bias_ctype, typename dst_ctype, | template <typename src_ctype, typename bias_ctype, typename dst_ctype, | ||||
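Because StrategyBase's virtuals now take const WorkspaceBundle& (and const references for bundle_thread), every override further down in this patch has to change in the same commit: a derived method left at the old by-value signature would silently stop overriding, and the override specifier turns that mismatch into a compile error. A toy reduction of the situation, using stand-in classes rather than the real StrategyBase/Strategy:

    struct Bundle {};

    struct StrategyBase {
        virtual ~StrategyBase() = default;
        virtual void exec_postprocess(const Bundle& bundle_thread) = 0;
    };

    struct Strategy final : StrategyBase {
        // OK: matches the new const-reference signature.
        void exec_postprocess(const Bundle& bundle_thread) override {}

        // Would fail to compile if left at the old by-value signature:
        // void exec_postprocess(Bundle bundle_thread) override {}  // error
    };

    int main() {
        Strategy s;
        Bundle bt;
        static_cast<StrategyBase&>(s).exec_postprocess(bt);
    }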
@@ -98,7 +100,7 @@ public: | |||||
StrategyBridge() = default; | StrategyBridge() = default; | ||||
virtual void copy_padding_kern( | virtual void copy_padding_kern( | ||||
WorkspaceBundle bundle, | |||||
const WorkspaceBundle& bundle, | |||||
const fallback::ConvBiasImpl::NCBKernParam& param, | const fallback::ConvBiasImpl::NCBKernParam& param, | ||||
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, | const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, | ||||
size_t pack_oc_size) override { | size_t pack_oc_size) override { | ||||
@@ -126,7 +128,6 @@ public: | |||||
size_t workspace_group_offset = group_id * padding_group_size; | size_t workspace_group_offset = group_id * padding_group_size; | ||||
size_t workspace_batch_offset = | size_t workspace_batch_offset = | ||||
param.filter_meta.group * batch_id * padding_group_size; | param.filter_meta.group * batch_id * padding_group_size; | ||||
bundle.set(param.workspace_ptr); | |||||
src_ctype src_zp = static_cast<src_ctype>(0); | src_ctype src_zp = static_cast<src_ctype>(0); | ||||
if (param.src_type.enumv() == DTypeEnum::Quantized8Asymm) { | if (param.src_type.enumv() == DTypeEnum::Quantized8Asymm) { | ||||
@@ -212,8 +213,8 @@ void copy_dst(const fallback::ConvBiasImpl::NCBKernParam& param, | |||||
template <typename bias_ctype> | template <typename bias_ctype> | ||||
void copy_bias(const fallback::ConvBiasImpl::NCBKernParam& param, | void copy_bias(const fallback::ConvBiasImpl::NCBKernParam& param, | ||||
WorkspaceBundle bundle_thread, const StrategyParam& sparam, | |||||
size_t bias_index) { | |||||
const WorkspaceBundle& bundle_thread, | |||||
const StrategyParam& sparam, size_t bias_index) { | |||||
const bias_ctype* bias_ptr = static_cast<const bias_ctype*>( | const bias_ctype* bias_ptr = static_cast<const bias_ctype*>( | ||||
param.bias<bias_ctype>(sparam.batch_id, sparam.group_id)); | param.bias<bias_ctype>(sparam.batch_id, sparam.group_id)); | ||||
bias_ctype* bias_temp_ptr = static_cast<bias_ctype*>( | bias_ctype* bias_temp_ptr = static_cast<bias_ctype*>( | ||||
@@ -235,11 +236,11 @@ void copy_bias(const fallback::ConvBiasImpl::NCBKernParam& param, | |||||
} | } | ||||
} | } | ||||
template <typename bias_ctype, typename dst_ctype, | |||||
typename op_ctype, typename op_dtype, | |||||
megdnn::PostprocessMode postprocess_mode> | |||||
template <typename bias_ctype, typename dst_ctype, typename op_ctype, | |||||
typename op_dtype, megdnn::PostprocessMode postprocess_mode> | |||||
void do_postprocess(const fallback::ConvBiasImpl::NCBKernParam& param, | void do_postprocess(const fallback::ConvBiasImpl::NCBKernParam& param, | ||||
const StrategyParam& sparam, WorkspaceBundle bundle_thread, | |||||
const StrategyParam& sparam, | |||||
const WorkspaceBundle& bundle_thread, | |||||
size_t matmul_bundle_index, size_t bias_bundle_index) { | size_t matmul_bundle_index, size_t bias_bundle_index) { | ||||
copy_bias<bias_ctype>(param, bundle_thread, sparam, bias_bundle_index); | copy_bias<bias_ctype>(param, bundle_thread, sparam, bias_bundle_index); | ||||
void* matmul_dst = get_matmul_dst_ptr<bias_ctype>( | void* matmul_dst = get_matmul_dst_ptr<bias_ctype>( | ||||
@@ -288,32 +289,32 @@ public: | |||||
Strategy() = default; | Strategy() = default; | ||||
virtual void packA_kern( | virtual void packA_kern( | ||||
WorkspaceBundle bundle, | |||||
const WorkspaceBundle& bundle, | |||||
const fallback::ConvBiasImpl::NCBKernParam& param, | const fallback::ConvBiasImpl::NCBKernParam& param, | ||||
fallback::MatrixMulImpl::KernSizeParam matmulparam, | fallback::MatrixMulImpl::KernSizeParam matmulparam, | ||||
const fallback::MatrixMulImpl::AlgoBase* matmul_algo, | const fallback::MatrixMulImpl::AlgoBase* matmul_algo, | ||||
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, | const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, | ||||
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& matmul_desc, | |||||
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& | |||||
matmul_desc, | |||||
size_t pack_size) override; | size_t pack_size) override; | ||||
virtual void exec_im2col( | virtual void exec_im2col( | ||||
WorkspaceBundle bundle, WorkspaceBundle bundle_thread, | |||||
const WorkspaceBundle& bundle, const WorkspaceBundle& bundle_thread, | |||||
const StrategyParam& sparam, | const StrategyParam& sparam, | ||||
const fallback::ConvBiasImpl::NCBKernParam& param, | const fallback::ConvBiasImpl::NCBKernParam& param, | ||||
fallback::MatrixMulImpl::KernParam matmul_param, | fallback::MatrixMulImpl::KernParam matmul_param, | ||||
const fallback::MatrixMulImpl::AlgoBase* matmul_algo) override; | const fallback::MatrixMulImpl::AlgoBase* matmul_algo) override; | ||||
void exec_matmul( | |||||
const fallback::ConvBiasImpl::NCBKernParam& param, | |||||
const StrategyParam& sparam, WorkspaceBundle bundle, | |||||
WorkspaceBundle bundle_thread, | |||||
fallback::MatrixMulImpl::KernParam matmul_param, | |||||
const fallback::MatrixMulImpl::AlgoBase* matmul_algo, | |||||
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, | |||||
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& matmul_desc | |||||
) override; | |||||
void exec_matmul(const fallback::ConvBiasImpl::NCBKernParam& param, | |||||
const StrategyParam& sparam, const WorkspaceBundle& bundle, | |||||
const WorkspaceBundle& bundle_thread, | |||||
fallback::MatrixMulImpl::KernParam matmul_param, | |||||
const fallback::MatrixMulImpl::AlgoBase* matmul_algo, | |||||
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, | |||||
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& | |||||
matmul_desc) override; | |||||
void exec_postprocess(const fallback::ConvBiasImpl::NCBKernParam& param, | void exec_postprocess(const fallback::ConvBiasImpl::NCBKernParam& param, | ||||
const StrategyParam& sparam, | const StrategyParam& sparam, | ||||
WorkspaceBundle bundle_thread) override { | |||||
const WorkspaceBundle& bundle_thread) override { | |||||
do_postprocess<bias_ctype, dst_ctype, op_ctype, op_dtype, | do_postprocess<bias_ctype, dst_ctype, op_ctype, op_dtype, | ||||
postprocess_mode>(param, sparam, bundle_thread, | postprocess_mode>(param, sparam, bundle_thread, | ||||
THREAD_BUNDLE_IM2COL_INDEX, | THREAD_BUNDLE_IM2COL_INDEX, | ||||
@@ -341,11 +342,12 @@ public: | |||||
Strategy() = default; | Strategy() = default; | ||||
void exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread, | |||||
const StrategyParam& sparam, | |||||
const fallback::ConvBiasImpl::NCBKernParam& param, | |||||
fallback::MatrixMulImpl::KernParam matmul_param, | |||||
const fallback::MatrixMulImpl::AlgoBase* matmul_algo) override; | |||||
void exec_im2col( | |||||
const WorkspaceBundle& bundle, const WorkspaceBundle& bundle_thread, | |||||
const StrategyParam& sparam, | |||||
const fallback::ConvBiasImpl::NCBKernParam& param, | |||||
fallback::MatrixMulImpl::KernParam matmul_param, | |||||
const fallback::MatrixMulImpl::AlgoBase* matmul_algo) override; | |||||
}; | }; | ||||
template <typename src_ctype, typename bias_ctype, typename dst_ctype, | template <typename src_ctype, typename bias_ctype, typename dst_ctype, | ||||
@@ -367,7 +369,7 @@ public: | |||||
Strategy() = default; | Strategy() = default; | ||||
void packA_kern( | void packA_kern( | ||||
WorkspaceBundle bundle, | |||||
const WorkspaceBundle& bundle, | |||||
const fallback::ConvBiasImpl::NCBKernParam& param, | const fallback::ConvBiasImpl::NCBKernParam& param, | ||||
fallback::MatrixMulImpl::KernSizeParam matmulparam, | fallback::MatrixMulImpl::KernSizeParam matmulparam, | ||||
const fallback::MatrixMulImpl::AlgoBase* matmul_algo, | const fallback::MatrixMulImpl::AlgoBase* matmul_algo, | ||||
@@ -375,28 +377,28 @@ public: | |||||
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& MDsec, | const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& MDsec, | ||||
size_t pack_size) override; | size_t pack_size) override; | ||||
void exec_matmul( | |||||
const fallback::ConvBiasImpl::NCBKernParam& param, | |||||
const StrategyParam& sparam, WorkspaceBundle bundle, | |||||
WorkspaceBundle bundle_thread, | |||||
fallback::MatrixMulImpl::KernParam matmul_param, | |||||
const fallback::MatrixMulImpl::AlgoBase* matmul_algo, | |||||
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, | |||||
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& matmul_desc | |||||
) override; | |||||
void exec_matmul(const fallback::ConvBiasImpl::NCBKernParam& param, | |||||
const StrategyParam& sparam, const WorkspaceBundle& bundle, | |||||
const WorkspaceBundle& bundle_thread, | |||||
fallback::MatrixMulImpl::KernParam matmul_param, | |||||
const fallback::MatrixMulImpl::AlgoBase* matmul_algo, | |||||
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, | |||||
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& | |||||
matmul_desc) override; | |||||
void* get_matmul_dst_ptr(const fallback::ConvBiasImpl::NCBKernParam& param, | void* get_matmul_dst_ptr(const fallback::ConvBiasImpl::NCBKernParam& param, | ||||
const WorkspaceBundle& bundle_thread, | const WorkspaceBundle& bundle_thread, | ||||
const StrategyParam& sparam); | const StrategyParam& sparam); | ||||
void exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread, | |||||
const StrategyParam& sparam, | |||||
const fallback::ConvBiasImpl::NCBKernParam& param, | |||||
fallback::MatrixMulImpl::KernParam matmul_param, | |||||
const fallback::MatrixMulImpl::AlgoBase* matmul_algo) override; | |||||
void exec_im2col( | |||||
const WorkspaceBundle& bundle, const WorkspaceBundle& bundle_thread, | |||||
const StrategyParam& sparam, | |||||
const fallback::ConvBiasImpl::NCBKernParam& param, | |||||
fallback::MatrixMulImpl::KernParam matmul_param, | |||||
const fallback::MatrixMulImpl::AlgoBase* matmul_algo) override; | |||||
void exec_postprocess(const fallback::ConvBiasImpl::NCBKernParam& param, | void exec_postprocess(const fallback::ConvBiasImpl::NCBKernParam& param, | ||||
const StrategyParam& sparam, | const StrategyParam& sparam, | ||||
WorkspaceBundle bundle_thread) override { | |||||
const WorkspaceBundle& bundle_thread) override { | |||||
do_postprocess<bias_ctype, dst_ctype, op_ctype, op_dtype, | do_postprocess<bias_ctype, dst_ctype, op_ctype, op_dtype, | ||||
postprocess_mode>(param, sparam, bundle_thread, | postprocess_mode>(param, sparam, bundle_thread, | ||||
THREAD_BUNDLE_MATMULDST_INDEX, | THREAD_BUNDLE_MATMULDST_INDEX, | ||||
@@ -423,7 +425,7 @@ public: | |||||
Strategy() = default; | Strategy() = default; | ||||
void packA_kern( | void packA_kern( | ||||
WorkspaceBundle bundle, | |||||
const WorkspaceBundle& bundle, | |||||
const fallback::ConvBiasImpl::NCBKernParam& param, | const fallback::ConvBiasImpl::NCBKernParam& param, | ||||
fallback::MatrixMulImpl::KernSizeParam matmulparam, | fallback::MatrixMulImpl::KernSizeParam matmulparam, | ||||
const fallback::MatrixMulImpl::AlgoBase* matmul_algo, | const fallback::MatrixMulImpl::AlgoBase* matmul_algo, | ||||
@@ -431,21 +433,21 @@ public: | |||||
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& MDsec, | const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& MDsec, | ||||
size_t pack_size) override; | size_t pack_size) override; | ||||
void exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread, | |||||
const StrategyParam& sparam, | |||||
const fallback::ConvBiasImpl::NCBKernParam& param, | |||||
fallback::MatrixMulImpl::KernParam matmul_param, | |||||
const fallback::MatrixMulImpl::AlgoBase* matmul_algo) override; | |||||
void exec_matmul( | |||||
void exec_im2col( | |||||
const WorkspaceBundle& bundle, const WorkspaceBundle& bundle_thread, | |||||
const StrategyParam& sparam, | |||||
const fallback::ConvBiasImpl::NCBKernParam& param, | const fallback::ConvBiasImpl::NCBKernParam& param, | ||||
const StrategyParam& sparam, WorkspaceBundle bundle, | |||||
WorkspaceBundle bundle_thread, | |||||
fallback::MatrixMulImpl::KernParam matmul_param, | fallback::MatrixMulImpl::KernParam matmul_param, | ||||
const fallback::MatrixMulImpl::AlgoBase* matmul_algo, | |||||
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, | |||||
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& matmul_desc | |||||
) override; | |||||
const fallback::MatrixMulImpl::AlgoBase* matmul_algo) override; | |||||
void exec_matmul(const fallback::ConvBiasImpl::NCBKernParam& param, | |||||
const StrategyParam& sparam, const WorkspaceBundle& bundle, | |||||
const WorkspaceBundle& bundle_thread, | |||||
fallback::MatrixMulImpl::KernParam matmul_param, | |||||
const fallback::MatrixMulImpl::AlgoBase* matmul_algo, | |||||
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, | |||||
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& | |||||
matmul_desc) override; | |||||
void* get_matmul_dst_ptr(const fallback::ConvBiasImpl::NCBKernParam& param, | void* get_matmul_dst_ptr(const fallback::ConvBiasImpl::NCBKernParam& param, | ||||
const WorkspaceBundle& bundle_thread, | const WorkspaceBundle& bundle_thread, | ||||
@@ -453,7 +455,7 @@ public: | |||||
void exec_postprocess(const fallback::ConvBiasImpl::NCBKernParam& param, | void exec_postprocess(const fallback::ConvBiasImpl::NCBKernParam& param, | ||||
const StrategyParam& sparam, | const StrategyParam& sparam, | ||||
WorkspaceBundle bundle_thread) override { | |||||
const WorkspaceBundle& bundle_thread) override { | |||||
do_postprocess<bias_ctype, dst_ctype, op_ctype, op_dtype, | do_postprocess<bias_ctype, dst_ctype, op_ctype, op_dtype, | ||||
postprocess_mode>(param, sparam, bundle_thread, | postprocess_mode>(param, sparam, bundle_thread, | ||||
THREAD_BUNDLE_MATMULDST_INDEX, | THREAD_BUNDLE_MATMULDST_INDEX, | ||||
@@ -476,11 +478,12 @@ public: | |||||
constexpr static size_t THREAD_BUNDLE_IM2COL_INDEX = 1; | constexpr static size_t THREAD_BUNDLE_IM2COL_INDEX = 1; | ||||
constexpr static size_t THREAD_BUNDLE_BIAS_INDEX = 2; | constexpr static size_t THREAD_BUNDLE_BIAS_INDEX = 2; | ||||
void exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread, | |||||
const StrategyParam& sparam, | |||||
const fallback::ConvBiasImpl::NCBKernParam& param, | |||||
fallback::MatrixMulImpl::KernParam matmul_param, | |||||
const fallback::MatrixMulImpl::AlgoBase* matmul_algo) override; | |||||
void exec_im2col( | |||||
const WorkspaceBundle& bundle, const WorkspaceBundle& bundle_thread, | |||||
const StrategyParam& sparam, | |||||
const fallback::ConvBiasImpl::NCBKernParam& param, | |||||
fallback::MatrixMulImpl::KernParam matmul_param, | |||||
const fallback::MatrixMulImpl::AlgoBase* matmul_algo) override; | |||||
}; | }; | ||||
template <typename op_ctype, typename op_dtype, | template <typename op_ctype, typename op_dtype, | ||||
@@ -498,11 +501,12 @@ public: | |||||
constexpr static size_t THREAD_BUNDLE_IM2COL_INDEX = 1; | constexpr static size_t THREAD_BUNDLE_IM2COL_INDEX = 1; | ||||
constexpr static size_t THREAD_BUNDLE_BIAS_INDEX = 2; | constexpr static size_t THREAD_BUNDLE_BIAS_INDEX = 2; | ||||
void exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread, | |||||
const StrategyParam& sparam, | |||||
const fallback::ConvBiasImpl::NCBKernParam& param, | |||||
fallback::MatrixMulImpl::KernParam matmul_param, | |||||
const fallback::MatrixMulImpl::AlgoBase* matmul_algo) override; | |||||
void exec_im2col( | |||||
const WorkspaceBundle& bundle, const WorkspaceBundle& bundle_thread, | |||||
const StrategyParam& sparam, | |||||
const fallback::ConvBiasImpl::NCBKernParam& param, | |||||
fallback::MatrixMulImpl::KernParam matmul_param, | |||||
const fallback::MatrixMulImpl::AlgoBase* matmul_algo) override; | |||||
}; | }; | ||||
@@ -521,11 +525,12 @@ public: | |||||
constexpr static size_t THREAD_BUNDLE_IM2COL_INDEX = 1; | constexpr static size_t THREAD_BUNDLE_IM2COL_INDEX = 1; | ||||
constexpr static size_t THREAD_BUNDLE_BIAS_INDEX = 2; | constexpr static size_t THREAD_BUNDLE_BIAS_INDEX = 2; | ||||
void exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread, | |||||
const StrategyParam& sparam, | |||||
const fallback::ConvBiasImpl::NCBKernParam& param, | |||||
fallback::MatrixMulImpl::KernParam matmul_param, | |||||
const fallback::MatrixMulImpl::AlgoBase* matmul_algo) override; | |||||
void exec_im2col( | |||||
const WorkspaceBundle& bundle, const WorkspaceBundle& bundle_thread, | |||||
const StrategyParam& sparam, | |||||
const fallback::ConvBiasImpl::NCBKernParam& param, | |||||
fallback::MatrixMulImpl::KernParam matmul_param, | |||||
const fallback::MatrixMulImpl::AlgoBase* matmul_algo) override; | |||||
}; | }; | ||||
#endif | #endif | ||||
@@ -18,7 +18,7 @@ template <typename src_ctype, typename bias_ctype, typename dst_ctype, | |||||
megdnn::PostprocessMode postprocess_mode> | megdnn::PostprocessMode postprocess_mode> | ||||
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | ||||
postprocess_mode, PackMode::DEFAULT>:: | postprocess_mode, PackMode::DEFAULT>:: | ||||
packA_kern(WorkspaceBundle bundle, | |||||
packA_kern(const WorkspaceBundle& bundle, | |||||
const fallback::ConvBiasImpl::NCBKernParam& param, | const fallback::ConvBiasImpl::NCBKernParam& param, | ||||
fallback::MatrixMulImpl::KernSizeParam matmulparam, | fallback::MatrixMulImpl::KernSizeParam matmulparam, | ||||
const fallback::MatrixMulImpl::AlgoBase* matmul_algo, | const fallback::MatrixMulImpl::AlgoBase* matmul_algo, | ||||
@@ -26,7 +26,6 @@ void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | |||||
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& | const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& | ||||
matmul_desc, | matmul_desc, | ||||
size_t) { | size_t) { | ||||
bundle.set(param.workspace_ptr); | |||||
fallback::MatrixMulImpl::KernParam matmul_param; | fallback::MatrixMulImpl::KernParam matmul_param; | ||||
size_t group_id = ncb_index.ndrange_id[0]; | size_t group_id = ncb_index.ndrange_id[0]; | ||||
static_cast<fallback::MatrixMulImpl::KernSizeParam&>(matmul_param) = | static_cast<fallback::MatrixMulImpl::KernSizeParam&>(matmul_param) = | ||||
@@ -50,7 +49,8 @@ template <typename src_ctype, typename bias_ctype, typename dst_ctype, | |||||
megdnn::PostprocessMode postprocess_mode> | megdnn::PostprocessMode postprocess_mode> | ||||
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | ||||
postprocess_mode, PackMode::DEFAULT>:: | postprocess_mode, PackMode::DEFAULT>:: | ||||
exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread, | |||||
exec_im2col(const WorkspaceBundle& bundle, | |||||
const WorkspaceBundle& bundle_thread, | |||||
const StrategyParam& sparam, | const StrategyParam& sparam, | ||||
const fallback::ConvBiasImpl::NCBKernParam& param, | const fallback::ConvBiasImpl::NCBKernParam& param, | ||||
fallback::MatrixMulImpl::KernParam matmul_param, | fallback::MatrixMulImpl::KernParam matmul_param, | ||||
@@ -139,8 +139,8 @@ template <typename src_ctype, typename bias_ctype, typename dst_ctype, | |||||
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | ||||
postprocess_mode, PackMode::DEFAULT>:: | postprocess_mode, PackMode::DEFAULT>:: | ||||
exec_matmul(const fallback::ConvBiasImpl::NCBKernParam& param, | exec_matmul(const fallback::ConvBiasImpl::NCBKernParam& param, | ||||
const StrategyParam& sparam, WorkspaceBundle bundle, | |||||
WorkspaceBundle bundle_thread, | |||||
const StrategyParam& sparam, const WorkspaceBundle& bundle, | |||||
const WorkspaceBundle& bundle_thread, | |||||
fallback::MatrixMulImpl::KernParam matmul_param, | fallback::MatrixMulImpl::KernParam matmul_param, | ||||
const fallback::MatrixMulImpl::AlgoBase* matmul_algo, | const fallback::MatrixMulImpl::AlgoBase* matmul_algo, | ||||
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, | const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, | ||||
@@ -29,7 +29,8 @@ template <typename src_ctype, typename bias_ctype, typename dst_ctype, | |||||
megdnn::PostprocessMode postprocess_mode> | megdnn::PostprocessMode postprocess_mode> | ||||
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | ||||
postprocess_mode, PackMode::DEFAULT, FormatMode::NCHW44>:: | postprocess_mode, PackMode::DEFAULT, FormatMode::NCHW44>:: | ||||
exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread, | |||||
exec_im2col(const WorkspaceBundle& bundle, | |||||
const WorkspaceBundle& bundle_thread, | |||||
const StrategyParam& sparam, | const StrategyParam& sparam, | ||||
const fallback::ConvBiasImpl::NCBKernParam& param, | const fallback::ConvBiasImpl::NCBKernParam& param, | ||||
fallback::MatrixMulImpl::KernParam matmul_param, | fallback::MatrixMulImpl::KernParam matmul_param, | ||||
@@ -169,7 +169,8 @@ void naive_fuse_im2col_packB(dt_int8* src, size_t ic, size_t iw, size_t ih, | |||||
template <typename op_ctype, typename op_dtype, | template <typename op_ctype, typename op_dtype, | ||||
megdnn::PostprocessMode postprocess_mode> | megdnn::PostprocessMode postprocess_mode> | ||||
void StrategyFuse4x4x16Nchw44<op_ctype, op_dtype, postprocess_mode>:: | void StrategyFuse4x4x16Nchw44<op_ctype, op_dtype, postprocess_mode>:: | ||||
exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread, | |||||
exec_im2col(const WorkspaceBundle& bundle, | |||||
const WorkspaceBundle& bundle_thread, | |||||
const StrategyParam& sparam, | const StrategyParam& sparam, | ||||
const fallback::ConvBiasImpl::NCBKernParam& param, | const fallback::ConvBiasImpl::NCBKernParam& param, | ||||
fallback::MatrixMulImpl::KernParam, | fallback::MatrixMulImpl::KernParam, | ||||
@@ -172,7 +172,8 @@ void fuse_packb(const dt_int8* __restrict src, dt_int8* __restrict dst, | |||||
template <typename op_ctype, typename op_dtype, | template <typename op_ctype, typename op_dtype, | ||||
megdnn::PostprocessMode postprocess_mode> | megdnn::PostprocessMode postprocess_mode> | ||||
void StrategyFuse8x12x4Nchw44Dot<op_ctype, op_dtype, postprocess_mode>:: | void StrategyFuse8x12x4Nchw44Dot<op_ctype, op_dtype, postprocess_mode>:: | ||||
exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread, | |||||
exec_im2col(const WorkspaceBundle& bundle, | |||||
const WorkspaceBundle& bundle_thread, | |||||
const StrategyParam& sparam, | const StrategyParam& sparam, | ||||
const fallback::ConvBiasImpl::NCBKernParam& param, | const fallback::ConvBiasImpl::NCBKernParam& param, | ||||
fallback::MatrixMulImpl::KernParam /*matmul_param*/, | fallback::MatrixMulImpl::KernParam /*matmul_param*/, | ||||
@@ -207,7 +208,6 @@ void StrategyFuse8x12x4Nchw44Dot<op_ctype, op_dtype, postprocess_mode>:: | |||||
sparam.output_block_size); | sparam.output_block_size); | ||||
} | } | ||||
namespace megdnn { | namespace megdnn { | ||||
template class StrategyFuse8x12x4Nchw44Dot<dt_qint32, dt_qint8, | template class StrategyFuse8x12x4Nchw44Dot<dt_qint32, dt_qint8, | ||||
@@ -164,7 +164,8 @@ void fuse_packb(const float* __restrict src, float* __restrict dst, | |||||
template <typename op_ctype, typename op_dtype, | template <typename op_ctype, typename op_dtype, | ||||
megdnn::PostprocessMode postprocess_mode> | megdnn::PostprocessMode postprocess_mode> | ||||
void StrategyFuse8x12x1Nchw44K3x3S2<op_ctype, op_dtype, postprocess_mode>:: | void StrategyFuse8x12x1Nchw44K3x3S2<op_ctype, op_dtype, postprocess_mode>:: | ||||
exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread, | |||||
exec_im2col(const WorkspaceBundle& bundle, | |||||
const WorkspaceBundle& bundle_thread, | |||||
const StrategyParam& sparam, | const StrategyParam& sparam, | ||||
const fallback::ConvBiasImpl::NCBKernParam& param, | const fallback::ConvBiasImpl::NCBKernParam& param, | ||||
fallback::MatrixMulImpl::KernParam /*matmul_param*/, | fallback::MatrixMulImpl::KernParam /*matmul_param*/, | ||||
@@ -19,7 +19,7 @@ template <typename src_ctype, typename bias_ctype, typename dst_ctype, | |||||
megdnn::PostprocessMode postprocess_mode> | megdnn::PostprocessMode postprocess_mode> | ||||
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | ||||
postprocess_mode, PackMode::NO_PACK>:: | postprocess_mode, PackMode::NO_PACK>:: | ||||
packA_kern(WorkspaceBundle bundle, | |||||
packA_kern(const WorkspaceBundle& bundle, | |||||
const fallback::ConvBiasImpl::NCBKernParam& param, | const fallback::ConvBiasImpl::NCBKernParam& param, | ||||
fallback::MatrixMulImpl::KernSizeParam matmulparam, | fallback::MatrixMulImpl::KernSizeParam matmulparam, | ||||
const fallback::MatrixMulImpl::AlgoBase* matmul_algo, | const fallback::MatrixMulImpl::AlgoBase* matmul_algo, | ||||
@@ -61,8 +61,8 @@ template <typename src_ctype, typename bias_ctype, typename dst_ctype, | |||||
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | ||||
postprocess_mode, PackMode::NO_PACK>:: | postprocess_mode, PackMode::NO_PACK>:: | ||||
exec_matmul(const fallback::ConvBiasImpl::NCBKernParam& param, | exec_matmul(const fallback::ConvBiasImpl::NCBKernParam& param, | ||||
const StrategyParam& sparam, WorkspaceBundle bundle, | |||||
WorkspaceBundle bundle_thread, | |||||
const StrategyParam& sparam, const WorkspaceBundle& bundle, | |||||
const WorkspaceBundle& bundle_thread, | |||||
fallback::MatrixMulImpl::KernParam matmul_param, | fallback::MatrixMulImpl::KernParam matmul_param, | ||||
const fallback::MatrixMulImpl::AlgoBase* matmul_algo, | const fallback::MatrixMulImpl::AlgoBase* matmul_algo, | ||||
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, | const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, | ||||
@@ -96,7 +96,8 @@ template <typename src_ctype, typename bias_ctype, typename dst_ctype, | |||||
megdnn::PostprocessMode postprocess_mode> | megdnn::PostprocessMode postprocess_mode> | ||||
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | ||||
postprocess_mode, PackMode::NO_PACK>:: | postprocess_mode, PackMode::NO_PACK>:: | ||||
exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread, | |||||
exec_im2col(const WorkspaceBundle& bundle, | |||||
const WorkspaceBundle& bundle_thread, | |||||
const StrategyParam& sparam, | const StrategyParam& sparam, | ||||
const fallback::ConvBiasImpl::NCBKernParam& param, | const fallback::ConvBiasImpl::NCBKernParam& param, | ||||
fallback::MatrixMulImpl::KernParam matmul_param, | fallback::MatrixMulImpl::KernParam matmul_param, | ||||
@@ -19,7 +19,7 @@ template <typename src_ctype, typename bias_ctype, typename dst_ctype, | |||||
megdnn::PostprocessMode postprocess_mode> | megdnn::PostprocessMode postprocess_mode> | ||||
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | ||||
postprocess_mode, PackMode::ONLY_PACKA>:: | postprocess_mode, PackMode::ONLY_PACKA>:: | ||||
packA_kern(WorkspaceBundle bundle, | |||||
packA_kern(const WorkspaceBundle& bundle, | |||||
const fallback::ConvBiasImpl::NCBKernParam& param, | const fallback::ConvBiasImpl::NCBKernParam& param, | ||||
fallback::MatrixMulImpl::KernSizeParam matmulparam, | fallback::MatrixMulImpl::KernSizeParam matmulparam, | ||||
const fallback::MatrixMulImpl::AlgoBase* matmul_algo, | const fallback::MatrixMulImpl::AlgoBase* matmul_algo, | ||||
@@ -27,7 +27,6 @@ void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | |||||
const fallback::MatrixMulImpl::AlgoBase:: | const fallback::MatrixMulImpl::AlgoBase:: | ||||
MatmulDescription& /*matmul_desc*/, | MatmulDescription& /*matmul_desc*/, | ||||
size_t) { | size_t) { | ||||
bundle.set(param.workspace_ptr); | |||||
fallback::MatrixMulImpl::KernParam matmul_param; | fallback::MatrixMulImpl::KernParam matmul_param; | ||||
static_cast<fallback::MatrixMulImpl::KernSizeParam&>(matmul_param) = | static_cast<fallback::MatrixMulImpl::KernSizeParam&>(matmul_param) = | ||||
matmulparam; | matmulparam; | ||||
@@ -56,8 +55,8 @@ template <typename src_ctype, typename bias_ctype, typename dst_ctype, | |||||
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | ||||
postprocess_mode, PackMode::ONLY_PACKA>:: | postprocess_mode, PackMode::ONLY_PACKA>:: | ||||
exec_matmul(const fallback::ConvBiasImpl::NCBKernParam& param, | exec_matmul(const fallback::ConvBiasImpl::NCBKernParam& param, | ||||
const StrategyParam& sparam, WorkspaceBundle bundle, | |||||
WorkspaceBundle bundle_thread, | |||||
const StrategyParam& sparam, const WorkspaceBundle& bundle, | |||||
const WorkspaceBundle& bundle_thread, | |||||
fallback::MatrixMulImpl::KernParam matmul_param, | fallback::MatrixMulImpl::KernParam matmul_param, | ||||
const fallback::MatrixMulImpl::AlgoBase* matmul_algo, | const fallback::MatrixMulImpl::AlgoBase* matmul_algo, | ||||
const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, | const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, | ||||
@@ -96,7 +95,8 @@ template <typename src_ctype, typename bias_ctype, typename dst_ctype, | |||||
megdnn::PostprocessMode postprocess_mode> | megdnn::PostprocessMode postprocess_mode> | ||||
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | ||||
postprocess_mode, PackMode::ONLY_PACKA>:: | postprocess_mode, PackMode::ONLY_PACKA>:: | ||||
exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread, | |||||
exec_im2col(const WorkspaceBundle& bundle, | |||||
const WorkspaceBundle& bundle_thread, | |||||
const StrategyParam& sparam, | const StrategyParam& sparam, | ||||
const fallback::ConvBiasImpl::NCBKernParam& param, | const fallback::ConvBiasImpl::NCBKernParam& param, | ||||
fallback::MatrixMulImpl::KernParam matmul_param, | fallback::MatrixMulImpl::KernParam matmul_param, | ||||
@@ -194,12 +194,12 @@ public: | |||||
IC, 0, OC); | IC, 0, OC); | ||||
} | } | ||||
static void filter_process(Strategy strategy, WorkspaceBundle bundle_top, | |||||
WorkspaceBundle bundle_compute, | |||||
static void filter_process(Strategy strategy, | |||||
const WorkspaceBundle& bundle_top, | |||||
const WorkspaceBundle& bundle_compute, | |||||
const NCBKernParam& kern_param, | const NCBKernParam& kern_param, | ||||
const NCBKernIndex& ncb_index) { | const NCBKernIndex& ncb_index) { | ||||
bundle_top.set(kern_param.workspace_ptr); | |||||
bundle_compute.set(bundle_top.get(0)); | |||||
size_t compute_workspace_size_per_thread = | size_t compute_workspace_size_per_thread = | ||||
bundle_compute.total_size_in_bytes(); | bundle_compute.total_size_in_bytes(); | ||||
size_t thread_id = ncb_index.thread_id; | size_t thread_id = ncb_index.thread_id; | ||||
@@ -236,8 +236,8 @@ public: | |||||
} | } | ||||
static void winograd_compute( | static void winograd_compute( | ||||
Strategy strategy, WorkspaceBundle bundle_top, | |||||
WorkspaceBundle bundle_compute, | |||||
Strategy strategy, const WorkspaceBundle& bundle_top, | |||||
const WorkspaceBundle& bundle_compute, | |||||
fallback::MatrixMulImpl::AlgoBase* matmul_algo, | fallback::MatrixMulImpl::AlgoBase* matmul_algo, | ||||
fallback::MatrixMulImpl::KernParam matmul_param, | fallback::MatrixMulImpl::KernParam matmul_param, | ||||
size_t unit_tile_size, size_t unit_oc_size, | size_t unit_tile_size, size_t unit_oc_size, | ||||
@@ -265,9 +265,6 @@ public: | |||||
size_t group_id = ncb_index.ndrange_id[0]; | size_t group_id = ncb_index.ndrange_id[0]; | ||||
size_t thread_id = ncb_index.thread_id; | size_t thread_id = ncb_index.thread_id; | ||||
bundle_top.set(ncb_param.workspace_ptr); | |||||
bundle_compute.set(bundle_top.get(0)); | |||||
const stype* src_ptr = ncb_param.src<stype>(batch_id, group_id); | const stype* src_ptr = ncb_param.src<stype>(batch_id, group_id); | ||||
dst_type* dst_ptr = ncb_param.dst<dst_type>(batch_id, group_id); | dst_type* dst_ptr = ncb_param.dst<dst_type>(batch_id, group_id); | ||||
const output_compute_type* bias_ptr = | const output_compute_type* bias_ptr = | ||||
@@ -419,14 +416,16 @@ public: | |||||
param.filter_meta.format == param::ConvBias::Format::NCHW44) { | param.filter_meta.format == param::ConvBias::Format::NCHW44) { | ||||
//! probably a gcc bug, lambda requires capturing 'this' to call | //! probably a gcc bug, lambda requires capturing 'this' to call | ||||
//! static member function | //! static member function | ||||
auto filter_process_kern = [this, strategy, bundle_top, | |||||
bundle_compute]( | |||||
const NCBKernParam& ncb_param, | |||||
const NCBKernIndex& ncb_index) { | |||||
MEGDNN_MARK_USED_VAR(this); | |||||
filter_process(strategy, bundle_top, bundle_compute, ncb_param, | |||||
std::move(ncb_index)); | |||||
}; | |||||
auto filter_process_kern = | |||||
[this, strategy, bundle_top, bundle_compute]( | |||||
const NCBKernParam& ncb_param, | |||||
const NCBKernIndex& ncb_index) mutable { | |||||
MEGDNN_MARK_USED_VAR(this); | |||||
bundle_top.set(ncb_param.workspace_ptr); | |||||
bundle_compute.set(bundle_top.get(0)); | |||||
filter_process(strategy, bundle_top, bundle_compute, | |||||
ncb_param, std::move(ncb_index)); | |||||
}; | |||||
size_t oc_parallelism = OC; | size_t oc_parallelism = OC; | ||||
if (param.filter_meta.format == param::ConvBias::Format::NCHW88) { | if (param.filter_meta.format == param::ConvBias::Format::NCHW88) { | ||||
megdnn_assert(OC % 8 == 0); | megdnn_assert(OC % 8 == 0); | ||||
@@ -438,18 +437,22 @@ public: | |||||
} | } | ||||
kerns.push_back({filter_process_kern, {GROUP, 1, oc_parallelism}}); | kerns.push_back({filter_process_kern, {GROUP, 1, oc_parallelism}}); | ||||
} | } | ||||
auto winograd_compute_kern = [strategy, bundle_top, bundle_compute, | |||||
matmul_algo, matmul_param, unit_tile_size, | |||||
unit_oc_size]( | |||||
const NCBKernParam& ncb_param, | |||||
const NCBKernIndex& ncb_index) { | |||||
MIDOUT_BEGIN(megdnn_fallback_conv_bias_winograd_common, 0, 0) { | |||||
winograd_compute(strategy, bundle_top, bundle_compute, | |||||
matmul_algo, matmul_param, unit_tile_size, | |||||
unit_oc_size, ncb_param, std::move(ncb_index)); | |||||
} | |||||
MIDOUT_END(); | |||||
}; | |||||
auto winograd_compute_kern = | |||||
[strategy, bundle_top, bundle_compute, matmul_algo, | |||||
matmul_param, unit_tile_size, | |||||
unit_oc_size](const NCBKernParam& ncb_param, | |||||
const NCBKernIndex& ncb_index) mutable { | |||||
MIDOUT_BEGIN(megdnn_fallback_conv_bias_winograd_common, 0, | |||||
0) { | |||||
bundle_top.set(ncb_param.workspace_ptr); | |||||
bundle_compute.set(bundle_top.get(0)); | |||||
winograd_compute(strategy, bundle_top, bundle_compute, | |||||
matmul_algo, matmul_param, | |||||
unit_tile_size, unit_oc_size, | |||||
ncb_param, std::move(ncb_index)); | |||||
} | |||||
MIDOUT_END(); | |||||
}; | |||||
kerns.push_back( | kerns.push_back( | ||||
{winograd_compute_kern, {GROUP, N, nr_hw_tiles, nr_oc_tiles}}); | {winograd_compute_kern, {GROUP, N, nr_hw_tiles, nr_oc_tiles}}); | ||||
return kerns; | return kerns; | ||||
@@ -186,10 +186,7 @@ public: | |||||
*/ | */ | ||||
#define MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN(_handle, _parallelism, _stmt) \ | #define MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN(_handle, _parallelism, _stmt) \ | ||||
do { \ | do { \ | ||||
auto _kern = [=](size_t index, size_t thread_id) { \ | |||||
_stmt(index, thread_id); \ | |||||
}; \ | |||||
_handle->dispatch_kern(_kern, _parallelism); \ | |||||
_handle->dispatch_kern(_stmt, _parallelism); \ | |||||
} while (0) | } while (0) | ||||
//! dispatch kern on current opr | //! dispatch kern on current opr | ||||
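The macro hunk above drops the forwarding lambda from MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN, since _stmt is already a callable taking (index, thread_id) and can be handed to dispatch_kern directly. A small sketch of that equivalence, with a stand-in dispatch_kern because the real handle type is not shown in this diff:

    #include <cstddef>
    #include <cstdio>
    #include <functional>

    // Stand-in for the handle's dispatch_kern(): calls the kernel for every
    // (index, thread_id) pair of a pretend 2-thread pool.
    void dispatch_kern(const std::function<void(std::size_t, std::size_t)>& kern,
                       std::size_t parallelism) {
        for (std::size_t i = 0; i < parallelism; ++i)
            for (std::size_t tid = 0; tid < 2; ++tid)
                kern(i, tid);
    }

    int main() {
        auto stmt = [](std::size_t index, std::size_t thread_id) {
            std::printf("index=%zu thread=%zu\n", index, thread_id);
        };

        // Old form of the macro body: a redundant forwarding lambda around _stmt.
        auto kern = [=](std::size_t index, std::size_t thread_id) {
            stmt(index, thread_id);
        };
        dispatch_kern(kern, 2);

        // New form: _stmt already matches the expected signature.
        dispatch_kern(stmt, 2);
        return 0;
    }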
@@ -58,45 +58,47 @@ void get_rectified_size(size_t IH, size_t IW, size_t OH, size_t OW, size_t FH, | |||||
} | } | ||||
} // namespace | } // namespace | ||||
#define GET_KERN \ | |||||
auto fm = param.filter_meta; \ | |||||
size_t N = param.n; \ | |||||
size_t IC = param.filter_meta.icpg; \ | |||||
size_t OC = param.filter_meta.ocpg; \ | |||||
size_t group = fm.group; \ | |||||
WorkspaceBundle wbundle = get_bundle(param); \ | |||||
SmallVector<NCBKern> ret_kerns; \ | |||||
if (m_large_group) { \ | |||||
auto exec_one_group = [wbundle](const NCBKernParam& kern_param, \ | |||||
const NCBKernIndex& ncb_index) { \ | |||||
auto fm = kern_param.filter_meta; \ | |||||
size_t IC = fm.icpg; \ | |||||
size_t OC = fm.ocpg; \ | |||||
WorkspaceBundle bundle = wbundle; \ | |||||
for (size_t ic = 0; ic < IC; ic++) { \ | |||||
copy_padding_kern(bundle, kern_param, ncb_index, \ | |||||
{ncb_index.thread_id, 0, ic}); \ | |||||
} \ | |||||
for (size_t oc = 0; oc < OC; oc++) { \ | |||||
do_conv_kern(bundle, kern_param, ncb_index, \ | |||||
{ncb_index.thread_id, 0, oc}); \ | |||||
} \ | |||||
}; \ | |||||
ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); \ | |||||
} else { \ | |||||
auto copy_padding = [wbundle](const NCBKernParam& kern_param, \ | |||||
const NCBKernIndex& ncb_index) { \ | |||||
copy_padding_kern(wbundle, kern_param, ncb_index, \ | |||||
ncb_index.ndrange_id); \ | |||||
}; \ | |||||
ret_kerns.push_back({copy_padding, {group, N, IC}}); \ | |||||
auto do_conv = [wbundle](const NCBKernParam& kern_param, \ | |||||
const NCBKernIndex& ncb_index) { \ | |||||
do_conv_kern(wbundle, kern_param, ncb_index, \ | |||||
ncb_index.ndrange_id); \ | |||||
}; \ | |||||
ret_kerns.push_back({do_conv, {group, N, OC}}); \ | |||||
} \ | |||||
#define GET_KERN \ | |||||
auto fm = param.filter_meta; \ | |||||
size_t N = param.n; \ | |||||
size_t IC = param.filter_meta.icpg; \ | |||||
size_t OC = param.filter_meta.ocpg; \ | |||||
size_t group = fm.group; \ | |||||
WorkspaceBundle bundle = get_bundle(param); \ | |||||
SmallVector<NCBKern> ret_kerns; \ | |||||
if (m_large_group) { \ | |||||
auto exec_one_group = [bundle]( \ | |||||
const NCBKernParam& kern_param, \ | |||||
const NCBKernIndex& ncb_index) mutable { \ | |||||
bundle.set(kern_param.workspace_ptr); \ | |||||
auto fm = kern_param.filter_meta; \ | |||||
size_t IC = fm.icpg; \ | |||||
size_t OC = fm.ocpg; \ | |||||
for (size_t ic = 0; ic < IC; ic++) { \ | |||||
copy_padding_kern(bundle, kern_param, ncb_index, \ | |||||
{ncb_index.thread_id, 0, ic}); \ | |||||
} \ | |||||
for (size_t oc = 0; oc < OC; oc++) { \ | |||||
do_conv_kern(bundle, kern_param, ncb_index, \ | |||||
{ncb_index.thread_id, 0, oc}); \ | |||||
} \ | |||||
}; \ | |||||
ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); \ | |||||
} else { \ | |||||
auto copy_padding = [bundle](const NCBKernParam& kern_param, \ | |||||
const NCBKernIndex& ncb_index) mutable { \ | |||||
bundle.set(kern_param.workspace_ptr); \ | |||||
copy_padding_kern(bundle, kern_param, ncb_index, \ | |||||
ncb_index.ndrange_id); \ | |||||
}; \ | |||||
ret_kerns.push_back({copy_padding, {group, N, IC}}); \ | |||||
auto do_conv = [bundle](const NCBKernParam& kern_param, \ | |||||
const NCBKernIndex& ncb_index) mutable { \ | |||||
bundle.set(kern_param.workspace_ptr); \ | |||||
do_conv_kern(bundle, kern_param, ncb_index, ncb_index.ndrange_id); \ | |||||
}; \ | |||||
ret_kerns.push_back({do_conv, {group, N, OC}}); \ | |||||
} \ | |||||
return ret_kerns; | return ret_kerns; | ||||
/* ===================== direct algo ===================== */ | /* ===================== direct algo ===================== */ | ||||
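The rewritten GET_KERN macro performs bundle.set(kern_param.workspace_ptr) inside each kernel lambda. Since the lambdas capture the bundle by value and set() mutates it, they must be declared mutable; the stripped-down example below illustrates only that language rule, with placeholder names rather than the real kernels:

    #include <cstdio>

    struct Bundle {
        void* ptr = nullptr;
        void set(void* p) { ptr = p; }  // non-const: mutates the captured copy
    };

    int main() {
        Bundle bundle;
        int workspace = 0;

        // Without `mutable` this does not compile: a by-value capture is part
        // of the closure object, and the closure's call operator is const by
        // default, so set() could not modify it.
        auto do_conv = [bundle](void* workspace_ptr) mutable {
            bundle.set(workspace_ptr);  // rebind once per kernel invocation
            std::printf("workspace bound at %p\n", bundle.ptr);
        };

        do_conv(&workspace);
        return 0;
    }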
@@ -146,7 +148,8 @@ size_t ConvBiasImpl::AlgoDirect::get_workspace( | |||||
//! Process one input channel copy padding | //! Process one input channel copy padding | ||||
void ConvBiasImpl::AlgoDirect::copy_padding_kern( | void ConvBiasImpl::AlgoDirect::copy_padding_kern( | ||||
WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param, | |||||
const WorkspaceBundle& bundle, | |||||
const ConvBiasImpl::NCBKernParam& kern_param, | |||||
const ConvBiasImpl::NCBKernIndex& ncb_index, | const ConvBiasImpl::NCBKernIndex& ncb_index, | ||||
const CpuNDRange& workspace_ids) { | const CpuNDRange& workspace_ids) { | ||||
size_t IH = kern_param.isz[0]; | size_t IH = kern_param.isz[0]; | ||||
@@ -169,7 +172,6 @@ void ConvBiasImpl::AlgoDirect::copy_padding_kern( | |||||
const float* sptr = static_cast<const float*>( | const float* sptr = static_cast<const float*>( | ||||
kern_param.src<float>(batch_id, group_id)) + | kern_param.src<float>(batch_id, group_id)) + | ||||
channel_id * IH * IW; | channel_id * IH * IW; | ||||
bundle.set(kern_param.workspace_ptr); | |||||
//! Used for get the workspace offset | //! Used for get the workspace offset | ||||
size_t workspace_group_id = workspace_ids[0], | size_t workspace_group_id = workspace_ids[0], | ||||
@@ -239,7 +241,7 @@ void ConvBiasImpl::AlgoDirect::copy_padding_kern( | |||||
func = detail::convolution_##mode##_fh##fsize##_##simd; | func = detail::convolution_##mode##_fh##fsize##_##simd; | ||||
//! compute one output channel | //! compute one output channel | ||||
void ConvBiasImpl::AlgoDirect::do_conv_kern(WorkspaceBundle bundle, | |||||
void ConvBiasImpl::AlgoDirect::do_conv_kern(const WorkspaceBundle& bundle, | |||||
const NCBKernParam& kern_param, | const NCBKernParam& kern_param, | ||||
const NCBKernIndex& ncb_index, | const NCBKernIndex& ncb_index, | ||||
const CpuNDRange& workspace_ids) { | const CpuNDRange& workspace_ids) { | ||||
@@ -265,7 +267,6 @@ void ConvBiasImpl::AlgoDirect::do_conv_kern(WorkspaceBundle bundle, | |||||
func = nullptr; | func = nullptr; | ||||
DISPATCH; | DISPATCH; | ||||
bundle.set(kern_param.workspace_ptr); | |||||
size_t bias_offset = 0; | size_t bias_offset = 0; | ||||
if (kern_param.bias_mode == megdnn::BiasMode::BIAS) { | if (kern_param.bias_mode == megdnn::BiasMode::BIAS) { | ||||
bias_offset = OH * OW; | bias_offset = OH * OW; | ||||
@@ -367,7 +368,8 @@ size_t ConvBiasImpl::AlgoDirectStride2::get_workspace( | |||||
} | } | ||||
//! Process one input channel copy padding | //! Process one input channel copy padding | ||||
void ConvBiasImpl::AlgoDirectStride2::copy_padding_kern( | void ConvBiasImpl::AlgoDirectStride2::copy_padding_kern( | ||||
WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param, | |||||
const WorkspaceBundle& bundle, | |||||
const ConvBiasImpl::NCBKernParam& kern_param, | |||||
const ConvBiasImpl::NCBKernIndex& ncb_index, | const ConvBiasImpl::NCBKernIndex& ncb_index, | ||||
const CpuNDRange& workspace_ids) { | const CpuNDRange& workspace_ids) { | ||||
size_t IH = kern_param.isz[0]; | size_t IH = kern_param.isz[0]; | ||||
@@ -390,7 +392,6 @@ void ConvBiasImpl::AlgoDirectStride2::copy_padding_kern( | |||||
const float* sptr = static_cast<const float*>( | const float* sptr = static_cast<const float*>( | ||||
kern_param.src<float>(batch_id, group_id)) + | kern_param.src<float>(batch_id, group_id)) + | ||||
channel_id * IH * IW; | channel_id * IH * IW; | ||||
bundle.set(kern_param.workspace_ptr); | |||||
//! Used for get the workspace offset | //! Used for get the workspace offset | ||||
size_t workspace_group_id = workspace_ids[0], | size_t workspace_group_id = workspace_ids[0], | ||||
workspace_batch_id = workspace_ids[1], | workspace_batch_id = workspace_ids[1], | ||||
@@ -411,7 +412,7 @@ void ConvBiasImpl::AlgoDirectStride2::copy_padding_kern( | |||||
//! compute one output channel | //! compute one output channel | ||||
void ConvBiasImpl::AlgoDirectStride2::do_conv_kern( | void ConvBiasImpl::AlgoDirectStride2::do_conv_kern( | ||||
WorkspaceBundle bundle, const NCBKernParam& kern_param, | |||||
const WorkspaceBundle& bundle, const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { | const NCBKernIndex& ncb_index, const CpuNDRange& workspace_ids) { | ||||
size_t OH = kern_param.osz[0]; | size_t OH = kern_param.osz[0]; | ||||
size_t OW = kern_param.osz[1]; | size_t OW = kern_param.osz[1]; | ||||
@@ -446,7 +447,6 @@ void ConvBiasImpl::AlgoDirectStride2::do_conv_kern( | |||||
func_add_dst = conv_general_simd::do_conv_7x7_stride2<true>; | func_add_dst = conv_general_simd::do_conv_7x7_stride2<true>; | ||||
} | } | ||||
bundle.set(kern_param.workspace_ptr); | |||||
size_t bias_offset = 0; | size_t bias_offset = 0; | ||||
if (kern_param.bias_mode == megdnn::BiasMode::BIAS) { | if (kern_param.bias_mode == megdnn::BiasMode::BIAS) { | ||||
bias_offset = OH * OW; | bias_offset = OH * OW; | ||||
@@ -20,11 +20,11 @@ class ConvBiasImpl::AlgoDirect final : public AlgoBase { | |||||
SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param) const; | SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param) const; | ||||
WorkspaceBundle get_bundle(const NCBKernSizeParam& param) const; | WorkspaceBundle get_bundle(const NCBKernSizeParam& param) const; | ||||
static void copy_padding_kern(WorkspaceBundle bundle, | |||||
static void copy_padding_kern(const WorkspaceBundle& bundle, | |||||
const NCBKernParam& kern_param, | const NCBKernParam& kern_param, | ||||
const NCBKernIndex& ncb_index, | const NCBKernIndex& ncb_index, | ||||
const CpuNDRange& workspace_ids); | const CpuNDRange& workspace_ids); | ||||
static void do_conv_kern(WorkspaceBundle bundle, | |||||
static void do_conv_kern(const WorkspaceBundle& bundle, | |||||
const NCBKernParam& kern_param, | const NCBKernParam& kern_param, | ||||
const NCBKernIndex& ncb_index, | const NCBKernIndex& ncb_index, | ||||
const CpuNDRange& workspace_ids); | const CpuNDRange& workspace_ids); | ||||
@@ -57,11 +57,11 @@ class ConvBiasImpl::AlgoDirectStride2 final : public AlgoBase { | |||||
SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param) const; | SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param) const; | ||||
WorkspaceBundle get_bundle(const NCBKernSizeParam& param) const; | WorkspaceBundle get_bundle(const NCBKernSizeParam& param) const; | ||||
static void copy_padding_kern(WorkspaceBundle bundle, | |||||
static void copy_padding_kern(const WorkspaceBundle& bundle, | |||||
const NCBKernParam& kern_param, | const NCBKernParam& kern_param, | ||||
const NCBKernIndex& ncb_index, | const NCBKernIndex& ncb_index, | ||||
const CpuNDRange& workspace_ids); | |||||
static void do_conv_kern(WorkspaceBundle bundle, | |||||
const CpuNDRange& workspace_ids); | |||||
static void do_conv_kern(const WorkspaceBundle& bundle, | |||||
const NCBKernParam& kern_param, | const NCBKernParam& kern_param, | ||||
const NCBKernIndex& ncb_index, | const NCBKernIndex& ncb_index, | ||||
const CpuNDRange& workspace_ids); | const CpuNDRange& workspace_ids); | ||||
@@ -19,7 +19,7 @@ namespace x86 { | |||||
namespace avx2_chanwise_stride1 { | namespace avx2_chanwise_stride1 { | ||||
template <size_t filter, BiasMode bias_mode, bool is_quantized, typename Op> | template <size_t filter, BiasMode bias_mode, bool is_quantized, typename Op> | ||||
void conv_kimpl(WorkspaceBundle bundle, const NCBKernParam& kern_param, | |||||
void conv_kimpl(const WorkspaceBundle& bundle, const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index) { | const NCBKernIndex& ncb_index) { | ||||
size_t OH = kern_param.osz[0]; | size_t OH = kern_param.osz[0]; | ||||
size_t OW = kern_param.osz[1]; | size_t OW = kern_param.osz[1]; | ||||
@@ -38,9 +38,6 @@ void conv_kimpl(WorkspaceBundle bundle, const NCBKernParam& kern_param, | |||||
op = Op(scale_bias, scale_dst); | op = Op(scale_bias, scale_dst); | ||||
} | } | ||||
size_t padding_group_size = IH2 * IW2; | size_t padding_group_size = IH2 * IW2; | ||||
bundle.set(kern_param.workspace_ptr); | |||||
size_t workspace_group_id = ncb_index.thread_id; | size_t workspace_group_id = ncb_index.thread_id; | ||||
size_t group_id = ncb_index.ndrange_id[0], | size_t group_id = ncb_index.ndrange_id[0], | ||||
batch_id = ncb_index.ndrange_id[1]; | batch_id = ncb_index.ndrange_id[1]; | ||||
@@ -98,7 +95,7 @@ void conv_kimpl(WorkspaceBundle bundle, const NCBKernParam& kern_param, | |||||
} | } | ||||
}; | }; | ||||
SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& kern_param, | SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& kern_param, | ||||
WorkspaceBundle bundle) { | |||||
const WorkspaceBundle& bundle) { | |||||
MEGDNN_MARK_USED_VAR(kern_param); | MEGDNN_MARK_USED_VAR(kern_param); | ||||
auto fm = kern_param.filter_meta; | auto fm = kern_param.filter_meta; | ||||
size_t group = fm.group; | size_t group = fm.group; | ||||
@@ -182,8 +179,10 @@ SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& kern_param, | |||||
DISPATCH_CONV_KERN(); | DISPATCH_CONV_KERN(); | ||||
auto exec_one_group = [bundle, do_conv_fun](const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index) { | |||||
auto exec_one_group = [bundle = bundle, do_conv_fun]( | |||||
const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index) mutable { | |||||
bundle.set(kern_param.workspace_ptr); | |||||
copy_padding_kern(bundle, kern_param, ncb_index); | copy_padding_kern(bundle, kern_param, ncb_index); | ||||
do_conv_fun(bundle, kern_param, ncb_index); | do_conv_fun(bundle, kern_param, ncb_index); | ||||
}; | }; | ||||
@@ -17,11 +17,11 @@ | |||||
namespace megdnn { | namespace megdnn { | ||||
namespace x86 { | namespace x86 { | ||||
namespace avx2_chanwise_stride1 { | namespace avx2_chanwise_stride1 { | ||||
using conv_fun = std::function<void(WorkspaceBundle bundle, | |||||
using conv_fun = std::function<void(const WorkspaceBundle& bundle, | |||||
const NCBKernParam& kern_param, | const NCBKernParam& kern_param, | ||||
const NCBKernIndex& ncb_index)>; | const NCBKernIndex& ncb_index)>; | ||||
SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param, | SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param, | ||||
WorkspaceBundle bundle); | |||||
const WorkspaceBundle& bundle); | |||||
} // namespace avx2_chanwise_stride1 | } // namespace avx2_chanwise_stride1 | ||||
} // namespace x86 | } // namespace x86 | ||||
@@ -19,7 +19,7 @@ namespace x86 { | |||||
namespace avx2_chanwise_stride2 { | namespace avx2_chanwise_stride2 { | ||||
template <size_t filter, BiasMode bias_mode, bool is_quantized, typename Op> | template <size_t filter, BiasMode bias_mode, bool is_quantized, typename Op> | ||||
void conv_kimpl(WorkspaceBundle bundle, const NCBKernParam& kern_param, | |||||
void conv_kimpl(const WorkspaceBundle& bundle, const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index) { | const NCBKernIndex& ncb_index) { | ||||
size_t OH = kern_param.osz[0]; | size_t OH = kern_param.osz[0]; | ||||
size_t OW = kern_param.osz[1]; | size_t OW = kern_param.osz[1]; | ||||
@@ -38,9 +38,6 @@ void conv_kimpl(WorkspaceBundle bundle, const NCBKernParam& kern_param, | |||||
op = Op(scale_bias, scale_dst); | op = Op(scale_bias, scale_dst); | ||||
} | } | ||||
size_t padding_group_size = IH2 * IW2; | size_t padding_group_size = IH2 * IW2; | ||||
bundle.set(kern_param.workspace_ptr); | |||||
size_t workspace_group_id = ncb_index.thread_id; | size_t workspace_group_id = ncb_index.thread_id; | ||||
size_t group_id = ncb_index.ndrange_id[0], | size_t group_id = ncb_index.ndrange_id[0], | ||||
batch_id = ncb_index.ndrange_id[1]; | batch_id = ncb_index.ndrange_id[1]; | ||||
@@ -98,7 +95,7 @@ void conv_kimpl(WorkspaceBundle bundle, const NCBKernParam& kern_param, | |||||
} | } | ||||
}; | }; | ||||
SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& kern_param, | SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& kern_param, | ||||
WorkspaceBundle bundle) { | |||||
const WorkspaceBundle& bundle) { | |||||
MEGDNN_MARK_USED_VAR(kern_param); | MEGDNN_MARK_USED_VAR(kern_param); | ||||
auto fm = kern_param.filter_meta; | auto fm = kern_param.filter_meta; | ||||
size_t group = fm.group; | size_t group = fm.group; | ||||
@@ -187,8 +184,10 @@ SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& kern_param, | |||||
DISPATCH_CONV_KERN(); | DISPATCH_CONV_KERN(); | ||||
auto exec_one_group = [bundle, do_conv_fun](const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index) { | |||||
auto exec_one_group = [bundle = bundle, do_conv_fun]( | |||||
const NCBKernParam& kern_param, | |||||
const NCBKernIndex& ncb_index) mutable { | |||||
bundle.set(kern_param.workspace_ptr); | |||||
copy_padding_kern(bundle, kern_param, ncb_index); | copy_padding_kern(bundle, kern_param, ncb_index); | ||||
do_conv_fun(bundle, kern_param, ncb_index); | do_conv_fun(bundle, kern_param, ncb_index); | ||||
}; | }; | ||||
@@ -17,11 +17,11 @@ | |||||
namespace megdnn { | namespace megdnn { | ||||
namespace x86 { | namespace x86 { | ||||
namespace avx2_chanwise_stride2 { | namespace avx2_chanwise_stride2 { | ||||
using conv_fun = std::function<void(WorkspaceBundle bundle, | |||||
using conv_fun = std::function<void(const WorkspaceBundle& bundle, | |||||
const NCBKernParam& kern_param, | const NCBKernParam& kern_param, | ||||
const NCBKernIndex& ncb_index)>; | const NCBKernIndex& ncb_index)>; | ||||
SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param, | SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param, | ||||
WorkspaceBundle bundle); | |||||
const WorkspaceBundle& bundle); | |||||
} // namespace avx2_chanwise_stride2 | } // namespace avx2_chanwise_stride2 | ||||
} // namespace x86 | } // namespace x86 | ||||
@@ -19,7 +19,7 @@ namespace direct_conv_avx2_stride1 { | |||||
//! layout:(N,IC,IH,IW)-->(N,IC/2,H,W,2) | //! layout:(N,IC,IH,IW)-->(N,IC/2,H,W,2) | ||||
MEGDNN_ATTRIBUTE_TARGET("sse4.1") | MEGDNN_ATTRIBUTE_TARGET("sse4.1") | ||||
void pack_src_conv_avx2_stride1(WorkspaceBundle bundle, | |||||
void pack_src_conv_avx2_stride1(const WorkspaceBundle& bundle, | |||||
const ConvBiasImpl::NCBKernParam& kern_param, | const ConvBiasImpl::NCBKernParam& kern_param, | ||||
const ConvBiasImpl::NCBKernIndex& ncb_index) { | const ConvBiasImpl::NCBKernIndex& ncb_index) { | ||||
int32_t ih = kern_param.isz[0]; | int32_t ih = kern_param.isz[0]; | ||||
@@ -48,7 +48,6 @@ void pack_src_conv_avx2_stride1(WorkspaceBundle bundle, | |||||
const int8_t* src_ptr = kern_param.src<int8_t>(batch_id, group_id) + | const int8_t* src_ptr = kern_param.src<int8_t>(batch_id, group_id) + | ||||
ic_step * channel_id * c_stride; | ic_step * channel_id * c_stride; | ||||
bundle.set(kern_param.workspace_ptr); | |||||
int8_t* packed_src = static_cast<int8_t*>(bundle.get(0)) + | int8_t* packed_src = static_cast<int8_t*>(bundle.get(0)) + | ||||
batch_id * group * packed_group_size + | batch_id * group * packed_group_size + | ||||
group_id * packed_group_size + | group_id * packed_group_size + | ||||
@@ -103,7 +102,7 @@ void pack_src_conv_avx2_stride1(WorkspaceBundle bundle, | |||||
MEGDNN_ATTRIBUTE_TARGET("sse4.1") | MEGDNN_ATTRIBUTE_TARGET("sse4.1") | ||||
static inline void pack_filter_conv_avx2_stride1( | static inline void pack_filter_conv_avx2_stride1( | ||||
WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param, | |||||
const WorkspaceBundle& bundle, const ConvBiasImpl::NCBKernParam& kern_param, | |||||
const ConvBiasImpl::NCBKernIndex& ncb_index) { | const ConvBiasImpl::NCBKernIndex& ncb_index) { | ||||
MEGDNN_MARK_USED_VAR(ncb_index); | MEGDNN_MARK_USED_VAR(ncb_index); | ||||
int32_t oc = kern_param.filter_meta.ocpg; | int32_t oc = kern_param.filter_meta.ocpg; | ||||
@@ -129,7 +128,6 @@ static inline void pack_filter_conv_avx2_stride1( | |||||
oc_index_id = ncb_index.ndrange_id[1]; | oc_index_id = ncb_index.ndrange_id[1]; | ||||
const int8_t* pack_filter_ptr = kern_param.filter<int8_t>(group_id); | const int8_t* pack_filter_ptr = kern_param.filter<int8_t>(group_id); | ||||
bundle.set(kern_param.workspace_ptr); | |||||
int16_t* out_ptr = static_cast<int16_t*>(bundle.get(1)) + | int16_t* out_ptr = static_cast<int16_t*>(bundle.get(1)) + | ||||
group_id * round_up(oc, oc_step) * oc_out_stride; | group_id * round_up(oc, oc_step) * oc_out_stride; | ||||
@@ -602,7 +600,7 @@ inline void AlgoAVX2DirectConvStride1S8S8S32_forward( | |||||
#undef cb_switch | #undef cb_switch | ||||
#undef cb | #undef cb | ||||
} | } | ||||
void do_conv_kern(WorkspaceBundle bundle, | |||||
void do_conv_kern(const WorkspaceBundle& bundle, | |||||
const ConvBiasImpl::NCBKernParam& kern_param, | const ConvBiasImpl::NCBKernParam& kern_param, | ||||
const ConvBiasImpl::NCBKernIndex& ncb_index) { | const ConvBiasImpl::NCBKernIndex& ncb_index) { | ||||
auto&& fm = kern_param.filter_meta; | auto&& fm = kern_param.filter_meta; | ||||
@@ -635,8 +633,6 @@ void do_conv_kern(WorkspaceBundle bundle, | |||||
batch_id = ncb_index.ndrange_id[1], | batch_id = ncb_index.ndrange_id[1], | ||||
channel_id = ncb_index.ndrange_id[2]; | channel_id = ncb_index.ndrange_id[2]; | ||||
bundle.set(kern_param.workspace_ptr); | |||||
int8_t* src_ptr = static_cast<int8_t*>(bundle.get(0)) + | int8_t* src_ptr = static_cast<int8_t*>(bundle.get(0)) + | ||||
group_id * packed_group_size + | group_id * packed_group_size + | ||||
batch_id * group * packed_group_size; | batch_id * group * packed_group_size; | ||||
@@ -672,7 +668,7 @@ void do_conv_kern(WorkspaceBundle bundle, | |||||
oc_stride, kern_param); | oc_stride, kern_param); | ||||
} | } | ||||
void do_post_process(WorkspaceBundle bundle, | |||||
void do_post_process(const WorkspaceBundle& bundle, | |||||
const ConvBiasImpl::NCBKernParam& kern_param, | const ConvBiasImpl::NCBKernParam& kern_param, | ||||
const ConvBiasImpl::NCBKernIndex& ncb_index) { | const ConvBiasImpl::NCBKernIndex& ncb_index) { | ||||
auto&& fm = kern_param.filter_meta; | auto&& fm = kern_param.filter_meta; | ||||
@@ -683,7 +679,6 @@ void do_post_process(WorkspaceBundle bundle, | |||||
size_t group_id = ncb_index.ndrange_id[0], | size_t group_id = ncb_index.ndrange_id[0], | ||||
batch_id = ncb_index.ndrange_id[1]; | batch_id = ncb_index.ndrange_id[1]; | ||||
bundle.set(kern_param.workspace_ptr); | |||||
bool need_post_process = | bool need_post_process = | ||||
kern_param.dst_type.enumv() == DTypeEnum::QuantizedS8; | kern_param.dst_type.enumv() == DTypeEnum::QuantizedS8; | ||||
void* dst_tptr = nullptr; | void* dst_tptr = nullptr; | ||||
@@ -729,21 +724,22 @@ void do_post_process(WorkspaceBundle bundle, | |||||
} | } | ||||
SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& kern_param, | SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& kern_param, | ||||
WorkspaceBundle bundle) { | |||||
const WorkspaceBundle& bundle) { | |||||
SmallVector<NCBKern> ncb_kerns; | SmallVector<NCBKern> ncb_kerns; | ||||
auto fm = kern_param.filter_meta; | auto fm = kern_param.filter_meta; | ||||
size_t N = kern_param.n; | size_t N = kern_param.n; | ||||
size_t IC = kern_param.filter_meta.icpg; | size_t IC = kern_param.filter_meta.icpg; | ||||
size_t OC = kern_param.filter_meta.ocpg; | size_t OC = kern_param.filter_meta.ocpg; | ||||
size_t group = fm.group; | size_t group = fm.group; | ||||
#define cb(task) \ | |||||
auto task = [bundle, tmp_func]( \ | |||||
const ConvBiasImpl::NCBKernParam& kern_param, \ | |||||
const ConvBiasImpl::NCBKernIndex& ncb_index) { \ | |||||
tmp_func(bundle, kern_param, \ | |||||
{ncb_index.thread_id, \ | |||||
{ncb_index.ndrange_id[0], ncb_index.ndrange_id[1], \ | |||||
ncb_index.ndrange_id[2]}}); \ | |||||
#define cb(task) \ | |||||
auto task = [bundle = bundle, tmp_func]( \ | |||||
const ConvBiasImpl::NCBKernParam& kern_param, \ | |||||
const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { \ | |||||
bundle.set(kern_param.workspace_ptr); \ | |||||
tmp_func(bundle, kern_param, \ | |||||
{ncb_index.thread_id, \ | |||||
{ncb_index.ndrange_id[0], ncb_index.ndrange_id[1], \ | |||||
ncb_index.ndrange_id[2]}}); \ | |||||
}; | }; | ||||
auto tmp_func = pack_src_conv_avx2_stride1; | auto tmp_func = pack_src_conv_avx2_stride1; | ||||
cb(pack_src_task); | cb(pack_src_task); | ||||
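The cb() macro above uses the init-capture bundle = bundle together with mutable, so every generated task owns a private copy of the bundle and binds its own workspace pointer without affecting the enclosing one. A compact sketch of that capture form, again with a toy Bundle rather than the real class:

    #include <cstdio>

    struct Bundle {
        void* ptr = nullptr;
        void set(void* p) { ptr = p; }
    };

    int main() {
        Bundle bundle;
        int workspace = 0;

        // Init-capture `bundle = bundle`: the closure owns its own copy, named
        // like the enclosing variable; `mutable` lets the task rebind that copy.
        auto pack_src_task = [bundle = bundle](void* workspace_ptr) mutable {
            bundle.set(workspace_ptr);
            std::printf("task bound workspace %p\n", bundle.ptr);
        };

        pack_src_task(&workspace);
        std::printf("outer bundle untouched: %p\n", bundle.ptr);  // still null
        return 0;
    }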
@@ -20,7 +20,7 @@ using NCBKern = fallback::ConvBiasImpl::NCBKern; | |||||
using NCBKernSizeParam = fallback::ConvBiasImpl::NCBKernSizeParam; | using NCBKernSizeParam = fallback::ConvBiasImpl::NCBKernSizeParam; | ||||
SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param, | SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param, | ||||
WorkspaceBundle bundle); | |||||
const WorkspaceBundle& bundle); | |||||
} // namespace direct_conv_avx2_stride1 | } // namespace direct_conv_avx2_stride1 | ||||
} // namespace x86 | } // namespace x86 | ||||
@@ -19,7 +19,7 @@ namespace direct_conv_avx2_stride2 { | |||||
//! layout:(N,IC,IH,IW)-->(N,IC/2,H,2*W_envnW_odd) | //! layout:(N,IC,IH,IW)-->(N,IC/2,H,2*W_envnW_odd) | ||||
MEGDNN_ATTRIBUTE_TARGET("sse4.1") | MEGDNN_ATTRIBUTE_TARGET("sse4.1") | ||||
void pack_src_conv_avx2_stride2(WorkspaceBundle bundle, | |||||
void pack_src_conv_avx2_stride2(const WorkspaceBundle& bundle, | |||||
const ConvBiasImpl::NCBKernParam& kern_param, | const ConvBiasImpl::NCBKernParam& kern_param, | ||||
const ConvBiasImpl::NCBKernIndex& ncb_index) { | const ConvBiasImpl::NCBKernIndex& ncb_index) { | ||||
int32_t ih = kern_param.isz[0]; | int32_t ih = kern_param.isz[0]; | ||||
@@ -46,7 +46,6 @@ void pack_src_conv_avx2_stride2(WorkspaceBundle bundle, | |||||
const int8_t* src_ptr = kern_param.src<int8_t>(batch_id, group_id) + | const int8_t* src_ptr = kern_param.src<int8_t>(batch_id, group_id) + | ||||
ic_step * channel_id * c_stride; | ic_step * channel_id * c_stride; | ||||
bundle.set(kern_param.workspace_ptr); | |||||
int8_t* packed_src = static_cast<int8_t*>(bundle.get(0)) + | int8_t* packed_src = static_cast<int8_t*>(bundle.get(0)) + | ||||
batch_id * group * packed_group_size + | batch_id * group * packed_group_size + | ||||
group_id * packed_group_size + | group_id * packed_group_size + | ||||
@@ -161,7 +160,7 @@ void pack_src_conv_avx2_stride2(WorkspaceBundle bundle, | |||||
MEGDNN_ATTRIBUTE_TARGET("sse4.1") | MEGDNN_ATTRIBUTE_TARGET("sse4.1") | ||||
static inline void pack_filter_conv_avx2_stride2( | static inline void pack_filter_conv_avx2_stride2( | ||||
WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param, | |||||
const WorkspaceBundle& bundle, const ConvBiasImpl::NCBKernParam& kern_param, | |||||
const ConvBiasImpl::NCBKernIndex& ncb_index) { | const ConvBiasImpl::NCBKernIndex& ncb_index) { | ||||
MEGDNN_MARK_USED_VAR(ncb_index); | MEGDNN_MARK_USED_VAR(ncb_index); | ||||
int32_t oc = kern_param.filter_meta.ocpg; | int32_t oc = kern_param.filter_meta.ocpg; | ||||
@@ -187,7 +186,6 @@ static inline void pack_filter_conv_avx2_stride2( | |||||
oc_index_id = ncb_index.ndrange_id[1]; | oc_index_id = ncb_index.ndrange_id[1]; | ||||
const int8_t* pack_filter_ptr = kern_param.filter<int8_t>(group_id); | const int8_t* pack_filter_ptr = kern_param.filter<int8_t>(group_id); | ||||
bundle.set(kern_param.workspace_ptr); | |||||
int16_t* out_ptr = static_cast<int16_t*>(bundle.get(1)) + | int16_t* out_ptr = static_cast<int16_t*>(bundle.get(1)) + | ||||
group_id * round_up(oc, oc_step) * oc_out_stride; | group_id * round_up(oc, oc_step) * oc_out_stride; | ||||
@@ -675,7 +673,7 @@ inline void kernel_handle_oh_remain( | |||||
#undef cb_switch | #undef cb_switch | ||||
#undef cb | #undef cb | ||||
} | } | ||||
void kernel_imp(WorkspaceBundle bundle, | |||||
void kernel_imp(const WorkspaceBundle& bundle, | |||||
const ConvBiasImpl::NCBKernParam& kern_param, | const ConvBiasImpl::NCBKernParam& kern_param, | ||||
const ConvBiasImpl::NCBKernIndex& ncb_index) { | const ConvBiasImpl::NCBKernIndex& ncb_index) { | ||||
auto&& fm = kern_param.filter_meta; | auto&& fm = kern_param.filter_meta; | ||||
@@ -708,7 +706,6 @@ void kernel_imp(WorkspaceBundle bundle, | |||||
batch_id = ncb_index.ndrange_id[1], | batch_id = ncb_index.ndrange_id[1], | ||||
channel_id = ncb_index.ndrange_id[2]; | channel_id = ncb_index.ndrange_id[2]; | ||||
bundle.set(kern_param.workspace_ptr); | |||||
int8_t* src_ptr = static_cast<int8_t*>(bundle.get(0)) + | int8_t* src_ptr = static_cast<int8_t*>(bundle.get(0)) + | ||||
group_id * packed_group_size + | group_id * packed_group_size + | ||||
batch_id * group * packed_group_size; | batch_id * group * packed_group_size; | ||||
@@ -742,7 +739,7 @@ void kernel_imp(WorkspaceBundle bundle, | |||||
oc_stride, kern_param); | oc_stride, kern_param); | ||||
} | } | ||||
void do_post_process(WorkspaceBundle bundle, | |||||
void do_post_process(const WorkspaceBundle& bundle, | |||||
const ConvBiasImpl::NCBKernParam& kern_param, | const ConvBiasImpl::NCBKernParam& kern_param, | ||||
const ConvBiasImpl::NCBKernIndex& ncb_index) { | const ConvBiasImpl::NCBKernIndex& ncb_index) { | ||||
auto&& fm = kern_param.filter_meta; | auto&& fm = kern_param.filter_meta; | ||||
@@ -754,7 +751,6 @@ void do_post_process(WorkspaceBundle bundle, | |||||
size_t group_id = ncb_index.ndrange_id[0], | size_t group_id = ncb_index.ndrange_id[0], | ||||
batch_id = ncb_index.ndrange_id[1]; | batch_id = ncb_index.ndrange_id[1]; | ||||
bundle.set(kern_param.workspace_ptr); | |||||
bool need_post_process = | bool need_post_process = | ||||
kern_param.dst_type.enumv() == DTypeEnum::QuantizedS8; | kern_param.dst_type.enumv() == DTypeEnum::QuantizedS8; | ||||
void* dst_tptr = nullptr; | void* dst_tptr = nullptr; | ||||
@@ -801,21 +797,22 @@ void do_post_process(WorkspaceBundle bundle, | |||||
} | } | ||||
SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& kern_param, | SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& kern_param, | ||||
WorkspaceBundle bundle) { | |||||
const WorkspaceBundle& bundle) { | |||||
SmallVector<NCBKern> ncb_kerns; | SmallVector<NCBKern> ncb_kerns; | ||||
auto fm = kern_param.filter_meta; | auto fm = kern_param.filter_meta; | ||||
size_t N = kern_param.n; | size_t N = kern_param.n; | ||||
size_t IC = kern_param.filter_meta.icpg; | size_t IC = kern_param.filter_meta.icpg; | ||||
size_t OC = kern_param.filter_meta.ocpg; | size_t OC = kern_param.filter_meta.ocpg; | ||||
size_t group = fm.group; | size_t group = fm.group; | ||||
#define cb(task) \ | |||||
auto task = [bundle, tmp_func]( \ | |||||
const ConvBiasImpl::NCBKernParam& kern_param, \ | |||||
const ConvBiasImpl::NCBKernIndex& ncb_index) { \ | |||||
tmp_func(bundle, kern_param, \ | |||||
{ncb_index.thread_id, \ | |||||
{ncb_index.ndrange_id[0], ncb_index.ndrange_id[1], \ | |||||
ncb_index.ndrange_id[2]}}); \ | |||||
#define cb(task) \ | |||||
auto task = [bundle = bundle, tmp_func]( \ | |||||
const ConvBiasImpl::NCBKernParam& kern_param, \ | |||||
const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { \ | |||||
bundle.set(kern_param.workspace_ptr); \ | |||||
tmp_func(bundle, kern_param, \ | |||||
{ncb_index.thread_id, \ | |||||
{ncb_index.ndrange_id[0], ncb_index.ndrange_id[1], \ | |||||
ncb_index.ndrange_id[2]}}); \ | |||||
}; | }; | ||||
auto tmp_func = pack_src_conv_avx2_stride2; | auto tmp_func = pack_src_conv_avx2_stride2; | ||||
cb(pack_src_task); | cb(pack_src_task); | ||||
@@ -20,7 +20,7 @@ using NCBKern = fallback::ConvBiasImpl::NCBKern; | |||||
using NCBKernSizeParam = fallback::ConvBiasImpl::NCBKernSizeParam; | using NCBKernSizeParam = fallback::ConvBiasImpl::NCBKernSizeParam; | ||||
SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param, | SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param, | ||||
WorkspaceBundle bundle); | |||||
const WorkspaceBundle& bundle); | |||||
} // namespace direct_conv_avx2_stride2 | } // namespace direct_conv_avx2_stride2 | ||||
} // namespace x86 | } // namespace x86 | ||||
@@ -48,7 +48,7 @@ static inline void get_rectified_size(const NCBKernSizeParam& param, | |||||
} | } | ||||
static inline void copy_padding_kern( | static inline void copy_padding_kern( | ||||
WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param, | |||||
const WorkspaceBundle& bundle, const ConvBiasImpl::NCBKernParam& kern_param, | |||||
const ConvBiasImpl::NCBKernIndex& ncb_index) { | const ConvBiasImpl::NCBKernIndex& ncb_index) { | ||||
size_t IW = kern_param.isz[1]; | size_t IW = kern_param.isz[1]; | ||||
size_t IH = kern_param.isz[0]; | size_t IH = kern_param.isz[0]; | ||||
@@ -59,7 +59,6 @@ static inline void copy_padding_kern( | |||||
get_rectified_size(kern_param, IH2, IW2, OH2, OW2); | get_rectified_size(kern_param, IH2, IW2, OH2, OW2); | ||||
bool need_src_copy_var = need_src_copy(kern_param); | bool need_src_copy_var = need_src_copy(kern_param); | ||||
size_t padding_group_size = IH2 * IW2; | size_t padding_group_size = IH2 * IW2; | ||||
bundle.set(kern_param.workspace_ptr); | |||||
size_t group_id = ncb_index.ndrange_id[0], | size_t group_id = ncb_index.ndrange_id[0], | ||||
batch_id = ncb_index.ndrange_id[1], | batch_id = ncb_index.ndrange_id[1], | ||||