GitOrigin-RevId: 4106b46b6c
tags/v0.5.0
@@ -178,6 +178,23 @@ public: | |||
const NCBKernSizeParam& param) const override; | |||
}; | |||
// Direct fp32 convolution for the NCHW44 layout with stride 2.
// Dispatches one kernel per (batch, group, output-row block); the packed
// per-thread source buffer is sized by get_workspace().
class ConvBiasImpl::AlgoF32DirectStride2NCHW44 final : public AlgoBase {
    // Builds the per-(batch, group, oh-block) kernel list.
    SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param) const;
public:
    AlgoF32DirectStride2NCHW44() {}
    bool is_reproducible() const override { return true; }
    const char* name() const override { return "F32_CONV_NCHW44_DIRECT_S2"; }
    bool usable(fallback::ConvBiasImpl*, const NCBKernSizeParam& param,
                AlgoSelectionStrategy algo_selection_strategy) const override;
    size_t get_workspace(fallback::ConvBiasImpl*,
                         const NCBKernSizeParam& param) const override;
    virtual SmallVector<NCBKern> dispatch_kerns(
            fallback::ConvBiasImpl* opr,
            const NCBKernSizeParam& param) const override;
};
class ConvBiasImpl::AlgoF32DirectStride1 final : public AlgoBase { | |||
SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param) const; | |||
bool m_large_group; | |||
@@ -0,0 +1,281 @@ | |||
/** | |||
* \file dnn/src/arm_common/conv_bias/fp32/f32_direct_stride2_nchw44_algo.cpp | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. | |||
*/ | |||
#include "megdnn/oprs.h" | |||
#include "src/arm_common/conv_bias/fp32/algos.h" | |||
#include "src/arm_common/conv_bias/fp32/f32_direct_stride2_nchw44_kern.h" | |||
#include "src/arm_common/conv_bias/fp32/strategy.h" | |||
#include "src/arm_common/elemwise_op.h" | |||
#include "src/common/opr_delegate.h" | |||
#include "midout.h" | |||
using namespace megdnn; | |||
using namespace arm_common; | |||
using conv_fun = std::function<void( | |||
WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param, | |||
const ConvBiasImpl::NCBKernIndex& ncb_index, | |||
const CpuNDRange& workspace_ids, const CpuNDRange& ncb_range)>; | |||
MIDOUT_DECL(megdnn_arm_common_conv_bias_fp32_nchw44_stride2) | |||
namespace { | |||
// Choose the number of output rows (oh) processed per task so that one
// block's working set roughly fits the L2 cache, while keeping the total
// wasted (over-allocated) rows across blocks small.
//
// @param nthread       number of worker threads
// @param amount        total units (output rows) to split
// @param size_per_unit bytes touched per unit
// @return chosen block size in units
static inline int block_helper(const int nthread, const int amount,
                               const int size_per_unit) {
    constexpr int l2_cache_size = 256 * 1024;
    auto ceil_div = [](int a, int b) { return (a + b - 1) / b; };
    const int per_thread = ceil_div(amount, nthread);
    // units that fit in L2 (rounded to nearest), capped by the total amount
    const int fit_in_l2 = (l2_cache_size + size_per_unit / 2) / size_per_unit;
    const int best_block = std::min(amount, fit_in_l2);
    // candidate A: split per-thread work into ceil(per_thread / best_block)
    // blocks; candidate B: one block fewer (i.e. larger blocks)
    const int num_a = ceil_div(per_thread, best_block);
    const int num_b = std::max(num_a - 1, 1);
    const int blk_a = ceil_div(per_thread, num_a);
    const int blk_b = ceil_div(per_thread, num_b);
    // pick the candidate whose total coverage overshoots per_thread least
    const int loss_a = std::abs(num_a * blk_a - per_thread);
    const int loss_b = std::abs(num_b * blk_b - per_thread);
    return loss_a > loss_b ? blk_b : blk_a;
}
// Bytes of packed-source workspace needed by one thread for a block of
// `ic` channels and a rectified `ih2 x iw2` spatial slice.
//
// @return payload bytes plus a small guard region; the guard keeps
//         vectorized loads that run slightly past the packed tail from
//         touching unmapped memory.
static inline size_t get_perthread_cache_bytes(const int ic, const int ih2,
                                               const int iw2) {
    // border_size is used to avoid read illegal memory
    constexpr size_t border_size = 64 * 2;
    // Promote to size_t before multiplying: the original int product
    // ic * ih2 * iw2 can overflow for large feature maps.
    return static_cast<size_t>(ic) * ih2 * iw2 * sizeof(float) + border_size;
}
// Compute the rectified sizes used by the packed source buffer:
// ih2/iw2 are the per-oh-block input sizes (iw2 rounded up to whole cache
// lines, with padding folded in), oh2/ow2 the output sizes (currently
// identical to oh/ow).
static void get_rectified_size(
        const megdnn::fallback::ConvBiasImpl::NCBKernSizeParam& param, int& ih2,
        int& iw2, int& oh2, int& ow2) {
    int ic = param.filter_meta.icpg;
    int iw = param.isz[1];
    int oh = param.osz[0];
    int ow = param.osz[1];
    oh2 = oh;
    ow2 = ow;
    // 16 floats per 64-byte cache line
    constexpr int cacheline = 64 / sizeof(float);
    // output rows per block; must match the blocking used in do_conv_kern
    // and dispatch_kerns (same block_helper arguments)
    int block_oh =
            block_helper(param.nr_threads, oh, ic * iw * sizeof(float) * 2);
    auto&& fm = param.filter_meta;
    const int stride_h = static_cast<int>(fm.stride[0]);
    const int filter_h = static_cast<int>(fm.spatial[0]);
    // input rows required to produce block_oh output rows at this stride
    ih2 = block_oh * stride_h + filter_h - stride_h;
    iw2 = round_up(iw + 2 * static_cast<int>(fm.padding[1]), cacheline);
}
// Workspace layout: a single segment holding one packed-source buffer per
// worker thread (each thread indexes its slice by thread_id in
// do_conv_kern).
static WorkspaceBundle get_bundle(const ConvBiasImpl::NCBKernSizeParam& param) {
    auto&& fm = param.filter_meta;
    int ic = fm.icpg;
    int ih2, iw2, oh2, ow2;
    get_rectified_size(param, ih2, iw2, oh2, ow2);
    size_t src_size = get_perthread_cache_bytes(ic, ih2, iw2);
    return {nullptr, {src_size * param.nr_threads}};
};
// Kernel body executed for one (batch, group, oh-block) work item:
// packs the needed slice of the source into this thread's workspace slice
// (odd/even column split for stride 2, see pack_src_fp32_nchw44_stride2),
// then invokes the NEON direct-conv kernel selected by `filter`.
template <size_t filter, BiasMode bias_mode, typename Op>
static void do_conv_kern(WorkspaceBundle bundle,
                         const ConvBiasImpl::NCBKernParam& kern_param,
                         const ConvBiasImpl::NCBKernIndex& ncb_index,
                         const CpuNDRange&, const CpuNDRange&) {
    const int oh = kern_param.osz[0];
    const int ow = kern_param.osz[1];
    const int fh = kern_param.filter_meta.spatial[0];
    const int fw = kern_param.filter_meta.spatial[1];
    const int ic = kern_param.filter_meta.icpg;
    const int oc = kern_param.filter_meta.ocpg;
    const int ih = kern_param.isz[0];
    const int iw = kern_param.isz[1];
    const int stride_h = kern_param.filter_meta.stride[0];
    const int ph = kern_param.filter_meta.padding[0];
    const int pw = kern_param.filter_meta.padding[1];
    int ih2 = 0;
    int iw2 = 0;
    int oh2 = 0;
    int ow2 = 0;
    get_rectified_size(kern_param, ih2, iw2, oh2, ow2);
    bundle.set(kern_param.workspace_ptr);
    constexpr int pack_c = 4;
    const int batch_id = ncb_index.ndrange_id[0];
    const int group_id = ncb_index.ndrange_id[1];
    // the whole oc range is handled by one task
    constexpr int oc_idx = 0;
    int oc_block = oc;
    // must agree with the oh blocking chosen in dispatch_kerns
    int oh_block = block_helper(kern_param.nr_threads, oh2,
                                ic * iw * sizeof(float) * 2);
    const int oh_idx = ncb_index.ndrange_id[2];
    const int oh_block_real = std::min(oh - oh_idx * oh_block, oh_block);
    // input rows needed by this output block
    const int ih_real = oh_block_real * stride_h + fh - stride_h;
    // rows of this block falling into the top/bottom zero-padding
    const int src_top_pad = std::max(ph - oh_idx * oh_block * stride_h, 0);
    const int src_bottom_pad = std::max(
            (oh_idx * oh_block + oh_block_real - 1) * stride_h + fh - ih - ph,
            0);
    // right-side zero fill introduced by rounding iw2 up to cache lines
    const int remain_right_pad = std::max(iw2 - iw - pw, 0);
    // skip input rows consumed by earlier blocks (clamped at the top pad)
    const int src_offset =
            std::max(oh_idx * oh_block * stride_h - ph, 0) * iw * pack_c;
    const float* origin_sptr = static_cast<const float*>(kern_param.src<float>(
                                       batch_id, group_id, 0, 1, 1)) +
                               src_offset;
    const size_t src_size = get_perthread_cache_bytes(ic, ih2, iw2);
    // this thread's slice of the packed-source workspace
    float* sptr = reinterpret_cast<float*>((int8_t*)bundle.get(0) +
                                           ncb_index.thread_id * src_size);
    conv_bias::pack_src_fp32_nchw44_stride2(
            sptr, origin_sptr, ph, pw, remain_right_pad,
            ih_real - src_top_pad - src_bottom_pad, iw, iw2, src_top_pad,
            src_bottom_pad, ic, ih * iw);
    const float* fptr =
            kern_param.filter<dt_float32>(group_id) + oc_idx * fh * fw * ic;
    float_t* dst = kern_param.dst<float_t>(batch_id, group_id) +
                   oh_idx * oh_block * ow * pack_c;
    // per-pixel bias (BiasMode::BIAS) advances with the output block;
    // channel-wise / no bias is indexed by output channel only
    const int bias_offset = bias_mode == BiasMode::BIAS
                                    ? oh_idx * oh_block * ow * pack_c
                                    : oc_idx;
    const float* bptr =
            kern_param.bias<dt_float32>(batch_id, group_id) + bias_offset;
    Op op;
#define KERN1_NCHW44_CONV(filter)                                         \
    conv_bias::conv_direct_stride2_##filter##x##filter##_fp32_nchw44<     \
                                                                          \
            bias_mode, Op>(sptr, fptr, bptr, nullptr, dst, oc_block, ic,  \
                           ih_real, iw2, oh, oh_block_real, ow, op, ph, pw)
    DISPATCH_FILTER(filter, KERN1_NCHW44_CONV);
#undef KERN1_NCHW44_CONV
}
} // namespace | |||
/* ===================== stride2 algo ===================== */ | |||
// Report whether this algorithm can handle the given problem: fp32
// NCHW44 tensors, square 2/3/5/7 filters, stride 2, no dilation and no
// filter flipping, with output channels a positive multiple of 4.
bool ConvBiasImpl::AlgoF32DirectStride2NCHW44::usable(
        fallback::ConvBiasImpl*, const NCBKernSizeParam& param,
        AlgoSelectionStrategy) const {
    const auto& fm = param.filter_meta;
    const int fh = fm.spatial[0];
    const int oc = fm.ocpg;
    // every tensor must be fp32 and laid out as NCHW44
    const bool type_ok = param.src_type.enumv() == DTypeEnum::Float32 &&
                         param.filter_type.enumv() == DTypeEnum::Float32 &&
                         param.dst_type.enumv() == DTypeEnum::Float32 &&
                         fm.format == param::Convolution::Format::NCHW44;
    // output channels must be a positive multiple of the pack size (4)
    const bool channel_ok = oc % 4 == 0 && oc >= 4;
    // only square 2x2 / 3x3 / 5x5 / 7x7 kernels are implemented
    const bool filter_ok = fm.spatial_ndim == 2 && fh == fm.spatial[1] &&
                           (fh == 2 || fh == 3 || fh == 5 || fh == 7);
    // stride must be 2 in both dimensions, without dilation
    const bool stride_ok = fm.dilation[0] == 1 && fm.dilation[1] == 1 &&
                           fm.stride[0] == 2 && fm.stride[1] == 2;
    // cross-correlation only (no flipped filter)
    const bool conv_ok = !fm.should_flip;
    return type_ok && channel_ok && filter_ok && stride_ok && conv_ok;
}
// Total workspace bytes: one packed-source buffer per thread (see
// get_bundle).
size_t ConvBiasImpl::AlgoF32DirectStride2NCHW44::get_workspace(
        fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
    return get_bundle(param).total_size_in_bytes();
}
// Select the concrete do_conv_kern instantiation from the runtime filter
// size / bias mode / nonlinearity, then return a single NCBKern whose
// nd-range is (batch, group, number-of-oh-blocks).
SmallVector<ConvBiasImpl::NCBKern>
ConvBiasImpl::AlgoF32DirectStride2NCHW44::dispatch_kerns(
        fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
    auto fm = param.filter_meta;
    const int batch = param.n;
    const int group = fm.group;
    WorkspaceBundle wbundle = get_bundle(param);
    conv_fun do_conv_fun = nullptr;
// NOTE: remain_w is not used to gen hash of midout for compatible with
// shape runtime
#define DO_CONV_KERN_FUN(filter, bias_mode, op)                          \
    MIDOUT_BEGIN(megdnn_arm_common_conv_bias_fp32_nchw44_stride2,        \
                 midout_iv(#filter #bias_mode #op##_hash)) {             \
        do_conv_fun = do_conv_kern<filter, bias_mode, op>;               \
    }                                                                    \
    MIDOUT_END();
// innermost dispatch: elementwise op applied after the convolution
#define GET_OP_PARAM(filter, bias_mode)                                  \
    switch (param.nonlineMode) {                                         \
        case param::ConvBias::NonlineMode::IDENTITY:                     \
            DO_CONV_KERN_FUN(filter, bias_mode, NoneOp<dt_float32>)      \
            break;                                                       \
        case param::ConvBias::NonlineMode::RELU:                         \
            DO_CONV_KERN_FUN(filter, bias_mode, ReluOp<dt_float32>)      \
            break;                                                       \
        case param::ConvBias::NonlineMode::H_SWISH:                      \
            DO_CONV_KERN_FUN(filter, bias_mode, HSwishOp<dt_float32>)    \
            break;                                                       \
        default:                                                         \
            megdnn_assert(0);                                            \
            break;                                                       \
    }
#define GET_BIAS_MODE_PARAM(filter)                                      \
    switch (param.bias_mode) {                                           \
        case BiasMode::NO_BIAS:                                          \
            GET_OP_PARAM(filter, BiasMode::NO_BIAS)                      \
            break;                                                       \
        case BiasMode::BROADCAST_CHANNEL_BIAS:                           \
            GET_OP_PARAM(filter, BiasMode::BROADCAST_CHANNEL_BIAS)       \
            break;                                                       \
        case BiasMode::BIAS:                                             \
            GET_OP_PARAM(filter, BiasMode::BIAS)                         \
            break;                                                       \
        default:                                                         \
            megdnn_assert(0);                                            \
            break;                                                       \
    }
// outermost dispatch: filter sizes accepted by usable()
#define DISPATCH_CONV_KERN()                                             \
    switch (param.filter_meta.spatial[0]) {                              \
        case 2:                                                          \
            GET_BIAS_MODE_PARAM(2)                                       \
            break;                                                       \
        case 3:                                                          \
            GET_BIAS_MODE_PARAM(3)                                       \
            break;                                                       \
        case 5:                                                          \
            GET_BIAS_MODE_PARAM(5)                                       \
            break;                                                       \
        case 7:                                                          \
            GET_BIAS_MODE_PARAM(7)                                       \
            break;                                                       \
        default:                                                         \
            megdnn_assert(0);                                            \
            break;                                                       \
    }
    DISPATCH_CONV_KERN();
#undef DO_CONV_KERN_FUN
#undef GET_REMAIN_W_PARAM
#undef GET_OP_PARAM
#undef GET_BIAS_MODE_PARAM
#undef DISPATCH_CONV_KERN
    megdnn_assert(do_conv_fun);
    SmallVector<ConvBiasImpl::NCBKern> ret_kerns;
    WorkspaceBundle bundle = wbundle;
    int oh = param.osz[0];
    int ic = param.filter_meta.icpg;
    int iw = param.isz[1];
    // same blocking as get_rectified_size / do_conv_kern
    int oh_block =
            block_helper(param.nr_threads, oh, ic * iw * sizeof(float) * 2);
    CpuNDRange ncb_range = {static_cast<size_t>(batch),
                            static_cast<size_t>(group),
                            static_cast<size_t>(div_ceil(oh, oh_block))};
    // capture by value: the lambda outlives this scope
    auto do_conv = [bundle, do_conv_fun, ncb_range](
                           const NCBKernParam& kern_param,
                           const NCBKernIndex& ncb_index) {
        do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id,
                    ncb_range);
    };
    ret_kerns.push_back({do_conv, ncb_range});
    return ret_kerns;
}
// vim: syntax=cpp.doxygen |
@@ -0,0 +1,748 @@ | |||
/** | |||
* \file | |||
* dnn/src/arm_common/conv_bias/fp32/f32_direct_stride2_nchw44_kern.cpp | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. | |||
*/ | |||
#include "src/arm_common/conv_bias/fp32/f32_direct_stride2_nchw44_kern.h" | |||
#include "src/arm_common/conv_bias/intrinsic_helper.h" | |||
#include "src/arm_common/elemwise_op.h" | |||
#include "src/arm_common/simd_macro/marm_neon.h" | |||
#include "src/common/unroll_macro.h" | |||
#include "src/common/utils.h" | |||
#include "src/fallback/conv_bias/common.h" | |||
using namespace megdnn; | |||
using namespace arm_common; | |||
namespace { | |||
// FMA accumulation helper: folds weight lanes into the accumulator
// registers c using Func (e.g. Vfmaq_laneq_f32). The specializations below
// cover one or two output-channel register groups (c_dim) combined with
// 4- or 8-wide output blocks (ow_block); the primary template is
// intentionally undefined so unsupported combinations fail at link time.
template <int src_idx, int weight_idx, int c_dim, typename Func, int ow_block,
          typename T, typename T2, typename T3, typename T4>
struct ShiftCalHelper {
    static void impl(T& c, T2& src, T3& weight);
};
// Two oc register groups, ow_block == 8: for each of the 8 output
// positions, accumulate all 4 weight lanes; src indexing wraps modulo 8
// shifted by src_idx.
template <int src_idx, int weight_idx, typename Func, typename T, typename T2,
          typename T3, typename T4>
struct ShiftCalHelper<src_idx, weight_idx, 2, Func, 8, T, T2, T3, T4> {
    static void impl(T& c, T2& src, T3& weight) {
#define cb(step, lane)                                                   \
    c[0][step] = Func::template impl<lane>(c[0][step], weight[0][lane],  \
                                           src[(step + src_idx) % 8]);   \
    c[1][step] = Func::template impl<lane>(c[1][step], weight[1][lane],  \
                                           src[(step + src_idx) % 8]);
        UNROLL_CALL_RAW(8, cb, 0);
        UNROLL_CALL_RAW(8, cb, 1);
        UNROLL_CALL_RAW(8, cb, 2);
        UNROLL_CALL_RAW(8, cb, 3);
#undef cb
    }
};
// Two oc register groups, ow_block == 4: same scheme as the 8-wide
// variant but with src indices wrapping modulo 4.
template <int src_idx, int weight_idx, typename Func, typename T, typename T2,
          typename T3, typename T4>
struct ShiftCalHelper<src_idx, weight_idx, 2, Func, 4, T, T2, T3, T4> {
    static void impl(T& c, T2& src, T3& weight) {
#define cb(step, lane)                                                   \
    c[0][step] = Func::template impl<lane>(c[0][step], weight[0][lane],  \
                                           src[(step + src_idx) % 4]);   \
    c[1][step] = Func::template impl<lane>(c[1][step], weight[1][lane],  \
                                           src[(step + src_idx) % 4]);
        UNROLL_CALL_RAW(4, cb, 0);
        UNROLL_CALL_RAW(4, cb, 1);
        UNROLL_CALL_RAW(4, cb, 2);
        UNROLL_CALL_RAW(4, cb, 3);
#undef cb
    }
};
// One oc register group, ow_block == 8.
template <int src_idx, int weight_idx, typename Func, typename T, typename T2,
          typename T3, typename T4>
struct ShiftCalHelper<src_idx, weight_idx, 1, Func, 8, T, T2, T3, T4> {
    static void impl(T& c, T2& src, T3& weight) {
#define cb(step, lane)                                                   \
    c[0][step] = Func::template impl<lane>(c[0][step], weight[0][lane],  \
                                           src[(step + src_idx) % 8]);
        UNROLL_CALL_RAW(8, cb, 0);
        UNROLL_CALL_RAW(8, cb, 1);
        UNROLL_CALL_RAW(8, cb, 2);
        UNROLL_CALL_RAW(8, cb, 3);
#undef cb
    }
};
// One oc register group, ow_block == 4.
template <int src_idx, int weight_idx, typename Func, typename T, typename T2,
          typename T3, typename T4>
struct ShiftCalHelper<src_idx, weight_idx, 1, Func, 4, T, T2, T3, T4> {
    static void impl(T& c, T2& src, T3& weight) {
#define cb(step, lane)                                                   \
    c[0][step] = Func::template impl<lane>(c[0][step], weight[0][lane],  \
                                           src[(step + src_idx) % 4]);
        UNROLL_CALL_RAW(4, cb, 0);
        UNROLL_CALL_RAW(4, cb, 1);
        UNROLL_CALL_RAW(4, cb, 2);
        UNROLL_CALL_RAW(4, cb, 3);
#undef cb
    }
};
// Thin dispatcher to the matching ShiftCalHelper specialization (the
// trailing `int` fills the unused T4 parameter).
template <int src_idx, int weight_idx, int c_dim, typename FUNC, int ow_block,
          typename T, typename T2, typename T3>
inline void cal_helper(T& c, T2& src, T3& weight) {
    ShiftCalHelper<src_idx, weight_idx, c_dim, FUNC, ow_block, T, T2, T3,
                   int>::impl(c, src, weight);
};
// Map an output-channel block size to the number of 4-channel register
// groups (c_dim); -1 marks unsupported sizes. The 8-channel variant is
// only available on AArch64, where enough NEON registers exist.
template <int oc>
struct OCHelper {
public:
    static const int val = -1;
};
template <>
struct OCHelper<4> {
public:
    static const int val = 1;
};
#if MEGDNN_AARCH64
template <>
struct OCHelper<8> {
public:
    static const int val = 2;
};
#endif
/**
 * oc8_ow8(m = 8, n = 8) and oc4_ow8(m = 4, n = 8) gemm like kernel
 *
 * src_ptr holds the even input columns and src_ptr_odd the odd ones, as
 * produced by pack_src_fp32_nchw44_stride2; specializations exist for
 * filter sizes 2, 3, 5 and 7 (the primary template is undefined).
 * */
template <BiasMode bias_mode, typename Op, int remain_w, int filter_size,
          int oc_block, int ow_block>
struct KerNeonXXs2Nchw44FP32 {
    static void impl(const float32_t* src_ptr, const float32_t* weight_ptr,
                     const float32_t* bias_ptr, float32_t* dst_ptr, int ic,
                     int ih, int iw, int ld_dst_oc, const Op& op,
                     const float32_t* src_ptr_odd);
};
// 2x2 stride-2 kernel: both filter rows are unrolled. Even input columns
// pair with filter column 0 (weight offset 0), odd columns with filter
// column 1 (weight offset 1 * ld_weight).
template <BiasMode bias_mode, typename Op, int remain_w, int oc_block,
          int ow_block>
struct KerNeonXXs2Nchw44FP32<bias_mode, Op, remain_w, 2, oc_block, ow_block> {
    static void impl(const float32_t* src_ptr_origin,
                     const float32_t* weight_ptr, const float32_t* bias_ptr,
                     float32_t* dst_ptr, int ic, int ih, int iw, int ld_dst_oc,
                     const Op& op, const float32_t* src_ptr_odd_origin) {
        constexpr int loop_ic_step = 4;
        constexpr int filter_size = 2;
        constexpr int oc_step = 4;
        constexpr int simd_len = 4;
        // stride between consecutive filter columns (4x4 weight tile)
        constexpr int ld_weight = oc_step * oc_step;
        // per-pixel bias strides with the dst row, otherwise per channel
        const int ld_bias = bias_mode == BiasMode::BIAS ? ld_dst_oc : oc_step;
        const int ld_weight_oc = oc_step * filter_size * filter_size * ic;
        const int ld_weight_fh = oc_step * oc_step * filter_size;
        const int ld_src_ic = ih * iw;
        const int ld_src_iw = iw * oc_step;
        constexpr int c_dim = OCHelper<oc_block>::val;
        float32x4_t c[c_dim][ow_block];
        init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias);
        for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
            const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic;
            const float* src_ptr_odd = src_ptr_odd_origin + ic_idx * ld_src_ic;
            float32x4_t src[ow_block];
            float32x4_t weight[c_dim][4];
            /////////row 0/////////////
            load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr, 0);
            load_helper<4, 0, oc_step, c_dim, Vld1q_f32>(weight, weight_ptr,
                                                         ld_weight_oc);
            cal_helper<0, 0, c_dim, Vfmaq_laneq_f32, ow_block>(c, src, weight);
            load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr_odd,
                                                             0);
            load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                    weight, weight_ptr, ld_weight_oc);
            cal_helper<0, 0, c_dim, Vfmaq_laneq_f32, ow_block>(c, src, weight);
            src_ptr += ld_src_iw;
            src_ptr_odd += ld_src_iw;
            weight_ptr += ld_weight_fh;
            /////////row 1/////////////
            load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr, 0);
            load_helper<4, 0, oc_step, c_dim, Vld1q_f32>(weight, weight_ptr,
                                                         ld_weight_oc);
            cal_helper<0, 0, c_dim, Vfmaq_laneq_f32, ow_block>(c, src, weight);
            load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr_odd,
                                                             0);
            load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                    weight, weight_ptr, ld_weight_oc);
            cal_helper<0, 0, c_dim, Vfmaq_laneq_f32, ow_block>(c, src, weight);
            src_ptr += ld_src_iw;
            src_ptr_odd += ld_src_iw;
            weight_ptr += ld_weight_fh;
        }
        store_ocx_ow8_remain_static<c_dim, remain_w, Op>(c, op, dst_ptr,
                                                         ld_dst_oc);
    }
};
// 3x3 stride-2 kernel: all three filter rows are unrolled. Filter columns
// 0 and 2 pair with the even input stream (column 2 needs one extra src
// vector past the block, loaded into src[0] with shift 1); filter column 1
// pairs with the odd stream.
template <BiasMode bias_mode, typename Op, int remain_w, int oc_block,
          int ow_block>
struct KerNeonXXs2Nchw44FP32<bias_mode, Op, remain_w, 3, oc_block, ow_block> {
    static void impl(const float32_t* src_ptr_origin,
                     const float32_t* weight_ptr, const float32_t* bias_ptr,
                     float32_t* dst_ptr, int ic, int ih, int iw, int ld_dst_oc,
                     const Op& op, const float32_t* src_ptr_odd_origin) {
        constexpr int loop_ic_step = 4;
        constexpr int filter_size = 3;
        constexpr int oc_step = 4;
        constexpr int simd_len = 4;
        constexpr int ld_weight = oc_step * oc_step;
        const int ld_bias = bias_mode == BiasMode::BIAS ? ld_dst_oc : oc_step;
        const int ld_weight_oc = oc_step * filter_size * filter_size * ic;
        const int ld_weight_fh = oc_step * oc_step * filter_size;
        const int ld_src_ic = ih * iw;
        const int ld_src_iw = iw * oc_step;
        constexpr int c_dim = OCHelper<oc_block>::val;
        float32x4_t c[c_dim][ow_block];
        init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias);
        for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
            const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic;
            const float* src_ptr_odd = src_ptr_odd_origin + ic_idx * ld_src_ic;
            float32x4_t src[ow_block];
            float32x4_t weight[c_dim][4];
            /////////row 0/////////////
            load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr, 0);
            load_helper<4, 0, oc_step, c_dim, Vld1q_f32>(weight, weight_ptr,
                                                         ld_weight_oc);
            cal_helper<0, 0, c_dim, Vfmaq_laneq_f32, ow_block>(c, src, weight);
            src[0] = vld1q_f32(src_ptr + ow_block * simd_len);
            load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                    weight, weight_ptr, ld_weight_oc);
            cal_helper<1, 0, c_dim, Vfmaq_laneq_f32, ow_block>(c, src, weight);
            load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr_odd,
                                                             0);
            load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                    weight, weight_ptr, ld_weight_oc);
            cal_helper<0, 0, c_dim, Vfmaq_laneq_f32, ow_block>(c, src, weight);
            src_ptr += ld_src_iw;
            src_ptr_odd += ld_src_iw;
            weight_ptr += ld_weight_fh;
            /////////row 1/////////////
            load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr, 0);
            load_helper<4, 0, oc_step, c_dim, Vld1q_f32>(weight, weight_ptr,
                                                         ld_weight_oc);
            cal_helper<0, 0, c_dim, Vfmaq_laneq_f32, ow_block>(c, src, weight);
            src[0] = vld1q_f32(src_ptr + ow_block * simd_len);
            load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                    weight, weight_ptr, ld_weight_oc);
            cal_helper<1, 0, c_dim, Vfmaq_laneq_f32, ow_block>(c, src, weight);
            load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr_odd,
                                                             0);
            load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                    weight, weight_ptr, ld_weight_oc);
            cal_helper<0, 0, c_dim, Vfmaq_laneq_f32, ow_block>(c, src, weight);
            src_ptr += ld_src_iw;
            src_ptr_odd += ld_src_iw;
            weight_ptr += ld_weight_fh;
            //////////row 2/////////////
            load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr, 0);
            load_helper<4, 0, oc_step, c_dim, Vld1q_f32>(weight, weight_ptr,
                                                         ld_weight_oc);
            cal_helper<0, 0, c_dim, Vfmaq_laneq_f32, ow_block>(c, src, weight);
            src[0] = vld1q_f32(src_ptr + ow_block * simd_len);
            load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                    weight, weight_ptr, ld_weight_oc);
            cal_helper<1, 0, c_dim, Vfmaq_laneq_f32, ow_block>(c, src, weight);
            load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr_odd,
                                                             0);
            load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                    weight, weight_ptr, ld_weight_oc);
            cal_helper<0, 0, c_dim, Vfmaq_laneq_f32, ow_block>(c, src, weight);
            src_ptr += ld_src_iw;
            src_ptr_odd += ld_src_iw;
            weight_ptr += ld_weight_fh;
        }
        store_ocx_ow8_remain_static<c_dim, remain_w, Op>(c, op, dst_ptr,
                                                         ld_dst_oc);
    }
};
// 5x5 stride-2 kernel: filter rows are handled by a loop. Even filter
// columns (0, 2, 4) pair with the even input stream — columns 2 and 4
// need one / two extra src vectors past the block; odd filter columns
// (1, 3) pair with the odd stream.
template <BiasMode bias_mode, typename Op, int remain_w, int oc_block,
          int ow_block>
struct KerNeonXXs2Nchw44FP32<bias_mode, Op, remain_w, 5, oc_block, ow_block> {
    static void impl(const float32_t* src_ptr_origin,
                     const float32_t* weight_ptr, const float32_t* bias_ptr,
                     float32_t* dst_ptr, int ic, int ih, int iw, int ld_dst_oc,
                     const Op& op, const float32_t* src_ptr_odd_origin) {
        constexpr int loop_ic_step = 4;
        constexpr int filter_size = 5;
        constexpr int oc_step = 4;
        constexpr int simd_len = 4;
        constexpr int ld_weight = oc_step * oc_step;
        const int ld_bias = bias_mode == BiasMode::BIAS ? ld_dst_oc : oc_step;
        const int ld_weight_oc = oc_step * filter_size * filter_size * ic;
        const int ld_weight_fh = oc_step * oc_step * filter_size;
        const int ld_src_ic = ih * iw;
        const int ld_src_iw = iw * oc_step;
        constexpr int c_dim = OCHelper<oc_block>::val;
        float32x4_t c[c_dim][ow_block];
        init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias);
        for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
            const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic;
            const float* src_ptr_odd = src_ptr_odd_origin + ic_idx * ld_src_ic;
            for (int fh_idx = 0; fh_idx < filter_size; ++fh_idx) {
                float32x4_t src[ow_block];
                float32x4_t weight[c_dim][4];
                // even element
                load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr,
                                                                 0);
                load_helper<4, 0, oc_step, c_dim, Vld1q_f32>(weight, weight_ptr,
                                                             ld_weight_oc);
                cal_helper<0, 0, c_dim, Vfmaq_laneq_f32, ow_block>(c, src,
                                                                   weight);
                src[0] = vld1q_f32(src_ptr + ow_block * simd_len);
                load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<1, 0, c_dim, Vfmaq_laneq_f32, ow_block>(c, src,
                                                                   weight);
                src[1] = vld1q_f32(src_ptr + (ow_block + 1) * simd_len);
                load_helper<4, 4 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<2, 0, c_dim, Vfmaq_laneq_f32, ow_block>(c, src,
                                                                   weight);
                // odd element
                load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(
                        src, src_ptr_odd, 0);
                load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<0, 0, c_dim, Vfmaq_laneq_f32, ow_block>(c, src,
                                                                   weight);
                src[0] = vld1q_f32(src_ptr_odd + ow_block * simd_len);
                load_helper<4, 3 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<1, 0, c_dim, Vfmaq_laneq_f32, ow_block>(c, src,
                                                                   weight);
                src_ptr += ld_src_iw;
                src_ptr_odd += ld_src_iw;
                weight_ptr += ld_weight_fh;
            }
        }
        store_ocx_ow8_remain_static<c_dim, remain_w, Op>(c, op, dst_ptr,
                                                         ld_dst_oc);
    }
};
/**
 * for kernel[7], calculate sequence is kernel[0], kernel[2], kernel[4],
 * kernel[6], kernel[1], kernel[3], kernel[5]
 * src is packed like 0, 2, 4, 6, 8, 10, 1, 3, 5, 7, 9
 *
 * Even filter columns (0/2/4/6) consume the even input stream, odd
 * columns (1/3/5) the odd stream; columns past the block window reuse
 * src[0..2] with increasing shift.
 **/
template <BiasMode bias_mode, typename Op, int remain_w, int oc_block,
          int ow_block>
struct KerNeonXXs2Nchw44FP32<bias_mode, Op, remain_w, 7, oc_block, ow_block> {
    static void impl(const float32_t* src_ptr_origin,
                     const float32_t* weight_ptr, const float32_t* bias_ptr,
                     float32_t* dst_ptr, int ic, int ih, int iw, int ld_dst_oc,
                     const Op& op, const float32_t* src_ptr_odd_origin) {
        constexpr int loop_ic_step = 4;
        constexpr int filter_size = 7;
        constexpr int oc_step = 4;
        constexpr int simd_len = 4;
        constexpr int ld_weight = oc_step * oc_step;
        const int ld_bias = bias_mode == BiasMode::BIAS ? ld_dst_oc : oc_step;
        const int ld_weight_oc = oc_step * filter_size * filter_size * ic;
        const int ld_weight_fh = oc_step * oc_step * filter_size;
        const int ld_src_ic = ih * iw;
        const int ld_src_iw = iw * oc_step;
        constexpr int c_dim = OCHelper<oc_block>::val;
        float32x4_t c[c_dim][ow_block];
        init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias);
        for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
            const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic;
            const float* src_ptr_odd = src_ptr_odd_origin + ic_idx * ld_src_ic;
            for (int fh_idx = 0; fh_idx < filter_size; ++fh_idx) {
                float32x4_t src[ow_block];
                float32x4_t weight[c_dim][4];
                // even element
                load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr,
                                                                 0);
                load_helper<4, 0, oc_step, c_dim, Vld1q_f32>(weight, weight_ptr,
                                                             ld_weight_oc);
                cal_helper<0, 0, c_dim, Vfmaq_laneq_f32, ow_block>(c, src,
                                                                   weight);
                src[0] = vld1q_f32(src_ptr + ow_block * simd_len);
                load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<1, 0, c_dim, Vfmaq_laneq_f32, ow_block>(c, src,
                                                                   weight);
                src[1] = vld1q_f32(src_ptr + (ow_block + 1) * simd_len);
                load_helper<4, 4 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<2, 0, c_dim, Vfmaq_laneq_f32, ow_block>(c, src,
                                                                   weight);
                src[2] = vld1q_f32(src_ptr + (ow_block + 2) * simd_len);
                load_helper<4, 6 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<3, 0, c_dim, Vfmaq_laneq_f32, ow_block>(c, src,
                                                                   weight);
                // odd element
                load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(
                        src, src_ptr_odd, 0);
                load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<0, 0, c_dim, Vfmaq_laneq_f32, ow_block>(c, src,
                                                                   weight);
                src[0] = vld1q_f32(src_ptr_odd + ow_block * simd_len);
                load_helper<4, 3 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<1, 0, c_dim, Vfmaq_laneq_f32, ow_block>(c, src,
                                                                   weight);
                src[1] = vld1q_f32(src_ptr_odd + (ow_block + 1) * simd_len);
                load_helper<4, 5 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<2, 0, c_dim, Vfmaq_laneq_f32, ow_block>(c, src,
                                                                   weight);
                src_ptr += ld_src_iw;
                src_ptr_odd += ld_src_iw;
                weight_ptr += ld_weight_fh;
            }
        }
        store_ocx_ow8_remain_static<c_dim, remain_w, Op>(c, op, dst_ptr,
                                                         ld_dst_oc);
    }
};
} // namespace | |||
namespace { | |||
// Scatter 8 consecutive NCHW44 pixels (starting at an even column) into
// the split layout: even-indexed pixels go to the front of the row,
// odd-indexed ones to the region starting at odd_start.
// @param odd_start pixel index where the odd-column region begins
// @param src_idx   pixel offset of the first of the 8 source pixels
// @param iw_idx    destination column index of the first pixel (even here)
inline void odd_even_split_iw8_even(float* sptr_base, const float* sptr,
                                    const int odd_start, const int src_idx,
                                    const int iw_idx) {
    constexpr int ic_step = 4;
    const int src_offset = src_idx * ic_step;
    const int even_offset = iw_idx / 2 * ic_step;
    const int odd_offset = (odd_start + iw_idx / 2) * ic_step;
    float32x4_t temp[8];
    temp[0] = vld1q_f32(sptr + src_offset + 0 * ic_step);
    temp[1] = vld1q_f32(sptr + src_offset + 1 * ic_step);
    temp[2] = vld1q_f32(sptr + src_offset + 2 * ic_step);
    temp[3] = vld1q_f32(sptr + src_offset + 3 * ic_step);
    temp[4] = vld1q_f32(sptr + src_offset + 4 * ic_step);
    temp[5] = vld1q_f32(sptr + src_offset + 5 * ic_step);
    temp[6] = vld1q_f32(sptr + src_offset + 6 * ic_step);
    temp[7] = vld1q_f32(sptr + src_offset + 7 * ic_step);
    // pixels 0,2,4,6 -> even region; 1,3,5,7 -> odd region
    vst1q_f32(sptr_base + even_offset + 0 * ic_step, temp[0]);
    vst1q_f32(sptr_base + even_offset + 1 * ic_step, temp[2]);
    vst1q_f32(sptr_base + even_offset + 2 * ic_step, temp[4]);
    vst1q_f32(sptr_base + even_offset + 3 * ic_step, temp[6]);
    vst1q_f32(sptr_base + odd_offset + 0 * ic_step, temp[1]);
    vst1q_f32(sptr_base + odd_offset + 1 * ic_step, temp[3]);
    vst1q_f32(sptr_base + odd_offset + 2 * ic_step, temp[5]);
    vst1q_f32(sptr_base + odd_offset + 3 * ic_step, temp[7]);
}
// Same as odd_even_split_iw8_even but for a run starting at an odd
// column: the first loaded pixel belongs to the odd region, and the even
// region starts at the next even column ((iw_idx + 1) / 2).
void odd_even_split_iw8_odd(float* sptr_base, const float* sptr,
                            const int odd_start, const int src_idx,
                            const int iw_idx) {
    constexpr int ic_step = 4;
    const int src_offset = src_idx * ic_step;
    const int even_offset = (iw_idx + 1) / 2 * ic_step;
    const int odd_offset = (odd_start + iw_idx / 2) * ic_step;
    float32x4_t temp[8];
    temp[0] = vld1q_f32(sptr + src_offset + 0 * ic_step);
    temp[1] = vld1q_f32(sptr + src_offset + 1 * ic_step);
    temp[2] = vld1q_f32(sptr + src_offset + 2 * ic_step);
    temp[3] = vld1q_f32(sptr + src_offset + 3 * ic_step);
    temp[4] = vld1q_f32(sptr + src_offset + 4 * ic_step);
    temp[5] = vld1q_f32(sptr + src_offset + 5 * ic_step);
    temp[6] = vld1q_f32(sptr + src_offset + 6 * ic_step);
    temp[7] = vld1q_f32(sptr + src_offset + 7 * ic_step);
    // pixels 0,2,4,6 -> odd region; 1,3,5,7 -> even region
    vst1q_f32(sptr_base + odd_offset + 0 * ic_step, temp[0]);
    vst1q_f32(sptr_base + odd_offset + 1 * ic_step, temp[2]);
    vst1q_f32(sptr_base + odd_offset + 2 * ic_step, temp[4]);
    vst1q_f32(sptr_base + odd_offset + 3 * ic_step, temp[6]);
    vst1q_f32(sptr_base + even_offset + 0 * ic_step, temp[1]);
    vst1q_f32(sptr_base + even_offset + 1 * ic_step, temp[3]);
    vst1q_f32(sptr_base + even_offset + 2 * ic_step, temp[5]);
    vst1q_f32(sptr_base + even_offset + 3 * ic_step, temp[7]);
}
} // namespace | |||
//! Repack an fp32 NCHW44 image into the zero-padded workspace layout used by
//! the stride-2 direct kernel: inside every padded row of width \p iw2 the
//! pixels at even column indices are stored first and the pixels at odd
//! column indices start at \p odd_start, so the kernel can read both phases
//! with unit stride.
//! \param sptr_base    destination workspace (padded, even/odd split)
//! \param sptr_origin  source image, NCHW44 (4 floats per pixel)
//! \param ph, pw       top/left padding (ph unused here)
//! \param pad_right, pad_top, pad_bottom  remaining paddings
//! \param ih, iw       source height/width; \p iw2 is the padded width
//! \param ic           channel count (walked in steps of 4); \p ic_stride is
//!                     the stride between 4-channel slices of the source
void conv_bias::pack_src_fp32_nchw44_stride2(
        float* sptr_base, const float* sptr_origin, const int ph, const int pw,
        const int pad_right, const int ih, const int iw, const int iw2,
        const int pad_top, const int pad_bottom, const int ic,
        const int ic_stride) {
    constexpr int ic_step = 4;
    // first packed column index of the odd-column pixels within a row
    int odd_start = megdnn::div_ceil(iw2, 2);
    float32x4_t zero_v = vdupq_n_f32(0.f);
    MEGDNN_MARK_USED_VAR(ph);
    // after pw zeros of left padding, does the first payload pixel land on
    // an even overall column index?
    bool even_start = pw % 2 == 0;
    rep_step(ic_idx, ic, ic_step) {
        const float* sptr = sptr_origin + ic_idx * ic_stride;
        // zero-fill the pad_top rows
        memset(sptr_base, 0, sizeof(float) * iw2 * pad_top * ic_step);
        sptr_base += iw2 * pad_top * ic_step;
        rep(ih_idx, ih) {
            int iw_idx = 0;
            // left padding, scattered into the even/odd halves of the row
            rep(idx, pw) {
                if (iw_idx % 2 == 0) {
                    vst1q_f32(sptr_base + iw_idx / 2 * ic_step, zero_v);
                } else {
                    vst1q_f32(sptr_base + (odd_start + iw_idx / 2) * ic_step,
                              zero_v);
                }
                ++iw_idx;
            }
            int src_idx = 0;
            // bulk of the row, 8 pixels at a time; the helper variant
            // depends on the parity of the current column index
            if (even_start) {
                for (; src_idx + 7 < iw; src_idx += 8) {
                    odd_even_split_iw8_even(sptr_base, sptr, odd_start, src_idx,
                                            iw_idx);
                    iw_idx += 8;
                }
            } else {
                for (; src_idx + 7 < iw; src_idx += 8) {
                    odd_even_split_iw8_odd(sptr_base, sptr, odd_start, src_idx,
                                           iw_idx);
                    iw_idx += 8;
                }
            }
            // remaining tail pixels, one at a time
            for (; src_idx < iw; ++src_idx) {
                if (iw_idx % 2 == 0) {
                    vst1q_f32(sptr_base + iw_idx / 2 * ic_step,
                              vld1q_f32(sptr + src_idx * ic_step));
                } else {
                    vst1q_f32(sptr_base + (odd_start + iw_idx / 2) * ic_step,
                              vld1q_f32(sptr + src_idx * ic_step));
                }
                ++iw_idx;
            }
            // right padding
            rep(idx, pad_right) {
                if (iw_idx % 2 == 0) {
                    vst1q_f32(sptr_base + iw_idx / 2 * ic_step, zero_v);
                } else {
                    vst1q_f32(sptr_base + (odd_start + iw_idx / 2) * ic_step,
                              zero_v);
                }
                ++iw_idx;
            }
            sptr_base += iw2 * ic_step;
            sptr += iw * ic_step;
        }
        // zero-fill the pad_bottom rows
        memset(sptr_base, 0, sizeof(float) * iw2 * pad_bottom * ic_step);
        sptr_base += iw2 * pad_bottom * ic_step;
    }
}
template <BiasMode bias_mode, typename Op, int filter_size> | |||
static void conv_direct_stride2_fp32_nchw44( | |||
const float32_t* src, const float32_t* filter, const float32_t* bias, | |||
float32_t*, float32_t* dst, const int oc, const int ic, const int ih, | |||
const int iw, const int oh, const int oh_block, const int ow, | |||
const Op& op, const int, const int) { | |||
constexpr int fh = filter_size; | |||
constexpr int fw = filter_size; | |||
constexpr int ic_step = 4; | |||
#if MEGDNN_ARMV7 | |||
constexpr int big_oc_step = 4; | |||
#else | |||
constexpr int big_oc_step = 8; | |||
#endif | |||
constexpr int oc_step = 4; | |||
constexpr int ih_step = 1; | |||
constexpr int oh_step = 1; | |||
constexpr int ow_step = 8; | |||
constexpr int stride_h = 2; | |||
constexpr int stride_w = 2; | |||
const int img_stride = oh * ow; | |||
const int ow_end = ow / ow_step * ow_step; | |||
const int ow_remain = ow - ow_end; | |||
const int oc_end = oc / big_oc_step * big_oc_step; | |||
const int oc_remain = oc - oc_end; | |||
const int ld_dst_oc = oc_step * img_stride; | |||
const int odd_start = div_ceil(iw, 2); | |||
using remain_fun = std::function<void( | |||
const float32_t* src_ptr, const float32_t* weight_ptr, | |||
const float32_t* bias_ptr, float32_t* dst_ptr, int ic, int ih, | |||
int iw, int ld_dst_oc, const Op& op, | |||
const float32_t* src_ptr_odd_origin)>; | |||
remain_fun kern_big_oc_remain = nullptr; | |||
remain_fun kern_small_oc_remain = nullptr; | |||
switch (ow_remain) { | |||
#define cb(step) \ | |||
case step: \ | |||
kern_big_oc_remain = \ | |||
KerNeonXXs2Nchw44FP32<bias_mode, Op, step, filter_size, \ | |||
big_oc_step, ow_step>::impl; \ | |||
kern_small_oc_remain = \ | |||
KerNeonXXs2Nchw44FP32<bias_mode, Op, step, filter_size, \ | |||
oc_step, ow_step>::impl; \ | |||
break; | |||
UNROLL_CALL_RAW(8, cb); | |||
default: | |||
megdnn_assert(0, "no remain %d for kern", ow_remain); | |||
} | |||
for (int oc_idx = 0; oc_idx < oc_end; oc_idx += big_oc_step) { | |||
const int weight_offset = oc_idx * ic * fh * fw; | |||
for (int oh_idx = 0; oh_idx < oh_block; oh_idx += oh_step) { | |||
for (int ow_idx = 0; ow_idx < ow_end; ow_idx += ow_step) { | |||
const int src_offset = (oh_idx * stride_h * iw + | |||
ow_idx / 2 * stride_w * ih_step) * | |||
ic_step; | |||
const int src_offset_odd = | |||
(oh_idx * stride_h * iw + | |||
ow_idx / 2 * stride_w * ih_step + odd_start) * | |||
ic_step; | |||
const int dst_offset = | |||
oc_idx * img_stride + (oh_idx * ow + ow_idx) * oc_step; | |||
const int bias_offset = | |||
bias_mode == BiasMode::BIAS ? dst_offset : oc_idx; | |||
KerNeonXXs2Nchw44FP32<bias_mode, Op, ow_step, filter_size, | |||
big_oc_step, | |||
ow_step>::impl(src + src_offset, | |||
filter + weight_offset, | |||
bias + bias_offset, | |||
dst + dst_offset, ic, ih, | |||
iw, ld_dst_oc, op, | |||
src + src_offset_odd); | |||
} | |||
if (ow_remain > 0) { | |||
const int src_offset = (oh_idx * stride_h * iw + | |||
ow_end / 2 * stride_w * ih_step) * | |||
ic_step; | |||
const int src_offset_odd = | |||
(oh_idx * stride_h * iw + | |||
ow_end / 2 * stride_w * ih_step + odd_start) * | |||
ic_step; | |||
const int dst_offset = | |||
oc_idx * img_stride + (oh_idx * ow + ow_end) * oc_step; | |||
const int bias_offset = | |||
bias_mode == BiasMode::BIAS ? dst_offset : oc_idx; | |||
kern_big_oc_remain(src + src_offset, filter + weight_offset, | |||
bias + bias_offset, dst + dst_offset, ic, ih, | |||
iw, ld_dst_oc, op, src + src_offset_odd); | |||
} | |||
} | |||
} | |||
if (oc_remain > 0) { | |||
int oc_idx = oc_end; | |||
const int weight_offset = oc_idx * ic * fh * fw; | |||
for (int oh_idx = 0; oh_idx < oh_block; oh_idx += oh_step) { | |||
for (int ow_idx = 0; ow_idx < ow_end; ow_idx += ow_step) { | |||
const int src_offset = (oh_idx * stride_h * iw + | |||
ow_idx / 2 * stride_w * ih_step) * | |||
ic_step; | |||
const int src_offset_odd = | |||
(oh_idx * stride_h * iw + | |||
ow_idx / 2 * stride_w * ih_step + odd_start) * | |||
ic_step; | |||
const int dst_offset = | |||
oc_idx * img_stride + (oh_idx * ow + ow_idx) * oc_step; | |||
const int bias_offset = | |||
bias_mode == BiasMode::BIAS ? dst_offset : oc_idx; | |||
KerNeonXXs2Nchw44FP32<bias_mode, Op, ow_step, filter_size, | |||
oc_step, | |||
ow_step>::impl(src + src_offset, | |||
filter + weight_offset, | |||
bias + bias_offset, | |||
dst + dst_offset, ic, ih, | |||
iw, ld_dst_oc, op, | |||
src + src_offset_odd); | |||
} | |||
if (ow_remain > 0) { | |||
const int src_offset = (oh_idx * stride_h * iw + | |||
ow_end / 2 * stride_w * ih_step) * | |||
ic_step; | |||
const int src_offset_odd = | |||
(oh_idx * stride_h * iw + | |||
ow_end / 2 * stride_w * ih_step + odd_start) * | |||
ic_step; | |||
const int dst_offset = | |||
oc_idx * img_stride + (oh_idx * ow + ow_end) * oc_step; | |||
const int bias_offset = | |||
bias_mode == BiasMode::BIAS ? dst_offset : oc_idx; | |||
kern_small_oc_remain(src + src_offset, filter + weight_offset, | |||
bias + bias_offset, dst + dst_offset, ic, | |||
ih, iw, ld_dst_oc, op, | |||
src + src_offset_odd); | |||
} | |||
} | |||
} | |||
} | |||
// Stamp out the named per-filter-size entry points
// conv_direct_stride2_{k}x{k}_fp32_nchw44 (declared in the matching kern
// header), each forwarding to the generic implementation above with
// filter_size fixed at compile time.
#define CONSTRUCT_FUNC(filter_size)                                          \
    template <BiasMode bias_mode, typename Op>                               \
    void conv_bias::                                                         \
            conv_direct_stride2_##filter_size##x##filter_size##_fp32_nchw44( \
                    const float32_t* src, const float32_t* filter,           \
                    const float32_t* bias, float32_t* temp, float32_t* dst,  \
                    const int oc, const int ic, const int ih, const int iw,  \
                    const int oh, const int oh_block, const int ow,          \
                    const Op& op, const int ph, const int pw) {              \
        conv_direct_stride2_fp32_nchw44<bias_mode, Op, filter_size>(         \
                src, filter, bias, temp, dst, oc, ic, ih, iw, oh, oh_block,  \
                ow, op, ph, pw);                                             \
    }
CONSTRUCT_FUNC(2);
CONSTRUCT_FUNC(3);
CONSTRUCT_FUNC(5);
CONSTRUCT_FUNC(7);
#undef CONSTRUCT_FUNC
#define INSTANTIATION(stride, i, bias, Op) \ | |||
template void conv_bias::conv_direct_##stride##_##i##x##i##_fp32_nchw44< \ | |||
bias, Op>(const float32_t*, const float32_t*, const float32_t*, \ | |||
float32_t*, float32_t*, const int, const int, const int, \ | |||
const int, const int, const int, const int, const Op&, \ | |||
const int, const int); | |||
#define FOR_OP(stride, i, bias) \ | |||
INSTANTIATION(stride, i, bias, NoneOp<dt_float32>) \ | |||
INSTANTIATION(stride, i, bias, ReluOp<dt_float32>) \ | |||
INSTANTIATION(stride, i, bias, HSwishOp<dt_float32>) | |||
#define FOR_BIAS(stride, i) \ | |||
FOR_OP(stride, i, BiasMode::NO_BIAS) \ | |||
FOR_OP(stride, i, BiasMode::BROADCAST_CHANNEL_BIAS) \ | |||
FOR_OP(stride, i, BiasMode::BIAS) | |||
#define FOR_FILTER(stride) \ | |||
FOR_BIAS(stride, 2) \ | |||
FOR_BIAS(stride, 3) \ | |||
FOR_BIAS(stride, 5) \ | |||
FOR_BIAS(stride, 7) | |||
FOR_FILTER(stride2) | |||
#undef FOR_STRIDE | |||
#undef FOR_FILTER | |||
#undef FOR_IC | |||
#undef FOR_BIAS | |||
#undef FOR_NONLINEAR | |||
#undef FOR_REMAIN | |||
#undef INSTANTIATION |
@@ -0,0 +1,40 @@ | |||
/** | |||
* \file dnn/src/arm_common/conv_bias/fp32/f32_direct_stride2_nchw44_kern.h | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. | |||
*/ | |||
// Fix: this header had no include guard; `#pragma once` added to prevent
// multiple-inclusion redefinition errors.
#pragma once
#include "src/arm_common/conv_bias/opr_impl.h"
#include "src/fallback/conv_bias/common.h"

namespace megdnn {
namespace arm_common {
namespace conv_bias {

// Declarations of the stride-2 NCHW44 fp32 direct-convolution kernels, one
// named entry point per supported filter size; definitions live in
// f32_direct_stride2_nchw44_algo.cpp.
#define KERN(stride, i, layout)                                             \
    template <BiasMode bias_mode, typename Op>                              \
    void conv_direct_##stride##_##i##x##i##_fp32_##layout(                  \
            const float* src, const float* filter, const float* bias,       \
            float* temp, float* dst, const int oc, const int ic,            \
            const int ih, const int iw, const int oh, const int oh_block,   \
            const int ow, const Op& op, const int ph, const int pw);
KERN(stride2, 2, nchw44)
KERN(stride2, 3, nchw44)
KERN(stride2, 5, nchw44)
KERN(stride2, 7, nchw44)
#undef KERN

//! Repack an fp32 NCHW44 source image into the zero-padded, even/odd
//! column-split workspace layout consumed by the stride-2 kernels above.
void pack_src_fp32_nchw44_stride2(float* sptr_base, const float* sptr_origin,
                                  const int ph, const int pw,
                                  const int pad_right, const int ih,
                                  const int iw, const int iw2,
                                  const int pad_top, const int pad_bottom,
                                  const int ic, const int ic_stride);

}  // namespace conv_bias
}  // namespace arm_common
}  // namespace megdnn
@@ -111,7 +111,7 @@ struct KerNeonXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 7, oc_block> { | |||
const int ld_src_ic = ih * iw; | |||
constexpr int c_dim = OCHelper<oc_block>::val; | |||
float32x4_t c[c_dim][8]; | |||
init_ocx_ow8<c_dim, bias_mode>(c, bias_ptr, oc_step); | |||
init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step); | |||
for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { | |||
float32x4_t src[src_reg_size]; | |||
@@ -157,7 +157,7 @@ struct KerNeonXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 5, oc_block> { | |||
const int ld_src_ic = ih * iw; | |||
constexpr int c_dim = OCHelper<oc_block>::val; | |||
float32x4_t c[c_dim][8]; | |||
init_ocx_ow8<c_dim, bias_mode>(c, bias_ptr, oc_step); | |||
init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step); | |||
for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { | |||
float32x4_t src[src_reg_size]; | |||
@@ -201,7 +201,7 @@ struct KerNeonXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 3, oc_block> { | |||
const int ld_src_ic = ih * iw; | |||
constexpr int c_dim = OCHelper<oc_block>::val; | |||
float32x4_t c[c_dim][8]; | |||
init_ocx_ow8<c_dim, bias_mode>(c, bias_ptr, oc_step); | |||
init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step); | |||
for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { | |||
float32x4_t src[src_reg_size]; | |||
@@ -258,7 +258,7 @@ struct KerNeonXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 2, oc_block> { | |||
const int ld_src_ic = ih * iw; | |||
constexpr int c_dim = OCHelper<oc_block>::val; | |||
float32x4_t c[c_dim][8]; | |||
init_ocx_ow8<c_dim, bias_mode>(c, bias_ptr, oc_step); | |||
init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step); | |||
for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { | |||
float32x4_t src[src_reg_size]; | |||
@@ -194,7 +194,20 @@ struct StoreOcxOw8Remain<2, 0, Op, T> { | |||
op({{c[1][6], c[1][7]}}, dst_ptr + ld_dst_oc + 24); | |||
} | |||
}; | |||
//! remain_w == 8: full-width store (same effect as the remain_w == 0
//! specialization) — write all 8 accumulator pairs for both oc slices.
template <typename Op, typename T>
struct StoreOcxOw8Remain<2, 8, Op, T> {
    static void impl(T& c, const Op& op, float32_t* dst_ptr, int ld_dst_oc) {
        op({{c[0][0], c[0][1]}}, dst_ptr);
        op({{c[0][2], c[0][3]}}, dst_ptr + 8);
        op({{c[0][4], c[0][5]}}, dst_ptr + 16);
        op({{c[0][6], c[0][7]}}, dst_ptr + 24);
        op({{c[1][0], c[1][1]}}, dst_ptr + ld_dst_oc);
        op({{c[1][2], c[1][3]}}, dst_ptr + ld_dst_oc + 8);
        op({{c[1][4], c[1][5]}}, dst_ptr + ld_dst_oc + 16);
        op({{c[1][6], c[1][7]}}, dst_ptr + ld_dst_oc + 24);
    }
};
template <typename Op, typename T> | |||
struct StoreOcxOw8Remain<2, 7, Op, T> { | |||
static void impl(T& c, const Op& op, float32_t* dst_ptr, int ld_dst_oc) { | |||
@@ -277,6 +290,15 @@ struct StoreOcxOw8Remain<1, 0, Op, T> { | |||
op({{c[0][6], c[0][7]}}, dst_ptr + 24); | |||
} | |||
}; | |||
//! remain_w == 8, single oc slice: full-width store (same effect as the
//! remain_w == 0 specialization); ld_dst_oc is unused and thus unnamed.
template <typename Op, typename T>
struct StoreOcxOw8Remain<1, 8, Op, T> {
    static void impl(T& c, const Op& op, float32_t* dst_ptr, int) {
        op({{c[0][0], c[0][1]}}, dst_ptr);
        op({{c[0][2], c[0][3]}}, dst_ptr + 8);
        op({{c[0][4], c[0][5]}}, dst_ptr + 16);
        op({{c[0][6], c[0][7]}}, dst_ptr + 24);
    }
};
template <typename Op, typename T> | |||
struct StoreOcxOw8Remain<1, 7, Op, T> { | |||
@@ -499,46 +521,127 @@ inline void init_oc8_ow8(int32x4_t c[2][8], const int32_t* bias_ptr, | |||
} | |||
} | |||
/////////////////////////init_ocx_ow8//////////////////// | |||
template <int c_dim, BiasMode bias_mode, typename T, typename T2> | |||
template <int c_dim, BiasMode bias_mode, int ow_block, typename T, typename T2> | |||
struct InitOcxOw8 { | |||
static void impl(T& c, T2 bias_ptr, int oc_step); | |||
}; | |||
template <BiasMode bias_mode, typename T, typename T2> | |||
struct InitOcxOw8<2, bias_mode, T, T2> { | |||
template <typename T, typename T2> | |||
struct InitOcxOw8<2, BiasMode::NO_BIAS, 8, T, T2> { | |||
static void impl(T& c, const float32_t*, int) { | |||
#define BAIS_INIT(step) \ | |||
c[0][step] = vdupq_n_f32(0); \ | |||
c[1][step] = vdupq_n_f32(0); | |||
UNROLL_CALL_RAW(8, BAIS_INIT); | |||
#undef BAIS_INIT | |||
} | |||
}; | |||
template <typename T, typename T2> | |||
struct InitOcxOw8<2, BiasMode::NO_BIAS, 4, T, T2> { | |||
static void impl(T& c, const float32_t*, int) { | |||
#define BAIS_INIT(step) \ | |||
c[0][step] = vdupq_n_f32(0); \ | |||
c[1][step] = vdupq_n_f32(0); | |||
UNROLL_CALL_RAW(4, BAIS_INIT); | |||
#undef BAIS_INIT | |||
} | |||
}; | |||
template <typename T, typename T2> | |||
struct InitOcxOw8<2, BiasMode::BROADCAST_CHANNEL_BIAS, 8, T, T2> { | |||
static void impl(T& c, const float32_t* bias_ptr, int oc_step) { | |||
if (bias_mode == BiasMode::BROADCAST_CHANNEL_BIAS) { | |||
#define BAIS_INIT(step) \ | |||
c[0][step] = vld1q_f32(bias_ptr); \ | |||
c[1][step] = vld1q_f32(bias_ptr + oc_step); | |||
UNROLL_CALL_RAW(8, BAIS_INIT); | |||
UNROLL_CALL_RAW(8, BAIS_INIT); | |||
#undef BAIS_INIT | |||
} else { | |||
#define BAIS_INIT(step) \ | |||
c[0][step] = vdupq_n_f32(0); \ | |||
c[1][step] = vdupq_n_f32(0); | |||
UNROLL_CALL_RAW(8, BAIS_INIT); | |||
} | |||
}; | |||
template <typename T, typename T2> | |||
struct InitOcxOw8<2, BiasMode::BROADCAST_CHANNEL_BIAS, 4, T, T2> { | |||
static void impl(T& c, const float32_t* bias_ptr, int oc_step) { | |||
#define BAIS_INIT(step) \ | |||
c[0][step] = vld1q_f32(bias_ptr); \ | |||
c[1][step] = vld1q_f32(bias_ptr + oc_step); | |||
UNROLL_CALL_RAW(4, BAIS_INIT); | |||
#undef BAIS_INIT | |||
} | |||
}; | |||
template <typename T, typename T2> | |||
struct InitOcxOw8<2, BiasMode::BIAS, 8, T, T2> { | |||
static void impl(T& c, const float32_t* bias_ptr, int oc_step) { | |||
constexpr int simd_len = 4; | |||
#define BAIS_INIT(step) \ | |||
c[0][step] = vld1q_f32(bias_ptr + step * simd_len); \ | |||
c[1][step] = vld1q_f32(bias_ptr + oc_step + step * simd_len); | |||
UNROLL_CALL_RAW(8, BAIS_INIT); | |||
#undef BAIS_INIT | |||
} | |||
} | |||
}; | |||
template <BiasMode bias_mode, typename T, typename T2> | |||
struct InitOcxOw8<1, bias_mode, T, T2> { | |||
template <typename T, typename T2> | |||
struct InitOcxOw8<2, BiasMode::BIAS, 4, T, T2> { | |||
static void impl(T& c, const float32_t* bias_ptr, int oc_step) { | |||
constexpr int simd_len = 4; | |||
#define BAIS_INIT(step) \ | |||
c[0][step] = vld1q_f32(bias_ptr + step * simd_len); \ | |||
c[1][step] = vld1q_f32(bias_ptr + oc_step + step * simd_len); | |||
UNROLL_CALL_RAW(4, BAIS_INIT); | |||
#undef BAIS_INIT | |||
} | |||
}; | |||
template <typename T, typename T2> | |||
struct InitOcxOw8<1, BiasMode::NO_BIAS, 8, T, T2> { | |||
static void impl(T& c, const float32_t*, int) { | |||
#define BAIS_INIT(step) c[0][step] = vdupq_n_f32(0); | |||
UNROLL_CALL_RAW(8, BAIS_INIT); | |||
#undef BAIS_INIT | |||
} | |||
}; | |||
template <typename T, typename T2> | |||
struct InitOcxOw8<1, BiasMode::NO_BIAS, 4, T, T2> { | |||
static void impl(T& c, const float32_t*, int) { | |||
#define BAIS_INIT(step) c[0][step] = vdupq_n_f32(0); | |||
UNROLL_CALL_RAW(4, BAIS_INIT); | |||
#undef BAIS_INIT | |||
} | |||
}; | |||
template <typename T, typename T2> | |||
struct InitOcxOw8<1, BiasMode::BROADCAST_CHANNEL_BIAS, 8, T, T2> { | |||
static void impl(T& c, const float32_t* bias_ptr, int) { | |||
if (bias_mode == BiasMode::BROADCAST_CHANNEL_BIAS) { | |||
#define BAIS_INIT(step) c[0][step] = vld1q_f32(bias_ptr); | |||
UNROLL_CALL_RAW(8, BAIS_INIT); | |||
UNROLL_CALL_RAW(8, BAIS_INIT); | |||
#undef BAIS_INIT | |||
} else { | |||
#define BAIS_INIT(step) c[0][step] = vdupq_n_f32(0); | |||
UNROLL_CALL_RAW(8, BAIS_INIT); | |||
} | |||
}; | |||
template <typename T, typename T2> | |||
struct InitOcxOw8<1, BiasMode::BROADCAST_CHANNEL_BIAS, 4, T, T2> { | |||
static void impl(T& c, const float32_t* bias_ptr, int) { | |||
#define BAIS_INIT(step) c[0][step] = vld1q_f32(bias_ptr); | |||
UNROLL_CALL_RAW(4, BAIS_INIT); | |||
#undef BAIS_INIT | |||
} | |||
}; | |||
template <typename T, typename T2> | |||
struct InitOcxOw8<1, BiasMode::BIAS, 8, T, T2> { | |||
static void impl(T& c, const float32_t* bias_ptr, int) { | |||
constexpr int simd_len = 4; | |||
#define BAIS_INIT(step) c[0][step] = vld1q_f32(bias_ptr + step * simd_len); | |||
UNROLL_CALL_RAW(8, BAIS_INIT); | |||
#undef BAIS_INIT | |||
} | |||
}; | |||
template <typename T, typename T2> | |||
struct InitOcxOw8<1, BiasMode::BIAS, 4, T, T2> { | |||
static void impl(T& c, const float32_t* bias_ptr, int) { | |||
constexpr int simd_len = 4; | |||
#define BAIS_INIT(step) c[0][step] = vld1q_f32(bias_ptr + step * simd_len); | |||
UNROLL_CALL_RAW(4, BAIS_INIT); | |||
#undef BAIS_INIT | |||
} | |||
} | |||
}; | |||
template <int c_dim, BiasMode bias_mode, typename T, typename T2> | |||
template <int c_dim, BiasMode bias_mode, int ow_block, typename T, typename T2> | |||
inline void init_ocx_ow8(T& c, T2 bias_ptr, int oc_step) { | |||
InitOcxOw8<c_dim, bias_mode, T, T2>::impl(c, bias_ptr, oc_step); | |||
InitOcxOw8<c_dim, bias_mode, ow_block, T, T2>::impl(c, bias_ptr, oc_step); | |||
} | |||
/////////////////////init_ocx_ow4///////////////////// | |||
template <int c_dim, BiasMode bias_mode, typename T> | |||
@@ -638,6 +741,20 @@ struct LoadHelper<6, base_offset, ptr_step, 0, Func, T, T2, XT...> { | |||
UNROLL_CALL_RAW(6, WEIGHT_CB); | |||
} | |||
}; | |||
//! Unrolled 7-element load: the body of each step is whatever WEIGHT_CB was
//! #define'd to just above this group of specializations; the extra variadic
//! args are presumably consumed by that macro — see its definition.
template <int base_offset, int ptr_step, typename Func, typename T, typename T2,
          typename... XT>
struct LoadHelper<7, base_offset, ptr_step, 0, Func, T, T2, XT...> {
    static void impl(T& src, T2 ptr, int, XT... args) {
        UNROLL_CALL_RAW(7, WEIGHT_CB);
    }
};
//! Unrolled 8-element load; same WEIGHT_CB-driven pattern as the
//! specializations above, extended to the ow_block == 8 kernels.
template <int base_offset, int ptr_step, typename Func, typename T, typename T2,
          typename... XT>
struct LoadHelper<8, base_offset, ptr_step, 0, Func, T, T2, XT...> {
    static void impl(T& src, T2 ptr, int, XT... args) {
        UNROLL_CALL_RAW(8, WEIGHT_CB);
    }
};
#undef WEIGHT_CB | |||
#define WEIGHT_CB(step) \ | |||
@@ -674,6 +791,11 @@ struct LoadHelper<7, base_offset, ptr_step, 1, Func, T, T2> { | |||
static void impl(T& src, T2 ptr, int) { UNROLL_CALL_RAW(7, WEIGHT_CB); } | |||
}; | |||
//! 8-element variant for the c_dim == 1 group (WEIGHT_CB re-#define'd above);
//! the int parameter (oc offset elsewhere) is unused here.
template <int base_offset, int ptr_step, typename Func, typename T, typename T2>
struct LoadHelper<8, base_offset, ptr_step, 1, Func, T, T2> {
    static void impl(T& src, T2 ptr, int) { UNROLL_CALL_RAW(8, WEIGHT_CB); }
};
#undef WEIGHT_CB | |||
#define WEIGHT_CB(step) \ | |||
@@ -724,6 +846,13 @@ struct LoadHelper<7, base_offset, ptr_step, 2, Func, T, T2> { | |||
} | |||
}; | |||
//! 8-element variant for the c_dim == 2 group: oc_offset is forwarded to the
//! WEIGHT_CB macro #define'd above this group.
template <int base_offset, int ptr_step, typename Func, typename T, typename T2>
struct LoadHelper<8, base_offset, ptr_step, 2, Func, T, T2> {
    static void impl(T& src, T2 ptr, int oc_offset) {
        UNROLL_CALL_RAW(8, WEIGHT_CB);
    }
};
#undef WEIGHT_CB | |||
template <int weight_number, int base_offset, int ptr_step, int c_dim, | |||
@@ -67,6 +67,7 @@ class ConvBiasImpl::AlgoPack : NonCopyableObj { | |||
AlgoF32Direct f32_direct_large_group{true}; | |||
AlgoF32Direct f32_direct_small_group{false}; | |||
AlgoF32DirectStride2NCHW44 f32_direct_stride2_nchw44; | |||
AlgoF32DirectStride2 f32_direct_stride2_large_group{true}; | |||
AlgoF32DirectStride2 f32_direct_stride2_small_group{false}; | |||
AlgoF32DirectStride1 f32_direct_stride1_large_group{true}; | |||
@@ -125,6 +126,8 @@ public: | |||
direct_algos.emplace_back(&i8x8x16_stride2_large_group); | |||
direct_algos.emplace_back(&i8x8x16_stride2_small_group); | |||
direct_algos.emplace_back(&f32_direct_stride2_nchw_nchw44); | |||
direct_algos.emplace_back(&f32_direct_stride2_nchw44); | |||
direct_algos.emplace_back(&f32_direct_stride1_large_group); | |||
direct_algos.emplace_back(&f32_direct_stride1_small_group); | |||
direct_algos.emplace_back(&f32_direct_stride2_large_group); | |||
@@ -68,6 +68,8 @@ private: | |||
class AlgoF32DirectStride1; | |||
class AlgoF32DirectStride2; | |||
class AlgoF32DirectStride2NCHWNCHW44; | |||
class AlgoF32DirectStride2NCHW44; | |||
class AlgoI8x8x16Direct; | |||
class AlgoI8x8x16Stride2; | |||
class AlgoI8x8x16Stride2Filter2; | |||
@@ -198,6 +198,16 @@ static void benchmark_convbias(Handle* handle, bool is_fp32 = false) { | |||
run(1, 1, 4, 112, 112, 2, 2, true); | |||
run(1, 3, 32, 224, 224, 3, 2, true); | |||
run(1, 3, 64, 224, 224, 7, 2, true); | |||
run(1, 64, 128, 56, 56, 3, 2, false); | |||
run(1, 128, 256, 28, 28, 3, 2, false); | |||
run(1, 256, 512, 14, 14, 3, 2, false); | |||
run(1, 64, 128, 56, 56, 7, 2, false); | |||
run(1, 128, 256, 28, 28, 7, 2, false); | |||
run(1, 256, 512, 14, 14, 7, 2, false); | |||
run(1, 64, 64, 48, 48, 3, 2, false); | |||
} else { | |||
for (size_t stride : {1, 2}) { | |||
printf("stride %zu\n", stride); | |||
@@ -72,18 +72,16 @@ std::vector<conv_bias::TestArg> get_int8_quint8_conv_bias_args( | |||
std::vector<conv_bias::TestArg> get_nchw44_conv_bias_args( | |||
std::vector<size_t> kernel_vec, size_t stride, bool no_pad = false, | |||
bool no_bias = false, bool no_nonlinemode = false, | |||
bool is_input_nchw = false) { | |||
bool is_input_nchw = false, bool support_full_bias = false) { | |||
using namespace conv_bias; | |||
using NLMode = param::ConvBias::NonlineMode; | |||
std::vector<TestArg> args; | |||
auto pack = [&](size_t n, size_t oc, size_t ic, size_t h, size_t w, | |||
size_t kernel, size_t stride, size_t group, NLMode nlmode, | |||
int any_pad = -1) { | |||
megdnn::BiasMode bias_mode, int any_pad = -1) { | |||
constexpr int pack_c = 4; | |||
const size_t pad = any_pad >= 0 ? any_pad : kernel / 2; | |||
auto bias_mode = no_bias ? megdnn::BiasMode::NO_BIAS | |||
: megdnn::BiasMode::BROADCAST_CHANNEL_BIAS; | |||
auto oc_per_group = oc / group; | |||
auto ic_per_group = ic / group; | |||
bool ok_group = (oc % group == 0 && ic % group == 0) && | |||
@@ -116,6 +114,10 @@ std::vector<conv_bias::TestArg> get_nchw44_conv_bias_args( | |||
auto bias_tensor_shape = TensorShape{}; | |||
if (bias_mode == megdnn::BiasMode::BROADCAST_CHANNEL_BIAS) { | |||
bias_tensor_shape = {1, oc / pack_c, 1, 1, pack_c}; | |||
} else if (bias_mode == megdnn::BiasMode::BIAS) { | |||
bias_tensor_shape = {n, oc / pack_c, | |||
(h + 2 * pad - kernel) / stride + 1, | |||
(w + 2 * pad - kernel) / stride + 1, pack_c}; | |||
} | |||
if (group == 1) { | |||
param.sparse = param::ConvBias::Sparse::DENSE; | |||
@@ -149,19 +151,29 @@ std::vector<conv_bias::TestArg> get_nchw44_conv_bias_args( | |||
nonlinemode.emplace_back(NLMode::RELU); | |||
nonlinemode.emplace_back(NLMode::H_SWISH); | |||
} | |||
for (auto nlmode : nonlinemode) | |||
for (size_t n : {1, 2}) | |||
for (size_t kernel : kernel_vec) | |||
for (size_t oc : {4, 12, 32}) | |||
for (size_t ic : {1, 3, 4, 12, 32}) | |||
for (size_t h : {3, 5, 12}) | |||
for (size_t w : {7, 16, 23}) { | |||
for (size_t group = 1; | |||
group <= std::min(oc, ic); ++group) { | |||
pack(n, oc, ic, h, w, kernel, stride, group, | |||
nlmode); | |||
std::vector<megdnn::BiasMode> bias_mode = { | |||
megdnn::BiasMode::BROADCAST_CHANNEL_BIAS}; | |||
if (no_bias) { | |||
bias_mode.emplace_back(megdnn::BiasMode::NO_BIAS); | |||
} | |||
if (support_full_bias) { | |||
bias_mode.emplace_back(megdnn::BiasMode::BIAS); | |||
} | |||
for (auto bias : bias_mode) | |||
for (auto nlmode : nonlinemode) | |||
for (size_t n : {1, 2}) | |||
for (size_t kernel : kernel_vec) | |||
for (size_t oc : {4, 12, 32}) | |||
for (size_t ic : {1, 3, 4, 12, 32}) | |||
for (size_t h : {3, 5, 12}) | |||
for (size_t w : {7, 16, 23}) { | |||
for (size_t group = 1; | |||
group <= std::min(oc, ic); ++group) { | |||
pack(n, oc, ic, h, w, kernel, stride, | |||
group, nlmode, bias); | |||
} | |||
} | |||
} | |||
return args; | |||
} | |||
@@ -325,6 +337,13 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_DIRECT_FP32_SMALL_GROUP) { | |||
get_conv_bias_args({1, 2, 3, 4, 5, 6, 7}, 1, false, false, false), | |||
handle(), "F32DIRECT_SMALL_GROUP"); | |||
} | |||
// Exercise the new F32_CONV_NCHW44_DIRECT_S2 algorithm over all nchw44
// stride-2 argument combinations, with the last argument
// (support_full_bias = true) also covering BiasMode::BIAS.
TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_DIRECT_FP32_NCHW44_S2) {
    check_conv_bias(get_nchw44_conv_bias_args({2, 3, 5, 7}, 2, false, false,
                                              false, false, true),
                    handle(), "F32_CONV_NCHW44_DIRECT_S2");
}
TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_DIRECT_FP32_STR1_LARGE_GROUP) { | |||
check_conv_bias(get_conv_bias_args({2, 3, 5, 7}, 1, false, false, false), | |||
handle(), "F32STRD1_LARGE_GROUP"); | |||