GitOrigin-RevId: 65f54a4f7e
tags/v0.5.0
@@ -178,6 +178,22 @@ public:
            const NCBKernSizeParam& param) const override;
};

class ConvBiasImpl::AlgoF32DirectNCHW44 final : public AlgoBase {
    SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param) const;

public:
    AlgoF32DirectNCHW44() {}
    bool is_reproducible() const override { return true; }
    const char* name() const override { return "F32_CONV_NCHW44_DIRECT"; }
    bool usable(fallback::ConvBiasImpl*, const NCBKernSizeParam& param,
                AlgoSelectionStrategy algo_selection_strategy) const override;
    size_t get_workspace(fallback::ConvBiasImpl*,
                         const NCBKernSizeParam& param) const override;
    virtual SmallVector<NCBKern> dispatch_kerns(
            fallback::ConvBiasImpl* opr,
            const NCBKernSizeParam& param) const override;
};

class ConvBiasImpl::AlgoF32DirectStride2NCHW44 final : public AlgoBase {
    SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param) const;
@@ -1,5 +1,5 @@
/**
 * \file dnn/src/arm_common/conv_bias/fp32/f32_direct_stride2_nchw44_algo.cpp
 * \file dnn/src/arm_common/conv_bias/fp32/f32_direct_nchw44_algo.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
@@ -12,10 +12,9 @@
#include "megdnn/oprs.h"
#include "src/arm_common/conv_bias/fp32/algos.h"
#include "src/arm_common/conv_bias/fp32/f32_direct_stride1_nchw44_kern.h"
#include "src/arm_common/conv_bias/fp32/f32_direct_stride2_nchw44_kern.h"
#include "src/arm_common/conv_bias/fp32/strategy.h"
#include "src/arm_common/elemwise_op.h"
#include "src/common/opr_delegate.h"

#include "midout.h"

@@ -25,7 +24,7 @@ using conv_fun = std::function<void(
        WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param,
        const ConvBiasImpl::NCBKernIndex& ncb_index,
        const CpuNDRange& workspace_ids, const CpuNDRange& ncb_range)>;
MIDOUT_DECL(megdnn_arm_common_conv_bias_fp32_nchw44_stride2)
MIDOUT_DECL(megdnn_arm_common_conv_bias_fp32_nchw44_stride1)

namespace {
// block_helper is used to calculate oh block size
static inline int block_helper(const int nthread, const int amount,
@@ -79,7 +78,7 @@ static WorkspaceBundle get_bundle(const ConvBiasImpl::NCBKernSizeParam& param) {
    return {nullptr, {src_size * param.nr_threads}};
};
template <size_t filter, BiasMode bias_mode, typename Op>
template <size_t filter, BiasMode bias_mode, typename Op, int stride>
static void do_conv_kern(WorkspaceBundle bundle,
                         const ConvBiasImpl::NCBKernParam& kern_param,
                         const ConvBiasImpl::NCBKernIndex& ncb_index,
@@ -125,11 +124,17 @@ static void do_conv_kern(WorkspaceBundle bundle,
    const size_t src_size = get_perthread_cache_bytes(ic, ih2, iw2);
    float* sptr = reinterpret_cast<float*>((int8_t*)bundle.get(0) +
                                           ncb_index.thread_id * src_size);
    conv_bias::pack_src_fp32_nchw44_stride2(
            sptr, origin_sptr, ph, pw, remain_right_pad,
            ih_real - src_top_pad - src_bottom_pad, iw, iw2, src_top_pad,
            src_bottom_pad, ic, ih * iw);
    if (stride == 1) {
        conv_bias::pack_src_fp32_nchw44_stride1(
                sptr, origin_sptr, ph, pw, remain_right_pad,
                ih_real - src_top_pad - src_bottom_pad, iw, iw2, src_top_pad,
                src_bottom_pad, ic, ih * iw);
    } else {
        conv_bias::pack_src_fp32_nchw44_stride2(
                sptr, origin_sptr, ph, pw, remain_right_pad,
                ih_real - src_top_pad - src_bottom_pad, iw, iw2, src_top_pad,
                src_bottom_pad, ic, ih * iw);
    }
    const float* fptr =
            kern_param.filter<dt_float32>(group_id) + oc_idx * fh * fw * ic;
@@ -142,46 +147,59 @@ static void do_conv_kern(WorkspaceBundle bundle,
            kern_param.bias<dt_float32>(batch_id, group_id) + bias_offset;
    Op op;
    if (stride == 1) {
#define KERN1_NCHW44_CONV(filter)                                        \
    conv_bias::conv_direct_stride1_##filter##x##filter##_fp32_nchw44<    \
                                                                         \
            bias_mode, Op>(sptr, fptr, bptr, nullptr, dst, oc_block, ic, \
                           ih_real, iw2, oh, oh_block_real, ow, op, ph, pw)
        DISPATCH_FILTER(filter, KERN1_NCHW44_CONV);
#undef KERN1_NCHW44_CONV
    } else {
#define KERN1_NCHW44_CONV(filter)                                        \
    conv_bias::conv_direct_stride2_##filter##x##filter##_fp32_nchw44<    \
                                                                         \
            bias_mode, Op>(sptr, fptr, bptr, nullptr, dst, oc_block, ic, \
                           ih_real, iw2, oh, oh_block_real, ow, op, ph, pw)
    DISPATCH_FILTER(filter, KERN1_NCHW44_CONV);
        DISPATCH_FILTER(filter, KERN1_NCHW44_CONV);
#undef KERN1_NCHW44_CONV
    }
}
}  // namespace
/* ===================== stride2 algo ===================== */
bool ConvBiasImpl::AlgoF32DirectStride2NCHW44::usable(
        fallback::ConvBiasImpl*, const NCBKernSizeParam& param,
        AlgoSelectionStrategy) const {
/* ===================== stride1 algo ===================== */
bool ConvBiasImpl::AlgoF32DirectNCHW44::usable(fallback::ConvBiasImpl*,
                                               const NCBKernSizeParam& param,
                                               AlgoSelectionStrategy) const {
    auto&& fm = param.filter_meta;
    auto fh = fm.spatial[0];
    int oc = fm.ocpg;
    int ic = fm.icpg;
    bool ok_type = ((param.src_type.enumv() == DTypeEnum::Float32 &&
                     param.filter_type.enumv() == DTypeEnum::Float32 &&
                     (param.dst_type.enumv() == DTypeEnum::Float32))) &&
                   (fm.format == param::Convolution::Format::NCHW44);
    bool ok_src_dst = (oc % 4 == 0 && oc >= 4);
    bool ok_src_dst = (oc % 4 == 0 && oc >= 4 && ic % 4 == 0 && ic >= 4);
    bool ok_filter = fm.spatial_ndim == 2 && fh == fm.spatial[1] &&
                     (fh == 2 || fh == 3 || fh == 5 || fh == 7);
    bool ok_slide = fm.dilation[0] == 1 && fm.dilation[1] == 1 &&
                    fm.stride[0] == 2 && fm.stride[1] == 2;
                    ((fm.stride[0] == 1 && fm.stride[1] == 1) ||
                     (fm.stride[0] == 2 && fm.stride[1] == 2));
    bool ok_conv = !fm.should_flip;
    bool available = ok_type && ok_src_dst && ok_filter && ok_slide && ok_conv;
    return available;
}
size_t ConvBiasImpl::AlgoF32DirectStride2NCHW44::get_workspace(
size_t ConvBiasImpl::AlgoF32DirectNCHW44::get_workspace(
        fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
    return get_bundle(param).total_size_in_bytes();
}

SmallVector<ConvBiasImpl::NCBKern>
ConvBiasImpl::AlgoF32DirectStride2NCHW44::dispatch_kerns(
ConvBiasImpl::AlgoF32DirectNCHW44::dispatch_kerns(
        fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
    auto fm = param.filter_meta;
    const int batch = param.n;
@@ -190,27 +208,43 @@ ConvBiasImpl::AlgoF32DirectStride2NCHW44::dispatch_kerns(
    conv_fun do_conv_fun = nullptr;
    // NOTE: remain_w is not used to gen hash of midout for compatible with
    // shape runtime
#define DO_CONV_KERN_FUN(filter, bias_mode, op)                       \
    MIDOUT_BEGIN(megdnn_arm_common_conv_bias_fp32_nchw44_stride2,     \
                 midout_iv(#filter #bias_mode #op##_hash)) {          \
        do_conv_fun = do_conv_kern<filter, bias_mode, op>;            \
    }                                                                 \
#define DO_CONV_KERN_FUN(filter, bias_mode, op, stride)               \
    MIDOUT_BEGIN(megdnn_arm_common_conv_bias_fp32_nchw44_stride1,     \
                 midout_iv(#filter #bias_mode #stride #op##_hash)) {  \
        do_conv_fun = do_conv_kern<filter, bias_mode, op, stride>;    \
    }                                                                 \
    MIDOUT_END();

#define GET_OP_PARAM(filter, bias_mode)                               \
    switch (param.nonlineMode) {                                      \
        case param::ConvBias::NonlineMode::IDENTITY:                  \
            DO_CONV_KERN_FUN(filter, bias_mode, NoneOp<dt_float32>)   \
            break;                                                    \
        case param::ConvBias::NonlineMode::RELU:                      \
            DO_CONV_KERN_FUN(filter, bias_mode, ReluOp<dt_float32>)   \
            break;                                                    \
        case param::ConvBias::NonlineMode::H_SWISH:                   \
            DO_CONV_KERN_FUN(filter, bias_mode, HSwishOp<dt_float32>) \
            break;                                                    \
        default:                                                      \
            megdnn_assert(0);                                         \
            break;                                                    \
#define GET_STRIDE_PARAM(filter, bias_mode, op)                       \
    switch (fm.stride[0]) {                                           \
        case 1:                                                       \
            DO_CONV_KERN_FUN(filter, bias_mode, op, 1);               \
            break;                                                    \
        case 2:                                                       \
            DO_CONV_KERN_FUN(filter, bias_mode, op, 2);               \
            break;                                                    \
                                                                      \
        default:                                                      \
            megdnn_assert(0);                                         \
    }

#define GET_OP_PARAM(filter, bias_mode)                                 \
    switch (param.nonlineMode) {                                        \
        case param::ConvBias::NonlineMode::IDENTITY:                    \
            GET_STRIDE_PARAM(filter, bias_mode, NoneOp<dt_float32>)     \
            break;                                                      \
        case param::ConvBias::NonlineMode::RELU:                        \
            GET_STRIDE_PARAM(filter, bias_mode, ReluOp<dt_float32>)     \
            break;                                                      \
        case param::ConvBias::NonlineMode::H_SWISH:                     \
            GET_STRIDE_PARAM(filter, bias_mode, HSwishOp<dt_float32>)   \
            break;                                                      \
        case param::ConvBias::NonlineMode::SIGMOID:                     \
            GET_STRIDE_PARAM(filter, bias_mode, SigmoidOp<dt_float32>)  \
            break;                                                      \
        default:                                                        \
            megdnn_assert(0);                                           \
            break;                                                      \
    }

#define GET_BIAS_MODE_PARAM(filter)                                     \
@@ -0,0 +1,571 @@
/**
 * \file
 * dnn/src/arm_common/conv_bias/fp32/f32_direct_stride1_nchw44_kern.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#include "src/arm_common/conv_bias/fp32/f32_direct_stride1_nchw44_kern.h" | |||||
#include "src/arm_common/conv_bias/intrinsic_helper.h" | |||||
#include "src/arm_common/elemwise_op.h" | |||||
#include "src/arm_common/simd_macro/marm_neon.h" | |||||
#include "src/common/unroll_macro.h" | |||||
#include "src/common/utils.h" | |||||
#include "src/fallback/conv_bias/common.h" | |||||
using namespace megdnn; | |||||
using namespace arm_common; | |||||
namespace { | |||||
template <int src_idx, int weight_idx, int c_dim, typename Func, int ow_block, | |||||
typename T, typename T2, typename T3, typename T4> | |||||
struct ShiftCalHelper { | |||||
static void impl(T& c, T2& src, T3& weight); | |||||
}; | |||||
template <int src_idx, int weight_idx, typename Func, typename T, typename T2, | |||||
typename T3, typename T4> | |||||
struct ShiftCalHelper<src_idx, weight_idx, 2, Func, 8, T, T2, T3, T4> { | |||||
static void impl(T& c, T2& src, T3& weight) { | |||||
#define cb(step, lane) \ | |||||
c[0][step] = Func::template impl<lane>(c[0][step], weight[0][lane], \ | |||||
src[(step + src_idx) % 8]); \ | |||||
c[1][step] = Func::template impl<lane>(c[1][step], weight[1][lane], \ | |||||
src[(step + src_idx) % 8]); | |||||
UNROLL_CALL_RAW(8, cb, 0); | |||||
UNROLL_CALL_RAW(8, cb, 1); | |||||
UNROLL_CALL_RAW(8, cb, 2); | |||||
UNROLL_CALL_RAW(8, cb, 3); | |||||
#undef cb | |||||
} | |||||
}; | |||||
template <int src_idx, int weight_idx, typename Func, typename T, typename T2, | |||||
typename T3, typename T4> | |||||
struct ShiftCalHelper<src_idx, weight_idx, 2, Func, 4, T, T2, T3, T4> { | |||||
static void impl(T& c, T2& src, T3& weight) { | |||||
#define cb(step, lane) \ | |||||
c[0][step] = Func::template impl<lane>(c[0][step], weight[0][lane], \ | |||||
src[(step + src_idx) % 4]); \ | |||||
c[1][step] = Func::template impl<lane>(c[1][step], weight[1][lane], \ | |||||
src[(step + src_idx) % 4]); | |||||
UNROLL_CALL_RAW(4, cb, 0); | |||||
UNROLL_CALL_RAW(4, cb, 1); | |||||
UNROLL_CALL_RAW(4, cb, 2); | |||||
UNROLL_CALL_RAW(4, cb, 3); | |||||
#undef cb | |||||
} | |||||
}; | |||||
template <int src_idx, int weight_idx, typename Func, typename T, typename T2, | |||||
typename T3, typename T4> | |||||
struct ShiftCalHelper<src_idx, weight_idx, 1, Func, 8, T, T2, T3, T4> { | |||||
static void impl(T& c, T2& src, T3& weight) { | |||||
#define cb(step, lane) \ | |||||
c[0][step] = Func::template impl<lane>(c[0][step], weight[0][lane], \ | |||||
src[(step + src_idx) % 8]); | |||||
UNROLL_CALL_RAW(8, cb, 0); | |||||
UNROLL_CALL_RAW(8, cb, 1); | |||||
UNROLL_CALL_RAW(8, cb, 2); | |||||
UNROLL_CALL_RAW(8, cb, 3); | |||||
#undef cb | |||||
} | |||||
}; | |||||
template <int src_idx, int weight_idx, typename Func, typename T, typename T2, | |||||
typename T3, typename T4> | |||||
struct ShiftCalHelper<src_idx, weight_idx, 1, Func, 4, T, T2, T3, T4> { | |||||
static void impl(T& c, T2& src, T3& weight) { | |||||
#define cb(step, lane) \ | |||||
c[0][step] = Func::template impl<lane>(c[0][step], weight[0][lane], \ | |||||
src[(step + src_idx) % 4]); | |||||
UNROLL_CALL_RAW(4, cb, 0); | |||||
UNROLL_CALL_RAW(4, cb, 1); | |||||
UNROLL_CALL_RAW(4, cb, 2); | |||||
UNROLL_CALL_RAW(4, cb, 3); | |||||
#undef cb | |||||
} | |||||
}; | |||||
template <int src_idx, int weight_idx, int c_dim, typename FUNC, int ow_block, | |||||
typename T, typename T2, typename T3> | |||||
inline void cal_helper(T& c, T2& src, T3& weight) { | |||||
ShiftCalHelper<src_idx, weight_idx, c_dim, FUNC, ow_block, T, T2, T3, | |||||
int>::impl(c, src, weight); | |||||
}; | |||||
template <int oc> | |||||
struct OCHelper { | |||||
public: | |||||
static const int val = -1; | |||||
}; | |||||
template <> | |||||
struct OCHelper<4> { | |||||
public: | |||||
static const int val = 1; | |||||
}; | |||||
#if MEGDNN_AARCH64 | |||||
template <> | |||||
struct OCHelper<8> { | |||||
public: | |||||
static const int val = 2; | |||||
}; | |||||
#endif | |||||
/**
 * oc8_ow8 (m = 8, n = 8) and oc4_ow8 (m = 4, n = 8) GEMM-like kernels
 */
template <BiasMode bias_mode, typename Op, int remain_w, int filter_size,
          int oc_block, int ow_block>
struct KerNeonXXs1Nchw44FP32 {
    static void impl(const float32_t* src_ptr, const float32_t* weight_ptr,
                     const float32_t* bias_ptr, float32_t* dst_ptr, int ic,
                     int ih, int iw, int ld_dst_oc, const Op& op);
};

template <BiasMode bias_mode, typename Op, int remain_w, int oc_block,
          int ow_block>
struct KerNeonXXs1Nchw44FP32<bias_mode, Op, remain_w, 2, oc_block, ow_block> {
    static void impl(const float32_t* src_ptr_origin,
                     const float32_t* weight_ptr, const float32_t* bias_ptr,
                     float32_t* dst_ptr, int ic, int ih, int iw, int ld_dst_oc,
                     const Op& op) {
        constexpr int ic_step = 4;
        constexpr int filter_size = 2;
        constexpr int oc_step = 4;
        constexpr int simd_len = 4;
        constexpr int ld_weight = oc_step * oc_step;
        const int ld_bias = bias_mode == BiasMode::BIAS ? ld_dst_oc : oc_step;
        const int ld_weight_oc = oc_step * filter_size * filter_size * ic;
        const int ld_weight_fh = oc_step * oc_step * filter_size;
        const int ld_src_ic = ih * iw;
        const int ld_src_iw = iw * oc_step;
        constexpr int c_dim = OCHelper<oc_block>::val;
        float32x4_t c[c_dim][ow_block];
        init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias);

        for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) {
            const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic;
            for (int fh_idx = 0; fh_idx < filter_size; ++fh_idx) {
                float32x4_t src[ow_block];
                float32x4_t weight[c_dim][ic_step];
                load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr,
                                                                 0);
                load_helper<ic_step, 0, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<0, 0, c_dim, Vfmaq_laneq_f32, ow_block>(c, src,
                                                                   weight);
                src[0] = vld1q_f32(src_ptr + (ow_block)*ic_step);
                load_helper<ic_step, 1 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<1, 0, c_dim, Vfmaq_laneq_f32, ow_block>(c, src,
                                                                   weight);
                src_ptr += ld_src_iw;
                weight_ptr += ld_weight_fh;
            }
        }
        store_ocx_ow8_remain_static<c_dim, remain_w, Op>(c, op, dst_ptr,
                                                         ld_dst_oc);
    }
};

template <BiasMode bias_mode, typename Op, int remain_w, int oc_block,
          int ow_block>
struct KerNeonXXs1Nchw44FP32<bias_mode, Op, remain_w, 3, oc_block, ow_block> {
    static void impl(const float32_t* src_ptr_origin,
                     const float32_t* weight_ptr, const float32_t* bias_ptr,
                     float32_t* dst_ptr, int ic, int ih, int iw, int ld_dst_oc,
                     const Op& op) {
        constexpr int ic_step = 4;
        constexpr int filter_size = 3;
        constexpr int oc_step = 4;
        constexpr int simd_len = 4;
        constexpr int ld_weight = oc_step * oc_step;
        const int ld_bias = bias_mode == BiasMode::BIAS ? ld_dst_oc : oc_step;
        const int ld_weight_oc = oc_step * filter_size * filter_size * ic;
        const int ld_weight_fh = oc_step * oc_step * filter_size;
        const int ld_src_ic = ih * iw;
        const int ld_src_iw = iw * oc_step;
        constexpr int c_dim = OCHelper<oc_block>::val;
        float32x4_t c[c_dim][ow_block];
        init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias);

        for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) {
            const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic;
            for (int fh_idx = 0; fh_idx < filter_size; ++fh_idx) {
                float32x4_t src[ow_block];
                float32x4_t weight[c_dim][ic_step];
                load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr,
                                                                 0);
                load_helper<ic_step, 0, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<0, 0, c_dim, Vfmaq_laneq_f32, ow_block>(c, src,
                                                                   weight);
                src[0] = vld1q_f32(src_ptr + (ow_block)*ic_step);
                load_helper<ic_step, 1 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<1, 0, c_dim, Vfmaq_laneq_f32, ow_block>(c, src,
                                                                   weight);
                src[1] = vld1q_f32(src_ptr + (ow_block + 1) * ic_step);
                load_helper<ic_step, 2 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<2, 0, c_dim, Vfmaq_laneq_f32, ow_block>(c, src,
                                                                   weight);
                src_ptr += ld_src_iw;
                weight_ptr += ld_weight_fh;
            }
        }
        store_ocx_ow8_remain_static<c_dim, remain_w, Op>(c, op, dst_ptr,
                                                         ld_dst_oc);
    }
};

template <BiasMode bias_mode, typename Op, int remain_w, int oc_block,
          int ow_block>
struct KerNeonXXs1Nchw44FP32<bias_mode, Op, remain_w, 5, oc_block, ow_block> {
    static void impl(const float32_t* src_ptr_origin,
                     const float32_t* weight_ptr, const float32_t* bias_ptr,
                     float32_t* dst_ptr, int ic, int ih, int iw, int ld_dst_oc,
                     const Op& op) {
        constexpr int ic_step = 4;
        constexpr int filter_size = 5;
        constexpr int oc_step = 4;
        constexpr int simd_len = 4;
        constexpr int ld_weight = oc_step * oc_step;
        const int ld_bias = bias_mode == BiasMode::BIAS ? ld_dst_oc : oc_step;
        const int ld_weight_oc = oc_step * filter_size * filter_size * ic;
        const int ld_weight_fh = oc_step * oc_step * filter_size;
        const int ld_src_ic = ih * iw;
        const int ld_src_iw = iw * oc_step;
        constexpr int c_dim = OCHelper<oc_block>::val;
        float32x4_t c[c_dim][ow_block];
        init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias);

        for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) {
            const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic;
            for (int fh_idx = 0; fh_idx < filter_size; ++fh_idx) {
                float32x4_t src[ow_block];
                float32x4_t weight[c_dim][ic_step];
                load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr,
                                                                 0);
                load_helper<ic_step, 0, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<0, 0, c_dim, Vfmaq_laneq_f32, ow_block>(c, src,
                                                                   weight);
                src[0] = vld1q_f32(src_ptr + (ow_block)*ic_step);
                load_helper<ic_step, 1 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<1, 0, c_dim, Vfmaq_laneq_f32, ow_block>(c, src,
                                                                   weight);
                src[1] = vld1q_f32(src_ptr + (ow_block + 1) * ic_step);
                load_helper<ic_step, 2 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<2, 0, c_dim, Vfmaq_laneq_f32, ow_block>(c, src,
                                                                   weight);
                src[2] = vld1q_f32(src_ptr + (ow_block + 2) * ic_step);
                load_helper<ic_step, 3 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<3, 0, c_dim, Vfmaq_laneq_f32, ow_block>(c, src,
                                                                   weight);
                src[3] = vld1q_f32(src_ptr + (ow_block + 3) * ic_step);
                load_helper<ic_step, 4 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<4, 0, c_dim, Vfmaq_laneq_f32, ow_block>(c, src,
                                                                   weight);
                src_ptr += ld_src_iw;
                weight_ptr += ld_weight_fh;
            }
        }
        store_ocx_ow8_remain_static<c_dim, remain_w, Op>(c, op, dst_ptr,
                                                         ld_dst_oc);
    }
};

template <BiasMode bias_mode, typename Op, int remain_w, int oc_block,
          int ow_block>
struct KerNeonXXs1Nchw44FP32<bias_mode, Op, remain_w, 7, oc_block, ow_block> {
    static void impl(const float32_t* src_ptr_origin,
                     const float32_t* weight_ptr, const float32_t* bias_ptr,
                     float32_t* dst_ptr, int ic, int ih, int iw, int ld_dst_oc,
                     const Op& op) {
        constexpr int ic_step = 4;
        constexpr int filter_size = 7;
        constexpr int oc_step = 4;
        constexpr int simd_len = 4;
        constexpr int ld_weight = oc_step * oc_step;
        const int ld_bias = bias_mode == BiasMode::BIAS ? ld_dst_oc : oc_step;
        const int ld_weight_oc = oc_step * filter_size * filter_size * ic;
        const int ld_weight_fh = oc_step * oc_step * filter_size;
        const int ld_src_ic = ih * iw;
        const int ld_src_iw = iw * oc_step;
        constexpr int c_dim = OCHelper<oc_block>::val;
        float32x4_t c[c_dim][ow_block];
        init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias);

        for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) {
            const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic;
            for (int fh_idx = 0; fh_idx < filter_size; ++fh_idx) {
                float32x4_t src[ow_block];
                float32x4_t weight[c_dim][ic_step];
                load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr,
                                                                 0);
                load_helper<ic_step, 0, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<0, 0, c_dim, Vfmaq_laneq_f32, ow_block>(c, src,
                                                                   weight);
                src[0] = vld1q_f32(src_ptr + (ow_block)*ic_step);
                load_helper<ic_step, 1 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<1, 0, c_dim, Vfmaq_laneq_f32, ow_block>(c, src,
                                                                   weight);
                src[1] = vld1q_f32(src_ptr + (ow_block + 1) * ic_step);
                load_helper<ic_step, 2 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<2, 0, c_dim, Vfmaq_laneq_f32, ow_block>(c, src,
                                                                   weight);
                src[2] = vld1q_f32(src_ptr + (ow_block + 2) * ic_step);
                load_helper<ic_step, 3 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<3, 0, c_dim, Vfmaq_laneq_f32, ow_block>(c, src,
                                                                   weight);
                src[3] = vld1q_f32(src_ptr + (ow_block + 3) * ic_step);
                load_helper<ic_step, 4 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<4, 0, c_dim, Vfmaq_laneq_f32, ow_block>(c, src,
                                                                   weight);
                src[4] = vld1q_f32(src_ptr + (ow_block + 4) * ic_step);
                load_helper<ic_step, 5 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<5, 0, c_dim, Vfmaq_laneq_f32, ow_block>(c, src,
                                                                   weight);
                src[5] = vld1q_f32(src_ptr + (ow_block + 5) * ic_step);
                load_helper<ic_step, 6 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<6, 0, c_dim, Vfmaq_laneq_f32, ow_block>(c, src,
                                                                   weight);
                src_ptr += ld_src_iw;
                weight_ptr += ld_weight_fh;
            }
        }
        store_ocx_ow8_remain_static<c_dim, remain_w, Op>(c, op, dst_ptr,
                                                         ld_dst_oc);
    }
};

}  // namespace
void conv_bias::pack_src_fp32_nchw44_stride1(
        float* sptr_base, const float* sptr_origin, const int, const int pw,
        const int pad_right, const int ih, const int iw, const int iw2,
        const int pad_top, const int pad_bottom, const int ic,
        const int ic_stride) {
    constexpr int ic_step = 4;
    rep_step(ic_idx, ic, ic_step) {
        const float* sptr = sptr_origin + ic_idx * ic_stride;
        memset(sptr_base, 0, sizeof(float) * iw2 * pad_top * ic_step);
        sptr_base += iw2 * pad_top * ic_step;
        rep(ih_idx, ih) {
            memset(sptr_base, 0, sizeof(float) * pw * ic_step);
            sptr_base += pw * ic_step;
            memcpy(sptr_base, sptr, sizeof(float) * iw * ic_step);
            sptr_base += iw * ic_step;
            sptr += iw * ic_step;
            memset(sptr_base, 0, sizeof(float) * pad_right * ic_step);
            sptr_base += pad_right * ic_step;
        }
        memset(sptr_base, 0, sizeof(float) * iw2 * pad_bottom * ic_step);
        sptr_base += iw2 * pad_bottom * ic_step;
    }
}
template <BiasMode bias_mode, typename Op, int filter_size>
static void conv_direct_stride1_fp32_nchw44(
        const float32_t* src, const float32_t* filter, const float32_t* bias,
        float32_t*, float32_t* dst, const int oc, const int ic, const int ih,
        const int iw, const int oh, const int oh_block, const int ow,
        const Op& op, const int, const int) {
    constexpr int fh = filter_size;
    constexpr int fw = filter_size;
    constexpr int ic_step = 4;
#if MEGDNN_ARMV7
    constexpr int big_oc_step = 4;
#else
    constexpr int big_oc_step = 8;
#endif
    constexpr int oc_step = 4;
    constexpr int ih_step = 1;
    constexpr int oh_step = 1;
    constexpr int ow_step = 8;
    constexpr int stride_h = 1;
    constexpr int stride_w = 1;

    const int img_stride = oh * ow;
    const int ow_end = ow / ow_step * ow_step;
    const int ow_remain = ow - ow_end;
    const int oc_end = oc / big_oc_step * big_oc_step;
    const int oc_remain = oc - oc_end;
    const int ld_dst_oc = oc_step * img_stride;

    using remain_fun = std::function<void(
            const float32_t* src_ptr, const float32_t* weight_ptr,
            const float32_t* bias_ptr, float32_t* dst_ptr, int ic, int ih,
            int iw, int ld_dst_oc, const Op& op)>;
    remain_fun kern_big_oc_remain = nullptr;
    remain_fun kern_small_oc_remain = nullptr;
    switch (ow_remain) {
#define cb(step)                                                        \
    case step:                                                          \
        kern_big_oc_remain =                                            \
                KerNeonXXs1Nchw44FP32<bias_mode, Op, step, filter_size, \
                                      big_oc_step, ow_step>::impl;      \
        kern_small_oc_remain =                                          \
                KerNeonXXs1Nchw44FP32<bias_mode, Op, step, filter_size, \
                                      oc_step, ow_step>::impl;          \
        break;
        UNROLL_CALL_RAW(8, cb);
        default:
            megdnn_assert(0, "no remain %d for kern", ow_remain);
    }

    for (int oc_idx = 0; oc_idx < oc_end; oc_idx += big_oc_step) {
        const int weight_offset = oc_idx * ic * fh * fw;
        for (int oh_idx = 0; oh_idx < oh_block; oh_idx += oh_step) {
            for (int ow_idx = 0; ow_idx < ow_end; ow_idx += ow_step) {
                const int src_offset =
                        (oh_idx * stride_h * iw + ow_idx * stride_w * ih_step) *
                        ic_step;
                const int dst_offset =
                        oc_idx * img_stride + (oh_idx * ow + ow_idx) * oc_step;
                const int bias_offset =
                        bias_mode == BiasMode::BIAS ? dst_offset : oc_idx;
                KerNeonXXs1Nchw44FP32<bias_mode, Op, ow_step, filter_size,
                                      big_oc_step,
                                      ow_step>::impl(src + src_offset,
                                                     filter + weight_offset,
                                                     bias + bias_offset,
                                                     dst + dst_offset, ic, ih,
                                                     iw, ld_dst_oc, op);
            }
            if (ow_remain > 0) {
                const int src_offset =
                        (oh_idx * stride_h * iw + ow_end * stride_w * ih_step) *
                        ic_step;
                const int dst_offset =
                        oc_idx * img_stride + (oh_idx * ow + ow_end) * oc_step;
                const int bias_offset =
                        bias_mode == BiasMode::BIAS ? dst_offset : oc_idx;
                kern_big_oc_remain(src + src_offset, filter + weight_offset,
                                   bias + bias_offset, dst + dst_offset, ic, ih,
                                   iw, ld_dst_oc, op);
            }
        }
    }
    if (oc_remain > 0) {
        int oc_idx = oc_end;
        const int weight_offset = oc_idx * ic * fh * fw;
        for (int oh_idx = 0; oh_idx < oh_block; oh_idx += oh_step) {
            for (int ow_idx = 0; ow_idx < ow_end; ow_idx += ow_step) {
                const int src_offset =
                        (oh_idx * stride_h * iw + ow_idx * stride_w * ih_step) *
                        ic_step;
                const int dst_offset =
                        oc_idx * img_stride + (oh_idx * ow + ow_idx) * oc_step;
                const int bias_offset =
                        bias_mode == BiasMode::BIAS ? dst_offset : oc_idx;
                KerNeonXXs1Nchw44FP32<bias_mode, Op, ow_step, filter_size,
                                      oc_step,
                                      ow_step>::impl(src + src_offset,
                                                     filter + weight_offset,
                                                     bias + bias_offset,
                                                     dst + dst_offset, ic, ih,
                                                     iw, ld_dst_oc, op);
            }
            if (ow_remain > 0) {
                const int src_offset =
                        (oh_idx * stride_h * iw + ow_end * stride_w * ih_step) *
                        ic_step;
                const int dst_offset =
                        oc_idx * img_stride + (oh_idx * ow + ow_end) * oc_step;
                const int bias_offset =
                        bias_mode == BiasMode::BIAS ? dst_offset : oc_idx;
                kern_small_oc_remain(src + src_offset, filter + weight_offset,
                                     bias + bias_offset, dst + dst_offset, ic,
                                     ih, iw, ld_dst_oc, op);
            }
        }
    }
}
#define CONSTRUCT_FUNC(filter_size)                                          \
    template <BiasMode bias_mode, typename Op>                               \
    void conv_bias::                                                         \
            conv_direct_stride1_##filter_size##x##filter_size##_fp32_nchw44( \
                    const float32_t* src, const float32_t* filter,           \
                    const float32_t* bias, float32_t* temp, float32_t* dst,  \
                    const int oc, const int ic, const int ih, const int iw,  \
                    const int oh, const int oh_block, const int ow,          \
                    const Op& op, const int ph, const int pw) {              \
        conv_direct_stride1_fp32_nchw44<bias_mode, Op, filter_size>(         \
                src, filter, bias, temp, dst, oc, ic, ih, iw, oh, oh_block,  \
                ow, op, ph, pw);                                             \
    }
CONSTRUCT_FUNC(2);
CONSTRUCT_FUNC(3);
CONSTRUCT_FUNC(5);
CONSTRUCT_FUNC(7);
#undef CONSTRUCT_FUNC

#define INSTANTIATION(stride, i, bias, Op)                                     \
    template void conv_bias::conv_direct_##stride##_##i##x##i##_fp32_nchw44<  \
            bias, Op>(const float32_t*, const float32_t*, const float32_t*,   \
                      float32_t*, float32_t*, const int, const int, const int, \
                      const int, const int, const int, const int, const Op&,  \
                      const int, const int);

#define FOR_OP(stride, i, bias)                          \
    INSTANTIATION(stride, i, bias, NoneOp<dt_float32>)   \
    INSTANTIATION(stride, i, bias, ReluOp<dt_float32>)   \
    INSTANTIATION(stride, i, bias, HSwishOp<dt_float32>) \
    INSTANTIATION(stride, i, bias, SigmoidOp<dt_float32>)

#define FOR_BIAS(stride, i)                             \
    FOR_OP(stride, i, BiasMode::NO_BIAS)                \
    FOR_OP(stride, i, BiasMode::BROADCAST_CHANNEL_BIAS) \
    FOR_OP(stride, i, BiasMode::BIAS)

#define FOR_FILTER(stride) \
    FOR_BIAS(stride, 2)    \
    FOR_BIAS(stride, 3)    \
    FOR_BIAS(stride, 5)    \
    FOR_BIAS(stride, 7)

FOR_FILTER(stride1)

#undef FOR_STRIDE
#undef FOR_FILTER
#undef FOR_IC
#undef FOR_BIAS
#undef FOR_NONLINEAR
#undef FOR_REMAIN
#undef INSTANTIATION
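
For readers less familiar with the NCHW44 format, the following is a minimal scalar sketch (not part of the patch) of the computation the stride-1 NEON kernels above implement. The layout assumptions here (src as (ic/4, ih, iw, 4), dst as (oc/4, oh, ow, 4), weights as (oc/4, ic/4, fh, fw, 4 ic, 4 oc), channel-broadcast bias, padding already applied to src) are illustrative only.

// Scalar reference for an fp32 NCHW44 stride-1 direct convolution; the
// layouts above are assumptions for illustration, not the patch itself.
#include <cstdio>
#include <vector>

static void direct_conv_nchw44_ref(const float* src, const float* wgt,
                                   const float* bias, float* dst, int ic,
                                   int oc, int ih, int iw, int oh, int ow,
                                   int fh, int fw) {
    for (int ocb = 0; ocb < oc / 4; ++ocb)
        for (int y = 0; y < oh; ++y)
            for (int x = 0; x < ow; ++x)
                for (int ocl = 0; ocl < 4; ++ocl) {
                    // assumed BROADCAST_CHANNEL_BIAS: one value per channel
                    float acc = bias[ocb * 4 + ocl];
                    for (int icb = 0; icb < ic / 4; ++icb)
                        for (int ky = 0; ky < fh; ++ky)
                            for (int kx = 0; kx < fw; ++kx)
                                for (int icl = 0; icl < 4; ++icl) {
                                    // src: (ic/4, ih, iw, 4), stride 1, no extra padding
                                    float s = src[((icb * ih + y + ky) * iw +
                                                   x + kx) * 4 + icl];
                                    // wgt: (oc/4, ic/4, fh, fw, 4 ic, 4 oc)
                                    float w = wgt[((((ocb * (ic / 4) + icb) * fh + ky) *
                                                    fw + kx) * 4 + icl) * 4 + ocl];
                                    acc += s * w;
                                }
                    // dst: (oc/4, oh, ow, 4)
                    dst[((ocb * oh + y) * ow + x) * 4 + ocl] = acc;
                }
}

int main() {
    // 4 input channels, 4 output channels, 3x3 filter, 4x4 input, 2x2 output.
    const int ic = 4, oc = 4, fh = 3, fw = 3, ih = 4, iw = 4, oh = 2, ow = 2;
    std::vector<float> src(ic / 4 * ih * iw * 4, 1.f);
    std::vector<float> wgt(oc / 4 * ic / 4 * fh * fw * 16, 0.5f);
    std::vector<float> bias(oc, 1.f), dst(oc / 4 * oh * ow * 4, 0.f);
    direct_conv_nchw44_ref(src.data(), wgt.data(), bias.data(), dst.data(), ic,
                           oc, ih, iw, oh, ow, fh, fw);
    printf("dst[0] = %f\n", dst[0]);  // expect 1 + 4 * 3 * 3 * 0.5 = 19
    return 0;
}
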
@@ -0,0 +1,40 @@
/**
 * \file dnn/src/arm_common/conv_bias/fp32/f32_direct_stride1_nchw44_kern.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#include "src/arm_common/conv_bias/opr_impl.h" | |||||
#include "src/fallback/conv_bias/common.h" | |||||
namespace megdnn { | |||||
namespace arm_common { | |||||
namespace conv_bias { | |||||
#define KERN(stride, i, layout) \ | |||||
template <BiasMode bias_mode, typename Op> \ | |||||
void conv_direct_##stride##_##i##x##i##_fp32_##layout( \ | |||||
const float* src, const float* filter, const float* bias, \ | |||||
float* temp, float* dst, const int oc, const int ic, const int ih, \ | |||||
const int iw, const int oh, const int oh_block, const int ow, \ | |||||
const Op& op, const int ph, const int pw); | |||||
KERN(stride1, 2, nchw44) | |||||
KERN(stride1, 3, nchw44) | |||||
KERN(stride1, 5, nchw44) | |||||
KERN(stride1, 7, nchw44) | |||||
#undef KERN | |||||
void pack_src_fp32_nchw44_stride1(float* sptr_base, const float* sptr_origin, | |||||
const int ph, const int pw, | |||||
const int pad_right, const int ih, | |||||
const int iw, const int iw2, | |||||
const int pad_top, const int pad_bottom, | |||||
const int ic, const int ic_stride); | |||||
} // namespace conv_bias | |||||
} // namespace arm_common | |||||
} // namespace megdnn |
@@ -721,10 +721,11 @@ CONSTRUCT_FUNC(7);
                      const int, const int, const int, const int, const Op&, \
                      const int, const int);
#define FOR_OP(stride, i, bias)                        \
    INSTANTIATION(stride, i, bias, NoneOp<dt_float32>) \
    INSTANTIATION(stride, i, bias, ReluOp<dt_float32>) \
    INSTANTIATION(stride, i, bias, HSwishOp<dt_float32>)
#define FOR_OP(stride, i, bias)                          \
    INSTANTIATION(stride, i, bias, NoneOp<dt_float32>)   \
    INSTANTIATION(stride, i, bias, ReluOp<dt_float32>)   \
    INSTANTIATION(stride, i, bias, HSwishOp<dt_float32>) \
    INSTANTIATION(stride, i, bias, SigmoidOp<dt_float32>)

#define FOR_BIAS(stride, i)              \
    FOR_OP(stride, i, BiasMode::NO_BIAS) \
@@ -67,7 +67,7 @@ class ConvBiasImpl::AlgoPack : NonCopyableObj {
    AlgoF32Direct f32_direct_large_group{true};
    AlgoF32Direct f32_direct_small_group{false};
    AlgoF32DirectStride2NCHW44 f32_direct_stride2_nchw44;
    AlgoF32DirectNCHW44 f32_direct_nchw44;
    AlgoF32DirectStride2 f32_direct_stride2_large_group{true};
    AlgoF32DirectStride2 f32_direct_stride2_small_group{false};
    AlgoF32DirectStride1 f32_direct_stride1_large_group{true};
@@ -126,8 +126,7 @@ public:
        direct_algos.emplace_back(&i8x8x16_stride2_large_group);
        direct_algos.emplace_back(&i8x8x16_stride2_small_group);
        direct_algos.emplace_back(&f32_direct_stride2_nchw_nchw44);
        direct_algos.emplace_back(&f32_direct_stride2_nchw44);
        direct_algos.emplace_back(&f32_direct_nchw44);
        direct_algos.emplace_back(&f32_direct_stride1_large_group);
        direct_algos.emplace_back(&f32_direct_stride1_small_group);
        direct_algos.emplace_back(&f32_direct_stride2_large_group);
@@ -66,10 +66,11 @@ private:
#endif
    class AlgoF32Direct;
    class AlgoF32DirectStride1;
    class AlgoF32DirectNCHW44;
    class AlgoF32DirectStride2;
    class AlgoF32DirectStride2NCHWNCHW44;
    class AlgoF32DirectStride2NCHW44;
    class AlgoI8x8x16Direct;
    class AlgoI8x8x16Stride2;
    class AlgoI8x8x16Stride2Filter2;
@@ -6,7 +6,8 @@
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */

#pragma once
@@ -43,6 +44,10 @@ struct SigmoidOp;
        vst1q_##_func_suffix(dst, vitem.val[0]);                    \
        vst1q_##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]);       \
    }                                                               \
    void operator()(const _neon_type& src, _ctype* dst) const {     \
        auto vitem = operator()(src);                               \
        vst1q_##_func_suffix(dst, vitem);                           \
    }                                                               \
    _neon_type2 operator()(const _neon_type2& src) const {          \
        return {{operator()(src.val[0]), operator()(src.val[1])}};  \
    }                                                               \
@@ -203,11 +203,9 @@ static void benchmark_convbias(Handle* handle, bool is_fp32 = false) {
        run(1, 128, 256, 28, 28, 3, 2, false);
        run(1, 256, 512, 14, 14, 3, 2, false);
        run(1, 64, 128, 56, 56, 7, 2, false);
        run(1, 128, 256, 28, 28, 7, 2, false);
        run(1, 256, 512, 14, 14, 7, 2, false);
        run(1, 64, 64, 48, 48, 3, 2, false);
        run(1, 128, 128, 28, 28, 3, 1, false);
        run(1, 256, 256, 14, 14, 3, 1, false);
        run(1, 512, 512, 7, 7, 3, 1, false);
    } else {
        for (size_t stride : {1, 2}) {
            printf("stride %zu\n", stride);
@@ -72,7 +72,8 @@ std::vector<conv_bias::TestArg> get_int8_quint8_conv_bias_args(
std::vector<conv_bias::TestArg> get_nchw44_conv_bias_args(
        std::vector<size_t> kernel_vec, size_t stride, bool no_pad = false,
        bool no_bias = false, bool no_nonlinemode = false,
        bool is_input_nchw = false, bool support_full_bias = false) {
        bool is_input_nchw = false, bool support_full_bias = false,
        bool support_sigmoid = false) {
    using namespace conv_bias;
    using NLMode = param::ConvBias::NonlineMode;
    std::vector<TestArg> args;
@@ -151,6 +152,9 @@ std::vector<conv_bias::TestArg> get_nchw44_conv_bias_args(
        nonlinemode.emplace_back(NLMode::RELU);
        nonlinemode.emplace_back(NLMode::H_SWISH);
    }
    if (support_sigmoid) {
        nonlinemode.emplace_back(NLMode::SIGMOID);
    }
    std::vector<megdnn::BiasMode> bias_mode = {
            megdnn::BiasMode::BROADCAST_CHANNEL_BIAS};
@@ -337,11 +341,16 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_DIRECT_FP32_SMALL_GROUP) {
            get_conv_bias_args({1, 2, 3, 4, 5, 6, 7}, 1, false, false, false),
            handle(), "F32DIRECT_SMALL_GROUP");
}

TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_DIRECT_FP32_NCHW44_S1) {
    check_conv_bias(get_nchw44_conv_bias_args({2, 3, 5, 7}, 1, false, false,
                                              false, false, true, true),
                    handle(), "F32_CONV_NCHW44_DIRECT");
}

TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_DIRECT_FP32_NCHW44_S2) {
    check_conv_bias(get_nchw44_conv_bias_args({2, 3, 5, 7}, 2, false, false,
                                              false, false, true),
                    handle(), "F32_CONV_NCHW44_DIRECT_S2");
                                              false, false, true, true),
                    handle(), "F32_CONV_NCHW44_DIRECT");
}

TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_DIRECT_FP32_STR1_LARGE_GROUP) {
@@ -682,8 +691,8 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD) {
            size_t conv_bias_workspace_in_bytes =
                    conv_bias_opr->get_workspace_in_bytes(
                            tensors[0].layout, filter_transform_layout,
                            tensors[2].layout, tensors[3].layout,
                            tensors[4].layout, nullptr);
                            tensors[2].layout, tensors[3].layout, tensors[4].layout,
                            nullptr);
            WorkspaceBundle wb(nullptr, {filter_transform_layout.span().dist_byte(),
                                         conv_bias_workspace_in_bytes,