Browse Source

fix(dnn/arm): fix read invalid data in arm kernel

GitOrigin-RevId: f1c4cae667
tags/v1.0.0-rc1
Megvii Engine Team 4 years ago
parent
commit
343335932a
15 changed files with 384 additions and 385 deletions
  1. +80
    -87
      dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h
  2. +89
    -95
      dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h
  3. +71
    -51
      dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h
  4. +17
    -17
      dnn/src/arm_common/conv_bias/fp32/do_conv_stride1.cpp
  5. +2
    -1
      dnn/src/arm_common/conv_bias/fp32/f32_direct_nchw44_algo.cpp
  6. +4
    -4
      dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw_nchw44_s1.cpp
  7. +4
    -4
      dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw_nchw44_s2.cpp
  8. +7
    -7
      dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw44_s1.cpp
  9. +5
    -5
      dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw44_s2.cpp
  10. +4
    -4
      dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1.cpp
  11. +77
    -110
      dnn/src/arm_common/conv_bias/intrinsic_helper.h
  12. +2
    -0
      dnn/src/arm_common/simd_macro/neon_helper.h
  13. +14
    -0
      dnn/test/common/checker.h
  14. +4
    -0
      dnn/test/fallback/warp_perspective.cpp
  15. +4
    -0
      dnn/test/naive/warp_perspective.cpp

+ 80
- 87
dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h View File

@@ -24,82 +24,75 @@ using namespace megdnn;
using namespace arm_common;
namespace {

template <int src_idx, int weight_idx, int c_dim, int ow_block, typename T,
typename T2, typename T3, typename T4>
template <int src_idx, int weight_idx, int c_dim, int ow_block, int remain_w,
typename T, typename T2, typename T3, typename T4>
struct ShiftCalHelper {
static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight);
};

template <int src_idx, int weight_idx, typename T, typename T2, typename T3,
typename T4>
struct ShiftCalHelper<src_idx, weight_idx, 2, 8, T, T2, T3, T4> {
static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) {
#define cb(step, lane) \
c[0][step] = vfmaq_laneq_f32(c[0][step], weight[0][lane], \
src[(step + src_idx) % 8], lane); \
c[1][step] = vfmaq_laneq_f32(c[1][step], weight[1][lane], \
src[(step + src_idx) % 8], lane);

UNROLL_CALL_RAW(8, cb, 0);
UNROLL_CALL_RAW(8, cb, 1);
UNROLL_CALL_RAW(8, cb, 2);
UNROLL_CALL_RAW(8, cb, 3);
#undef cb
}
};
template <int src_idx, int weight_idx, typename T, typename T2, typename T3,
typename T4>
struct ShiftCalHelper<src_idx, weight_idx, 2, 4, T, T2, T3, T4> {
static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) {
#define cb(step, lane) \
c[0][step] = vfmaq_laneq_f32(c[0][step], weight[0][lane], \
src[(step + src_idx) % 4], lane); \
c[1][step] = vfmaq_laneq_f32(c[1][step], weight[1][lane], \
src[(step + src_idx) % 4], lane);

UNROLL_CALL_RAW(4, cb, 0);
UNROLL_CALL_RAW(4, cb, 1);
UNROLL_CALL_RAW(4, cb, 2);
UNROLL_CALL_RAW(4, cb, 3);
#undef cb
}
template <int src_idx, int weight_idx, int c_dim, int ow_block, typename T,
typename T2, typename T3, typename T4>
struct ShiftCalHelper<src_idx, weight_idx, c_dim, ow_block, 0, T, T2, T3, T4> {
static MEGDNN_ALWAYS_INLINE void impl(T&, T2&, T3&) {}
};
template <int src_idx, int weight_idx, typename T, typename T2, typename T3,
typename T4>
struct ShiftCalHelper<src_idx, weight_idx, 1, 8, T, T2, T3, T4> {
static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) {
#define cb(step, lane) \
c[0][step] = vfmaq_laneq_f32(c[0][step], weight[0][lane], \
src[(step + src_idx) % 8], lane);

UNROLL_CALL_RAW(8, cb, 0);
UNROLL_CALL_RAW(8, cb, 1);
UNROLL_CALL_RAW(8, cb, 2);
UNROLL_CALL_RAW(8, cb, 3);
#undef cb
}
};
template <int src_idx, int weight_idx, typename T, typename T2, typename T3,
typename T4>
struct ShiftCalHelper<src_idx, weight_idx, 1, 4, T, T2, T3, T4> {
static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) {
#define cb(step, lane) \
c[0][step] = vfmaq_laneq_f32(c[0][step], weight[0][lane], \
src[(step + src_idx) % 4], lane);
#define cb2(step, lane, ow_block) \
c[0][step] = vfmaq_laneq_f32(c[0][step], weight[0][lane], \
src[(step + src_idx) % ow_block], lane); \
c[1][step] = vfmaq_laneq_f32(c[1][step], weight[1][lane], \
src[(step + src_idx) % ow_block], lane);

UNROLL_CALL_RAW(4, cb, 0);
UNROLL_CALL_RAW(4, cb, 1);
UNROLL_CALL_RAW(4, cb, 2);
UNROLL_CALL_RAW(4, cb, 3);
#define cb(step, lane, ow_block) \
c[0][step] = vfmaq_laneq_f32(c[0][step], weight[0][lane], \
src[(step + src_idx) % ow_block], lane);

#define SHIFT_CAL_HELPER(ow_block, remain_w) \
template <int src_idx, int weight_idx, typename T, typename T2, \
typename T3, typename T4> \
struct ShiftCalHelper<src_idx, weight_idx, 2, ow_block, remain_w, T, T2, \
T3, T4> { \
static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) { \
UNROLL_CALL_RAW(remain_w, cb2, 0, ow_block); \
UNROLL_CALL_RAW(remain_w, cb2, 1, ow_block); \
UNROLL_CALL_RAW(remain_w, cb2, 2, ow_block); \
UNROLL_CALL_RAW(remain_w, cb2, 3, ow_block); \
} \
}; \
template <int src_idx, int weight_idx, typename T, typename T2, \
typename T3, typename T4> \
struct ShiftCalHelper<src_idx, weight_idx, 1, ow_block, remain_w, T, T2, \
T3, T4> { \
static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) { \
UNROLL_CALL_RAW(remain_w, cb, 0, ow_block); \
UNROLL_CALL_RAW(remain_w, cb, 1, ow_block); \
UNROLL_CALL_RAW(remain_w, cb, 2, ow_block); \
UNROLL_CALL_RAW(remain_w, cb, 3, ow_block); \
} \
};

SHIFT_CAL_HELPER(8, 1);
SHIFT_CAL_HELPER(8, 2);
SHIFT_CAL_HELPER(8, 3);
SHIFT_CAL_HELPER(8, 4);
SHIFT_CAL_HELPER(8, 5);
SHIFT_CAL_HELPER(8, 6);
SHIFT_CAL_HELPER(8, 7);
SHIFT_CAL_HELPER(8, 8);

SHIFT_CAL_HELPER(4, 1);
SHIFT_CAL_HELPER(4, 2);
SHIFT_CAL_HELPER(4, 3);
SHIFT_CAL_HELPER(4, 4);

#undef SHIFT_CAL_HELPER
#undef cb
}
};
#undef cb2

template <int src_idx, int weight_idx, int c_dim, int ow_block, typename T,
typename T2, typename T3>
template <int src_idx, int weight_idx, int c_dim, int ow_block, int remain_w,
typename T, typename T2, typename T3>
MEGDNN_ALWAYS_INLINE void cal_helper(T& c, T2& src, T3& weight) {
ShiftCalHelper<src_idx, weight_idx, c_dim, ow_block, T, T2, T3, int>::impl(
c, src, weight);
ShiftCalHelper<src_idx, weight_idx, c_dim, ow_block, remain_w, T, T2, T3,
int>::impl(c, src, weight);
};
template <int oc>
struct OCHelper {
@@ -151,7 +144,7 @@ struct KerNeonXXs1Nchw44FP32<bias_mode, Op, remain_w, 2, oc_block, ow_block> {
const int ld_src_iw = iw * oc_step;
constexpr int c_dim = OCHelper<oc_block>::val;
float32x4_t c[c_dim][ow_block];
init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias);
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias);

for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) {
const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic;
@@ -162,11 +155,11 @@ struct KerNeonXXs1Nchw44FP32<bias_mode, Op, remain_w, 2, oc_block, ow_block> {
0);
load_helper<ic_step, 0, oc_step, c_dim, Vld1q_f32>(
weight, weight_ptr, ld_weight_oc);
cal_helper<0, 0, c_dim, ow_block>(c, src, weight);
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);
src[0] = vld1q_f32(src_ptr + (ow_block)*ic_step);
load_helper<ic_step, 1 * ld_weight, oc_step, c_dim, Vld1q_f32>(
weight, weight_ptr, ld_weight_oc);
cal_helper<1, 0, c_dim, ow_block>(c, src, weight);
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight);
src_ptr += ld_src_iw;
weight_ptr += ld_weight_fh;
}
@@ -196,7 +189,7 @@ struct KerNeonXXs1Nchw44FP32<bias_mode, Op, remain_w, 3, oc_block, ow_block> {
const int ld_src_iw = iw * oc_step;
constexpr int c_dim = OCHelper<oc_block>::val;
float32x4_t c[c_dim][ow_block];
init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias);
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias);

for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) {
const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic;
@@ -207,15 +200,15 @@ struct KerNeonXXs1Nchw44FP32<bias_mode, Op, remain_w, 3, oc_block, ow_block> {
0);
load_helper<ic_step, 0, oc_step, c_dim, Vld1q_f32>(
weight, weight_ptr, ld_weight_oc);
cal_helper<0, 0, c_dim, ow_block>(c, src, weight);
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);
src[0] = vld1q_f32(src_ptr + (ow_block)*ic_step);
load_helper<ic_step, 1 * ld_weight, oc_step, c_dim, Vld1q_f32>(
weight, weight_ptr, ld_weight_oc);
cal_helper<1, 0, c_dim, ow_block>(c, src, weight);
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight);
src[1] = vld1q_f32(src_ptr + (ow_block + 1) * ic_step);
load_helper<ic_step, 2 * ld_weight, oc_step, c_dim, Vld1q_f32>(
weight, weight_ptr, ld_weight_oc);
cal_helper<2, 0, c_dim, ow_block>(c, src, weight);
cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight);
src_ptr += ld_src_iw;
weight_ptr += ld_weight_fh;
}
@@ -244,7 +237,7 @@ struct KerNeonXXs1Nchw44FP32<bias_mode, Op, remain_w, 5, oc_block, ow_block> {
const int ld_src_iw = iw * oc_step;
constexpr int c_dim = OCHelper<oc_block>::val;
float32x4_t c[c_dim][ow_block];
init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias);
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias);

for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) {
const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic;
@@ -255,27 +248,27 @@ struct KerNeonXXs1Nchw44FP32<bias_mode, Op, remain_w, 5, oc_block, ow_block> {
0);
load_helper<ic_step, 0, oc_step, c_dim, Vld1q_f32>(
weight, weight_ptr, ld_weight_oc);
cal_helper<0, 0, c_dim, ow_block>(c, src, weight);
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);

src[0] = vld1q_f32(src_ptr + (ow_block)*ic_step);
load_helper<ic_step, 1 * ld_weight, oc_step, c_dim, Vld1q_f32>(
weight, weight_ptr, ld_weight_oc);
cal_helper<1, 0, c_dim, ow_block>(c, src, weight);
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight);

src[1] = vld1q_f32(src_ptr + (ow_block + 1) * ic_step);
load_helper<ic_step, 2 * ld_weight, oc_step, c_dim, Vld1q_f32>(
weight, weight_ptr, ld_weight_oc);
cal_helper<2, 0, c_dim, ow_block>(c, src, weight);
cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight);

src[2] = vld1q_f32(src_ptr + (ow_block + 2) * ic_step);
load_helper<ic_step, 3 * ld_weight, oc_step, c_dim, Vld1q_f32>(
weight, weight_ptr, ld_weight_oc);
cal_helper<3, 0, c_dim, ow_block>(c, src, weight);
cal_helper<3, 0, c_dim, ow_block, remain_w>(c, src, weight);

src[3] = vld1q_f32(src_ptr + (ow_block + 3) * ic_step);
load_helper<ic_step, 4 * ld_weight, oc_step, c_dim, Vld1q_f32>(
weight, weight_ptr, ld_weight_oc);
cal_helper<4, 0, c_dim, ow_block>(c, src, weight);
cal_helper<4, 0, c_dim, ow_block, remain_w>(c, src, weight);
src_ptr += ld_src_iw;
weight_ptr += ld_weight_fh;
}
@@ -305,7 +298,7 @@ struct KerNeonXXs1Nchw44FP32<bias_mode, Op, remain_w, 7, oc_block, ow_block> {
const int ld_src_iw = iw * oc_step;
constexpr int c_dim = OCHelper<oc_block>::val;
float32x4_t c[c_dim][ow_block];
init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias);
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias);

for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) {
const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic;
@@ -316,37 +309,37 @@ struct KerNeonXXs1Nchw44FP32<bias_mode, Op, remain_w, 7, oc_block, ow_block> {
0);
load_helper<ic_step, 0, oc_step, c_dim, Vld1q_f32>(
weight, weight_ptr, ld_weight_oc);
cal_helper<0, 0, c_dim, ow_block>(c, src, weight);
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);

src[0] = vld1q_f32(src_ptr + (ow_block)*ic_step);
load_helper<ic_step, 1 * ld_weight, oc_step, c_dim, Vld1q_f32>(
weight, weight_ptr, ld_weight_oc);
cal_helper<1, 0, c_dim, ow_block>(c, src, weight);
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight);

src[1] = vld1q_f32(src_ptr + (ow_block + 1) * ic_step);
load_helper<ic_step, 2 * ld_weight, oc_step, c_dim, Vld1q_f32>(
weight, weight_ptr, ld_weight_oc);
cal_helper<2, 0, c_dim, ow_block>(c, src, weight);
cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight);

src[2] = vld1q_f32(src_ptr + (ow_block + 2) * ic_step);
load_helper<ic_step, 3 * ld_weight, oc_step, c_dim, Vld1q_f32>(
weight, weight_ptr, ld_weight_oc);
cal_helper<3, 0, c_dim, ow_block>(c, src, weight);
cal_helper<3, 0, c_dim, ow_block, remain_w>(c, src, weight);

src[3] = vld1q_f32(src_ptr + (ow_block + 3) * ic_step);
load_helper<ic_step, 4 * ld_weight, oc_step, c_dim, Vld1q_f32>(
weight, weight_ptr, ld_weight_oc);
cal_helper<4, 0, c_dim, ow_block>(c, src, weight);
cal_helper<4, 0, c_dim, ow_block, remain_w>(c, src, weight);

src[4] = vld1q_f32(src_ptr + (ow_block + 4) * ic_step);
load_helper<ic_step, 5 * ld_weight, oc_step, c_dim, Vld1q_f32>(
weight, weight_ptr, ld_weight_oc);
cal_helper<5, 0, c_dim, ow_block>(c, src, weight);
cal_helper<5, 0, c_dim, ow_block, remain_w>(c, src, weight);

src[5] = vld1q_f32(src_ptr + (ow_block + 5) * ic_step);
load_helper<ic_step, 6 * ld_weight, oc_step, c_dim, Vld1q_f32>(
weight, weight_ptr, ld_weight_oc);
cal_helper<6, 0, c_dim, ow_block>(c, src, weight);
cal_helper<6, 0, c_dim, ow_block, remain_w>(c, src, weight);
src_ptr += ld_src_iw;
weight_ptr += ld_weight_fh;
}


+ 89
- 95
dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h View File

@@ -24,83 +24,77 @@ using namespace megdnn;
using namespace arm_common;
namespace {

template <int src_idx, int weight_idx, int c_dim, int ow_block, typename T,
typename T2, typename T3, typename T4>
template <int src_idx, int weight_idx, int c_dim, int ow_block, int remain_w,
typename T, typename T2, typename T3, typename T4>
struct ShiftCalHelper {
static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight);
};

template <int src_idx, int weight_idx, typename T, typename T2, typename T3,
typename T4>
struct ShiftCalHelper<src_idx, weight_idx, 2, 8, T, T2, T3, T4> {
static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) {
#define cb(step, lane) \
c[0][step] = vfmaq_laneq_f32(c[0][step], weight[0][lane], \
src[(step + src_idx) % 8], lane); \
c[1][step] = vfmaq_laneq_f32(c[1][step], weight[1][lane], \
src[(step + src_idx) % 8], lane);

UNROLL_CALL_RAW(8, cb, 0);
UNROLL_CALL_RAW(8, cb, 1);
UNROLL_CALL_RAW(8, cb, 2);
UNROLL_CALL_RAW(8, cb, 3);
#undef cb
}
};
template <int src_idx, int weight_idx, typename T, typename T2, typename T3,
typename T4>
struct ShiftCalHelper<src_idx, weight_idx, 2, 4, T, T2, T3, T4> {
static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) {
#define cb(step, lane) \
c[0][step] = vfmaq_laneq_f32(c[0][step], weight[0][lane], \
src[(step + src_idx) % 4], lane); \
c[1][step] = vfmaq_laneq_f32(c[1][step], weight[1][lane], \
src[(step + src_idx) % 4], lane);

UNROLL_CALL_RAW(4, cb, 0);
UNROLL_CALL_RAW(4, cb, 1);
UNROLL_CALL_RAW(4, cb, 2);
UNROLL_CALL_RAW(4, cb, 3);
#undef cb
}
template <int src_idx, int weight_idx, int c_dim, int ow_block, typename T,
typename T2, typename T3, typename T4>
struct ShiftCalHelper<src_idx, weight_idx, c_dim, ow_block, 0, T, T2, T3, T4> {
static MEGDNN_ALWAYS_INLINE void impl(T&, T2&, T3&) {}
};
template <int src_idx, int weight_idx, typename T, typename T2, typename T3,
typename T4>
struct ShiftCalHelper<src_idx, weight_idx, 1, 8, T, T2, T3, T4> {
static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) {
#define cb(step, lane) \
c[0][step] = vfmaq_laneq_f32(c[0][step], weight[0][lane], \
src[(step + src_idx) % 8], lane);

UNROLL_CALL_RAW(8, cb, 0);
UNROLL_CALL_RAW(8, cb, 1);
UNROLL_CALL_RAW(8, cb, 2);
UNROLL_CALL_RAW(8, cb, 3);
#undef cb
}
};
template <int src_idx, int weight_idx, typename T, typename T2, typename T3,
typename T4>
struct ShiftCalHelper<src_idx, weight_idx, 1, 4, T, T2, T3, T4> {
static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) {
#define cb(step, lane) \
c[0][step] = vfmaq_laneq_f32(c[0][step], weight[0][lane], \
src[(step + src_idx) % 4], lane);
#define cb2(step, lane, ow_block) \
c[0][step] = vfmaq_laneq_f32(c[0][step], weight[0][lane], \
src[(step + src_idx) % ow_block], lane); \
c[1][step] = vfmaq_laneq_f32(c[1][step], weight[1][lane], \
src[(step + src_idx) % ow_block], lane);

UNROLL_CALL_RAW(4, cb, 0);
UNROLL_CALL_RAW(4, cb, 1);
UNROLL_CALL_RAW(4, cb, 2);
UNROLL_CALL_RAW(4, cb, 3);
#define cb(step, lane, ow_block) \
c[0][step] = vfmaq_laneq_f32(c[0][step], weight[0][lane], \
src[(step + src_idx) % ow_block], lane);

#define SHIFT_CAL_HELPER(ow_block, remain_w) \
template <int src_idx, int weight_idx, typename T, typename T2, \
typename T3, typename T4> \
struct ShiftCalHelper<src_idx, weight_idx, 2, ow_block, remain_w, T, T2, \
T3, T4> { \
static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) { \
UNROLL_CALL_RAW(remain_w, cb2, 0, ow_block); \
UNROLL_CALL_RAW(remain_w, cb2, 1, ow_block); \
UNROLL_CALL_RAW(remain_w, cb2, 2, ow_block); \
UNROLL_CALL_RAW(remain_w, cb2, 3, ow_block); \
} \
}; \
template <int src_idx, int weight_idx, typename T, typename T2, \
typename T3, typename T4> \
struct ShiftCalHelper<src_idx, weight_idx, 1, ow_block, remain_w, T, T2, \
T3, T4> { \
static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) { \
UNROLL_CALL_RAW(remain_w, cb, 0, ow_block); \
UNROLL_CALL_RAW(remain_w, cb, 1, ow_block); \
UNROLL_CALL_RAW(remain_w, cb, 2, ow_block); \
UNROLL_CALL_RAW(remain_w, cb, 3, ow_block); \
} \
};

SHIFT_CAL_HELPER(8, 1);
SHIFT_CAL_HELPER(8, 2);
SHIFT_CAL_HELPER(8, 3);
SHIFT_CAL_HELPER(8, 4);
SHIFT_CAL_HELPER(8, 5);
SHIFT_CAL_HELPER(8, 6);
SHIFT_CAL_HELPER(8, 7);
SHIFT_CAL_HELPER(8, 8);

SHIFT_CAL_HELPER(4, 1);
SHIFT_CAL_HELPER(4, 2);
SHIFT_CAL_HELPER(4, 3);
SHIFT_CAL_HELPER(4, 4);

#undef SHIFT_CAL_HELPER
#undef cb
}
};
#undef cb2

template <int src_idx, int weight_idx, int c_dim, int ow_block, typename T,
typename T2, typename T3>
template <int src_idx, int weight_idx, int c_dim, int ow_block, int remain_w,
typename T, typename T2, typename T3>
MEGDNN_ALWAYS_INLINE void cal_helper(T& c, T2& src, T3& weight) {
ShiftCalHelper<src_idx, weight_idx, c_dim, ow_block, T, T2, T3, int>::impl(
c, src, weight);
ShiftCalHelper<src_idx, weight_idx, c_dim, ow_block, remain_w, T, T2, T3,
int>::impl(c, src, weight);
};

template <int oc>
struct OCHelper {
public:
@@ -151,7 +145,7 @@ struct KerNeonXXs2Nchw44FP32<bias_mode, Op, remain_w, 2, oc_block, ow_block> {
const int ld_src_iw = iw * oc_step;
constexpr int c_dim = OCHelper<oc_block>::val;
float32x4_t c[c_dim][ow_block];
init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias);
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias);

for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic;
@@ -163,13 +157,13 @@ struct KerNeonXXs2Nchw44FP32<bias_mode, Op, remain_w, 2, oc_block, ow_block> {
load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr, 0);
load_helper<4, 0, oc_step, c_dim, Vld1q_f32>(weight, weight_ptr,
ld_weight_oc);
cal_helper<0, 0, c_dim, ow_block>(c, src, weight);
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);

load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr_odd,
0);
load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1q_f32>(
weight, weight_ptr, ld_weight_oc);
cal_helper<0, 0, c_dim, ow_block>(c, src, weight);
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);
src_ptr += ld_src_iw;
src_ptr_odd += ld_src_iw;
weight_ptr += ld_weight_fh;
@@ -177,13 +171,13 @@ struct KerNeonXXs2Nchw44FP32<bias_mode, Op, remain_w, 2, oc_block, ow_block> {
load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr, 0);
load_helper<4, 0, oc_step, c_dim, Vld1q_f32>(weight, weight_ptr,
ld_weight_oc);
cal_helper<0, 0, c_dim, ow_block>(c, src, weight);
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);

load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr_odd,
0);
load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1q_f32>(
weight, weight_ptr, ld_weight_oc);
cal_helper<0, 0, c_dim, ow_block>(c, src, weight);
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);
src_ptr += ld_src_iw;
src_ptr_odd += ld_src_iw;
weight_ptr += ld_weight_fh;
@@ -213,7 +207,7 @@ struct KerNeonXXs2Nchw44FP32<bias_mode, Op, remain_w, 3, oc_block, ow_block> {
const int ld_src_iw = iw * oc_step;
constexpr int c_dim = OCHelper<oc_block>::val;
float32x4_t c[c_dim][ow_block];
init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias);
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias);
for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic;
const float* src_ptr_odd = src_ptr_odd_origin + ic_idx * ld_src_ic;
@@ -224,18 +218,18 @@ struct KerNeonXXs2Nchw44FP32<bias_mode, Op, remain_w, 3, oc_block, ow_block> {
load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr, 0);
load_helper<4, 0, oc_step, c_dim, Vld1q_f32>(weight, weight_ptr,
ld_weight_oc);
cal_helper<0, 0, c_dim, ow_block>(c, src, weight);
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);

src[0] = vld1q_f32(src_ptr + ow_block * simd_len);
load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1q_f32>(
weight, weight_ptr, ld_weight_oc);
cal_helper<1, 0, c_dim, ow_block>(c, src, weight);
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight);

load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr_odd,
0);
load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1q_f32>(
weight, weight_ptr, ld_weight_oc);
cal_helper<0, 0, c_dim, ow_block>(c, src, weight);
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);
src_ptr += ld_src_iw;
src_ptr_odd += ld_src_iw;
weight_ptr += ld_weight_fh;
@@ -243,17 +237,17 @@ struct KerNeonXXs2Nchw44FP32<bias_mode, Op, remain_w, 3, oc_block, ow_block> {
load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr, 0);
load_helper<4, 0, oc_step, c_dim, Vld1q_f32>(weight, weight_ptr,
ld_weight_oc);
cal_helper<0, 0, c_dim, ow_block>(c, src, weight);
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);
src[0] = vld1q_f32(src_ptr + ow_block * simd_len);
load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1q_f32>(
weight, weight_ptr, ld_weight_oc);
cal_helper<1, 0, c_dim, ow_block>(c, src, weight);
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight);

load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr_odd,
0);
load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1q_f32>(
weight, weight_ptr, ld_weight_oc);
cal_helper<0, 0, c_dim, ow_block>(c, src, weight);
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);
src_ptr += ld_src_iw;
src_ptr_odd += ld_src_iw;
weight_ptr += ld_weight_fh;
@@ -261,18 +255,18 @@ struct KerNeonXXs2Nchw44FP32<bias_mode, Op, remain_w, 3, oc_block, ow_block> {
load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr, 0);
load_helper<4, 0, oc_step, c_dim, Vld1q_f32>(weight, weight_ptr,
ld_weight_oc);
cal_helper<0, 0, c_dim, ow_block>(c, src, weight);
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);
src[0] = vld1q_f32(src_ptr + ow_block * simd_len);

load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1q_f32>(
weight, weight_ptr, ld_weight_oc);
cal_helper<1, 0, c_dim, ow_block>(c, src, weight);
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight);

load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr_odd,
0);
load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1q_f32>(
weight, weight_ptr, ld_weight_oc);
cal_helper<0, 0, c_dim, ow_block>(c, src, weight);
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);
src_ptr += ld_src_iw;
src_ptr_odd += ld_src_iw;
weight_ptr += ld_weight_fh;
@@ -302,7 +296,7 @@ struct KerNeonXXs2Nchw44FP32<bias_mode, Op, remain_w, 5, oc_block, ow_block> {
const int ld_src_iw = iw * oc_step;
constexpr int c_dim = OCHelper<oc_block>::val;
float32x4_t c[c_dim][ow_block];
init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias);
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias);

for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic;
@@ -316,25 +310,25 @@ struct KerNeonXXs2Nchw44FP32<bias_mode, Op, remain_w, 5, oc_block, ow_block> {
0);
load_helper<4, 0, oc_step, c_dim, Vld1q_f32>(weight, weight_ptr,
ld_weight_oc);
cal_helper<0, 0, c_dim, ow_block>(c, src, weight);
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);
src[0] = vld1q_f32(src_ptr + ow_block * simd_len);
load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1q_f32>(
weight, weight_ptr, ld_weight_oc);
cal_helper<1, 0, c_dim, ow_block>(c, src, weight);
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight);
src[1] = vld1q_f32(src_ptr + (ow_block + 1) * simd_len);
load_helper<4, 4 * ld_weight, oc_step, c_dim, Vld1q_f32>(
weight, weight_ptr, ld_weight_oc);
cal_helper<2, 0, c_dim, ow_block>(c, src, weight);
cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight);
// odd element
load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(
src, src_ptr_odd, 0);
load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1q_f32>(
weight, weight_ptr, ld_weight_oc);
cal_helper<0, 0, c_dim, ow_block>(c, src, weight);
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);
src[0] = vld1q_f32(src_ptr_odd + ow_block * simd_len);
load_helper<4, 3 * ld_weight, oc_step, c_dim, Vld1q_f32>(
weight, weight_ptr, ld_weight_oc);
cal_helper<1, 0, c_dim, ow_block>(c, src, weight);
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight);

src_ptr += ld_src_iw;
src_ptr_odd += ld_src_iw;
@@ -371,7 +365,7 @@ struct KerNeonXXs2Nchw44FP32<bias_mode, Op, remain_w, 7, oc_block, ow_block> {
const int ld_src_iw = iw * oc_step;
constexpr int c_dim = OCHelper<oc_block>::val;
float32x4_t c[c_dim][ow_block];
init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias);
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias);

for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic;
@@ -385,33 +379,33 @@ struct KerNeonXXs2Nchw44FP32<bias_mode, Op, remain_w, 7, oc_block, ow_block> {
0);
load_helper<4, 0, oc_step, c_dim, Vld1q_f32>(weight, weight_ptr,
ld_weight_oc);
cal_helper<0, 0, c_dim, ow_block>(c, src, weight);
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);
src[0] = vld1q_f32(src_ptr + ow_block * simd_len);
load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1q_f32>(
weight, weight_ptr, ld_weight_oc);
cal_helper<1, 0, c_dim, ow_block>(c, src, weight);
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight);
src[1] = vld1q_f32(src_ptr + (ow_block + 1) * simd_len);
load_helper<4, 4 * ld_weight, oc_step, c_dim, Vld1q_f32>(
weight, weight_ptr, ld_weight_oc);
cal_helper<2, 0, c_dim, ow_block>(c, src, weight);
cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight);
src[2] = vld1q_f32(src_ptr + (ow_block + 2) * simd_len);
load_helper<4, 6 * ld_weight, oc_step, c_dim, Vld1q_f32>(
weight, weight_ptr, ld_weight_oc);
cal_helper<3, 0, c_dim, ow_block>(c, src, weight);
cal_helper<3, 0, c_dim, ow_block, remain_w>(c, src, weight);
// odd element
load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(
src, src_ptr_odd, 0);
load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1q_f32>(
weight, weight_ptr, ld_weight_oc);
cal_helper<0, 0, c_dim, ow_block>(c, src, weight);
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);
src[0] = vld1q_f32(src_ptr_odd + ow_block * simd_len);
load_helper<4, 3 * ld_weight, oc_step, c_dim, Vld1q_f32>(
weight, weight_ptr, ld_weight_oc);
cal_helper<1, 0, c_dim, ow_block>(c, src, weight);
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight);
src[1] = vld1q_f32(src_ptr_odd + (ow_block + 1) * simd_len);
load_helper<4, 5 * ld_weight, oc_step, c_dim, Vld1q_f32>(
weight, weight_ptr, ld_weight_oc);
cal_helper<2, 0, c_dim, ow_block>(c, src, weight);
cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight);

src_ptr += ld_src_iw;
src_ptr_odd += ld_src_iw;


+ 71
- 51
dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h View File

@@ -39,16 +39,18 @@ namespace {
*\tparam T2 is type of src regs
*\tparam T3 is type of weight regs
*/
template <int src_idx, int weight_idx, int c_dim, int stride, typename T,
typename T2, typename T3>
template <int src_idx, int weight_idx, int c_dim, int stride, int remain_w,
typename T, typename T2, typename T3>
struct ShiftCalHelper {
static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight);
};

template <int src_idx, int weight_idx, int stride, typename T, typename T2,
typename T3>
struct ShiftCalHelper<src_idx, weight_idx, 2, stride, T, T2, T3> {
static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) {
template <int src_idx, int weight_idx, int c_dim, int stride, typename T,
typename T2, typename T3>
struct ShiftCalHelper<src_idx, weight_idx, c_dim, stride, 0, T, T2, T3> {
static MEGDNN_ALWAYS_INLINE void impl(T&, T2&, T3&) {}
};

#define cb(step) \
c[0][step] = vfmaq_laneq_f32(c[0][step], weight[0][weight_idx], \
src[(step * stride + src_idx) / 4], \
@@ -57,29 +59,47 @@ struct ShiftCalHelper<src_idx, weight_idx, 2, stride, T, T2, T3> {
src[(step * stride + src_idx) / 4], \
(step * stride + src_idx) % 4);

UNROLL_CALL_RAW(8, cb);
#undef cb
}
};
template <int src_idx, int weight_idx, int stride, typename T, typename T2,
typename T3>
struct ShiftCalHelper<src_idx, weight_idx, 1, stride, T, T2, T3> {
static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) {
#define cb(step) \
#define cb2(step) \
c[0][step] = vfmaq_laneq_f32(c[0][step], weight[0][weight_idx], \
src[(step * stride + src_idx) / 4], \
(step * stride + src_idx) % 4);

UNROLL_CALL_RAW(8, cb);
#define SHIFT_CAL_HELPER(ow_remain) \
template <int src_idx, int weight_idx, int stride, typename T, \
typename T2, typename T3> \
struct ShiftCalHelper<src_idx, weight_idx, 2, stride, ow_remain, T, T2, \
T3> { \
static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) { \
UNROLL_CALL_RAW(ow_remain, cb); \
} \
}; \
template <int src_idx, int weight_idx, int stride, typename T, \
typename T2, typename T3> \
struct ShiftCalHelper<src_idx, weight_idx, 1, stride, ow_remain, T, T2, \
T3> { \
static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) { \
UNROLL_CALL_RAW(ow_remain, cb2); \
} \
};

SHIFT_CAL_HELPER(1)
SHIFT_CAL_HELPER(2)
SHIFT_CAL_HELPER(3)
SHIFT_CAL_HELPER(4)
SHIFT_CAL_HELPER(5)
SHIFT_CAL_HELPER(6)
SHIFT_CAL_HELPER(7)
SHIFT_CAL_HELPER(8)

#undef SHIFT_CAL_HELPER
#undef cb
}
};
#undef cb2

template <int src_idx, int weight_idx, int c_dim, int stride, typename T,
typename T2, typename T3>
template <int src_idx, int weight_idx, int c_dim, int stride, int remain_w,
typename T, typename T2, typename T3>
MEGDNN_ALWAYS_INLINE void cal_helper(T& c, T2& src, T3& weight) {
ShiftCalHelper<src_idx, weight_idx, c_dim, stride, T, T2, T3>::impl(c, src,
weight);
ShiftCalHelper<src_idx, weight_idx, c_dim, stride, remain_w, T, T2,
T3>::impl(c, src, weight);
};
enum CpuTag {
DEFAULT_CPU_TAG = 0,
@@ -134,7 +154,7 @@ struct KerNeonXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 7, oc_block, stride,
const int ld_src_ic = ih * iw;
constexpr int c_dim = OCHelper<oc_block>::val;
float32x4_t c[c_dim][8];
init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step);
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step);

for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
float32x4_t src[src_reg_size];
@@ -145,13 +165,13 @@ struct KerNeonXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 7, oc_block, stride,
src, src_ptr + step * iw, 0); \
load_helper<filter_size, 0, oc_step, c_dim, Vld1q_f32>( \
weight, weight_ptr + step * ld_weight_fw, ld_weight_oc); \
cal_helper<0, 0, c_dim, stride>(c, src, weight); \
cal_helper<1, 1, c_dim, stride>(c, src, weight); \
cal_helper<2, 2, c_dim, stride>(c, src, weight); \
cal_helper<3, 3, c_dim, stride>(c, src, weight); \
cal_helper<4, 4, c_dim, stride>(c, src, weight); \
cal_helper<5, 5, c_dim, stride>(c, src, weight); \
cal_helper<6, 6, c_dim, stride>(c, src, weight);
cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight); \
cal_helper<1, 1, c_dim, stride, remain_w>(c, src, weight); \
cal_helper<2, 2, c_dim, stride, remain_w>(c, src, weight); \
cal_helper<3, 3, c_dim, stride, remain_w>(c, src, weight); \
cal_helper<4, 4, c_dim, stride, remain_w>(c, src, weight); \
cal_helper<5, 5, c_dim, stride, remain_w>(c, src, weight); \
cal_helper<6, 6, c_dim, stride, remain_w>(c, src, weight);

UNROLL_CALL_RAW(7, KERNEL_CB)
#undef KERNEL_CB
@@ -185,7 +205,7 @@ struct KerNeonXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 5, oc_block, stride,
const int ld_src_ic = ih * iw;
constexpr int c_dim = OCHelper<oc_block>::val;
float32x4_t c[c_dim][8];
init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step);
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step);

for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
float32x4_t src[src_reg_size];
@@ -196,11 +216,11 @@ struct KerNeonXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 5, oc_block, stride,
src, src_ptr + step * iw, 0); \
load_helper<filter_size, 0, oc_step, c_dim, Vld1q_f32>( \
weight, weight_ptr + step * ld_weight_fw, ld_weight_oc); \
cal_helper<0, 0, c_dim, stride>(c, src, weight); \
cal_helper<1, 1, c_dim, stride>(c, src, weight); \
cal_helper<2, 2, c_dim, stride>(c, src, weight); \
cal_helper<3, 3, c_dim, stride>(c, src, weight); \
cal_helper<4, 4, c_dim, stride>(c, src, weight);
cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight); \
cal_helper<1, 1, c_dim, stride, remain_w>(c, src, weight); \
cal_helper<2, 2, c_dim, stride, remain_w>(c, src, weight); \
cal_helper<3, 3, c_dim, stride, remain_w>(c, src, weight); \
cal_helper<4, 4, c_dim, stride, remain_w>(c, src, weight);
UNROLL_CALL_RAW(5, KERNEL_CB)
#undef KERNEL_CB

@@ -233,7 +253,7 @@ struct KerNeonXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 3, oc_block, stride,
const int ld_src_ic = ih * iw;
constexpr int c_dim = OCHelper<oc_block>::val;
float32x4_t c[c_dim][8];
init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step);
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step);

for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
float32x4_t src[src_reg_size];
@@ -243,27 +263,27 @@ struct KerNeonXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 3, oc_block, stride,
0);
load_helper<filter_size, 0, oc_step, c_dim, Vld1q_f32>(
weight, weight_ptr, ld_weight_oc);
cal_helper<0, 0, c_dim, stride>(c, src, weight);
cal_helper<1, 1, c_dim, stride>(c, src, weight);
cal_helper<2, 2, c_dim, stride>(c, src, weight);
cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight);
cal_helper<1, 1, c_dim, stride, remain_w>(c, src, weight);
cal_helper<2, 2, c_dim, stride, remain_w>(c, src, weight);

// row 1
load_helper<src_reg_size, 0, simd_len, 0, Vld1q_f32>(
src, src_ptr + iw, 0);
load_helper<filter_size, 0, oc_step, c_dim, Vld1q_f32>(
weight, weight_ptr + 1 * ld_weight_fw, ld_weight_oc);
cal_helper<0, 0, c_dim, stride>(c, src, weight);
cal_helper<1, 1, c_dim, stride>(c, src, weight);
cal_helper<2, 2, c_dim, stride>(c, src, weight);
cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight);
cal_helper<1, 1, c_dim, stride, remain_w>(c, src, weight);
cal_helper<2, 2, c_dim, stride, remain_w>(c, src, weight);

// row 2
load_helper<src_reg_size, 0, simd_len, 0, Vld1q_f32>(
src, src_ptr + 2 * iw, 0);
load_helper<filter_size, 0, oc_step, c_dim, Vld1q_f32>(
weight, weight_ptr + 2 * ld_weight_fw, ld_weight_oc);
cal_helper<0, 0, c_dim, stride>(c, src, weight);
cal_helper<1, 1, c_dim, stride>(c, src, weight);
cal_helper<2, 2, c_dim, stride>(c, src, weight);
cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight);
cal_helper<1, 1, c_dim, stride, remain_w>(c, src, weight);
cal_helper<2, 2, c_dim, stride, remain_w>(c, src, weight);

src_ptr += ld_src_ic;
weight_ptr += ld_weight_ic;
@@ -634,7 +654,7 @@ struct KerNeonXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 2, oc_block, stride,
const int ld_src_ic = ih * iw;
constexpr int c_dim = OCHelper<oc_block>::val;
float32x4_t c[c_dim][8];
init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step);
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step);

for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
float32x4_t src[src_reg_size];
@@ -644,16 +664,16 @@ struct KerNeonXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 2, oc_block, stride,
0);
load_helper<filter_size, 0, oc_step, c_dim, Vld1q_f32>(
weight, weight_ptr, ld_weight_oc);
cal_helper<0, 0, c_dim, stride>(c, src, weight);
cal_helper<1, 1, c_dim, stride>(c, src, weight);
cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight);
cal_helper<1, 1, c_dim, stride, remain_w>(c, src, weight);

// row 1
load_helper<src_reg_size, 0, simd_len, 0, Vld1q_f32>(
src, src_ptr + iw, 0);
load_helper<filter_size, 0, oc_step, c_dim, Vld1q_f32>(
weight, weight_ptr + 1 * ld_weight_fw, ld_weight_oc);
cal_helper<0, 0, c_dim, stride>(c, src, weight);
cal_helper<1, 1, c_dim, stride>(c, src, weight);
cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight);
cal_helper<1, 1, c_dim, stride, remain_w>(c, src, weight);

src_ptr += ld_src_ic;
weight_ptr += ld_weight_ic;


+ 17
- 17
dnn/src/arm_common/conv_bias/fp32/do_conv_stride1.cpp View File

@@ -6,14 +6,15 @@
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/

#include <algorithm>

#include "src/arm_common/conv_bias/fp32/do_conv_stride1.h"
#include "src/arm_common/simd_macro/neon_helper.h"
#include "src/arm_common/conv_bias/postprocess_helper.h"
#include "src/arm_common/simd_macro/neon_helper.h"

#include "midout.h"

@@ -27,10 +28,9 @@ using namespace conv_stride1;
using NCBKernSizeParam = fallback::ConvBiasImpl::NCBKernSizeParam;
using NCBKernParam = fallback::ConvBiasImpl::NCBKernParam;


void conv_stride1::do_conv_2x2_stride1(const float* src, const float* filter, float* dst,
size_t IH, size_t IW, size_t OH, size_t OW,
size_t IC) {
void conv_stride1::do_conv_2x2_stride1(const float* src, const float* filter,
float* dst, size_t IH, size_t IW,
size_t OH, size_t OW, size_t IC) {
const size_t tail_step = IW - OW;
//! unroll of 2
size_t ic = 0;
@@ -143,9 +143,9 @@ void conv_stride1::do_conv_2x2_stride1(const float* src, const float* filter, fl
}
}

void conv_stride1::do_conv_3x3_stride1(const float* src, const float* filter, float* dst,
size_t IH, size_t IW, size_t OH, size_t OW,
size_t IC) {
void conv_stride1::do_conv_3x3_stride1(const float* src, const float* filter,
float* dst, size_t IH, size_t IW,
size_t OH, size_t OW, size_t IC) {
const size_t tail_step = IW - OW;

rep(ic, IC) {
@@ -193,7 +193,7 @@ void conv_stride1::do_conv_3x3_stride1(const float* src, const float* filter, fl
MEGDNN_SIMD_TYPE _r22 = MEGDNN_SIMD_EXT(_r20, _r20n, 2);

MEGDNN_SIMD_TYPE _r30 = MEGDNN_SIMD_LOADU(r3);
MEGDNN_SIMD_TYPE _r30n = MEGDNN_SIMD_LOADU(r3 + 4);
MEGDNN_SIMD_TYPE _r30n = MEGDNN_SIMD_LOADU_2(r3 + 4);
MEGDNN_SIMD_TYPE _r31 = MEGDNN_SIMD_EXT(_r30, _r30n, 1);
MEGDNN_SIMD_TYPE _r32 = MEGDNN_SIMD_EXT(_r30, _r30n, 2);

@@ -290,9 +290,9 @@ void conv_stride1::do_conv_3x3_stride1(const float* src, const float* filter, fl
}
}

void conv_stride1::do_conv_5x5_stride1(const float* src, const float* filter, float* dst,
size_t IH, size_t IW, size_t OH, size_t OW,
size_t IC) {
void conv_stride1::do_conv_5x5_stride1(const float* src, const float* filter,
float* dst, size_t IH, size_t IW,
size_t OH, size_t OW, size_t IC) {
const size_t tail_step = IW - OW;

rep(ic, IC) {
@@ -530,9 +530,9 @@ void conv_stride1::do_conv_5x5_stride1(const float* src, const float* filter, fl
}
}

void conv_stride1::do_conv_7x7_stride1(const float* src, const float* filter, float* dst,
size_t IH, size_t IW, size_t OH, size_t OW,
size_t IC) {
void conv_stride1::do_conv_7x7_stride1(const float* src, const float* filter,
float* dst, size_t IH, size_t IW,
size_t OH, size_t OW, size_t IC) {
const size_t tail_step = IW - OW;

rep(ic, IC) {
@@ -688,7 +688,7 @@ void conv_stride1::do_conv_7x7_stride1(const float* src, const float* filter, fl
_sum = MEGDNN_SIMD_FMA_LANE(_sum, _r56, _k39404142, 2);

MEGDNN_SIMD_TYPE _k42434445 = MEGDNN_SIMD_LOADU(k6);
MEGDNN_SIMD_TYPE _k46474849 = MEGDNN_SIMD_LOADU(k6 + 4);
MEGDNN_SIMD_TYPE _k46474849 = MEGDNN_SIMD_LOADU_3(k6 + 4);

MEGDNN_SIMD_TYPE _r60 = MEGDNN_SIMD_LOADU(r6);
MEGDNN_SIMD_TYPE _r64 = MEGDNN_SIMD_LOADU(r6 + 4);


+ 2
- 1
dnn/src/arm_common/conv_bias/fp32/f32_direct_nchw44_algo.cpp View File

@@ -126,7 +126,8 @@ static void do_conv_kern(const WorkspaceBundle& bundle,
? oh_idx * oh_block * ow * pack_c
: oc_idx;
const float* bptr =
kern_param.bias<dt_float32>(batch_id, group_id) + bias_offset;
kern_param.bias<dt_float32>(batch_id, group_id, oc_idx, 1, pack_c) +
bias_offset;

Op op;
conv_bias::conv_direct_fp32_nchw44<bias_mode, Op, filter, stride>(


+ 4
- 4
dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw_nchw44_s1.cpp View File

@@ -69,7 +69,7 @@ struct KerNeonDotXXs2Nchw44Int8<bias_mode, Op, remain_w, 2, oc_block, ow_block,
constexpr int c_dim = OCHelper<oc_block>::val;

int32x4_t c[c_dim][8];
init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias);
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias);
for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) {
int8x16_t src[src_reg];
int8x16_t weight[c_dim][weight_reg];
@@ -117,7 +117,7 @@ struct KerNeonDotXXs2Nchw44Int8<bias_mode, Op, remain_w, 3, oc_block, ow_block,
constexpr int c_dim = OCHelper<oc_block>::val;

int32x4_t c[c_dim][8];
init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias);
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias);
for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) {
int8x16_t src[src_reg];
int8x16_t weight[c_dim][weight_reg];
@@ -171,7 +171,7 @@ struct KerNeonDotXXs2Nchw44Int8<bias_mode, Op, remain_w, 5, oc_block, ow_block,
constexpr int c_dim = OCHelper<oc_block>::val;

int32x4_t c[c_dim][8];
init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias);
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias);

for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) {
int8x16_t src[src_reg];
@@ -220,7 +220,7 @@ struct KerNeonDotXXs2Nchw44Int8<bias_mode, Op, remain_w, 7, oc_block, ow_block,
constexpr int c_dim = OCHelper<oc_block>::val;

int32x4_t c[c_dim][8];
init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias);
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias);

for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) {
int8x16_t src[src_reg];


+ 4
- 4
dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw_nchw44_s2.cpp View File

@@ -80,7 +80,7 @@ struct KerNeonDotXXs2Nchw44Int8<bias_mode, Op, remain_w, 2, oc_block, ow_block,
constexpr int c_dim = OCHelper<oc_block>::val;

int32x4_t c[c_dim][8];
init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias);
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias);
for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) {
int8x16_t src[2][src_reg];
int8x16_t weight[c_dim][weight_reg];
@@ -131,7 +131,7 @@ struct KerNeonDotXXs2Nchw44Int8<bias_mode, Op, remain_w, 3, oc_block, ow_block,
constexpr int c_dim = OCHelper<oc_block>::val;

int32x4_t c[c_dim][8];
init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias);
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias);
for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) {
int8x16_t src[2][src_reg];
int8x16_t weight[c_dim][weight_reg];
@@ -189,7 +189,7 @@ struct KerNeonDotXXs2Nchw44Int8<bias_mode, Op, remain_w, 5, oc_block, ow_block,
constexpr int c_dim = OCHelper<oc_block>::val;

int32x4_t c[c_dim][8];
init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias);
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias);

for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) {
int8x16_t src[2][src_reg];
@@ -244,7 +244,7 @@ struct KerNeonDotXXs2Nchw44Int8<bias_mode, Op, remain_w, 7, oc_block, ow_block,
constexpr int c_dim = OCHelper<oc_block>::val;

int32x4_t c[c_dim][8];
init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias);
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias);

for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) {
int8x16_t src[2][src_reg];


+ 7
- 7
dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw44_s1.cpp View File

@@ -45,7 +45,7 @@ static void ker_neon_dirctconv_2x2s1_oc8_ow8(const int8_t* src_ptr,
int8x16_t src[8 + 1];
int16x8_t temp_c[4];

init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step);
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step);

for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
for (int fh_idx = 0; fh_idx < fh; ++fh_idx) {
@@ -135,7 +135,7 @@ static void ker_neon_dirctconv_2x2s1_oc4_ow8(const int8_t* src_ptr,
int8x16_t weight[1][2];
int8x16_t src[8 + 1];
int16x8_t temp_c[2];
init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step);
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step);

for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
for (int fh_idx = 0; fh_idx < fh; ++fh_idx) {
@@ -224,7 +224,7 @@ struct KerNeonDirectStride1Int8<bias_mode, Op, remain_w, 3, c_dim, DstType> {
int8x16_t weight[3];
int8x16_t src[8 + 2];
int16x8_t temp_c[2];
init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step);
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step);

for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
for (int fh_idx = 0; fh_idx < fh; ++fh_idx) {
@@ -306,7 +306,7 @@ struct KerNeonDirectStride1Int8<bias_mode, Op, remain_w, 5, c_dim, DstType> {
int8x16_t weight[5];
int8x16_t src[8 + 2];
int16x8_t temp_c[2];
init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step);
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step);

for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
for (int fh_idx = 0; fh_idx < fh; ++fh_idx) {
@@ -409,7 +409,7 @@ struct KerNeonDirectStride1Int8<bias_mode, Op, remain_w, 7, c_dim, DstType> {
int8x16_t weight[7];
int8x16_t src[8 + 2];
int16x8_t temp_c[2];
init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step);
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step);

for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
for (int fh_idx = 0; fh_idx < fh; ++fh_idx) {
@@ -569,7 +569,7 @@ void conv_direct_stride1_2x2_int8_nchw44(const int8_t* src,
(oh_idx * iw + ow_idx) * ic_step * pack_iw_len;
const size_t dst_offset =
oc_idx * img_stride + (oh_idx * ow + ow_idx) * oc_step;
ker_neon_dirctconv_2x2s1_oc8_ow8<bias_mode, Op, 0, filter_size,
ker_neon_dirctconv_2x2s1_oc8_ow8<bias_mode, Op, ow_step, filter_size,
2, DstType>(
src + src_offset, filter + weight_offset, bias + oc_idx,
dst + dst_offset, ic, ih, iw, ld_oc, op);
@@ -594,7 +594,7 @@ void conv_direct_stride1_2x2_int8_nchw44(const int8_t* src,
(oh_idx * iw + ow_idx) * ic_step * pack_iw_len;
const size_t dst_offset =
oc_idx * img_stride + (oh_idx * ow + ow_idx) * oc_step;
ker_neon_dirctconv_2x2s1_oc4_ow8<bias_mode, Op, 0, filter_size,
ker_neon_dirctconv_2x2s1_oc4_ow8<bias_mode, Op, ow_step, filter_size,
1, DstType>(
src + src_offset, filter + weight_offset, bias + oc_idx,
dst + dst_offset, ic, ih, iw, ld_oc, op);


+ 5
- 5
dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw44_s2.cpp View File

@@ -54,7 +54,7 @@ static void ker_neon_dirctconv_2x2s2_oc8_ow8(const int8_t* src_ptr,
int8x16_t src[8 + 1];
int16x8_t temp_c[4];

init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step);
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step);

for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
for (int fh_idx = 0; fh_idx < fh; ++fh_idx) {
@@ -151,7 +151,7 @@ static void ker_neon_dirctconv_2x2s2_oc4_ow8(const int8_t* src_ptr,
int8x16_t weight[2];
int8x16_t src[8 + 1];
int16x8_t temp_c[2];
init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step);
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step);

for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
for (int fh_idx = 0; fh_idx < fh; ++fh_idx) {
@@ -239,7 +239,7 @@ struct KerNeonDirectStride2Int8<bias_mode, Op, remain_w, 3, c_dim, DstType> {
int8x16_t weight[3];
int8x16_t src[8 + 2];
int16x8_t temp_c[4];
init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step);
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step);

for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
for (int fh_idx = 0; fh_idx < fh; ++fh_idx) {
@@ -327,7 +327,7 @@ struct KerNeonDirectStride2Int8<bias_mode, Op, remain_w, 5, c_dim, DstType> {
int8x16_t weight[5];
int8x16_t src[8 + 2];
int16x8_t temp_c[4];
init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step);
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step);
for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
for (int fh_idx = 0; fh_idx < fh; ++fh_idx) {
const int8_t* src_ic_0_3 = src_ptr + ic_idx * ic_stride +
@@ -435,7 +435,7 @@ struct KerNeonDirectStride2Int8<bias_mode, Op, remain_w, 7, c_dim, DstType> {
int8x16_t weight[7];
int8x16_t src[8 + 2];
int16x8_t temp_c[4];
init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step);
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step);
for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
for (int fh_idx = 0; fh_idx < fh; ++fh_idx) {
const int8_t* src_ic_0_3 = src_ptr + ic_idx * ic_stride +


+ 4
- 4
dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1.cpp View File

@@ -131,7 +131,7 @@ struct KerNeonXXs2NchwNchw44<bias_mode, Op, remain_w, 2, oc_block, 1> {
const int ld_weight_oc = oc_step * filter_height * filter_width * ic;
constexpr int c_dim = OCHelper<oc_block>::val;
int32x4_t c[c_dim][8];
init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step);
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step);

for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
const int8_t* nchw_src_ptr = src_ptr + ic_idx * ic_stride;
@@ -178,7 +178,7 @@ struct KerNeonXXs2NchwNchw44<bias_mode, Op, remain_w, 3, oc_block, 1> {
const int ld_weight_oc = oc_step * filter_height * filter_width * ic;
constexpr int c_dim = OCHelper<oc_block>::val;
int32x4_t c[c_dim][8];
init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step);
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step);

for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
const int8_t* nchw_src_ptr = src_ptr + ic_idx * ic_stride;
@@ -232,7 +232,7 @@ struct KerNeonXXs2NchwNchw44<bias_mode, Op, remain_w, 5, oc_block, 1> {
const int ld_weight_oc = oc_step * filter_height * filter_width * ic;
constexpr int c_dim = OCHelper<oc_block>::val;
int32x4_t c[c_dim][8];
init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step);
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step);

for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
const int8_t* nchw_src_ptr = src_ptr + ic_idx * ic_stride;
@@ -279,7 +279,7 @@ struct KerNeonXXs2NchwNchw44<bias_mode, Op, remain_w, 7, oc_block, 1> {
const int ld_weight_oc = oc_step * filter_height * filter_width * ic;
constexpr int c_dim = OCHelper<oc_block>::val;
int32x4_t c[c_dim][8];
init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step);
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step);

for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
const int8_t* nchw_src_ptr = src_ptr + ic_idx * ic_stride;


+ 77
- 110
dnn/src/arm_common/conv_bias/intrinsic_helper.h View File

@@ -643,128 +643,95 @@ __ai int32x4_t neon_vld1q(const int* ptr) {
__ai int16x8_t neon_vld1q(const int16_t* ptr) {
return vld1q_s16(ptr);
}

template <int c_dim, BiasMode bias_mode, int ow_block, typename T, typename T2>
template <typename T>
struct NeonLdqSimd;
template <>
struct NeonLdqSimd<float> {
static constexpr int simd_len = 4;
};
template <>
struct NeonLdqSimd<int> {
static constexpr int simd_len = 4;
};
template <>
struct NeonLdqSimd<int16_t> {
static constexpr int simd_len = 8;
};
template <int c_dim, BiasMode bias_mode, int ow_remain, typename T, typename T2>
struct InitOcxOw8 {
static __ai void impl(T& c, const T2* bias_ptr, int oc_step);
};
template <typename T, typename T2>
struct InitOcxOw8<2, BiasMode::NO_BIAS, 8, T, T2> {
static __ai void impl(T& c, const T2*, int) {
#define BAIS_INIT(step) \
c[0][step] = neon_vdupq_n(static_cast<T2>(0)); \
c[1][step] = neon_vdupq_n(static_cast<T2>(0));
UNROLL_CALL_RAW(8, BAIS_INIT);
#undef BAIS_INIT
}
template <int c_dim, BiasMode bias_mode, typename T, typename T2>
struct InitOcxOw8<c_dim, bias_mode, 0, T, T2> {
static __ai void impl(T&, const T2*, int) {}
};
template <typename T, typename T2>
struct InitOcxOw8<2, BiasMode::NO_BIAS, 4, T, T2> {
static __ai void impl(T& c, const T2*, int) {
#define BAIS_INIT(step) \

#define BAIS_INIT_NO_BIAS_C2(step) \
c[0][step] = neon_vdupq_n(static_cast<T2>(0)); \
c[1][step] = neon_vdupq_n(static_cast<T2>(0));
UNROLL_CALL_RAW(4, BAIS_INIT);
#undef BAIS_INIT
}
};
template <typename T, typename T2>
struct InitOcxOw8<2, BiasMode::BROADCAST_CHANNEL_BIAS, 8, T, T2> {
static __ai void impl(T& c, const T2* bias_ptr, int oc_step) {
#define BAIS_INIT(step) \
c[0][step] = neon_vld1q(bias_ptr); \
c[1][step] = neon_vld1q(bias_ptr + oc_step);
UNROLL_CALL_RAW(8, BAIS_INIT);
#undef BAIS_INIT
}
};
template <typename T, typename T2>
struct InitOcxOw8<2, BiasMode::BROADCAST_CHANNEL_BIAS, 4, T, T2> {
static __ai void impl(T& c, const T2* bias_ptr, int oc_step) {
#define BAIS_INIT(step) \
#define BAIS_INIT_NO_BIAS_C1(step) \
c[0][step] = neon_vdupq_n(static_cast<T2>(0));

#define BAIS_INIT_BROADCAST_C2(step) \
c[0][step] = neon_vld1q(bias_ptr); \
c[1][step] = neon_vld1q(bias_ptr + oc_step);
UNROLL_CALL_RAW(4, BAIS_INIT);
#undef BAIS_INIT
}
};
template <typename T, typename T2>
struct InitOcxOw8<2, BiasMode::BIAS, 8, T, T2> {
static __ai void impl(T& c, const T2* bias_ptr, int oc_step) {
constexpr int simd_len = 4;
#define BAIS_INIT(step) \
c[0][step] = neon_vld1q(bias_ptr + step * simd_len); \
c[1][step] = neon_vld1q(bias_ptr + oc_step + step * simd_len);
UNROLL_CALL_RAW(8, BAIS_INIT);
#undef BAIS_INIT
}
};
template <typename T, typename T2>
struct InitOcxOw8<2, BiasMode::BIAS, 4, T, T2> {
static __ai void impl(T& c, const T2* bias_ptr, int oc_step) {
constexpr int simd_len = 4;
#define BAIS_INIT(step) \
#define BAIS_INIT_BROADCAST_C1(step) c[0][step] = neon_vld1q(bias_ptr);

#define BAIS_INIT_BIAS_C2(step) \
c[0][step] = neon_vld1q(bias_ptr + step * simd_len); \
c[1][step] = neon_vld1q(bias_ptr + oc_step + step * simd_len);
UNROLL_CALL_RAW(4, BAIS_INIT);
#undef BAIS_INIT
}
};

template <typename T, typename T2>
struct InitOcxOw8<1, BiasMode::NO_BIAS, 8, T, T2> {
static __ai void impl(T& c, const T2*, int) {
#define BAIS_INIT(step) c[0][step] = neon_vdupq_n(static_cast<T2>(0));
UNROLL_CALL_RAW(8, BAIS_INIT);
#undef BAIS_INIT
}
};
template <typename T, typename T2>
struct InitOcxOw8<1, BiasMode::NO_BIAS, 4, T, T2> {
static __ai void impl(T& c, const T2*, int) {
#define BAIS_INIT(step) c[0][step] = neon_vdupq_n(static_cast<T2>(0));
UNROLL_CALL_RAW(4, BAIS_INIT);
#undef BAIS_INIT
}
};
template <typename T, typename T2>
struct InitOcxOw8<1, BiasMode::BROADCAST_CHANNEL_BIAS, 8, T, T2> {
static __ai void impl(T& c, const T2* bias_ptr, int) {
#define BAIS_INIT(step) c[0][step] = neon_vld1q(bias_ptr);
UNROLL_CALL_RAW(8, BAIS_INIT);
#undef BAIS_INIT
}
};
template <typename T, typename T2>
struct InitOcxOw8<1, BiasMode::BROADCAST_CHANNEL_BIAS, 4, T, T2> {
static __ai void impl(T& c, const T2* bias_ptr, int) {
#define BAIS_INIT(step) c[0][step] = neon_vld1q(bias_ptr);
UNROLL_CALL_RAW(4, BAIS_INIT);
#undef BAIS_INIT
}
};
template <typename T, typename T2>
struct InitOcxOw8<1, BiasMode::BIAS, 8, T, T2> {
static __ai void impl(T& c, const T2* bias_ptr, int) {
constexpr int simd_len = 4;
#define BAIS_INIT(step) c[0][step] = neon_vld1q(bias_ptr + step * simd_len);
UNROLL_CALL_RAW(8, BAIS_INIT);
#undef BAIS_INIT
}
};
template <typename T, typename T2>
struct InitOcxOw8<1, BiasMode::BIAS, 4, T, T2> {
static __ai void impl(T& c, const T2* bias_ptr, int) {
constexpr int simd_len = 4;
#define BAIS_INIT(step) c[0][step] = neon_vld1q(bias_ptr + step * simd_len);
UNROLL_CALL_RAW(4, BAIS_INIT);
#undef BAIS_INIT
}
};

template <int c_dim, BiasMode bias_mode, int ow_block, typename T, typename T2>
#define BAIS_INIT_BIAS_C1(step) \
c[0][step] = neon_vld1q(bias_ptr + step * simd_len);

#define INSTANCE_InitOcxOw8(ow_remain, cdim) \
template <typename T, typename T2> \
struct InitOcxOw8<cdim, BiasMode::NO_BIAS, ow_remain, T, T2> { \
static __ai void impl(T& c, const T2*, int) { \
UNROLL_CALL_RAW(ow_remain, BAIS_INIT_NO_BIAS_C##cdim); \
} \
}; \
template <typename T, typename T2> \
struct InitOcxOw8<cdim, BiasMode::BROADCAST_CHANNEL_BIAS, ow_remain, T, \
T2> { \
static __ai void impl(T& c, const T2* bias_ptr, int oc_step) { \
(void)oc_step; \
UNROLL_CALL_RAW(ow_remain, BAIS_INIT_BROADCAST_C##cdim); \
} \
}; \
template <typename T, typename T2> \
struct InitOcxOw8<cdim, BiasMode::BIAS, ow_remain, T, T2> { \
static __ai void impl(T& c, const T2* bias_ptr, int oc_step) { \
constexpr int simd_len = NeonLdqSimd<T2>::simd_len; \
(void)oc_step; \
UNROLL_CALL_RAW(ow_remain, BAIS_INIT_BIAS_C##cdim); \
} \
};
#define INSTANCE_InitOcxOw8_C(ow_remain) \
INSTANCE_InitOcxOw8(ow_remain, 2); \
INSTANCE_InitOcxOw8(ow_remain, 1);

INSTANCE_InitOcxOw8_C(1);
INSTANCE_InitOcxOw8_C(2);
INSTANCE_InitOcxOw8_C(3);
INSTANCE_InitOcxOw8_C(4);
INSTANCE_InitOcxOw8_C(5);
INSTANCE_InitOcxOw8_C(6);
INSTANCE_InitOcxOw8_C(7);
INSTANCE_InitOcxOw8_C(8);

#undef INSTANCE_InitOcxOw8
#undef INSTANCE_InitOcxOw8_C
#undef BAIS_INIT_BIAS_C1
#undef BAIS_INIT_BIAS_C2
#undef BAIS_INIT_BROADCAST_C1
#undef BAIS_INIT_BROADCAST_C2
#undef BAIS_INIT_NO_BIAS_C1
#undef BAIS_INIT_NO_BIAS_C2

template <int c_dim, BiasMode bias_mode, int ow_remain, typename T, typename T2>
__ai void init_ocx_ow8(T& c, const T2* bias_ptr, int oc_step) {
InitOcxOw8<c_dim, bias_mode, ow_block, T, T2>::impl(c, bias_ptr, oc_step);
InitOcxOw8<c_dim, bias_mode, ow_remain, T, T2>::impl(c, bias_ptr, oc_step);
}
/////////////////////init_ocx_ow4/////////////////////
template <int c_dim, BiasMode bias_mode, typename T>


+ 2
- 0
dnn/src/arm_common/simd_macro/neon_helper.h View File

@@ -18,6 +18,8 @@
#define MEGDNN_SIMD_TYPE float32x4_t
#define MEGDNN_SIMD_TYPE2 float32x4x2_t
#define MEGDNN_SIMD_LOADU(addr) vld1q_f32(addr)
#define MEGDNN_SIMD_LOADU_2(addr) vcombine_f32(vld1_f32(addr), vdup_n_f32(0.f))
#define MEGDNN_SIMD_LOADU_3(addr) vld1q_lane_f32(addr + 2, vcombine_f32(vld1_f32(addr), vdup_n_f32(0.f)), 2)
#define MEGDNN_SIMD_STOREU(addr, reg) vst1q_f32(addr, reg)
#define MEGDNN_SIMD_SETZERO() vdupq_n_f32(0.0f)
#define MEGDNN_SIMD_SET1(num) vdupq_n_f32(num)


+ 14
- 0
dnn/test/common/checker.h View File

@@ -23,6 +23,20 @@
#include <regex>
#include <unordered_map>

// clang-format off
#if defined(__has_feature)
#if __has_feature(address_sanitizer)
#define MEGDNN_TEST_ASAN 1
#else
#define MEGDNN_TEST_ASAN 0
#endif
#elif defined(__SANITIZE_ADDRESS__)
#define MEGDNN_TEST_ASAN 1
#else
#define MEGDNN_TEST_ASAN 0
#endif
// clang-format on

namespace megdnn {
namespace test {



+ 4
- 0
dnn/test/fallback/warp_perspective.cpp View File

@@ -76,6 +76,9 @@ TEST_F(FALLBACK, WARP_PERSPECTIVE) {
checker.set_param(param);
checker.exec({{1000, 2, 10, 11}, {1000, 3, 3}, {1000, 2, 12, 13}});
}
#if MEGDNN_TEST_ASAN
//! asan detect nan will make test failed
#else
// resize nan case
UniformFloatRNG rng_zero(0, 0);
checker.set_rng(1, &rng_zero);
@@ -85,6 +88,7 @@ TEST_F(FALLBACK, WARP_PERSPECTIVE) {
checker.set_param(param);
checker.exec({{1000, 2, 10, 11}, {1000, 3, 3}, {1000, 2, 12, 13}});
}
#endif
}

TEST_F(FALLBACK, WARP_PERSPECTIVE_MAT_IDX) {


+ 4
- 0
dnn/test/naive/warp_perspective.cpp View File

@@ -352,6 +352,9 @@ TEST_F(NAIVE_MULTI_THREADS, WARP_PERSPECTIVE_FORWARD_HWCD4) {
checker.execs({{22, 10, 1, 11, 4}, {22, 3, 3}, {22, 11, 1, 12, 4}});
}
}
#if MEGDNN_TEST_ASAN
//! asan detect nan will make test failed
#else
// nan case
NanMatRNG rng_nan;
UniformFloatRNG rng_zero(0, 0);
@@ -369,6 +372,7 @@ TEST_F(NAIVE_MULTI_THREADS, WARP_PERSPECTIVE_FORWARD_HWCD4) {
checker.set_param(param);
checker.exec({{10, 10, 1, 11, 4}, {10, 3, 3}, {10, 12, 1, 13, 4}});
}
#endif
}

#if MEGDNN_WITH_BENCHMARK


Loading…
Cancel
Save