GitOrigin-RevId: f1c4cae667
tags/v1.0.0-rc1
@@ -24,82 +24,75 @@ using namespace megdnn; | |||
using namespace arm_common; | |||
namespace { | |||
template <int src_idx, int weight_idx, int c_dim, int ow_block, typename T, | |||
typename T2, typename T3, typename T4> | |||
template <int src_idx, int weight_idx, int c_dim, int ow_block, int remain_w, | |||
typename T, typename T2, typename T3, typename T4> | |||
struct ShiftCalHelper { | |||
static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight); | |||
}; | |||
template <int src_idx, int weight_idx, typename T, typename T2, typename T3, | |||
typename T4> | |||
struct ShiftCalHelper<src_idx, weight_idx, 2, 8, T, T2, T3, T4> { | |||
static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) { | |||
#define cb(step, lane) \ | |||
c[0][step] = vfmaq_laneq_f32(c[0][step], weight[0][lane], \ | |||
src[(step + src_idx) % 8], lane); \ | |||
c[1][step] = vfmaq_laneq_f32(c[1][step], weight[1][lane], \ | |||
src[(step + src_idx) % 8], lane); | |||
UNROLL_CALL_RAW(8, cb, 0); | |||
UNROLL_CALL_RAW(8, cb, 1); | |||
UNROLL_CALL_RAW(8, cb, 2); | |||
UNROLL_CALL_RAW(8, cb, 3); | |||
#undef cb | |||
} | |||
}; | |||
template <int src_idx, int weight_idx, typename T, typename T2, typename T3, | |||
typename T4> | |||
struct ShiftCalHelper<src_idx, weight_idx, 2, 4, T, T2, T3, T4> { | |||
static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) { | |||
#define cb(step, lane) \ | |||
c[0][step] = vfmaq_laneq_f32(c[0][step], weight[0][lane], \ | |||
src[(step + src_idx) % 4], lane); \ | |||
c[1][step] = vfmaq_laneq_f32(c[1][step], weight[1][lane], \ | |||
src[(step + src_idx) % 4], lane); | |||
UNROLL_CALL_RAW(4, cb, 0); | |||
UNROLL_CALL_RAW(4, cb, 1); | |||
UNROLL_CALL_RAW(4, cb, 2); | |||
UNROLL_CALL_RAW(4, cb, 3); | |||
#undef cb | |||
} | |||
template <int src_idx, int weight_idx, int c_dim, int ow_block, typename T, | |||
typename T2, typename T3, typename T4> | |||
struct ShiftCalHelper<src_idx, weight_idx, c_dim, ow_block, 0, T, T2, T3, T4> { | |||
static MEGDNN_ALWAYS_INLINE void impl(T&, T2&, T3&) {} | |||
}; | |||
template <int src_idx, int weight_idx, typename T, typename T2, typename T3, | |||
typename T4> | |||
struct ShiftCalHelper<src_idx, weight_idx, 1, 8, T, T2, T3, T4> { | |||
static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) { | |||
#define cb(step, lane) \ | |||
c[0][step] = vfmaq_laneq_f32(c[0][step], weight[0][lane], \ | |||
src[(step + src_idx) % 8], lane); | |||
UNROLL_CALL_RAW(8, cb, 0); | |||
UNROLL_CALL_RAW(8, cb, 1); | |||
UNROLL_CALL_RAW(8, cb, 2); | |||
UNROLL_CALL_RAW(8, cb, 3); | |||
#undef cb | |||
} | |||
}; | |||
template <int src_idx, int weight_idx, typename T, typename T2, typename T3, | |||
typename T4> | |||
struct ShiftCalHelper<src_idx, weight_idx, 1, 4, T, T2, T3, T4> { | |||
static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) { | |||
#define cb(step, lane) \ | |||
c[0][step] = vfmaq_laneq_f32(c[0][step], weight[0][lane], \ | |||
src[(step + src_idx) % 4], lane); | |||
#define cb2(step, lane, ow_block) \ | |||
c[0][step] = vfmaq_laneq_f32(c[0][step], weight[0][lane], \ | |||
src[(step + src_idx) % ow_block], lane); \ | |||
c[1][step] = vfmaq_laneq_f32(c[1][step], weight[1][lane], \ | |||
src[(step + src_idx) % ow_block], lane); | |||
UNROLL_CALL_RAW(4, cb, 0); | |||
UNROLL_CALL_RAW(4, cb, 1); | |||
UNROLL_CALL_RAW(4, cb, 2); | |||
UNROLL_CALL_RAW(4, cb, 3); | |||
#define cb(step, lane, ow_block) \ | |||
c[0][step] = vfmaq_laneq_f32(c[0][step], weight[0][lane], \ | |||
src[(step + src_idx) % ow_block], lane); | |||
#define SHIFT_CAL_HELPER(ow_block, remain_w) \ | |||
template <int src_idx, int weight_idx, typename T, typename T2, \ | |||
typename T3, typename T4> \ | |||
struct ShiftCalHelper<src_idx, weight_idx, 2, ow_block, remain_w, T, T2, \ | |||
T3, T4> { \ | |||
static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) { \ | |||
UNROLL_CALL_RAW(remain_w, cb2, 0, ow_block); \ | |||
UNROLL_CALL_RAW(remain_w, cb2, 1, ow_block); \ | |||
UNROLL_CALL_RAW(remain_w, cb2, 2, ow_block); \ | |||
UNROLL_CALL_RAW(remain_w, cb2, 3, ow_block); \ | |||
} \ | |||
}; \ | |||
template <int src_idx, int weight_idx, typename T, typename T2, \ | |||
typename T3, typename T4> \ | |||
struct ShiftCalHelper<src_idx, weight_idx, 1, ow_block, remain_w, T, T2, \ | |||
T3, T4> { \ | |||
static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) { \ | |||
UNROLL_CALL_RAW(remain_w, cb, 0, ow_block); \ | |||
UNROLL_CALL_RAW(remain_w, cb, 1, ow_block); \ | |||
UNROLL_CALL_RAW(remain_w, cb, 2, ow_block); \ | |||
UNROLL_CALL_RAW(remain_w, cb, 3, ow_block); \ | |||
} \ | |||
}; | |||
SHIFT_CAL_HELPER(8, 1); | |||
SHIFT_CAL_HELPER(8, 2); | |||
SHIFT_CAL_HELPER(8, 3); | |||
SHIFT_CAL_HELPER(8, 4); | |||
SHIFT_CAL_HELPER(8, 5); | |||
SHIFT_CAL_HELPER(8, 6); | |||
SHIFT_CAL_HELPER(8, 7); | |||
SHIFT_CAL_HELPER(8, 8); | |||
SHIFT_CAL_HELPER(4, 1); | |||
SHIFT_CAL_HELPER(4, 2); | |||
SHIFT_CAL_HELPER(4, 3); | |||
SHIFT_CAL_HELPER(4, 4); | |||
#undef SHIFT_CAL_HELPER | |||
#undef cb | |||
} | |||
}; | |||
#undef cb2 | |||
template <int src_idx, int weight_idx, int c_dim, int ow_block, typename T, | |||
typename T2, typename T3> | |||
template <int src_idx, int weight_idx, int c_dim, int ow_block, int remain_w, | |||
typename T, typename T2, typename T3> | |||
MEGDNN_ALWAYS_INLINE void cal_helper(T& c, T2& src, T3& weight) { | |||
ShiftCalHelper<src_idx, weight_idx, c_dim, ow_block, T, T2, T3, int>::impl( | |||
c, src, weight); | |||
ShiftCalHelper<src_idx, weight_idx, c_dim, ow_block, remain_w, T, T2, T3, | |||
int>::impl(c, src, weight); | |||
}; | |||
template <int oc> | |||
struct OCHelper { | |||
@@ -151,7 +144,7 @@ struct KerNeonXXs1Nchw44FP32<bias_mode, Op, remain_w, 2, oc_block, ow_block> { | |||
const int ld_src_iw = iw * oc_step; | |||
constexpr int c_dim = OCHelper<oc_block>::val; | |||
float32x4_t c[c_dim][ow_block]; | |||
init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias); | |||
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias); | |||
for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) { | |||
const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic; | |||
@@ -162,11 +155,11 @@ struct KerNeonXXs1Nchw44FP32<bias_mode, Op, remain_w, 2, oc_block, ow_block> { | |||
0); | |||
load_helper<ic_step, 0, oc_step, c_dim, Vld1q_f32>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<0, 0, c_dim, ow_block>(c, src, weight); | |||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
src[0] = vld1q_f32(src_ptr + (ow_block)*ic_step); | |||
load_helper<ic_step, 1 * ld_weight, oc_step, c_dim, Vld1q_f32>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<1, 0, c_dim, ow_block>(c, src, weight); | |||
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
src_ptr += ld_src_iw; | |||
weight_ptr += ld_weight_fh; | |||
} | |||
@@ -196,7 +189,7 @@ struct KerNeonXXs1Nchw44FP32<bias_mode, Op, remain_w, 3, oc_block, ow_block> { | |||
const int ld_src_iw = iw * oc_step; | |||
constexpr int c_dim = OCHelper<oc_block>::val; | |||
float32x4_t c[c_dim][ow_block]; | |||
init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias); | |||
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias); | |||
for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) { | |||
const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic; | |||
@@ -207,15 +200,15 @@ struct KerNeonXXs1Nchw44FP32<bias_mode, Op, remain_w, 3, oc_block, ow_block> { | |||
0); | |||
load_helper<ic_step, 0, oc_step, c_dim, Vld1q_f32>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<0, 0, c_dim, ow_block>(c, src, weight); | |||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
src[0] = vld1q_f32(src_ptr + (ow_block)*ic_step); | |||
load_helper<ic_step, 1 * ld_weight, oc_step, c_dim, Vld1q_f32>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<1, 0, c_dim, ow_block>(c, src, weight); | |||
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
src[1] = vld1q_f32(src_ptr + (ow_block + 1) * ic_step); | |||
load_helper<ic_step, 2 * ld_weight, oc_step, c_dim, Vld1q_f32>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<2, 0, c_dim, ow_block>(c, src, weight); | |||
cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
src_ptr += ld_src_iw; | |||
weight_ptr += ld_weight_fh; | |||
} | |||
@@ -244,7 +237,7 @@ struct KerNeonXXs1Nchw44FP32<bias_mode, Op, remain_w, 5, oc_block, ow_block> { | |||
const int ld_src_iw = iw * oc_step; | |||
constexpr int c_dim = OCHelper<oc_block>::val; | |||
float32x4_t c[c_dim][ow_block]; | |||
init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias); | |||
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias); | |||
for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) { | |||
const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic; | |||
@@ -255,27 +248,27 @@ struct KerNeonXXs1Nchw44FP32<bias_mode, Op, remain_w, 5, oc_block, ow_block> { | |||
0); | |||
load_helper<ic_step, 0, oc_step, c_dim, Vld1q_f32>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<0, 0, c_dim, ow_block>(c, src, weight); | |||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
src[0] = vld1q_f32(src_ptr + (ow_block)*ic_step); | |||
load_helper<ic_step, 1 * ld_weight, oc_step, c_dim, Vld1q_f32>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<1, 0, c_dim, ow_block>(c, src, weight); | |||
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
src[1] = vld1q_f32(src_ptr + (ow_block + 1) * ic_step); | |||
load_helper<ic_step, 2 * ld_weight, oc_step, c_dim, Vld1q_f32>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<2, 0, c_dim, ow_block>(c, src, weight); | |||
cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
src[2] = vld1q_f32(src_ptr + (ow_block + 2) * ic_step); | |||
load_helper<ic_step, 3 * ld_weight, oc_step, c_dim, Vld1q_f32>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<3, 0, c_dim, ow_block>(c, src, weight); | |||
cal_helper<3, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
src[3] = vld1q_f32(src_ptr + (ow_block + 3) * ic_step); | |||
load_helper<ic_step, 4 * ld_weight, oc_step, c_dim, Vld1q_f32>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<4, 0, c_dim, ow_block>(c, src, weight); | |||
cal_helper<4, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
src_ptr += ld_src_iw; | |||
weight_ptr += ld_weight_fh; | |||
} | |||
@@ -305,7 +298,7 @@ struct KerNeonXXs1Nchw44FP32<bias_mode, Op, remain_w, 7, oc_block, ow_block> { | |||
const int ld_src_iw = iw * oc_step; | |||
constexpr int c_dim = OCHelper<oc_block>::val; | |||
float32x4_t c[c_dim][ow_block]; | |||
init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias); | |||
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias); | |||
for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) { | |||
const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic; | |||
@@ -316,37 +309,37 @@ struct KerNeonXXs1Nchw44FP32<bias_mode, Op, remain_w, 7, oc_block, ow_block> { | |||
0); | |||
load_helper<ic_step, 0, oc_step, c_dim, Vld1q_f32>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<0, 0, c_dim, ow_block>(c, src, weight); | |||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
src[0] = vld1q_f32(src_ptr + (ow_block)*ic_step); | |||
load_helper<ic_step, 1 * ld_weight, oc_step, c_dim, Vld1q_f32>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<1, 0, c_dim, ow_block>(c, src, weight); | |||
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
src[1] = vld1q_f32(src_ptr + (ow_block + 1) * ic_step); | |||
load_helper<ic_step, 2 * ld_weight, oc_step, c_dim, Vld1q_f32>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<2, 0, c_dim, ow_block>(c, src, weight); | |||
cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
src[2] = vld1q_f32(src_ptr + (ow_block + 2) * ic_step); | |||
load_helper<ic_step, 3 * ld_weight, oc_step, c_dim, Vld1q_f32>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<3, 0, c_dim, ow_block>(c, src, weight); | |||
cal_helper<3, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
src[3] = vld1q_f32(src_ptr + (ow_block + 3) * ic_step); | |||
load_helper<ic_step, 4 * ld_weight, oc_step, c_dim, Vld1q_f32>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<4, 0, c_dim, ow_block>(c, src, weight); | |||
cal_helper<4, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
src[4] = vld1q_f32(src_ptr + (ow_block + 4) * ic_step); | |||
load_helper<ic_step, 5 * ld_weight, oc_step, c_dim, Vld1q_f32>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<5, 0, c_dim, ow_block>(c, src, weight); | |||
cal_helper<5, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
src[5] = vld1q_f32(src_ptr + (ow_block + 5) * ic_step); | |||
load_helper<ic_step, 6 * ld_weight, oc_step, c_dim, Vld1q_f32>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<6, 0, c_dim, ow_block>(c, src, weight); | |||
cal_helper<6, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
src_ptr += ld_src_iw; | |||
weight_ptr += ld_weight_fh; | |||
} | |||
@@ -24,83 +24,77 @@ using namespace megdnn; | |||
using namespace arm_common; | |||
namespace { | |||
template <int src_idx, int weight_idx, int c_dim, int ow_block, typename T, | |||
typename T2, typename T3, typename T4> | |||
template <int src_idx, int weight_idx, int c_dim, int ow_block, int remain_w, | |||
typename T, typename T2, typename T3, typename T4> | |||
struct ShiftCalHelper { | |||
static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight); | |||
}; | |||
template <int src_idx, int weight_idx, typename T, typename T2, typename T3, | |||
typename T4> | |||
struct ShiftCalHelper<src_idx, weight_idx, 2, 8, T, T2, T3, T4> { | |||
static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) { | |||
#define cb(step, lane) \ | |||
c[0][step] = vfmaq_laneq_f32(c[0][step], weight[0][lane], \ | |||
src[(step + src_idx) % 8], lane); \ | |||
c[1][step] = vfmaq_laneq_f32(c[1][step], weight[1][lane], \ | |||
src[(step + src_idx) % 8], lane); | |||
UNROLL_CALL_RAW(8, cb, 0); | |||
UNROLL_CALL_RAW(8, cb, 1); | |||
UNROLL_CALL_RAW(8, cb, 2); | |||
UNROLL_CALL_RAW(8, cb, 3); | |||
#undef cb | |||
} | |||
}; | |||
template <int src_idx, int weight_idx, typename T, typename T2, typename T3, | |||
typename T4> | |||
struct ShiftCalHelper<src_idx, weight_idx, 2, 4, T, T2, T3, T4> { | |||
static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) { | |||
#define cb(step, lane) \ | |||
c[0][step] = vfmaq_laneq_f32(c[0][step], weight[0][lane], \ | |||
src[(step + src_idx) % 4], lane); \ | |||
c[1][step] = vfmaq_laneq_f32(c[1][step], weight[1][lane], \ | |||
src[(step + src_idx) % 4], lane); | |||
UNROLL_CALL_RAW(4, cb, 0); | |||
UNROLL_CALL_RAW(4, cb, 1); | |||
UNROLL_CALL_RAW(4, cb, 2); | |||
UNROLL_CALL_RAW(4, cb, 3); | |||
#undef cb | |||
} | |||
template <int src_idx, int weight_idx, int c_dim, int ow_block, typename T, | |||
typename T2, typename T3, typename T4> | |||
struct ShiftCalHelper<src_idx, weight_idx, c_dim, ow_block, 0, T, T2, T3, T4> { | |||
static MEGDNN_ALWAYS_INLINE void impl(T&, T2&, T3&) {} | |||
}; | |||
template <int src_idx, int weight_idx, typename T, typename T2, typename T3, | |||
typename T4> | |||
struct ShiftCalHelper<src_idx, weight_idx, 1, 8, T, T2, T3, T4> { | |||
static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) { | |||
#define cb(step, lane) \ | |||
c[0][step] = vfmaq_laneq_f32(c[0][step], weight[0][lane], \ | |||
src[(step + src_idx) % 8], lane); | |||
UNROLL_CALL_RAW(8, cb, 0); | |||
UNROLL_CALL_RAW(8, cb, 1); | |||
UNROLL_CALL_RAW(8, cb, 2); | |||
UNROLL_CALL_RAW(8, cb, 3); | |||
#undef cb | |||
} | |||
}; | |||
template <int src_idx, int weight_idx, typename T, typename T2, typename T3, | |||
typename T4> | |||
struct ShiftCalHelper<src_idx, weight_idx, 1, 4, T, T2, T3, T4> { | |||
static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) { | |||
#define cb(step, lane) \ | |||
c[0][step] = vfmaq_laneq_f32(c[0][step], weight[0][lane], \ | |||
src[(step + src_idx) % 4], lane); | |||
#define cb2(step, lane, ow_block) \ | |||
c[0][step] = vfmaq_laneq_f32(c[0][step], weight[0][lane], \ | |||
src[(step + src_idx) % ow_block], lane); \ | |||
c[1][step] = vfmaq_laneq_f32(c[1][step], weight[1][lane], \ | |||
src[(step + src_idx) % ow_block], lane); | |||
UNROLL_CALL_RAW(4, cb, 0); | |||
UNROLL_CALL_RAW(4, cb, 1); | |||
UNROLL_CALL_RAW(4, cb, 2); | |||
UNROLL_CALL_RAW(4, cb, 3); | |||
#define cb(step, lane, ow_block) \ | |||
c[0][step] = vfmaq_laneq_f32(c[0][step], weight[0][lane], \ | |||
src[(step + src_idx) % ow_block], lane); | |||
#define SHIFT_CAL_HELPER(ow_block, remain_w) \ | |||
template <int src_idx, int weight_idx, typename T, typename T2, \ | |||
typename T3, typename T4> \ | |||
struct ShiftCalHelper<src_idx, weight_idx, 2, ow_block, remain_w, T, T2, \ | |||
T3, T4> { \ | |||
static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) { \ | |||
UNROLL_CALL_RAW(remain_w, cb2, 0, ow_block); \ | |||
UNROLL_CALL_RAW(remain_w, cb2, 1, ow_block); \ | |||
UNROLL_CALL_RAW(remain_w, cb2, 2, ow_block); \ | |||
UNROLL_CALL_RAW(remain_w, cb2, 3, ow_block); \ | |||
} \ | |||
}; \ | |||
template <int src_idx, int weight_idx, typename T, typename T2, \ | |||
typename T3, typename T4> \ | |||
struct ShiftCalHelper<src_idx, weight_idx, 1, ow_block, remain_w, T, T2, \ | |||
T3, T4> { \ | |||
static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) { \ | |||
UNROLL_CALL_RAW(remain_w, cb, 0, ow_block); \ | |||
UNROLL_CALL_RAW(remain_w, cb, 1, ow_block); \ | |||
UNROLL_CALL_RAW(remain_w, cb, 2, ow_block); \ | |||
UNROLL_CALL_RAW(remain_w, cb, 3, ow_block); \ | |||
} \ | |||
}; | |||
SHIFT_CAL_HELPER(8, 1); | |||
SHIFT_CAL_HELPER(8, 2); | |||
SHIFT_CAL_HELPER(8, 3); | |||
SHIFT_CAL_HELPER(8, 4); | |||
SHIFT_CAL_HELPER(8, 5); | |||
SHIFT_CAL_HELPER(8, 6); | |||
SHIFT_CAL_HELPER(8, 7); | |||
SHIFT_CAL_HELPER(8, 8); | |||
SHIFT_CAL_HELPER(4, 1); | |||
SHIFT_CAL_HELPER(4, 2); | |||
SHIFT_CAL_HELPER(4, 3); | |||
SHIFT_CAL_HELPER(4, 4); | |||
#undef SHIFT_CAL_HELPER | |||
#undef cb | |||
} | |||
}; | |||
#undef cb2 | |||
template <int src_idx, int weight_idx, int c_dim, int ow_block, typename T, | |||
typename T2, typename T3> | |||
template <int src_idx, int weight_idx, int c_dim, int ow_block, int remain_w, | |||
typename T, typename T2, typename T3> | |||
MEGDNN_ALWAYS_INLINE void cal_helper(T& c, T2& src, T3& weight) { | |||
ShiftCalHelper<src_idx, weight_idx, c_dim, ow_block, T, T2, T3, int>::impl( | |||
c, src, weight); | |||
ShiftCalHelper<src_idx, weight_idx, c_dim, ow_block, remain_w, T, T2, T3, | |||
int>::impl(c, src, weight); | |||
}; | |||
template <int oc> | |||
struct OCHelper { | |||
public: | |||
@@ -151,7 +145,7 @@ struct KerNeonXXs2Nchw44FP32<bias_mode, Op, remain_w, 2, oc_block, ow_block> { | |||
const int ld_src_iw = iw * oc_step; | |||
constexpr int c_dim = OCHelper<oc_block>::val; | |||
float32x4_t c[c_dim][ow_block]; | |||
init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias); | |||
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias); | |||
for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { | |||
const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic; | |||
@@ -163,13 +157,13 @@ struct KerNeonXXs2Nchw44FP32<bias_mode, Op, remain_w, 2, oc_block, ow_block> { | |||
load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr, 0); | |||
load_helper<4, 0, oc_step, c_dim, Vld1q_f32>(weight, weight_ptr, | |||
ld_weight_oc); | |||
cal_helper<0, 0, c_dim, ow_block>(c, src, weight); | |||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr_odd, | |||
0); | |||
load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1q_f32>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<0, 0, c_dim, ow_block>(c, src, weight); | |||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
src_ptr += ld_src_iw; | |||
src_ptr_odd += ld_src_iw; | |||
weight_ptr += ld_weight_fh; | |||
@@ -177,13 +171,13 @@ struct KerNeonXXs2Nchw44FP32<bias_mode, Op, remain_w, 2, oc_block, ow_block> { | |||
load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr, 0); | |||
load_helper<4, 0, oc_step, c_dim, Vld1q_f32>(weight, weight_ptr, | |||
ld_weight_oc); | |||
cal_helper<0, 0, c_dim, ow_block>(c, src, weight); | |||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr_odd, | |||
0); | |||
load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1q_f32>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<0, 0, c_dim, ow_block>(c, src, weight); | |||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
src_ptr += ld_src_iw; | |||
src_ptr_odd += ld_src_iw; | |||
weight_ptr += ld_weight_fh; | |||
@@ -213,7 +207,7 @@ struct KerNeonXXs2Nchw44FP32<bias_mode, Op, remain_w, 3, oc_block, ow_block> { | |||
const int ld_src_iw = iw * oc_step; | |||
constexpr int c_dim = OCHelper<oc_block>::val; | |||
float32x4_t c[c_dim][ow_block]; | |||
init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias); | |||
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias); | |||
for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { | |||
const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic; | |||
const float* src_ptr_odd = src_ptr_odd_origin + ic_idx * ld_src_ic; | |||
@@ -224,18 +218,18 @@ struct KerNeonXXs2Nchw44FP32<bias_mode, Op, remain_w, 3, oc_block, ow_block> { | |||
load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr, 0); | |||
load_helper<4, 0, oc_step, c_dim, Vld1q_f32>(weight, weight_ptr, | |||
ld_weight_oc); | |||
cal_helper<0, 0, c_dim, ow_block>(c, src, weight); | |||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
src[0] = vld1q_f32(src_ptr + ow_block * simd_len); | |||
load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1q_f32>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<1, 0, c_dim, ow_block>(c, src, weight); | |||
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr_odd, | |||
0); | |||
load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1q_f32>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<0, 0, c_dim, ow_block>(c, src, weight); | |||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
src_ptr += ld_src_iw; | |||
src_ptr_odd += ld_src_iw; | |||
weight_ptr += ld_weight_fh; | |||
@@ -243,17 +237,17 @@ struct KerNeonXXs2Nchw44FP32<bias_mode, Op, remain_w, 3, oc_block, ow_block> { | |||
load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr, 0); | |||
load_helper<4, 0, oc_step, c_dim, Vld1q_f32>(weight, weight_ptr, | |||
ld_weight_oc); | |||
cal_helper<0, 0, c_dim, ow_block>(c, src, weight); | |||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
src[0] = vld1q_f32(src_ptr + ow_block * simd_len); | |||
load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1q_f32>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<1, 0, c_dim, ow_block>(c, src, weight); | |||
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr_odd, | |||
0); | |||
load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1q_f32>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<0, 0, c_dim, ow_block>(c, src, weight); | |||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
src_ptr += ld_src_iw; | |||
src_ptr_odd += ld_src_iw; | |||
weight_ptr += ld_weight_fh; | |||
@@ -261,18 +255,18 @@ struct KerNeonXXs2Nchw44FP32<bias_mode, Op, remain_w, 3, oc_block, ow_block> { | |||
load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr, 0); | |||
load_helper<4, 0, oc_step, c_dim, Vld1q_f32>(weight, weight_ptr, | |||
ld_weight_oc); | |||
cal_helper<0, 0, c_dim, ow_block>(c, src, weight); | |||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
src[0] = vld1q_f32(src_ptr + ow_block * simd_len); | |||
load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1q_f32>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<1, 0, c_dim, ow_block>(c, src, weight); | |||
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr_odd, | |||
0); | |||
load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1q_f32>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<0, 0, c_dim, ow_block>(c, src, weight); | |||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
src_ptr += ld_src_iw; | |||
src_ptr_odd += ld_src_iw; | |||
weight_ptr += ld_weight_fh; | |||
@@ -302,7 +296,7 @@ struct KerNeonXXs2Nchw44FP32<bias_mode, Op, remain_w, 5, oc_block, ow_block> { | |||
const int ld_src_iw = iw * oc_step; | |||
constexpr int c_dim = OCHelper<oc_block>::val; | |||
float32x4_t c[c_dim][ow_block]; | |||
init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias); | |||
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias); | |||
for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { | |||
const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic; | |||
@@ -316,25 +310,25 @@ struct KerNeonXXs2Nchw44FP32<bias_mode, Op, remain_w, 5, oc_block, ow_block> { | |||
0); | |||
load_helper<4, 0, oc_step, c_dim, Vld1q_f32>(weight, weight_ptr, | |||
ld_weight_oc); | |||
cal_helper<0, 0, c_dim, ow_block>(c, src, weight); | |||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
src[0] = vld1q_f32(src_ptr + ow_block * simd_len); | |||
load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1q_f32>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<1, 0, c_dim, ow_block>(c, src, weight); | |||
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
src[1] = vld1q_f32(src_ptr + (ow_block + 1) * simd_len); | |||
load_helper<4, 4 * ld_weight, oc_step, c_dim, Vld1q_f32>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<2, 0, c_dim, ow_block>(c, src, weight); | |||
cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
// odd element | |||
load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>( | |||
src, src_ptr_odd, 0); | |||
load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1q_f32>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<0, 0, c_dim, ow_block>(c, src, weight); | |||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
src[0] = vld1q_f32(src_ptr_odd + ow_block * simd_len); | |||
load_helper<4, 3 * ld_weight, oc_step, c_dim, Vld1q_f32>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<1, 0, c_dim, ow_block>(c, src, weight); | |||
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
src_ptr += ld_src_iw; | |||
src_ptr_odd += ld_src_iw; | |||
@@ -371,7 +365,7 @@ struct KerNeonXXs2Nchw44FP32<bias_mode, Op, remain_w, 7, oc_block, ow_block> { | |||
const int ld_src_iw = iw * oc_step; | |||
constexpr int c_dim = OCHelper<oc_block>::val; | |||
float32x4_t c[c_dim][ow_block]; | |||
init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias); | |||
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias); | |||
for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { | |||
const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic; | |||
@@ -385,33 +379,33 @@ struct KerNeonXXs2Nchw44FP32<bias_mode, Op, remain_w, 7, oc_block, ow_block> { | |||
0); | |||
load_helper<4, 0, oc_step, c_dim, Vld1q_f32>(weight, weight_ptr, | |||
ld_weight_oc); | |||
cal_helper<0, 0, c_dim, ow_block>(c, src, weight); | |||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
src[0] = vld1q_f32(src_ptr + ow_block * simd_len); | |||
load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1q_f32>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<1, 0, c_dim, ow_block>(c, src, weight); | |||
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
src[1] = vld1q_f32(src_ptr + (ow_block + 1) * simd_len); | |||
load_helper<4, 4 * ld_weight, oc_step, c_dim, Vld1q_f32>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<2, 0, c_dim, ow_block>(c, src, weight); | |||
cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
src[2] = vld1q_f32(src_ptr + (ow_block + 2) * simd_len); | |||
load_helper<4, 6 * ld_weight, oc_step, c_dim, Vld1q_f32>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<3, 0, c_dim, ow_block>(c, src, weight); | |||
cal_helper<3, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
// odd element | |||
load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>( | |||
src, src_ptr_odd, 0); | |||
load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1q_f32>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<0, 0, c_dim, ow_block>(c, src, weight); | |||
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
src[0] = vld1q_f32(src_ptr_odd + ow_block * simd_len); | |||
load_helper<4, 3 * ld_weight, oc_step, c_dim, Vld1q_f32>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<1, 0, c_dim, ow_block>(c, src, weight); | |||
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
src[1] = vld1q_f32(src_ptr_odd + (ow_block + 1) * simd_len); | |||
load_helper<4, 5 * ld_weight, oc_step, c_dim, Vld1q_f32>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<2, 0, c_dim, ow_block>(c, src, weight); | |||
cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight); | |||
src_ptr += ld_src_iw; | |||
src_ptr_odd += ld_src_iw; | |||
@@ -39,16 +39,18 @@ namespace { | |||
*\tparam T2 is type of src regs | |||
*\tparam T3 is type of weight regs | |||
*/ | |||
template <int src_idx, int weight_idx, int c_dim, int stride, typename T, | |||
typename T2, typename T3> | |||
template <int src_idx, int weight_idx, int c_dim, int stride, int remain_w, | |||
typename T, typename T2, typename T3> | |||
struct ShiftCalHelper { | |||
static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight); | |||
}; | |||
template <int src_idx, int weight_idx, int stride, typename T, typename T2, | |||
typename T3> | |||
struct ShiftCalHelper<src_idx, weight_idx, 2, stride, T, T2, T3> { | |||
static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) { | |||
template <int src_idx, int weight_idx, int c_dim, int stride, typename T, | |||
typename T2, typename T3> | |||
struct ShiftCalHelper<src_idx, weight_idx, c_dim, stride, 0, T, T2, T3> { | |||
static MEGDNN_ALWAYS_INLINE void impl(T&, T2&, T3&) {} | |||
}; | |||
#define cb(step) \ | |||
c[0][step] = vfmaq_laneq_f32(c[0][step], weight[0][weight_idx], \ | |||
src[(step * stride + src_idx) / 4], \ | |||
@@ -57,29 +59,47 @@ struct ShiftCalHelper<src_idx, weight_idx, 2, stride, T, T2, T3> { | |||
src[(step * stride + src_idx) / 4], \ | |||
(step * stride + src_idx) % 4); | |||
UNROLL_CALL_RAW(8, cb); | |||
#undef cb | |||
} | |||
}; | |||
template <int src_idx, int weight_idx, int stride, typename T, typename T2, | |||
typename T3> | |||
struct ShiftCalHelper<src_idx, weight_idx, 1, stride, T, T2, T3> { | |||
static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) { | |||
#define cb(step) \ | |||
#define cb2(step) \ | |||
c[0][step] = vfmaq_laneq_f32(c[0][step], weight[0][weight_idx], \ | |||
src[(step * stride + src_idx) / 4], \ | |||
(step * stride + src_idx) % 4); | |||
UNROLL_CALL_RAW(8, cb); | |||
#define SHIFT_CAL_HELPER(ow_remain) \ | |||
template <int src_idx, int weight_idx, int stride, typename T, \ | |||
typename T2, typename T3> \ | |||
struct ShiftCalHelper<src_idx, weight_idx, 2, stride, ow_remain, T, T2, \ | |||
T3> { \ | |||
static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) { \ | |||
UNROLL_CALL_RAW(ow_remain, cb); \ | |||
} \ | |||
}; \ | |||
template <int src_idx, int weight_idx, int stride, typename T, \ | |||
typename T2, typename T3> \ | |||
struct ShiftCalHelper<src_idx, weight_idx, 1, stride, ow_remain, T, T2, \ | |||
T3> { \ | |||
static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) { \ | |||
UNROLL_CALL_RAW(ow_remain, cb2); \ | |||
} \ | |||
}; | |||
SHIFT_CAL_HELPER(1) | |||
SHIFT_CAL_HELPER(2) | |||
SHIFT_CAL_HELPER(3) | |||
SHIFT_CAL_HELPER(4) | |||
SHIFT_CAL_HELPER(5) | |||
SHIFT_CAL_HELPER(6) | |||
SHIFT_CAL_HELPER(7) | |||
SHIFT_CAL_HELPER(8) | |||
#undef SHIFT_CAL_HELPER | |||
#undef cb | |||
} | |||
}; | |||
#undef cb2 | |||
template <int src_idx, int weight_idx, int c_dim, int stride, typename T, | |||
typename T2, typename T3> | |||
template <int src_idx, int weight_idx, int c_dim, int stride, int remain_w, | |||
typename T, typename T2, typename T3> | |||
MEGDNN_ALWAYS_INLINE void cal_helper(T& c, T2& src, T3& weight) { | |||
ShiftCalHelper<src_idx, weight_idx, c_dim, stride, T, T2, T3>::impl(c, src, | |||
weight); | |||
ShiftCalHelper<src_idx, weight_idx, c_dim, stride, remain_w, T, T2, | |||
T3>::impl(c, src, weight); | |||
}; | |||
enum CpuTag { | |||
DEFAULT_CPU_TAG = 0, | |||
@@ -134,7 +154,7 @@ struct KerNeonXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 7, oc_block, stride, | |||
const int ld_src_ic = ih * iw; | |||
constexpr int c_dim = OCHelper<oc_block>::val; | |||
float32x4_t c[c_dim][8]; | |||
init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step); | |||
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step); | |||
for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { | |||
float32x4_t src[src_reg_size]; | |||
@@ -145,13 +165,13 @@ struct KerNeonXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 7, oc_block, stride, | |||
src, src_ptr + step * iw, 0); \ | |||
load_helper<filter_size, 0, oc_step, c_dim, Vld1q_f32>( \ | |||
weight, weight_ptr + step * ld_weight_fw, ld_weight_oc); \ | |||
cal_helper<0, 0, c_dim, stride>(c, src, weight); \ | |||
cal_helper<1, 1, c_dim, stride>(c, src, weight); \ | |||
cal_helper<2, 2, c_dim, stride>(c, src, weight); \ | |||
cal_helper<3, 3, c_dim, stride>(c, src, weight); \ | |||
cal_helper<4, 4, c_dim, stride>(c, src, weight); \ | |||
cal_helper<5, 5, c_dim, stride>(c, src, weight); \ | |||
cal_helper<6, 6, c_dim, stride>(c, src, weight); | |||
cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight); \ | |||
cal_helper<1, 1, c_dim, stride, remain_w>(c, src, weight); \ | |||
cal_helper<2, 2, c_dim, stride, remain_w>(c, src, weight); \ | |||
cal_helper<3, 3, c_dim, stride, remain_w>(c, src, weight); \ | |||
cal_helper<4, 4, c_dim, stride, remain_w>(c, src, weight); \ | |||
cal_helper<5, 5, c_dim, stride, remain_w>(c, src, weight); \ | |||
cal_helper<6, 6, c_dim, stride, remain_w>(c, src, weight); | |||
UNROLL_CALL_RAW(7, KERNEL_CB) | |||
#undef KERNEL_CB | |||
@@ -185,7 +205,7 @@ struct KerNeonXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 5, oc_block, stride, | |||
const int ld_src_ic = ih * iw; | |||
constexpr int c_dim = OCHelper<oc_block>::val; | |||
float32x4_t c[c_dim][8]; | |||
init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step); | |||
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step); | |||
for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { | |||
float32x4_t src[src_reg_size]; | |||
@@ -196,11 +216,11 @@ struct KerNeonXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 5, oc_block, stride, | |||
src, src_ptr + step * iw, 0); \ | |||
load_helper<filter_size, 0, oc_step, c_dim, Vld1q_f32>( \ | |||
weight, weight_ptr + step * ld_weight_fw, ld_weight_oc); \ | |||
cal_helper<0, 0, c_dim, stride>(c, src, weight); \ | |||
cal_helper<1, 1, c_dim, stride>(c, src, weight); \ | |||
cal_helper<2, 2, c_dim, stride>(c, src, weight); \ | |||
cal_helper<3, 3, c_dim, stride>(c, src, weight); \ | |||
cal_helper<4, 4, c_dim, stride>(c, src, weight); | |||
cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight); \ | |||
cal_helper<1, 1, c_dim, stride, remain_w>(c, src, weight); \ | |||
cal_helper<2, 2, c_dim, stride, remain_w>(c, src, weight); \ | |||
cal_helper<3, 3, c_dim, stride, remain_w>(c, src, weight); \ | |||
cal_helper<4, 4, c_dim, stride, remain_w>(c, src, weight); | |||
UNROLL_CALL_RAW(5, KERNEL_CB) | |||
#undef KERNEL_CB | |||
@@ -233,7 +253,7 @@ struct KerNeonXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 3, oc_block, stride, | |||
const int ld_src_ic = ih * iw; | |||
constexpr int c_dim = OCHelper<oc_block>::val; | |||
float32x4_t c[c_dim][8]; | |||
init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step); | |||
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step); | |||
for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { | |||
float32x4_t src[src_reg_size]; | |||
@@ -243,27 +263,27 @@ struct KerNeonXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 3, oc_block, stride, | |||
0); | |||
load_helper<filter_size, 0, oc_step, c_dim, Vld1q_f32>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<0, 0, c_dim, stride>(c, src, weight); | |||
cal_helper<1, 1, c_dim, stride>(c, src, weight); | |||
cal_helper<2, 2, c_dim, stride>(c, src, weight); | |||
cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight); | |||
cal_helper<1, 1, c_dim, stride, remain_w>(c, src, weight); | |||
cal_helper<2, 2, c_dim, stride, remain_w>(c, src, weight); | |||
// row 1 | |||
load_helper<src_reg_size, 0, simd_len, 0, Vld1q_f32>( | |||
src, src_ptr + iw, 0); | |||
load_helper<filter_size, 0, oc_step, c_dim, Vld1q_f32>( | |||
weight, weight_ptr + 1 * ld_weight_fw, ld_weight_oc); | |||
cal_helper<0, 0, c_dim, stride>(c, src, weight); | |||
cal_helper<1, 1, c_dim, stride>(c, src, weight); | |||
cal_helper<2, 2, c_dim, stride>(c, src, weight); | |||
cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight); | |||
cal_helper<1, 1, c_dim, stride, remain_w>(c, src, weight); | |||
cal_helper<2, 2, c_dim, stride, remain_w>(c, src, weight); | |||
// row 2 | |||
load_helper<src_reg_size, 0, simd_len, 0, Vld1q_f32>( | |||
src, src_ptr + 2 * iw, 0); | |||
load_helper<filter_size, 0, oc_step, c_dim, Vld1q_f32>( | |||
weight, weight_ptr + 2 * ld_weight_fw, ld_weight_oc); | |||
cal_helper<0, 0, c_dim, stride>(c, src, weight); | |||
cal_helper<1, 1, c_dim, stride>(c, src, weight); | |||
cal_helper<2, 2, c_dim, stride>(c, src, weight); | |||
cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight); | |||
cal_helper<1, 1, c_dim, stride, remain_w>(c, src, weight); | |||
cal_helper<2, 2, c_dim, stride, remain_w>(c, src, weight); | |||
src_ptr += ld_src_ic; | |||
weight_ptr += ld_weight_ic; | |||
@@ -634,7 +654,7 @@ struct KerNeonXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 2, oc_block, stride, | |||
const int ld_src_ic = ih * iw; | |||
constexpr int c_dim = OCHelper<oc_block>::val; | |||
float32x4_t c[c_dim][8]; | |||
init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step); | |||
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step); | |||
for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { | |||
float32x4_t src[src_reg_size]; | |||
@@ -644,16 +664,16 @@ struct KerNeonXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 2, oc_block, stride, | |||
0); | |||
load_helper<filter_size, 0, oc_step, c_dim, Vld1q_f32>( | |||
weight, weight_ptr, ld_weight_oc); | |||
cal_helper<0, 0, c_dim, stride>(c, src, weight); | |||
cal_helper<1, 1, c_dim, stride>(c, src, weight); | |||
cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight); | |||
cal_helper<1, 1, c_dim, stride, remain_w>(c, src, weight); | |||
// row 1 | |||
load_helper<src_reg_size, 0, simd_len, 0, Vld1q_f32>( | |||
src, src_ptr + iw, 0); | |||
load_helper<filter_size, 0, oc_step, c_dim, Vld1q_f32>( | |||
weight, weight_ptr + 1 * ld_weight_fw, ld_weight_oc); | |||
cal_helper<0, 0, c_dim, stride>(c, src, weight); | |||
cal_helper<1, 1, c_dim, stride>(c, src, weight); | |||
cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight); | |||
cal_helper<1, 1, c_dim, stride, remain_w>(c, src, weight); | |||
src_ptr += ld_src_ic; | |||
weight_ptr += ld_weight_ic; | |||
@@ -6,14 +6,15 @@ | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
* implied. | |||
*/ | |||
#include <algorithm> | |||
#include "src/arm_common/conv_bias/fp32/do_conv_stride1.h" | |||
#include "src/arm_common/simd_macro/neon_helper.h" | |||
#include "src/arm_common/conv_bias/postprocess_helper.h" | |||
#include "src/arm_common/simd_macro/neon_helper.h" | |||
#include "midout.h" | |||
@@ -27,10 +28,9 @@ using namespace conv_stride1; | |||
using NCBKernSizeParam = fallback::ConvBiasImpl::NCBKernSizeParam; | |||
using NCBKernParam = fallback::ConvBiasImpl::NCBKernParam; | |||
void conv_stride1::do_conv_2x2_stride1(const float* src, const float* filter, float* dst, | |||
size_t IH, size_t IW, size_t OH, size_t OW, | |||
size_t IC) { | |||
void conv_stride1::do_conv_2x2_stride1(const float* src, const float* filter, | |||
float* dst, size_t IH, size_t IW, | |||
size_t OH, size_t OW, size_t IC) { | |||
const size_t tail_step = IW - OW; | |||
//! unroll of 2 | |||
size_t ic = 0; | |||
@@ -143,9 +143,9 @@ void conv_stride1::do_conv_2x2_stride1(const float* src, const float* filter, fl | |||
} | |||
} | |||
void conv_stride1::do_conv_3x3_stride1(const float* src, const float* filter, float* dst, | |||
size_t IH, size_t IW, size_t OH, size_t OW, | |||
size_t IC) { | |||
void conv_stride1::do_conv_3x3_stride1(const float* src, const float* filter, | |||
float* dst, size_t IH, size_t IW, | |||
size_t OH, size_t OW, size_t IC) { | |||
const size_t tail_step = IW - OW; | |||
rep(ic, IC) { | |||
@@ -193,7 +193,7 @@ void conv_stride1::do_conv_3x3_stride1(const float* src, const float* filter, fl | |||
MEGDNN_SIMD_TYPE _r22 = MEGDNN_SIMD_EXT(_r20, _r20n, 2); | |||
MEGDNN_SIMD_TYPE _r30 = MEGDNN_SIMD_LOADU(r3); | |||
MEGDNN_SIMD_TYPE _r30n = MEGDNN_SIMD_LOADU(r3 + 4); | |||
MEGDNN_SIMD_TYPE _r30n = MEGDNN_SIMD_LOADU_2(r3 + 4); | |||
MEGDNN_SIMD_TYPE _r31 = MEGDNN_SIMD_EXT(_r30, _r30n, 1); | |||
MEGDNN_SIMD_TYPE _r32 = MEGDNN_SIMD_EXT(_r30, _r30n, 2); | |||
@@ -290,9 +290,9 @@ void conv_stride1::do_conv_3x3_stride1(const float* src, const float* filter, fl | |||
} | |||
} | |||
void conv_stride1::do_conv_5x5_stride1(const float* src, const float* filter, float* dst, | |||
size_t IH, size_t IW, size_t OH, size_t OW, | |||
size_t IC) { | |||
void conv_stride1::do_conv_5x5_stride1(const float* src, const float* filter, | |||
float* dst, size_t IH, size_t IW, | |||
size_t OH, size_t OW, size_t IC) { | |||
const size_t tail_step = IW - OW; | |||
rep(ic, IC) { | |||
@@ -530,9 +530,9 @@ void conv_stride1::do_conv_5x5_stride1(const float* src, const float* filter, fl | |||
} | |||
} | |||
void conv_stride1::do_conv_7x7_stride1(const float* src, const float* filter, float* dst, | |||
size_t IH, size_t IW, size_t OH, size_t OW, | |||
size_t IC) { | |||
void conv_stride1::do_conv_7x7_stride1(const float* src, const float* filter, | |||
float* dst, size_t IH, size_t IW, | |||
size_t OH, size_t OW, size_t IC) { | |||
const size_t tail_step = IW - OW; | |||
rep(ic, IC) { | |||
@@ -688,7 +688,7 @@ void conv_stride1::do_conv_7x7_stride1(const float* src, const float* filter, fl | |||
_sum = MEGDNN_SIMD_FMA_LANE(_sum, _r56, _k39404142, 2); | |||
MEGDNN_SIMD_TYPE _k42434445 = MEGDNN_SIMD_LOADU(k6); | |||
MEGDNN_SIMD_TYPE _k46474849 = MEGDNN_SIMD_LOADU(k6 + 4); | |||
MEGDNN_SIMD_TYPE _k46474849 = MEGDNN_SIMD_LOADU_3(k6 + 4); | |||
MEGDNN_SIMD_TYPE _r60 = MEGDNN_SIMD_LOADU(r6); | |||
MEGDNN_SIMD_TYPE _r64 = MEGDNN_SIMD_LOADU(r6 + 4); | |||
@@ -126,7 +126,8 @@ static void do_conv_kern(const WorkspaceBundle& bundle, | |||
? oh_idx * oh_block * ow * pack_c | |||
: oc_idx; | |||
const float* bptr = | |||
kern_param.bias<dt_float32>(batch_id, group_id) + bias_offset; | |||
kern_param.bias<dt_float32>(batch_id, group_id, oc_idx, 1, pack_c) + | |||
bias_offset; | |||
Op op; | |||
conv_bias::conv_direct_fp32_nchw44<bias_mode, Op, filter, stride>( | |||
@@ -69,7 +69,7 @@ struct KerNeonDotXXs2Nchw44Int8<bias_mode, Op, remain_w, 2, oc_block, ow_block, | |||
constexpr int c_dim = OCHelper<oc_block>::val; | |||
int32x4_t c[c_dim][8]; | |||
init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias); | |||
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias); | |||
for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) { | |||
int8x16_t src[src_reg]; | |||
int8x16_t weight[c_dim][weight_reg]; | |||
@@ -117,7 +117,7 @@ struct KerNeonDotXXs2Nchw44Int8<bias_mode, Op, remain_w, 3, oc_block, ow_block, | |||
constexpr int c_dim = OCHelper<oc_block>::val; | |||
int32x4_t c[c_dim][8]; | |||
init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias); | |||
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias); | |||
for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) { | |||
int8x16_t src[src_reg]; | |||
int8x16_t weight[c_dim][weight_reg]; | |||
@@ -171,7 +171,7 @@ struct KerNeonDotXXs2Nchw44Int8<bias_mode, Op, remain_w, 5, oc_block, ow_block, | |||
constexpr int c_dim = OCHelper<oc_block>::val; | |||
int32x4_t c[c_dim][8]; | |||
init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias); | |||
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias); | |||
for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) { | |||
int8x16_t src[src_reg]; | |||
@@ -220,7 +220,7 @@ struct KerNeonDotXXs2Nchw44Int8<bias_mode, Op, remain_w, 7, oc_block, ow_block, | |||
constexpr int c_dim = OCHelper<oc_block>::val; | |||
int32x4_t c[c_dim][8]; | |||
init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias); | |||
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias); | |||
for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) { | |||
int8x16_t src[src_reg]; | |||
@@ -80,7 +80,7 @@ struct KerNeonDotXXs2Nchw44Int8<bias_mode, Op, remain_w, 2, oc_block, ow_block, | |||
constexpr int c_dim = OCHelper<oc_block>::val; | |||
int32x4_t c[c_dim][8]; | |||
init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias); | |||
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias); | |||
for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) { | |||
int8x16_t src[2][src_reg]; | |||
int8x16_t weight[c_dim][weight_reg]; | |||
@@ -131,7 +131,7 @@ struct KerNeonDotXXs2Nchw44Int8<bias_mode, Op, remain_w, 3, oc_block, ow_block, | |||
constexpr int c_dim = OCHelper<oc_block>::val; | |||
int32x4_t c[c_dim][8]; | |||
init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias); | |||
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias); | |||
for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) { | |||
int8x16_t src[2][src_reg]; | |||
int8x16_t weight[c_dim][weight_reg]; | |||
@@ -189,7 +189,7 @@ struct KerNeonDotXXs2Nchw44Int8<bias_mode, Op, remain_w, 5, oc_block, ow_block, | |||
constexpr int c_dim = OCHelper<oc_block>::val; | |||
int32x4_t c[c_dim][8]; | |||
init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias); | |||
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias); | |||
for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) { | |||
int8x16_t src[2][src_reg]; | |||
@@ -244,7 +244,7 @@ struct KerNeonDotXXs2Nchw44Int8<bias_mode, Op, remain_w, 7, oc_block, ow_block, | |||
constexpr int c_dim = OCHelper<oc_block>::val; | |||
int32x4_t c[c_dim][8]; | |||
init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias); | |||
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias); | |||
for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) { | |||
int8x16_t src[2][src_reg]; | |||
@@ -45,7 +45,7 @@ static void ker_neon_dirctconv_2x2s1_oc8_ow8(const int8_t* src_ptr, | |||
int8x16_t src[8 + 1]; | |||
int16x8_t temp_c[4]; | |||
init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step); | |||
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step); | |||
for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { | |||
for (int fh_idx = 0; fh_idx < fh; ++fh_idx) { | |||
@@ -135,7 +135,7 @@ static void ker_neon_dirctconv_2x2s1_oc4_ow8(const int8_t* src_ptr, | |||
int8x16_t weight[1][2]; | |||
int8x16_t src[8 + 1]; | |||
int16x8_t temp_c[2]; | |||
init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step); | |||
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step); | |||
for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { | |||
for (int fh_idx = 0; fh_idx < fh; ++fh_idx) { | |||
@@ -224,7 +224,7 @@ struct KerNeonDirectStride1Int8<bias_mode, Op, remain_w, 3, c_dim, DstType> { | |||
int8x16_t weight[3]; | |||
int8x16_t src[8 + 2]; | |||
int16x8_t temp_c[2]; | |||
init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step); | |||
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step); | |||
for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { | |||
for (int fh_idx = 0; fh_idx < fh; ++fh_idx) { | |||
@@ -306,7 +306,7 @@ struct KerNeonDirectStride1Int8<bias_mode, Op, remain_w, 5, c_dim, DstType> { | |||
int8x16_t weight[5]; | |||
int8x16_t src[8 + 2]; | |||
int16x8_t temp_c[2]; | |||
init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step); | |||
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step); | |||
for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { | |||
for (int fh_idx = 0; fh_idx < fh; ++fh_idx) { | |||
@@ -409,7 +409,7 @@ struct KerNeonDirectStride1Int8<bias_mode, Op, remain_w, 7, c_dim, DstType> { | |||
int8x16_t weight[7]; | |||
int8x16_t src[8 + 2]; | |||
int16x8_t temp_c[2]; | |||
init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step); | |||
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step); | |||
for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { | |||
for (int fh_idx = 0; fh_idx < fh; ++fh_idx) { | |||
@@ -569,7 +569,7 @@ void conv_direct_stride1_2x2_int8_nchw44(const int8_t* src, | |||
(oh_idx * iw + ow_idx) * ic_step * pack_iw_len; | |||
const size_t dst_offset = | |||
oc_idx * img_stride + (oh_idx * ow + ow_idx) * oc_step; | |||
ker_neon_dirctconv_2x2s1_oc8_ow8<bias_mode, Op, 0, filter_size, | |||
ker_neon_dirctconv_2x2s1_oc8_ow8<bias_mode, Op, ow_step, filter_size, | |||
2, DstType>( | |||
src + src_offset, filter + weight_offset, bias + oc_idx, | |||
dst + dst_offset, ic, ih, iw, ld_oc, op); | |||
@@ -594,7 +594,7 @@ void conv_direct_stride1_2x2_int8_nchw44(const int8_t* src, | |||
(oh_idx * iw + ow_idx) * ic_step * pack_iw_len; | |||
const size_t dst_offset = | |||
oc_idx * img_stride + (oh_idx * ow + ow_idx) * oc_step; | |||
ker_neon_dirctconv_2x2s1_oc4_ow8<bias_mode, Op, 0, filter_size, | |||
ker_neon_dirctconv_2x2s1_oc4_ow8<bias_mode, Op, ow_step, filter_size, | |||
1, DstType>( | |||
src + src_offset, filter + weight_offset, bias + oc_idx, | |||
dst + dst_offset, ic, ih, iw, ld_oc, op); | |||
@@ -54,7 +54,7 @@ static void ker_neon_dirctconv_2x2s2_oc8_ow8(const int8_t* src_ptr, | |||
int8x16_t src[8 + 1]; | |||
int16x8_t temp_c[4]; | |||
init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step); | |||
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step); | |||
for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { | |||
for (int fh_idx = 0; fh_idx < fh; ++fh_idx) { | |||
@@ -151,7 +151,7 @@ static void ker_neon_dirctconv_2x2s2_oc4_ow8(const int8_t* src_ptr, | |||
int8x16_t weight[2]; | |||
int8x16_t src[8 + 1]; | |||
int16x8_t temp_c[2]; | |||
init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step); | |||
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step); | |||
for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { | |||
for (int fh_idx = 0; fh_idx < fh; ++fh_idx) { | |||
@@ -239,7 +239,7 @@ struct KerNeonDirectStride2Int8<bias_mode, Op, remain_w, 3, c_dim, DstType> { | |||
int8x16_t weight[3]; | |||
int8x16_t src[8 + 2]; | |||
int16x8_t temp_c[4]; | |||
init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step); | |||
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step); | |||
for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { | |||
for (int fh_idx = 0; fh_idx < fh; ++fh_idx) { | |||
@@ -327,7 +327,7 @@ struct KerNeonDirectStride2Int8<bias_mode, Op, remain_w, 5, c_dim, DstType> { | |||
int8x16_t weight[5]; | |||
int8x16_t src[8 + 2]; | |||
int16x8_t temp_c[4]; | |||
init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step); | |||
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step); | |||
for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { | |||
for (int fh_idx = 0; fh_idx < fh; ++fh_idx) { | |||
const int8_t* src_ic_0_3 = src_ptr + ic_idx * ic_stride + | |||
@@ -435,7 +435,7 @@ struct KerNeonDirectStride2Int8<bias_mode, Op, remain_w, 7, c_dim, DstType> { | |||
int8x16_t weight[7]; | |||
int8x16_t src[8 + 2]; | |||
int16x8_t temp_c[4]; | |||
init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step); | |||
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step); | |||
for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { | |||
for (int fh_idx = 0; fh_idx < fh; ++fh_idx) { | |||
const int8_t* src_ic_0_3 = src_ptr + ic_idx * ic_stride + | |||
@@ -131,7 +131,7 @@ struct KerNeonXXs2NchwNchw44<bias_mode, Op, remain_w, 2, oc_block, 1> { | |||
const int ld_weight_oc = oc_step * filter_height * filter_width * ic; | |||
constexpr int c_dim = OCHelper<oc_block>::val; | |||
int32x4_t c[c_dim][8]; | |||
init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step); | |||
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step); | |||
for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { | |||
const int8_t* nchw_src_ptr = src_ptr + ic_idx * ic_stride; | |||
@@ -178,7 +178,7 @@ struct KerNeonXXs2NchwNchw44<bias_mode, Op, remain_w, 3, oc_block, 1> { | |||
const int ld_weight_oc = oc_step * filter_height * filter_width * ic; | |||
constexpr int c_dim = OCHelper<oc_block>::val; | |||
int32x4_t c[c_dim][8]; | |||
init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step); | |||
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step); | |||
for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { | |||
const int8_t* nchw_src_ptr = src_ptr + ic_idx * ic_stride; | |||
@@ -232,7 +232,7 @@ struct KerNeonXXs2NchwNchw44<bias_mode, Op, remain_w, 5, oc_block, 1> { | |||
const int ld_weight_oc = oc_step * filter_height * filter_width * ic; | |||
constexpr int c_dim = OCHelper<oc_block>::val; | |||
int32x4_t c[c_dim][8]; | |||
init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step); | |||
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step); | |||
for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { | |||
const int8_t* nchw_src_ptr = src_ptr + ic_idx * ic_stride; | |||
@@ -279,7 +279,7 @@ struct KerNeonXXs2NchwNchw44<bias_mode, Op, remain_w, 7, oc_block, 1> { | |||
const int ld_weight_oc = oc_step * filter_height * filter_width * ic; | |||
constexpr int c_dim = OCHelper<oc_block>::val; | |||
int32x4_t c[c_dim][8]; | |||
init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step); | |||
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step); | |||
for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { | |||
const int8_t* nchw_src_ptr = src_ptr + ic_idx * ic_stride; | |||
@@ -643,128 +643,95 @@ __ai int32x4_t neon_vld1q(const int* ptr) { | |||
__ai int16x8_t neon_vld1q(const int16_t* ptr) { | |||
return vld1q_s16(ptr); | |||
} | |||
template <int c_dim, BiasMode bias_mode, int ow_block, typename T, typename T2> | |||
template <typename T> | |||
struct NeonLdqSimd; | |||
template <> | |||
struct NeonLdqSimd<float> { | |||
static constexpr int simd_len = 4; | |||
}; | |||
template <> | |||
struct NeonLdqSimd<int> { | |||
static constexpr int simd_len = 4; | |||
}; | |||
template <> | |||
struct NeonLdqSimd<int16_t> { | |||
static constexpr int simd_len = 8; | |||
}; | |||
template <int c_dim, BiasMode bias_mode, int ow_remain, typename T, typename T2> | |||
struct InitOcxOw8 { | |||
static __ai void impl(T& c, const T2* bias_ptr, int oc_step); | |||
}; | |||
template <typename T, typename T2> | |||
struct InitOcxOw8<2, BiasMode::NO_BIAS, 8, T, T2> { | |||
static __ai void impl(T& c, const T2*, int) { | |||
#define BAIS_INIT(step) \ | |||
c[0][step] = neon_vdupq_n(static_cast<T2>(0)); \ | |||
c[1][step] = neon_vdupq_n(static_cast<T2>(0)); | |||
UNROLL_CALL_RAW(8, BAIS_INIT); | |||
#undef BAIS_INIT | |||
} | |||
template <int c_dim, BiasMode bias_mode, typename T, typename T2> | |||
struct InitOcxOw8<c_dim, bias_mode, 0, T, T2> { | |||
static __ai void impl(T&, const T2*, int) {} | |||
}; | |||
template <typename T, typename T2> | |||
struct InitOcxOw8<2, BiasMode::NO_BIAS, 4, T, T2> { | |||
static __ai void impl(T& c, const T2*, int) { | |||
#define BAIS_INIT(step) \ | |||
#define BAIS_INIT_NO_BIAS_C2(step) \ | |||
c[0][step] = neon_vdupq_n(static_cast<T2>(0)); \ | |||
c[1][step] = neon_vdupq_n(static_cast<T2>(0)); | |||
UNROLL_CALL_RAW(4, BAIS_INIT); | |||
#undef BAIS_INIT | |||
} | |||
}; | |||
template <typename T, typename T2> | |||
struct InitOcxOw8<2, BiasMode::BROADCAST_CHANNEL_BIAS, 8, T, T2> { | |||
static __ai void impl(T& c, const T2* bias_ptr, int oc_step) { | |||
#define BAIS_INIT(step) \ | |||
c[0][step] = neon_vld1q(bias_ptr); \ | |||
c[1][step] = neon_vld1q(bias_ptr + oc_step); | |||
UNROLL_CALL_RAW(8, BAIS_INIT); | |||
#undef BAIS_INIT | |||
} | |||
}; | |||
template <typename T, typename T2> | |||
struct InitOcxOw8<2, BiasMode::BROADCAST_CHANNEL_BIAS, 4, T, T2> { | |||
static __ai void impl(T& c, const T2* bias_ptr, int oc_step) { | |||
#define BAIS_INIT(step) \ | |||
#define BAIS_INIT_NO_BIAS_C1(step) \ | |||
c[0][step] = neon_vdupq_n(static_cast<T2>(0)); | |||
#define BAIS_INIT_BROADCAST_C2(step) \ | |||
c[0][step] = neon_vld1q(bias_ptr); \ | |||
c[1][step] = neon_vld1q(bias_ptr + oc_step); | |||
UNROLL_CALL_RAW(4, BAIS_INIT); | |||
#undef BAIS_INIT | |||
} | |||
}; | |||
template <typename T, typename T2> | |||
struct InitOcxOw8<2, BiasMode::BIAS, 8, T, T2> { | |||
static __ai void impl(T& c, const T2* bias_ptr, int oc_step) { | |||
constexpr int simd_len = 4; | |||
#define BAIS_INIT(step) \ | |||
c[0][step] = neon_vld1q(bias_ptr + step * simd_len); \ | |||
c[1][step] = neon_vld1q(bias_ptr + oc_step + step * simd_len); | |||
UNROLL_CALL_RAW(8, BAIS_INIT); | |||
#undef BAIS_INIT | |||
} | |||
}; | |||
template <typename T, typename T2> | |||
struct InitOcxOw8<2, BiasMode::BIAS, 4, T, T2> { | |||
static __ai void impl(T& c, const T2* bias_ptr, int oc_step) { | |||
constexpr int simd_len = 4; | |||
#define BAIS_INIT(step) \ | |||
#define BAIS_INIT_BROADCAST_C1(step) c[0][step] = neon_vld1q(bias_ptr); | |||
#define BAIS_INIT_BIAS_C2(step) \ | |||
c[0][step] = neon_vld1q(bias_ptr + step * simd_len); \ | |||
c[1][step] = neon_vld1q(bias_ptr + oc_step + step * simd_len); | |||
UNROLL_CALL_RAW(4, BAIS_INIT); | |||
#undef BAIS_INIT | |||
} | |||
}; | |||
template <typename T, typename T2> | |||
struct InitOcxOw8<1, BiasMode::NO_BIAS, 8, T, T2> { | |||
static __ai void impl(T& c, const T2*, int) { | |||
#define BAIS_INIT(step) c[0][step] = neon_vdupq_n(static_cast<T2>(0)); | |||
UNROLL_CALL_RAW(8, BAIS_INIT); | |||
#undef BAIS_INIT | |||
} | |||
}; | |||
template <typename T, typename T2> | |||
struct InitOcxOw8<1, BiasMode::NO_BIAS, 4, T, T2> { | |||
static __ai void impl(T& c, const T2*, int) { | |||
#define BAIS_INIT(step) c[0][step] = neon_vdupq_n(static_cast<T2>(0)); | |||
UNROLL_CALL_RAW(4, BAIS_INIT); | |||
#undef BAIS_INIT | |||
} | |||
}; | |||
template <typename T, typename T2> | |||
struct InitOcxOw8<1, BiasMode::BROADCAST_CHANNEL_BIAS, 8, T, T2> { | |||
static __ai void impl(T& c, const T2* bias_ptr, int) { | |||
#define BAIS_INIT(step) c[0][step] = neon_vld1q(bias_ptr); | |||
UNROLL_CALL_RAW(8, BAIS_INIT); | |||
#undef BAIS_INIT | |||
} | |||
}; | |||
template <typename T, typename T2> | |||
struct InitOcxOw8<1, BiasMode::BROADCAST_CHANNEL_BIAS, 4, T, T2> { | |||
static __ai void impl(T& c, const T2* bias_ptr, int) { | |||
#define BAIS_INIT(step) c[0][step] = neon_vld1q(bias_ptr); | |||
UNROLL_CALL_RAW(4, BAIS_INIT); | |||
#undef BAIS_INIT | |||
} | |||
}; | |||
template <typename T, typename T2> | |||
struct InitOcxOw8<1, BiasMode::BIAS, 8, T, T2> { | |||
static __ai void impl(T& c, const T2* bias_ptr, int) { | |||
constexpr int simd_len = 4; | |||
#define BAIS_INIT(step) c[0][step] = neon_vld1q(bias_ptr + step * simd_len); | |||
UNROLL_CALL_RAW(8, BAIS_INIT); | |||
#undef BAIS_INIT | |||
} | |||
}; | |||
template <typename T, typename T2> | |||
struct InitOcxOw8<1, BiasMode::BIAS, 4, T, T2> { | |||
static __ai void impl(T& c, const T2* bias_ptr, int) { | |||
constexpr int simd_len = 4; | |||
#define BAIS_INIT(step) c[0][step] = neon_vld1q(bias_ptr + step * simd_len); | |||
UNROLL_CALL_RAW(4, BAIS_INIT); | |||
#undef BAIS_INIT | |||
} | |||
}; | |||
template <int c_dim, BiasMode bias_mode, int ow_block, typename T, typename T2> | |||
#define BAIS_INIT_BIAS_C1(step) \ | |||
c[0][step] = neon_vld1q(bias_ptr + step * simd_len); | |||
#define INSTANCE_InitOcxOw8(ow_remain, cdim) \ | |||
template <typename T, typename T2> \ | |||
struct InitOcxOw8<cdim, BiasMode::NO_BIAS, ow_remain, T, T2> { \ | |||
static __ai void impl(T& c, const T2*, int) { \ | |||
UNROLL_CALL_RAW(ow_remain, BAIS_INIT_NO_BIAS_C##cdim); \ | |||
} \ | |||
}; \ | |||
template <typename T, typename T2> \ | |||
struct InitOcxOw8<cdim, BiasMode::BROADCAST_CHANNEL_BIAS, ow_remain, T, \ | |||
T2> { \ | |||
static __ai void impl(T& c, const T2* bias_ptr, int oc_step) { \ | |||
(void)oc_step; \ | |||
UNROLL_CALL_RAW(ow_remain, BAIS_INIT_BROADCAST_C##cdim); \ | |||
} \ | |||
}; \ | |||
template <typename T, typename T2> \ | |||
struct InitOcxOw8<cdim, BiasMode::BIAS, ow_remain, T, T2> { \ | |||
static __ai void impl(T& c, const T2* bias_ptr, int oc_step) { \ | |||
constexpr int simd_len = NeonLdqSimd<T2>::simd_len; \ | |||
(void)oc_step; \ | |||
UNROLL_CALL_RAW(ow_remain, BAIS_INIT_BIAS_C##cdim); \ | |||
} \ | |||
}; | |||
#define INSTANCE_InitOcxOw8_C(ow_remain) \ | |||
INSTANCE_InitOcxOw8(ow_remain, 2); \ | |||
INSTANCE_InitOcxOw8(ow_remain, 1); | |||
INSTANCE_InitOcxOw8_C(1); | |||
INSTANCE_InitOcxOw8_C(2); | |||
INSTANCE_InitOcxOw8_C(3); | |||
INSTANCE_InitOcxOw8_C(4); | |||
INSTANCE_InitOcxOw8_C(5); | |||
INSTANCE_InitOcxOw8_C(6); | |||
INSTANCE_InitOcxOw8_C(7); | |||
INSTANCE_InitOcxOw8_C(8); | |||
#undef INSTANCE_InitOcxOw8 | |||
#undef INSTANCE_InitOcxOw8_C | |||
#undef BAIS_INIT_BIAS_C1 | |||
#undef BAIS_INIT_BIAS_C2 | |||
#undef BAIS_INIT_BROADCAST_C1 | |||
#undef BAIS_INIT_BROADCAST_C2 | |||
#undef BAIS_INIT_NO_BIAS_C1 | |||
#undef BAIS_INIT_NO_BIAS_C2 | |||
template <int c_dim, BiasMode bias_mode, int ow_remain, typename T, typename T2> | |||
__ai void init_ocx_ow8(T& c, const T2* bias_ptr, int oc_step) { | |||
InitOcxOw8<c_dim, bias_mode, ow_block, T, T2>::impl(c, bias_ptr, oc_step); | |||
InitOcxOw8<c_dim, bias_mode, ow_remain, T, T2>::impl(c, bias_ptr, oc_step); | |||
} | |||
/////////////////////init_ocx_ow4///////////////////// | |||
template <int c_dim, BiasMode bias_mode, typename T> | |||
@@ -18,6 +18,8 @@ | |||
#define MEGDNN_SIMD_TYPE float32x4_t | |||
#define MEGDNN_SIMD_TYPE2 float32x4x2_t | |||
#define MEGDNN_SIMD_LOADU(addr) vld1q_f32(addr) | |||
#define MEGDNN_SIMD_LOADU_2(addr) vcombine_f32(vld1_f32(addr), vdup_n_f32(0.f)) | |||
#define MEGDNN_SIMD_LOADU_3(addr) vld1q_lane_f32(addr + 2, vcombine_f32(vld1_f32(addr), vdup_n_f32(0.f)), 2) | |||
#define MEGDNN_SIMD_STOREU(addr, reg) vst1q_f32(addr, reg) | |||
#define MEGDNN_SIMD_SETZERO() vdupq_n_f32(0.0f) | |||
#define MEGDNN_SIMD_SET1(num) vdupq_n_f32(num) | |||
@@ -23,6 +23,20 @@ | |||
#include <regex> | |||
#include <unordered_map> | |||
// clang-format off | |||
#if defined(__has_feature) | |||
#if __has_feature(address_sanitizer) | |||
#define MEGDNN_TEST_ASAN 1 | |||
#else | |||
#define MEGDNN_TEST_ASAN 0 | |||
#endif | |||
#elif defined(__SANITIZE_ADDRESS__) | |||
#define MEGDNN_TEST_ASAN 1 | |||
#else | |||
#define MEGDNN_TEST_ASAN 0 | |||
#endif | |||
// clang-format on | |||
namespace megdnn { | |||
namespace test { | |||
@@ -76,6 +76,9 @@ TEST_F(FALLBACK, WARP_PERSPECTIVE) { | |||
checker.set_param(param); | |||
checker.exec({{1000, 2, 10, 11}, {1000, 3, 3}, {1000, 2, 12, 13}}); | |||
} | |||
#if MEGDNN_TEST_ASAN | |||
//! asan detect nan will make test failed | |||
#else | |||
// resize nan case | |||
UniformFloatRNG rng_zero(0, 0); | |||
checker.set_rng(1, &rng_zero); | |||
@@ -85,6 +88,7 @@ TEST_F(FALLBACK, WARP_PERSPECTIVE) { | |||
checker.set_param(param); | |||
checker.exec({{1000, 2, 10, 11}, {1000, 3, 3}, {1000, 2, 12, 13}}); | |||
} | |||
#endif | |||
} | |||
TEST_F(FALLBACK, WARP_PERSPECTIVE_MAT_IDX) { | |||
@@ -352,6 +352,9 @@ TEST_F(NAIVE_MULTI_THREADS, WARP_PERSPECTIVE_FORWARD_HWCD4) { | |||
checker.execs({{22, 10, 1, 11, 4}, {22, 3, 3}, {22, 11, 1, 12, 4}}); | |||
} | |||
} | |||
#if MEGDNN_TEST_ASAN | |||
//! asan detect nan will make test failed | |||
#else | |||
// nan case | |||
NanMatRNG rng_nan; | |||
UniformFloatRNG rng_zero(0, 0); | |||
@@ -369,6 +372,7 @@ TEST_F(NAIVE_MULTI_THREADS, WARP_PERSPECTIVE_FORWARD_HWCD4) { | |||
checker.set_param(param); | |||
checker.exec({{10, 10, 1, 11, 4}, {10, 3, 3}, {10, 12, 1, 13, 4}}); | |||
} | |||
#endif | |||
} | |||
#if MEGDNN_WITH_BENCHMARK | |||