Browse Source

fix(dnn/arm_common): fix some loads that read beyond the end of the buffer

GitOrigin-RevId: acd6363945
release-1.10
Megvii Engine Team 3 years ago
parent
commit
711b5bf502
3 changed files with 11 additions and 5 deletions
  1. +4
    -2
      dnn/src/arm_common/conv_bias/int8/direct.cpp
  2. +4
    -2
      dnn/src/arm_common/conv_bias/int8/direct_dotprod.cpp
  3. +3
    -1
      dnn/src/arm_common/convolution/int8x8x32/conv_backdata_stride1.cpp

+ 4
- 2
dnn/src/arm_common/conv_bias/int8/direct.cpp View File

@@ -389,7 +389,8 @@ void conv_bias::conv_direct_stride1_3x3_int8_nchw(
ACC_S16_S32(sum10, sum11, d1);

int8x8_t _r30 = vld1_s8(sptr + 3 * IW);
- int8x8_t _r3n = vld1_s8(sptr + 3 * IW + 8);
+ int8x8_t _r3n =
+ vreinterpret_s8_s16(vld1_dup_s16((int16_t*)(sptr + 3 * IW + 8)));
int8x8_t _r31 = vext_s8(_r30, _r3n, 1);
int8x8_t _r32 = vext_s8(_r30, _r3n, 2);
d1 = vmull_s8(_r30, k20);
@@ -444,7 +445,8 @@ void conv_bias::conv_direct_stride1_3x3_int8_nchw(
ACC_S16_S32(sum00, sum01, d0);

int8x8_t _r20 = vld1_s8(sptr + 2 * IW);
- int8x8_t _r2n = vld1_s8(sptr + 2 * IW + 8);
+ int8x8_t _r2n =
+ vreinterpret_s8_s16(vld1_dup_s16((int16_t*)(sptr + 2 * IW + 8)));
int8x8_t _r21 = vext_s8(_r20, _r2n, 1);
int8x8_t _r22 = vext_s8(_r20, _r2n, 2);
d0 = vmull_s8(_r20, k20);


+ 4
- 2
dnn/src/arm_common/conv_bias/int8/direct_dotprod.cpp View File

@@ -437,8 +437,10 @@ void conv_bias::conv_direct_stride1_3x3_int8_dot(

_tmp = vtranslq_s8(vld1_s8(r2));
CALC_2(678, 345, 0);

- _tmp = vtranslq_s8(vld1_s8(r3));
+ int8x8_t tmp_last = vreinterpret_s8_s32(vld1_dup_s32(r3));
+ tmp_last = vreinterpret_s8_s16(
+ vld1_lane_s16(r3 + 4, vreinterpret_s16_s8(tmp_last), 2));
+ _tmp = vtranslq_s8(tmp_last);
CALC_1(678, 0);

POSTPROCESS_2X4(_sum00, _sum10, outptr, outptr2, dstptr, dstptr2);


+ 3
- 1
dnn/src/arm_common/convolution/int8x8x32/conv_backdata_stride1.cpp View File

@@ -297,7 +297,9 @@ void deconv_direct_2x2(
_tmp = vtranslq_s8(vld1_s8(r0));
CALC_0(1, 0);

- _tmp = vtranslq_s8(vld1_s8(r1));
+ int8x8_t temp_x = vld1_dup_s32(r1);
+ temp_x = vld1_lane_s8(r1 + 4, temp_x, 4);
+ _tmp = vtranslq_s8(temp_x);
CALC_0(23, 0);

vst1q_s32(outptr, _sum00);


Loading…
Cancel
Save