diff --git a/dnn/src/arm_common/conv_bias/int8/strategy_2x3_8x8.cpp b/dnn/src/arm_common/conv_bias/int8/strategy_2x3_8x8.cpp index 6df32194..96f14a2e 100644 --- a/dnn/src/arm_common/conv_bias/int8/strategy_2x3_8x8.cpp +++ b/dnn/src/arm_common/conv_bias/int8/strategy_2x3_8x8.cpp @@ -339,6 +339,9 @@ void winograd_2x3_8x8_s8::input( size_t nr_units_in_tile) { megdnn_assert(IC % 8 == 0); constexpr int alpha = 3 + 2 - 1; + constexpr int SIMD_WIDTH = 4; + //! the input is loaded as int8; this is used to keep the board (boundary) load valid + constexpr int board_security_width = std::max(2 * SIMD_WIDTH, alpha); // OW = IW + 2 * PW - KERNEL_SIZE + 1 auto units_w = div_ceil(IW + 2 * PW - KERNEL_SIZE + 1, OUTPUT_BLOCK_SIZE); @@ -353,7 +356,8 @@ void winograd_2x3_8x8_s8::input( int ih_start = nh * OUTPUT_BLOCK_SIZE - PH; int iw_start = nw * OUTPUT_BLOCK_SIZE - PW; if (ih_start >= 0 && ih_start + alpha <= static_cast(IH) && - iw_start >= 0 && iw_start + alpha <= static_cast(IW)) { + iw_start >= 0 && + iw_start + board_security_width <= static_cast(IW)) { InputTransform2X3_qs8::prepare( input, patch, patchT, ih_start, iw_start, IH, IW, ic, IC); InputTransform2X3_qs8::transform( diff --git a/dnn/test/arm_common/cpuinfo.cpp b/dnn/test/arm_common/cpuinfo.cpp index f60a57c5..7a88f92f 100644 --- a/dnn/test/arm_common/cpuinfo.cpp +++ b/dnn/test/arm_common/cpuinfo.cpp @@ -202,9 +202,9 @@ TEST(ARM_RUNTIME, CPUINFO_SDM8GEN1) { ASSERT_TRUE(cpuinfo_has_arm_neon_dot()); ASSERT_FALSE(cpuinfo_has_arm_sve2()); - +#if MEGDNN_AARCH64 ASSERT_TRUE(cpuinfo_has_arm_i8mm()); - +#endif for (uint32_t i = 0; i < cpuinfo_get_processors_count(); i++) { ASSERT_EQ(cpuinfo_get_core(i), cpuinfo_get_processor(i)->core); }