diff --git a/dnn/src/armv7/matrix_mul/fp32/strategy_4x12.cpp b/dnn/src/armv7/matrix_mul/fp32/strategy_4x12.cpp index ebfa1a4e..1686be77 100644 --- a/dnn/src/armv7/matrix_mul/fp32/strategy_4x12.cpp +++ b/dnn/src/armv7/matrix_mul/fp32/strategy_4x12.cpp @@ -6,12 +6,13 @@ * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. */ -#include "src/armv7/matrix_mul/fp32/strategy.h" -#include "src/armv7/matrix_mul/asm/common.h" #include "src/arm_common/simd_macro/marm_neon.h" +#include "src/armv7/matrix_mul/asm/common.h" +#include "src/armv7/matrix_mul/fp32/strategy.h" #include "src/common/utils.h" using namespace megdnn; @@ -42,8 +43,8 @@ namespace { // +--+ - - - - +--------+--------+--------+ // // Accumulator -void kern_4x12(const float* packA, const float* packB, int K, - float* output, int LDC, bool is_first_k, int m_remain) { +void kern_4x12(const float* packA, const float* packB, int K, float* output, + int LDC, bool is_first_k, int m_remain) { const float* a_ptr = packA; const float* b_ptr = packB; int oddk = (K & 1); @@ -84,140 +85,139 @@ void kern_4x12(const float* packA, const float* packB, int K, STORE_LINE("20", "21", "22", "23", "24", "25", "2") \ STORE_LINE("26", "27", "28", "29", "30", "31", "3") \ "101:\n" - // clang-format on - - asm volatile( - // load accumulator C - "add r1, r0, %[LDC]\n" - "add r2, r1, %[LDC]\n" - "add r3, r2, %[LDC]\n" - - "cmp %[is_first_k], #1\n" - "beq 1f\n" LOAD_C - - "b 2f\n" - - "1:\n" - "veor.32 q4, q4, q4\n" - "veor.32 q5, q5, q5\n" - "veor.32 q6, q6, q6\n" - "veor.32 q7, q7, q7\n" - "veor.32 q8, q8, q8\n" - "veor.32 q9, q9, q9\n" - "veor.32 q10, q10, q10\n" - "veor.32 q11, q11, q11\n" - "veor.32 q12, q12, q12\n" - "veor.32 q13, q13, q13\n" - "veor.32 q14, q14, q14\n" - "veor.32 q15, q15, q15\n" - - "2: \n" - "vld1.32 {d2, d3, d4, d5}, [%[b_ptr]]!\n" - "vld1.32 {d6, d7}, [%[b_ptr]]!\n" - - "cmp %[K], #0\n" - "beq 4f\n" - - "3:\n" - "vld1.32 {d0, d1}, [%[a_ptr]]!\n" - "vmla.f32 q4, q1, d0[0]\n" - "vmla.f32 q5, q2, d0[0]\n" - "vmla.f32 q6, q3, d0[0]\n" - "vmla.f32 q7, q1, d0[1]\n" - "vmla.f32 q8, q2, d0[1]\n" - "vmla.f32 q9, q3, d0[1]\n" - "vmla.f32 q10, q1, d1[0]\n" - "vmla.f32 q11, q2, d1[0]\n" - "vmla.f32 q12, q3, d1[0]\n" - "vmla.f32 q13, q1, d1[1]\n" - "vmla.f32 q14, q2, d1[1]\n" - "vmla.f32 q15, q3, d1[1]\n" - - "vld1.32 {d0, d1}, [%[a_ptr]]!\n" - "vld1.32 {d2, d3, d4, d5}, [%[b_ptr]]!\n" - "vld1.32 {d6, d7}, [%[b_ptr]]!\n" - "vmla.f32 q4, q1, d0[0]\n" - "vmla.f32 q5, q2, d0[0]\n" - "vmla.f32 q6, q3, d0[0]\n" - "vmla.f32 q7, q1, d0[1]\n" - "vmla.f32 q8, q2, d0[1]\n" - "vmla.f32 q9, q3, d0[1]\n" - "vmla.f32 q10, q1, d1[0]\n" - "vmla.f32 q11, q2, d1[0]\n" - "vmla.f32 q12, q3, d1[0]\n" - "vmla.f32 q13, q1, d1[1]\n" - "vmla.f32 q14, q2, d1[1]\n" - "vmla.f32 q15, q3, d1[1]\n" - - "vld1.32 {d2, d3, d4, d5}, [%[b_ptr]]!\n" - "vld1.32 {d6, d7}, [%[b_ptr]]!\n" - "subs %[K], #1\n" - "bne 3b\n" - - "4:\n" - "cmp %[oddk], #1\n" - "beq 5f\n" - - // Even tail - "vld1.32 {d0, d1}, [%[a_ptr]]!\n" - "vmla.f32 q4, q1, d0[0]\n" - "vmla.f32 q5, q2, d0[0]\n" - "vmla.f32 q6, q3, d0[0]\n" - "vmla.f32 q7, q1, d0[1]\n" - "vmla.f32 q8, q2, d0[1]\n" - "vmla.f32 q9, q3, d0[1]\n" - "vmla.f32 q10, q1, d1[0]\n" - "vmla.f32 q11, q2, d1[0]\n" - "vmla.f32 q12, q3, d1[0]\n" - "vmla.f32 q13, q1, d1[1]\n" - "vmla.f32 q14, q2, d1[1]\n" - "vmla.f32 q15, q3, d1[1]\n" - - "vld1.32 {d0, d1}, [%[a_ptr]]!\n" - "vld1.32 {d2, d3, d4, d5}, [%[b_ptr]]!\n" - "vld1.32 {d6, d7}, [%[b_ptr]]!\n" - "vmla.f32 q4, q1, d0[0]\n" - "vmla.f32 q5, q2, d0[0]\n" - "vmla.f32 q6, q3, d0[0]\n" - "vmla.f32 q7, q1, d0[1]\n" - "vmla.f32 q8, q2, d0[1]\n" - "vmla.f32 q9, q3, d0[1]\n" - "vmla.f32 q10, q1, d1[0]\n" - "vmla.f32 q11, q2, d1[0]\n" - "vmla.f32 q12, q3, d1[0]\n" - "vmla.f32 q13, q1, d1[1]\n" - "vmla.f32 q14, q2, d1[1]\n" - "vmla.f32 q15, q3, d1[1]\n" - "b 6f\n" - - // odd tail - "5:\n" - "vld1.32 {d0, d1}, [%[a_ptr]]!\n" - "vmla.f32 q4, q1, d0[0]\n" - "vmla.f32 q5, q2, d0[0]\n" - "vmla.f32 q6, q3, d0[0]\n" - "vmla.f32 q7, q1, d0[1]\n" - "vmla.f32 q8, q2, d0[1]\n" - "vmla.f32 q9, q3, d0[1]\n" - "vmla.f32 q10, q1, d1[0]\n" - "vmla.f32 q11, q2, d1[0]\n" - "vmla.f32 q12, q3, d1[0]\n" - "vmla.f32 q13, q1, d1[1]\n" - "vmla.f32 q14, q2, d1[1]\n" - "vmla.f32 q15, q3, d1[1]\n" - - "6:\n" STORE_C - - : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [K] "+r"(K), - [LDC] "+r"(LDC), [is_first_k] "+r"(is_first_k), - [oddk] "+r"(oddk), [m_remain] "+r"(m_remain), - [outptr] "+r"(outptr) - : - : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", - "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", - "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", - "d25", "d26", "d27", "d28", "d29", "d30", "d31", "r1", - "r2", "r3", "r9", "r10", "cc", "memory"); + // clang-format on + + asm volatile( + // load accumulator C + "add r1, r0, %[LDC]\n" + "add r2, r1, %[LDC]\n" + "add r3, r2, %[LDC]\n" + + "cmp %[is_first_k], #1\n" + "beq 1f\n" LOAD_C + + "b 2f\n" + + "1:\n" + "veor.32 q4, q4, q4\n" + "veor.32 q5, q5, q5\n" + "veor.32 q6, q6, q6\n" + "veor.32 q7, q7, q7\n" + "veor.32 q8, q8, q8\n" + "veor.32 q9, q9, q9\n" + "veor.32 q10, q10, q10\n" + "veor.32 q11, q11, q11\n" + "veor.32 q12, q12, q12\n" + "veor.32 q13, q13, q13\n" + "veor.32 q14, q14, q14\n" + "veor.32 q15, q15, q15\n" + + "2: \n" + "vld1.32 {d2, d3, d4, d5}, [%[b_ptr]]!\n" + "vld1.32 {d6, d7}, [%[b_ptr]]!\n" + + "cmp %[K], #0\n" + "beq 4f\n" + + "3:\n" + "vld1.32 {d0, d1}, [%[a_ptr]]!\n" + "vmla.f32 q4, q1, d0[0]\n" + "vmla.f32 q5, q2, d0[0]\n" + "vmla.f32 q6, q3, d0[0]\n" + "vmla.f32 q7, q1, d0[1]\n" + "vmla.f32 q8, q2, d0[1]\n" + "vmla.f32 q9, q3, d0[1]\n" + "vmla.f32 q10, q1, d1[0]\n" + "vmla.f32 q11, q2, d1[0]\n" + "vmla.f32 q12, q3, d1[0]\n" + "vmla.f32 q13, q1, d1[1]\n" + "vmla.f32 q14, q2, d1[1]\n" + "vmla.f32 q15, q3, d1[1]\n" + + "vld1.32 {d0, d1}, [%[a_ptr]]!\n" + "vld1.32 {d2, d3, d4, d5}, [%[b_ptr]]!\n" + "vld1.32 {d6, d7}, [%[b_ptr]]!\n" + "vmla.f32 q4, q1, d0[0]\n" + "vmla.f32 q5, q2, d0[0]\n" + "vmla.f32 q6, q3, d0[0]\n" + "vmla.f32 q7, q1, d0[1]\n" + "vmla.f32 q8, q2, d0[1]\n" + "vmla.f32 q9, q3, d0[1]\n" + "vmla.f32 q10, q1, d1[0]\n" + "vmla.f32 q11, q2, d1[0]\n" + "vmla.f32 q12, q3, d1[0]\n" + "vmla.f32 q13, q1, d1[1]\n" + "vmla.f32 q14, q2, d1[1]\n" + "vmla.f32 q15, q3, d1[1]\n" + + "vld1.32 {d2, d3, d4, d5}, [%[b_ptr]]!\n" + "vld1.32 {d6, d7}, [%[b_ptr]]!\n" + "subs %[K], #1\n" + "bne 3b\n" + + "4:\n" + "cmp %[oddk], #1\n" + "beq 5f\n" + + // Even tail + "vld1.32 {d0, d1}, [%[a_ptr]]!\n" + "vmla.f32 q4, q1, d0[0]\n" + "vmla.f32 q5, q2, d0[0]\n" + "vmla.f32 q6, q3, d0[0]\n" + "vmla.f32 q7, q1, d0[1]\n" + "vmla.f32 q8, q2, d0[1]\n" + "vmla.f32 q9, q3, d0[1]\n" + "vmla.f32 q10, q1, d1[0]\n" + "vmla.f32 q11, q2, d1[0]\n" + "vmla.f32 q12, q3, d1[0]\n" + "vmla.f32 q13, q1, d1[1]\n" + "vmla.f32 q14, q2, d1[1]\n" + "vmla.f32 q15, q3, d1[1]\n" + + "vld1.32 {d0, d1}, [%[a_ptr]]!\n" + "vld1.32 {d2, d3, d4, d5}, [%[b_ptr]]!\n" + "vld1.32 {d6, d7}, [%[b_ptr]]!\n" + "vmla.f32 q4, q1, d0[0]\n" + "vmla.f32 q5, q2, d0[0]\n" + "vmla.f32 q6, q3, d0[0]\n" + "vmla.f32 q7, q1, d0[1]\n" + "vmla.f32 q8, q2, d0[1]\n" + "vmla.f32 q9, q3, d0[1]\n" + "vmla.f32 q10, q1, d1[0]\n" + "vmla.f32 q11, q2, d1[0]\n" + "vmla.f32 q12, q3, d1[0]\n" + "vmla.f32 q13, q1, d1[1]\n" + "vmla.f32 q14, q2, d1[1]\n" + "vmla.f32 q15, q3, d1[1]\n" + "b 6f\n" + + // odd tail + "5:\n" + "vld1.32 {d0, d1}, [%[a_ptr]]!\n" + "vmla.f32 q4, q1, d0[0]\n" + "vmla.f32 q5, q2, d0[0]\n" + "vmla.f32 q6, q3, d0[0]\n" + "vmla.f32 q7, q1, d0[1]\n" + "vmla.f32 q8, q2, d0[1]\n" + "vmla.f32 q9, q3, d0[1]\n" + "vmla.f32 q10, q1, d1[0]\n" + "vmla.f32 q11, q2, d1[0]\n" + "vmla.f32 q12, q3, d1[0]\n" + "vmla.f32 q13, q1, d1[1]\n" + "vmla.f32 q14, q2, d1[1]\n" + "vmla.f32 q15, q3, d1[1]\n" + + "6:\n" STORE_C + + : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [K] "+r"(K), + [LDC] "+r"(LDC), [is_first_k] "+r"(is_first_k), [oddk] "+r"(oddk), + [m_remain] "+r"(m_remain), [outptr] "+r"(outptr) + : + : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", + "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", + "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", + "d29", "d30", "d31", "r1", "r2", "r3", "r9", "r10", "cc", + "memory"); #undef LOAD_LINE #undef LOAD_C @@ -316,92 +316,92 @@ void kern_4x4(const float* packA, const float* packB, int K, float* output, STORE_LINE("12", "13", "2") \ STORE_LINE("14", "15", "3") \ "105:\n" - // clang-format on - - asm volatile( - // load accumulator C - "add r1, r0, %[LDC]\n" - "add r2, r1, %[LDC]\n" - "add r3, r2, %[LDC]\n" - - "cmp %[is_first_k], #1\n" - "beq 1f\n" LOAD_C - - "b 2f\n" - - "1:\n" - "veor.32 q4, q4, q4\n" - "veor.32 q5, q5, q5\n" - "veor.32 q6, q6, q6\n" - "veor.32 q7, q7, q7\n" - - "2: \n" - "vld1.32 {d0, d1}, [%[a_ptr]]!\n" - "vld1.32 {d4, d5}, [%[b_ptr]]!\n" - "cmp %[K], #0\n" - "beq 4f\n" - - "3:\n" - "vld1.32 {d2, d3}, [%[a_ptr]]!\n" - "vld1.32 {d6, d7}, [%[b_ptr]]!\n" - "vmla.f32 q4, q2, d0[0]\n" - "vmla.f32 q5, q2, d0[1]\n" - "vmla.f32 q6, q2, d1[0]\n" - "vmla.f32 q7, q2, d1[1]\n" - - "vld1.32 {d0, d1}, [%[a_ptr]]!\n" - "vld1.32 {d4, d5}, [%[b_ptr]]!\n" - "vmla.f32 q4, q3, d2[0]\n" - "vmla.f32 q5, q3, d2[1]\n" - "vmla.f32 q6, q3, d3[0]\n" - "vmla.f32 q7, q3, d3[1]\n" - - "subs %[K], #1\n" - "bne 3b\n" - - "4:\n" - "cmp %[oddk], #1\n" - "beq 5f\n" - - // Even tail - "vld1.32 {d2, d3}, [%[a_ptr]]!\n" - "vld1.32 {d6, d7}, [%[b_ptr]]!\n" - "vmla.f32 q4, q2, d0[0]\n" - "vmla.f32 q5, q2, d0[1]\n" - "vmla.f32 q6, q2, d1[0]\n" - "vmla.f32 q7, q2, d1[1]\n" - - "vmla.f32 q4, q3, d2[0]\n" - "vmla.f32 q5, q3, d2[1]\n" - "vmla.f32 q6, q3, d3[0]\n" - "vmla.f32 q7, q3, d3[1]\n" - - "b 6f\n" - - // odd tail - "5:\n" - "vmla.f32 q4, q2, d0[0]\n" - "vmla.f32 q5, q2, d0[1]\n" - "vmla.f32 q6, q2, d1[0]\n" - "vmla.f32 q7, q2, d1[1]\n" - - "6:\n" STORE_C - - : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [K] "+r"(K), - [LDC] "+r"(LDC), [is_first_k] "+r"(is_first_k), - [oddk] "+r"(oddk), [m_remain] "+r"(m_remain), - [n_remain] "+r"(n_remain), [outptr] "+r"(outptr) - : - : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", - "d9", "d10", "d11", "d12", "d13", "d14", "d15", "r1", - "r2", "r3", "r10", "cc", "memory"); + // clang-format on + + asm volatile( + // load accumulator C + "add r1, r0, %[LDC]\n" + "add r2, r1, %[LDC]\n" + "add r3, r2, %[LDC]\n" + + "cmp %[is_first_k], #1\n" + "beq 1f\n" LOAD_C + + "b 2f\n" + + "1:\n" + "veor.32 q4, q4, q4\n" + "veor.32 q5, q5, q5\n" + "veor.32 q6, q6, q6\n" + "veor.32 q7, q7, q7\n" + + "2: \n" + "vld1.32 {d0, d1}, [%[a_ptr]]!\n" + "vld1.32 {d4, d5}, [%[b_ptr]]!\n" + "cmp %[K], #0\n" + "beq 4f\n" + + "3:\n" + "vld1.32 {d2, d3}, [%[a_ptr]]!\n" + "vld1.32 {d6, d7}, [%[b_ptr]]!\n" + "vmla.f32 q4, q2, d0[0]\n" + "vmla.f32 q5, q2, d0[1]\n" + "vmla.f32 q6, q2, d1[0]\n" + "vmla.f32 q7, q2, d1[1]\n" + + "vld1.32 {d0, d1}, [%[a_ptr]]!\n" + "vld1.32 {d4, d5}, [%[b_ptr]]!\n" + "vmla.f32 q4, q3, d2[0]\n" + "vmla.f32 q5, q3, d2[1]\n" + "vmla.f32 q6, q3, d3[0]\n" + "vmla.f32 q7, q3, d3[1]\n" + + "subs %[K], #1\n" + "bne 3b\n" + + "4:\n" + "cmp %[oddk], #1\n" + "beq 5f\n" + + // Even tail + "vld1.32 {d2, d3}, [%[a_ptr]]!\n" + "vld1.32 {d6, d7}, [%[b_ptr]]!\n" + "vmla.f32 q4, q2, d0[0]\n" + "vmla.f32 q5, q2, d0[1]\n" + "vmla.f32 q6, q2, d1[0]\n" + "vmla.f32 q7, q2, d1[1]\n" + + "vmla.f32 q4, q3, d2[0]\n" + "vmla.f32 q5, q3, d2[1]\n" + "vmla.f32 q6, q3, d3[0]\n" + "vmla.f32 q7, q3, d3[1]\n" + + "b 6f\n" + + // odd tail + "5:\n" + "vmla.f32 q4, q2, d0[0]\n" + "vmla.f32 q5, q2, d0[1]\n" + "vmla.f32 q6, q2, d1[0]\n" + "vmla.f32 q7, q2, d1[1]\n" + + "6:\n" STORE_C + + : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [K] "+r"(K), + [LDC] "+r"(LDC), [is_first_k] "+r"(is_first_k), [oddk] "+r"(oddk), + [m_remain] "+r"(m_remain), [n_remain] "+r"(n_remain), + [outptr] "+r"(outptr) + : + : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", + "d11", "d12", "d13", "d14", "d15", "r1", "r2", "r3", "r10", "cc", + "memory"); #undef LOAD_LINE #undef LOAD_C #undef STORE_LINE #undef STORE_C } -void sgemm_4x12_pack_A_n(float * outptr, const float * inptr, int ldin, int y0, +void sgemm_4x12_pack_A_n(float* outptr, const float* inptr, int ldin, int y0, int ymax, int k0, int kmax) { float zerobuff[4]; std::memset(zerobuff, 0, sizeof(float) * 4); @@ -444,8 +444,10 @@ void sgemm_4x12_pack_A_n(float * outptr, const float * inptr, int ldin, int y0, /* Everything falls through in here */ case 2: inptr1 = zerobuff; + MEGDNN_FALLTHRU case 1: inptr2 = zerobuff; + MEGDNN_FALLTHRU case 0: inptr3 = zerobuff; break; @@ -463,8 +465,10 @@ void sgemm_4x12_pack_A_n(float * outptr, const float * inptr, int ldin, int y0, /* Everything falls through in here */ case 2: inptr1 = zerobuff; + MEGDNN_FALLTHRU case 1: inptr2 = zerobuff; + MEGDNN_FALLTHRU case 0: inptr3 = zerobuff; break; @@ -530,8 +534,8 @@ void sgemm_4x12_pack_A_t(float* out, const float* in, int ldin, int x0, } } -void sgemm_4x12_pack_B_n(float* out, const float* in, int ldin, - int x0, int xmax, int k0, int kmax) { +void sgemm_4x12_pack_B_n(float* out, const float* in, int ldin, int x0, + int xmax, int k0, int kmax) { int ksize = kmax - k0; int ksize12 = ksize * 12; int ksize4 = (ksize << 2); @@ -600,8 +604,8 @@ void sgemm_4x12_pack_B_n(float* out, const float* in, int ldin, } } -void sgemm_4x12_pack_B_t(float* out, const float* in, int ldin, - int y0, int ymax, int k0, int kmax) { +void sgemm_4x12_pack_B_t(float* out, const float* in, int ldin, int y0, + int ymax, int k0, int kmax) { float* outptr = out; const float* inptr = in; float zerobuff[4]; @@ -660,8 +664,10 @@ void sgemm_4x12_pack_B_t(float* out, const float* in, int ldin, /* Everything falls through in here */ case 2: inptr1 = zerobuff; + MEGDNN_FALLTHRU case 1: inptr2 = zerobuff; + MEGDNN_FALLTHRU case 0: inptr3 = zerobuff; break; @@ -679,8 +685,10 @@ void sgemm_4x12_pack_B_t(float* out, const float* in, int ldin, /* Everything falls through in here */ case 2: inptr1 = zerobuff; + MEGDNN_FALLTHRU case 1: inptr2 = zerobuff; + MEGDNN_FALLTHRU case 0: inptr3 = zerobuff; break; @@ -697,8 +705,8 @@ void sgemm_4x12_pack_B_t(float* out, const float* in, int ldin, MEGDNN_REG_GEMM_STRATEGY_IMPL(sgemm_4x12); -void sgemm_4x12::pack_A(float* out, const float* in, int ldin, int y0, - int ymax, int k0, int kmax, bool transpose_A) const { +void sgemm_4x12::pack_A(float* out, const float* in, int ldin, int y0, int ymax, + int k0, int kmax, bool transpose_A) const { if (transpose_A) { sgemm_4x12_pack_A_t(out, in, ldin, y0, ymax, k0, kmax); } else { @@ -715,9 +723,9 @@ void sgemm_4x12::pack_B(float* out, const float* in, int ldin, int x0, int xmax, } } -void sgemm_4x12::kern(const float* packA, const float* packB, - size_t M, size_t N, size_t K, float* C, size_t LDC, - bool is_first_k, const float*, float*) const { +void sgemm_4x12::kern(const float* packA, const float* packB, size_t M, + size_t N, size_t K, float* C, size_t LDC, bool is_first_k, + const float*, float*) const { megdnn_assert(A_dtype.enumv() == B_dtype.enumv() && A_dtype.enumv() == C_dtype.enumv() && A_dtype.enumv() == DTypeEnum::Float32); diff --git a/dnn/src/armv7/matrix_mul/int16x16x32/kernel_12x4x1.h b/dnn/src/armv7/matrix_mul/int16x16x32/kernel_12x4x1.h index 9cd6efe8..e100c03b 100644 --- a/dnn/src/armv7/matrix_mul/int16x16x32/kernel_12x4x1.h +++ b/dnn/src/armv7/matrix_mul/int16x16x32/kernel_12x4x1.h @@ -6,7 +6,8 @@ * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. */ #include "src/arm_common/simd_macro/marm_neon.h" @@ -988,8 +989,10 @@ static void gemm_s16x16x32_12x4_transpose_pack_B_n(dt_int16* outptr, switch (y + 3 - ymax) { case 2: inptr1 = zerobuff; + MEGDNN_FALLTHRU case 1: inptr2 = zerobuff; + MEGDNN_FALLTHRU case 0: inptr3 = zerobuff; break; diff --git a/dnn/src/armv7/matrix_mul/int8/kernel_4x2x16.h b/dnn/src/armv7/matrix_mul/int8/kernel_4x2x16.h index 62f03730..1b5c9dd9 100644 --- a/dnn/src/armv7/matrix_mul/int8/kernel_4x2x16.h +++ b/dnn/src/armv7/matrix_mul/int8/kernel_4x2x16.h @@ -6,7 +6,8 @@ * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. */ #include "src/arm_common/simd_macro/marm_neon.h" @@ -330,9 +331,9 @@ static void gemm_s8_4x2_pack_A_n(dt_int8* outptr, const dt_int8* inptr, if (y + 3 >= ymax) { switch (y + 3 - ymax) { case 2: - inptr1 = zerobuff; + inptr1 = zerobuff;MEGDNN_FALLTHRU case 1: - inptr2 = zerobuff; + inptr2 = zerobuff;MEGDNN_FALLTHRU case 0: inptr3 = zerobuff; break; @@ -347,9 +348,9 @@ static void gemm_s8_4x2_pack_A_n(dt_int8* outptr, const dt_int8* inptr, if (y + 3 >= ymax) { switch (y + 3 - ymax) { case 2: - inptr1 = zerobuff; + inptr1 = zerobuff;MEGDNN_FALLTHRU case 1: - inptr2 = zerobuff; + inptr2 = zerobuff;MEGDNN_FALLTHRU case 0: inptr3 = zerobuff; break; @@ -390,19 +391,19 @@ static void gemm_s8_4x2_pack_A_t(dt_int8* out, const dt_int8* in, int ldin, if (remain >= 0) { switch (remain) { case 7: - inptr0 = zerobuff; + inptr0 = zerobuff;MEGDNN_FALLTHRU case 6: - inptr1 = zerobuff; + inptr1 = zerobuff;MEGDNN_FALLTHRU case 5: - inptr2 = zerobuff; + inptr2 = zerobuff;MEGDNN_FALLTHRU case 4: - inptr3 = zerobuff; + inptr3 = zerobuff;MEGDNN_FALLTHRU case 3: - inptr4 = zerobuff; + inptr4 = zerobuff;MEGDNN_FALLTHRU case 2: - inptr5 = zerobuff; + inptr5 = zerobuff;MEGDNN_FALLTHRU case 1: - inptr6 = zerobuff; + inptr6 = zerobuff;MEGDNN_FALLTHRU case 0: inptr7 = zerobuff; break; @@ -421,19 +422,19 @@ static void gemm_s8_4x2_pack_A_t(dt_int8* out, const dt_int8* in, int ldin, if (remain >= 0) { switch (remain) { case 7: - inptr0 = zerobuff; + inptr0 = zerobuff;MEGDNN_FALLTHRU case 6: - inptr1 = zerobuff; + inptr1 = zerobuff;MEGDNN_FALLTHRU case 5: - inptr2 = zerobuff; + inptr2 = zerobuff;MEGDNN_FALLTHRU case 4: - inptr3 = zerobuff; + inptr3 = zerobuff;MEGDNN_FALLTHRU case 3: - inptr4 = zerobuff; + inptr4 = zerobuff;MEGDNN_FALLTHRU case 2: - inptr5 = zerobuff; + inptr5 = zerobuff;MEGDNN_FALLTHRU case 1: - inptr6 = zerobuff; + inptr6 = zerobuff;MEGDNN_FALLTHRU case 0: inptr7 = zerobuff; break; @@ -488,19 +489,19 @@ static void gemm_s8_4x2_pack_B_n(dt_int8* out, const dt_int8* in, int ldin, if (remain >= 0) { switch (remain) { case 7: - inptr0 = zerobuff; + inptr0 = zerobuff;MEGDNN_FALLTHRU case 6: - inptr1 = zerobuff; + inptr1 = zerobuff;MEGDNN_FALLTHRU case 5: - inptr2 = zerobuff; + inptr2 = zerobuff;MEGDNN_FALLTHRU case 4: - inptr3 = zerobuff; + inptr3 = zerobuff;MEGDNN_FALLTHRU case 3: - inptr4 = zerobuff; + inptr4 = zerobuff;MEGDNN_FALLTHRU case 2: - inptr5 = zerobuff; + inptr5 = zerobuff;MEGDNN_FALLTHRU case 1: - inptr6 = zerobuff; + inptr6 = zerobuff;MEGDNN_FALLTHRU case 0: inptr7 = zerobuff; break; @@ -519,19 +520,19 @@ static void gemm_s8_4x2_pack_B_n(dt_int8* out, const dt_int8* in, int ldin, if (remain >= 0) { switch (remain) { case 7: - inptr0 = zerobuff; + inptr0 = zerobuff;MEGDNN_FALLTHRU case 6: - inptr1 = zerobuff; + inptr1 = zerobuff;MEGDNN_FALLTHRU case 5: - inptr2 = zerobuff; + inptr2 = zerobuff;MEGDNN_FALLTHRU case 4: - inptr3 = zerobuff; + inptr3 = zerobuff;MEGDNN_FALLTHRU case 3: - inptr4 = zerobuff; + inptr4 = zerobuff;MEGDNN_FALLTHRU case 2: - inptr5 = zerobuff; + inptr5 = zerobuff;MEGDNN_FALLTHRU case 1: - inptr6 = zerobuff; + inptr6 = zerobuff;MEGDNN_FALLTHRU case 0: inptr7 = zerobuff; break; diff --git a/dnn/src/armv7/matrix_mul/int8/kernel_4x8x8.h b/dnn/src/armv7/matrix_mul/int8/kernel_4x8x8.h index 6d01e9d4..411bfcd5 100644 --- a/dnn/src/armv7/matrix_mul/int8/kernel_4x8x8.h +++ b/dnn/src/armv7/matrix_mul/int8/kernel_4x8x8.h @@ -6,7 +6,8 @@ * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. */ #include "src/arm_common/simd_macro/marm_neon.h" @@ -378,7 +379,6 @@ static void kern_4x4(const int8_t* packA, const int8_t* packB, int K, #undef STORE_C } - static void gemm_s8_4x8_pack_A_n(dt_int8* outptr, const dt_int8* inptr, int ldin, int y0, int ymax, int k0, int kmax) { int8_t zerobuff[16]; @@ -402,8 +402,10 @@ static void gemm_s8_4x8_pack_A_n(dt_int8* outptr, const dt_int8* inptr, switch (y + 3 - ymax) { case 2: inptr1 = zerobuff; + MEGDNN_FALLTHRU case 1: inptr2 = zerobuff; + MEGDNN_FALLTHRU case 0: inptr3 = zerobuff; break; @@ -420,8 +422,10 @@ static void gemm_s8_4x8_pack_A_n(dt_int8* outptr, const dt_int8* inptr, switch (y + 3 - ymax) { case 2: inptr1 = zerobuff; + MEGDNN_FALLTHRU case 1: inptr2 = zerobuff; + MEGDNN_FALLTHRU case 0: inptr3 = zerobuff; break; @@ -472,16 +476,22 @@ static void gemm_s8_4x8_transpose_pack_A_n(dt_int8* out, const dt_int8* in, switch (k + 7 - kmax) { case 6: inptr1 = zerobuff; + MEGDNN_FALLTHRU case 5: inptr2 = zerobuff; + MEGDNN_FALLTHRU case 4: inptr3 = zerobuff; + MEGDNN_FALLTHRU case 3: inptr4 = zerobuff; + MEGDNN_FALLTHRU case 2: inptr5 = zerobuff; + MEGDNN_FALLTHRU case 1: inptr6 = zerobuff; + MEGDNN_FALLTHRU case 0: inptr7 = zerobuff; break; @@ -500,16 +510,22 @@ static void gemm_s8_4x8_transpose_pack_A_n(dt_int8* out, const dt_int8* in, switch (k + 7 - kmax) { case 6: inptr1 = zerobuff; + MEGDNN_FALLTHRU case 5: inptr2 = zerobuff; + MEGDNN_FALLTHRU case 4: inptr3 = zerobuff; + MEGDNN_FALLTHRU case 3: inptr4 = zerobuff; + MEGDNN_FALLTHRU case 2: inptr5 = zerobuff; + MEGDNN_FALLTHRU case 1: inptr6 = zerobuff; + MEGDNN_FALLTHRU case 0: inptr7 = zerobuff; break; @@ -565,16 +581,22 @@ static void gemm_s8_4x8_pack_B_n(dt_int8* out, const dt_int8* in, int ldin, switch (k + 7 - kmax) { case 6: inptr1 = zerobuff; + MEGDNN_FALLTHRU case 5: inptr2 = zerobuff; + MEGDNN_FALLTHRU case 4: inptr3 = zerobuff; + MEGDNN_FALLTHRU case 3: inptr4 = zerobuff; + MEGDNN_FALLTHRU case 2: inptr5 = zerobuff; + MEGDNN_FALLTHRU case 1: inptr6 = zerobuff; + MEGDNN_FALLTHRU case 0: inptr7 = zerobuff; break; @@ -594,16 +616,22 @@ static void gemm_s8_4x8_pack_B_n(dt_int8* out, const dt_int8* in, int ldin, switch (k + 7 - kmax) { case 6: inptr1 = zerobuff; + MEGDNN_FALLTHRU case 5: inptr2 = zerobuff; + MEGDNN_FALLTHRU case 4: inptr3 = zerobuff; + MEGDNN_FALLTHRU case 3: inptr4 = zerobuff; + MEGDNN_FALLTHRU case 2: inptr5 = zerobuff; + MEGDNN_FALLTHRU case 1: inptr6 = zerobuff; + MEGDNN_FALLTHRU case 0: inptr7 = zerobuff; break; @@ -623,16 +651,22 @@ static void gemm_s8_4x8_pack_B_n(dt_int8* out, const dt_int8* in, int ldin, switch (k + 7 - kmax) { case 6: inptr1 = zerobuff; + MEGDNN_FALLTHRU case 5: inptr2 = zerobuff; + MEGDNN_FALLTHRU case 4: inptr3 = zerobuff; + MEGDNN_FALLTHRU case 3: inptr4 = zerobuff; + MEGDNN_FALLTHRU case 2: inptr5 = zerobuff; + MEGDNN_FALLTHRU case 1: inptr6 = zerobuff; + MEGDNN_FALLTHRU case 0: inptr7 = zerobuff; break; @@ -710,8 +744,10 @@ static void gemm_s8_4x8_transpose_pack_B_n(dt_int8* outptr, switch (y + 3 - ymax) { case 2: inptr1 = zerobuff; + MEGDNN_FALLTHRU case 1: inptr2 = zerobuff; + MEGDNN_FALLTHRU case 0: inptr3 = zerobuff; break; @@ -729,8 +765,10 @@ static void gemm_s8_4x8_transpose_pack_B_n(dt_int8* outptr, switch (y + 3 - ymax) { case 2: inptr1 = zerobuff; + MEGDNN_FALLTHRU case 1: inptr2 = zerobuff; + MEGDNN_FALLTHRU case 0: inptr3 = zerobuff; break; diff --git a/dnn/src/armv7/matrix_mul/int8/kernel_mk4_4x2x16.h b/dnn/src/armv7/matrix_mul/int8/kernel_mk4_4x2x16.h index 9d4831ca..2692dcf6 100644 --- a/dnn/src/armv7/matrix_mul/int8/kernel_mk4_4x2x16.h +++ b/dnn/src/armv7/matrix_mul/int8/kernel_mk4_4x2x16.h @@ -6,7 +6,8 @@ * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. */ #include "src/arm_common/simd_macro/marm_neon.h" @@ -323,8 +324,10 @@ static void gemm_mk4_s8_4x2_pack_B(dt_int8* out, const dt_int8* in, int ldin, switch (k + 3 - ICB) { case 2: inptr1 = zerobuff; + MEGDNN_FALLTHRU case 1: inptr2 = zerobuff; + MEGDNN_FALLTHRU case 0: inptr3 = zerobuff; break; @@ -340,8 +343,10 @@ static void gemm_mk4_s8_4x2_pack_B(dt_int8* out, const dt_int8* in, int ldin, switch (k + 3 - ICB) { case 2: inptr1 = zerobuff; + MEGDNN_FALLTHRU case 1: inptr2 = zerobuff; + MEGDNN_FALLTHRU case 0: inptr3 = zerobuff; break; diff --git a/dnn/src/armv7/matrix_mul/int8x8x16/kernel_4x2x16.h b/dnn/src/armv7/matrix_mul/int8x8x16/kernel_4x2x16.h index ac0a591c..8f24b5a0 100644 --- a/dnn/src/armv7/matrix_mul/int8x8x16/kernel_4x2x16.h +++ b/dnn/src/armv7/matrix_mul/int8x8x16/kernel_4x2x16.h @@ -6,7 +6,8 @@ * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. */ #include "src/arm_common/simd_macro/marm_neon.h" @@ -278,9 +279,9 @@ static void gemm_s8x8x16_4x2_pack_A_n(dt_int8* outptr, const dt_int8* inptr, if (y + 3 >= ymax) { switch (y + 3 - ymax) { case 2: - inptr1 = zerobuff; + inptr1 = zerobuff;MEGDNN_FALLTHRU case 1: - inptr2 = zerobuff; + inptr2 = zerobuff;MEGDNN_FALLTHRU case 0: inptr3 = zerobuff; break; @@ -295,9 +296,9 @@ static void gemm_s8x8x16_4x2_pack_A_n(dt_int8* outptr, const dt_int8* inptr, if (y + 3 >= ymax) { switch (y + 3 - ymax) { case 2: - inptr1 = zerobuff; + inptr1 = zerobuff;MEGDNN_FALLTHRU case 1: - inptr2 = zerobuff; + inptr2 = zerobuff;MEGDNN_FALLTHRU case 0: inptr3 = zerobuff; break; @@ -338,19 +339,19 @@ static void gemm_s8x8x16_4x2_pack_A_t(dt_int8* out, const dt_int8* in, int ldin, if (remain >= 0) { switch (remain) { case 7: - inptr0 = zerobuff; + inptr0 = zerobuff;MEGDNN_FALLTHRU case 6: - inptr1 = zerobuff; + inptr1 = zerobuff;MEGDNN_FALLTHRU case 5: - inptr2 = zerobuff; + inptr2 = zerobuff;MEGDNN_FALLTHRU case 4: - inptr3 = zerobuff; + inptr3 = zerobuff;MEGDNN_FALLTHRU case 3: - inptr4 = zerobuff; + inptr4 = zerobuff;MEGDNN_FALLTHRU case 2: - inptr5 = zerobuff; + inptr5 = zerobuff;MEGDNN_FALLTHRU case 1: - inptr6 = zerobuff; + inptr6 = zerobuff;MEGDNN_FALLTHRU case 0: inptr7 = zerobuff; break; @@ -369,19 +370,19 @@ static void gemm_s8x8x16_4x2_pack_A_t(dt_int8* out, const dt_int8* in, int ldin, if (remain >= 0) { switch (remain) { case 7: - inptr0 = zerobuff; + inptr0 = zerobuff;MEGDNN_FALLTHRU case 6: - inptr1 = zerobuff; + inptr1 = zerobuff;MEGDNN_FALLTHRU case 5: - inptr2 = zerobuff; + inptr2 = zerobuff;MEGDNN_FALLTHRU case 4: - inptr3 = zerobuff; + inptr3 = zerobuff;MEGDNN_FALLTHRU case 3: - inptr4 = zerobuff; + inptr4 = zerobuff;MEGDNN_FALLTHRU case 2: - inptr5 = zerobuff; + inptr5 = zerobuff;MEGDNN_FALLTHRU case 1: - inptr6 = zerobuff; + inptr6 = zerobuff;MEGDNN_FALLTHRU case 0: inptr7 = zerobuff; break; @@ -436,19 +437,19 @@ static void gemm_s8x8x16_4x2_pack_B_n(dt_int8* out, const dt_int8* in, int ldin, if (remain >= 0) { switch (remain) { case 7: - inptr0 = zerobuff; + inptr0 = zerobuff;MEGDNN_FALLTHRU case 6: - inptr1 = zerobuff; + inptr1 = zerobuff;MEGDNN_FALLTHRU case 5: - inptr2 = zerobuff; + inptr2 = zerobuff;MEGDNN_FALLTHRU case 4: - inptr3 = zerobuff; + inptr3 = zerobuff;MEGDNN_FALLTHRU case 3: - inptr4 = zerobuff; + inptr4 = zerobuff;MEGDNN_FALLTHRU case 2: - inptr5 = zerobuff; + inptr5 = zerobuff;MEGDNN_FALLTHRU case 1: - inptr6 = zerobuff; + inptr6 = zerobuff;MEGDNN_FALLTHRU case 0: inptr7 = zerobuff; break; @@ -467,19 +468,19 @@ static void gemm_s8x8x16_4x2_pack_B_n(dt_int8* out, const dt_int8* in, int ldin, if (remain >= 0) { switch (remain) { case 7: - inptr0 = zerobuff; + inptr0 = zerobuff;MEGDNN_FALLTHRU case 6: - inptr1 = zerobuff; + inptr1 = zerobuff;MEGDNN_FALLTHRU case 5: - inptr2 = zerobuff; + inptr2 = zerobuff;MEGDNN_FALLTHRU case 4: - inptr3 = zerobuff; + inptr3 = zerobuff;MEGDNN_FALLTHRU case 3: - inptr4 = zerobuff; + inptr4 = zerobuff;MEGDNN_FALLTHRU case 2: - inptr5 = zerobuff; + inptr5 = zerobuff;MEGDNN_FALLTHRU case 1: - inptr6 = zerobuff; + inptr6 = zerobuff;MEGDNN_FALLTHRU case 0: inptr7 = zerobuff; break; diff --git a/dnn/src/armv7/matrix_mul/int8x8x16/kernel_4x8x8.h b/dnn/src/armv7/matrix_mul/int8x8x16/kernel_4x8x8.h index 5ad26daa..cd3b9e22 100644 --- a/dnn/src/armv7/matrix_mul/int8x8x16/kernel_4x8x8.h +++ b/dnn/src/armv7/matrix_mul/int8x8x16/kernel_4x8x8.h @@ -6,7 +6,8 @@ * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. */ #include "src/arm_common/simd_macro/marm_neon.h" @@ -76,7 +77,7 @@ static void kern_4x8(const int8_t* packA, const int8_t* packB, int K, STORE_LINE("14", "15", "3") \ "101:\n" -// clang-format on + // clang-format on register int16_t* outptr asm("r0") = output; asm volatile( @@ -406,8 +407,10 @@ static void gemm_s8x8x16_4x8_pack_A_n(dt_int8* outptr, const dt_int8* inptr, switch (y + 3 - ymax) { case 2: inptr1 = zerobuff; + MEGDNN_FALLTHRU case 1: inptr2 = zerobuff; + MEGDNN_FALLTHRU case 0: inptr3 = zerobuff; break; @@ -424,8 +427,10 @@ static void gemm_s8x8x16_4x8_pack_A_n(dt_int8* outptr, const dt_int8* inptr, switch (y + 3 - ymax) { case 2: inptr1 = zerobuff; + MEGDNN_FALLTHRU case 1: inptr2 = zerobuff; + MEGDNN_FALLTHRU case 0: inptr3 = zerobuff; break; @@ -476,16 +481,22 @@ static void gemm_s8x8x16_4x8_transpose_pack_A_n(dt_int8* out, const dt_int8* in, switch (k + 7 - kmax) { case 6: inptr1 = zerobuff; + MEGDNN_FALLTHRU case 5: inptr2 = zerobuff; + MEGDNN_FALLTHRU case 4: inptr3 = zerobuff; + MEGDNN_FALLTHRU case 3: inptr4 = zerobuff; + MEGDNN_FALLTHRU case 2: inptr5 = zerobuff; + MEGDNN_FALLTHRU case 1: inptr6 = zerobuff; + MEGDNN_FALLTHRU case 0: inptr7 = zerobuff; break; @@ -504,16 +515,22 @@ static void gemm_s8x8x16_4x8_transpose_pack_A_n(dt_int8* out, const dt_int8* in, switch (k + 7 - kmax) { case 6: inptr1 = zerobuff; + MEGDNN_FALLTHRU case 5: inptr2 = zerobuff; + MEGDNN_FALLTHRU case 4: inptr3 = zerobuff; + MEGDNN_FALLTHRU case 3: inptr4 = zerobuff; + MEGDNN_FALLTHRU case 2: inptr5 = zerobuff; + MEGDNN_FALLTHRU case 1: inptr6 = zerobuff; + MEGDNN_FALLTHRU case 0: inptr7 = zerobuff; break; @@ -569,16 +586,22 @@ static void gemm_s8x8x16_4x8_pack_B_n(dt_int8* out, const dt_int8* in, int ldin, switch (k + 7 - kmax) { case 6: inptr1 = zerobuff; + MEGDNN_FALLTHRU case 5: inptr2 = zerobuff; + MEGDNN_FALLTHRU case 4: inptr3 = zerobuff; + MEGDNN_FALLTHRU case 3: inptr4 = zerobuff; + MEGDNN_FALLTHRU case 2: inptr5 = zerobuff; + MEGDNN_FALLTHRU case 1: inptr6 = zerobuff; + MEGDNN_FALLTHRU case 0: inptr7 = zerobuff; break; @@ -598,16 +621,22 @@ static void gemm_s8x8x16_4x8_pack_B_n(dt_int8* out, const dt_int8* in, int ldin, switch (k + 7 - kmax) { case 6: inptr1 = zerobuff; + MEGDNN_FALLTHRU case 5: inptr2 = zerobuff; + MEGDNN_FALLTHRU case 4: inptr3 = zerobuff; + MEGDNN_FALLTHRU case 3: inptr4 = zerobuff; + MEGDNN_FALLTHRU case 2: inptr5 = zerobuff; + MEGDNN_FALLTHRU case 1: inptr6 = zerobuff; + MEGDNN_FALLTHRU case 0: inptr7 = zerobuff; break; @@ -627,16 +656,22 @@ static void gemm_s8x8x16_4x8_pack_B_n(dt_int8* out, const dt_int8* in, int ldin, switch (k + 7 - kmax) { case 6: inptr1 = zerobuff; + MEGDNN_FALLTHRU case 5: inptr2 = zerobuff; + MEGDNN_FALLTHRU case 4: inptr3 = zerobuff; + MEGDNN_FALLTHRU case 3: inptr4 = zerobuff; + MEGDNN_FALLTHRU case 2: inptr5 = zerobuff; + MEGDNN_FALLTHRU case 1: inptr6 = zerobuff; + MEGDNN_FALLTHRU case 0: inptr7 = zerobuff; break; @@ -715,8 +750,10 @@ static void gemm_s8x8x16_4x8_transpose_pack_B_n(dt_int8* outptr, switch (y + 3 - ymax) { case 2: inptr1 = zerobuff; + MEGDNN_FALLTHRU case 1: inptr2 = zerobuff; + MEGDNN_FALLTHRU case 0: inptr3 = zerobuff; break; @@ -734,8 +771,10 @@ static void gemm_s8x8x16_4x8_transpose_pack_B_n(dt_int8* outptr, switch (y + 3 - ymax) { case 2: inptr1 = zerobuff; + MEGDNN_FALLTHRU case 1: inptr2 = zerobuff; + MEGDNN_FALLTHRU case 0: inptr3 = zerobuff; break; diff --git a/dnn/src/armv7/matrix_mul/quint8/kernel_4x8x8.h b/dnn/src/armv7/matrix_mul/quint8/kernel_4x8x8.h index 31afa188..fb917bd7 100644 --- a/dnn/src/armv7/matrix_mul/quint8/kernel_4x8x8.h +++ b/dnn/src/armv7/matrix_mul/quint8/kernel_4x8x8.h @@ -6,7 +6,8 @@ * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. */ #include "src/arm_common/simd_macro/marm_neon.h" @@ -453,8 +454,10 @@ static void gemm_u8_4x8_pack_A_n(dt_uint8* outptr, const dt_uint8* inptr, switch (y + 3 - ymax) { case 2: inptr1 = zerobuff; + MEGDNN_FALLTHRU case 1: inptr2 = zerobuff; + MEGDNN_FALLTHRU case 0: inptr3 = zerobuff; break; @@ -471,8 +474,10 @@ static void gemm_u8_4x8_pack_A_n(dt_uint8* outptr, const dt_uint8* inptr, switch (y + 3 - ymax) { case 2: inptr1 = zerobuff; + MEGDNN_FALLTHRU case 1: inptr2 = zerobuff; + MEGDNN_FALLTHRU case 0: inptr3 = zerobuff; break; @@ -523,16 +528,22 @@ static void gemm_u8_4x8_transpose_pack_A_n(dt_uint8* out, const dt_uint8* in, switch (k + 7 - kmax) { case 6: inptr1 = zerobuff; + MEGDNN_FALLTHRU case 5: inptr2 = zerobuff; + MEGDNN_FALLTHRU case 4: inptr3 = zerobuff; + MEGDNN_FALLTHRU case 3: inptr4 = zerobuff; + MEGDNN_FALLTHRU case 2: inptr5 = zerobuff; + MEGDNN_FALLTHRU case 1: inptr6 = zerobuff; + MEGDNN_FALLTHRU case 0: inptr7 = zerobuff; break; @@ -551,16 +562,22 @@ static void gemm_u8_4x8_transpose_pack_A_n(dt_uint8* out, const dt_uint8* in, switch (k + 7 - kmax) { case 6: inptr1 = zerobuff; + MEGDNN_FALLTHRU case 5: inptr2 = zerobuff; + MEGDNN_FALLTHRU case 4: inptr3 = zerobuff; + MEGDNN_FALLTHRU case 3: inptr4 = zerobuff; + MEGDNN_FALLTHRU case 2: inptr5 = zerobuff; + MEGDNN_FALLTHRU case 1: inptr6 = zerobuff; + MEGDNN_FALLTHRU case 0: inptr7 = zerobuff; break; @@ -617,16 +634,22 @@ static void gemm_u8_4x8_pack_B_n(dt_uint8* out, const dt_uint8* in, int ldin, switch (k + 7 - kmax) { case 6: inptr1 = zerobuff; + MEGDNN_FALLTHRU case 5: inptr2 = zerobuff; + MEGDNN_FALLTHRU case 4: inptr3 = zerobuff; + MEGDNN_FALLTHRU case 3: inptr4 = zerobuff; + MEGDNN_FALLTHRU case 2: inptr5 = zerobuff; + MEGDNN_FALLTHRU case 1: inptr6 = zerobuff; + MEGDNN_FALLTHRU case 0: inptr7 = zerobuff; break; @@ -646,16 +669,22 @@ static void gemm_u8_4x8_pack_B_n(dt_uint8* out, const dt_uint8* in, int ldin, switch (k + 7 - kmax) { case 6: inptr1 = zerobuff; + MEGDNN_FALLTHRU case 5: inptr2 = zerobuff; + MEGDNN_FALLTHRU case 4: inptr3 = zerobuff; + MEGDNN_FALLTHRU case 3: inptr4 = zerobuff; + MEGDNN_FALLTHRU case 2: inptr5 = zerobuff; + MEGDNN_FALLTHRU case 1: inptr6 = zerobuff; + MEGDNN_FALLTHRU case 0: inptr7 = zerobuff; break; @@ -675,16 +704,22 @@ static void gemm_u8_4x8_pack_B_n(dt_uint8* out, const dt_uint8* in, int ldin, switch (k + 7 - kmax) { case 6: inptr1 = zerobuff; + MEGDNN_FALLTHRU case 5: inptr2 = zerobuff; + MEGDNN_FALLTHRU case 4: inptr3 = zerobuff; + MEGDNN_FALLTHRU case 3: inptr4 = zerobuff; + MEGDNN_FALLTHRU case 2: inptr5 = zerobuff; + MEGDNN_FALLTHRU case 1: inptr6 = zerobuff; + MEGDNN_FALLTHRU case 0: inptr7 = zerobuff; break; @@ -763,8 +798,10 @@ static void gemm_u8_4x8_transpose_pack_B_n(dt_uint8* outptr, switch (y + 3 - ymax) { case 2: inptr1 = zerobuff; + MEGDNN_FALLTHRU case 1: inptr2 = zerobuff; + MEGDNN_FALLTHRU case 0: inptr3 = zerobuff; break; @@ -782,8 +819,10 @@ static void gemm_u8_4x8_transpose_pack_B_n(dt_uint8* outptr, switch (y + 3 - ymax) { case 2: inptr1 = zerobuff; + MEGDNN_FALLTHRU case 1: inptr2 = zerobuff; + MEGDNN_FALLTHRU case 0: inptr3 = zerobuff; break;