GitOrigin-RevId: ab6c9644da
release-0.6
@@ -6,12 +6,13 @@ | |||||
* | * | ||||
* Unless required by applicable law or agreed to in writing, | * Unless required by applicable law or agreed to in writing, | ||||
* software distributed under the License is distributed on an | * software distributed under the License is distributed on an | ||||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
* implied. | |||||
*/ | */ | ||||
#include "src/armv7/matrix_mul/fp32/strategy.h" | |||||
#include "src/armv7/matrix_mul/asm/common.h" | |||||
#include "src/arm_common/simd_macro/marm_neon.h" | #include "src/arm_common/simd_macro/marm_neon.h" | ||||
#include "src/armv7/matrix_mul/asm/common.h" | |||||
#include "src/armv7/matrix_mul/fp32/strategy.h" | |||||
#include "src/common/utils.h" | #include "src/common/utils.h" | ||||
using namespace megdnn; | using namespace megdnn; | ||||
@@ -42,8 +43,8 @@ namespace { | |||||
// +--+ - - - - +--------+--------+--------+ | // +--+ - - - - +--------+--------+--------+ | ||||
// | // | ||||
// Accumulator | // Accumulator | ||||
void kern_4x12(const float* packA, const float* packB, int K, | |||||
float* output, int LDC, bool is_first_k, int m_remain) { | |||||
void kern_4x12(const float* packA, const float* packB, int K, float* output, | |||||
int LDC, bool is_first_k, int m_remain) { | |||||
const float* a_ptr = packA; | const float* a_ptr = packA; | ||||
const float* b_ptr = packB; | const float* b_ptr = packB; | ||||
int oddk = (K & 1); | int oddk = (K & 1); | ||||
@@ -84,140 +85,139 @@ void kern_4x12(const float* packA, const float* packB, int K, | |||||
STORE_LINE("20", "21", "22", "23", "24", "25", "2") \ | STORE_LINE("20", "21", "22", "23", "24", "25", "2") \ | ||||
STORE_LINE("26", "27", "28", "29", "30", "31", "3") \ | STORE_LINE("26", "27", "28", "29", "30", "31", "3") \ | ||||
"101:\n" | "101:\n" | ||||
// clang-format on | |||||
asm volatile( | |||||
// load accumulator C | |||||
"add r1, r0, %[LDC]\n" | |||||
"add r2, r1, %[LDC]\n" | |||||
"add r3, r2, %[LDC]\n" | |||||
"cmp %[is_first_k], #1\n" | |||||
"beq 1f\n" LOAD_C | |||||
"b 2f\n" | |||||
"1:\n" | |||||
"veor.32 q4, q4, q4\n" | |||||
"veor.32 q5, q5, q5\n" | |||||
"veor.32 q6, q6, q6\n" | |||||
"veor.32 q7, q7, q7\n" | |||||
"veor.32 q8, q8, q8\n" | |||||
"veor.32 q9, q9, q9\n" | |||||
"veor.32 q10, q10, q10\n" | |||||
"veor.32 q11, q11, q11\n" | |||||
"veor.32 q12, q12, q12\n" | |||||
"veor.32 q13, q13, q13\n" | |||||
"veor.32 q14, q14, q14\n" | |||||
"veor.32 q15, q15, q15\n" | |||||
"2: \n" | |||||
"vld1.32 {d2, d3, d4, d5}, [%[b_ptr]]!\n" | |||||
"vld1.32 {d6, d7}, [%[b_ptr]]!\n" | |||||
"cmp %[K], #0\n" | |||||
"beq 4f\n" | |||||
"3:\n" | |||||
"vld1.32 {d0, d1}, [%[a_ptr]]!\n" | |||||
"vmla.f32 q4, q1, d0[0]\n" | |||||
"vmla.f32 q5, q2, d0[0]\n" | |||||
"vmla.f32 q6, q3, d0[0]\n" | |||||
"vmla.f32 q7, q1, d0[1]\n" | |||||
"vmla.f32 q8, q2, d0[1]\n" | |||||
"vmla.f32 q9, q3, d0[1]\n" | |||||
"vmla.f32 q10, q1, d1[0]\n" | |||||
"vmla.f32 q11, q2, d1[0]\n" | |||||
"vmla.f32 q12, q3, d1[0]\n" | |||||
"vmla.f32 q13, q1, d1[1]\n" | |||||
"vmla.f32 q14, q2, d1[1]\n" | |||||
"vmla.f32 q15, q3, d1[1]\n" | |||||
"vld1.32 {d0, d1}, [%[a_ptr]]!\n" | |||||
"vld1.32 {d2, d3, d4, d5}, [%[b_ptr]]!\n" | |||||
"vld1.32 {d6, d7}, [%[b_ptr]]!\n" | |||||
"vmla.f32 q4, q1, d0[0]\n" | |||||
"vmla.f32 q5, q2, d0[0]\n" | |||||
"vmla.f32 q6, q3, d0[0]\n" | |||||
"vmla.f32 q7, q1, d0[1]\n" | |||||
"vmla.f32 q8, q2, d0[1]\n" | |||||
"vmla.f32 q9, q3, d0[1]\n" | |||||
"vmla.f32 q10, q1, d1[0]\n" | |||||
"vmla.f32 q11, q2, d1[0]\n" | |||||
"vmla.f32 q12, q3, d1[0]\n" | |||||
"vmla.f32 q13, q1, d1[1]\n" | |||||
"vmla.f32 q14, q2, d1[1]\n" | |||||
"vmla.f32 q15, q3, d1[1]\n" | |||||
"vld1.32 {d2, d3, d4, d5}, [%[b_ptr]]!\n" | |||||
"vld1.32 {d6, d7}, [%[b_ptr]]!\n" | |||||
"subs %[K], #1\n" | |||||
"bne 3b\n" | |||||
"4:\n" | |||||
"cmp %[oddk], #1\n" | |||||
"beq 5f\n" | |||||
// Even tail | |||||
"vld1.32 {d0, d1}, [%[a_ptr]]!\n" | |||||
"vmla.f32 q4, q1, d0[0]\n" | |||||
"vmla.f32 q5, q2, d0[0]\n" | |||||
"vmla.f32 q6, q3, d0[0]\n" | |||||
"vmla.f32 q7, q1, d0[1]\n" | |||||
"vmla.f32 q8, q2, d0[1]\n" | |||||
"vmla.f32 q9, q3, d0[1]\n" | |||||
"vmla.f32 q10, q1, d1[0]\n" | |||||
"vmla.f32 q11, q2, d1[0]\n" | |||||
"vmla.f32 q12, q3, d1[0]\n" | |||||
"vmla.f32 q13, q1, d1[1]\n" | |||||
"vmla.f32 q14, q2, d1[1]\n" | |||||
"vmla.f32 q15, q3, d1[1]\n" | |||||
"vld1.32 {d0, d1}, [%[a_ptr]]!\n" | |||||
"vld1.32 {d2, d3, d4, d5}, [%[b_ptr]]!\n" | |||||
"vld1.32 {d6, d7}, [%[b_ptr]]!\n" | |||||
"vmla.f32 q4, q1, d0[0]\n" | |||||
"vmla.f32 q5, q2, d0[0]\n" | |||||
"vmla.f32 q6, q3, d0[0]\n" | |||||
"vmla.f32 q7, q1, d0[1]\n" | |||||
"vmla.f32 q8, q2, d0[1]\n" | |||||
"vmla.f32 q9, q3, d0[1]\n" | |||||
"vmla.f32 q10, q1, d1[0]\n" | |||||
"vmla.f32 q11, q2, d1[0]\n" | |||||
"vmla.f32 q12, q3, d1[0]\n" | |||||
"vmla.f32 q13, q1, d1[1]\n" | |||||
"vmla.f32 q14, q2, d1[1]\n" | |||||
"vmla.f32 q15, q3, d1[1]\n" | |||||
"b 6f\n" | |||||
// odd tail | |||||
"5:\n" | |||||
"vld1.32 {d0, d1}, [%[a_ptr]]!\n" | |||||
"vmla.f32 q4, q1, d0[0]\n" | |||||
"vmla.f32 q5, q2, d0[0]\n" | |||||
"vmla.f32 q6, q3, d0[0]\n" | |||||
"vmla.f32 q7, q1, d0[1]\n" | |||||
"vmla.f32 q8, q2, d0[1]\n" | |||||
"vmla.f32 q9, q3, d0[1]\n" | |||||
"vmla.f32 q10, q1, d1[0]\n" | |||||
"vmla.f32 q11, q2, d1[0]\n" | |||||
"vmla.f32 q12, q3, d1[0]\n" | |||||
"vmla.f32 q13, q1, d1[1]\n" | |||||
"vmla.f32 q14, q2, d1[1]\n" | |||||
"vmla.f32 q15, q3, d1[1]\n" | |||||
"6:\n" STORE_C | |||||
: [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [K] "+r"(K), | |||||
[LDC] "+r"(LDC), [is_first_k] "+r"(is_first_k), | |||||
[oddk] "+r"(oddk), [m_remain] "+r"(m_remain), | |||||
[outptr] "+r"(outptr) | |||||
: | |||||
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", | |||||
"d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", | |||||
"d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", | |||||
"d25", "d26", "d27", "d28", "d29", "d30", "d31", "r1", | |||||
"r2", "r3", "r9", "r10", "cc", "memory"); | |||||
// clang-format on | |||||
asm volatile( | |||||
// load accumulator C | |||||
"add r1, r0, %[LDC]\n" | |||||
"add r2, r1, %[LDC]\n" | |||||
"add r3, r2, %[LDC]\n" | |||||
"cmp %[is_first_k], #1\n" | |||||
"beq 1f\n" LOAD_C | |||||
"b 2f\n" | |||||
"1:\n" | |||||
"veor.32 q4, q4, q4\n" | |||||
"veor.32 q5, q5, q5\n" | |||||
"veor.32 q6, q6, q6\n" | |||||
"veor.32 q7, q7, q7\n" | |||||
"veor.32 q8, q8, q8\n" | |||||
"veor.32 q9, q9, q9\n" | |||||
"veor.32 q10, q10, q10\n" | |||||
"veor.32 q11, q11, q11\n" | |||||
"veor.32 q12, q12, q12\n" | |||||
"veor.32 q13, q13, q13\n" | |||||
"veor.32 q14, q14, q14\n" | |||||
"veor.32 q15, q15, q15\n" | |||||
"2: \n" | |||||
"vld1.32 {d2, d3, d4, d5}, [%[b_ptr]]!\n" | |||||
"vld1.32 {d6, d7}, [%[b_ptr]]!\n" | |||||
"cmp %[K], #0\n" | |||||
"beq 4f\n" | |||||
"3:\n" | |||||
"vld1.32 {d0, d1}, [%[a_ptr]]!\n" | |||||
"vmla.f32 q4, q1, d0[0]\n" | |||||
"vmla.f32 q5, q2, d0[0]\n" | |||||
"vmla.f32 q6, q3, d0[0]\n" | |||||
"vmla.f32 q7, q1, d0[1]\n" | |||||
"vmla.f32 q8, q2, d0[1]\n" | |||||
"vmla.f32 q9, q3, d0[1]\n" | |||||
"vmla.f32 q10, q1, d1[0]\n" | |||||
"vmla.f32 q11, q2, d1[0]\n" | |||||
"vmla.f32 q12, q3, d1[0]\n" | |||||
"vmla.f32 q13, q1, d1[1]\n" | |||||
"vmla.f32 q14, q2, d1[1]\n" | |||||
"vmla.f32 q15, q3, d1[1]\n" | |||||
"vld1.32 {d0, d1}, [%[a_ptr]]!\n" | |||||
"vld1.32 {d2, d3, d4, d5}, [%[b_ptr]]!\n" | |||||
"vld1.32 {d6, d7}, [%[b_ptr]]!\n" | |||||
"vmla.f32 q4, q1, d0[0]\n" | |||||
"vmla.f32 q5, q2, d0[0]\n" | |||||
"vmla.f32 q6, q3, d0[0]\n" | |||||
"vmla.f32 q7, q1, d0[1]\n" | |||||
"vmla.f32 q8, q2, d0[1]\n" | |||||
"vmla.f32 q9, q3, d0[1]\n" | |||||
"vmla.f32 q10, q1, d1[0]\n" | |||||
"vmla.f32 q11, q2, d1[0]\n" | |||||
"vmla.f32 q12, q3, d1[0]\n" | |||||
"vmla.f32 q13, q1, d1[1]\n" | |||||
"vmla.f32 q14, q2, d1[1]\n" | |||||
"vmla.f32 q15, q3, d1[1]\n" | |||||
"vld1.32 {d2, d3, d4, d5}, [%[b_ptr]]!\n" | |||||
"vld1.32 {d6, d7}, [%[b_ptr]]!\n" | |||||
"subs %[K], #1\n" | |||||
"bne 3b\n" | |||||
"4:\n" | |||||
"cmp %[oddk], #1\n" | |||||
"beq 5f\n" | |||||
// Even tail | |||||
"vld1.32 {d0, d1}, [%[a_ptr]]!\n" | |||||
"vmla.f32 q4, q1, d0[0]\n" | |||||
"vmla.f32 q5, q2, d0[0]\n" | |||||
"vmla.f32 q6, q3, d0[0]\n" | |||||
"vmla.f32 q7, q1, d0[1]\n" | |||||
"vmla.f32 q8, q2, d0[1]\n" | |||||
"vmla.f32 q9, q3, d0[1]\n" | |||||
"vmla.f32 q10, q1, d1[0]\n" | |||||
"vmla.f32 q11, q2, d1[0]\n" | |||||
"vmla.f32 q12, q3, d1[0]\n" | |||||
"vmla.f32 q13, q1, d1[1]\n" | |||||
"vmla.f32 q14, q2, d1[1]\n" | |||||
"vmla.f32 q15, q3, d1[1]\n" | |||||
"vld1.32 {d0, d1}, [%[a_ptr]]!\n" | |||||
"vld1.32 {d2, d3, d4, d5}, [%[b_ptr]]!\n" | |||||
"vld1.32 {d6, d7}, [%[b_ptr]]!\n" | |||||
"vmla.f32 q4, q1, d0[0]\n" | |||||
"vmla.f32 q5, q2, d0[0]\n" | |||||
"vmla.f32 q6, q3, d0[0]\n" | |||||
"vmla.f32 q7, q1, d0[1]\n" | |||||
"vmla.f32 q8, q2, d0[1]\n" | |||||
"vmla.f32 q9, q3, d0[1]\n" | |||||
"vmla.f32 q10, q1, d1[0]\n" | |||||
"vmla.f32 q11, q2, d1[0]\n" | |||||
"vmla.f32 q12, q3, d1[0]\n" | |||||
"vmla.f32 q13, q1, d1[1]\n" | |||||
"vmla.f32 q14, q2, d1[1]\n" | |||||
"vmla.f32 q15, q3, d1[1]\n" | |||||
"b 6f\n" | |||||
// odd tail | |||||
"5:\n" | |||||
"vld1.32 {d0, d1}, [%[a_ptr]]!\n" | |||||
"vmla.f32 q4, q1, d0[0]\n" | |||||
"vmla.f32 q5, q2, d0[0]\n" | |||||
"vmla.f32 q6, q3, d0[0]\n" | |||||
"vmla.f32 q7, q1, d0[1]\n" | |||||
"vmla.f32 q8, q2, d0[1]\n" | |||||
"vmla.f32 q9, q3, d0[1]\n" | |||||
"vmla.f32 q10, q1, d1[0]\n" | |||||
"vmla.f32 q11, q2, d1[0]\n" | |||||
"vmla.f32 q12, q3, d1[0]\n" | |||||
"vmla.f32 q13, q1, d1[1]\n" | |||||
"vmla.f32 q14, q2, d1[1]\n" | |||||
"vmla.f32 q15, q3, d1[1]\n" | |||||
"6:\n" STORE_C | |||||
: [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [K] "+r"(K), | |||||
[LDC] "+r"(LDC), [is_first_k] "+r"(is_first_k), [oddk] "+r"(oddk), | |||||
[m_remain] "+r"(m_remain), [outptr] "+r"(outptr) | |||||
: | |||||
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", | |||||
"d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", | |||||
"d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", | |||||
"d29", "d30", "d31", "r1", "r2", "r3", "r9", "r10", "cc", | |||||
"memory"); | |||||
#undef LOAD_LINE | #undef LOAD_LINE | ||||
#undef LOAD_C | #undef LOAD_C | ||||
@@ -316,92 +316,92 @@ void kern_4x4(const float* packA, const float* packB, int K, float* output, | |||||
STORE_LINE("12", "13", "2") \ | STORE_LINE("12", "13", "2") \ | ||||
STORE_LINE("14", "15", "3") \ | STORE_LINE("14", "15", "3") \ | ||||
"105:\n" | "105:\n" | ||||
// clang-format on | |||||
asm volatile( | |||||
// load accumulator C | |||||
"add r1, r0, %[LDC]\n" | |||||
"add r2, r1, %[LDC]\n" | |||||
"add r3, r2, %[LDC]\n" | |||||
"cmp %[is_first_k], #1\n" | |||||
"beq 1f\n" LOAD_C | |||||
"b 2f\n" | |||||
"1:\n" | |||||
"veor.32 q4, q4, q4\n" | |||||
"veor.32 q5, q5, q5\n" | |||||
"veor.32 q6, q6, q6\n" | |||||
"veor.32 q7, q7, q7\n" | |||||
"2: \n" | |||||
"vld1.32 {d0, d1}, [%[a_ptr]]!\n" | |||||
"vld1.32 {d4, d5}, [%[b_ptr]]!\n" | |||||
"cmp %[K], #0\n" | |||||
"beq 4f\n" | |||||
"3:\n" | |||||
"vld1.32 {d2, d3}, [%[a_ptr]]!\n" | |||||
"vld1.32 {d6, d7}, [%[b_ptr]]!\n" | |||||
"vmla.f32 q4, q2, d0[0]\n" | |||||
"vmla.f32 q5, q2, d0[1]\n" | |||||
"vmla.f32 q6, q2, d1[0]\n" | |||||
"vmla.f32 q7, q2, d1[1]\n" | |||||
"vld1.32 {d0, d1}, [%[a_ptr]]!\n" | |||||
"vld1.32 {d4, d5}, [%[b_ptr]]!\n" | |||||
"vmla.f32 q4, q3, d2[0]\n" | |||||
"vmla.f32 q5, q3, d2[1]\n" | |||||
"vmla.f32 q6, q3, d3[0]\n" | |||||
"vmla.f32 q7, q3, d3[1]\n" | |||||
"subs %[K], #1\n" | |||||
"bne 3b\n" | |||||
"4:\n" | |||||
"cmp %[oddk], #1\n" | |||||
"beq 5f\n" | |||||
// Even tail | |||||
"vld1.32 {d2, d3}, [%[a_ptr]]!\n" | |||||
"vld1.32 {d6, d7}, [%[b_ptr]]!\n" | |||||
"vmla.f32 q4, q2, d0[0]\n" | |||||
"vmla.f32 q5, q2, d0[1]\n" | |||||
"vmla.f32 q6, q2, d1[0]\n" | |||||
"vmla.f32 q7, q2, d1[1]\n" | |||||
"vmla.f32 q4, q3, d2[0]\n" | |||||
"vmla.f32 q5, q3, d2[1]\n" | |||||
"vmla.f32 q6, q3, d3[0]\n" | |||||
"vmla.f32 q7, q3, d3[1]\n" | |||||
"b 6f\n" | |||||
// odd tail | |||||
"5:\n" | |||||
"vmla.f32 q4, q2, d0[0]\n" | |||||
"vmla.f32 q5, q2, d0[1]\n" | |||||
"vmla.f32 q6, q2, d1[0]\n" | |||||
"vmla.f32 q7, q2, d1[1]\n" | |||||
"6:\n" STORE_C | |||||
: [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [K] "+r"(K), | |||||
[LDC] "+r"(LDC), [is_first_k] "+r"(is_first_k), | |||||
[oddk] "+r"(oddk), [m_remain] "+r"(m_remain), | |||||
[n_remain] "+r"(n_remain), [outptr] "+r"(outptr) | |||||
: | |||||
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", | |||||
"d9", "d10", "d11", "d12", "d13", "d14", "d15", "r1", | |||||
"r2", "r3", "r10", "cc", "memory"); | |||||
// clang-format on | |||||
asm volatile( | |||||
// load accumulator C | |||||
"add r1, r0, %[LDC]\n" | |||||
"add r2, r1, %[LDC]\n" | |||||
"add r3, r2, %[LDC]\n" | |||||
"cmp %[is_first_k], #1\n" | |||||
"beq 1f\n" LOAD_C | |||||
"b 2f\n" | |||||
"1:\n" | |||||
"veor.32 q4, q4, q4\n" | |||||
"veor.32 q5, q5, q5\n" | |||||
"veor.32 q6, q6, q6\n" | |||||
"veor.32 q7, q7, q7\n" | |||||
"2: \n" | |||||
"vld1.32 {d0, d1}, [%[a_ptr]]!\n" | |||||
"vld1.32 {d4, d5}, [%[b_ptr]]!\n" | |||||
"cmp %[K], #0\n" | |||||
"beq 4f\n" | |||||
"3:\n" | |||||
"vld1.32 {d2, d3}, [%[a_ptr]]!\n" | |||||
"vld1.32 {d6, d7}, [%[b_ptr]]!\n" | |||||
"vmla.f32 q4, q2, d0[0]\n" | |||||
"vmla.f32 q5, q2, d0[1]\n" | |||||
"vmla.f32 q6, q2, d1[0]\n" | |||||
"vmla.f32 q7, q2, d1[1]\n" | |||||
"vld1.32 {d0, d1}, [%[a_ptr]]!\n" | |||||
"vld1.32 {d4, d5}, [%[b_ptr]]!\n" | |||||
"vmla.f32 q4, q3, d2[0]\n" | |||||
"vmla.f32 q5, q3, d2[1]\n" | |||||
"vmla.f32 q6, q3, d3[0]\n" | |||||
"vmla.f32 q7, q3, d3[1]\n" | |||||
"subs %[K], #1\n" | |||||
"bne 3b\n" | |||||
"4:\n" | |||||
"cmp %[oddk], #1\n" | |||||
"beq 5f\n" | |||||
// Even tail | |||||
"vld1.32 {d2, d3}, [%[a_ptr]]!\n" | |||||
"vld1.32 {d6, d7}, [%[b_ptr]]!\n" | |||||
"vmla.f32 q4, q2, d0[0]\n" | |||||
"vmla.f32 q5, q2, d0[1]\n" | |||||
"vmla.f32 q6, q2, d1[0]\n" | |||||
"vmla.f32 q7, q2, d1[1]\n" | |||||
"vmla.f32 q4, q3, d2[0]\n" | |||||
"vmla.f32 q5, q3, d2[1]\n" | |||||
"vmla.f32 q6, q3, d3[0]\n" | |||||
"vmla.f32 q7, q3, d3[1]\n" | |||||
"b 6f\n" | |||||
// odd tail | |||||
"5:\n" | |||||
"vmla.f32 q4, q2, d0[0]\n" | |||||
"vmla.f32 q5, q2, d0[1]\n" | |||||
"vmla.f32 q6, q2, d1[0]\n" | |||||
"vmla.f32 q7, q2, d1[1]\n" | |||||
"6:\n" STORE_C | |||||
: [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [K] "+r"(K), | |||||
[LDC] "+r"(LDC), [is_first_k] "+r"(is_first_k), [oddk] "+r"(oddk), | |||||
[m_remain] "+r"(m_remain), [n_remain] "+r"(n_remain), | |||||
[outptr] "+r"(outptr) | |||||
: | |||||
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", | |||||
"d11", "d12", "d13", "d14", "d15", "r1", "r2", "r3", "r10", "cc", | |||||
"memory"); | |||||
#undef LOAD_LINE | #undef LOAD_LINE | ||||
#undef LOAD_C | #undef LOAD_C | ||||
#undef STORE_LINE | #undef STORE_LINE | ||||
#undef STORE_C | #undef STORE_C | ||||
} | } | ||||
void sgemm_4x12_pack_A_n(float * outptr, const float * inptr, int ldin, int y0, | |||||
void sgemm_4x12_pack_A_n(float* outptr, const float* inptr, int ldin, int y0, | |||||
int ymax, int k0, int kmax) { | int ymax, int k0, int kmax) { | ||||
float zerobuff[4]; | float zerobuff[4]; | ||||
std::memset(zerobuff, 0, sizeof(float) * 4); | std::memset(zerobuff, 0, sizeof(float) * 4); | ||||
@@ -444,8 +444,10 @@ void sgemm_4x12_pack_A_n(float * outptr, const float * inptr, int ldin, int y0, | |||||
/* Everything falls through in here */ | /* Everything falls through in here */ | ||||
case 2: | case 2: | ||||
inptr1 = zerobuff; | inptr1 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 1: | case 1: | ||||
inptr2 = zerobuff; | inptr2 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 0: | case 0: | ||||
inptr3 = zerobuff; | inptr3 = zerobuff; | ||||
break; | break; | ||||
@@ -463,8 +465,10 @@ void sgemm_4x12_pack_A_n(float * outptr, const float * inptr, int ldin, int y0, | |||||
/* Everything falls through in here */ | /* Everything falls through in here */ | ||||
case 2: | case 2: | ||||
inptr1 = zerobuff; | inptr1 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 1: | case 1: | ||||
inptr2 = zerobuff; | inptr2 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 0: | case 0: | ||||
inptr3 = zerobuff; | inptr3 = zerobuff; | ||||
break; | break; | ||||
@@ -530,8 +534,8 @@ void sgemm_4x12_pack_A_t(float* out, const float* in, int ldin, int x0, | |||||
} | } | ||||
} | } | ||||
void sgemm_4x12_pack_B_n(float* out, const float* in, int ldin, | |||||
int x0, int xmax, int k0, int kmax) { | |||||
void sgemm_4x12_pack_B_n(float* out, const float* in, int ldin, int x0, | |||||
int xmax, int k0, int kmax) { | |||||
int ksize = kmax - k0; | int ksize = kmax - k0; | ||||
int ksize12 = ksize * 12; | int ksize12 = ksize * 12; | ||||
int ksize4 = (ksize << 2); | int ksize4 = (ksize << 2); | ||||
@@ -600,8 +604,8 @@ void sgemm_4x12_pack_B_n(float* out, const float* in, int ldin, | |||||
} | } | ||||
} | } | ||||
void sgemm_4x12_pack_B_t(float* out, const float* in, int ldin, | |||||
int y0, int ymax, int k0, int kmax) { | |||||
void sgemm_4x12_pack_B_t(float* out, const float* in, int ldin, int y0, | |||||
int ymax, int k0, int kmax) { | |||||
float* outptr = out; | float* outptr = out; | ||||
const float* inptr = in; | const float* inptr = in; | ||||
float zerobuff[4]; | float zerobuff[4]; | ||||
@@ -660,8 +664,10 @@ void sgemm_4x12_pack_B_t(float* out, const float* in, int ldin, | |||||
/* Everything falls through in here */ | /* Everything falls through in here */ | ||||
case 2: | case 2: | ||||
inptr1 = zerobuff; | inptr1 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 1: | case 1: | ||||
inptr2 = zerobuff; | inptr2 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 0: | case 0: | ||||
inptr3 = zerobuff; | inptr3 = zerobuff; | ||||
break; | break; | ||||
@@ -679,8 +685,10 @@ void sgemm_4x12_pack_B_t(float* out, const float* in, int ldin, | |||||
/* Everything falls through in here */ | /* Everything falls through in here */ | ||||
case 2: | case 2: | ||||
inptr1 = zerobuff; | inptr1 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 1: | case 1: | ||||
inptr2 = zerobuff; | inptr2 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 0: | case 0: | ||||
inptr3 = zerobuff; | inptr3 = zerobuff; | ||||
break; | break; | ||||
@@ -697,8 +705,8 @@ void sgemm_4x12_pack_B_t(float* out, const float* in, int ldin, | |||||
MEGDNN_REG_GEMM_STRATEGY_IMPL(sgemm_4x12); | MEGDNN_REG_GEMM_STRATEGY_IMPL(sgemm_4x12); | ||||
void sgemm_4x12::pack_A(float* out, const float* in, int ldin, int y0, | |||||
int ymax, int k0, int kmax, bool transpose_A) const { | |||||
void sgemm_4x12::pack_A(float* out, const float* in, int ldin, int y0, int ymax, | |||||
int k0, int kmax, bool transpose_A) const { | |||||
if (transpose_A) { | if (transpose_A) { | ||||
sgemm_4x12_pack_A_t(out, in, ldin, y0, ymax, k0, kmax); | sgemm_4x12_pack_A_t(out, in, ldin, y0, ymax, k0, kmax); | ||||
} else { | } else { | ||||
@@ -715,9 +723,9 @@ void sgemm_4x12::pack_B(float* out, const float* in, int ldin, int x0, int xmax, | |||||
} | } | ||||
} | } | ||||
void sgemm_4x12::kern(const float* packA, const float* packB, | |||||
size_t M, size_t N, size_t K, float* C, size_t LDC, | |||||
bool is_first_k, const float*, float*) const { | |||||
void sgemm_4x12::kern(const float* packA, const float* packB, size_t M, | |||||
size_t N, size_t K, float* C, size_t LDC, bool is_first_k, | |||||
const float*, float*) const { | |||||
megdnn_assert(A_dtype.enumv() == B_dtype.enumv() && | megdnn_assert(A_dtype.enumv() == B_dtype.enumv() && | ||||
A_dtype.enumv() == C_dtype.enumv() && | A_dtype.enumv() == C_dtype.enumv() && | ||||
A_dtype.enumv() == DTypeEnum::Float32); | A_dtype.enumv() == DTypeEnum::Float32); | ||||
@@ -6,7 +6,8 @@ | |||||
* | * | ||||
* Unless required by applicable law or agreed to in writing, | * Unless required by applicable law or agreed to in writing, | ||||
* software distributed under the License is distributed on an | * software distributed under the License is distributed on an | ||||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
* implied. | |||||
*/ | */ | ||||
#include "src/arm_common/simd_macro/marm_neon.h" | #include "src/arm_common/simd_macro/marm_neon.h" | ||||
@@ -988,8 +989,10 @@ static void gemm_s16x16x32_12x4_transpose_pack_B_n(dt_int16* outptr, | |||||
switch (y + 3 - ymax) { | switch (y + 3 - ymax) { | ||||
case 2: | case 2: | ||||
inptr1 = zerobuff; | inptr1 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 1: | case 1: | ||||
inptr2 = zerobuff; | inptr2 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 0: | case 0: | ||||
inptr3 = zerobuff; | inptr3 = zerobuff; | ||||
break; | break; | ||||
@@ -6,7 +6,8 @@ | |||||
* | * | ||||
* Unless required by applicable law or agreed to in writing, | * Unless required by applicable law or agreed to in writing, | ||||
* software distributed under the License is distributed on an | * software distributed under the License is distributed on an | ||||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
* implied. | |||||
*/ | */ | ||||
#include "src/arm_common/simd_macro/marm_neon.h" | #include "src/arm_common/simd_macro/marm_neon.h" | ||||
@@ -330,9 +331,9 @@ static void gemm_s8_4x2_pack_A_n(dt_int8* outptr, const dt_int8* inptr, | |||||
if (y + 3 >= ymax) { | if (y + 3 >= ymax) { | ||||
switch (y + 3 - ymax) { | switch (y + 3 - ymax) { | ||||
case 2: | case 2: | ||||
inptr1 = zerobuff; | |||||
inptr1 = zerobuff;MEGDNN_FALLTHRU | |||||
case 1: | case 1: | ||||
inptr2 = zerobuff; | |||||
inptr2 = zerobuff;MEGDNN_FALLTHRU | |||||
case 0: | case 0: | ||||
inptr3 = zerobuff; | inptr3 = zerobuff; | ||||
break; | break; | ||||
@@ -347,9 +348,9 @@ static void gemm_s8_4x2_pack_A_n(dt_int8* outptr, const dt_int8* inptr, | |||||
if (y + 3 >= ymax) { | if (y + 3 >= ymax) { | ||||
switch (y + 3 - ymax) { | switch (y + 3 - ymax) { | ||||
case 2: | case 2: | ||||
inptr1 = zerobuff; | |||||
inptr1 = zerobuff;MEGDNN_FALLTHRU | |||||
case 1: | case 1: | ||||
inptr2 = zerobuff; | |||||
inptr2 = zerobuff;MEGDNN_FALLTHRU | |||||
case 0: | case 0: | ||||
inptr3 = zerobuff; | inptr3 = zerobuff; | ||||
break; | break; | ||||
@@ -390,19 +391,19 @@ static void gemm_s8_4x2_pack_A_t(dt_int8* out, const dt_int8* in, int ldin, | |||||
if (remain >= 0) { | if (remain >= 0) { | ||||
switch (remain) { | switch (remain) { | ||||
case 7: | case 7: | ||||
inptr0 = zerobuff; | |||||
inptr0 = zerobuff;MEGDNN_FALLTHRU | |||||
case 6: | case 6: | ||||
inptr1 = zerobuff; | |||||
inptr1 = zerobuff;MEGDNN_FALLTHRU | |||||
case 5: | case 5: | ||||
inptr2 = zerobuff; | |||||
inptr2 = zerobuff;MEGDNN_FALLTHRU | |||||
case 4: | case 4: | ||||
inptr3 = zerobuff; | |||||
inptr3 = zerobuff;MEGDNN_FALLTHRU | |||||
case 3: | case 3: | ||||
inptr4 = zerobuff; | |||||
inptr4 = zerobuff;MEGDNN_FALLTHRU | |||||
case 2: | case 2: | ||||
inptr5 = zerobuff; | |||||
inptr5 = zerobuff;MEGDNN_FALLTHRU | |||||
case 1: | case 1: | ||||
inptr6 = zerobuff; | |||||
inptr6 = zerobuff;MEGDNN_FALLTHRU | |||||
case 0: | case 0: | ||||
inptr7 = zerobuff; | inptr7 = zerobuff; | ||||
break; | break; | ||||
@@ -421,19 +422,19 @@ static void gemm_s8_4x2_pack_A_t(dt_int8* out, const dt_int8* in, int ldin, | |||||
if (remain >= 0) { | if (remain >= 0) { | ||||
switch (remain) { | switch (remain) { | ||||
case 7: | case 7: | ||||
inptr0 = zerobuff; | |||||
inptr0 = zerobuff;MEGDNN_FALLTHRU | |||||
case 6: | case 6: | ||||
inptr1 = zerobuff; | |||||
inptr1 = zerobuff;MEGDNN_FALLTHRU | |||||
case 5: | case 5: | ||||
inptr2 = zerobuff; | |||||
inptr2 = zerobuff;MEGDNN_FALLTHRU | |||||
case 4: | case 4: | ||||
inptr3 = zerobuff; | |||||
inptr3 = zerobuff;MEGDNN_FALLTHRU | |||||
case 3: | case 3: | ||||
inptr4 = zerobuff; | |||||
inptr4 = zerobuff;MEGDNN_FALLTHRU | |||||
case 2: | case 2: | ||||
inptr5 = zerobuff; | |||||
inptr5 = zerobuff;MEGDNN_FALLTHRU | |||||
case 1: | case 1: | ||||
inptr6 = zerobuff; | |||||
inptr6 = zerobuff;MEGDNN_FALLTHRU | |||||
case 0: | case 0: | ||||
inptr7 = zerobuff; | inptr7 = zerobuff; | ||||
break; | break; | ||||
@@ -488,19 +489,19 @@ static void gemm_s8_4x2_pack_B_n(dt_int8* out, const dt_int8* in, int ldin, | |||||
if (remain >= 0) { | if (remain >= 0) { | ||||
switch (remain) { | switch (remain) { | ||||
case 7: | case 7: | ||||
inptr0 = zerobuff; | |||||
inptr0 = zerobuff;MEGDNN_FALLTHRU | |||||
case 6: | case 6: | ||||
inptr1 = zerobuff; | |||||
inptr1 = zerobuff;MEGDNN_FALLTHRU | |||||
case 5: | case 5: | ||||
inptr2 = zerobuff; | |||||
inptr2 = zerobuff;MEGDNN_FALLTHRU | |||||
case 4: | case 4: | ||||
inptr3 = zerobuff; | |||||
inptr3 = zerobuff;MEGDNN_FALLTHRU | |||||
case 3: | case 3: | ||||
inptr4 = zerobuff; | |||||
inptr4 = zerobuff;MEGDNN_FALLTHRU | |||||
case 2: | case 2: | ||||
inptr5 = zerobuff; | |||||
inptr5 = zerobuff;MEGDNN_FALLTHRU | |||||
case 1: | case 1: | ||||
inptr6 = zerobuff; | |||||
inptr6 = zerobuff;MEGDNN_FALLTHRU | |||||
case 0: | case 0: | ||||
inptr7 = zerobuff; | inptr7 = zerobuff; | ||||
break; | break; | ||||
@@ -519,19 +520,19 @@ static void gemm_s8_4x2_pack_B_n(dt_int8* out, const dt_int8* in, int ldin, | |||||
if (remain >= 0) { | if (remain >= 0) { | ||||
switch (remain) { | switch (remain) { | ||||
case 7: | case 7: | ||||
inptr0 = zerobuff; | |||||
inptr0 = zerobuff;MEGDNN_FALLTHRU | |||||
case 6: | case 6: | ||||
inptr1 = zerobuff; | |||||
inptr1 = zerobuff;MEGDNN_FALLTHRU | |||||
case 5: | case 5: | ||||
inptr2 = zerobuff; | |||||
inptr2 = zerobuff;MEGDNN_FALLTHRU | |||||
case 4: | case 4: | ||||
inptr3 = zerobuff; | |||||
inptr3 = zerobuff;MEGDNN_FALLTHRU | |||||
case 3: | case 3: | ||||
inptr4 = zerobuff; | |||||
inptr4 = zerobuff;MEGDNN_FALLTHRU | |||||
case 2: | case 2: | ||||
inptr5 = zerobuff; | |||||
inptr5 = zerobuff;MEGDNN_FALLTHRU | |||||
case 1: | case 1: | ||||
inptr6 = zerobuff; | |||||
inptr6 = zerobuff;MEGDNN_FALLTHRU | |||||
case 0: | case 0: | ||||
inptr7 = zerobuff; | inptr7 = zerobuff; | ||||
break; | break; | ||||
@@ -6,7 +6,8 @@ | |||||
* | * | ||||
* Unless required by applicable law or agreed to in writing, | * Unless required by applicable law or agreed to in writing, | ||||
* software distributed under the License is distributed on an | * software distributed under the License is distributed on an | ||||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
* implied. | |||||
*/ | */ | ||||
#include "src/arm_common/simd_macro/marm_neon.h" | #include "src/arm_common/simd_macro/marm_neon.h" | ||||
@@ -378,7 +379,6 @@ static void kern_4x4(const int8_t* packA, const int8_t* packB, int K, | |||||
#undef STORE_C | #undef STORE_C | ||||
} | } | ||||
static void gemm_s8_4x8_pack_A_n(dt_int8* outptr, const dt_int8* inptr, | static void gemm_s8_4x8_pack_A_n(dt_int8* outptr, const dt_int8* inptr, | ||||
int ldin, int y0, int ymax, int k0, int kmax) { | int ldin, int y0, int ymax, int k0, int kmax) { | ||||
int8_t zerobuff[16]; | int8_t zerobuff[16]; | ||||
@@ -402,8 +402,10 @@ static void gemm_s8_4x8_pack_A_n(dt_int8* outptr, const dt_int8* inptr, | |||||
switch (y + 3 - ymax) { | switch (y + 3 - ymax) { | ||||
case 2: | case 2: | ||||
inptr1 = zerobuff; | inptr1 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 1: | case 1: | ||||
inptr2 = zerobuff; | inptr2 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 0: | case 0: | ||||
inptr3 = zerobuff; | inptr3 = zerobuff; | ||||
break; | break; | ||||
@@ -420,8 +422,10 @@ static void gemm_s8_4x8_pack_A_n(dt_int8* outptr, const dt_int8* inptr, | |||||
switch (y + 3 - ymax) { | switch (y + 3 - ymax) { | ||||
case 2: | case 2: | ||||
inptr1 = zerobuff; | inptr1 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 1: | case 1: | ||||
inptr2 = zerobuff; | inptr2 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 0: | case 0: | ||||
inptr3 = zerobuff; | inptr3 = zerobuff; | ||||
break; | break; | ||||
@@ -472,16 +476,22 @@ static void gemm_s8_4x8_transpose_pack_A_n(dt_int8* out, const dt_int8* in, | |||||
switch (k + 7 - kmax) { | switch (k + 7 - kmax) { | ||||
case 6: | case 6: | ||||
inptr1 = zerobuff; | inptr1 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 5: | case 5: | ||||
inptr2 = zerobuff; | inptr2 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 4: | case 4: | ||||
inptr3 = zerobuff; | inptr3 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 3: | case 3: | ||||
inptr4 = zerobuff; | inptr4 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 2: | case 2: | ||||
inptr5 = zerobuff; | inptr5 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 1: | case 1: | ||||
inptr6 = zerobuff; | inptr6 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 0: | case 0: | ||||
inptr7 = zerobuff; | inptr7 = zerobuff; | ||||
break; | break; | ||||
@@ -500,16 +510,22 @@ static void gemm_s8_4x8_transpose_pack_A_n(dt_int8* out, const dt_int8* in, | |||||
switch (k + 7 - kmax) { | switch (k + 7 - kmax) { | ||||
case 6: | case 6: | ||||
inptr1 = zerobuff; | inptr1 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 5: | case 5: | ||||
inptr2 = zerobuff; | inptr2 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 4: | case 4: | ||||
inptr3 = zerobuff; | inptr3 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 3: | case 3: | ||||
inptr4 = zerobuff; | inptr4 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 2: | case 2: | ||||
inptr5 = zerobuff; | inptr5 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 1: | case 1: | ||||
inptr6 = zerobuff; | inptr6 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 0: | case 0: | ||||
inptr7 = zerobuff; | inptr7 = zerobuff; | ||||
break; | break; | ||||
@@ -565,16 +581,22 @@ static void gemm_s8_4x8_pack_B_n(dt_int8* out, const dt_int8* in, int ldin, | |||||
switch (k + 7 - kmax) { | switch (k + 7 - kmax) { | ||||
case 6: | case 6: | ||||
inptr1 = zerobuff; | inptr1 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 5: | case 5: | ||||
inptr2 = zerobuff; | inptr2 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 4: | case 4: | ||||
inptr3 = zerobuff; | inptr3 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 3: | case 3: | ||||
inptr4 = zerobuff; | inptr4 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 2: | case 2: | ||||
inptr5 = zerobuff; | inptr5 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 1: | case 1: | ||||
inptr6 = zerobuff; | inptr6 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 0: | case 0: | ||||
inptr7 = zerobuff; | inptr7 = zerobuff; | ||||
break; | break; | ||||
@@ -594,16 +616,22 @@ static void gemm_s8_4x8_pack_B_n(dt_int8* out, const dt_int8* in, int ldin, | |||||
switch (k + 7 - kmax) { | switch (k + 7 - kmax) { | ||||
case 6: | case 6: | ||||
inptr1 = zerobuff; | inptr1 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 5: | case 5: | ||||
inptr2 = zerobuff; | inptr2 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 4: | case 4: | ||||
inptr3 = zerobuff; | inptr3 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 3: | case 3: | ||||
inptr4 = zerobuff; | inptr4 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 2: | case 2: | ||||
inptr5 = zerobuff; | inptr5 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 1: | case 1: | ||||
inptr6 = zerobuff; | inptr6 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 0: | case 0: | ||||
inptr7 = zerobuff; | inptr7 = zerobuff; | ||||
break; | break; | ||||
@@ -623,16 +651,22 @@ static void gemm_s8_4x8_pack_B_n(dt_int8* out, const dt_int8* in, int ldin, | |||||
switch (k + 7 - kmax) { | switch (k + 7 - kmax) { | ||||
case 6: | case 6: | ||||
inptr1 = zerobuff; | inptr1 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 5: | case 5: | ||||
inptr2 = zerobuff; | inptr2 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 4: | case 4: | ||||
inptr3 = zerobuff; | inptr3 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 3: | case 3: | ||||
inptr4 = zerobuff; | inptr4 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 2: | case 2: | ||||
inptr5 = zerobuff; | inptr5 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 1: | case 1: | ||||
inptr6 = zerobuff; | inptr6 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 0: | case 0: | ||||
inptr7 = zerobuff; | inptr7 = zerobuff; | ||||
break; | break; | ||||
@@ -710,8 +744,10 @@ static void gemm_s8_4x8_transpose_pack_B_n(dt_int8* outptr, | |||||
switch (y + 3 - ymax) { | switch (y + 3 - ymax) { | ||||
case 2: | case 2: | ||||
inptr1 = zerobuff; | inptr1 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 1: | case 1: | ||||
inptr2 = zerobuff; | inptr2 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 0: | case 0: | ||||
inptr3 = zerobuff; | inptr3 = zerobuff; | ||||
break; | break; | ||||
@@ -729,8 +765,10 @@ static void gemm_s8_4x8_transpose_pack_B_n(dt_int8* outptr, | |||||
switch (y + 3 - ymax) { | switch (y + 3 - ymax) { | ||||
case 2: | case 2: | ||||
inptr1 = zerobuff; | inptr1 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 1: | case 1: | ||||
inptr2 = zerobuff; | inptr2 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 0: | case 0: | ||||
inptr3 = zerobuff; | inptr3 = zerobuff; | ||||
break; | break; | ||||
@@ -6,7 +6,8 @@ | |||||
* | * | ||||
* Unless required by applicable law or agreed to in writing, | * Unless required by applicable law or agreed to in writing, | ||||
* software distributed under the License is distributed on an | * software distributed under the License is distributed on an | ||||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
* implied. | |||||
*/ | */ | ||||
#include "src/arm_common/simd_macro/marm_neon.h" | #include "src/arm_common/simd_macro/marm_neon.h" | ||||
@@ -323,8 +324,10 @@ static void gemm_mk4_s8_4x2_pack_B(dt_int8* out, const dt_int8* in, int ldin, | |||||
switch (k + 3 - ICB) { | switch (k + 3 - ICB) { | ||||
case 2: | case 2: | ||||
inptr1 = zerobuff; | inptr1 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 1: | case 1: | ||||
inptr2 = zerobuff; | inptr2 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 0: | case 0: | ||||
inptr3 = zerobuff; | inptr3 = zerobuff; | ||||
break; | break; | ||||
@@ -340,8 +343,10 @@ static void gemm_mk4_s8_4x2_pack_B(dt_int8* out, const dt_int8* in, int ldin, | |||||
switch (k + 3 - ICB) { | switch (k + 3 - ICB) { | ||||
case 2: | case 2: | ||||
inptr1 = zerobuff; | inptr1 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 1: | case 1: | ||||
inptr2 = zerobuff; | inptr2 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 0: | case 0: | ||||
inptr3 = zerobuff; | inptr3 = zerobuff; | ||||
break; | break; | ||||
@@ -6,7 +6,8 @@ | |||||
* | * | ||||
* Unless required by applicable law or agreed to in writing, | * Unless required by applicable law or agreed to in writing, | ||||
* software distributed under the License is distributed on an | * software distributed under the License is distributed on an | ||||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
* implied. | |||||
*/ | */ | ||||
#include "src/arm_common/simd_macro/marm_neon.h" | #include "src/arm_common/simd_macro/marm_neon.h" | ||||
@@ -278,9 +279,9 @@ static void gemm_s8x8x16_4x2_pack_A_n(dt_int8* outptr, const dt_int8* inptr, | |||||
if (y + 3 >= ymax) { | if (y + 3 >= ymax) { | ||||
switch (y + 3 - ymax) { | switch (y + 3 - ymax) { | ||||
case 2: | case 2: | ||||
inptr1 = zerobuff; | |||||
inptr1 = zerobuff;MEGDNN_FALLTHRU | |||||
case 1: | case 1: | ||||
inptr2 = zerobuff; | |||||
inptr2 = zerobuff;MEGDNN_FALLTHRU | |||||
case 0: | case 0: | ||||
inptr3 = zerobuff; | inptr3 = zerobuff; | ||||
break; | break; | ||||
@@ -295,9 +296,9 @@ static void gemm_s8x8x16_4x2_pack_A_n(dt_int8* outptr, const dt_int8* inptr, | |||||
if (y + 3 >= ymax) { | if (y + 3 >= ymax) { | ||||
switch (y + 3 - ymax) { | switch (y + 3 - ymax) { | ||||
case 2: | case 2: | ||||
inptr1 = zerobuff; | |||||
inptr1 = zerobuff;MEGDNN_FALLTHRU | |||||
case 1: | case 1: | ||||
inptr2 = zerobuff; | |||||
inptr2 = zerobuff;MEGDNN_FALLTHRU | |||||
case 0: | case 0: | ||||
inptr3 = zerobuff; | inptr3 = zerobuff; | ||||
break; | break; | ||||
@@ -338,19 +339,19 @@ static void gemm_s8x8x16_4x2_pack_A_t(dt_int8* out, const dt_int8* in, int ldin, | |||||
if (remain >= 0) { | if (remain >= 0) { | ||||
switch (remain) { | switch (remain) { | ||||
case 7: | case 7: | ||||
inptr0 = zerobuff; | |||||
inptr0 = zerobuff;MEGDNN_FALLTHRU | |||||
case 6: | case 6: | ||||
inptr1 = zerobuff; | |||||
inptr1 = zerobuff;MEGDNN_FALLTHRU | |||||
case 5: | case 5: | ||||
inptr2 = zerobuff; | |||||
inptr2 = zerobuff;MEGDNN_FALLTHRU | |||||
case 4: | case 4: | ||||
inptr3 = zerobuff; | |||||
inptr3 = zerobuff;MEGDNN_FALLTHRU | |||||
case 3: | case 3: | ||||
inptr4 = zerobuff; | |||||
inptr4 = zerobuff;MEGDNN_FALLTHRU | |||||
case 2: | case 2: | ||||
inptr5 = zerobuff; | |||||
inptr5 = zerobuff;MEGDNN_FALLTHRU | |||||
case 1: | case 1: | ||||
inptr6 = zerobuff; | |||||
inptr6 = zerobuff;MEGDNN_FALLTHRU | |||||
case 0: | case 0: | ||||
inptr7 = zerobuff; | inptr7 = zerobuff; | ||||
break; | break; | ||||
@@ -369,19 +370,19 @@ static void gemm_s8x8x16_4x2_pack_A_t(dt_int8* out, const dt_int8* in, int ldin, | |||||
if (remain >= 0) { | if (remain >= 0) { | ||||
switch (remain) { | switch (remain) { | ||||
case 7: | case 7: | ||||
inptr0 = zerobuff; | |||||
inptr0 = zerobuff;MEGDNN_FALLTHRU | |||||
case 6: | case 6: | ||||
inptr1 = zerobuff; | |||||
inptr1 = zerobuff;MEGDNN_FALLTHRU | |||||
case 5: | case 5: | ||||
inptr2 = zerobuff; | |||||
inptr2 = zerobuff;MEGDNN_FALLTHRU | |||||
case 4: | case 4: | ||||
inptr3 = zerobuff; | |||||
inptr3 = zerobuff;MEGDNN_FALLTHRU | |||||
case 3: | case 3: | ||||
inptr4 = zerobuff; | |||||
inptr4 = zerobuff;MEGDNN_FALLTHRU | |||||
case 2: | case 2: | ||||
inptr5 = zerobuff; | |||||
inptr5 = zerobuff;MEGDNN_FALLTHRU | |||||
case 1: | case 1: | ||||
inptr6 = zerobuff; | |||||
inptr6 = zerobuff;MEGDNN_FALLTHRU | |||||
case 0: | case 0: | ||||
inptr7 = zerobuff; | inptr7 = zerobuff; | ||||
break; | break; | ||||
@@ -436,19 +437,19 @@ static void gemm_s8x8x16_4x2_pack_B_n(dt_int8* out, const dt_int8* in, int ldin, | |||||
if (remain >= 0) { | if (remain >= 0) { | ||||
switch (remain) { | switch (remain) { | ||||
case 7: | case 7: | ||||
inptr0 = zerobuff; | |||||
inptr0 = zerobuff;MEGDNN_FALLTHRU | |||||
case 6: | case 6: | ||||
inptr1 = zerobuff; | |||||
inptr1 = zerobuff;MEGDNN_FALLTHRU | |||||
case 5: | case 5: | ||||
inptr2 = zerobuff; | |||||
inptr2 = zerobuff;MEGDNN_FALLTHRU | |||||
case 4: | case 4: | ||||
inptr3 = zerobuff; | |||||
inptr3 = zerobuff;MEGDNN_FALLTHRU | |||||
case 3: | case 3: | ||||
inptr4 = zerobuff; | |||||
inptr4 = zerobuff;MEGDNN_FALLTHRU | |||||
case 2: | case 2: | ||||
inptr5 = zerobuff; | |||||
inptr5 = zerobuff;MEGDNN_FALLTHRU | |||||
case 1: | case 1: | ||||
inptr6 = zerobuff; | |||||
inptr6 = zerobuff;MEGDNN_FALLTHRU | |||||
case 0: | case 0: | ||||
inptr7 = zerobuff; | inptr7 = zerobuff; | ||||
break; | break; | ||||
@@ -467,19 +468,19 @@ static void gemm_s8x8x16_4x2_pack_B_n(dt_int8* out, const dt_int8* in, int ldin, | |||||
if (remain >= 0) { | if (remain >= 0) { | ||||
switch (remain) { | switch (remain) { | ||||
case 7: | case 7: | ||||
inptr0 = zerobuff; | |||||
inptr0 = zerobuff;MEGDNN_FALLTHRU | |||||
case 6: | case 6: | ||||
inptr1 = zerobuff; | |||||
inptr1 = zerobuff;MEGDNN_FALLTHRU | |||||
case 5: | case 5: | ||||
inptr2 = zerobuff; | |||||
inptr2 = zerobuff;MEGDNN_FALLTHRU | |||||
case 4: | case 4: | ||||
inptr3 = zerobuff; | |||||
inptr3 = zerobuff;MEGDNN_FALLTHRU | |||||
case 3: | case 3: | ||||
inptr4 = zerobuff; | |||||
inptr4 = zerobuff;MEGDNN_FALLTHRU | |||||
case 2: | case 2: | ||||
inptr5 = zerobuff; | |||||
inptr5 = zerobuff;MEGDNN_FALLTHRU | |||||
case 1: | case 1: | ||||
inptr6 = zerobuff; | |||||
inptr6 = zerobuff;MEGDNN_FALLTHRU | |||||
case 0: | case 0: | ||||
inptr7 = zerobuff; | inptr7 = zerobuff; | ||||
break; | break; | ||||
@@ -6,7 +6,8 @@ | |||||
* | * | ||||
* Unless required by applicable law or agreed to in writing, | * Unless required by applicable law or agreed to in writing, | ||||
* software distributed under the License is distributed on an | * software distributed under the License is distributed on an | ||||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
* implied. | |||||
*/ | */ | ||||
#include "src/arm_common/simd_macro/marm_neon.h" | #include "src/arm_common/simd_macro/marm_neon.h" | ||||
@@ -76,7 +77,7 @@ static void kern_4x8(const int8_t* packA, const int8_t* packB, int K, | |||||
STORE_LINE("14", "15", "3") \ | STORE_LINE("14", "15", "3") \ | ||||
"101:\n" | "101:\n" | ||||
// clang-format on | |||||
// clang-format on | |||||
register int16_t* outptr asm("r0") = output; | register int16_t* outptr asm("r0") = output; | ||||
asm volatile( | asm volatile( | ||||
@@ -406,8 +407,10 @@ static void gemm_s8x8x16_4x8_pack_A_n(dt_int8* outptr, const dt_int8* inptr, | |||||
switch (y + 3 - ymax) { | switch (y + 3 - ymax) { | ||||
case 2: | case 2: | ||||
inptr1 = zerobuff; | inptr1 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 1: | case 1: | ||||
inptr2 = zerobuff; | inptr2 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 0: | case 0: | ||||
inptr3 = zerobuff; | inptr3 = zerobuff; | ||||
break; | break; | ||||
@@ -424,8 +427,10 @@ static void gemm_s8x8x16_4x8_pack_A_n(dt_int8* outptr, const dt_int8* inptr, | |||||
switch (y + 3 - ymax) { | switch (y + 3 - ymax) { | ||||
case 2: | case 2: | ||||
inptr1 = zerobuff; | inptr1 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 1: | case 1: | ||||
inptr2 = zerobuff; | inptr2 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 0: | case 0: | ||||
inptr3 = zerobuff; | inptr3 = zerobuff; | ||||
break; | break; | ||||
@@ -476,16 +481,22 @@ static void gemm_s8x8x16_4x8_transpose_pack_A_n(dt_int8* out, const dt_int8* in, | |||||
switch (k + 7 - kmax) { | switch (k + 7 - kmax) { | ||||
case 6: | case 6: | ||||
inptr1 = zerobuff; | inptr1 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 5: | case 5: | ||||
inptr2 = zerobuff; | inptr2 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 4: | case 4: | ||||
inptr3 = zerobuff; | inptr3 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 3: | case 3: | ||||
inptr4 = zerobuff; | inptr4 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 2: | case 2: | ||||
inptr5 = zerobuff; | inptr5 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 1: | case 1: | ||||
inptr6 = zerobuff; | inptr6 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 0: | case 0: | ||||
inptr7 = zerobuff; | inptr7 = zerobuff; | ||||
break; | break; | ||||
@@ -504,16 +515,22 @@ static void gemm_s8x8x16_4x8_transpose_pack_A_n(dt_int8* out, const dt_int8* in, | |||||
switch (k + 7 - kmax) { | switch (k + 7 - kmax) { | ||||
case 6: | case 6: | ||||
inptr1 = zerobuff; | inptr1 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 5: | case 5: | ||||
inptr2 = zerobuff; | inptr2 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 4: | case 4: | ||||
inptr3 = zerobuff; | inptr3 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 3: | case 3: | ||||
inptr4 = zerobuff; | inptr4 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 2: | case 2: | ||||
inptr5 = zerobuff; | inptr5 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 1: | case 1: | ||||
inptr6 = zerobuff; | inptr6 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 0: | case 0: | ||||
inptr7 = zerobuff; | inptr7 = zerobuff; | ||||
break; | break; | ||||
@@ -569,16 +586,22 @@ static void gemm_s8x8x16_4x8_pack_B_n(dt_int8* out, const dt_int8* in, int ldin, | |||||
switch (k + 7 - kmax) { | switch (k + 7 - kmax) { | ||||
case 6: | case 6: | ||||
inptr1 = zerobuff; | inptr1 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 5: | case 5: | ||||
inptr2 = zerobuff; | inptr2 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 4: | case 4: | ||||
inptr3 = zerobuff; | inptr3 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 3: | case 3: | ||||
inptr4 = zerobuff; | inptr4 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 2: | case 2: | ||||
inptr5 = zerobuff; | inptr5 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 1: | case 1: | ||||
inptr6 = zerobuff; | inptr6 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 0: | case 0: | ||||
inptr7 = zerobuff; | inptr7 = zerobuff; | ||||
break; | break; | ||||
@@ -598,16 +621,22 @@ static void gemm_s8x8x16_4x8_pack_B_n(dt_int8* out, const dt_int8* in, int ldin, | |||||
switch (k + 7 - kmax) { | switch (k + 7 - kmax) { | ||||
case 6: | case 6: | ||||
inptr1 = zerobuff; | inptr1 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 5: | case 5: | ||||
inptr2 = zerobuff; | inptr2 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 4: | case 4: | ||||
inptr3 = zerobuff; | inptr3 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 3: | case 3: | ||||
inptr4 = zerobuff; | inptr4 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 2: | case 2: | ||||
inptr5 = zerobuff; | inptr5 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 1: | case 1: | ||||
inptr6 = zerobuff; | inptr6 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 0: | case 0: | ||||
inptr7 = zerobuff; | inptr7 = zerobuff; | ||||
break; | break; | ||||
@@ -627,16 +656,22 @@ static void gemm_s8x8x16_4x8_pack_B_n(dt_int8* out, const dt_int8* in, int ldin, | |||||
switch (k + 7 - kmax) { | switch (k + 7 - kmax) { | ||||
case 6: | case 6: | ||||
inptr1 = zerobuff; | inptr1 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 5: | case 5: | ||||
inptr2 = zerobuff; | inptr2 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 4: | case 4: | ||||
inptr3 = zerobuff; | inptr3 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 3: | case 3: | ||||
inptr4 = zerobuff; | inptr4 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 2: | case 2: | ||||
inptr5 = zerobuff; | inptr5 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 1: | case 1: | ||||
inptr6 = zerobuff; | inptr6 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 0: | case 0: | ||||
inptr7 = zerobuff; | inptr7 = zerobuff; | ||||
break; | break; | ||||
@@ -715,8 +750,10 @@ static void gemm_s8x8x16_4x8_transpose_pack_B_n(dt_int8* outptr, | |||||
switch (y + 3 - ymax) { | switch (y + 3 - ymax) { | ||||
case 2: | case 2: | ||||
inptr1 = zerobuff; | inptr1 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 1: | case 1: | ||||
inptr2 = zerobuff; | inptr2 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 0: | case 0: | ||||
inptr3 = zerobuff; | inptr3 = zerobuff; | ||||
break; | break; | ||||
@@ -734,8 +771,10 @@ static void gemm_s8x8x16_4x8_transpose_pack_B_n(dt_int8* outptr, | |||||
switch (y + 3 - ymax) { | switch (y + 3 - ymax) { | ||||
case 2: | case 2: | ||||
inptr1 = zerobuff; | inptr1 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 1: | case 1: | ||||
inptr2 = zerobuff; | inptr2 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 0: | case 0: | ||||
inptr3 = zerobuff; | inptr3 = zerobuff; | ||||
break; | break; | ||||
@@ -6,7 +6,8 @@ | |||||
* | * | ||||
* Unless required by applicable law or agreed to in writing, | * Unless required by applicable law or agreed to in writing, | ||||
* software distributed under the License is distributed on an | * software distributed under the License is distributed on an | ||||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
* implied. | |||||
*/ | */ | ||||
#include "src/arm_common/simd_macro/marm_neon.h" | #include "src/arm_common/simd_macro/marm_neon.h" | ||||
@@ -453,8 +454,10 @@ static void gemm_u8_4x8_pack_A_n(dt_uint8* outptr, const dt_uint8* inptr, | |||||
switch (y + 3 - ymax) { | switch (y + 3 - ymax) { | ||||
case 2: | case 2: | ||||
inptr1 = zerobuff; | inptr1 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 1: | case 1: | ||||
inptr2 = zerobuff; | inptr2 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 0: | case 0: | ||||
inptr3 = zerobuff; | inptr3 = zerobuff; | ||||
break; | break; | ||||
@@ -471,8 +474,10 @@ static void gemm_u8_4x8_pack_A_n(dt_uint8* outptr, const dt_uint8* inptr, | |||||
switch (y + 3 - ymax) { | switch (y + 3 - ymax) { | ||||
case 2: | case 2: | ||||
inptr1 = zerobuff; | inptr1 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 1: | case 1: | ||||
inptr2 = zerobuff; | inptr2 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 0: | case 0: | ||||
inptr3 = zerobuff; | inptr3 = zerobuff; | ||||
break; | break; | ||||
@@ -523,16 +528,22 @@ static void gemm_u8_4x8_transpose_pack_A_n(dt_uint8* out, const dt_uint8* in, | |||||
switch (k + 7 - kmax) { | switch (k + 7 - kmax) { | ||||
case 6: | case 6: | ||||
inptr1 = zerobuff; | inptr1 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 5: | case 5: | ||||
inptr2 = zerobuff; | inptr2 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 4: | case 4: | ||||
inptr3 = zerobuff; | inptr3 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 3: | case 3: | ||||
inptr4 = zerobuff; | inptr4 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 2: | case 2: | ||||
inptr5 = zerobuff; | inptr5 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 1: | case 1: | ||||
inptr6 = zerobuff; | inptr6 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 0: | case 0: | ||||
inptr7 = zerobuff; | inptr7 = zerobuff; | ||||
break; | break; | ||||
@@ -551,16 +562,22 @@ static void gemm_u8_4x8_transpose_pack_A_n(dt_uint8* out, const dt_uint8* in, | |||||
switch (k + 7 - kmax) { | switch (k + 7 - kmax) { | ||||
case 6: | case 6: | ||||
inptr1 = zerobuff; | inptr1 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 5: | case 5: | ||||
inptr2 = zerobuff; | inptr2 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 4: | case 4: | ||||
inptr3 = zerobuff; | inptr3 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 3: | case 3: | ||||
inptr4 = zerobuff; | inptr4 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 2: | case 2: | ||||
inptr5 = zerobuff; | inptr5 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 1: | case 1: | ||||
inptr6 = zerobuff; | inptr6 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 0: | case 0: | ||||
inptr7 = zerobuff; | inptr7 = zerobuff; | ||||
break; | break; | ||||
@@ -617,16 +634,22 @@ static void gemm_u8_4x8_pack_B_n(dt_uint8* out, const dt_uint8* in, int ldin, | |||||
switch (k + 7 - kmax) { | switch (k + 7 - kmax) { | ||||
case 6: | case 6: | ||||
inptr1 = zerobuff; | inptr1 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 5: | case 5: | ||||
inptr2 = zerobuff; | inptr2 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 4: | case 4: | ||||
inptr3 = zerobuff; | inptr3 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 3: | case 3: | ||||
inptr4 = zerobuff; | inptr4 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 2: | case 2: | ||||
inptr5 = zerobuff; | inptr5 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 1: | case 1: | ||||
inptr6 = zerobuff; | inptr6 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 0: | case 0: | ||||
inptr7 = zerobuff; | inptr7 = zerobuff; | ||||
break; | break; | ||||
@@ -646,16 +669,22 @@ static void gemm_u8_4x8_pack_B_n(dt_uint8* out, const dt_uint8* in, int ldin, | |||||
switch (k + 7 - kmax) { | switch (k + 7 - kmax) { | ||||
case 6: | case 6: | ||||
inptr1 = zerobuff; | inptr1 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 5: | case 5: | ||||
inptr2 = zerobuff; | inptr2 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 4: | case 4: | ||||
inptr3 = zerobuff; | inptr3 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 3: | case 3: | ||||
inptr4 = zerobuff; | inptr4 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 2: | case 2: | ||||
inptr5 = zerobuff; | inptr5 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 1: | case 1: | ||||
inptr6 = zerobuff; | inptr6 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 0: | case 0: | ||||
inptr7 = zerobuff; | inptr7 = zerobuff; | ||||
break; | break; | ||||
@@ -675,16 +704,22 @@ static void gemm_u8_4x8_pack_B_n(dt_uint8* out, const dt_uint8* in, int ldin, | |||||
switch (k + 7 - kmax) { | switch (k + 7 - kmax) { | ||||
case 6: | case 6: | ||||
inptr1 = zerobuff; | inptr1 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 5: | case 5: | ||||
inptr2 = zerobuff; | inptr2 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 4: | case 4: | ||||
inptr3 = zerobuff; | inptr3 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 3: | case 3: | ||||
inptr4 = zerobuff; | inptr4 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 2: | case 2: | ||||
inptr5 = zerobuff; | inptr5 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 1: | case 1: | ||||
inptr6 = zerobuff; | inptr6 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 0: | case 0: | ||||
inptr7 = zerobuff; | inptr7 = zerobuff; | ||||
break; | break; | ||||
@@ -763,8 +798,10 @@ static void gemm_u8_4x8_transpose_pack_B_n(dt_uint8* outptr, | |||||
switch (y + 3 - ymax) { | switch (y + 3 - ymax) { | ||||
case 2: | case 2: | ||||
inptr1 = zerobuff; | inptr1 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 1: | case 1: | ||||
inptr2 = zerobuff; | inptr2 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 0: | case 0: | ||||
inptr3 = zerobuff; | inptr3 = zerobuff; | ||||
break; | break; | ||||
@@ -782,8 +819,10 @@ static void gemm_u8_4x8_transpose_pack_B_n(dt_uint8* outptr, | |||||
switch (y + 3 - ymax) { | switch (y + 3 - ymax) { | ||||
case 2: | case 2: | ||||
inptr1 = zerobuff; | inptr1 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 1: | case 1: | ||||
inptr2 = zerobuff; | inptr2 = zerobuff; | ||||
MEGDNN_FALLTHRU | |||||
case 0: | case 0: | ||||
inptr3 = zerobuff; | inptr3 = zerobuff; | ||||
break; | break; | ||||