|
|
@@ -1,5 +1,5 @@ |
|
|
|
/** |
|
|
|
* \file dnn/src/armv7/matrix_mul/int8/kernel_mk4_dot_8x6x4.h |
|
|
|
* \file dnn/src/armv7/matrix_mul/int8/kernel_mk4_dot_8x4x4.h |
|
|
|
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") |
|
|
|
* |
|
|
|
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. |
|
|
@@ -17,205 +17,7 @@ |
|
|
|
|
|
|
|
namespace megdnn { |
|
|
|
namespace armv7 { |
|
|
|
namespace matmul_mk4_dot_8x6x4 { |
|
|
|
|
|
|
|
// Overview of register layout: |
|
|
|
// |
|
|
|
// A 1x6x4 cell of Rhs is stored in 8bit in q0, q1. |
|
|
|
// A 2x1x4x4 cell of Lhs is stored in 8bit in q2, q3 |
|
|
|
// A 2x6x4 block of accumulators is stored in 8bit in q4-q15 |
|
|
|
// |
|
|
|
// +--------+ |
|
|
|
// Rhs |q0[0-16]| |
|
|
|
// |q1[0-16]| |
|
|
|
// +--------+ |
|
|
|
// Lhs | | |
|
|
|
// +-------+-------+ - - - - +--------+ |
|
|
|
// | q2[0-16]| | q4[0-4]| |
|
|
|
// | q3[0-16]| | q5[0-4]| |
|
|
|
// +---------+ | q6[0-4]| |
|
|
|
// | q7[0-4]| |
|
|
|
// | q8[0-4]| |
|
|
|
// | q9[0-4]| |
|
|
|
// |q10[0-4]| |
|
|
|
// |q11[0-4]| |
|
|
|
// |q12[0-4]| |
|
|
|
// |q13[0-4]| |
|
|
|
// |q14[0-4]| |
|
|
|
// |q15[0-4]| |
|
|
|
// +--------+ |
|
|
|
// Accumulator |
|
|
|
|
|
|
|
static void kern_8x6(const int8_t* packA, const int8_t* packB, int K, |
|
|
|
int32_t* output, int LDC, bool is_first_k) { |
|
|
|
K /= 4; |
|
|
|
const int8_t* a_ptr = packA; |
|
|
|
const int8_t* b_ptr = packB; |
|
|
|
// Fix up for odd lengths - set a flag if K is odd, but make |
|
|
|
// sure we round up the iteration count. |
|
|
|
int oddk = (K & 1); |
|
|
|
int k = (K + 1) / 2 - 1; |
|
|
|
|
|
|
|
LDC = LDC * sizeof(int32_t); |
|
|
|
int32_t* outptr0 = output; |
|
|
|
int32_t* outptr1; |
|
|
|
|
|
|
|
asm volatile( |
|
|
|
// load accumulator C |
|
|
|
"add %[outptr1], %[outptr0], %[LDC]\n" |
|
|
|
"cmp %[is_first_k], #1\n" |
|
|
|
"beq 1f\n" |
|
|
|
"vld1.32 {d8, d9}, [%[outptr0]]!\n" |
|
|
|
"vld1.32 {d10, d11}, [%[outptr0]]!\n" |
|
|
|
"vld1.32 {d12, d13}, [%[outptr0]]!\n" |
|
|
|
"vld1.32 {d14, d15}, [%[outptr0]]!\n" |
|
|
|
"vld1.32 {d16, d17}, [%[outptr0]]!\n" |
|
|
|
"vld1.32 {d18, d19}, [%[outptr0]]!\n" |
|
|
|
"vld1.32 {d20, d21}, [%[outptr1]]!\n" |
|
|
|
"vld1.32 {d22, d23}, [%[outptr1]]!\n" |
|
|
|
"vld1.32 {d24, d25}, [%[outptr1]]!\n" |
|
|
|
"vld1.32 {d26, d27}, [%[outptr1]]!\n" |
|
|
|
"vld1.32 {d28, d29}, [%[outptr1]]!\n" |
|
|
|
"vld1.32 {d30, d31}, [%[outptr1]]!\n" |
|
|
|
|
|
|
|
"b 2f\n" |
|
|
|
|
|
|
|
"1:\n" |
|
|
|
"veor.s32 q4, q4, q4\n" |
|
|
|
"veor.s32 q5, q5, q5\n" |
|
|
|
"veor.s32 q6, q6, q6\n" |
|
|
|
"veor.s32 q7, q7, q7\n" |
|
|
|
"veor.s32 q8, q8, q8\n" |
|
|
|
"veor.s32 q9, q9, q9\n" |
|
|
|
"veor.s32 q10, q10, q10\n" |
|
|
|
"veor.s32 q11, q11, q11\n" |
|
|
|
"veor.s32 q12, q12, q12\n" |
|
|
|
"veor.s32 q13, q13, q13\n" |
|
|
|
"veor.s32 q14, q14, q14\n" |
|
|
|
"veor.s32 q15, q15, q15\n" |
|
|
|
|
|
|
|
"2: \n" |
|
|
|
"vld1.s8 {q0}, [%[b_ptr]]!\n" |
|
|
|
"vld1.s8 {d2}, [%[b_ptr]]!\n" |
|
|
|
"vld1.s8 {q2}, [%[a_ptr]]!\n" |
|
|
|
"vld1.s8 {q3}, [%[a_ptr]]!\n" |
|
|
|
|
|
|
|
"cmp %[k], #0 \n" |
|
|
|
"beq 4f \n" |
|
|
|
|
|
|
|
"3:\n" |
|
|
|
"vsdot.s8 q4 , q2, d0[0]\n" |
|
|
|
"vsdot.s8 q5 , q2, d0[1]\n" |
|
|
|
"vsdot.s8 q6 , q2, d1[0]\n" |
|
|
|
"vsdot.s8 q7 , q2, d1[1]\n" |
|
|
|
"vsdot.s8 q8 , q2, d2[0]\n" |
|
|
|
"vsdot.s8 q9 , q2, d2[1]\n" |
|
|
|
"vsdot.s8 q10 , q3, d0[0]\n" |
|
|
|
"vsdot.s8 q11 , q3, d0[1]\n" |
|
|
|
"vsdot.s8 q12 , q3, d1[0]\n" |
|
|
|
"vsdot.s8 q13 , q3, d1[1]\n" |
|
|
|
"vsdot.s8 q14 , q3, d2[0]\n" |
|
|
|
"vsdot.s8 q15 , q3, d2[1]\n" |
|
|
|
|
|
|
|
"vld1.s8 {q0}, [%[b_ptr]]!\n" |
|
|
|
"vld1.s8 {d2}, [%[b_ptr]]!\n" |
|
|
|
"vld1.s8 {q2}, [%[a_ptr]]!\n" |
|
|
|
"vld1.s8 {q3}, [%[a_ptr]]!\n" |
|
|
|
"vsdot.s8 q4 , q2, d0[0]\n" |
|
|
|
"vsdot.s8 q5 , q2, d0[1]\n" |
|
|
|
"vsdot.s8 q6 , q2, d1[0]\n" |
|
|
|
"vsdot.s8 q7 , q2, d1[1]\n" |
|
|
|
"vsdot.s8 q8 , q2, d2[0]\n" |
|
|
|
"vsdot.s8 q9 , q2, d2[1]\n" |
|
|
|
"vsdot.s8 q10 , q3, d0[0]\n" |
|
|
|
"vsdot.s8 q11 , q3, d0[1]\n" |
|
|
|
"vsdot.s8 q12 , q3, d1[0]\n" |
|
|
|
"vsdot.s8 q13 , q3, d1[1]\n" |
|
|
|
"vsdot.s8 q14 , q3, d2[0]\n" |
|
|
|
"vsdot.s8 q15 , q3, d2[1]\n" |
|
|
|
|
|
|
|
"vld1.s8 {q0}, [%[b_ptr]]!\n" |
|
|
|
"vld1.s8 {d2}, [%[b_ptr]]!\n" |
|
|
|
"vld1.s8 {q2}, [%[a_ptr]]!\n" |
|
|
|
"vld1.s8 {q3}, [%[a_ptr]]!\n" |
|
|
|
|
|
|
|
"subs %[k], %[k], #1\n" |
|
|
|
"bne 3b\n" |
|
|
|
|
|
|
|
// Target to use when K is 1 or 2 (i.e. zero iterations of main |
|
|
|
// loop) |
|
|
|
"4:\n" |
|
|
|
|
|
|
|
"cmp %[oddk], #0 \n" |
|
|
|
"bne 5f \n" |
|
|
|
"vsdot.s8 q4 , q2, d0[0]\n" |
|
|
|
"vsdot.s8 q5 , q2, d0[1]\n" |
|
|
|
"vsdot.s8 q6 , q2, d1[0]\n" |
|
|
|
"vsdot.s8 q7 , q2, d1[1]\n" |
|
|
|
"vsdot.s8 q8 , q2, d2[0]\n" |
|
|
|
"vsdot.s8 q9 , q2, d2[1]\n" |
|
|
|
"vsdot.s8 q10 , q3, d0[0]\n" |
|
|
|
"vsdot.s8 q11 , q3, d0[1]\n" |
|
|
|
"vsdot.s8 q12 , q3, d1[0]\n" |
|
|
|
"vsdot.s8 q13 , q3, d1[1]\n" |
|
|
|
"vsdot.s8 q14 , q3, d2[0]\n" |
|
|
|
"vsdot.s8 q15 , q3, d2[1]\n" |
|
|
|
|
|
|
|
"vld1.s8 {q0}, [%[b_ptr]]!\n" |
|
|
|
"vld1.s8 {d2}, [%[b_ptr]]!\n" |
|
|
|
"vld1.s8 {q2}, [%[a_ptr]]!\n" |
|
|
|
"vld1.s8 {q3}, [%[a_ptr]]!\n" |
|
|
|
"vsdot.s8 q4 , q2, d0[0]\n" |
|
|
|
"vsdot.s8 q5 , q2, d0[1]\n" |
|
|
|
"vsdot.s8 q6 , q2, d1[0]\n" |
|
|
|
"vst1.32 {d8, d9}, [%[outptr0]]!\n" |
|
|
|
"vsdot.s8 q7 , q2, d1[1]\n" |
|
|
|
"vsdot.s8 q8 , q2, d2[0]\n" |
|
|
|
"vsdot.s8 q9 , q2, d2[1]\n" |
|
|
|
"vst1.32 {d10, d11}, [%[outptr0]]!\n" |
|
|
|
"vsdot.s8 q10 , q3, d0[0]\n" |
|
|
|
"vsdot.s8 q11 , q3, d0[1]\n" |
|
|
|
"vsdot.s8 q12 , q3, d1[0]\n" |
|
|
|
"vst1.32 {d12, d13}, [%[outptr0]]!\n" |
|
|
|
"vsdot.s8 q13 , q3, d1[1]\n" |
|
|
|
"vsdot.s8 q14 , q3, d2[0]\n" |
|
|
|
"vsdot.s8 q15 , q3, d2[1]\n" |
|
|
|
|
|
|
|
"b 6f\n" |
|
|
|
"5: \n" |
|
|
|
"vsdot.s8 q4 , q2, d0[0]\n" |
|
|
|
"vsdot.s8 q5 , q2, d0[1]\n" |
|
|
|
"vsdot.s8 q6 , q2, d1[0]\n" |
|
|
|
"vst1.32 {d8, d9}, [%[outptr0]]!\n" |
|
|
|
"vsdot.s8 q7 , q2, d1[1]\n" |
|
|
|
"vsdot.s8 q8 , q2, d2[0]\n" |
|
|
|
"vsdot.s8 q9 , q2, d2[1]\n" |
|
|
|
"vst1.32 {d10, d11}, [%[outptr0]]!\n" |
|
|
|
"vsdot.s8 q10 , q3, d0[0]\n" |
|
|
|
"vsdot.s8 q11 , q3, d0[1]\n" |
|
|
|
"vsdot.s8 q12 , q3, d1[0]\n" |
|
|
|
"vst1.32 {d12, d13}, [%[outptr0]]!\n" |
|
|
|
"vsdot.s8 q13 , q3, d1[1]\n" |
|
|
|
"vsdot.s8 q14 , q3, d2[0]\n" |
|
|
|
"vsdot.s8 q15 , q3, d2[1]\n" |
|
|
|
|
|
|
|
"6: \n" |
|
|
|
"vst1.32 {d14, d15}, [%[outptr0]]!\n" |
|
|
|
"vst1.32 {d16, d17}, [%[outptr0]]!\n" |
|
|
|
"vst1.32 {d18, d19}, [%[outptr0]]!\n" |
|
|
|
"vst1.32 {d20, d21}, [%[outptr1]]!\n" |
|
|
|
"vst1.32 {d22, d23}, [%[outptr1]]!\n" |
|
|
|
"vst1.32 {d24, d25}, [%[outptr1]]!\n" |
|
|
|
"vst1.32 {d26, d27}, [%[outptr1]]!\n" |
|
|
|
"vst1.32 {d28, d29}, [%[outptr1]]!\n" |
|
|
|
"vst1.32 {d30, d31}, [%[outptr1]]!\n" |
|
|
|
: [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [LDC] "+r"(LDC), |
|
|
|
[oddk] "+r"(oddk), [is_first_k] "+r"(is_first_k), [k] "+r"(k), |
|
|
|
[outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1) |
|
|
|
: |
|
|
|
: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", |
|
|
|
"q11", "q12", "q14", "q15", "cc", "memory"); |
|
|
|
} |
|
|
|
namespace matmul_mk4_dot_8x4x4 { |
|
|
|
|
|
|
|
// Overview of register layout: |
|
|
|
// |
|
|
@@ -392,144 +194,6 @@ static void kern_8x4(const int8_t* packA, const int8_t* packB, int K, |
|
|
|
|
|
|
|
// Overview of register layout: |
|
|
|
// |
|
|
|
// A 1x6x4 pingpong cell of Rhs is stored in 8bit in q0-q3. |
|
|
|
// A 1x1x4x4 pingpong cell of Lhs is stored in 8bit in q4-q5 |
|
|
|
// A 2x6x4 block of accumulators is stored in 8bit in q10-q15 |
|
|
|
// |
|
|
|
// +--------+ |
|
|
|
// Rhs |q0[0-16]| |
|
|
|
// |q1[0-16]| |
|
|
|
// +--------+ |
|
|
|
// Lhs | | |
|
|
|
// +-------+-------+ - - - - +--------+ |
|
|
|
// | q4[0-16]| |q10[0-4]| |
|
|
|
// | q5[0-16]| |q11[0-4]| |
|
|
|
// +---------+ |q12[0-4]| |
|
|
|
// |q13[0-4]| |
|
|
|
// |q14[0-4]| |
|
|
|
// |q15[0-4]| |
|
|
|
// +--------+ |
|
|
|
// Accumulator |
|
|
|
|
|
|
|
static void kern_4x6(const int8_t* packA, const int8_t* packB, int K, |
|
|
|
int32_t* output, int LDC, bool is_first_k) { |
|
|
|
K /= 4; |
|
|
|
const int8_t* a_ptr = packA; |
|
|
|
const int8_t* b_ptr = packB; |
|
|
|
// Fix up for odd lengths - set a flag if K is odd, but make |
|
|
|
// sure we round up the iteration count. |
|
|
|
int oddk = (K & 1); |
|
|
|
int k = (K + 1) / 2 - 1; |
|
|
|
|
|
|
|
LDC = LDC * sizeof(int32_t); |
|
|
|
int32_t* outptr0 = output; |
|
|
|
|
|
|
|
asm volatile( |
|
|
|
// load accumulator C |
|
|
|
"cmp %[is_first_k], #1\n" |
|
|
|
"beq 1f\n" |
|
|
|
"vld1.32 {d20, d21}, [%[outptr0]]!\n" |
|
|
|
"vld1.32 {d22, d23}, [%[outptr0]]!\n" |
|
|
|
"vld1.32 {d24, d25}, [%[outptr0]]!\n" |
|
|
|
"vld1.32 {d26, d27}, [%[outptr0]]!\n" |
|
|
|
"vld1.32 {d28, d29}, [%[outptr0]]!\n" |
|
|
|
"vld1.32 {d30, d31}, [%[outptr0]]!\n" |
|
|
|
|
|
|
|
"b 2f\n" |
|
|
|
|
|
|
|
"1:\n" |
|
|
|
"veor.s32 q10, q10, q10\n" |
|
|
|
"veor.s32 q11, q11, q11\n" |
|
|
|
"veor.s32 q12, q12, q12\n" |
|
|
|
"veor.s32 q13, q13, q13\n" |
|
|
|
"veor.s32 q14, q14, q14\n" |
|
|
|
"veor.s32 q15, q15, q15\n" |
|
|
|
|
|
|
|
"2: \n" |
|
|
|
"vld1.s8 {q0}, [%[b_ptr]]!\n" |
|
|
|
"vld1.s8 {d2}, [%[b_ptr]]!\n" |
|
|
|
"vld1.s8 {q4}, [%[a_ptr]]!\n" |
|
|
|
|
|
|
|
"cmp %[k], #0 \n" |
|
|
|
"beq 4f \n" |
|
|
|
|
|
|
|
"3:\n" |
|
|
|
"vsdot.s8 q10 , q4, d0[0]\n" |
|
|
|
"vsdot.s8 q11 , q4, d0[1]\n" |
|
|
|
"vsdot.s8 q12 , q4, d1[0]\n" |
|
|
|
"vld1.s8 {q2}, [%[b_ptr]]!\n" |
|
|
|
"vld1.s8 {d6}, [%[b_ptr]]!\n" |
|
|
|
"vld1.s8 {q5}, [%[a_ptr]]!\n" |
|
|
|
"vsdot.s8 q13 , q4, d1[1]\n" |
|
|
|
"vsdot.s8 q14 , q4, d2[0]\n" |
|
|
|
"vsdot.s8 q15 , q4, d2[1]\n" |
|
|
|
|
|
|
|
"vld1.s8 {q0}, [%[b_ptr]]!\n" |
|
|
|
"vsdot.s8 q10 , q5, d4[0]\n" |
|
|
|
"vsdot.s8 q11 , q5, d4[1]\n" |
|
|
|
"vsdot.s8 q12 , q5, d5[0]\n" |
|
|
|
"vld1.s8 {d2}, [%[b_ptr]]!\n" |
|
|
|
"vsdot.s8 q13 , q5, d5[1]\n" |
|
|
|
"vsdot.s8 q14 , q5, d6[0]\n" |
|
|
|
"vsdot.s8 q15 , q5, d6[1]\n" |
|
|
|
"vld1.s8 {q4}, [%[a_ptr]]!\n" |
|
|
|
|
|
|
|
"subs %[k], %[k], #1\n" |
|
|
|
"bne 3b\n" |
|
|
|
|
|
|
|
// Target to use when K is 1 or 2 (i.e. zero iterations of main |
|
|
|
// loop) |
|
|
|
"4:\n" |
|
|
|
|
|
|
|
"cmp %[oddk], #0 \n" |
|
|
|
"bne 5f \n" |
|
|
|
|
|
|
|
"vsdot.s8 q10 , q4, d0[0]\n" |
|
|
|
"vsdot.s8 q11 , q4, d0[1]\n" |
|
|
|
"vsdot.s8 q12 , q4, d1[0]\n" |
|
|
|
"vld1.s8 {q2}, [%[b_ptr]]!\n" |
|
|
|
"vld1.s8 {d6}, [%[b_ptr]]!\n" |
|
|
|
"vld1.s8 {q5}, [%[a_ptr]]!\n" |
|
|
|
"vsdot.s8 q13 , q4, d1[1]\n" |
|
|
|
"vsdot.s8 q14 , q4, d2[0]\n" |
|
|
|
"vsdot.s8 q15 , q4, d2[1]\n" |
|
|
|
|
|
|
|
"vsdot.s8 q10 , q5, d4[0]\n" |
|
|
|
"vsdot.s8 q11 , q5, d4[1]\n" |
|
|
|
"vsdot.s8 q12 , q5, d5[0]\n" |
|
|
|
"vst1.32 {d20, d21}, [%[outptr0]]!\n" |
|
|
|
"vsdot.s8 q13 , q5, d5[1]\n" |
|
|
|
"vsdot.s8 q14 , q5, d6[0]\n" |
|
|
|
"vsdot.s8 q15 , q5, d6[1]\n" |
|
|
|
"vst1.32 {d22, d23}, [%[outptr0]]!\n" |
|
|
|
|
|
|
|
"b 6f\n" |
|
|
|
"5: \n" |
|
|
|
|
|
|
|
"vsdot.s8 q10 , q4, d0[0]\n" |
|
|
|
"vsdot.s8 q11 , q4, d0[1]\n" |
|
|
|
"vsdot.s8 q12 , q4, d1[0]\n" |
|
|
|
"vst1.32 {d20, d21}, [%[outptr0]]!\n" |
|
|
|
"vsdot.s8 q13 , q4, d1[1]\n" |
|
|
|
"vsdot.s8 q14 , q4, d2[0]\n" |
|
|
|
"vsdot.s8 q15 , q4, d2[1]\n" |
|
|
|
"vst1.32 {d22, d23}, [%[outptr0]]!\n" |
|
|
|
|
|
|
|
"6: \n" |
|
|
|
"vst1.32 {d24, d25}, [%[outptr0]]!\n" |
|
|
|
"vst1.32 {d26, d27}, [%[outptr0]]!\n" |
|
|
|
"vst1.32 {d28, d29}, [%[outptr0]]!\n" |
|
|
|
"vst1.32 {d30, d31}, [%[outptr0]]!\n" |
|
|
|
: [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [LDC] "+r"(LDC), |
|
|
|
[oddk] "+r"(oddk), [is_first_k] "+r"(is_first_k), [k] "+r"(k), |
|
|
|
[outptr0] "+r"(outptr0) |
|
|
|
: |
|
|
|
: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", |
|
|
|
"q11", "q12", "q14", "q15", "cc", "memory"); |
|
|
|
} |
|
|
|
|
|
|
|
// Overview of register layout: |
|
|
|
// |
|
|
|
// A 2x4x4 cell of Rhs is stored in 8bit in q1, q3. |
|
|
|
// A 1x2x4x4 ping-pong cell of Lhs is stored in 8bit in q5, q7 |
|
|
|
// A 1x4x4 block of accumulators is stored in 8bit in q0, q2, q4, q6 |
|
|
@@ -671,7 +335,7 @@ static void kern_4x4(const int8_t* packA, const int8_t* packB, int K, |
|
|
|
#undef STORE_C |
|
|
|
} |
|
|
|
|
|
|
|
static void gemm_dots8_8x6_pack_A(dt_int8* outptr, const dt_int8* inptr, |
|
|
|
static void gemm_dots8_8x4_pack_A(dt_int8* outptr, const dt_int8* inptr, |
|
|
|
int ldin, int y0, int ymax, int k0, |
|
|
|
int kmax) { |
|
|
|
int y = y0, y_start = y0 / 4; |
|
|
@@ -692,14 +356,12 @@ static void gemm_dots8_8x6_pack_A(dt_int8* outptr, const dt_int8* inptr, |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
static void gemm_dots8_8x6_pack_B(dt_int8* out, const dt_int8* in, int ldin, |
|
|
|
static void gemm_dots8_8x4_pack_B(dt_int8* out, const dt_int8* in, int ldin, |
|
|
|
int x0, int xmax, int k0, int kmax) { |
|
|
|
const int ksize = kmax - k0; |
|
|
|
const int ksize4 = ksize * 4; |
|
|
|
const int ksize6 = ksize * 6; |
|
|
|
int8_t* outptr = out; |
|
|
|
int8_t* outptr_base = out; |
|
|
|
int8_t* outptr_base4 = out + ((xmax - x0) / 6) * ksize6; |
|
|
|
|
|
|
|
int k = k0; |
|
|
|
for (; k + 3 < kmax; k += 4) { |
|
|
@@ -708,13 +370,6 @@ static void gemm_dots8_8x6_pack_B(dt_int8* out, const dt_int8* in, int ldin, |
|
|
|
|
|
|
|
outptr = outptr_base; |
|
|
|
int x = x0; |
|
|
|
for (; x + 5 < xmax; x += 6) { |
|
|
|
memcpy(outptr, inptr, sizeof(int8_t) * 24); |
|
|
|
outptr += ksize6; |
|
|
|
inptr += 24; |
|
|
|
} |
|
|
|
|
|
|
|
outptr = outptr_base4; |
|
|
|
for (; x + 3 < xmax; x += 4) { |
|
|
|
memcpy(outptr, inptr, sizeof(int8_t) * 16); |
|
|
|
outptr += ksize4; |
|
|
@@ -735,12 +390,11 @@ static void gemm_dots8_8x6_pack_B(dt_int8* out, const dt_int8* in, int ldin, |
|
|
|
*outptr++ = 0; |
|
|
|
} |
|
|
|
} |
|
|
|
outptr_base += 24; |
|
|
|
outptr_base4 += 16; |
|
|
|
outptr_base += 16; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
} // namespace matmul_mk4_dot_8x6x4 |
|
|
|
} // namespace matmul_mk4_dot_8x4x4 |
|
|
|
} // namespace armv7 |
|
|
|
} // namespace megdnn |
|
|
|
#endif |