feat(dnn/aarch64): add arm64 nchw44 mk matmul

GitOrigin-RevId: 698a11c3fd
5 years ago · 538d3de9d2
--- a/dnn/src/common/hash_ct.h
+++ b/dnn/src/common/hash_ct.h
@@ -0,0 +1,147 @@
 /**
 * Copyright (c) 2015 Daniel Kirchner
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 * ---------------------------------------------------------------------------
 * \file dnn/src/common/hash_ct.h
 *
 * \brief compile time hash for strings
 *
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 *
 * This file has been modified by Megvii ("Megvii Modifications").
 * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights
 * reserved.
 *
 * ---------------------------------------------------------------------------
 *
 */

 #pragma once
 #include <cstdint>
 namespace megdnn {
 /*!
 * \brief XX64 string hash that can be evaluated in constant expressions
 *
 * Every member function consists of a single return statement, so repetition
 * is expressed through recursion instead of loops.
 *
 * see https://github.com/ekpyron/xxhashct/blob/master/xxh64.hpp
 */
 class XXHash64CT {
 public:
    /*!
     * \brief hash the \p len bytes starting at \p p
     * \param p pointer to the first input byte
     * \param len number of bytes to hash
     * \param seed value mixed into the initial accumulator state
     * \return 64-bit hash value
     */
    static constexpr uint64_t hash(const char* p, uint64_t len, uint64_t seed) {
        // the 32-byte stripes are consumed by accumulate(); drain() handles
        // the remaining (len % 32) bytes and the final avalanche
        return drain(accumulate(p, len, seed) + len,
                     p + (len & ~uint64_t(0x1F)), len & uint64_t(0x1F));
    }

 private:
    static constexpr uint64_t PRIME1 = 11400714785074694791ULL;
    static constexpr uint64_t PRIME2 = 14029467366897019727ULL;
    static constexpr uint64_t PRIME3 = 1609587929392839161ULL;
    static constexpr uint64_t PRIME4 = 9650029242287828579ULL;
    static constexpr uint64_t PRIME5 = 2870177450012600261ULL;

    //! rotate \p value left by \p bits; every caller passes 0 < bits < 64
    static constexpr uint64_t rotate_left(uint64_t value, int bits) {
        return (value >> (64 - bits)) | (value << bits);
    }
    //! one xor-shift-multiply step used by the final avalanche in drain()
    static constexpr uint64_t xorshift_mul(const uint64_t acc,
                                           const uint64_t factor, int shift) {
        return factor * (acc ^ (acc >> shift));
    }
    //! fold the 64-bit input \p lane into the accumulator \p acc
    static constexpr uint64_t mix_lane(const uint64_t lane,
                                       const uint64_t acc = 0) {
        return PRIME1 * rotate_left(acc + PRIME2 * lane, 31);
    }
    //! merge one stripe accumulator \p lane_acc into the combined value \p h
    static constexpr uint64_t merge_lane(const uint64_t h,
                                         const uint64_t lane_acc) {
        return PRIME4 + PRIME1 * (mix_lane(lane_acc) ^ h);
    }
    //! byte \p idx of \p v as an unsigned value widened to 64 bits
    static constexpr uint64_t octet(const char* v, int idx) {
        return uint64_t(uint8_t(v[idx]));
    }
 #ifdef XXH64_BIG_ENDIAN
    //! assemble 4 bytes with v[0] as the most significant byte
    static constexpr uint32_t endian32(const char* v) {
        return uint32_t((octet(v, 0) << 24) | (octet(v, 1) << 16) |
                        (octet(v, 2) << 8) | octet(v, 3));
    }
    //! assemble 8 bytes with v[0] as the most significant byte
    static constexpr uint64_t endian64(const char* v) {
        return (uint64_t(endian32(v)) << 32) | uint64_t(endian32(v + 4));
    }
 #else
    //! assemble 4 bytes with v[0] as the least significant byte
    static constexpr uint32_t endian32(const char* v) {
        return uint32_t(octet(v, 0) | (octet(v, 1) << 8) |
                        (octet(v, 2) << 16) | (octet(v, 3) << 24));
    }
    //! assemble 8 bytes with v[0] as the least significant byte
    static constexpr uint64_t endian64(const char* v) {
        return uint64_t(endian32(v)) | (uint64_t(endian32(v + 4)) << 32);
    }
 #endif
    //! mix the 8 bytes at \p p into \p acc
    static constexpr uint64_t take8(const char* p, const uint64_t acc = 0) {
        return mix_lane(endian64(p), acc);
    }
    //! scaled contribution of the 4 bytes at \p p
    static constexpr uint64_t take4(const char* p) {
        return PRIME1 * uint64_t(endian32(p));
    }
    //! scaled contribution of the single byte at \p p
    static constexpr uint64_t take1(const char* p) {
        return PRIME5 * octet(p, 0);
    }
    /*!
     * \brief consume the remaining \p len bytes at \p p, then avalanche
     *
     * Takes 8-byte groups while possible, then one 4-byte group, then single
     * bytes; once \p len reaches zero the three xor-shift-multiply steps are
     * applied to \p h.
     */
    static constexpr uint64_t drain(const uint64_t h, const char* p,
                                    uint64_t len) {
        return len >= 8
                       ? drain(PRIME4 + PRIME1 * rotate_left(take8(p) ^ h, 27),
                               p + 8, len - 8)
               : len >= 4
                       ? drain(PRIME3 + PRIME2 * rotate_left(take4(p) ^ h, 23),
                               p + 4, len - 4)
               : len != 0
                       ? drain(PRIME1 * rotate_left(take1(p) ^ h, 11), p + 1,
                               len - 1)
                       : xorshift_mul(
                                 xorshift_mul(xorshift_mul(h, PRIME2, 33),
                                              PRIME3, 29),
                                 1, 32);
    }
    /*!
     * \brief process 32-byte stripes with the four lane accumulators
     *
     * Recurses while at least 32 bytes remain; afterwards the four lanes are
     * combined by rotation and addition, then merged one after another.
     */
    static constexpr uint64_t stripes(const char* p, uint64_t len,
                                      const uint64_t v1, const uint64_t v2,
                                      const uint64_t v3, const uint64_t v4) {
        return len < 32
                       ? merge_lane(
                                 merge_lane(
                                         merge_lane(
                                                 merge_lane(
                                                         rotate_left(v1, 1) +
                                                                 rotate_left(v2, 7) +
                                                                 rotate_left(v3, 12) +
                                                                 rotate_left(v4, 18),
                                                         v1),
                                                 v2),
                                         v3),
                                 v4)
                       : stripes(p + 32, len - 32, take8(p, v1),
                                 take8(p + 8, v2), take8(p + 16, v3),
                                 take8(p + 24, v4));
    }
    //! accumulator before the length is added: stripe result for inputs of
    //! at least 32 bytes, otherwise seed + PRIME5
    static constexpr uint64_t accumulate(const char* p, uint64_t len,
                                         const uint64_t seed) {
        return len >= 32 ? stripes(p, len, seed + PRIME1 + PRIME2,
                                   seed + PRIME2, seed, seed - PRIME1)
                         : seed + PRIME5;
    }
 };
 }  // namespace megdnn

 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
--- a/dnn/src/common/utils.h
+++ b/dnn/src/common/utils.h
@@ -17,6 +17,7 @@
 #include "megdnn/handle.h"
 #include "megdnn/thin/small_vector.h"

 #include "src/common/hash_ct.h"
 #include "src/common/utils.cuh"

 #include <cmath>
@@ -228,6 +229,10 @@ MEGDNN_CONSTEXPR std::size_t operator"" _z(unsigned long long n) {
    return n;
 }

 /*!
 * \brief string literal suffix that yields a hash of the literal's characters
 *
 * Hashes the \p count characters at \p str with XXHash64CT::hash using the
 * fixed seed 20160701. The 64-bit hash is narrowed to the uint32_t return
 * type, i.e. only its low 32 bits are kept.
 */
 constexpr uint32_t operator"" _hash(char const* str, size_t count) {
    return static_cast<uint32_t>(XXHash64CT::hash(str, count, 20160701));
 }

 template <typename Vec>
 std::string vec2str(Vec&& vec) {
    std::string res;
--- a/dnn/src/fallback/matrix_mul/gemm_common.h
+++ b/dnn/src/fallback/matrix_mul/gemm_common.h
@@ -362,96 +362,111 @@ void gemm_kern(const Tin* packA, const Tin* packB, size_t M, size_t N, size_t K,
    InnerBlockSize get_inner_block_size() const override;             \
    size_t get_packA_type_size() const override;

 #define MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL_PACKA(                            \
        _algo_name, _midout_name, _mid_index, _strategy, _i_type, _c_type,     \
        _packa_type)                                                           \
                                                                               \
    MatrixMulImpl::kern_naked_t MatrixMulImpl::_algo_name::get_kern_naked(     \
            const KernSizeParam&) const {                                      \
        auto kern = [](const MatrixMulImpl::KernParam& kern_param,             \
                       const void* packed_a, const void* packed_b) {           \
            MIDOUT_BEGIN(_midout_name, midout_iv(_mid_index)) {                \
                auto M = kern_param.M, N = kern_param.N, K = kern_param.K;     \
                auto trA = kern_param.trA, trB = kern_param.trB;               \
                auto LDC = kern_param.LDC;                                     \
                auto A_type = kern_param.A_type, B_type = kern_param.B_type,   \
                     C_type = kern_param.C_type;                               \
                auto Cptr = kern_param.C<_c_type>();                           \
                                                                               \
                _strategy strategy(M, N, K, A_type, B_type, C_type);           \
                megdnn::matmul::GemmInterleaved<_strategy>(M, N, K, trA, trB,  \
                                                           strategy)           \
                        .execute_naked(Cptr, LDC, packed_a, packed_b);         \
            }                                                                  \
            MIDOUT_END();                                                      \
        };                                                                     \
        return kern;                                                           \
    }                                                                          \
                                                                               \
    void MatrixMulImpl::_algo_name::pack_A(const KernParam& kern_param,        \
                                           void* out, size_t index,            \
                                           size_t stride) const {              \
        auto M = kern_param.M, N = kern_param.N, K = kern_param.K;             \
        auto A_type = kern_param.A_type, B_type = kern_param.B_type,           \
             C_type = kern_param.C_type;                                       \
                                                                               \
        auto trA = kern_param.trA, trB = kern_param.trB;                       \
        auto LDA = kern_param.LDA;                                             \
        const auto Aptr = kern_param.A<_i_type>();                             \
        _strategy strategy(M, N, K, A_type, B_type, C_type);                   \
        size_t start_index = index * stride;                                   \
        size_t end_index = start_index + stride;                               \
        end_index = std::min(end_index, M);                                    \
        megdnn::matmul::GemmInterleaved<_strategy>(M, N, K, trA, trB,          \
                                                   strategy)                   \
                .pack_A(reinterpret_cast<_packa_type*>(out), Aptr, LDA,        \
                        start_index, end_index);                               \
    }                                                                          \
                                                                               \
    void MatrixMulImpl::_algo_name::pack_B(const KernParam& kern_param,        \
                                           void* out, const size_t x0,         \
                                           size_t xmax) const {                \
        auto M = kern_param.M, N = kern_param.N, K = kern_param.K;             \
        auto A_type = kern_param.A_type, B_type = kern_param.B_type,           \
             C_type = kern_param.C_type;                                       \
                                                                               \
        auto trA = kern_param.trA, trB = kern_param.trB;                       \
        auto LDB = kern_param.LDB;                                             \
        const auto Bptr = kern_param.B<_i_type>();                             \
        _strategy strategy(M, N, K, A_type, B_type, C_type);                   \
        megdnn::matmul::GemmInterleaved<_strategy>(M, N, K, trA, trB,          \
                                                   strategy)                   \
                .pack_B(reinterpret_cast<_i_type*>(out), Bptr, LDB, x0, xmax); \
    }                                                                          \
                                                                               \
    WorkspaceBundle MatrixMulImpl::_algo_name::get_bundle(                     \
            const KernSizeParam& kern_size_param) const {                      \
        auto M = kern_size_param.M, N = kern_size_param.N,                     \
             K = kern_size_param.K;                                            \
        auto trA = kern_size_param.trA, trB = kern_size_param.trB;             \
        auto A_type = kern_size_param.A_type, B_type = kern_size_param.B_type, \
             C_type = kern_size_param.C_type;                                  \
        _strategy strategy(M, N, K, A_type, B_type, C_type);                   \
        return megdnn::matmul::GemmInterleaved<_strategy>(M, N, K, trA, trB,   \
                                                          strategy)            \
                .get_bundle();                                                 \
    }                                                                          \
                                                                               \
    MatrixMulImpl::_algo_name::InnerBlockSize                                  \
    MatrixMulImpl::_algo_name::get_inner_block_size() const {                  \
        return {_strategy::KERNEL_H, _strategy::KERNEL_W,                      \
                _strategy::UNROLL_K};                                          \
    }                                                                          \
                                                                               \
    size_t MatrixMulImpl::_algo_name::get_packA_type_size() const {            \
        return sizeof(_packa_type);                                            \
 /*!
 * \brief define the im2col-related member functions of
 *      MatrixMulImpl::_algo_name on top of GemmInterleaved<_strategy>
 *
 * The expansion defines get_kern_naked, pack_A, pack_B, get_bundle,
 * get_inner_block_size and get_packA_type_size. The first four wrap their
 * bodies in MIDOUT_BEGIN(_midout_name, midout_iv(_mid_index),
 * midout_iv("<function name>"_hash)) ... MIDOUT_END(), so each function gets
 * its own midout tag.
 *
 * \param _algo_name nested class of MatrixMulImpl whose members are defined
 * \param _midout_name first argument forwarded to MIDOUT_BEGIN
 * \param _mid_index value wrapped in midout_iv() as the second midout tag
 * \param _strategy strategy type; constructed from
 *      (M, N, K, A_type, B_type, C_type) and used as the GemmInterleaved
 *      template argument
 * \param _i_type element type used to read A and B and to cast the pack_B
 *      output buffer
 * \param _c_type element type used to obtain the C pointer
 * \param _packa_type type the pack_A output buffer is cast to; its sizeof is
 *      returned by get_packA_type_size
 *
 * pack_A forwards the index range
 * [index * stride, min(index * stride + stride, M)) to
 * GemmInterleaved::pack_A.
 *
 * NOTE(review): get_bundle has no return statement after MIDOUT_END();
 * whether that point is reachable depends on how MIDOUT_BEGIN/MIDOUT_END
 * expand -- confirm.
 */
 #define MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL_DETAIL(                          \
        _algo_name, _midout_name, _mid_index, _strategy, _i_type, _c_type,    \
        _packa_type)                                                          \
                                                                              \
    MatrixMulImpl::kern_naked_t MatrixMulImpl::_algo_name::get_kern_naked(    \
            const KernSizeParam&) const {                                     \
        auto kern = [](const MatrixMulImpl::KernParam& kern_param,            \
                       const void* packed_a, const void* packed_b) {          \
            MIDOUT_BEGIN(_midout_name, midout_iv(_mid_index),                 \
                         midout_iv("get_kern_naked"_hash)) {                  \
                auto M = kern_param.M, N = kern_param.N, K = kern_param.K;    \
                auto trA = kern_param.trA, trB = kern_param.trB;              \
                auto LDC = kern_param.LDC;                                    \
                auto A_type = kern_param.A_type, B_type = kern_param.B_type,  \
                     C_type = kern_param.C_type;                              \
                auto Cptr = kern_param.C<_c_type>();                          \
                                                                              \
                _strategy strategy(M, N, K, A_type, B_type, C_type);          \
                megdnn::matmul::GemmInterleaved<_strategy>(M, N, K, trA, trB, \
                                                           strategy)          \
                        .execute_naked(Cptr, LDC, packed_a, packed_b);        \
            }                                                                 \
            MIDOUT_END();                                                     \
        };                                                                    \
        return kern;                                                          \
    }                                                                         \
                                                                              \
    void MatrixMulImpl::_algo_name::pack_A(const KernParam& kern_param,       \
                                           void* out, size_t index,           \
                                           size_t stride) const {             \
        MIDOUT_BEGIN(_midout_name, midout_iv(_mid_index),                     \
                     midout_iv("pack_A"_hash)) {                              \
            auto M = kern_param.M, N = kern_param.N, K = kern_param.K;        \
            auto A_type = kern_param.A_type, B_type = kern_param.B_type,      \
                 C_type = kern_param.C_type;                                  \
                                                                              \
            auto trA = kern_param.trA, trB = kern_param.trB;                  \
            auto LDA = kern_param.LDA;                                        \
            const auto Aptr = kern_param.A<_i_type>();                        \
            _strategy strategy(M, N, K, A_type, B_type, C_type);              \
            size_t start_index = index * stride;                              \
            size_t end_index = start_index + stride;                          \
            end_index = std::min(end_index, M);                               \
            megdnn::matmul::GemmInterleaved<_strategy>(M, N, K, trA, trB,     \
                                                       strategy)              \
                    .pack_A(reinterpret_cast<_packa_type*>(out), Aptr, LDA,   \
                            start_index, end_index);                          \
        }                                                                     \
        MIDOUT_END();                                                         \
    }                                                                         \
                                                                              \
    void MatrixMulImpl::_algo_name::pack_B(const KernParam& kern_param,       \
                                           void* out, const size_t x0,        \
                                           size_t xmax) const {               \
        MIDOUT_BEGIN(_midout_name, midout_iv(_mid_index),                     \
                     midout_iv("pack_B"_hash)) {                              \
            auto M = kern_param.M, N = kern_param.N, K = kern_param.K;        \
            auto A_type = kern_param.A_type, B_type = kern_param.B_type,      \
                 C_type = kern_param.C_type;                                  \
                                                                              \
            auto trA = kern_param.trA, trB = kern_param.trB;                  \
            auto LDB = kern_param.LDB;                                        \
            const auto Bptr = kern_param.B<_i_type>();                        \
            _strategy strategy(M, N, K, A_type, B_type, C_type);              \
            megdnn::matmul::GemmInterleaved<_strategy>(M, N, K, trA, trB,     \
                                                       strategy)              \
                    .pack_B(reinterpret_cast<_i_type*>(out), Bptr, LDB, x0,   \
                            xmax);                                            \
        }                                                                     \
        MIDOUT_END();                                                         \
    }                                                                         \
                                                                              \
    WorkspaceBundle MatrixMulImpl::_algo_name::get_bundle(                    \
            const KernSizeParam& kern_size_param) const {                     \
        MIDOUT_BEGIN(_midout_name, midout_iv(_mid_index),                     \
                     midout_iv("get_bundle"_hash)) {                          \
            auto M = kern_size_param.M, N = kern_size_param.N,                \
                 K = kern_size_param.K;                                       \
            auto trA = kern_size_param.trA, trB = kern_size_param.trB;        \
            auto A_type = kern_size_param.A_type,                             \
                 B_type = kern_size_param.B_type,                             \
                 C_type = kern_size_param.C_type;                             \
            _strategy strategy(M, N, K, A_type, B_type, C_type);              \
            return megdnn::matmul::GemmInterleaved<_strategy>(M, N, K, trA,   \
                                                              trB, strategy)  \
                    .get_bundle();                                            \
        }                                                                     \
        MIDOUT_END();                                                         \
    }                                                                         \
                                                                              \
    MatrixMulImpl::_algo_name::InnerBlockSize                                 \
    MatrixMulImpl::_algo_name::get_inner_block_size() const {                 \
        return {_strategy::KERNEL_H, _strategy::KERNEL_W,                     \
                _strategy::UNROLL_K};                                         \
    }                                                                         \
                                                                              \
    size_t MatrixMulImpl::_algo_name::get_packA_type_size() const {           \
        return sizeof(_packa_type);                                           \
    }

 /*!
 * \brief shorthand that expands MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL_PACKA
 *      with the packed-A type set to _i_type
 *
 * NOTE(review): a second definition of this macro name follows directly
 * below; this text appears to show both sides of a diff -- confirm which
 * definition is the current one.
 */
 #define MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL(                                  \
        _algo_name, _midout_name, _mid_index, _strategy, _i_type, _c_type)     \
    MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL_PACKA(_algo_name, _midout_name,       \
                                               _mid_index, _strategy, _i_type, \
                                               _c_type, _i_type)
 /*!
 * \brief shorthand that expands MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL_DETAIL
 *      with _packa_type set to _i_type, i.e. packed A uses the input element
 *      type
 */
 #define MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL(                              \
        _algo_name, _midout_name, _mid_index, _strategy, _i_type, _c_type) \
    MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL_DETAIL(_algo_name, _midout_name,  \
                                                _mid_index, _strategy,     \
                                                _i_type, _c_type, _i_type)
 }  // namespace matmul
 }  // namespace megdnn

--- a/dnn/src/fallback/matrix_mul/gemm_impl.h
+++ b/dnn/src/fallback/matrix_mul/gemm_impl.h
@@ -70,9 +70,9 @@ class GemmInterleaved<Strategy, true> {

 public:
    //! Workspace size reported for this GEMM: as written, the sum of the A, B
    //! and C workspace sizes.
    //! NOTE(review): the second return statement is unreachable as written;
    //! this text appears to show the old and the new side of a diff together
    //! -- confirm which expression is intended.
    size_t get_workspace_size() const {
        return get_a_workspace_size() + get_b_workspace_size() +
               get_c_workspace_size();
        return get_bundle().total_size_in_bytes();
    }

    WorkspaceBundle get_bundle() const {
        return {nullptr,
                {get_a_workspace_size(), get_b_workspace_size(),