OpenI
/
MegEngine

/**
 * \file dnn/test/naive/matrix_mul.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#include "test/naive/fixture.h"

#include "megdnn/oprs/linalg.h"
#include "test/common/checker.h"
#include "test/common/extra_impl_helper.h"
#include "test/common/matrix_mul.h"
#include "test/common/random_state.h"

using namespace megdnn;
using namespace test;

namespace {

/*!
 * \brief Run MatrixMul in a packed MK format over the shared matmul test args.
 *
 * For every case returned by get_matmul_args() whose m and k are multiples of
 * the format's pack size, the packed operand shapes are built and executed
 * through Checker. A reference implementation is installed with
 * set_extra_opr_impl(): it relayouts the packed operands into plain 2-D
 * matrices, runs a DEFAULT-format MatrixMul on the same handle and relayouts
 * the product back into the packed output.
 *
 * \param handle handle used for the checked operator and for the reference
 * \param format packed format; the reference asserts MK4, MK4_DOT or MK8
 * \param Atype  dtype set for tensor 0 (operand A)
 * \param Btype  dtype set for tensor 1 (operand B)
 * \param Ctype  dtype set for tensor 2 (output C)
 */
void run_matmul_mk_format(
        Handle* handle, param::MatrixMul::Format format, DType Atype, DType Btype,
        DType Ctype) {
    using namespace matrix_mul;
    std::vector<TestArg> args = get_matmul_args();
    Checker<MatrixMul> checker(handle);

    // Reference implementation: packed tensors -> dense matrices -> DEFAULT
    // matmul -> packed output.
    auto extra_impl = [](const TensorNDArray& tensors, param::MatrixMul param,
                         Handle* handle, size_t pack_size) {
        megdnn_assert(
                (param.format == param::MatrixMul::Format::MK4 ||
                 param.format == param::MatrixMul::Format::MK4_DOT ||
                 param.format == param::MatrixMul::Format::MK8) &&
                tensors.size() == 3);
        param::MatrixMul new_param = param;
        new_param.format = param::MatrixMul::Format::DEFAULT;

        // Dense matrix sizes recovered from the packed shapes: the leading
        // dims of C and A count blocks of pack_size rows / columns.
        size_t M = tensors[2].layout[0] * pack_size;
        size_t N = tensors[2].layout[1];
        size_t K = tensors[0].layout[1 - param.transposeA] * pack_size;

        TensorLayoutArray default_layouts, mk4_layouts;

        // Operand A. The dense layout is (K, M) when transposed and (M, K)
        // otherwise; the packed view splits both dims into pack_size blocks.
        // The permutation is {0, 3, 1, 2} when transposeA and MK4_DOT are
        // either both set or both clear, and {0, 2, 1, 3} in the two mixed
        // cases.
        const bool is_dot = param.format == param::MatrixMul::Format::MK4_DOT;
        const size_t a_rows = param.transposeA ? K : M;
        const size_t a_cols = param.transposeA ? M : K;
        default_layouts.emplace_back(tensors[0].layout.reshape({a_rows, a_cols}));
        const auto a_blocked = default_layouts.back().reshape(
                {a_rows / pack_size, a_cols / pack_size, pack_size, pack_size});
        if (static_cast<bool>(param.transposeA) == is_dot) {
            mk4_layouts.emplace_back(a_blocked.dimshuffle({0, 3, 1, 2}));
        } else {
            mk4_layouts.emplace_back(a_blocked.dimshuffle({0, 2, 1, 3}));
        }

        // Operand B.
        if (param.transposeB) {
            default_layouts.emplace_back(tensors[1].layout.reshape({N, K}));
            mk4_layouts.emplace_back(default_layouts.back()
                                             .reshape({N, K / pack_size, pack_size})
                                             .dimshuffle({0, 1, 2}));
        } else {
            default_layouts.emplace_back(tensors[1].layout.reshape({K, N}));
            mk4_layouts.emplace_back(default_layouts.back()
                                             .reshape({K / pack_size, N, pack_size})
                                             .dimshuffle({0, 2, 1}));
        }

        // Output C.
        default_layouts.emplace_back(tensors[2].layout.reshape({M, N}));
        mk4_layouts.emplace_back(default_layouts.back()
                                         .reshape({M / pack_size, N, pack_size})
                                         .dimshuffle({0, 2, 1}));

        auto matmul_opr = handle->create_operator<MatrixMul>();
        matmul_opr->param() = new_param;
        size_t matmul_workspace = matmul_opr->get_workspace_in_bytes(
                default_layouts[0], default_layouts[1], default_layouts[2]);
        auto relayout_opr = handle->create_operator<Relayout>();

        // Slots 0..2 hold the dense A, B and C; slot 3 is the matmul workspace.
        WorkspaceBundle wb(
                nullptr, {default_layouts[0].span().dist_byte(),
                          default_layouts[1].span().dist_byte(),
                          default_layouts[2].span().dist_byte(), matmul_workspace});
        // The scratch memory is owned by a vector so that it is released on
        // every exit path, including an exception thrown by an operator call,
        // and so that an allocation failure is reported instead of handing a
        // null pointer to the bundle.
        std::vector<unsigned char> storage(wb.total_size_in_bytes());
        wb.set(storage.data());

        TensorNDArray default_tensors, mk4_tensors;
        for (size_t i = 0; i < 3; i++) {
            default_tensors.emplace_back(wb.get(i), default_layouts[i]);
            // Same memory as the checker's tensors, viewed through the
            // permuted layout.
            mk4_tensors.emplace_back(tensors[i].raw_ptr(), mk4_layouts[i]);
        }
        relayout_opr->exec(mk4_tensors[0], default_tensors[0]);
        relayout_opr->exec(mk4_tensors[1], default_tensors[1]);
        matmul_opr->exec(
                default_tensors[0], default_tensors[1], default_tensors[2],
                wb.get_workspace(3));
        relayout_opr->exec(default_tensors[2], mk4_tensors[2]);
    };

    size_t pack_size = MatrixMulForward::pack_size(format);
    for (auto&& arg : args) {
        // Only sizes that split evenly into pack_size blocks can be packed.
        if (arg.m % pack_size != 0 || arg.k % pack_size != 0)
            continue;
        param::MatrixMul param;
        // Bit 0 of the mask selects transposeA, bit 1 selects transposeB.
        param.transposeA = arg.mask & 0x1;
        param.transposeB = arg.mask & 0x2;
        param.format = format;
        size_t m = arg.m, n = arg.n, k = arg.k;
        TensorShape A, B;
        if (param.transposeA) {
            A = TensorShape{k / pack_size, m / pack_size, pack_size, pack_size};
        } else {
            A = TensorShape{m / pack_size, k / pack_size, pack_size, pack_size};
        }
        if (param.transposeB) {
            B = TensorShape{n, k / pack_size, pack_size};
        } else {
            B = TensorShape{k / pack_size, n, pack_size};
        }

        checker.set_extra_opr_impl(
                std::bind(extra_impl, std::placeholders::_1, param, handle, pack_size));
        checker.set_dtype(0, Atype)
                .set_dtype(1, Btype)
                .set_dtype(2, Ctype)
                .set_epsilon(1e-3)
                .set_param(param)
                .execs({A, B, {}});
    }
}

}  // namespace

TEST_F(NAIVE, MATRIX_MUL_QUANTIZED4x4x32) {
    Checker<MatrixMul> checker(handle(), /* check_dispatch */ false);

    // Builds a Quantized4Asymm tensor from one int per element. Two
    // consecutive elements share a byte: the first in the low nibble, the
    // second in the high nibble.
    // NOTE(review): the malloc'ed buffer is not released in this test;
    // presumably Testcase takes care of it -- confirm.
    auto pack_quint4 = [](const TensorShape& shape, dtype::Quantized4Asymm qtype,
                          const std::vector<int>& values) {
        TensorND packed;
        packed.layout = {shape, qtype};
        void* storage = malloc(packed.layout.span().dist_byte());
        packed.reset_ptr(static_cast<dt_byte*>(storage));
        uint8_t* bytes = static_cast<uint8_t*>(packed.raw_ptr());
        megdnn_assert(values.size() == packed.layout.span().dist_elem());
        const size_t nr_elems = packed.layout.span().dist_elem();
        for (size_t byte_idx = 0; byte_idx * 2 < nr_elems; ++byte_idx) {
            const int low = values[byte_idx * 2];
            const int high = values[byte_idx * 2 + 1];
            bytes[byte_idx] = low | (high << 4);
        }
        return packed;
    };

    // Raw 4-bit values of the two 8x8 operands, in row-major order.
    const std::vector<int> lhs_values = {
            13, 2,  4, 13, 9,  3,  14, 14, 14, 5,  3,  3,  15,
            11, 8,  8, 5,  7,  14, 15, 8,  2,  11, 1,  15, 9,
            13, 14, 2, 3,  11, 11, 15, 10, 11, 0,  13, 12, 3,
            11, 9,  9, 10, 5,  2,  5,  8,  4,  6,  9,  0,  0,
            3,  9,  9, 8,  8,  15, 7,  5,  0,  3,  9,  10};
    const std::vector<int> rhs_values = {
            5,  14, 13, 11, 4,  7,  12, 12, 11, 7,  13, 10, 5,
            6,  4,  2,  3,  12, 2,  2,  13, 3,  14, 0,  15, 15,
            0,  2,  2,  13, 3,  14, 10, 8,  9,  11, 0,  14, 15,
            4,  14, 7,  1,  6,  13, 2,  12, 5,  2,  15, 7,  11,
            13, 9,  8,  10, 0,  11, 6,  10, 12, 2,  2,  12};
    const dtype::Quantized4Asymm input_dtype(0.3f, (uint8_t)8);

    MatrixMul::Param param;
    checker.set_param(param);
    checker.set_dtype(2, dtype::QuantizedS32(0.3f * 0.3f));
    checker.exect(
            // Inputs: A and B are given, C is left empty.
            Testcase{
                    pack_quint4({8, 8}, input_dtype, lhs_values),
                    pack_quint4({8, 8}, input_dtype, rhs_values),
                    {}},
            // Expected output: only C is given.
            Testcase{
                    {},
                    {},
                    TensorValue(
                            {8, 8}, dtype::QuantizedS32(0.3f * 0.3f),
                            {-90, 120, -3,  40,  -31, 58,  -54, 165,  -5,  -19, 71,
                             87,  -51, 24,  92,  15,  27,  62,  -59,  -82, -40, 91,
                             11,  -16, -85, 138, -18, -36, 8,   -25,  -56, 75,  -46,
                             -34, 67,  53,  -4,  -83, 111, -86, -29,  -17, 45,  -9,
                             38,  -22, -3,  -19, -17, -95, 94,  78,   63,  -35, -51,
                             21,  -63, -14, 87,  31,  44,  -53, -107, 5}),
            });
}

TEST_F(NAIVE, MATRIX_MUL_QUANTIZEDS4_4x4x16) {
    Checker<MatrixMul> checker(handle(), /* check_dispatch */ false);

    // Builds a QuantizedS4 tensor from one signed int per element. Two
    // consecutive elements share a byte: the first in the low nibble, the
    // second in the high nibble.
    // NOTE(review): the malloc'ed buffer is not released in this test;
    // presumably Testcase takes care of it -- confirm.
    auto GenTensorValueQint4 = [](const TensorShape& shape, dtype::QuantizedS4 dtype,
                                  const std::vector<int>& values) {
        TensorND tensor;
        tensor.layout = {shape, dtype};
        tensor.reset_ptr(
                static_cast<dt_byte*>(malloc(tensor.layout.span().dist_byte())));
        uint8_t* ptr = static_cast<uint8_t*>(tensor.raw_ptr());
        const size_t nr_elems = tensor.layout.span().dist_elem();
        megdnn_assert(values.size() == nr_elems);
        // Elements are consumed in pairs, so values[i + 1] must exist.
        megdnn_assert(nr_elems % 2 == 0);
        for (size_t i = 0; i < nr_elems; i += 2) {
            int val0 = values[i], val1 = values[i + 1];
            // Keep only the low nibble of each value before shifting:
            // left-shifting a negative int is undefined behaviour before
            // C++20. The stored byte is the same two's-complement pattern.
            ptr[i / 2] = static_cast<uint8_t>((val0 & 0xF) | ((val1 & 0xF) << 4));
        }
        return tensor;
    };
    using Param = MatrixMul::Param;
    Param param;
    checker.set_param(param);
    checker.set_dtype(2, dtype::QuantizedS16(0.3f * 0.3f));
    checker.exect(
            // Inputs: A and B are given, C is left empty.
            Testcase{
                    GenTensorValueQint4(
                            {8, 8}, dtype::QuantizedS4(0.3f),
                            {-8, 7, 2,  1,  2,  3, 2, 7, 2,  5,  3,  3,  7,  4, -7, 1,
                             -5, 7, -4, -1, -1, 2, 4, 1, 7,  2,  -6, -2, -6, 3, 4,  4,
                             -2, 2, 3,  0,  6,  5, 3, 4, -1, -1, -5, 5,  2,  5, 1,  4,
                             6,  2, 0,  0,  3,  2, 2, 1, -4, -3, 7,  5,  0,  3, 2,  3}),
                    GenTensorValueQint4(
                            {8, 8}, dtype::QuantizedS4(0.3f),
                            {5, -8, -7, -6, 4,  7,  -5, -5, -4, 7,  -3, -2, 5,
                             6, 4,  2,  3,  -1, 2,  2,  7,  3,  6,  0,  5,  4,
                             0, 2,  2,  3,  3,  2,  1,  -8, -7, -6, 0,  -5, -4,
                             4, -3, 7,  1,  6,  -2, 2,  -1, 5,  2,  0,  7,  6,
                             5, 4,  3,  2,  0,  0,  1,  0,  5,  2,  2,  6}),
                    {}},
            // Expected output: only C is given.
            Testcase{
                    {},
                    {},
                    TensorValue(
                            {8, 8}, dtype::QuantizedS16(0.3f * 0.3f),
                            {-60, 120, 49,  58,  58, 13,  92,  125, -5,  0,   -116,
                             -70, 22,  9,   -14, 46, -69, 111, 44,  48,  6,   19,
                             42,  57,  -8,  25,  10, 16,  26,  97,  -28, -12, -12,
                             14,  2,   26,  48,  7,  24,  93,  -2,  45,  2,   32,
                             -19, -1,  -16, 72,  23, -44, -52, -34, 45,  53,  -28,
                             6,   33,  45,  71,  84, 47,  10,  74,  61})

            });
}

TEST_F(NAIVE, MATRIX_MUL_QUANTIZED8x8x32) {
    Checker<MatrixMul> checker(handle(), /* check_dispatch */ false);
    MatrixMul::Param param;

    // Case 1: (4x7) * (7x5) with neither operand transposed. The operands use
    // different scales and zero points; the expected dtype uses the product
    // of the two scales.
    {
        param.transposeA = false;
        param.transposeB = false;
        checker.set_param(param);
        checker.exect(
                Testcase{
                        TensorValue(
                                {4, 7}, dtype::Quantized8Asymm(0.1f, (uint8_t)128),
                                {6,   97,  210, 47,  213, 246, 92,  121, 132, 133,
                                 37,  31,  87,  71,  0,   5,   198, 11,  97,  141,
                                 222, 166, 76,  212, 190, 108, 245, 143}),
                        TensorValue(
                                {7, 5}, dtype::Quantized8Asymm(0.2f, (uint8_t)233),
                                {89,  207, 79,  135, 43,  29,  235, 171, 40,  78,  119, 145,
                                 254, 162, 184, 139, 248, 214, 201, 183, 127, 75,  48,  200,
                                 96,  109, 63,  60,  100, 120, 111, 182, 150, 227, 92}),
                        {}},
                Testcase{
                        {},
                        {},
                        TensorValue(
                                {4, 5}, dtype::QuantizedS32(0.1f * 0.2f),
                                {2908,  -36975, -9180,  -3574,  8114,   30496, 23588,
                                 32433, 11467,  30974,  36748,  -6939,  26715, 33787,
                                 35329, -24486, -25049, -19828, -16627, -18972})});
    }

    // Case 2: A is stored as 2x1 and used transposed, giving a 1x1 result.
    {
        param.transposeA = true;
        checker.set_param(param);
        checker.exect(
                Testcase{
                        TensorValue(
                                {2, 1}, dtype::Quantized8Asymm(0.7f, (uint8_t)128),
                                {129, 129}),
                        TensorValue(
                                {2, 1}, dtype::Quantized8Asymm(0.4f, (uint8_t)128),
                                {129, 129}),
                        {}},
                Testcase{
                        {},
                        {},
                        TensorValue({1, 1}, dtype::QuantizedS32(0.7f * 0.4f), {2})});
    }
}

TEST_F(NAIVE, MATRIX_MUL_MK4) {
    // MK4 format with Float32 for both operands and the output.
    const auto format = param::MatrixMul::Format::MK4;
    const DType fp32 = dtype::Float32();
    run_matmul_mk_format(handle(), format, fp32, fp32, fp32);
}

TEST_F(NAIVE, MATRIX_MUL_MK8) {
    // MK8 format with Int16 operands and an Int32 output.
    const auto format = param::MatrixMul::Format::MK8;
    const DType operand_type = dtype::Int16();
    const DType result_type = dtype::Int32();
    run_matmul_mk_format(handle(), format, operand_type, operand_type, result_type);
}

TEST_F(NAIVE, MATRIX_MUL_MK4_DOT) {
    // MK4_DOT format with Int8 operands and an Int32 output.
    const auto format = param::MatrixMul::Format::MK4_DOT;
    const DType operand_type = dtype::Int8();
    const DType result_type = dtype::Int32();
    run_matmul_mk_format(handle(), format, operand_type, operand_type, result_type);
}

TEST_F(NAIVE, MATRIX_MUL_BFLOAT16) {
    using ComputeMode = param::MatrixMul::ComputeMode;
    Checker<MatrixMul> checker(handle(), /* check_dispatch */ false);

    // The reference implementation keeps the default-constructed param; the
    // checked operator starts from a copy of it.
    MatrixMul::Param fp32_param;
    MatrixMul::Param param = fp32_param;

    // All three tensors are BFloat16.
    for (size_t idx = 0; idx < 3; ++idx) {
        checker.set_dtype(idx, dtype::BFloat16());
    }

    // NOTE(review): extra_impl_helper presumably computes the reference with
    // fp32_param on this handle -- confirm against its definition.
    checker.set_extra_opr_impl(extra_impl_helper<MatrixMul>(handle(), fp32_param));
    checker.set_epsilon(1.5e-2);

    // Both inputs draw their values from the same generator.
    UniformFloatRNG input_rng{1e-2, 5.f};
    checker.set_rng(0, &input_rng);
    checker.set_rng(1, &input_rng);

    // Same 8x8 by 8x8 product, first with FLOAT32 compute mode, then DEFAULT.
    for (auto mode : {ComputeMode::FLOAT32, ComputeMode::DEFAULT}) {
        param.compute_mode = mode;
        checker.set_param(param);
        checker.execs({{8, 8}, {8, 8}, {}});
    }
}

// vim: syntax=cpp.doxygen