// OpenI / MegEngine

/**
 * \file dnn/test/cuda/dct.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#include "megdnn/oprs/nn.h"
#include "test/common/benchmarker.h"
#include "test/common/checker.h"
#include "test/common/dct_ref.h"
#include "test/common/rng.h"
#include "test/cuda/fixture.h"

namespace megdnn {
namespace test {

// Runs DctChannelSelectForward on the CUDA handle with a default-constructed
// param over a grid of source shapes. Tensor 0 is uint8 and tensors 1 and 2
// are int32; only the source shape is given, the other three shapes are left
// empty.
TEST_F(CUDA, DCT) {
    DctChannelSelectForward::Param dct_param;
    Checker<DctChannelSelectForward> checker(handle_cuda());
    for (size_t batch : {1, 3}) {
        for (size_t channels : {1, 3}) {
            for (size_t height : {8, 16, 32, 512, 1024}) {
                for (size_t width : {8, 16, 32, 64, 128, 256, 512, 1024}) {
                    // Re-apply the configuration before every run, exactly
                    // as the chained form does.
                    checker.set_param(dct_param);
                    checker.set_dtype(0, dtype::Uint8());
                    checker.set_dtype(1, dtype::Int32());
                    checker.set_dtype(2, dtype::Int32());
                    checker.execs({TensorShape{batch, channels, height, width},
                                   {},
                                   {},
                                   {}});
                }
            }
        }
    }
}

// Runs DctChannelSelectForward in NCHW4 format with a QuantizedS8(10.f)
// dtype on tensor 3, over the same shape grid as the plain DCT test.
// The comparison epsilon is set to 1.
TEST_F(CUDA, DCT_QINT8) {
    // Alias needed so that `Param::Format` below resolves; the other tests in
    // this file that spell `Param::` declare the same alias.
    using Param = DctChannelSelectForward::Param;
    Param param;
    Checker<DctChannelSelectForward> checker(handle_cuda());
    param.format = Param::Format::NCHW4;
    for (size_t n : {1, 3}) {
        for (size_t ic : {1, 3}) {
            for (size_t ih : {8, 16, 32, 512, 1024}) {
                for (size_t iw : {8, 16, 32, 64, 128, 256, 512, 1024}) {
                    checker.set_param(param)
                            .set_dtype(0, dtype::Uint8())
                            .set_dtype(1, dtype::Int32())
                            .set_dtype(2, dtype::Int32())
                            .set_dtype(3, dtype::QuantizedS8(10.f))
                            .set_epsilon(1)
                            .execs({TensorShape{n, ic, ih, iw}, {}, {}, {}});
                }
            }
        }
    }
}

// Checks the FIX_32_MASK fast implementation against a generated test case
// (gen_dct_case with arguments 3, 3, 1024, 768 and 32).
TEST_F(CUDA, DCT_WITH_FIX_32_MASK) {
    using Param = DctChannelSelectForward::Param;
    Param fix32_param;
    Checker<DctChannelSelectForward> checker(handle_cuda(), false);
    fix32_param.fastImpl = Param::FastImpl::FIX_32_MASK;

    // The generated case carries both the inputs and the expected outputs.
    auto dct_case = gen_dct_case(3, 3, 1024, 768, 32, fix32_param);
    checker.set_param(fix32_param);
    checker.exect(dct_case->testcase_in, dct_case->testcase_out);
}

// Checks the FIX_32_MASK fast implementation in NCHW4 format against a test
// case generated with a QuantizedS8(10.f) dtype; comparison epsilon is 1.
TEST_F(CUDA, DCT_WITH_FIX_32_MASK_QINT8) {
    using Param = DctChannelSelectForward::Param;
    Param fix32_nchw4_param;
    Checker<DctChannelSelectForward> checker(handle_cuda(), false);
    fix32_nchw4_param.fastImpl = Param::FastImpl::FIX_32_MASK;
    fix32_nchw4_param.format = Param::Format::NCHW4;

    auto dct_case = gen_dct_case(3, 3, 1024, 768, 32, fix32_nchw4_param,
                                 dtype::QuantizedS8(10.f));
    checker.set_param(fix32_nchw4_param);
    checker.set_epsilon(1);
    checker.exect(dct_case->testcase_in, dct_case->testcase_out);
}

// Checks DctChannelSelectForward (default param) against a hand-written
// case: a {1, 3, 8, 16} uint8 source whose three channels hold the same 128
// values, a {4} int32 tensor {0, 14, 22, 30}, a {30} int32 tensor of indices,
// and an expected {1, 30, 1, 2} float32 output.
TEST_F(CUDA, DCT_WITH_MASK) {
    Checker<DctChannelSelectForward> checker(handle_cuda(), false);
    DctChannelSelectForward::Param param;

    // Inputs: source image, the 4-entry int32 tensor, the 30-entry int32
    // tensor, and an empty placeholder in the last slot.
    Testcase inputs{TensorValue(
                            {1, 3, 8, 16}, dtype::Uint8(),
                            {109, 39,  30,  115, 71,  15,  206, 139, 221, 5,
                             18,  16,  93,  185, 99,  102, 205, 172, 191, 29,
                             185, 6,   47,  84,  0,   47,  105, 203, 251, 73,
                             196, 83,  3,   211, 32,  181, 49,  111, 114, 83,
                             148, 232, 77,  17,  35,  2,   154, 100, 41,  135,
                             141, 206, 56,  91,  137, 199, 104, 192, 75,  122,
                             78,  65,  184, 69,  91,  82,  2,   172, 194, 240,
                             49,  145, 87,  210, 97,  190, 179, 93,  125, 105,
                             181, 207, 148, 178, 133, 53,  25,  198, 238, 151,
                             14,  120, 213, 195, 145, 20,  122, 107, 217, 185,
                             65,  5,   115, 110, 82,  206, 163, 86,  2,   2,
                             44,  125, 50,  38,  41,  106, 30,  5,   151, 243,
                             238, 181, 232, 191, 161, 57,  23,  204,

                             109, 39,  30,  115, 71,  15,  206, 139, 221, 5,
                             18,  16,  93,  185, 99,  102, 205, 172, 191, 29,
                             185, 6,   47,  84,  0,   47,  105, 203, 251, 73,
                             196, 83,  3,   211, 32,  181, 49,  111, 114, 83,
                             148, 232, 77,  17,  35,  2,   154, 100, 41,  135,
                             141, 206, 56,  91,  137, 199, 104, 192, 75,  122,
                             78,  65,  184, 69,  91,  82,  2,   172, 194, 240,
                             49,  145, 87,  210, 97,  190, 179, 93,  125, 105,
                             181, 207, 148, 178, 133, 53,  25,  198, 238, 151,
                             14,  120, 213, 195, 145, 20,  122, 107, 217, 185,
                             65,  5,   115, 110, 82,  206, 163, 86,  2,   2,
                             44,  125, 50,  38,  41,  106, 30,  5,   151, 243,
                             238, 181, 232, 191, 161, 57,  23,  204,

                             109, 39,  30,  115, 71,  15,  206, 139, 221, 5,
                             18,  16,  93,  185, 99,  102, 205, 172, 191, 29,
                             185, 6,   47,  84,  0,   47,  105, 203, 251, 73,
                             196, 83,  3,   211, 32,  181, 49,  111, 114, 83,
                             148, 232, 77,  17,  35,  2,   154, 100, 41,  135,
                             141, 206, 56,  91,  137, 199, 104, 192, 75,  122,
                             78,  65,  184, 69,  91,  82,  2,   172, 194, 240,
                             49,  145, 87,  210, 97,  190, 179, 93,  125, 105,
                             181, 207, 148, 178, 133, 53,  25,  198, 238, 151,
                             14,  120, 213, 195, 145, 20,  122, 107, 217, 185,
                             65,  5,   115, 110, 82,  206, 163, 86,  2,   2,
                             44,  125, 50,  38,  41,  106, 30,  5,   151, 243,
                             238, 181, 232, 191, 161, 57,  23,  204}),
                    TensorValue({4}, dtype::Int32(), {0, 14, 22, 30}),
                    TensorValue({30}, dtype::Int32(),
                                {8,  16, 9, 2, 3, 10, 17, 24, 32, 25,
                                 18, 11, 4, 5, 0, 1,  8,  16, 9,  2,
                                 3,  10, 0, 1, 8, 16, 9,  2,  3,  10}),
                    {}};

    // Expected values: only the last slot is populated.
    Testcase expected{{},
                      {},
                      {},
                      TensorValue({1, 30, 1, 2}, dtype::Float32(),
                                  {-22.850792, -97.862236,  -101.043236,
                                   -4.727012,  28.275675,   -157.96654,
                                   42.1377,    45.06531,    -149.77373,
                                   24.487143,  -8.054966,   -13.990831,
                                   -6.9395194, -3.9211385,  64.79172,
                                   -12.363858, -47.875,     59.,
                                   56.271786,  -62.725567,  120.522675,
                                   16.559765,  85.74334,    112.904495,
                                   99.375,     29.499973,   2.0220923,
                                   -19.681704, 890.12494,   941.25,
                                   -7.0498576, 99.47632,    -22.850792,
                                   -97.862236, -101.043236, -4.727012,
                                   28.275675,  -157.96654,  42.1377,
                                   45.06531,   -149.77373,  24.487143,
                                   -8.054966,  -13.990831,  890.12494,
                                   941.25,     -7.0498576,  99.47632,
                                   -22.850792, -97.862236,  -101.043236,
                                   -4.727012,  28.275675,   -157.96654,
                                   42.1377,    45.06531,    -149.77373,
                                   24.487143,  -8.054966,   -13.990831})};

    checker.set_param(param).exect(inputs, expected);
}

// Checks DctChannelSelectForward (default param) against generated cases
// over a grid of shapes. For every shape a mask channel count in
// [1, ic * 64] is derived from a random draw and passed to gen_dct_case.
TEST_F(CUDA, DCT_WITH_MASK2) {
    Checker<DctChannelSelectForward> checker(handle_cuda(), false);
    DctChannelSelectForward::Param param;
    UniformIntRNG rng_oc(0, 3 * 64);
    for (size_t batch : {1, 3}) {
        for (size_t channels : {1, 3}) {
            for (size_t height : {8, 16, 32, 512, 1024}) {
                for (size_t width : {8, 16, 32, 64, 128, 256, 512, 1024}) {
                    const int drawn =
                            static_cast<int>(rng_oc.gen_single_val());
                    const int oc_limit = channels * 64;
                    // Fold the draw into 1..oc_limit inclusive.
                    const int mask_oc = drawn % oc_limit + 1;
                    auto dct_case = gen_dct_case(batch, channels, height,
                                                 width, mask_oc, param);
                    checker.set_param(param);
                    checker.exect(dct_case->testcase_in,
                                  dct_case->testcase_out);
                }
            }
        }
    }
}

// NCHW4 / QuantizedS8(10.f) variant of the generated-mask test: the mask
// channel count derived from the random draw is rounded up to a multiple of
// 4 before the case is generated; comparison epsilon is 1.
TEST_F(CUDA, DCT_WITH_MASK2_QINT8) {
    Checker<DctChannelSelectForward> checker(handle_cuda(), false);
    DctChannelSelectForward::Param param;
    param.format = DctChannelSelectForward::Param::Format::NCHW4;

    UniformIntRNG rng_oc(0, 3 * 64);
    for (size_t batch : {1, 3}) {
        for (size_t channels : {1, 3}) {
            for (size_t height : {8, 16, 32, 512, 1024}) {
                for (size_t width : {8, 16, 32, 64, 128, 256, 512, 1024}) {
                    const int drawn =
                            static_cast<int>(rng_oc.gen_single_val());
                    const int oc_limit = channels * 64;
                    // Fold the draw into 1..oc_limit, then round up to the
                    // next multiple of 4.
                    const int unaligned_oc = drawn % oc_limit + 1;
                    const int mask_oc = (unaligned_oc + 3) / 4 * 4;
                    auto dct_case =
                            gen_dct_case(batch, channels, height, width,
                                         mask_oc, param,
                                         dtype::QuantizedS8(10.f));
                    checker.set_param(param);
                    checker.set_epsilon(1);
                    checker.exect(dct_case->testcase_in,
                                  dct_case->testcase_out);
                }
            }
        }
    }
}
// NCHW4 / QuantizedS8(10.f) test that lets the checker generate the tensors
// itself. When the rounded mask channel count is below ic * 64, the tensors
// are constrained via gen_dct_constriant and explicit shapes are given for
// tensors 1 and 2; otherwise the constraint is cleared and only the source
// shape is supplied.
TEST_F(CUDA, DCT_WITH_MASK2_QINT8_CONSTRAINT) {
    DctChannelSelectForward::Param param;
    param.format = DctChannelSelectForward::Param::Format::NCHW4;

    // Param, dtypes and epsilon are configured once and reused by every run.
    Checker<DctChannelSelectForward> checker(handle_cuda(), false);
    checker.set_param(param);
    checker.set_dtype(0, dtype::Uint8());
    checker.set_dtype(1, dtype::Int32());
    checker.set_dtype(2, dtype::Int32());
    checker.set_dtype(3, dtype::QuantizedS8(10.f));
    checker.set_epsilon(1);

    UniformIntRNG rng_oc(0, 3 * 64);
    for (size_t n : {1, 3}) {
        for (size_t ic : {1, 3}) {
            for (size_t ih : {8, 16, 32, 512, 1024}) {
                for (size_t iw : {8, 16, 32, 64, 128, 256, 512, 1024}) {
                    const int drawn =
                            static_cast<int>(rng_oc.gen_single_val());
                    const int oc_limit = ic * 64;
                    // 1..oc_limit, rounded up to a multiple of 4.
                    const int unaligned_oc = drawn % oc_limit + 1;
                    const int mask_oc = (unaligned_oc + 3) / 4 * 4;
                    if (mask_oc >= oc_limit) {
                        // No constraint and no explicit mask shapes.
                        checker.set_tensors_constraint({}).exec(
                                {TensorShape{n, ic, ih, iw}, {}, {}, {}});
                    } else {
                        checker
                                .set_tensors_constraint(gen_dct_constriant(
                                        n, ic, ih, iw, mask_oc, param))
                                .exec({TensorShape{n, ic, ih, iw},
                                       TensorShape{ic + 1},
                                       TensorShape{static_cast<size_t>(
                                               mask_oc)},
                                       {}});
                    }
                }
            }
        }
    }
}

#if MEGDNN_WITH_BENCHMARK

// Benchmarks DctChannelSelectForward on CUDA in several configurations:
// default param on plain shapes, FIX_32_MASK vs. the default implementation
// on a generated 32-channel mask case, the NCHW4 / QuantizedS8 equivalents,
// and an NCHW4 run whose tensors come from a tensor constraint.
//
// Each run prints `computation / time_ms`, labelled "Gops", where
// computation = N * C * H * W * 32.0 * 1e-6 for the source shape.
TEST_F(CUDA, BENCHMARK_DCT) {
    using Param = DctChannelSelectForward::Param;

    // Benchmarks every shape in `shapes` with `param`; only the source shape
    // is supplied, the remaining three shapes are left empty.
    auto run = [&](const TensorShapeArray& shapes, Param param) {
        Benchmarker<DctChannelSelectForward> benchmarker(handle_cuda());
        benchmarker.set_param(param);
        benchmarker.set_dtype(0, dtype::Uint8())
                .set_dtype(1, dtype::Int32())
                .set_dtype(2, dtype::Int32());
        for (auto&& shape : shapes) {
            double computation = double(shape[0]) * shape[1] * shape[2] *
                                 shape[3] * 32.0 * 1e-6;
            auto time_ms = benchmarker.execs({shape, {}, {}, {}});
            printf("execute %s, %.4f Gops\n", shape.to_string().c_str(),
                   computation / time_ms);
        }
    };

    // Benchmarks the concrete input tensors of `testcase` with `param`.
    // The dtype of tensor 3 is taken from the testcase's expected output.
    auto run_case = [&](const DctTestcase& testcase, Param param,
                        std::string comment = "") {
        Benchmarker<DctChannelSelectForward> benchmarker(handle_cuda());
        benchmarker.set_param(param);
        benchmarker.set_dtype(0, dtype::Uint8())
                .set_dtype(1, dtype::Int32())
                .set_dtype(2, dtype::Int32())
                .set_dtype(3, testcase.testcase_out[3].layout.dtype);

        auto src_shape = testcase.testcase_in[0].layout;
        double computation = double(src_shape[0]) * src_shape[1] *
                             src_shape[2] * src_shape[3] * 32.0 * 1e-6;
        auto time_ms = benchmarker.exect(testcase.testcase_in);
        printf("[%s] execute %s, %.4f Gops\n", comment.c_str(),
               src_shape.to_string().c_str(), computation / time_ms);
    };

    // Benchmarks `shapes` with tensors produced under `constraint`.
    // `output_dtype` is applied to tensor 3. It needs a default because it
    // follows the defaulted `comment` parameter; Float32 is the output dtype
    // the unquantized DCT_WITH_MASK test in this file expects.
    auto run_case_constraint =
            [&](const Benchmarker<DctChannelSelectForward>::TensorsConstriant&
                        constraint,
                Param param, const TensorShapeArray& shapes,
                std::string comment = "",
                DType output_dtype = dtype::Float32()) {
                Benchmarker<DctChannelSelectForward> benchmarker(handle_cuda());
                benchmarker.set_param(param)
                        .set_dtype(0, dtype::Uint8())
                        .set_dtype(1, dtype::Int32())
                        .set_dtype(2, dtype::Int32())
                        .set_dtype(3, output_dtype)
                        .set_tensors_constraint(constraint);

                auto src_shape = shapes[0];
                double computation = double(src_shape[0]) * src_shape[1] *
                                     src_shape[2] * src_shape[3] * 32.0 * 1e-6;
                auto time_ms = benchmarker.exec(shapes);
                printf("[%s] execute %s, %.4f Gops\n", comment.c_str(),
                       src_shape.to_string().c_str(), computation / time_ms);
            };

    TensorShapeArray shapes = {
            {1, 3, 512, 512},
            {8, 3, 2176, 3840},
    };
    {
        // Default param on plain shapes.
        Param param;
        run(shapes, param);
    }

    Param fix_32_param;
    fix_32_param.fastImpl = Param::FastImpl::FIX_32_MASK;
    {
        // FIX_32_MASK implementation on a generated 32-channel mask case.
        auto test_case = gen_dct_case(8, 3, 2176, 3840, 32, fix_32_param);
        run_case(*test_case, fix_32_param, "FIX_32_MASK");
    }

    {
        // Same generated case, but executed with a default param.
        Param param;
        auto test_case = gen_dct_case(8, 3, 2176, 3840, 32, fix_32_param);
        run_case(*test_case, param, "MASK 32");
    }

    {
        // FIX_32_MASK in NCHW4 format with a QuantizedS8 case.
        Param fix_32_nchw4_param;
        fix_32_nchw4_param.fastImpl = Param::FastImpl::FIX_32_MASK;
        fix_32_nchw4_param.format = Param::Format::NCHW4;
        auto test_case = gen_dct_case(8, 3, 2176, 3840, 32, fix_32_nchw4_param,
                                      dtype::QuantizedS8(10.f));
        run_case(*test_case, fix_32_nchw4_param, "FIX_32_MASK QINT8");
    }

    {
        // Case generated with FIX_32_MASK, then executed with
        // FastImpl::NONE in NCHW4 format.
        Param fix_32_nchw4_param;
        fix_32_nchw4_param.fastImpl = Param::FastImpl::FIX_32_MASK;
        fix_32_nchw4_param.format = Param::Format::NCHW4;
        auto test_case = gen_dct_case(8, 3, 2176, 3840, 32, fix_32_nchw4_param,
                                      dtype::QuantizedS8(10.f));
        fix_32_nchw4_param.fastImpl = Param::FastImpl::NONE;
        run_case(*test_case, fix_32_nchw4_param, "MASK 32 QINT8");
    }

    {
        // FIX_32_MASK in NCHW4 format with constraint-generated tensors.
        Param fix_32_nchw4_param;
        fix_32_nchw4_param.fastImpl = Param::FastImpl::FIX_32_MASK;
        fix_32_nchw4_param.format = Param::Format::NCHW4;
        // Named distinctly so it does not shadow the outer `shapes`.
        TensorShapeArray constraint_shapes = {
                {8, 3, 2176, 3840}, {4}, {32}, {}};
        auto constraint =
                gen_dct_constriant(8, 3, 2176, 3840, 32, fix_32_nchw4_param);
        run_case_constraint(constraint, fix_32_nchw4_param, constraint_shapes,
                            "FIX_32_MASK QINT8 Constraint",
                            dtype::QuantizedS8(10.f));
    }
}
#endif

}  // namespace test
}  // namespace megdnn
// vim: syntax=cpp.doxygen