You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

relayout.cpp 40 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979
  1. #include "test/common/relayout.h"
  2. #include "megdnn/oprs.h"
  3. #include "test/common/checker.h"
  4. #include "test/common/rng.h"
  5. #include "test/cuda/benchmark.h"
  6. #include "test/cuda/fixture.h"
  7. using namespace megdnn;
  8. using namespace test;
namespace {
// Instantiate the shared relayout test suite (one typed test per entry in
// relayout::test_types) against the CUDA handle.
template <typename tag>
class CUDA_RELAYOUT : public CUDA {};
TYPED_TEST_CASE(CUDA_RELAYOUT, relayout::test_types);
TYPED_TEST(CUDA_RELAYOUT, run) {
    relayout::run_test<TypeParam>(this->handle_cuda());
}
}  // namespace
  17. TEST_F(CUDA, RELAYOUT_TRANSPOSE) {
  18. Checker<Relayout> checker(handle_cuda());
  19. auto run = [&](size_t batch, size_t m, size_t n, size_t c, DType dtype) {
  20. checker.set_dtype(0, dtype).set_dtype(1, dtype);
  21. TensorLayout src = {{batch, m, n, c}, dtype};
  22. src.init_contiguous_stride();
  23. TensorLayout dst = {{batch, m, n, c}, dtype};
  24. dst.stride[0] = m * n * c;
  25. dst.stride[1] = c;
  26. dst.stride[2] = m * c;
  27. dst.stride[3] = 1;
  28. checker.execl({src, dst});
  29. };
  30. run(16, 30, 40, 4, dtype::Int8());
  31. run(16, 20, 10, 4, dtype::Int8());
  32. run(1, 30, 20, 1, dtype::Int32());
  33. run(1, 20, 30, 1, dtype::Int32());
  34. run(1, 11, 21, 1, dtype::Float32());
  35. }
  36. #if MEGDNN_WITH_BENCHMARK
  37. TEST_F(CUDA, BENCHMARK_RELAYOUT_TRANSPOSE) {
  38. static constexpr size_t RUNS = 1000;
  39. CUBenchmarker<Relayout> benchmarker(handle_cuda());
  40. benchmarker.set_times(RUNS);
  41. auto run = [&](size_t batch, size_t m, size_t n, size_t c, DType dtype) {
  42. benchmarker.set_dtype(0, dtype).set_dtype(1, dtype);
  43. TensorLayout src = {{batch, m, n, c}, dtype};
  44. src.init_contiguous_stride();
  45. TensorLayout dst = {{batch, m, n, c}, dtype};
  46. dst.stride[0] = m * n * c;
  47. dst.stride[1] = c;
  48. dst.stride[2] = m * c;
  49. dst.stride[3] = 1;
  50. auto time_ms = benchmarker.execl({src, dst}) / RUNS;
  51. printf("{%zux%zux%zux%zu}->{%zux%zux%zux%zu} bandwidth: %.2f gbps\n", batch, m,
  52. n, c, batch, n, m, c,
  53. 2.f * batch * m * n * c * dtype.size() / (1e6 * time_ms));
  54. };
  55. run(16, 640, 480, 4, dtype::Int8());
  56. run(256, 224, 224, 4, dtype::Int8());
  57. run(1, 256, 224 * 224, 1, dtype::Int32());
  58. run(1, 256, 7 * 7 * 512, 1, dtype::Int32());
  59. run(1, 4096, 4096, 1, dtype::Float32());
  60. }
TEST_F(CUDA, BENCHMARK_RELAYOUT) {
    //! benchmark contiguous layout, such as (a, b, c, d) -> (b, a, c, d)
    //! just change the first two axes
    static constexpr size_t RUNS = 3;
    // Benchmark relayout where src is a dimshuffled (strided) view of a
    // contiguous tensor and dst is fully contiguous.
    auto run = [&](const TensorLayoutArray& layouts) {
        Benchmarker<Relayout> benchmarker(handle_cuda());
        benchmarker.set_times(RUNS);
        for (auto&& layout : layouts) {
            // src: same buffer, axes 0 and 1 swapped via strides only
            TensorLayout src = layout.dimshuffle({1, 0, 2});
            TensorLayout dst = layout;
            std::swap(dst.shape[0], dst.shape[1]);
            dst.init_contiguous_stride();
            auto used = benchmarker.execl({src, dst});
            // factor 2: each element is read once and written once
            printf("layout: %s bandwith: %f gbps/s\n", layout.to_string().c_str(),
                   2 * layout.total_nr_elems() * layout.dtype.size() * RUNS / used *
                           1000 / (1024 * 1024 * 1024));
        }
    };
    TensorLayoutArray layouts = {
            {{12, 23, 2}, dtype::Int32()},    {{12, 23, 8}, dtype::Int32()},
            {{12, 23, 17}, dtype::Int32()},   {{12, 23, 64}, dtype::Int32()},
            {{12, 23, 129}, dtype::Int32()},  {{12, 23, 256}, dtype::Int32()},
            {{12, 23, 1029}, dtype::Int32()}, {{12, 23, 4096}, dtype::Int32()},
            {{12, 23, 9143}, dtype::Int32()}, {{12, 23, 18284}, dtype::Int32()},
            {{2, 2, 1000000}, dtype::Int32()},
    };
    run(layouts);
    // rank-4 variant: src swaps axes 1 and 2 via dimshuffle.
    // NOTE(review): src permutes axes 1<->2 while dst swaps axes 0<->1, so
    // the two shapes can differ (only the element count matches) — presumably
    // intentional for a pure-bandwidth benchmark, but worth confirming.
    auto run2 = [&](const TensorLayoutArray& layouts) {
        Benchmarker<Relayout> benchmarker(handle_cuda());
        benchmarker.set_times(RUNS);
        for (auto&& layout : layouts) {
            TensorLayout src = layout.dimshuffle({0, 2, 1, 3});
            TensorLayout dst = layout;
            std::swap(dst.shape[0], dst.shape[1]);
            dst.init_contiguous_stride();
            auto used = benchmarker.execl({src, dst});
            printf("layout: %s bandwith: %f gbps/s\n", layout.to_string().c_str(),
                   2 * layout.total_nr_elems() * layout.dtype.size() * RUNS / used *
                           1000 / (1024 * 1024 * 1024));
        }
    };
    layouts = {
            {{3, 12, 24, 100}, dtype::Int32()},
            {{3, 12, 24, 1029}, dtype::Int32()},
            {{3, 4, 24, 9143}, dtype::Int32()},
            {{3, 4, 24, 18284}, dtype::Int32()},
    };
    run2(layouts);
}
  110. TEST_F(CUDA, BENCHMARK_RELAYOUT_LAST_CONTIG) {
  111. //! src and dst are all get subtensor in channel axis
  112. static constexpr size_t RUNS = 3;
  113. Benchmarker<Relayout> benchmarker(handle_cuda());
  114. benchmarker.set_times(RUNS);
  115. TensorLayout src =
  116. TensorLayout({5, 5, 100000}, {800000, 100000, 1}, dtype::Float32());
  117. TensorLayout dst =
  118. TensorLayout({5, 5, 100000}, {700000, 100000, 1}, dtype::Float32());
  119. auto used = benchmarker.execl({src, dst});
  120. printf("src: %s dst: %s bandwith: %f gbps/s\n", src.to_string().c_str(),
  121. dst.to_string().c_str(),
  122. 2 * src.total_nr_elems() * src.dtype.size() * RUNS / used * 1000 /
  123. (1024 * 1024 * 1024));
  124. }
  125. TEST_F(CUDA, BENCHMARK_RELAYOUT_LAST_NOT_CONTIG) {
  126. static constexpr size_t RUNS = 3;
  127. auto run = [&](TensorLayout src, TensorLayout dst) {
  128. Benchmarker<Relayout> benchmarker(handle_cuda());
  129. auto&& layout = src;
  130. benchmarker.set_times(RUNS);
  131. dst.init_contiguous_stride();
  132. auto used = benchmarker.execl({src, dst});
  133. printf("layout: %s bandwith: %f gbps/s\n", layout.to_string().c_str(),
  134. 2 * layout.total_nr_elems() * layout.dtype.size() * RUNS / used * 1000 /
  135. (1024 * 1024 * 1024));
  136. };
  137. run({{16, 128, 128}, {49152, 384, 3}, dtype::Float32()},
  138. {{16, 128, 128}, {16384, 128, 1}, dtype::Float32()});
  139. }
TEST_F(CUDA, BENCHMARK_RELAYOUT_6) {
    // Benchmark (and verify) relayout over a fixed battery of rank-2..rank-6
    // shapes, each paired positionally with a permutation in `permutations`
    // (the two arrays must stay the same length and in the same order:
    // 57 entries each, grouped in triples per permutation).
    static constexpr size_t RUNS = 3;
    auto run = [&](TensorLayoutArray layouts,
                   std::vector<std::vector<size_t>> permutations) {
        Benchmarker<Relayout> benchmarker(handle_cuda());
        benchmarker.set_times(RUNS);
        int i = 0;
        for (auto&& layout : layouts) {
            auto per = permutations[i];
            TensorLayout src = layout.dimshuffle(per);
            TensorLayout dst = layout;
            std::swap(dst.shape[0], dst.shape[1]);
            dst.init_contiguous_stride();
            auto used = benchmarker.execl({src, dst});
            // also verify the result of the case just benchmarked
            Checker<Relayout> checker(handle_cuda());
            checker.exec(TensorLayoutArray{src, dst});
            printf("layout: %s bandwith: %f gbps/s\n", layout.to_string().c_str(),
                   2 * layout.total_nr_elems() * layout.dtype.size() * RUNS / used *
                           1000 / (1024 * 1024 * 1024));
            i++;
        }
    };
    TensorLayoutArray layouts = {
            // rank 2
            {{7248, 7248}, dtype::Int32()},
            {{43408, 1216}, dtype::Int32()},
            {{1216, 43408}, dtype::Int32()},
            // rank 3
            {{368, 384, 384}, dtype::Int32()},
            {{2144, 64, 384}, dtype::Int32()},
            {{368, 64, 2307}, dtype::Int32()},
            {{384, 384, 355}, dtype::Int32()},
            {{2320, 384, 59}, dtype::Int32()},
            {{384, 2320, 59}, dtype::Int32()},
            {{384, 355, 384}, dtype::Int32()},
            {{2320, 59, 384}, dtype::Int32()},
            {{384, 59, 2320}, dtype::Int32()},
            // rank 4
            {{80, 96, 75, 96}, dtype::Int32()},
            {{464, 16, 75, 96}, dtype::Int32()},
            {{80, 16, 75, 582}, dtype::Int32()},
            {{96, 75, 96, 75}, dtype::Int32()},
            {{608, 12, 96, 75}, dtype::Int32()},
            {{96, 12, 608, 75}, dtype::Int32()},
            {{96, 75, 96, 75}, dtype::Int32()},
            {{608, 12, 96, 75}, dtype::Int32()},
            {{96, 12, 608, 75}, dtype::Int32()},
            {{96, 96, 75, 75}, dtype::Int32()},
            {{608, 96, 12, 75}, dtype::Int32()},
            {{96, 608, 12, 75}, dtype::Int32()},
            {{96, 75, 75, 96}, dtype::Int32()},
            {{608, 12, 75, 96}, dtype::Int32()},
            {{96, 12, 75, 608}, dtype::Int32()},
            // rank 5
            {{32, 48, 28, 28, 48}, dtype::Int32()},
            {{176, 8, 28, 28, 48}, dtype::Int32()},
            {{32, 8, 28, 28, 298}, dtype::Int32()},
            {{48, 28, 28, 48, 28}, dtype::Int32()},
            {{352, 4, 28, 48, 28}, dtype::Int32()},
            {{48, 4, 28, 352, 28}, dtype::Int32()},
            {{48, 28, 48, 28, 28}, dtype::Int32()},
            {{352, 4, 48, 28, 28}, dtype::Int32()},
            {{48, 4, 352, 28, 28}, dtype::Int32()},
            {{48, 48, 28, 28, 28}, dtype::Int32()},
            {{352, 48, 4, 28, 28}, dtype::Int32()},
            {{48, 352, 4, 28, 28}, dtype::Int32()},
            {{48, 28, 28, 28, 48}, dtype::Int32()},
            {{352, 4, 28, 28, 48}, dtype::Int32()},
            {{48, 4, 28, 28, 352}, dtype::Int32()},
            // rank 6
            {{16, 32, 15, 32, 15, 15}, dtype::Int32()},
            {{48, 10, 15, 32, 15, 15}, dtype::Int32()},
            {{16, 10, 15, 103, 15, 15}, dtype::Int32()},
            {{32, 15, 15, 32, 15, 15}, dtype::Int32()},
            {{112, 5, 15, 32, 15, 15}, dtype::Int32()},
            {{32, 5, 15, 112, 15, 15}, dtype::Int32()},
            {{32, 15, 32, 15, 15, 15}, dtype::Int32()},
            {{112, 5, 32, 15, 15, 15}, dtype::Int32()},
            {{32, 5, 112, 15, 15, 15}, dtype::Int32()},
            {{32, 15, 15, 32, 15, 15}, dtype::Int32()},
            {{112, 5, 15, 32, 15, 15}, dtype::Int32()},
            {{32, 5, 15, 112, 15, 15}, dtype::Int32()},
            {{32, 15, 15, 15, 15, 32}, dtype::Int32()},
            {{112, 5, 15, 15, 15, 32}, dtype::Int32()},
            {{32, 5, 15, 15, 15, 112}, dtype::Int32()},
    };
    // One permutation per layout above; each distinct permutation is applied
    // to three consecutive layouts.
    std::vector<std::vector<size_t>> permutations = {
            std::vector<size_t>{1, 0},
            std::vector<size_t>{1, 0},
            std::vector<size_t>{1, 0},
            std::vector<size_t>{0, 2, 1},
            std::vector<size_t>{0, 2, 1},
            std::vector<size_t>{0, 2, 1},
            std::vector<size_t>{1, 0, 2},
            std::vector<size_t>{1, 0, 2},
            std::vector<size_t>{1, 0, 2},
            std::vector<size_t>{2, 1, 0},
            std::vector<size_t>{2, 1, 0},
            std::vector<size_t>{2, 1, 0},
            std::vector<size_t>{0, 3, 2, 1},
            std::vector<size_t>{0, 3, 2, 1},
            std::vector<size_t>{0, 3, 2, 1},
            std::vector<size_t>{2, 1, 3, 0},
            std::vector<size_t>{2, 1, 3, 0},
            std::vector<size_t>{2, 1, 3, 0},
            std::vector<size_t>{2, 0, 3, 1},
            std::vector<size_t>{2, 0, 3, 1},
            std::vector<size_t>{2, 0, 3, 1},
            std::vector<size_t>{1, 0, 3, 2},
            std::vector<size_t>{1, 0, 3, 2},
            std::vector<size_t>{1, 0, 3, 2},
            std::vector<size_t>{3, 2, 1, 0},
            std::vector<size_t>{3, 2, 1, 0},
            std::vector<size_t>{3, 2, 1, 0},
            std::vector<size_t>{0, 4, 2, 1, 3},
            std::vector<size_t>{0, 4, 2, 1, 3},
            std::vector<size_t>{0, 4, 2, 1, 3},
            std::vector<size_t>{3, 2, 1, 4, 0},
            std::vector<size_t>{3, 2, 1, 4, 0},
            std::vector<size_t>{3, 2, 1, 4, 0},
            std::vector<size_t>{2, 0, 4, 1, 3},
            std::vector<size_t>{2, 0, 4, 1, 3},
            std::vector<size_t>{2, 0, 4, 1, 3},
            std::vector<size_t>{1, 3, 0, 4, 2},
            std::vector<size_t>{1, 3, 0, 4, 2},
            std::vector<size_t>{1, 3, 0, 4, 2},
            std::vector<size_t>{4, 3, 2, 1, 0},
            std::vector<size_t>{4, 3, 2, 1, 0},
            std::vector<size_t>{4, 3, 2, 1, 0},
            std::vector<size_t>{0, 3, 2, 5, 4, 1},
            std::vector<size_t>{0, 3, 2, 5, 4, 1},
            std::vector<size_t>{0, 3, 2, 5, 4, 1},
            std::vector<size_t>{3, 2, 0, 5, 1, 4},
            std::vector<size_t>{3, 2, 0, 5, 1, 4},
            std::vector<size_t>{3, 2, 0, 5, 1, 4},
            std::vector<size_t>{2, 0, 4, 1, 5, 3},
            std::vector<size_t>{2, 0, 4, 1, 5, 3},
            std::vector<size_t>{2, 0, 4, 1, 5, 3},
            std::vector<size_t>{3, 2, 5, 1, 0, 4},
            std::vector<size_t>{3, 2, 5, 1, 0, 4},
            std::vector<size_t>{3, 2, 5, 1, 0, 4},
            std::vector<size_t>{5, 4, 3, 2, 1, 0},
            std::vector<size_t>{5, 4, 3, 2, 1, 0},
            std::vector<size_t>{5, 4, 3, 2, 1, 0}};
    run(layouts, permutations);
}
  281. TEST_F(CUDA, BENCHMARK_RELAYOUT_7) {
  282. static constexpr size_t RUNS = 3;
  283. auto isTrivial = [&](std::vector<size_t>& permutation) {
  284. for (size_t i = 0; i < permutation.size(); i++) {
  285. if (permutation[i] != i)
  286. return false;
  287. }
  288. return true;
  289. };
  290. auto run = [&](TensorLayout layout, std::vector<size_t> per) {
  291. Benchmarker<Relayout> benchmarker(handle_cuda());
  292. benchmarker.set_times(RUNS);
  293. TensorLayout src = layout.dimshuffle(per);
  294. TensorLayout dst = layout;
  295. std::swap(dst.shape[0], dst.shape[1]);
  296. dst.init_contiguous_stride();
  297. auto used = benchmarker.execl({src, dst});
  298. Checker<Relayout> checker(handle_cuda());
  299. checker.exec(TensorLayoutArray{src, dst});
  300. printf("layout: %s bandwith: %f gbps/s\n", layout.to_string().c_str(),
  301. 2 * layout.total_nr_elems() * layout.dtype.size() * RUNS / used * 1000 /
  302. (1024 * 1024 * 1024));
  303. };
  304. std::vector<size_t> _dim = {5, 3, 2, 4, 35, 33, 37};
  305. std::vector<size_t> permutation(7);
  306. // Inverse
  307. for (size_t r = 0; r < _dim.size(); r++) {
  308. size_t size = _dim.size();
  309. permutation[r] = size - 1 - r;
  310. }
  311. run({{_dim[0], _dim[1], _dim[2], _dim[3], _dim[4], _dim[5], _dim[6]},
  312. dtype::Int32()},
  313. permutation);
  314. // Random
  315. for (size_t r = 0; r < _dim.size(); r++)
  316. permutation[r] = r;
  317. for (int nsample = 0; nsample < 50; nsample++) {
  318. COMPAT_RANDOM(_dim.begin(), _dim.end());
  319. COMPAT_RANDOM(permutation.begin(), permutation.end());
  320. if (!isTrivial(permutation)) {
  321. run({{_dim[0], _dim[1], _dim[2], _dim[3], _dim[4], _dim[5], _dim[6]},
  322. dtype::Int32()},
  323. permutation);
  324. }
  325. }
  326. }
TEST_F(CUDA, BENCHMARK_RELAYOUT_5) {
    // Benchmark rank-5 relayout on randomly generated shapes: the element
    // count is drawn around numElemAvg and dimension sizes follow a linear
    // ramp so that (before shuffling) the last dim is ~`ratio`x the first.
    static constexpr size_t RUNS = 10;
    // true iff `permutation` is the identity mapping
    auto isTrivial = [&](std::vector<size_t>& permutation) {
        for (size_t i = 0; i < permutation.size(); i++) {
            if (permutation[i] != i)
                return false;
        }
        return true;
    };
    auto run = [&](TensorLayout layout, std::vector<size_t> per) {
        CUBenchmarker<Relayout> benchmarker(handle_cuda());
        benchmarker.set_times(RUNS);
        TensorLayout src = layout.dimshuffle(per);
        TensorLayout dst = layout;
        // std::swap(dst.shape[0], dst.shape[1]);
        dst.init_contiguous_stride();
        auto used = benchmarker.execl({src, dst});
        // also verify correctness of the benchmarked case
        Checker<Relayout> checker(handle_cuda());
        checker.exec(TensorLayoutArray{src, dst});
        printf("layout: %s bandwith: %f gbps/s\n", layout.to_string().c_str(),
               2 * layout.total_nr_elems() * layout.dtype.size() * RUNS / used * 1000 /
                       (1024 * 1024 * 1024));
    };
    size_t two = 2;  // lower bound for every dimension
    int ratio = 5;
    int numElemAvg = 1000000 * 200;
    UniformFloatRNG numElem_dist((double)numElemAvg, (double)numElemAvg * 0.2);
    for (int rank = 5; rank <= 5; rank++) {
        for (int iter = 0; iter < 20; iter++) {
            int numElem = (int)numElem_dist.gen_single_val();
            std::vector<size_t> dim(rank);
            std::vector<size_t> permutation(rank);
            std::vector<double> dimf(rank);
            double volf = 1.0;
            // dimf ramps linearly from 1 to `ratio`; volf is its product
            for (int r = 0; r < rank; r++) {
                permutation[r] = (size_t)r;
                dimf[r] = 1.0 + (double)r * (ratio - 1.0) / (double)(rank - 1);
                volf *= dimf[r];
            }
            // fprintf(stderr, "volf %lf\n", volf);
            // scale the ramp so the product approximates numElem
            double scale = pow((double)numElem / volf, 1.0 / (double)rank);
            // fprintf(stderr, "scale %lf\n", scale);
            int vol = 1;
            for (int r = 0; r < rank; r++) {
                if (r == rank - 1) {
                    // force the last dim to `ratio` times the first
                    // (dim[0] was already set in the r == 0 iteration)
                    dim[r] = ratio * dim[0];
                } else {
                    dim[r] = (size_t)round(dimf[r] * scale);
                }
                dim[r] = std::max(two, dim[r]);
                vol *= dim[r];
            }
            // fprintf(stderr, "dim[0] %lf\n", dim[0]);
            double cur_ratio = (double)dim[rank - 1] / (double)dim[0];
            double vol_re = fabs((double)(vol - numElem) / (double)numElem);
            // Fix dimensions if volume is off by more than 5%
            if (vol_re > 0.05) {
                // d is 1, or SIZE_MAX acting as -1 via unsigned wraparound:
                // dim[r] + d then effectively decrements dim[r] by one
                size_t d = (vol < numElem) ? 1 : -1;
                int r = 1;
                while (vol_re > 0.05 && r < rank) {
                    size_t dim_plus_d = std::max(two, dim[r] + d);
                    vol = (vol / dim[r]) * dim_plus_d;
                    dim[r] = dim_plus_d;
                    vol_re = fabs((double)(vol - numElem) / (double)numElem);
                    r++;
                }
            }
            size_t minDim = *(std::min_element(dim.begin(), dim.end()));
            size_t maxDim = *(std::max_element(dim.begin(), dim.end()));
            cur_ratio = (double)maxDim / (double)minDim;
            printf("vol %d cur_ratio %lf | %lf\n", vol, cur_ratio, vol_re);
            // printVec(dim);
            COMPAT_RANDOM(dim.begin(), dim.end());
            // resample until the permutation is not the identity
            while (isTrivial(permutation)) {
                COMPAT_RANDOM(permutation.begin(), permutation.end());
            }
            run({{dim[0], dim[1], dim[2], dim[3], dim[4]}, dtype::Int32()},
                permutation);
            // if (!bench_tensor<T>(dim, permutation)) return false;
        }
    }
}
  409. TEST_F(CUDA, BENCHMARK_RELAYOUT_NCHW_NCHW4) {
  410. static constexpr size_t RUNS = 10;
  411. auto run = [&](TensorLayout layout, std::vector<size_t> per) {
  412. CUBenchmarker<Relayout> benchmarker(handle_cuda());
  413. benchmarker.set_times(RUNS);
  414. TensorLayout src = layout.dimshuffle(per);
  415. TensorLayout dst = layout;
  416. dst.init_contiguous_stride();
  417. auto used = benchmarker.execl({src, dst});
  418. Checker<Relayout> checker(handle_cuda());
  419. checker.exec(TensorLayoutArray{src, dst});
  420. printf("layout: %s bandwith: %f gbps/s\n", layout.to_string().c_str(),
  421. 2 * layout.total_nr_elems() * layout.dtype.size() * RUNS / used * 1000 /
  422. (1024 * 1024 * 1024));
  423. };
  424. UniformIntRNG u(2, 100);
  425. printf("NCHW->NCHW4\n");
  426. for (int i = 0; i < 20; i++) {
  427. int d1 = u.gen_single_val();
  428. int d2 = (u.gen_single_val() / 4 + 1) * 4;
  429. int d3 = 4;
  430. // int d4=(u.gen_single_val()/4+1)*4;
  431. int d4 = (u.gen_single_val());
  432. int d5 = (u.gen_single_val());
  433. // int d5=(u.gen_single_val()/4+1)*4;
  434. // int d5 = (u.gen_single_val())*2+1;
  435. run({{(size_t)d1, (size_t)d2 / 4, (size_t)d3, (size_t)d4, (size_t)d5},
  436. {d2 * d3 * d4 * d5 / 4, d3 * d4 * d5, d4 * d5, d5, 1},
  437. dtype::Int8()},
  438. {0, 1, 3, 4, 2});
  439. }
  440. printf("\n\nNCHW4->NCHW\n");
  441. for (int i = 0; i < 20; i++) {
  442. int d1 = u.gen_single_val();
  443. int d2 = (u.gen_single_val() / 4 + 1) * 4;
  444. int d3 = u.gen_single_val();
  445. // int d5=(u.gen_single_val()/4+1)*4;
  446. int d4 = u.gen_single_val();
  447. int d5 = 4;
  448. run({{(size_t)d1, (size_t)d2 / 4, (size_t)d3, (size_t)d4, (size_t)d5},
  449. {d2 * d3 * d4 * d5 / 4, d3 * d4 * d5, d4 * d5, d5, 1},
  450. dtype::Int8()},
  451. {0, 1, 4, 2, 3});
  452. }
  453. }
  454. TEST_F(CUDA, BENCHMARK_RELAYOUT_NCHW4_NCHW32) {
  455. static constexpr size_t RUNS = 10;
  456. auto run = [&](TensorLayout layout, std::vector<size_t> per) {
  457. CUBenchmarker<Relayout> benchmarker(handle_cuda());
  458. benchmarker.set_times(RUNS);
  459. TensorLayout src = layout.dimshuffle(per);
  460. TensorLayout dst = layout;
  461. dst.init_contiguous_stride();
  462. auto used = benchmarker.execl({src, dst});
  463. Checker<Relayout> checker(handle_cuda());
  464. checker.exec(TensorLayoutArray{src, dst});
  465. printf("layout: %s bandwith: %f gbps/s\n", layout.to_string().c_str(),
  466. 2 * layout.total_nr_elems() * layout.dtype.size() * RUNS / used * 1000 /
  467. (1024 * 1024 * 1024));
  468. };
  469. UniformIntRNG u(4, 50);
  470. printf("NCHW4 to NCHW32\n");
  471. for (int i = 0; i < 20; i++) {
  472. int d1 = u.gen_single_val();
  473. int d2 = (u.gen_single_val() / 8 + 1) * 8;
  474. int d3 = 8;
  475. int d4 = u.gen_single_val();
  476. int d5 = u.gen_single_val();
  477. int d6 = 4;
  478. run({{(size_t)d1, (size_t)d2 / 8, (size_t)d3, (size_t)d4, (size_t)d5,
  479. (size_t)d6},
  480. {d2 * d3 * d4 * d5 * d6 / 8, d3 * d4 * d5 * d6, d4 * d5 * d6, d5 * d6, d6,
  481. 1},
  482. dtype::Int8()},
  483. {0, 1, 3, 4, 2, 5});
  484. }
  485. printf("\n\nNCHW32 to NCHW4\n");
  486. for (int i = 0; i < 20; i++) {
  487. int d1 = u.gen_single_val();
  488. int d2 = (u.gen_single_val() / 8 + 1) * 8;
  489. int d3 = u.gen_single_val();
  490. int d4 = u.gen_single_val();
  491. int d5 = 8;
  492. int d6 = 4;
  493. run({{(size_t)d1, (size_t)d2 / 8, (size_t)d3, (size_t)d4, (size_t)d5,
  494. (size_t)d6},
  495. {d2 * d3 * d4 * d5 * d6 / 8, d3 * d4 * d5 * d6, d4 * d5 * d6, d5 * d6, d6,
  496. 1},
  497. dtype::Int8()},
  498. {0, 1, 4, 2, 3, 5});
  499. }
  500. }
TEST_F(CUDA, BENCHMARK_LAST_CONTIG_ALIGN_TEST) {
    // Benchmark rank-6 int8 relayout where the last (contiguous) axis is
    // sometimes forced to a multiple of 4, comparing aligned vs unaligned
    // innermost-axis sizes.
    static constexpr size_t RUNS = 10;
    auto run = [&](TensorLayout layout, std::vector<size_t> per) {
        CUBenchmarker<Relayout> benchmarker(handle_cuda());
        benchmarker.set_times(RUNS);
        TensorLayout src = layout.dimshuffle(per);
        TensorLayout dst = layout;
        // std::swap(dst.shape[0], dst.shape[1]);
        dst.init_contiguous_stride();
        auto used = benchmarker.execl({src, dst});
        // also verify correctness of the benchmarked case
        Checker<Relayout> checker(handle_cuda());
        checker.exec(TensorLayoutArray{src, dst});
        printf("layout: %s bandwith: %f gbps/s\n", layout.to_string().c_str(),
               2 * layout.total_nr_elems() * layout.dtype.size() * RUNS / used * 1000 /
                       (1024 * 1024 * 1024));
    };
    UniformIntRNG u(4, 50);
    std::vector<size_t> _dim(6);
    std::vector<size_t> permutation(_dim.size());
    // start from the full-reverse permutation
    for (size_t r = 0; r < _dim.size(); r++) {
        size_t size = _dim.size();
        permutation[r] = size - 1 - r;
    }
    _dim[0] = u.gen_single_val();
    _dim[1] = u.gen_single_val();
    _dim[2] = u.gen_single_val();
    _dim[3] = u.gen_single_val();
    _dim[4] = u.gen_single_val();
    _dim[5] = (u.gen_single_val() / 4 + 1) * 4;  // 4-aligned last axis
    run({{_dim[0], _dim[1], _dim[2], _dim[3], _dim[4], _dim[5]}, dtype::Int8()},
        permutation);
    // Random: shuffle all but the last axis / last permutation slot so the
    // innermost axis stays innermost (and contiguous) in src and dst
    for (size_t r = 0; r < _dim.size(); r++)
        permutation[r] = r;
    for (int nsample = 0; nsample < 20; nsample++) {
        COMPAT_RANDOM(_dim.begin(), _dim.end() - 1);
        COMPAT_RANDOM(permutation.begin(), permutation.end() - 1);
        if (nsample < 5)
            _dim[5] = (u.gen_single_val() / 4 + 1) * 4;  // aligned cases
        else
            _dim[5] = u.gen_single_val();  // possibly unaligned
        run({{_dim[0], _dim[1], _dim[2], _dim[3], _dim[4], _dim[5]}, dtype::Int8()},
            permutation);
    }
}
  546. #endif
TEST_F(CUDA, RELAYOUT) {
    // Correctness test over src/dst layout pairs covering contiguous,
    // strided, interleaved, transposed and collapsible cases (float16/int32).
    struct Arg {
        TensorLayout src, dst;
        Arg(TensorLayout src, TensorLayout dst) : src(src), dst(dst) {}
    };
    std::vector<Arg> args;
    {
        // contiguous stride
        args.emplace_back(
                TensorLayout({4, 3, 2}, {2, 8, 1}, dtype::Float16()),
                TensorLayout({4, 3, 2}, {6, 2, 1}, dtype::Float16()));
        args.emplace_back(
                TensorLayout({4, 3, 2}, {6, 2, 1}, dtype::Float16()),
                TensorLayout({4, 3, 2}, {2, 8, 1}, dtype::Float16()));
        args.emplace_back(
                TensorLayout({2, 4, 3, 5}, {60, 5, 20, 1}, dtype::Float16()),
                TensorLayout({2, 4, 3, 5}, {60, 15, 5, 1}, dtype::Float16()));
    }
    // interleaved layouts: innermost stride 2 or 3 (elements with gaps)
    args.emplace_back(
            TensorLayout({2, 3, 4, 5}, {60, 20, 5, 1}, dtype::Float16()),
            TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Float16()));
    args.emplace_back(
            TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Float16()),
            TensorLayout({2, 3, 4, 5}, {60, 20, 5, 1}, dtype::Float16()));
    args.emplace_back(
            TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Float16()),
            TensorLayout({2, 3, 4, 5}, {180, 60, 15, 3}, dtype::Float16()));
    args.emplace_back(
            TensorLayout({2, 3, 4, 5}, {60, 20, 5, 1}, dtype::Int32()),
            TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Int32()));
    args.emplace_back(
            TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Int32()),
            TensorLayout({2, 3, 4, 5}, {60, 20, 5, 1}, dtype::Int32()));
    args.emplace_back(
            TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Int32()),
            TensorLayout({2, 3, 4, 5}, {180, 60, 15, 3}, dtype::Int32()));
    // last axis not contiguous in src, contiguous in dst
    args.emplace_back(
            TensorLayout({16, 128, 128}, {49152, 384, 3}, dtype::Float32()),
            TensorLayout({16, 128, 128}, {16384, 128, 1}, dtype::Float32()));
    {
        // 1d
        size_t n = 10000;
        args.emplace_back(
                TensorLayout({n}, {1}, dtype::Int32()),
                TensorLayout({n}, {1}, dtype::Int32()));
        args.emplace_back(
                TensorLayout({n}, {1}, dtype::Int32()),
                TensorLayout({n}, {2}, dtype::Int32()));
        args.emplace_back(
                TensorLayout({n}, {2}, dtype::Int32()),
                TensorLayout({n}, {1}, dtype::Int32()));
        args.emplace_back(
                TensorLayout({n}, {2}, dtype::Int32()),
                TensorLayout({n}, {2}, dtype::Int32()));
    }
    {
        // 2d: row-major vs column-major, with and without interleaving;
        // k2 + 1 makes the row stride non-collapsible with the column stride
        size_t m = 200, n = 300, k = 400;
        ptrdiff_t k2 = k * 2;
        args.emplace_back(
                TensorLayout({m, n}, {k2, 2}, dtype::Int32()),
                TensorLayout({m, n}, {k2 + 1, 2}, dtype::Int32()));
        args.emplace_back(
                TensorLayout({m, n}, {2, k2}, dtype::Int32()),
                TensorLayout({m, n}, {2, k2 + 1}, dtype::Int32()));
        args.emplace_back(
                TensorLayout({m, n}, {2, k2}, dtype::Int32()),
                TensorLayout({m, n}, {k2 + 1, 2}, dtype::Int32()));
        args.emplace_back(
                TensorLayout({m, n}, {k2, 2}, dtype::Int32()),
                TensorLayout({m, n}, {2, k2 + 1}, dtype::Int32()));
        args.emplace_back(
                TensorLayout({m, n}, {k2, 1}, dtype::Int32()),
                TensorLayout({m, n}, {k2 + 1, 1}, dtype::Int32()));
        args.emplace_back(
                TensorLayout({m, n}, {1, k2}, dtype::Int32()),
                TensorLayout({m, n}, {1, k2 + 1}, dtype::Int32()));
        args.emplace_back(
                TensorLayout({m, n}, {1, k2}, dtype::Int32()),
                TensorLayout({m, n}, {k2 + 1, 1}, dtype::Int32()));
        args.emplace_back(
                TensorLayout({m, n}, {k2, 1}, dtype::Int32()),
                TensorLayout({m, n}, {1, k2 + 1}, dtype::Int32()));
    }
    {
        // 3d
        size_t m = 20, n = 30, k = 40;
        ptrdiff_t k2 = k;
        args.emplace_back(
                TensorLayout({m, n, k}, {k2 * k2 * 4, k2 * 3, 2}, dtype::Int32()),
                TensorLayout(
                        {m, n, k}, {2 * k2 * k2 * k2 * 4, k2 * 3, 2}, dtype::Int32()));
    }
    {
        // simplify_layout
        // 234..56
        // 2..3456
        args.emplace_back(
                TensorLayout(
                        {2, 3, 4, 5, 6},
                        {2 * 3 * 4 * 5 * 6, 2 * 4 * 5 * 6, 2 * 5 * 6, 6, 1},
                        dtype::Int32()),
                TensorLayout(
                        {2, 3, 4, 5, 6}, {4 * 3 * 4 * 5 * 6, 4 * 5 * 6, 5 * 6, 6, 1},
                        dtype::Int32()));
    }
    Checker<Relayout> checker(handle_cuda());
    for (auto&& arg : args) {
        checker.exec(TensorLayoutArray{arg.src, arg.dst});
    }
}
  658. TEST_F(CUDA, TRANSPOSE_INT8) {
  659. auto run = [&](TensorLayout layout, std::vector<size_t> per) {
  660. TensorLayout src = layout.dimshuffle(per);
  661. TensorLayout dst = layout;
  662. dst.init_contiguous_stride();
  663. Checker<Relayout> checker(handle_cuda());
  664. checker.exec(TensorLayoutArray{src, dst});
  665. };
  666. //! for last contig(NCHW4<->NCHW32)
  667. run({{5, 8, 4, 3, 8}, dtype::Int8()}, {1, 3, 0, 2, 4});
  668. run({{5, 8, 4, 3, 5}, dtype::Int8()}, {1, 3, 0, 2, 4});
  669. run({{5, 8, 4, 3, 64}, dtype::Int8()}, {1, 3, 0, 2, 4});
  670. //! for last no contig(NCHW->NCHW4)
  671. run({{7, 4, 32}, dtype::Int8()}, {2, 0, 1});
  672. run({{7, 4, 64}, dtype::Int8()}, {2, 0, 1});
  673. run({{7, 4, 7}, dtype::Int8()}, {2, 0, 1});
  674. //! for copy
  675. run({{2, 3, 4, 5, 6},
  676. {2 * 3 * 4 * 5 * 6, 2 * 4 * 5 * 6, 2 * 5 * 6, 6, 1},
  677. dtype::Int8()},
  678. {0, 1, 2, 3, 4});
  679. }
  680. TEST_F(CUDA, RELAYOUT_INT8) {
  681. struct Arg {
  682. TensorLayout src, dst;
  683. Arg(TensorLayout src, TensorLayout dst) : src(src), dst(dst) {}
  684. };
  685. std::vector<Arg> args;
  686. {
  687. // contiguous stride
  688. args.emplace_back(
  689. TensorLayout({4, 3, 2}, {2, 8, 1}, dtype::Int8()),
  690. TensorLayout({4, 3, 2}, {6, 2, 1}, dtype::Int8()));
  691. args.emplace_back(
  692. TensorLayout({4, 3, 2}, {6, 2, 1}, dtype::Int8()),
  693. TensorLayout({4, 3, 2}, {2, 8, 1}, dtype::Int8()));
  694. args.emplace_back(
  695. TensorLayout({2, 4, 3, 5}, {60, 5, 20, 1}, dtype::Int8()),
  696. TensorLayout({2, 4, 3, 5}, {60, 15, 5, 1}, dtype::Int8()));
  697. }
  698. args.emplace_back(
  699. TensorLayout({2, 3, 4, 5}, {60, 20, 5, 1}, dtype::Int8()),
  700. TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Int8()));
  701. args.emplace_back(
  702. TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Int8()),
  703. TensorLayout({2, 3, 4, 5}, {60, 20, 5, 1}, dtype::Int8()));
  704. args.emplace_back(
  705. TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Int8()),
  706. TensorLayout({2, 3, 4, 5}, {180, 60, 15, 3}, dtype::Int8()));
  707. args.emplace_back(
  708. TensorLayout({2, 3, 4, 5}, {60, 20, 5, 1}, dtype::Int8()),
  709. TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Int8()));
  710. args.emplace_back(
  711. TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Int8()),
  712. TensorLayout({2, 3, 4, 5}, {60, 20, 5, 1}, dtype::Int8()));
  713. args.emplace_back(
  714. TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Int8()),
  715. TensorLayout({2, 3, 4, 5}, {180, 60, 15, 3}, dtype::Int8()));
  716. args.emplace_back(
  717. TensorLayout({16, 128, 128}, {49152, 384, 3}, dtype::Int8()),
  718. TensorLayout({16, 128, 128}, {16384, 128, 1}, dtype::Int8()));
  719. {
  720. // 1d
  721. size_t n = 10000;
  722. args.emplace_back(
  723. TensorLayout({n}, {1}, dtype::Int8()),
  724. TensorLayout({n}, {1}, dtype::Int8()));
  725. args.emplace_back(
  726. TensorLayout({n}, {1}, dtype::Int8()),
  727. TensorLayout({n}, {2}, dtype::Int8()));
  728. args.emplace_back(
  729. TensorLayout({n}, {2}, dtype::Int8()),
  730. TensorLayout({n}, {1}, dtype::Int8()));
  731. args.emplace_back(
  732. TensorLayout({n}, {2}, dtype::Int8()),
  733. TensorLayout({n}, {2}, dtype::Int8()));
  734. }
  735. {
  736. // 2d
  737. size_t m = 200, n = 300, k = 400;
  738. ptrdiff_t k2 = k * 2;
  739. args.emplace_back(
  740. TensorLayout({m, n}, {k2, 2}, dtype::Int8()),
  741. TensorLayout({m, n}, {k2 + 1, 2}, dtype::Int8()));
  742. args.emplace_back(
  743. TensorLayout({m, n}, {2, k2}, dtype::Int8()),
  744. TensorLayout({m, n}, {2, k2 + 1}, dtype::Int8()));
  745. args.emplace_back(
  746. TensorLayout({m, n}, {2, k2}, dtype::Int8()),
  747. TensorLayout({m, n}, {k2 + 1, 2}, dtype::Int8()));
  748. args.emplace_back(
  749. TensorLayout({m, n}, {k2, 2}, dtype::Int8()),
  750. TensorLayout({m, n}, {2, k2 + 1}, dtype::Int8()));
  751. args.emplace_back(
  752. TensorLayout({m, n}, {k2, 1}, dtype::Int8()),
  753. TensorLayout({m, n}, {k2 + 1, 1}, dtype::Int8()));
  754. args.emplace_back(
  755. TensorLayout({m, n}, {1, k2}, dtype::Int8()),
  756. TensorLayout({m, n}, {1, k2 + 1}, dtype::Int8()));
  757. args.emplace_back(
  758. TensorLayout({m, n}, {1, k2}, dtype::Int8()),
  759. TensorLayout({m, n}, {k2 + 1, 1}, dtype::Int8()));
  760. args.emplace_back(
  761. TensorLayout({m, n}, {k2, 1}, dtype::Int8()),
  762. TensorLayout({m, n}, {1, k2 + 1}, dtype::Int8()));
  763. }
  764. {
  765. // 3d
  766. size_t m = 20, n = 30, k = 40;
  767. ptrdiff_t k2 = k;
  768. args.emplace_back(
  769. TensorLayout({m, n, k}, {k2 * k2 * 4, k2 * 3, 2}, dtype::Int8()),
  770. TensorLayout(
  771. {m, n, k}, {2 * k2 * k2 * k2 * 4, k2 * 3, 2}, dtype::Int8()));
  772. }
  773. {
  774. // simplify_layout
  775. // 234..56
  776. // 2..3456
  777. args.emplace_back(
  778. TensorLayout(
  779. {2, 3, 4, 5, 6},
  780. {2 * 3 * 4 * 5 * 6, 2 * 4 * 5 * 6, 2 * 5 * 6, 6, 1},
  781. dtype::Int8()),
  782. TensorLayout(
  783. {2, 3, 4, 5, 6}, {4 * 3 * 4 * 5 * 6, 4 * 5 * 6, 5 * 6, 6, 1},
  784. dtype::Int8()));
  785. args.emplace_back(
  786. TensorLayout(
  787. {2, 3, 4, 5, 6},
  788. {4 * 3 * 4 * 5 * 6, 4 * 4 * 5 * 6, 2 * 5 * 6, 6, 1},
  789. dtype::Int8()),
  790. TensorLayout(
  791. {2, 3, 4, 5, 6}, {4 * 3 * 4 * 5 * 6, 4 * 5 * 6, 5 * 6, 6, 1},
  792. dtype::Int8()));
  793. }
  794. Checker<Relayout> checker(handle_cuda());
  795. for (auto&& arg : args) {
  796. checker.exec(TensorLayoutArray{arg.src, arg.dst});
  797. }
  798. }
  799. TEST_F(CUDA, RELAYOUT_TEST) {
  800. struct Arg {
  801. TensorLayout src, dst;
  802. Arg(TensorLayout src, TensorLayout dst) : src(src), dst(dst) {}
  803. };
  804. std::vector<Arg> args;
  805. //! dst contig
  806. args.emplace_back(
  807. TensorLayout({5, 32, 9}, {288, 1, 32}, dtype::Int8()),
  808. TensorLayout({5, 9, 32}, {288, 32, 1}, dtype::Int8()));
  809. args.emplace_back(
  810. TensorLayout({5, 9, 32}, {288, 1, 9}, dtype::Int8()),
  811. TensorLayout({5, 32, 9}, {288, 9, 1}, dtype::Int8()));
  812. args.emplace_back(
  813. TensorLayout({5, 4, 9}, {36, 1, 4}, dtype::Int8()),
  814. TensorLayout({5, 9, 4}, {36, 4, 1}, dtype::Int8()));
  815. args.emplace_back(
  816. TensorLayout({5, 9, 4}, {36, 1, 9}, dtype::Int8()),
  817. TensorLayout({5, 4, 9}, {36, 9, 1}, dtype::Int8()));
  818. args.emplace_back(
  819. TensorLayout({5, 32, 4}, {128, 1, 32}, dtype::Int8()),
  820. TensorLayout({5, 4, 32}, {128, 32, 1}, dtype::Int8()));
  821. args.emplace_back(
  822. TensorLayout({5, 4, 32}, {128, 1, 4}, dtype::Int8()),
  823. TensorLayout({5, 32, 4}, {128, 4, 1}, dtype::Int8()));
  824. args.emplace_back(
  825. TensorLayout({5, 7, 5}, {35, 1, 7}, dtype::Int8()),
  826. TensorLayout({5, 5, 7}, {35, 7, 1}, dtype::Int8()));
  827. args.emplace_back(
  828. TensorLayout({5, 5, 7}, {35, 1, 5}, dtype::Int8()),
  829. TensorLayout({5, 7, 5}, {35, 5, 1}, dtype::Int8()));
  830. //! src contig
  831. args.emplace_back(
  832. TensorLayout({5, 9, 32}, {288, 32, 1}, dtype::Int8()),
  833. TensorLayout({5, 32, 9}, {288, 1, 32}, dtype::Int8()));
  834. args.emplace_back(
  835. TensorLayout({5, 32, 9}, {288, 9, 1}, dtype::Int8()),
  836. TensorLayout({5, 9, 32}, {288, 1, 9}, dtype::Int8()));
  837. args.emplace_back(
  838. TensorLayout({5, 9, 4}, {36, 4, 1}, dtype::Int8()),
  839. TensorLayout({5, 4, 9}, {36, 1, 4}, dtype::Int8()));
  840. args.emplace_back(
  841. TensorLayout({5, 4, 9}, {36, 9, 1}, dtype::Int8()),
  842. TensorLayout({5, 9, 4}, {36, 1, 9}, dtype::Int8()));
  843. args.emplace_back(
  844. TensorLayout({5, 4, 32}, {128, 32, 1}, dtype::Int8()),
  845. TensorLayout({5, 32, 4}, {128, 1, 32}, dtype::Int8()));
  846. args.emplace_back(
  847. TensorLayout({5, 32, 4}, {128, 4, 1}, dtype::Int8()),
  848. TensorLayout({5, 4, 32}, {128, 1, 4}, dtype::Int8()));
  849. args.emplace_back(
  850. TensorLayout({5, 5, 7}, {35, 7, 1}, dtype::Int8()),
  851. TensorLayout({5, 7, 5}, {35, 1, 7}, dtype::Int8()));
  852. args.emplace_back(
  853. TensorLayout({5, 7, 5}, {35, 5, 1}, dtype::Int8()),
  854. TensorLayout({5, 5, 7}, {35, 1, 5}, dtype::Int8()));
  855. //! cross
  856. args.emplace_back(
  857. TensorLayout({5, 9, 32}, {288 * 4, 32 * 3, 1}, dtype::Int8()),
  858. TensorLayout({5, 32, 9}, {288 * 4, 1, 32 * 3}, dtype::Int8()));
  859. args.emplace_back(
  860. TensorLayout({5, 32, 9}, {288 * 3, 9 * 2, 1}, dtype::Int8()),
  861. TensorLayout({5, 9, 32}, {288 * 3, 1, 9 * 2}, dtype::Int8()));
  862. args.emplace_back(
  863. TensorLayout({5, 9, 4}, {36 * 10, 4 * 7, 1}, dtype::Int8()),
  864. TensorLayout({5, 4, 9}, {36 * 10, 1, 4 * 7}, dtype::Int8()));
  865. Checker<Relayout> checker(handle_cuda());
  866. for (auto&& arg : args) {
  867. checker.exec(TensorLayoutArray{arg.src, arg.dst});
  868. }
  869. }
  870. TEST_F(CUDA, RELAYOUT_Q4) {
  871. Checker<Relayout> checker(handle_cuda());
  872. UniformIntRNG rng_int4{-7, 7};
  873. checker.set_rng(0, &rng_int4)
  874. .set_rng(1, &rng_int4)
  875. .set_dtype(0, dtype::QuantizedS4(1.f))
  876. .set_dtype(1, dtype::QuantizedS4(1.f))
  877. .execs({{2, 2, 1, 1}, {1, 1, 2, 2}})
  878. .execs({{1, 64, 15, 15}, {1, 15, 15, 64}})
  879. .execs({{1, 5, 9, 32}, {1, 5, 32, 9}})
  880. .execl(TensorLayoutArray{
  881. {{6400}, {1}, dtype::QuantizedS4{1.f}},
  882. {{20, 320}, {1024, 1}, dtype::QuantizedS4{1.f}}})
  883. .execl(TensorLayoutArray{
  884. {{1200, 3}, {4, 1}, dtype::QuantizedS4{1.f}},
  885. {{20, 60, 3}, {256, 4, 1}, dtype::QuantizedS4{1.f}}})
  886. .execl(TensorLayoutArray{
  887. {{20, 20, 3, 3}, {256, 12, 4, 1}, dtype::QuantizedS4{1.f}},
  888. {{1200, 3}, {4, 1}, dtype::QuantizedS4{1.f}}})
  889. .execl(TensorLayoutArray{
  890. {{5, 16, 7, 7, 4}, {3136, 196, 28, 4, 1}, dtype::QuantizedS4{1.f}},
  891. {{5, 16, 7, 7, 4}, {3136, 4, 448, 64, 1}, dtype::QuantizedS4{1.f}}})
  892. .execl(TensorLayoutArray{
  893. {{5, 7, 7, 16, 4}, {3136, 448, 64, 4, 1}, dtype::QuantizedS4{1.f}},
  894. {{5, 7, 7, 16, 4}, {3136, 28, 4, 196, 1}, dtype::QuantizedS4{1.f}}})
  895. .execl(TensorLayoutArray{
  896. {{5, 2, 7, 7, 32},
  897. {3136, 1568, 224, 32, 1},
  898. dtype::QuantizedS4{1.f}},
  899. {{5, 2, 7, 7, 32},
  900. {3136, 32, 448, 64, 1},
  901. dtype::QuantizedS4{1.f}}})
  902. .execl(TensorLayoutArray{
  903. {{5, 7, 7, 2, 32}, {3136, 448, 64, 32, 1}, dtype::QuantizedS4{1.f}},
  904. {{5, 7, 7, 2, 32},
  905. {3136, 224, 32, 1568, 1},
  906. dtype::QuantizedS4{1.f}}});
  907. }
  908. // vim: syntax=cpp.doxygen