OpenI
/
MegEngine

/**
 * \file dnn/test/armv7/convolution.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#include "test/armv7/fixture.h"

#include "test/common/convolution.h"
#include "test/common/checker.h"
#include "test/common/benchmarker.h"

#include "test/common/rng.h"

using namespace megdnn;
using namespace test;

#if MEGDNN_WITH_BENCHMARK
TEST_F(ARMV7, BENCHMARK_CONVOLUTION_STRIDE2)
{
    using Param = param::Convolution;
    auto run = [&](const TensorShapeArray& shapes, Param param) {
        Benchmarker<Convolution> benchmarker_float(handle());
        size_t RUN = 100;
        auto tfloat = benchmarker_float.set_display(false)
                              .set_times(RUN)
                              .set_param(param)
                              .exec(shapes);
        size_t IC = shapes[1][1];
        size_t FH = shapes[1][2];
        size_t FW = shapes[1][3];
        TensorLayout dst_layout;
        auto opr = handle()->create_operator<Convolution>();
        opr->param() = param;
        opr->deduce_layout({shapes[0], dtype::Float32()},
                           {shapes[1], dtype::Float32()}, dst_layout);
        printf("flops: %.3f mflops\n",
               (IC * dst_layout.total_nr_elems() * FH * FW * 2) /
                       (tfloat / RUN * 1000));
    };

    auto profile = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
                       size_t stride) {
        Param param;
        param.stride_h = stride;
        param.stride_w = stride;
        param.pad_h = kernel / 2;
        param.pad_w = kernel / 2;
        printf("oc: %zd ic: %zd w: %zd h: %zd stride: %zd kernel_size: %zd\n",
               oc, ic, w, h, stride, kernel);

        run({{1, ic, h, w}, {oc, ic, kernel, kernel}, {}},
            param);

    };

    for (size_t kernel : {2, 3, 5, 7}) {
        for (size_t ic : {3, 6, 12, 24}) {
            for (size_t oc : {3, 6, 12, 24}) {
                for (size_t size : {4, 7, 8, 14, 16, 17, 28, 32, 34, 64, 112}) {
                    profile(oc, ic, size, size, kernel, 2);
                }
            }
        }
    }
}
#endif

TEST_F(ARMV7, BENCHMARK_CONVOLUTION_1X1)
{
    int exec_times = 50;
    Benchmarker<MatrixMul> benchmarker_gemm(handle());
    benchmarker_gemm.set_times(exec_times);

    Benchmarker<Convolution> benchmarker(handle());
    benchmarker.set_times(exec_times);

    float mod = 1000 * exec_times  / 1e9;
    auto run = [&](size_t IC, size_t OC, size_t H, size_t W) {
        float time = 1.f, perf = 1.f;

        std::cout<<std::endl;
        std::cout<< "CONV: IC " << IC << ", OC " << OC <<
                    ", H " << H << ", W " << W <<std::endl;
        time = benchmarker.exec({{1, IC, H, W}, {OC, IC, 1, 1}, {1, OC, H, W}});
        perf = OC * (2 * H * W - 1) * IC / time * mod;
        std::cout<<"Performance is " << perf <<" Gflops" <<std::endl;

        std::cout<<"GEMM: (" << OC <<", "<< H*W << ", " <<IC <<")"<<std::endl;
        //time = benchmarker_gemm.exec({{OC, H*W}, {H*W, IC}, {}});
        //perf = OC * (2 * H * W - 1) * IC / time * mod;
        time = benchmarker_gemm.exec({{OC, IC}, {IC, H*W}, {}});
        perf = OC * (2 * IC -1) * H * W / time * mod;
        std::cout<<"Performance is " << perf <<" Gflops" <<std::endl;

    };

    //run(32, 32, 64, 64);
    //run(8, 8, 32, 32);
    //run(32, 32, 128, 128);
    //run(32, 32, 512, 512);
    //run(10,10,2,5);
    //run(100,100,2,50);

    run(16,4,240,135);
    run(8,32,120,67);
    run(16,64,60,33);

    run(1,1,28,28);
    run(8,1,28,28);
    run(2,2,28,28);
    run(8,2,28,28);
    run(4,4,28,28);
    run(16,4,28,28);
}

TEST_F(ARMV7, BENCHMARK_GROUP_CONVOLUTION_1X1) {
    int exec_times = 50;
    Benchmarker<Convolution> benchmarker_gconv1x1(handle());
    benchmarker_gconv1x1.set_times(exec_times);

    float mod = 1000 * exec_times / 1e9;
    auto run = [&](size_t IC, size_t OC,  size_t H, size_t W, size_t group){
        float time = 1.f, perf = 1.f;

        std::cout<<std::endl;
        std::cout<< "GCONV: IC " << IC << ", OC " << OC <<
                ", H " << H << ", W " << W <<", GROUP "<<group << std::endl;

        auto ICg = IC / group;
        auto OCg = OC / group;
        param::Convolution param;
        param.sparse = param::Convolution::Sparse::GROUP;
        time = benchmarker_gconv1x1.set_param(param).exec({{1, IC, H, W},
                {group, OCg, ICg, 1, 1},{}});
        perf = group * OCg * ICg * H * W / time * mod;
        std::cout<<"Performance is " << perf <<" Gflops" <<std::endl;
    };
    run(8*4, 1*4, 28, 28, 4);
    run(2*4, 2*4, 28, 28, 4);
    run(8*4, 2*4, 28, 28, 4);
    run(4*4, 4*4, 28, 28, 4);
    run(16*4, 4*4, 28, 28, 4);

}


// vim: syntax=cpp.doxygen