- #include "test/aarch64/fixture.h"
-
- #include "test/common/benchmarker.h"
- #include "test/common/checker.h"
-
- #include "test/common/matrix_mul.h"
- #include "test/common/rng.h"
-
- #include "test/arm_common/cpuinfo_help.h"
- using namespace megdnn;
- using namespace test;
-
- TEST_F(AARCH64, MATRIX_MUL_FP32K8X12) {
- matrix_mul::check_matrix_mul(
- dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, handle(),
- "AARCH64_F32K8X12X1");
- }
- #if MGB_ENABLE_CPUINFO
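- //! CpuInfoTmpReplace temporarily overrides the uarch reported by cpuinfo so
- //! that the Cortex-A53/A55 specific code paths of the kernel are selected
- //! even when the test runs on a different core.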
- TEST_F(AARCH64, MATRIX_MUL_FP32K8X12_A53) {
- CpuInfoTmpReplace cpu_replace_guard(cpuinfo_uarch_cortex_a53);
- matrix_mul::check_matrix_mul(
- dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, handle(),
- "AARCH64_F32K8X12X1");
- }
- TEST_F(AARCH64, MATRIX_MUL_FP32K8X12_A55) {
- CpuInfoTmpReplace cpu_replace_guard(cpuinfo_uarch_cortex_a55);
- matrix_mul::check_matrix_mul(
- dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, handle(),
- "AARCH64_F32K8X12X1");
- }
- #endif
-
- TEST_F(AARCH64, MATRIX_MUL_FP32K4X16) {
- matrix_mul::check_matrix_mul(
- dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, handle(),
- "AARCH64_F32K4X16X1");
- }
-
- TEST_F(AARCH64, MATRIX_MUL_FP32_PACK_MK4) {
- matrix_mul::check_matrix_mul(
- dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, handle(),
- "AARCH64_F32_MK4_K8X12X1", param::MatrixMul::Format::MK4, 1);
- }
- #if MGB_ENABLE_CPUINFO
- TEST_F(AARCH64, MATRIX_MUL_FP32_PACK_MK4_A53) {
- CpuInfoTmpReplace cpu_replace_guard(cpuinfo_uarch_cortex_a53);
- matrix_mul::check_matrix_mul(
- dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, handle(),
- "AARCH64_F32_MK4_K8X12X1", param::MatrixMul::Format::MK4, 1);
- }
- TEST_F(AARCH64, MATRIX_MUL_FP32_PACK_MK4_A55) {
- CpuInfoTmpReplace cpu_replace_guard(cpuinfo_uarch_cortex_a55);
- matrix_mul::check_matrix_mul(
- dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, handle(),
- "AARCH64_F32_MK4_K8X12X1", param::MatrixMul::Format::MK4, 1);
- }
- #endif
-
- TEST_F(AARCH64, MATRIX_MUL_FP32_MK4) {
- matrix_mul::check_matrix_mul(
- dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, handle(),
- "AARCH64_F32_MK4_4x16", param::MatrixMul::Format::MK4, 1);
- }
-
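- //! __ARM_FEATURE_FP16_VECTOR_ARITHMETIC is the ACLE macro the compiler
- //! defines when the target supports fp16 vector arithmetic (e.g.
- //! armv8.2-a+fp16), so the fp16 kernels below are only built for such targets.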
- #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- TEST_F(AARCH64, MATRIX_MUL_F16_K8X24X1) {
- matrix_mul::check_matrix_mul(
- dtype::Float16{}, dtype::Float16{}, dtype::Float16{}, handle(),
- "AARCH64_F16_K8X24X1");
- }
-
- TEST_F(AARCH64, MATRIX_MUL_F16_MK8) {
- matrix_mul::check_matrix_mul(
- dtype::Float16{}, dtype::Float16{}, dtype::Float16{}, handle(),
- "AARCH64_F16_MK8_8X8", param::MatrixMul::Format::MK8, 1);
- }
-
- TEST_F(AARCH64, MATRIX_MUL_F16_MK8_16x12x1) {
- matrix_mul::check_matrix_mul(
- dtype::Float16{}, dtype::Float16{}, dtype::Float16{}, handle(),
- "AARCH64_F16_MK8_16X12X1", param::MatrixMul::Format::MK8, 1);
- }
- #endif
-
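- //! the kernels below rely on the Armv8.2 dot-product (sdot/udot) extension
- //! and are only built and tested when MGB_ENABLE_DOT is set.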
- #if MGB_ENABLE_DOT
- TEST_F(AARCH64, MATRIX_MUL_INT8X8X32_K8X12X4_DOTPROD) {
- matrix_mul::check_matrix_mul(
- dtype::Int8{}, dtype::Int8{}, dtype::Int32{}, handle(),
- "AARCH64_INT8X8X32_K8X12X4_DOTPROD");
- }
-
- TEST_F(AARCH64, MATRIX_MUL_INT8X8X32_MK4_8X12X4_DOTPROD) {
- std::vector<matrix_mul::TestArg> args;
- for (size_t m : {1, 2, 3, 4, 5, 6, 7, 10, 11})
- for (size_t n : {2, 3, 4, 5, 8, 12, 13, 14, 15, 16, 31})
- for (size_t k : {1, 2, 3, 4, 5, 6, 7, 8, 16, 32, 33, 34})
- args.emplace_back(m, n, k, 0);
- matrix_mul::check_matrix_mul(
- dtype::Int8{}, dtype::Int8{}, dtype::Int32{}, handle(),
- "AARCH64_INT8X8X32_MK4_8X12X4_DOTPROD", param::MatrixMul::Format::MK4_DOT,
- 1, 1e-3, std::move(args));
- }
- #else
- TEST_F(AARCH64, MATRIX_MUL_INT8X8X32_K4X4X16) {
- matrix_mul::check_matrix_mul(
- dtype::Int8{}, dtype::Int8{}, dtype::Int32{}, handle(),
- "AARCH64_INT8X8X32_K4X4X16");
- }
-
- TEST_F(AARCH64, MATRIX_MUL_INT8_MK4) {
- std::vector<matrix_mul::TestArg> args;
- for (size_t m : {1, 2, 3, 4, 5, 7, 10, 11})
- for (size_t n : {1, 2, 3, 4, 5, 8, 16, 24, 25, 32})
- for (size_t k : {1, 2, 3, 4, 5, 6, 7, 8, 16, 32, 33, 34})
- args.emplace_back(m, n, k, 0);
- matrix_mul::check_matrix_mul(
- dtype::Int8{}, dtype::Int8{}, dtype::Int32{}, handle(),
- "AARCH64_INT8X8X32_MK4_4X4X16", param::MatrixMul::Format::MK4, 1, 1e-3,
- std::move(args));
- }
-
- TEST_F(AARCH64, MATRIX_MUL_INT8x8x16_MK4) {
- std::vector<matrix_mul::TestArg> args;
- for (size_t m : {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17})
- for (size_t n : {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 24})
- for (size_t k : {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29})
- args.emplace_back(m, n, k, 0);
- matrix_mul::check_matrix_mul(
- dtype::Int8{}, dtype::Int8{}, dtype::Int16{}, handle(),
- "AARCH64_INT8X8X16_MK4_K8X8X8", param::MatrixMul::Format::MK4, 1, 1e-3,
- std::move(args));
- }
- TEST_F(AARCH64, MATRIX_MUL_MK4_8x8x16_4x4) {
- matrix_mul::check_matrix_mul(
- dtype::Int8{}, dtype::Int8{}, dtype::Int16{}, handle(),
- "AARCH64_INT8X8X16_MK4_4X4X8", param::MatrixMul::Format::MK4, 1);
- }
-
- TEST_F(AARCH64, MATRIX_MUL_MK4_8x8x16) {
- matrix_mul::check_matrix_mul(
- dtype::Int8{}, dtype::Int8{}, dtype::Int16{}, handle(),
- "AARCH64_INT8X8X16_MK4_16X12X4", param::MatrixMul::Format::MK4, 1);
- }
-
- TEST_F(AARCH64, MATRIX_MUL_INT8x8x32_K8x8x8) {
- matrix_mul::check_matrix_mul(
- dtype::Int8{}, dtype::Int8{}, dtype::Int32{}, handle(),
- "AARCH64_INT8X8X32_K8X8X8");
- }
- #endif
-
- TEST_F(AARCH64, MATRIX_MUL_INT8x8x16_K8x8x8) {
- matrix_mul::check_matrix_mul(
- dtype::Int8{}, dtype::Int8{}, dtype::Int16{}, handle(),
- "AARCH64_INT8X8X16_K8X8X8");
- }
-
- TEST_F(AARCH64, MATRIX_MUL_INT8x8x16_K4x4x16) {
- matrix_mul::check_matrix_mul(
- dtype::Int8{}, dtype::Int8{}, dtype::Int16{}, handle(),
- "AARCH64_INT8X8X16_K4X4X16");
- }
-
- TEST_F(AARCH64, MATRIX_MUL_INT4x4x16_K8x8x8_QUANTIZEDS4) {
- param::MatrixMul param;
- param.transposeA = false;
- param.transposeB = false;
-
- Checker<MatrixMul> checker(handle());
- checker.set_dtype(0, dtype::QuantizedS4{0.6})
- .set_dtype(1, dtype::QuantizedS4{0.5})
- .set_dtype(2, dtype::QuantizedS16{0.6 * 0.5})
- .set_param(param);
- checker.set_before_exec_callback(
- AlgoChecker<MatrixMul>("AARCH64_INT4X4X16_K8X8X8"));
-
- auto run = [&](size_t M, size_t N, size_t K) {
- printf("M N K %zu %zu %zu \n", M, N, K);
- TensorShape A, B;
- if (param.transposeA) {
- A = TensorShape{K, M};
- } else {
- A = TensorShape{M, K};
- }
- if (param.transposeB) {
- B = TensorShape{N, K};
- } else {
- B = TensorShape{K, N};
- }
- checker.exec({A, B, {}});
- };
-
- for (size_t m : {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 16, 20})
- for (size_t n : {2, 4, 6, 8, 10, 12, 14, 16, 24})
- for (size_t k : {2, 4, 6, 8, 10, 12, 14, 16, 32})
- run(m, n, k);
-
- for (size_t k = 4; k <= 256; k *= 8) {
- for (size_t m = 4; m <= 256; m *= 4) {
- for (size_t n = 4; n <= 256; n *= 4) {
- run(m, n, k);
- }
- }
- }
- param.transposeA = true;
- run(8, 8, 8);
- run(16, 8, 16);
- param.transposeB = true;
- run(8, 8, 8);
- run(16, 16, 16);
- }
-
- TEST_F(AARCH64, MATRIX_MUL_INT16x16x32_K12X8X1) {
- matrix_mul::check_matrix_mul(
- dtype::Int16{}, dtype::Int16{}, dtype::Int32{}, handle(),
- "AARCH64_INT16X16X32_K12X8X1");
- }
-
- TEST_F(AARCH64, MATRIX_MUL_INT16x16x32_MK8) {
- matrix_mul::check_matrix_mul(
- dtype::Int16{}, dtype::Int16{}, dtype::Int32{}, handle(),
- "AARCH64_INT16X16X32_MK8_8X8", param::MatrixMul::Format::MK8, 1);
- }
-
- //! FIXME: need to add tests of GEMV and QUINT8
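- //! A hedged sketch of that missing coverage, reusing only helpers already
- //! present in this file. The algorithm name "AARCH64_QUINT8_K8X8X8" is an
- //! assumption and should be checked against the algorithms the backend
- //! actually registers; the GEMV case deliberately pins no algorithm and only
- //! validates the N == 1 path the default heuristic dispatches to.
- TEST_F(AARCH64, MATRIX_MUL_QUINT8_SKETCH) {
-     //! assumed algo name, see the note above
-     matrix_mul::check_matrix_mul(
-             dtype::Quantized8Asymm(1.2f, (uint8_t)125),
-             dtype::Quantized8Asymm(1.3f, (uint8_t)129), {}, handle(),
-             "AARCH64_QUINT8_K8X8X8");
- }
-
- TEST_F(AARCH64, MATRIX_MUL_FP32_GEMV_SKETCH) {
-     Checker<MatrixMul> checker(handle());
-     checker.set_dtype(0, dtype::Float32{})
-             .set_dtype(1, dtype::Float32{})
-             .set_dtype(2, dtype::Float32{});
-     //! N == 1 shapes exercise the GEMV path picked by the default heuristic
-     for (size_t m : {1, 2, 7, 8, 64, 256})
-         for (size_t k : {1, 8, 33, 256})
-             checker.exec({{m, k}, {k, 1}, {}});
- }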
-
- #if MEGDNN_WITH_BENCHMARK
-
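- //! benchmark convention used below: every shape is executed RUNS times and
- //! the reported time is the per-run average in milliseconds; with
- //! computations = 2 * M * N * K * 1e-6, the quotient computations / time_ms
- //! is the achieved GFLOPS.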
- TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_FP32_K4X16) {
- constexpr size_t RUNS = 50;
- param::MatrixMul param;
- param.transposeA = false;
- param.transposeB = false;
- Benchmarker<MatrixMul> benchmarker_K4X16(handle());
- Benchmarker<MatrixMul> benchmarker_K12X8(handle());
- benchmarker_K4X16.set_times(RUNS)
- .set_dtype(0, dtype::Float32{})
- .set_dtype(1, dtype::Float32{})
- .set_dtype(2, dtype::Float32{})
- .set_param(param)
- .set_display(false);
- benchmarker_K4X16.set_before_exec_callback(
- AlgoChecker<MatrixMul>("AARCH64_F32K4X16X1"));
-
- benchmarker_K12X8.set_before_exec_callback(
- AlgoChecker<MatrixMul>("AARCH64_F32K8X12X1"));
- benchmarker_K12X8.set_times(RUNS)
- .set_dtype(0, dtype::Float32{})
- .set_dtype(1, dtype::Float32{})
- .set_dtype(2, dtype::Float32{})
- .set_param(param)
- .set_display(false);
-
- auto run = [&](size_t M, size_t N, size_t K) {
- TensorShape A, B;
- if (param.transposeA) {
- A = TensorShape{K, M};
- } else {
- A = TensorShape{M, K};
- }
- if (param.transposeB) {
- B = TensorShape{N, K};
- } else {
- B = TensorShape{K, N};
- }
-
- auto k4x16_used = benchmarker_K4X16.exec({A, B, {}}) / RUNS;
- auto k12x8_used = benchmarker_K12X8.exec({A, B, {}}) / RUNS;
- float computations = 2.f * M * K * N * 1e-6;
- printf("run: {%zu{M} %zu{K} %zu{N}} k4x16: %f ms %f Gflops k12x8: %f "
- "ms "
- "%f Gflops k4x16_vs_k12x8: %f\n",
- M, K, N, k4x16_used, computations / k4x16_used, k12x8_used,
- computations / k12x8_used, k12x8_used / k4x16_used);
- };
-
- run(256, 256, 128);
- run(384, 384, 384);
-
- for (size_t k = 4; k <= 256; k *= 8) {
- for (size_t m = 4; m <= 256; m *= 4) {
- for (size_t n = 4; n <= 256; n *= 4) {
- run(m, n, k);
- }
- printf("\n");
- }
- printf("\n");
- }
- }
-
- TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_INT16_8X8X8) {
- constexpr size_t RUNS = 50;
- param::MatrixMul param;
- param.transposeA = false;
- param.transposeB = false;
- Benchmarker<MatrixMul> benchmarker_int(handle());
- Benchmarker<MatrixMul> benchmarker_int32(handle());
- benchmarker_int.set_times(RUNS)
- .set_dtype(0, dtype::Int8{})
- .set_dtype(1, dtype::Int8{})
- .set_dtype(2, dtype::Int16{})
- .set_param(param)
- .set_display(false);
- benchmarker_int.set_before_exec_callback(
- AlgoChecker<MatrixMul>("AARCH64_INT8X8X16_K8X8X8"));
-
- benchmarker_int32.set_before_exec_callback(
- AlgoChecker<MatrixMul>("AARCH64_INT8X8X32_K8X8X8"));
- benchmarker_int32.set_times(RUNS)
- .set_dtype(0, dtype::Int8{})
- .set_dtype(1, dtype::Int8{})
- .set_dtype(2, dtype::Int32{})
- .set_param(param)
- .set_display(false);
- Benchmarker<MatrixMul> benchmarker_float(handle());
- benchmarker_float.set_param(param).set_display(false).set_times(RUNS);
-
- auto run = [&](size_t M, size_t N, size_t K) {
- TensorShape A, B;
- if (param.transposeA) {
- A = TensorShape{K, M};
- } else {
- A = TensorShape{M, K};
- }
- if (param.transposeB) {
- B = TensorShape{N, K};
- } else {
- B = TensorShape{K, N};
- }
-
- auto int_used = benchmarker_int.exec({A, B, {}}) / RUNS;
- auto float_used = benchmarker_float.exec({A, B, {}}) / RUNS;
- auto int32_used = benchmarker_int32.exec({A, B, {}}) / RUNS;
- float computations = 2.f * M * K * N * 1e-6;
- printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops int: %f ms "
- "%f Gflops speedup_vs_fp32: %f, speedup_vs_int32: %f\n",
- M, K, N, float_used, computations / float_used, int_used,
- computations / int_used, float_used / int_used, int32_used / int_used);
- };
-
- run(256, 256, 256);
-
- for (size_t k = 4; k <= 256; k *= 8) {
- for (size_t m = 4; m <= 256; m *= 4) {
- for (size_t n = 4; n <= 256; n *= 4) {
- run(m, n, k);
- }
- std::cout << std::endl;
- }
- std::cout << std::endl;
- }
- }
-
- TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_INT32_MK_4X4X16) {
- constexpr size_t RUNS = 50;
- param::MatrixMul param;
- param.transposeA = false;
- param.transposeB = false;
- Benchmarker<MatrixMul> benchmarker(handle());
- Benchmarker<MatrixMul> benchmarker_mk4(handle());
- benchmarker.set_times(RUNS)
- .set_dtype(0, dtype::Int8{})
- .set_dtype(1, dtype::Int8{})
- .set_dtype(2, dtype::Int32{})
- .set_param(param)
- .set_display(false);
- benchmarker.set_before_exec_callback(
- AlgoChecker<MatrixMul>("AARCH64_INT8X8X32_K4X4X16"));
-
- param.format = MatrixMul::Param::Format::MK4;
- benchmarker_mk4.set_before_exec_callback(
- AlgoChecker<MatrixMul>("AARCH64_INT8X8X32_MK4_4X4X16"));
- benchmarker_mk4.set_times(RUNS)
- .set_dtype(0, dtype::Int8{})
- .set_dtype(1, dtype::Int8{})
- .set_dtype(2, dtype::Int32{})
- .set_param(param)
- .set_display(false);
-
- auto run = [&](size_t M, size_t N, size_t K) {
- auto default_used = benchmarker.exec({{M, K}, {K, N}, {}}) / RUNS;
- auto mk_used =
- benchmarker_mk4.exec({{M / 4, K / 4, 4, 4}, {K / 4, N, 4}, {}}) / RUNS;
- float computations = 2.f * M * K * N * 1e-6;
- printf("run: {%zu{M} %zu{K} %zu{N}} normal: %f ms %f Gflops mk4: %f ms "
- "%f Gflops speedup_vs_normal: %f\n",
- M, K, N, default_used, computations / default_used, mk_used,
- computations / mk_used, default_used / mk_used);
- };
-
- run(256, 256, 128);
- for (size_t k = 4; k <= 512; k *= 2) {
- for (size_t m = 4; m <= 512; m *= 2) {
- for (size_t n = 4; n <= 512; n *= 2) {
- run(m, n, k);
- }
- }
- std::cout << std::endl;
- }
- }
-
- TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_MK4_8x8x16) {
- constexpr size_t RUNS = 50;
- param::MatrixMul param;
- param.transposeA = false;
- param.transposeB = false;
- Benchmarker<MatrixMul> benchmarker(handle());
- Benchmarker<MatrixMul> benchmarker_mk4(handle());
- Benchmarker<MatrixMul> benchmarker_mk4_16x12(handle());
- benchmarker.set_times(RUNS)
- .set_dtype(0, dtype::Int8{})
- .set_dtype(1, dtype::Int8{})
- .set_dtype(2, dtype::Int16{})
- .set_param(param)
- .set_display(false);
- benchmarker.set_before_exec_callback(
- AlgoChecker<MatrixMul>("AARCH64_INT8X8X16_K4X4X16"));
-
- param.format = MatrixMul::Param::Format::MK4;
- benchmarker_mk4.set_before_exec_callback(
- AlgoChecker<MatrixMul>("AARCH64_INT8X8X16_MK4_4X4X8"));
- benchmarker_mk4.set_times(RUNS)
- .set_dtype(0, dtype::Int8{})
- .set_dtype(1, dtype::Int8{})
- .set_dtype(2, dtype::Int16{})
- .set_param(param)
- .set_display(false);
-
- benchmarker_mk4_16x12.set_before_exec_callback(
- AlgoChecker<MatrixMul>("AARCH64_INT8X8X16_MK4_16X12X4"));
- benchmarker_mk4_16x12.set_times(RUNS)
- .set_dtype(0, dtype::Int8{})
- .set_dtype(1, dtype::Int8{})
- .set_dtype(2, dtype::Int16{})
- .set_param(param)
- .set_display(false);
-
- auto run = [&](size_t M, size_t N, size_t K) {
- auto default_used = benchmarker.exec({{M, K}, {K, N}, {}}) / RUNS;
- auto mk_used =
- benchmarker_mk4.exec({{M / 4, K / 4, 4, 4}, {K / 4, N, 4}, {}}) / RUNS;
- auto mk4_16x12_used =
- benchmarker_mk4_16x12.exec({{M / 4, K / 4, 4, 4}, {K / 4, N, 4}, {}}) /
- RUNS;
- float computations = 2.f * M * K * N * 1e-6;
- printf("run: {%zu{M} %zu{K} %zu{N}} normal: %f ms %f Gflops mk4: %f ms "
- "%f Gflops speedup: %f, mk4_16x12 %f Gflops speedup: %f\n",
- M, K, N, default_used, computations / default_used, mk_used,
- computations / mk_used, default_used / mk_used,
- computations / mk4_16x12_used, default_used / mk4_16x12_used);
- };
-
- run(384, 384, 384);
- }
-
- TEST_F(AARCH64, BENCHMARK_4x4x16_vs_8x8x16) {
- constexpr size_t RUNS = 50;
- param::MatrixMul param;
- param.transposeA = false;
- param.transposeB = false;
- Benchmarker<MatrixMul> benchmarker(handle());
- Benchmarker<MatrixMul> benchmarker_int4_4x4x16(handle());
- benchmarker_int4_4x4x16.set_times(RUNS)
- .set_dtype(0, dtype::QuantizedS4{0.3})
- .set_dtype(1, dtype::QuantizedS4{0.3})
- .set_dtype(2, dtype::QuantizedS16{0.09})
- .set_param(param)
- .set_display(false);
- benchmarker.set_times(RUNS)
- .set_dtype(0, dtype::Int8{})
- .set_dtype(1, dtype::Int8{})
- .set_dtype(2, dtype::Int16{})
- .set_param(param)
- .set_display(false);
- benchmarker.set_before_exec_callback(
- AlgoChecker<MatrixMul>("AARCH64_INT8X8X16_K4X4X16"));
-
- auto run = [&](size_t M, size_t N, size_t K) {
- auto default_used = benchmarker.exec({{M, K}, {K, N}, {}}) / RUNS;
- auto int4416_used = benchmarker_int4_4x4x16.exec({{M, K}, {K, N}, {}}) / RUNS;
- float computations = 2.f * M * K * N * 1e-6;
- printf("run: {%zu{M} %zu{K} %zu{N}} normal 8x8x16 used: %f ms %f "
- "Gflops int4416 used %f int4416_gflops %f speedup %f\n",
- M, K, N, default_used, computations / default_used, int4416_used,
- computations / int4416_used, default_used / int4416_used);
- };
-
- for (int m = 32; m <= 1024; m += 32)
- for (int n = 32; n <= 1024; n += 32)
- for (int k = 32; k <= 512; k += 32)
- run(m, n, k);
-
- run(32, 32, 32);
- run(32, 32, 8);
- run(32, 32, 16);
- run(32, 32, 24);
- run(32 * 2, 32 * 2, 32);
- run(32 * 4, 32 * 4, 32);
- run(32 * 6, 32 * 6, 32);
- run(32 * 8, 32 * 8, 32);
- run(32 * 2, 32 * 2, 32 * 2);
- run(32 * 4, 32 * 4, 32 * 3);
- run(32 * 6, 32 * 6, 32 * 4);
- run(32 * 8, 32 * 8, 32 * 5);
- run(32 * 10, 32 * 10, 32 * 10);
- run(384, 384, 384);
- run(256, 256, 384);
- run(512, 512, 384);
- run(1024, 1024, 384);
- }
-
- TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_MK4_8x8x8_8x8x16_vs_4x4x16_8x8x16) {
- constexpr size_t RUNS = 50;
- param::MatrixMul param;
- param.transposeA = false;
- param.transposeB = false;
- Benchmarker<MatrixMul> benchmarker(handle());
- Benchmarker<MatrixMul> benchmarker_mk4(handle());
- Benchmarker<MatrixMul> benchmarker_mk4_4x4x8(handle());
- benchmarker.set_times(RUNS)
- .set_dtype(0, dtype::Int8{})
- .set_dtype(1, dtype::Int8{})
- .set_dtype(2, dtype::Int16{})
- .set_param(param)
- .set_display(false);
- benchmarker.set_before_exec_callback(
- AlgoChecker<MatrixMul>("AARCH64_INT8X8X16_K4X4X16"));
-
- param.format = MatrixMul::Param::Format::MK4;
- benchmarker_mk4.set_before_exec_callback(
- AlgoChecker<MatrixMul>("AARCH64_INT8X8X16_MK4_K8X8X8"));
- benchmarker_mk4.set_times(RUNS)
- .set_dtype(0, dtype::Int8{})
- .set_dtype(1, dtype::Int8{})
- .set_dtype(2, dtype::Int16{})
- .set_param(param)
- .set_display(false);
-
- benchmarker_mk4_4x4x8.set_before_exec_callback(
- AlgoChecker<MatrixMul>("AARCH64_INT8X8X16_MK4_4X4X8"));
- benchmarker_mk4_4x4x8.set_times(RUNS)
- .set_dtype(0, dtype::Int8{})
- .set_dtype(1, dtype::Int8{})
- .set_dtype(2, dtype::Int16{})
- .set_param(param)
- .set_display(false);
-
- auto run = [&](size_t M, size_t N, size_t K) {
- auto default_used = benchmarker.exec({{M, K}, {K, N}, {}}) / RUNS;
- auto mk_used =
- benchmarker_mk4.exec({{M / 4, K / 4, 4, 4}, {K / 4, N, 4}, {}}) / RUNS;
- auto mk4_4x4x8_used =
- benchmarker_mk4_4x4x8.exec({{M / 4, K / 4, 4, 4}, {K / 4, N, 4}, {}}) /
- RUNS;
- float computations = 2.f * M * K * N * 1e-6;
- printf("run: {%zu{M} %zu{K} %zu{N}} normal: %f ms %f Gflops mk4: %f ms "
- "%f Gflops speedup: %f, mk4_4x4x8 %f Gflops %f ms speedup: %f\n",
- M, K, N, default_used, computations / default_used, mk_used,
- computations / mk_used, default_used / mk_used,
- computations / mk4_4x4x8_used, mk4_4x4x8_used, mk4_4x4x8_used / mk_used);
- };
-
- run(384, 384, 384);
- run(512, 512, 512);
- run(1024, 1024, 384);
- run(256, 256, 384);
- for (int m = 32; m <= 512; m *= 2)
- for (int n = 32; n <= 512; n *= 2)
- for (int k = 32; k < 512; k *= 2) {
- run(m, n, k);
- }
- }
- TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_INT16_4X4X16) {
- constexpr size_t RUNS = 50;
- param::MatrixMul param;
- param.transposeA = false;
- param.transposeB = false;
- Benchmarker<MatrixMul> benchmarker_int(handle());
- Benchmarker<MatrixMul> benchmarker_int32(handle());
- benchmarker_int.set_times(RUNS)
- .set_dtype(0, dtype::Int8{})
- .set_dtype(1, dtype::Int8{})
- .set_dtype(2, dtype::Int16{})
- .set_param(param)
- .set_display(false);
- benchmarker_int.set_before_exec_callback(
- AlgoChecker<MatrixMul>("AARCH64_INT8X8X16_K4X4X16"));
-
- benchmarker_int32.set_before_exec_callback(
- AlgoChecker<MatrixMul>("AARCH64_INT8X8X32_K4X4X16"));
- benchmarker_int32.set_times(RUNS)
- .set_dtype(0, dtype::Int8{})
- .set_dtype(1, dtype::Int8{})
- .set_dtype(2, dtype::Int32{})
- .set_param(param)
- .set_display(false);
- Benchmarker<MatrixMul> benchmarker_float(handle());
- benchmarker_float.set_param(param).set_display(false).set_times(RUNS);
-
- auto run = [&](size_t M, size_t N, size_t K) {
- TensorShape A, B;
- if (param.transposeA) {
- A = TensorShape{K, M};
- } else {
- A = TensorShape{M, K};
- }
- if (param.transposeB) {
- B = TensorShape{N, K};
- } else {
- B = TensorShape{K, N};
- }
-
- auto int_used = benchmarker_int.exec({A, B, {}}) / RUNS;
- auto float_used = benchmarker_float.exec({A, B, {}}) / RUNS;
- auto int32_used = benchmarker_int32.exec({A, B, {}}) / RUNS;
- float computations = 2.f * M * K * N * 1e-6;
- printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops int: %f ms "
- "%f Gflops speedup_vs_fp32: %f, speedup_vs_int32: %f\n",
- M, K, N, float_used, computations / float_used, int_used,
- computations / int_used, float_used / int_used, int32_used / int_used);
- };
-
- run(256, 256, 128);
-
- run(256, 256, 256);
-
- for (size_t k = 4; k <= 256; k *= 4) {
- for (size_t m = 4; m <= 256; m *= 4) {
- for (size_t n = 4; n <= 256; n *= 4) {
- run(m, n, k);
- }
- }
- std::cout << std::endl;
- }
- }
-
- TEST_F(AARCH64, BENCHMARK_GEMV) {
- int exec_times = 10;
- Benchmarker<MatrixMul> benchmarker_gemm(handle());
- benchmarker_gemm.set_times(exec_times);
-
- float mod = 1000 * exec_times / 1e9;
- auto run = [&](size_t M, size_t K, size_t N) {
- float time = 1.f, perf = 1.f;
-
- std::cout << "GEMM: (" << M << ", " << K << ", " << N << ")" << std::endl;
- benchmarker_gemm.set_dtype(0, dtype::Float32()).set_dtype(1, dtype::Float32());
- time = benchmarker_gemm.exec({{M, K}, {K, N}, {}});
- perf = 2.f * M * K * N / time * mod;
- std::cout << "gemm fp32, Performance is " << perf << " Gflops" << std::endl;
-
- #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- benchmarker_gemm.set_dtype(0, dtype::Float16()).set_dtype(1, dtype::Float16());
- time = benchmarker_gemm.exec({{M, K}, {K, N}, {}});
- perf = 2.f * M * K * N / time * mod;
- std::cout << "gemm fp16, Performance is " << perf << " Gflops" << std::endl;
- #endif
- };
-
- std::cout << "warm up:\n";
- for (int i = 0; i < 50; i++) {
- benchmarker_gemm.set_dtype(0, dtype::Float32())
- .set_dtype(1, dtype::Float32())
- .set_display(false)
- .exec({{256, 256}, {256, 256}, {}});
- benchmarker_gemm.set_display(true);
- }
-
- // run gemv
- for (size_t M : {1, 2, 3, 4, 5, 6, 7, 8, 64, 256})
- for (size_t K : {1, 2, 3, 4, 5, 6, 7, 8, 64, 256})
- for (size_t N : {112})
- run(M, K, N);
- }
-
- #if MGB_ENABLE_DOT
- TEST_F(AARCH64, BENCHMARK_TRANSPOSED_MATRIX_MUL_INT_8X8X32) {
- constexpr size_t RUNS = 50;
- param::MatrixMul param;
- param.transposeA = param.transposeB = true;
- Benchmarker<MatrixMul> benchmarker_int(handle());
- benchmarker_int.set_times(RUNS)
- .set_dtype(0, dtype::Int8{})
- .set_dtype(1, dtype::Int8{})
- .set_dtype(2, {})
- .set_param(param)
- .set_display(false);
- Benchmarker<MatrixMul> benchmarker_float(handle());
- benchmarker_float.set_param(param).set_display(false).set_times(RUNS);
-
- auto run = [&](size_t M, size_t N, size_t K) {
- auto int_used = benchmarker_int.exec({{K, M}, {N, K}, {}}) / RUNS;
- auto float_used = benchmarker_float.exec({{K, M}, {N, K}, {}}) / RUNS;
- float computations = 2.f * M * K * N * 1e-6;
- printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops int: %f ms "
- "%f Gflops speedup: %f\n",
- M, K, N, float_used, computations / float_used, int_used,
- computations / int_used, float_used / int_used);
- };
-
- run(256, 12 * 24, 256);
-
- for (size_t M : {8, 64, 112, 256}) {
- for (size_t K : {8, 64, 112, 256}) {
- for (size_t N : {8, 64, 112, 256}) {
- run(M, N, K);
- }
- }
- }
- }
-
- TEST_F(AARCH64, BENCHMARK_GEMV_INT_8X8X32) {
- constexpr size_t RUNS = 50;
- param::MatrixMul param;
- Benchmarker<MatrixMul> benchmarker_int(handle());
- benchmarker_int.set_times(RUNS)
- .set_dtype(0, dtype::Int8{})
- .set_dtype(1, dtype::Int8{})
- .set_dtype(2, {})
- .set_param(param)
- .set_display(false);
- Benchmarker<MatrixMul> benchmarker_float(handle());
- benchmarker_float.set_display(false).set_times(RUNS);
-
- auto run = [&](size_t M, size_t N, size_t K) {
- auto int_used = benchmarker_int.exec({{M, K}, {K, N}, {}}) / RUNS;
- auto float_used = benchmarker_float.exec({{M, K}, {K, N}, {}}) / RUNS;
- float computations = 2.f * M * K * N * 1e-6;
- printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops int: %f ms "
- "%f Gflops speedup: %f\n",
- M, K, N, float_used, computations / float_used, int_used,
- computations / int_used, float_used / int_used);
- };
-
- for (size_t M : {1, 2, 3, 4, 5, 6, 7, 8, 16, 32, 64, 256})
- for (size_t N : {1, 2, 3, 4, 5, 6, 7, 8, 16, 32, 64, 256})
- for (size_t K : {1, 2, 3, 4, 5, 6, 7, 8, 16, 32, 64, 256})
- run(M, N, K);
- }
-
- TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_INT8X8X32_MK4_8X12X4) {
- constexpr size_t RUNS = 50;
- param::MatrixMul param;
- param.transposeA = false;
- param.transposeB = false;
- Benchmarker<MatrixMul> benchmarker(handle());
- Benchmarker<MatrixMul> benchmarker_mk4(handle());
- benchmarker.set_times(RUNS)
- .set_dtype(0, dtype::Int8{})
- .set_dtype(1, dtype::Int8{})
- .set_dtype(2, dtype::Int32{})
- .set_param(param)
- .set_display(false);
- benchmarker.set_before_exec_callback(
- AlgoChecker<MatrixMul>("AARCH64_INT8X8X32_K8X12X4"));
-
- param.format = MatrixMul::Param::Format::MK4_DOT;
- benchmarker_mk4.set_before_exec_callback(
- AlgoChecker<MatrixMul>("AARCH64_INT8X8X32_MK4_8X12X4_DOTPROD"));
- benchmarker_mk4.set_times(RUNS)
- .set_dtype(0, dtype::Int8{})
- .set_dtype(1, dtype::Int8{})
- .set_dtype(2, dtype::Int32{})
- .set_param(param)
- .set_display(false);
-
- auto run = [&](size_t M, size_t N, size_t K) {
- auto default_used = benchmarker.exec({{M, K}, {K, N}, {}}) / RUNS;
- auto mk_used =
- benchmarker_mk4.exec({{M / 4, K / 4, 4, 4}, {K / 4, N, 4}, {}}) / RUNS;
- float computations = 2.f * M * K * N * 1e-6;
- printf("run: {%zu{M} %zu{K} %zu{N}} normal: %f ms %f Gflops mk4: %f ms "
- "%f Gflops speedup_vs_normal: %f\n",
- M, K, N, default_used, computations / default_used, mk_used,
- computations / mk_used, default_used / mk_used);
- };
-
- run(256, 256, 128);
- for (size_t k = 4; k <= 512; k *= 2) {
- for (size_t m = 4; m <= 512; m *= 2) {
- for (size_t n = 4; n <= 512; n *= 2) {
- run(m, n, k);
- }
- }
- std::cout << std::endl;
- }
- }
- #endif // MGB_ENABLE_DOT
-
- #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_F16_MK8) {
- auto args = matrix_mul::get_benchmark_matmul_mk_packed_args(8);
- matrix_mul::benchmark_with_contrast(
- handle(), args, dtype::Float16{}, dtype::Float16{}, dtype::Float16{},
- "AARCH64_F16_MK8_8X8", param::MatrixMul::Format::MK8, dtype::Float16{},
- dtype::Float16{}, dtype::Float16{}, "AARCH64_F16_K8X24X1");
- }
-
- TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_F16_MK8_16x12) {
- auto args = matrix_mul::get_benchmark_matmul_mk_packed_args(8);
- matrix_mul::benchmark_with_contrast(
- handle(), args, dtype::Float16{}, dtype::Float16{}, dtype::Float16{},
- "AARCH64_F16_MK8_16X12X1", param::MatrixMul::Format::MK8, dtype::Float16{},
- dtype::Float16{}, dtype::Float16{}, "AARCH64_F16_K8X24X1");
- }
- #endif
-
- TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_INT16x16x32) {
- constexpr size_t RUNS = 50;
- Benchmarker<MatrixMul> benchmarker_int(handle());
- benchmarker_int.set_times(RUNS)
- .set_dtype(0, dtype::Int16{})
- .set_dtype(1, dtype::Int16{})
- .set_dtype(2, dtype::Int32{})
- .set_display(false);
- Benchmarker<MatrixMul> benchmarker_float(handle());
- benchmarker_float.set_display(false).set_times(RUNS);
-
- auto run = [&](size_t M, size_t N, size_t K, int mask) {
- param::MatrixMul param;
- param.transposeA = mask & 0x1;
- param.transposeB = mask & 0x2;
- benchmarker_int.set_param(param);
- benchmarker_float.set_param(param);
- TensorShape A, B;
- if (param.transposeA) {
- A = TensorShape{K, M};
- } else {
- A = TensorShape{M, K};
- }
- if (param.transposeB) {
- B = TensorShape{N, K};
- } else {
- B = TensorShape{K, N};
- }
- auto int_used = benchmarker_int.exec({A, B, {}}) / RUNS;
- auto float_used = benchmarker_float.exec({A, B, {}}) / RUNS;
- float computations = 2.f * M * K * N * 1e-6;
- printf("run: {%zu{M} %zu{K} %zu{N} %d{TA} %d{TB}} "
- "float: %f ms %f Gflops int: %f ms "
- "%f Gflops speedup: %f\n",
- M, K, N, param.transposeA, param.transposeB, float_used,
- computations / float_used, int_used, computations / int_used,
- float_used / int_used);
- };
-
- constexpr int mask = 4;
- for (auto i = 0; i < mask; i++) {
- for (size_t M : {8, 64, 112, 256}) {
- for (size_t K : {8, 64, 112, 256}) {
- for (size_t N : {8, 64, 112, 256}) {
- run(M, N, K, i);
- }
- }
- }
- }
- }
-
- TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_FP32_MK4) {
- auto args = matrix_mul::get_benchmark_matmul_mk_packed_args(16);
- matrix_mul::benchmark_with_contrast(
- handle(), args, dtype::Float32{}, dtype::Float32{}, dtype::Float32{},
- "AARCH64_F32_MK4_4x16", param::MatrixMul::Format::MK4, dtype::Float32{},
- dtype::Float32{}, dtype::Float32{});
- }
-
- TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_FP32_PACK_MK4) {
- auto args = matrix_mul::get_benchmark_matmul_mk_packed_args(16);
- matrix_mul::benchmark_with_contrast(
- handle(), args, dtype::Float32{}, dtype::Float32{}, dtype::Float32{},
- "AARCH64_F32_MK4_K8X12X1", param::MatrixMul::Format::MK4, dtype::Float32{},
- dtype::Float32{}, dtype::Float32{}, "AARCH64_F32K8X12X1");
- }
-
- TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_INT16x16x32_MK8) {
- auto args = matrix_mul::get_benchmark_matmul_mk_packed_args(8);
- matrix_mul::benchmark_with_contrast(
- handle(), args, dtype::Int16{}, dtype::Int16{}, dtype::Int32{},
- "AARCH64_INT16X16X32_MK8_8X8", param::MatrixMul::Format::MK8,
- dtype::Int16{}, dtype::Int16{}, dtype::Int32{});
- }
-
- TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_FP32_K8X12) {
- constexpr size_t RUNS = 50;
- param::MatrixMul param;
- param.transposeA = param.transposeB = true;
- Benchmarker<MatrixMul> benchmarker_k4x16(handle());
- Benchmarker<MatrixMul> benchmarker_k8x12(handle());
- benchmarker_k4x16.set_param(param).set_display(false).set_times(RUNS);
- benchmarker_k8x12.set_param(param).set_display(false).set_times(RUNS);
- benchmarker_k4x16.set_before_exec_callback(
- AlgoChecker<MatrixMul>("AARCH64_F32K4X16X1"));
-
- benchmarker_k8x12.set_before_exec_callback(
- AlgoChecker<MatrixMul>("AARCH64_F32K8X12X1"));
-
- auto run = [&](size_t M, size_t N, size_t K) {
- auto k4x16_used = benchmarker_k4x16.exec({{K, M}, {N, K}, {}}) / RUNS;
- auto k8x12_used = benchmarker_k8x12.exec({{K, M}, {N, K}, {}}) / RUNS;
- float computations = 2.f * M * K * N * 1e-6;
- printf("run: {%zu{M} %zu{K} %zu{N}} float k4x16: %f ms %f Gflops "
- "k8x12: %f ms "
- "%f Gflops speedup: %f\n",
- M, K, N, k4x16_used, computations / k4x16_used, k8x12_used,
- computations / k8x12_used, k4x16_used / k8x12_used);
- };
-
- run(256, 12 * 24, 256);
-
- for (size_t M : {8, 64, 112, 256}) {
- for (size_t K : {8, 64, 112, 256}) {
- for (size_t N : {8, 64, 112, 256}) {
- run(M, N, K);
- }
- }
- }
- }
-
- TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_FP32_K8X12_NO_TRANS) {
- constexpr size_t RUNS = 50;
- param::MatrixMul param;
- param.transposeA = param.transposeB = false;
- Benchmarker<MatrixMul> benchmarker_k4x16(handle());
- Benchmarker<MatrixMul> benchmarker_k8x12(handle());
- benchmarker_k4x16.set_param(param).set_display(false).set_times(RUNS);
- benchmarker_k8x12.set_param(param).set_display(false).set_times(RUNS);
- benchmarker_k4x16.set_before_exec_callback(
- AlgoChecker<MatrixMul>("AARCH64_F32K4X16X1"));
-
- benchmarker_k8x12.set_before_exec_callback(
- AlgoChecker<MatrixMul>("AARCH64_F32K8X12X1"));
-
- auto run = [&](size_t M, size_t N, size_t K) {
- auto k4x16_used = benchmarker_k4x16.exec({{M, K}, {K, N}, {}}) / RUNS;
- auto k8x12_used = benchmarker_k8x12.exec({{M, K}, {K, N}, {}}) / RUNS;
- float computations = 2.f * M * K * N * 1e-6;
- printf("run: {%zu{M} %zu{K} %zu{N}} float k4x16: %f ms %f Gflops "
- "k8x12: %f ms "
- "%f Gflops speedup: %f\n",
- M, K, N, k4x16_used, computations / k4x16_used, k8x12_used,
- computations / k8x12_used, k4x16_used / k8x12_used);
- };
-
- run(256, 12 * 24, 256);
-
- for (size_t M : {8, 64, 112, 256}) {
- for (size_t K : {8, 64, 112, 256}) {
- for (size_t N : {8, 64, 112, 256}) {
- run(M, N, K);
- }
- }
- }
- }
-
- #endif // MEGDNN_WITH_BENCHMARK
-
- // vim: syntax=cpp.doxygen