// Unified diff (git format) touching five files under dnn/.
// NOTE(review): the text appears to have lost its original line breaks (each
// hunk is folded into a few long lines) and, presumably, its template argument
// lists (e.g. `const std::vector& args`, `Benchmarker benchmark(handle)`,
// `AlgoChecker(algo)`) -- confirm against the original commit before trying to
// apply it.
//
// What the hunks show:
//  * src/fallback/matrix_mul/gi/fp32/strategy_4x12.cpp: the two #includes, the
//    using-directives and the opening of the anonymous namespace move from the
//    top of the file to below the `#pragma GCC diagnostic push` /
//    `ignored "-Wuninitialized"` block, and the `#pragma GCC diagnostic pop`
//    that followed kern_4x4 is removed.
//    NOTE(review): no replacement pop is visible in these hunks -- confirm that
//    leaving the push unbalanced is intended.
//  * test/armv7/matrix_mul.cpp: adds
//    TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_ARMV7_F32), which passes
//    get_benchmark_matmul_args() to benchmark_single_algo with Float32 dtypes,
//    algo name "ARMV7_F32" and Format::DEFAULT.
//  * test/common/matrix_mul.cpp: adds matrix_mul::benchmark_single_algo. It
//    asserts that A_dtype and B_dtype have the same enumv(), installs an
//    AlgoChecker callback only when `algo` is non-null, and calls
//    set_times(RUNS) with RUNS = 50. For each arg it derives transposeA /
//    transposeB from bits 0x1 / 0x2 of arg.mask. For Format::DEFAULT the
//    operands are 2-D with m and k scaled by pack_size; otherwise A is
//    {A0, A1, pack_size, pack_size} and B is {B0, B1, pack_size}. The value
//    returned by execs() is divided by RUNS and printed as "ms", together with
//    2 * (m * pack_size) * (k * pack_size) * n * 1e-6 / time as "Gflops".
//    NOTE(review): `TensorShape A, B;` in the DEFAULT branch is never used.
//  * test/common/matrix_mul.h: declares benchmark_single_algo with the
//    defaults algo = nullptr and format = Format::DEFAULT.
//  * test/fallback/matrix_mul.cpp: adds, under `#if MEGDNN_WITH_BENCHMARK`,
//    TEST_F(FALLBACK, BENCHMARK_MATRIX_MUL_FB_GI_F32_4x12), which runs the same
//    helper with algo name "FB_GI_F32_4x12".
diff --git a/dnn/src/fallback/matrix_mul/gi/fp32/strategy_4x12.cpp b/dnn/src/fallback/matrix_mul/gi/fp32/strategy_4x12.cpp index d413252d..2a842ff4 100644 --- a/dnn/src/fallback/matrix_mul/gi/fp32/strategy_4x12.cpp +++ b/dnn/src/fallback/matrix_mul/gi/fp32/strategy_4x12.cpp @@ -1,11 +1,3 @@ -#include "src/fallback/matrix_mul/generic_strategy.h" -#include "src/fallback/matrix_mul/gi/fp32/common.h" - -using namespace megdnn; -using namespace matmul::fallback; - -namespace { - #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wuninitialized" @@ -18,6 +10,15 @@ namespace { #endif #endif #endif + +#include "src/fallback/matrix_mul/generic_strategy.h" +#include "src/fallback/matrix_mul/gi/fp32/common.h" + +using namespace megdnn; +using namespace matmul::fallback; + +namespace { + void kern_4x12( const float* packA, const float* packB, int K, float* output, int LDC, bool is_first_k, int m_remain) { @@ -615,7 +616,6 @@ void kern_4x4( } } } -#pragma GCC diagnostic pop void gi_sgemm_4x12_pack_A_n( float* outptr, const float* inptr, int ldin, int y0, int ymax, int k0, diff --git a/dnn/test/armv7/matrix_mul.cpp b/dnn/test/armv7/matrix_mul.cpp index c80b6abf..ca79771e 100644 --- a/dnn/test/armv7/matrix_mul.cpp +++ b/dnn/test/armv7/matrix_mul.cpp @@ -571,6 +571,12 @@ TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT32_MK_4X2X16) { } } +TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_ARMV7_F32) { + auto args = matrix_mul::get_benchmark_matmul_args(); + matrix_mul::benchmark_single_algo( + handle(), args, dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, + "ARMV7_F32", param::MatrixMul::Format::DEFAULT); +} #endif // vim: syntax=cpp.doxygen diff --git a/dnn/test/common/matrix_mul.cpp b/dnn/test/common/matrix_mul.cpp index 31c5b7e1..aac4f825 100644 --- a/dnn/test/common/matrix_mul.cpp +++ b/dnn/test/common/matrix_mul.cpp @@ -429,6 +429,68 @@ void matrix_mul::benchmark_with_contrast( } } +void matrix_mul::benchmark_single_algo( + Handle* handle, const std::vector& args, DType A_dtype, DType 
B_dtype, + DType C_dtype, const char* algo, param::MatrixMul::Format format) { + using Param = MatrixMul::Param; + + megdnn_assert(A_dtype.enumv() == B_dtype.enumv()); + Benchmarker benchmark(handle); + constexpr size_t RUNS = 50; + if (algo) { + benchmark.set_before_exec_callback(AlgoChecker(algo)); + } + benchmark.set_dtype(0, A_dtype).set_dtype(1, B_dtype).set_dtype(2, C_dtype); + benchmark.set_times(RUNS); + + auto bench = [](Benchmarker& benchmark, Param param, + param::MatrixMul::Format format, size_t m, size_t n, size_t k, + size_t pack_size) -> float { + param.format = format; + benchmark.set_param(param); + float used_algo = 1.0; + if (format == param::MatrixMul::Format::DEFAULT) { + size_t A0 = m * pack_size, A1 = k * pack_size, B0 = k * pack_size, B1 = n; + TensorShape A, B; + if (param.transposeA) { + std::swap(A0, A1); + } + if (param.transposeB) { + std::swap(B0, B1); + } + used_algo = benchmark.execs({{A0, A1}, {B0, B1}, {}}) / RUNS; + } else { + size_t A0 = m, A1 = k, B0 = k, B1 = n; + if (param.transposeA) { + std::swap(A0, A1); + } + if (param.transposeB) { + std::swap(B0, B1); + } + + used_algo = + benchmark.execs( + {{A0, A1, pack_size, pack_size}, {B0, B1, pack_size}, {}}) / + RUNS; + } + return used_algo; + }; + + size_t pack_size = MatrixMulForward::pack_size(format); + for (auto& arg : args) { + Param param; + param.transposeA = arg.mask & 0x1; + param.transposeB = arg.mask & 0x2; + + auto used_algo = + bench(benchmark, param, format, arg.m, arg.n, arg.k, pack_size); + + float computations = 2.f * arg.m * pack_size * arg.k * pack_size * arg.n * 1e-6; + printf("run: {(%zu, %zu) x (%zu, %zu)} %f ms %f Gflops\n", arg.m * pack_size, + arg.k * pack_size, arg.k * pack_size, arg.n, used_algo, + computations / used_algo); + } +} #endif // vim: syntax=cpp.doxygen diff --git a/dnn/test/common/matrix_mul.h b/dnn/test/common/matrix_mul.h index 1b3f2fe0..374428fe 100644 --- a/dnn/test/common/matrix_mul.h +++ b/dnn/test/common/matrix_mul.h @@ -85,6 +85,10 
@@ void benchmark_with_contrast( DType contrast_B_dtype = dtype::Float32{}, DType contrast_C_dtype = dtype::Float32{}, const char* contrast_algo = nullptr, param::MatrixMul::Format contrast_format = param::MatrixMul::Format::DEFAULT); +void benchmark_single_algo( + Handle* handle, const std::vector& args, DType A_dtype, DType B_dtype, + DType C_dtype, const char* algo = nullptr, + param::MatrixMul::Format format = param::MatrixMul::Format::DEFAULT); #endif } // namespace matrix_mul diff --git a/dnn/test/fallback/matrix_mul.cpp b/dnn/test/fallback/matrix_mul.cpp index 1d50cb04..b1b71a70 100644 --- a/dnn/test/fallback/matrix_mul.cpp +++ b/dnn/test/fallback/matrix_mul.cpp @@ -154,6 +154,16 @@ TEST_F(FALLBACK, BATCHED_MATRIX_MUL) { checker.execs({AL, BL, {}}); } } + +#if MEGDNN_WITH_BENCHMARK +TEST_F(FALLBACK, BENCHMARK_MATRIX_MUL_FB_GI_F32_4x12) { + auto args = matrix_mul::get_benchmark_matmul_args(); + matrix_mul::benchmark_single_algo( + handle(), args, dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, + "FB_GI_F32_4x12", param::MatrixMul::Format::DEFAULT); +} + +#endif } // namespace test } // namespace megdnn