@@ -1,11 +1,3 @@ | |||||
#include "src/fallback/matrix_mul/generic_strategy.h" | |||||
#include "src/fallback/matrix_mul/gi/fp32/common.h" | |||||
using namespace megdnn; | |||||
using namespace matmul::fallback; | |||||
namespace { | |||||
#pragma GCC diagnostic push | #pragma GCC diagnostic push | ||||
#pragma GCC diagnostic ignored "-Wuninitialized" | #pragma GCC diagnostic ignored "-Wuninitialized" | ||||
@@ -18,6 +10,15 @@ namespace { | |||||
#endif | #endif | ||||
#endif | #endif | ||||
#endif | #endif | ||||
#include "src/fallback/matrix_mul/generic_strategy.h" | |||||
#include "src/fallback/matrix_mul/gi/fp32/common.h" | |||||
using namespace megdnn; | |||||
using namespace matmul::fallback; | |||||
namespace { | |||||
void kern_4x12( | void kern_4x12( | ||||
const float* packA, const float* packB, int K, float* output, int LDC, | const float* packA, const float* packB, int K, float* output, int LDC, | ||||
bool is_first_k, int m_remain) { | bool is_first_k, int m_remain) { | ||||
@@ -615,7 +616,6 @@ void kern_4x4( | |||||
} | } | ||||
} | } | ||||
} | } | ||||
#pragma GCC diagnostic pop | |||||
void gi_sgemm_4x12_pack_A_n( | void gi_sgemm_4x12_pack_A_n( | ||||
float* outptr, const float* inptr, int ldin, int y0, int ymax, int k0, | float* outptr, const float* inptr, int ldin, int y0, int ymax, int k0, | ||||
@@ -571,6 +571,12 @@ TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT32_MK_4X2X16) { | |||||
} | } | ||||
} | } | ||||
TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_ARMV7_F32) {
    // Benchmark the ARMV7_F32 algorithm over the shared matmul shape set,
    // using the plain (DEFAULT) row-major layout.
    const auto shapes = matrix_mul::get_benchmark_matmul_args();
    matrix_mul::benchmark_single_algo(
            handle(), shapes, dtype::Float32{}, dtype::Float32{}, dtype::Float32{},
            "ARMV7_F32", param::MatrixMul::Format::DEFAULT);
}
#endif | #endif | ||||
// vim: syntax=cpp.doxygen | // vim: syntax=cpp.doxygen |
@@ -429,6 +429,68 @@ void matrix_mul::benchmark_with_contrast( | |||||
} | } | ||||
} | } | ||||
void matrix_mul::benchmark_single_algo(
        Handle* handle, const std::vector<TestArg>& args, DType A_dtype, DType B_dtype,
        DType C_dtype, const char* algo, param::MatrixMul::Format format) {
    using Param = MatrixMul::Param;
    // A and B must share an element type for a meaningful matmul benchmark.
    megdnn_assert(A_dtype.enumv() == B_dtype.enumv());
    constexpr size_t RUNS = 50;
    Benchmarker<MatrixMul> benchmark(handle);
    if (algo) {
        // Pin execution to the named algorithm instead of the heuristic pick.
        benchmark.set_before_exec_callback(AlgoChecker<MatrixMul>(algo));
    }
    benchmark.set_dtype(0, A_dtype).set_dtype(1, B_dtype).set_dtype(2, C_dtype);
    benchmark.set_times(RUNS);
    // Run one (m, n, k) case; returns the average time of a single run in ms.
    auto run_one = [](Benchmarker<MatrixMul>& bench, Param param,
                      param::MatrixMul::Format fmt, size_t m, size_t n, size_t k,
                      size_t pack) -> float {
        param.format = fmt;
        bench.set_param(param);
        if (fmt == param::MatrixMul::Format::DEFAULT) {
            // Plain 2-D layout; transpose flags swap the logical extents.
            size_t a_row = m * pack, a_col = k * pack;
            size_t b_row = k * pack, b_col = n;
            if (param.transposeA)
                std::swap(a_row, a_col);
            if (param.transposeB)
                std::swap(b_row, b_col);
            return bench.execs({{a_row, a_col}, {b_row, b_col}, {}}) / RUNS;
        }
        // Blocked layout (e.g. MK4): A carries a pack x pack inner tile and
        // B carries the pack dim as its innermost axis.
        size_t a_row = m, a_col = k, b_row = k, b_col = n;
        if (param.transposeA)
            std::swap(a_row, a_col);
        if (param.transposeB)
            std::swap(b_row, b_col);
        return bench.execs({{a_row, a_col, pack, pack}, {b_row, b_col, pack}, {}}) /
               RUNS;
    };
    const size_t pack = MatrixMulForward::pack_size(format);
    for (const auto& arg : args) {
        Param param;
        param.transposeA = arg.mask & 0x1;
        param.transposeB = arg.mask & 0x2;
        const float time_ms =
                run_one(benchmark, param, format, arg.m, arg.n, arg.k, pack);
        // 2*M*K*N flops, scaled by 1e-6 so dividing by ms yields Gflops.
        const float mflops =
                2.f * arg.m * pack * arg.k * pack * arg.n * 1e-6;
        printf("run: {(%zu, %zu) x (%zu, %zu)} %f ms %f Gflops\n", arg.m * pack,
               arg.k * pack, arg.k * pack, arg.n, time_ms, mflops / time_ms);
    }
}
#endif | #endif | ||||
// vim: syntax=cpp.doxygen | // vim: syntax=cpp.doxygen |
@@ -85,6 +85,10 @@ void benchmark_with_contrast( | |||||
DType contrast_B_dtype = dtype::Float32{}, | DType contrast_B_dtype = dtype::Float32{}, | ||||
DType contrast_C_dtype = dtype::Float32{}, const char* contrast_algo = nullptr, | DType contrast_C_dtype = dtype::Float32{}, const char* contrast_algo = nullptr, | ||||
param::MatrixMul::Format contrast_format = param::MatrixMul::Format::DEFAULT); | param::MatrixMul::Format contrast_format = param::MatrixMul::Format::DEFAULT); | ||||
/**
 * \brief Benchmark one matmul algorithm over a set of (m, n, k) test shapes.
 *
 * \param handle  backend handle the benchmark runs on
 * \param args    shapes (and transpose masks) to time
 * \param A_dtype element type of operand A (must match B's enum)
 * \param B_dtype element type of operand B
 * \param C_dtype element type of the output C
 * \param algo    algorithm name to pin via AlgoChecker; nullptr lets the
 *                heuristic choose
 * \param format  tensor layout (DEFAULT 2-D, or a blocked format such as MK4)
 */
void benchmark_single_algo(
        Handle* handle, const std::vector<TestArg>& args, DType A_dtype, DType B_dtype,
        DType C_dtype, const char* algo = nullptr,
        param::MatrixMul::Format format = param::MatrixMul::Format::DEFAULT);
#endif | #endif | ||||
} // namespace matrix_mul | } // namespace matrix_mul | ||||
@@ -154,6 +154,16 @@ TEST_F(FALLBACK, BATCHED_MATRIX_MUL) { | |||||
checker.execs({AL, BL, {}}); | checker.execs({AL, BL, {}}); | ||||
} | } | ||||
} | } | ||||
#if MEGDNN_WITH_BENCHMARK
TEST_F(FALLBACK, BENCHMARK_MATRIX_MUL_FB_GI_F32_4x12) {
    // Benchmark the fallback GI float32 4x12 kernel over the shared matmul
    // shape set, using the plain (DEFAULT) row-major layout.
    const auto shapes = matrix_mul::get_benchmark_matmul_args();
    matrix_mul::benchmark_single_algo(
            handle(), shapes, dtype::Float32{}, dtype::Float32{}, dtype::Float32{},
            "FB_GI_F32_4x12", param::MatrixMul::Format::DEFAULT);
}
#endif
} // namespace test | } // namespace test | ||||
} // namespace megdnn | } // namespace megdnn | ||||