|
- #if MEGDNN_WITH_BENCHMARK
- #include "test/arm_common/fixture.h"
-
- #include "megdnn/oprs.h"
- #include "test/common/benchmarker.h"
- #include "test/common/checker.h"
- #include "test/common/rng.h"
-
- using namespace megdnn;
- using namespace test;
-
// Runs one benchmark pass of the currently configured mode on a
// 355600-element tensor whose input 0 is generated by a fresh UniformFloatRNG
// constructed with bounds (0.f, max_val).
//
// max_val is set to 88.3762626647949 / (1 - proportion_of_inf), i.e. the
// fraction of the range [0, max_val] lying above 88.3762626647949 equals
// `proportion_of_inf`. (Presumably 88.3762626647949 is the input bound beyond
// which the exp-based kernels saturate -- TODO confirm.)
//
// Requirements on the expansion site:
//   * a benchmarker named `B` and an assignable float named `max_val`;
//   * `dataset_number` must be unique within the scope, since it is pasted
//     into the RNG variable name.
// The RNG is intentionally declared directly in the caller's scope (no
// do { } while (0) wrapper): B is handed the RNG's address, so the object
// presumably has to stay alive for later B.execs calls as well.
#define TEST_IN_DIFF_DISTRUBUTION(proportion_of_inf, dataset_number) \
    max_val = 88.3762626647949f / (1 - (proportion_of_inf));         \
    UniformFloatRNG rng##dataset_number(0.f, max_val);               \
    B.set_rng(0, &rng##dataset_number);                              \
    B.execs({{355600}, {}});
-
- TEST_F(ARM_COMMON, BENCHMARK_ELEM_UNARY_FLOATONLY) {
- Benchmarker<ElemwiseForward> B(handle());
- using Mode = ElemwiseForward::Param::Mode;
- // UniformFloatWithZeroRNG rng(80, 100, 0.1);
- printf("Test Optr exp(x)\n");
- B.set_param(Mode::EXP);
- B.execs({{355600}, {}});
-
- B.set_param(Mode::EXP);
- B.execs({{355600}, {}});
- float max_val = 0;
- TEST_IN_DIFF_DISTRUBUTION(0.25, 1)
- TEST_IN_DIFF_DISTRUBUTION(0.5, 2)
- TEST_IN_DIFF_DISTRUBUTION(0.75, 3)
- TEST_IN_DIFF_DISTRUBUTION(0.9999, 4)
-
- printf("Test Optr tanh(x)\n");
- B.set_param(Mode::TANH);
- B.execs({{355600}, {}});
-
- B.set_param(Mode::TANH);
- B.execs({{355600}, {}});
- max_val = 0;
- TEST_IN_DIFF_DISTRUBUTION(0.25, 5)
- TEST_IN_DIFF_DISTRUBUTION(0.5, 6)
- TEST_IN_DIFF_DISTRUBUTION(0.75, 7)
- TEST_IN_DIFF_DISTRUBUTION(0.9999, 8)
-
- printf("Test Optr fast_tanh(x)\n");
- B.set_param(Mode::FAST_TANH);
- B.execs({{355600}, {}});
-
- printf("Test Optr sigmoid(x)\n");
- B.set_param(Mode::SIGMOID);
- B.execs({{355600}, {}});
- TEST_IN_DIFF_DISTRUBUTION(0.25, 9)
- TEST_IN_DIFF_DISTRUBUTION(0.5, 10)
- TEST_IN_DIFF_DISTRUBUTION(0.75, 11)
- TEST_IN_DIFF_DISTRUBUTION(0.9999, 12)
-
- B.set_param(Mode::SIGMOID);
- B.execs({{355600}, {}});
- max_val = 0;
- TEST_IN_DIFF_DISTRUBUTION(0.25, 13)
- TEST_IN_DIFF_DISTRUBUTION(0.5, 14)
- TEST_IN_DIFF_DISTRUBUTION(0.75, 15)
- TEST_IN_DIFF_DISTRUBUTION(0.9999, 16)
- }
-
- TEST_F(ARM_COMMON, BENCHMARK_ELEMWISE_UNARY) {
- Benchmarker<ElemwiseForward> B(handle());
- using Mode = ElemwiseForward::Param::Mode;
-
- const size_t RUN_TIMES = 10;
- B.set_times(RUN_TIMES).set_display(false);
-
- auto run_unary = [&](const TensorShape& shape, param::Elemwise::Mode mode,
- const char* mode_str, DType dtype) {
- B.set_param(mode).set_dtype(0, dtype);
- float time = B.execs({shape, {}}) / RUN_TIMES;
- float computations = shape.total_nr_elems() * 2 / (1024.f * 1024.f * 1024.f);
- printf("%s(%s):\tlayout(%s)\ttime(%fms)\tbandwidth(%fGBps)\n", mode_str,
- dtype.name(), shape.to_string().c_str(), time,
- computations * dtype.size() / time * 1e3);
- };
- #define RUN(shape, mode, dtype) run_unary(shape, mode, #mode, dtype);
-
- #define BENCHMARK_CASES_INT(shape, dtype) \
- RUN(shape, Mode::RELU, dtype) \
- RUN(shape, Mode::ABS, dtype)
-
- #define BENCHMARK_CASES_FLOAT(shape, dtype) \
- BENCHMARK_CASES_INT(shape, dtype) \
- RUN(shape, Mode::SIGMOID, dtype) \
- RUN(shape, Mode::EXP, dtype) \
- RUN(shape, Mode::TANH, dtype) \
- RUN(shape, Mode::FAST_TANH, dtype)
-
- TensorShape shape = {10, 50, 10, 100};
- BENCHMARK_CASES_INT(shape, dtype::Int32());
- BENCHMARK_CASES_INT(shape, dtype::Int16());
- BENCHMARK_CASES_INT(shape, dtype::Int8());
- BENCHMARK_CASES_FLOAT(shape, dtype::Float32());
- #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- BENCHMARK_CASES_FLOAT(shape, dtype::Float16());
- #endif
- #undef BENCHMARK_CASES_INT
- #undef BENCHMARK_CASES_FLOAT
- #undef RUN
- }
-
- TEST_F(ARM_COMMON, BENCHMARK_ELEMWISE_UNARY_MULTI_TYPE) {
- Benchmarker<ElemwiseMultiType> B(handle());
- using Mode = ElemwiseMultiType::Param::Mode;
-
- const size_t RUN_TIMES = 20;
- B.set_times(RUN_TIMES).set_display(false);
-
- auto run_unary = [&](const TensorShape& shape, Mode mode, const char* mode_str,
- DType src_dtype, DType dst_dtype) {
- B.set_param(mode).set_dtype(0, src_dtype).set_dtype(1, dst_dtype);
- float time = B.execs({shape, {}}) / RUN_TIMES;
- float computations = shape.total_nr_elems() * 2 / (1024.f * 1024.f * 1024.f);
- printf("type %s %s(%s) to %s \ttime(%fms)\tbandwidth(%fGBps)\n", mode_str,
- src_dtype.name(), shape.to_string().c_str(), dst_dtype.name(), time,
- computations * src_dtype.size() / time * 1e3);
- };
-
- #define RUN(shape, mode, src_dtype, dst_dtye) \
- run_unary(shape, mode, #mode, src_dtype, dst_dtye);
-
- #define BENCHMARK_CASES_INT(shape, src_dtype, dst_dtye) \
- RUN(shape, Mode::QRELU, src_dtype, dst_dtye) \
- RUN(shape, Mode::QABS, src_dtype, dst_dtye)
-
- TensorShape shape = {10, 50, 10, 100};
- BENCHMARK_CASES_INT(shape, dtype::QuantizedS32(62.5f), dtype::QuantizedS8(2.5f));
- #undef BENCHMARK_CASES_INT
- #undef BENCHMARK_CASES_FLOAT
- #undef RUN
- }
-
- TEST_F(ARM_COMMON, BENCHMARK_ELEMWISE_BINARY) {
- Benchmarker<ElemwiseForward> B(handle());
- using Mode = ElemwiseForward::Param::Mode;
-
- const size_t RUN_TIMES = 10;
- B.set_times(RUN_TIMES).set_display(false);
-
- auto run_binary = [&](const TensorShape& shape0, const TensorShape& shape1,
- param::Elemwise::Mode mode, const char* mode_str,
- DType dtype) {
- B.set_param(mode).set_dtype(0, dtype).set_dtype(1, dtype);
- float time = B.execs({shape0, shape1, {}}) / RUN_TIMES;
- float bandwidth = (shape0.total_nr_elems() + shape1.total_nr_elems() +
- std::max(shape0.total_nr_elems(), shape1.total_nr_elems())) /
- (1024.f * 1024.f * 1024.f) * dtype.size() / time * 1e3;
- printf("%s(%s):\tlayout(%s %s)\ttime(%fms)\tbandwidth(%fGBps)\n", mode_str,
- dtype.name(), shape0.to_string().c_str(), shape1.to_string().c_str(),
- time, bandwidth);
- };
- #define RUN(shape0, shape1, mode, dtype) run_binary(shape0, shape1, mode, #mode, dtype);
-
- #define BENCHMARK_CASES_INT(shape0, shape1, dtype) \
- RUN(shape0, shape1, Mode::ADD, dtype) \
- RUN(shape0, shape1, Mode::MIN, dtype) \
- RUN(shape0, shape1, Mode::MAX, dtype) \
- RUN(shape0, shape1, Mode::SUB, dtype) \
- RUN(shape0, shape1, Mode::MUL, dtype) \
- RUN(shape0, shape1, Mode::FUSE_ADD_RELU, dtype)
-
- #define BENCHMARK_CASES_FLOAT(shape0, shape1, dtype) \
- BENCHMARK_CASES_INT(shape0, shape1, dtype) \
- RUN(shape0, shape1, Mode::TRUE_DIV, dtype) \
- RUN(shape0, shape1, Mode::FUSE_ADD_SIGMOID, dtype) \
- RUN(shape0, shape1, Mode::FUSE_ADD_TANH, dtype)
-
- #define BENCHMARK_CASES_EVERY_DTYPE(shape0, shape1) \
- BENCHMARK_CASES_INT(shape0, shape1, dtype::Int32()); \
- BENCHMARK_CASES_INT(shape0, shape1, dtype::Int16()); \
- BENCHMARK_CASES_INT(shape0, shape1, dtype::Int8()); \
- BENCHMARK_CASES_FLOAT(shape0, shape1, dtype::Float32());
-
- TensorShape shape0 = {10, 50, 10, 100};
- TensorShape shape1 = {10, 50, 10, 100};
- BENCHMARK_CASES_EVERY_DTYPE(shape0, shape1);
-
- shape1 = {1, 50, 1, 1};
- BENCHMARK_CASES_EVERY_DTYPE(shape0, shape1);
-
- shape1 = {1, 1, 1, 1};
- BENCHMARK_CASES_EVERY_DTYPE(shape0, shape1);
- #undef BENCHMARK_CASES_EVERY_DTYPE
- #undef BENCHMARK_CASES_FLOAT
- #undef BENCHMARK_CASES_INT
- #undef RUN
- }
-
- TEST_F(ARM_COMMON, BENCHMARK_ELEMWISE_TERNARY) {
- Benchmarker<ElemwiseForward> B(handle());
- using Mode = ElemwiseForward::Param::Mode;
-
- const size_t RUN_TIMES = 10;
- B.set_times(RUN_TIMES).set_display(false);
-
- auto run_ternary = [&](const TensorShape& shape0, const TensorShape& shape1,
- const TensorShape& shape2, param::Elemwise::Mode mode,
- const char* mode_str, DType dtype) {
- B.set_param(mode).set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype);
- float time = B.execs({shape0, shape1, shape2, {}}) / RUN_TIMES;
- float bandwidth = (shape0.total_nr_elems() * 2 + shape1.total_nr_elems() +
- shape2.total_nr_elems()) /
- (1024.f * 1024.f * 1024.f) * dtype.size() / time * 1e3;
- printf("%s(%s):\tlayout(%s %s %s)\ttime(%fms)\tbandwidth(%fGBps)\n", mode_str,
- dtype.name(), shape0.to_string().c_str(), shape1.to_string().c_str(),
- shape2.to_string().c_str(), time, bandwidth);
- };
-
- TensorShape shape = {10, 50, 10, 100};
- run_ternary(
- shape, shape, shape, Mode::FUSE_MUL_ADD3, "FUSE_MUL_ADD3", dtype::Int32());
- run_ternary(
- shape, shape, shape, Mode::FUSE_MUL_ADD3, "FUSE_MUL_ADD3", dtype::Int16());
- run_ternary(
- shape, shape, shape, Mode::FUSE_MUL_ADD3, "FUSE_MUL_ADD3", dtype::Int8());
- run_ternary(
- shape, shape, shape, Mode::FUSE_MUL_ADD3, "FUSE_MUL_ADD3",
- dtype::Float32());
- #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- run_ternary(
- shape, {1}, {1}, Mode::FUSE_MUL_ADD3, "FUSE_MUL_ADD3", dtype::Float32());
- run_ternary(
- shape, {1}, {1}, Mode::FUSE_MUL_ADD3, "FUSE_MUL_ADD3", dtype::Float16());
- run_ternary(
- {1}, shape, {1}, Mode::FUSE_MUL_ADD3, "FUSE_MUL_ADD3", dtype::Float32());
- run_ternary(
- {1}, shape, {1}, Mode::FUSE_MUL_ADD3, "FUSE_MUL_ADD3", dtype::Float16());
- #endif
- }
- #endif
|