
convolution.cpp

#include "test/armv7/fixture.h"
#include "test/common/benchmarker.h"
#include "test/common/checker.h"
#include "test/common/convolution.h"
#include "test/common/rng.h"

using namespace megdnn;
using namespace test;

#if MEGDNN_WITH_BENCHMARK
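// Benchmark float32 convolution with stride 2 over a sweep of kernel sizes,
// channel counts and spatial sizes, printing the achieved MFLOPS per shape.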
TEST_F(ARMV7, BENCHMARK_CONVOLUTION_STRIDE2) {
    using Param = param::Convolution;
    auto run = [&](const TensorShapeArray& shapes, Param param) {
        Benchmarker<Convolution> benchmarker_float(handle());
        size_t RUN = 100;
        auto tfloat = benchmarker_float.set_display(false)
                              .set_times(RUN)
                              .set_param(param)
                              .exec(shapes);
        size_t IC = shapes[1][1];
        size_t FH = shapes[1][2];
        size_t FW = shapes[1][3];
        TensorLayout dst_layout;
        auto opr = handle()->create_operator<Convolution>();
        opr->param() = param;
        opr->deduce_layout(
                {shapes[0], dtype::Float32()}, {shapes[1], dtype::Float32()},
                dst_layout);
        printf("flops: %.3f mflops\n",
               (IC * dst_layout.total_nr_elems() * FH * FW * 2) /
                       (tfloat / RUN * 1000));
    };
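    // Build a convolution param with the given stride and kernel/2 padding, then
    // time a single-batch NCHW input against an OIHW filter via run().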
    auto profile = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
                       size_t stride) {
        Param param;
        param.stride_h = stride;
        param.stride_w = stride;
        param.pad_h = kernel / 2;
        param.pad_w = kernel / 2;
        printf("oc: %zd ic: %zd w: %zd h: %zd stride: %zd kernel_size: %zd\n", oc, ic,
               w, h, stride, kernel);
        run({{1, ic, h, w}, {oc, ic, kernel, kernel}, {}}, param);
    };
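    // Sweep kernel sizes, input/output channel counts and square spatial sizes,
    // all at stride 2.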
    for (size_t kernel : {2, 3, 5, 7}) {
        for (size_t ic : {3, 6, 12, 24}) {
            for (size_t oc : {3, 6, 12, 24}) {
                for (size_t size : {4, 7, 8, 14, 16, 17, 28, 32, 34, 64, 112}) {
                    profile(oc, ic, size, size, kernel, 2);
                }
            }
        }
    }
}
#endif
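// Benchmark 1x1 convolution against the equivalent (OC x IC) * (IC x H*W)
// matrix multiplication, reporting both in Gflops.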
TEST_F(ARMV7, BENCHMARK_CONVOLUTION_1X1) {
    int exec_times = 50;
    Benchmarker<MatrixMul> benchmarker_gemm(handle());
    benchmarker_gemm.set_times(exec_times);
    Benchmarker<Convolution> benchmarker(handle());
    benchmarker.set_times(exec_times);
    float mod = 1000 * exec_times / 1e9;
    auto run = [&](size_t IC, size_t OC, size_t H, size_t W) {
        float time = 1.f, perf = 1.f;
        std::cout << std::endl;
        std::cout << "CONV: IC " << IC << ", OC " << OC << ", H " << H << ", W " << W
                  << std::endl;
        time = benchmarker.exec({{1, IC, H, W}, {OC, IC, 1, 1}, {1, OC, H, W}});
        perf = OC * (2 * H * W - 1) * IC / time * mod;
        std::cout << "Performance is " << perf << " Gflops" << std::endl;
        std::cout << "GEMM: (" << OC << ", " << H * W << ", " << IC << ")" << std::endl;
        // time = benchmarker_gemm.exec({{OC, H*W}, {H*W, IC}, {}});
        // perf = OC * (2 * H * W - 1) * IC / time * mod;
        time = benchmarker_gemm.exec({{OC, IC}, {IC, H * W}, {}});
        perf = OC * (2 * IC - 1) * H * W / time * mod;
        std::cout << "Performance is " << perf << " Gflops" << std::endl;
    };
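    // Benchmarked shapes, invoked as run(IC, OC, H, W).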
    // run(32, 32, 64, 64);
    // run(8, 8, 32, 32);
    // run(32, 32, 128, 128);
    // run(32, 32, 512, 512);
    // run(10,10,2,5);
    // run(100,100,2,50);
    run(16, 4, 240, 135);
    run(8, 32, 120, 67);
    run(16, 64, 60, 33);
    run(1, 1, 28, 28);
    run(8, 1, 28, 28);
    run(2, 2, 28, 28);
    run(8, 2, 28, 28);
    run(4, 4, 28, 28);
    run(16, 4, 28, 28);
}
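// Benchmark grouped 1x1 convolution with a {group, OCg, ICg, 1, 1} filter,
// reporting throughput in Gflops.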
TEST_F(ARMV7, BENCHMARK_GROUP_CONVOLUTION_1X1) {
    int exec_times = 50;
    Benchmarker<Convolution> benchmarker_gconv1x1(handle());
    benchmarker_gconv1x1.set_times(exec_times);
    float mod = 1000 * exec_times / 1e9;
    auto run = [&](size_t IC, size_t OC, size_t H, size_t W, size_t group) {
        float time = 1.f, perf = 1.f;
        std::cout << std::endl;
        std::cout << "GCONV: IC " << IC << ", OC " << OC << ", H " << H << ", W " << W
                  << ", GROUP " << group << std::endl;
        auto ICg = IC / group;
        auto OCg = OC / group;
        param::Convolution param;
        param.sparse = param::Convolution::Sparse::GROUP;
        time = benchmarker_gconv1x1.set_param(param).exec(
                {{1, IC, H, W}, {group, OCg, ICg, 1, 1}, {}});
        perf = group * OCg * ICg * H * W / time * mod;
        std::cout << "Performance is " << perf << " Gflops" << std::endl;
    };
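    // Benchmarked shapes, invoked as run(IC, OC, H, W, group); channel counts
    // are multiples of the group count.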
    run(8 * 4, 1 * 4, 28, 28, 4);
    run(2 * 4, 2 * 4, 28, 28, 4);
    run(8 * 4, 2 * 4, 28, 28, 4);
    run(4 * 4, 4 * 4, 28, 28, 4);
    run(16 * 4, 4 * 4, 28, 28, 4);
}

// vim: syntax=cpp.doxygen