
conv_bias.cpp

/**
 * \file dnn/test/aarch64/conv_bias.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#include "test/aarch64/fixture.h"

#include "src/fallback/conv_bias/common.h"
#include "test/common/benchmarker.h"
#include "test/common/checker.h"
#include "test/common/conv_bias.h"
#include "test/common/rng.h"
#include "test/common/task_record_check.h"
#include "test/common/tensor.h"

namespace megdnn {
namespace test {
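
//! Generate (src, filter, bias) test cases over every combination of batch,
//! channel counts, spatial size and kernel size below; each combination is
//! emitted three times: without bias, with a per-channel (broadcast) bias,
//! and with a full-shape bias tensor.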
std::vector<conv_bias::TestArg> get_conv_bias_args(
        std::vector<size_t> kernel, size_t stride) {
    using namespace conv_bias;
    using Param = param::ConvBias;
    using NLMode = param::ConvBias::NonlineMode;
    std::vector<TestArg> args;
    auto pack = [&](size_t n, size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
                    size_t stride, NLMode nonline_mode) {
        Param param;
        param.stride_h = stride;
        param.stride_w = stride;
        param.pad_h = kernel == 1 ? 0 : kernel / 2;
        param.pad_w = kernel == 1 ? 0 : kernel / 2;
        param.nonlineMode = nonline_mode;
        //! no bias
        args.emplace_back(
                param, TensorShape{n, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
                TensorShape{});
        //! bias broadcast channel
        args.emplace_back(
                param, TensorShape{n, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
                TensorShape{1, oc, 1, 1});
        //! bias
        args.emplace_back(
                param, TensorShape{n, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
                TensorShape{
                        n, oc, (h + 2 * param.pad_h - kernel) / stride + 1,
                        (w + 2 * param.pad_w - kernel) / stride + 1});
    };

    for (auto nlmode : {NLMode::IDENTITY, NLMode::RELU, NLMode::SIGMOID}) {
        for (size_t n : {1, 2}) {
            for (size_t ic : {1, 2, 3, 4, 8}) {
                for (size_t oc : {1, 2, 3, 4, 8}) {
                    for (size_t size : {1, 2, 3, 4, 8, 24}) {
                        for (size_t k : kernel) {
                            pack(n, oc, ic, size + 24, size + 24, k, stride, nlmode);
                        }
                    }
                }
            }
        }
    }
    return args;
}
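
//! Check every case in \p args against the reference implementation, forcing
//! ConvBias to select the algorithm named \p algo_name.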
void checker_conv_bias(
        std::vector<conv_bias::TestArg> args, Handle* handle, const char* algo_name) {
    using namespace conv_bias;
    Checker<ConvBias> checker(handle);
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name));
    for (auto&& arg : args) {
        checker.set_param(arg.param).execs({arg.src, arg.filter, arg.bias, {}, {}});
    }
}
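
//! Correctness of the direct stride-2 fp32 algorithm (ARMV8F32STRD2) for
//! kernel sizes 2/3/5/7.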
TEST_F(AARCH64_MULTI_THREADS, CONVBIAS_DIRECT_FP32_STR2) {
    check_conv_bias(
            conv_bias::get_conv_bias_args({2, 3, 5, 7}, 2, false, false, false),
            handle(), "ARMV8F32STRD2");
}

TEST_F(AARCH64_MULTI_THREADS, CONVBIAS_RECORD) {
    auto args = conv_bias::get_conv_bias_args({2, 3, 5, 7}, 2, false, false, false);
    TaskRecordChecker<ConvBias> checker(0);
    for (auto&& arg : args) {
        checker.set_param(arg.param).execs({arg.src, arg.filter, arg.bias, {}, {}});
    }
}
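
//! fp16 variants are only compiled when the toolchain provides native
//! half-precision vector arithmetic.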
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
void checker_conv_bias_fp16(
        std::vector<conv_bias::TestArg> args, Handle* handle, const char* algo_name,
        float epsilon) {
    using namespace conv_bias;
    Checker<ConvBias> checker(handle);
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name));
    checker.set_epsilon(epsilon);
    checker.set_dtype(0, dtype::Float16())
            .set_dtype(1, dtype::Float16())
            .set_dtype(2, dtype::Float16())
            .set_dtype(4, dtype::Float16());
    NormalRNG rng(1.f);
    checker.set_rng(0, &rng).set_rng(1, &rng);
    for (auto&& arg : args) {
        checker.set_param(arg.param).execs({arg.src, arg.filter, arg.bias, {}, {}});
    }
}

TEST_F(AARCH64_MULTI_THREADS, CONVBIAS_DIRECT_FP16_STR2) {
    NormalRNG rng(1);
    checker_conv_bias_f16(
            conv_bias::get_conv_bias_args({2, 3, 5}, 2, false, false, false), handle(),
            rng, "ARMV8F16STRD2", 0.04);
}
#endif
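
//! Benchmark helpers: sweep a set of layer shapes and compare the fp32
//! algorithm against a second algorithm (fp16 when available).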
#if MEGDNN_WITH_BENCHMARK
std::vector<conv_bias::TestArg> get_conv_bias_benchmaker_args(
        std::vector<size_t> kernel, size_t stride) {
    using namespace conv_bias;
    using Param = param::ConvBias;
    using NLMode = param::ConvBias::NonlineMode;
    std::vector<TestArg> args;
    auto pack = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
                    size_t stride, NLMode nonline_mode) {
        Param param;
        param.stride_h = stride;
        param.stride_w = stride;
        param.pad_h = kernel == 1 ? 0 : kernel / 2;
        param.pad_w = kernel == 1 ? 0 : kernel / 2;
        param.nonlineMode = nonline_mode;
        //! no bias
        args.emplace_back(
                param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
                TensorShape{});
        //! bias broadcast channel
        args.emplace_back(
                param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
                TensorShape{1, oc, 1, 1});
        //! bias
        args.emplace_back(
                param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
                TensorShape{
                        1, oc, (h + 2 * param.pad_h - kernel) / stride + 1,
                        (w + 2 * param.pad_w - kernel) / stride + 1});
    };

    for (auto nlmode : {NLMode::IDENTITY, NLMode::RELU, NLMode::SIGMOID}) {
        for (size_t k : kernel) {
            for (size_t ic : {3, 6, 12, 24}) {
                for (size_t oc : {3, 6, 12, 24}) {
                    for (size_t size : {4, 7, 8, 14, 16, 17, 28, 32, 34, 64, 112}) {
                        pack(oc, ic, size, size, k, stride, nlmode);
                    }
                }
            }
        }
    }
    return args;
}
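
//! Time \p algo_name in fp32 and \p cmp_algo_name in fp16 (when available)
//! on each case and report throughput in Gflop/s.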
void benchmarker_conv_bias(
        std::vector<conv_bias::TestArg> args, Handle* handle, const char* algo_name,
        const char* cmp_algo_name) {
    using namespace conv_bias;
    constexpr size_t N = 10;
    Benchmarker<ConvBias> benchmark_float(handle);
    benchmark_float
            .set_before_exec_callback(
                    conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name))
            .set_times(N)
            .set_display(false);
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    Benchmarker<ConvBias> benchmark_float16(handle);
    benchmark_float16
            .set_before_exec_callback(
                    conv_bias::ConvBiasAlgoChecker<ConvBias>(cmp_algo_name))
            .set_times(N)
            .set_dtype(0, dtype::Float16())
            .set_dtype(1, dtype::Float16())
            .set_dtype(2, dtype::Float16())
            .set_dtype(4, dtype::Float16())
            .set_display(false);
#endif
    for (auto&& arg : args) {
        TensorLayout dst_layout;
        auto opr = handle->create_operator<ConvBias>();
        opr->param() = arg.param;
        opr->deduce_layout(
                {arg.src, dtype::Float32()}, {arg.filter, dtype::Float32()},
                {arg.bias, dtype::Float32()}, {}, dst_layout);
        //! GFLOPs, scaled by 1e3 so that dividing by a time in ms yields Gflop/s
        float computations = dst_layout.total_nr_elems() * arg.filter[1] *
                             arg.filter[2] * arg.filter[3] * 2.0 /
                             (1024 * 1024 * 1024) * 1e3;
        printf("filter n: %zu c: %zu h:%zu w:%zu ", arg.filter[0], arg.filter[1],
               arg.filter[2], arg.filter[3]);
        printf("input c: %zu h:%zu w:%zu \n", arg.src[1], arg.src[2], arg.src[3]);
        auto time32 = benchmark_float.set_param(arg.param).execs(
                              {arg.src, arg.filter, arg.bias, {}, {}}) /
                      N;
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        auto time16 = benchmark_float16.set_param(arg.param).execs(
                              {arg.src, arg.filter, arg.bias, {}, {}}) /
                      N;
        printf("---------------------------------fp32 flops: %.3f Gflops fp16 "
               "flops %.3f Gflops speedup: %f\n",
               computations / time32, computations / time16, time32 / time16);
#else
        printf("---------------------------------fp32 flops: %.3f Gflops\n",
               computations / time32);
#endif
    }
}
TEST_F(AARCH64, BENCHMARK_CONVBIAS_STRIDE2_FP32_FP16) {
    benchmarker_conv_bias(
            get_conv_bias_benchmaker_args({2, 3, 5, 7}, 2), handle(), "ARMV8F32STRD2",
            "ARMV8F16STRD2");
}
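
//! Compare quantized int8 ConvBias against fp32 over a sweep of shapes.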
TEST_F(AARCH64, BENCHMARK_CONVBIAS) {
    constexpr size_t RUNS = 10;
    param::ConvBias param;
    param.stride_h = 1;
    param.stride_w = 1;
    Benchmarker<ConvBias> benchmarker_int(handle());
    benchmarker_int.set_times(RUNS)
            .set_dtype(0, dtype::QuantizedS8(2.5f))
            .set_dtype(1, dtype::QuantizedS8(2.5f))
            .set_dtype(2, dtype::QuantizedS32(6.25f))
            .set_dtype(4, dtype::QuantizedS8(40.25f))
            .set_display(false);
    Benchmarker<ConvBias> benchmarker_float(handle());
    benchmarker_float.set_display(false).set_times(RUNS);

    auto run = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS) {
        TensorShape src({N, IC, H, W}), filter({OC, IC, FS, FS}), bias({N, OC, H, W}),
                dst({N, OC, H, W});
        param.pad_h = FS / 2;
        param.pad_w = FS / 2;
        auto int_used =
                benchmarker_int.set_param(param).exec({src, filter, bias, {}, dst}) /
                RUNS;
        auto float_used =
                benchmarker_float.set_param(param).exec({src, filter, bias, {}, dst}) /
                RUNS;
        //! MFLOPs, so that dividing by a time in ms yields Gflop/s
        float computations = IC * (FS * FS + 1) * dst.total_nr_elems() * 2 * 1e-6;
        printf("run: %s %s %s->%s \nfloat: %f ms %f Gflops int: %f ms "
               "%f Gflops speedup: %f\n",
               src.to_string().c_str(), filter.to_string().c_str(),
               bias.to_string().c_str(), dst.to_string().c_str(), float_used,
               computations / float_used, int_used, computations / int_used,
               float_used / int_used);
    };
    run(1, 128, 128, 32, 32, 3);
    for (size_t IC : {1, 4, 8, 16, 32, 64}) {
        for (size_t OC : {1, 4, 8, 16, 32, 64}) {
            for (size_t size : {7, 14, 28, 56}) {
                for (size_t FS : {1, 3, 5}) {
                    run(1, IC, OC, size, size, FS);
                }
            }
        }
    }
}
#endif

}  // namespace test
}  // namespace megdnn

// vim: syntax=cpp.doxygen