
conv_bias.cpp 11 kB

/**
 * \file dnn/test/aarch64/conv_bias.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#include "test/aarch64/fixture.h"

#include "src/fallback/conv_bias/common.h"
#include "test/common/benchmarker.h"
#include "test/common/checker.h"
#include "test/common/conv_bias.h"
#include "test/common/rng.h"
#include "test/common/tensor.h"

namespace megdnn {
namespace test {
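//! Generate ConvBias test cases for the given kernel sizes and stride:
//! every case is emitted with no bias, a channel-broadcast bias and a
//! full-tensor bias, under IDENTITY, RELU and SIGMOID nonlinearities.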
std::vector<conv_bias::TestArg> get_conv_bias_args(std::vector<size_t> kernel,
                                                   size_t stride) {
    using namespace conv_bias;
    using Param = param::ConvBias;
    using NLMode = param::ConvBias::NonlineMode;

    std::vector<TestArg> args;
    auto pack = [&](size_t n, size_t oc, size_t ic, size_t w, size_t h,
                    size_t kernel, size_t stride, NLMode nonline_mode) {
        Param param;
        param.stride_h = stride;
        param.stride_w = stride;
        param.pad_h = kernel == 1 ? 0 : kernel / 2;
        param.pad_w = kernel == 1 ? 0 : kernel / 2;
        param.nonlineMode = nonline_mode;
        //! no bias
        args.emplace_back(param, TensorShape{n, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel}, TensorShape{});
        //! bias broadcast on channel
        args.emplace_back(param, TensorShape{n, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel},
                          TensorShape{1, oc, 1, 1});
        //! full-tensor bias
        args.emplace_back(
                param, TensorShape{n, ic, h, w},
                TensorShape{oc, ic, kernel, kernel},
                TensorShape{n, oc, (h + 2 * param.pad_h - kernel) / stride + 1,
                            (w + 2 * param.pad_w - kernel) / stride + 1});
    };

    for (auto nlmode : {NLMode::IDENTITY, NLMode::RELU, NLMode::SIGMOID}) {
        for (size_t n : {1, 2}) {
            for (size_t ic : {1, 2, 3, 4, 8}) {
                for (size_t oc : {1, 2, 3, 4, 8}) {
                    for (size_t size : {1, 2, 3, 4, 8, 24}) {
                        for (size_t k : kernel) {
                            pack(n, oc, ic, size + 24, size + 24, k, stride,
                                 nlmode);
                        }
                    }
                }
            }
        }
    }
    return args;
}
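//! Run every test case through Checker, restricting execution to the
//! algorithm named algo_name so its output is verified against the
//! reference implementation.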
void checker_conv_bias(std::vector<conv_bias::TestArg> args, Handle* handle,
                       const char* algo_name) {
    using namespace conv_bias;
    Checker<ConvBias> checker(handle);
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name));
    for (auto&& arg : args) {
        checker.set_param(arg.param).execs(
                {arg.src, arg.filter, arg.bias, {}, {}});
    }
}
TEST_F(AARCH64_MULTI_THREADS, CONVBIAS_DIRECT_FP32_STR2) {
    check_conv_bias(
            conv_bias::get_conv_bias_args({2, 3, 5, 7}, 2, false, false, false),
            handle(), "ARMV8F32STRD2");
}
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
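//! fp16 variant of checker_conv_bias: same algorithm restriction, but with
//! a caller-supplied epsilon since fp16 accumulation is less accurate.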
void checker_conv_bias_fp16(std::vector<conv_bias::TestArg> args,
                            Handle* handle, const char* algo_name,
                            float epsilon) {
    using namespace conv_bias;
    Checker<ConvBias> checker(handle);
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name));
    checker.set_epsilon(epsilon);
    checker.set_dtype(0, dtype::Float16())
            .set_dtype(1, dtype::Float16())
            .set_dtype(2, dtype::Float16())
            .set_dtype(4, dtype::Float16());
    NormalRNG rng(1.f);
    checker.set_rng(0, &rng).set_rng(1, &rng);
    for (auto&& arg : args) {
        checker.set_param(arg.param).execs(
                {arg.src, arg.filter, arg.bias, {}, {}});
    }
}
TEST_F(AARCH64_MULTI_THREADS, CONVBIAS_DIRECT_FP16_STR2) {
    checker_conv_bias_fp16(
            conv_bias::get_conv_bias_args({2, 3, 5}, 2, false, false, false),
            handle(), "ARMV8F16STRD2", 0.04);
}
#endif
#if MEGDNN_WITH_BENCHMARK
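//! Benchmark-only case generator: batch size fixed to 1 and larger spatial
//! sizes, with the same three bias modes and nonlinearities as above.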
std::vector<conv_bias::TestArg> get_conv_bias_benchmaker_args(
        std::vector<size_t> kernel, size_t stride) {
    using namespace conv_bias;
    using Param = param::ConvBias;
    using NLMode = param::ConvBias::NonlineMode;

    std::vector<TestArg> args;
    auto pack = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
                    size_t stride, NLMode nonline_mode) {
        Param param;
        param.stride_h = stride;
        param.stride_w = stride;
        param.pad_h = kernel == 1 ? 0 : kernel / 2;
        param.pad_w = kernel == 1 ? 0 : kernel / 2;
        param.nonlineMode = nonline_mode;
        //! no bias
        args.emplace_back(param, TensorShape{1, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel}, TensorShape{});
        //! bias broadcast on channel
        args.emplace_back(param, TensorShape{1, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel},
                          TensorShape{1, oc, 1, 1});
        //! full-tensor bias
        args.emplace_back(
                param, TensorShape{1, ic, h, w},
                TensorShape{oc, ic, kernel, kernel},
                TensorShape{1, oc, (h + 2 * param.pad_h - kernel) / stride + 1,
                            (w + 2 * param.pad_w - kernel) / stride + 1});
    };

    for (auto nlmode : {NLMode::IDENTITY, NLMode::RELU, NLMode::SIGMOID}) {
        for (size_t k : kernel) {
            for (size_t ic : {3, 6, 12, 24}) {
                for (size_t oc : {3, 6, 12, 24}) {
                    for (size_t size :
                         {4, 7, 8, 14, 16, 17, 28, 32, 34, 64, 112}) {
                        pack(oc, ic, size, size, k, stride, nlmode);
                    }
                }
            }
        }
    }
    return args;
}
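//! Benchmark algo_name in fp32 and, when fp16 vector arithmetic is
//! available, cmp_algo_name in fp16 on identical shapes, printing the
//! achieved GFLOPS of each and the fp32-to-fp16 speedup per case.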
void benchmarker_conv_bias(std::vector<conv_bias::TestArg> args, Handle* handle,
                           const char* algo_name, const char* cmp_algo_name) {
    using namespace conv_bias;
    constexpr size_t N = 10;
    Benchmarker<ConvBias> benchmark_float(handle);
    benchmark_float
            .set_before_exec_callback(
                    conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name))
            .set_times(N)
            .set_display(false);
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    Benchmarker<ConvBias> benchmark_float16(handle);
    benchmark_float16
            .set_before_exec_callback(
                    conv_bias::ConvBiasAlgoChecker<ConvBias>(cmp_algo_name))
            .set_times(N)
            .set_dtype(0, dtype::Float16())
            .set_dtype(1, dtype::Float16())
            .set_dtype(2, dtype::Float16())
            .set_dtype(4, dtype::Float16())
            .set_display(false);
#endif
    for (auto&& arg : args) {
        TensorLayout dst_layout;
        auto opr = handle->create_operator<ConvBias>();
        opr->param() = arg.param;
        opr->deduce_layout({arg.src, dtype::Float32()},
                           {arg.filter, dtype::Float32()},
                           {arg.bias, dtype::Float32()}, {}, dst_layout);
        //! 2 ops (multiply + add) per output element per filter tap, scaled
        //! so that dividing by the measured time in ms yields GFLOPS
        float computations = dst_layout.total_nr_elems() * arg.filter[1] *
                             arg.filter[2] * arg.filter[3] * 2.0 /
                             (1024 * 1024 * 1024) * 1e3;
        printf("filter n: %zu c: %zu h:%zu w:%zu ", arg.filter[0],
               arg.filter[1], arg.filter[2], arg.filter[3]);
        printf("input c: %zu h:%zu w:%zu \n", arg.src[1], arg.src[2],
               arg.src[3]);
        auto time32 = benchmark_float.set_param(arg.param).execs(
                              {arg.src, arg.filter, arg.bias, {}, {}}) /
                      N;
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        auto time16 = benchmark_float16.set_param(arg.param).execs(
                              {arg.src, arg.filter, arg.bias, {}, {}}) /
                      N;
        printf("---------------------------------fp32 flops: %.3f Gflops fp16 "
               "flops %.3f Gflops speedup: %f\n",
               computations / time32, computations / time16, time32 / time16);
#else
        printf("---------------------------------fp32 flops: %.3f Gflops\n",
               computations / time32);
#endif
    }
}
TEST_F(AARCH64, BENCHMARK_CONVBIAS_STRIDE2_FP32_FP16) {
    benchmarker_conv_bias(get_conv_bias_benchmaker_args({2, 3, 5, 7}, 2),
                          handle(), "ARMV8F32STRD2", "ARMV8F16STRD2");
}
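//! Compare quantized int8 convolution against fp32 across a sweep of
//! channel counts, spatial sizes and filter sizes.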
TEST_F(AARCH64, BENCHMARK_CONVBIAS) {
    constexpr size_t RUNS = 10;
    param::ConvBias param;
    param.stride_h = 1;
    param.stride_w = 1;
    Benchmarker<ConvBias> benchmarker_int(handle());
    benchmarker_int.set_times(RUNS)
            .set_dtype(0, dtype::QuantizedS8(2.5f))
            .set_dtype(1, dtype::QuantizedS8(2.5f))
            .set_dtype(2, dtype::QuantizedS32(6.25f))
            .set_dtype(4, dtype::QuantizedS8(40.25f))
            .set_display(false);
    Benchmarker<ConvBias> benchmarker_float(handle());
    benchmarker_float.set_display(false).set_times(RUNS);

    auto run = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W,
                   size_t FS) {
        TensorShape src({N, IC, H, W}), filter({OC, IC, FS, FS}),
                bias({N, OC, H, W}), dst({N, OC, H, W});
        param.pad_h = FS / 2;
        param.pad_w = FS / 2;
        auto int_used = benchmarker_int.set_param(param).exec(
                                {src, filter, bias, {}, dst}) /
                        RUNS;
        auto float_used = benchmarker_float.set_param(param).exec(
                                  {src, filter, bias, {}, dst}) /
                          RUNS;
        //! FS * FS multiply-adds plus one bias add per output element; the
        //! 1e-6 scale makes computations / time_in_ms come out in GFLOPS
        float computations =
                IC * (FS * FS + 1) * dst.total_nr_elems() * 2 * 1e-6;
        printf("run: %s %s %s->%s \nfloat: %f ms %f Gflops int: %f ms "
               "%f Gflops speedup: %f\n",
               src.to_string().c_str(), filter.to_string().c_str(),
               bias.to_string().c_str(), dst.to_string().c_str(), float_used,
               computations / float_used, int_used, computations / int_used,
               float_used / int_used);
    };
    run(1, 128, 128, 32, 32, 3);
    for (size_t IC : {1, 4, 8, 16, 32, 64}) {
        for (size_t OC : {1, 4, 8, 16, 32, 64}) {
            for (size_t size : {7, 14, 28, 56}) {
                for (size_t FS : {1, 3, 5}) {
                    run(1, IC, OC, size, size, FS);
                }
            }
        }
    }
}
#endif
} // namespace test
} // namespace megdnn

// vim: syntax=cpp.doxygen
