You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-'), and can be up to 35 characters long.

convolution.cpp 18 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486
  1. /**
  2. * \file dnn/test/x86/convolution.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
  10. * implied.
  11. */
  12. #include "test/x86/fixture.h"
  13. #include "megdnn/opr_param_defs.h"
  14. #include "megdnn/oprs.h"
  15. #include "test/common/accuracy_shake_checker.h"
  16. #include "test/common/benchmarker.h"
  17. #include "test/common/checker.h"
  18. #include "test/common/convolution.h"
  19. #include "test/common/rng.h"
  20. #include "test/common/task_record_check.h"
  21. #include "test/common/tensor.h"
  22. #include "test/common/workspace_wrapper.h"
  23. namespace {
  24. #if MEGDNN_X86_WITH_MKL_DNN
// Parameter bundle describing one convolution test case:
// batch size, filter size (square, fh x fh), stride, padding,
// input channels, input height/width, output channels, group count.
struct ConvArg {
    size_t batch_size, fh, sh, ph, ic, ih, iw, oc, groups;
};
  28. std::vector<ConvArg> get_dense_conv_args() {
  29. std::vector<ConvArg> args;
  30. for (size_t batch_size : {1}) {
  31. for (size_t fh : {3, 5, 7}) {
  32. for (size_t sh : {1, 2}) {
  33. for (size_t ph : std::vector<size_t>{0, fh / 2}) {
  34. for (size_t oc : {3, 4}) {
  35. args.emplace_back(
  36. ConvArg{batch_size, fh, sh, ph, 2, 7, 15, oc, 1});
  37. args.emplace_back(
  38. ConvArg{batch_size, fh, sh, ph, 2, 7, 14, oc, 1});
  39. args.emplace_back(
  40. ConvArg{batch_size, fh, sh, ph, 2, 7, 13, oc, 1});
  41. args.emplace_back(
  42. ConvArg{batch_size, fh, sh, ph, 2, 7, 12, oc, 1});
  43. args.emplace_back(
  44. ConvArg{batch_size, fh, sh, ph, 2, 7, 11, oc, 1});
  45. args.emplace_back(
  46. ConvArg{batch_size, fh, sh, ph, 2, 7, 10, oc, 1});
  47. args.emplace_back(
  48. ConvArg{batch_size, fh, sh, ph, 2, 7, 9, oc, 1});
  49. args.emplace_back(
  50. ConvArg{batch_size, fh, sh, ph, 2, 7, 8, oc, 1});
  51. args.emplace_back(
  52. ConvArg{batch_size, fh, sh, ph, 4, 7, 8, oc, 1});
  53. } // end oc
  54. } // end ph
  55. } // end sh
  56. } // end fh
  57. } // end batch_size
  58. return args;
  59. }
  60. std::vector<ConvArg> get_group_conv_args() {
  61. std::vector<ConvArg> args;
  62. for (size_t batch_size : {1}) {
  63. for (size_t fh : {3, 5, 7}) {
  64. for (size_t sh : {1, 2}) {
  65. for (size_t ph : std::vector<size_t>{0, fh / 2}) {
  66. for (size_t oc : {3}) {
  67. args.emplace_back(
  68. ConvArg{batch_size, fh, sh, ph, 2, 7, 15, oc, 2});
  69. args.emplace_back(
  70. ConvArg{batch_size, fh, sh, ph, 2, 7, 14, oc, 2});
  71. args.emplace_back(
  72. ConvArg{batch_size, fh, sh, ph, 2, 7, 13, oc, 2});
  73. args.emplace_back(
  74. ConvArg{batch_size, fh, sh, ph, 2, 7, 12, oc, 2});
  75. args.emplace_back(
  76. ConvArg{batch_size, fh, sh, ph, 2, 7, 11, oc, 2});
  77. args.emplace_back(
  78. ConvArg{batch_size, fh, sh, ph, 2, 7, 10, oc, 2});
  79. args.emplace_back(
  80. ConvArg{batch_size, fh, sh, ph, 2, 7, 9, oc, 2});
  81. args.emplace_back(
  82. ConvArg{batch_size, fh, sh, ph, 2, 7, 8, oc, 2});
  83. } // end oc
  84. } // end ph
  85. } // end sh
  86. } // end fh
  87. } // end batch_size
  88. args.emplace_back(ConvArg{2, 1, 1, 0, 6, 18, 18, 9, 3});
  89. return args;
  90. }
  91. #endif
  92. } // namespace
  93. namespace megdnn {
  94. namespace test {
  95. TEST_F(X86, DEFAULT_CONV_DIRECT_STRIDE1) {
  96. using namespace convolution;
  97. std::vector<TestArg> args;
  98. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, size_t p) {
  99. if (w + 2 * p < kernel || h + 2 * p < kernel)
  100. return;
  101. param::Convolution param;
  102. param.stride_h = 1;
  103. param.stride_w = 1;
  104. param.pad_h = p;
  105. param.pad_w = p;
  106. args.emplace_back(
  107. param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel});
  108. };
  109. for (size_t kernel : {1, 2, 3, 4, 5, 6, 7})
  110. for (size_t ic : {1, 4, 8, 16})
  111. for (size_t oc : {1, 4, 8})
  112. for (size_t p : {0, 2})
  113. for (size_t size : {20, 21, 24})
  114. run(oc, ic, size, size, kernel, p);
  115. Checker<ConvolutionForward> checker(handle());
  116. checker.set_before_exec_callback(AlgoChecker<ConvolutionForward>(
  117. "CONVOLUTION_DEFAULT_X86_CONV_BIAS_DIRECT_STRIDE1_LARGE_GROUP"));
  118. checker.set_epsilon(1);
  119. UniformIntRNG rng{-50, 50};
  120. checker.set_dtype(0, dtype::Float32())
  121. .set_dtype(1, dtype::Float32())
  122. .set_dtype(2, dtype::Float32())
  123. .set_rng(0, &rng)
  124. .set_rng(1, &rng)
  125. .set_rng(2, &rng);
  126. for (auto&& arg : args) {
  127. checker.set_param(arg.param).exec({arg.src, arg.filter, {}});
  128. }
  129. }
  130. TEST_F(X86, DEFAULT_CONV_DIRECT_STRIDE1_RECORD) {
  131. using namespace convolution;
  132. std::vector<TestArg> args;
  133. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, size_t p) {
  134. if (w + 2 * p < kernel || h + 2 * p < kernel)
  135. return;
  136. param::Convolution param;
  137. param.stride_h = 1;
  138. param.stride_w = 1;
  139. param.pad_h = p;
  140. param.pad_w = p;
  141. args.emplace_back(
  142. param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel});
  143. };
  144. run(1, 1, 20, 20, 3, 2);
  145. TaskRecordChecker<ConvolutionForward> checker(0);
  146. checker.set_epsilon(1);
  147. UniformIntRNG rng{-50, 50};
  148. checker.set_dtype(0, dtype::Float32())
  149. .set_dtype(1, dtype::Float32())
  150. .set_dtype(2, dtype::Float32())
  151. .set_rng(0, &rng)
  152. .set_rng(1, &rng)
  153. .set_rng(2, &rng);
  154. for (auto&& arg : args) {
  155. checker.set_param(arg.param).exec({arg.src, arg.filter, {}});
  156. }
  157. }
  158. TEST_F(X86, DEFAULT_CONV_DIRECT_STRIDE2) {
  159. using namespace convolution;
  160. std::vector<TestArg> args;
  161. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, size_t p) {
  162. if (w + 2 * p < kernel || h + 2 * p < kernel)
  163. return;
  164. param::Convolution param;
  165. param.stride_h = 2;
  166. param.stride_w = 2;
  167. param.pad_h = p;
  168. param.pad_w = p;
  169. args.emplace_back(
  170. param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel});
  171. };
  172. for (size_t kernel : {2, 3, 5, 7})
  173. for (size_t ic : {1, 4, 8, 16})
  174. for (size_t oc : {1, 4, 8})
  175. for (size_t p : {0, 2})
  176. for (size_t size : {20, 21, 24})
  177. run(oc, ic, size, size, kernel, p);
  178. Checker<ConvolutionForward> checker(handle());
  179. checker.set_before_exec_callback(AlgoChecker<ConvolutionForward>(
  180. "CONVOLUTION_DEFAULT_X86_CONV_BIAS_DIRECT_STRIDE2_LARGE_GROUP"));
  181. checker.set_epsilon(1);
  182. UniformIntRNG rng{-50, 50};
  183. checker.set_dtype(0, dtype::Float32())
  184. .set_dtype(1, dtype::Float32())
  185. .set_dtype(2, dtype::Float32())
  186. .set_rng(0, &rng)
  187. .set_rng(1, &rng)
  188. .set_rng(2, &rng);
  189. for (auto&& arg : args) {
  190. checker.set_param(arg.param).exec({arg.src, arg.filter, {}});
  191. }
  192. }
  193. #if MEGDNN_X86_WITH_MKL_DNN
// Correctness check for the MKL-DNN int8 convolution (int8 src/filter,
// int32 dst), first with grouped layouts and then with dense ones.
TEST_F(X86, CONVOLUTION_FORWARD_INT8) {
    Checker<ConvolutionForward> checker(handle());
    // pin the algorithm under test to the MKL-DNN int8 kernel
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionForward>("CONVOLUTION_DEFAULT_MKLDNN_INT8"));
    param::Convolution param;
    param.sparse = param::Convolution::Sparse::GROUP;
    UniformIntRNG rng{-128, 127};  // full int8 value range
    std::vector<ConvArg> args = get_group_conv_args();
    for (auto&& arg : args) {
        param.stride_h = param.stride_w = arg.sh;
        param.pad_h = param.pad_w = arg.ph;
        checker.set_dtype(0, dtype::Int8())
                .set_dtype(1, dtype::Int8())
                .set_dtype(2, dtype::Int32())
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_param(param)
                // grouped filter layout: {groups, oc_per_group, ic_per_group, fh, fw}
                .execs({{arg.batch_size, arg.ic * arg.groups, arg.ih, arg.iw},
                        {arg.groups, arg.oc, arg.ic, arg.fh, arg.fh},
                        {}});
    }
    // repeat with dense (non-grouped) cases
    args = get_dense_conv_args();
    param.sparse = param::Convolution::Sparse::DENSE;
    for (auto&& arg : args) {
        param.stride_h = param.stride_w = arg.sh;
        param.pad_h = param.pad_w = arg.ph;
        checker.set_dtype(0, dtype::Int8())
                .set_dtype(1, dtype::Int8())
                .set_dtype(2, dtype::Int32())
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_param(param)
                .execs({{arg.batch_size, arg.ic, arg.ih, arg.iw},
                        {arg.oc, arg.ic, arg.fh, arg.fh},
                        {}});
    }
}
  231. TEST_F(X86, CONVOLUTION_FORWARD_MATMUL_INT8) {
  232. std::vector<ConvArg> args = get_dense_conv_args();
  233. Checker<ConvolutionForward> checker(handle());
  234. checker.set_before_exec_callback(
  235. AlgoChecker<ConvolutionForward>("CONVOLUTION_DEFAULT_MKLDNN_MATMUL_INT8"));
  236. param::Convolution param;
  237. param.sparse = param::Convolution::Sparse::DENSE;
  238. UniformIntRNG rng{-128, 127};
  239. for (auto&& arg : args) {
  240. param.stride_h = param.stride_w = arg.sh;
  241. param.pad_h = param.pad_w = arg.ph;
  242. checker.set_dtype(0, dtype::Int8())
  243. .set_dtype(1, dtype::Int8())
  244. .set_dtype(2, dtype::Int32())
  245. .set_rng(0, &rng)
  246. .set_rng(1, &rng)
  247. .set_param(param)
  248. .execs({{arg.batch_size, arg.ic, arg.ih, arg.iw},
  249. {arg.oc, arg.ic, arg.fh, arg.fh},
  250. {}});
  251. }
  252. }
/**
 * Run a single fp32 NCHW88 MKL-DNN convolution correctness case on
 * \p checker. Channel/group combinations that the NCHW88 blocked format
 * cannot express are silently skipped.
 */
static void x86_correctness_fp32_mkldnn_run(
        Checker<Convolution>& checker, UniformIntRNG& rng, Handle* handle, size_t n,
        size_t stride, size_t kernel, size_t oc, size_t ic, size_t h, size_t w,
        size_t group) {
    auto oc_per_group = oc / group;
    auto ic_per_group = ic / group;
    // supported grouped config: oc per group a multiple of 8, ic per group
    // a multiple of 8 or exactly 3 (first-layer special case)
    bool ok_group = oc_per_group % 8 == 0 && oc_per_group > 0 &&
                    (ic_per_group % 8 == 0 || ic_per_group == 3) && ic_per_group > 0;
    // supported depthwise config: one channel per group
    bool ok_depthwise = oc == ic && oc == group;
    if (!(ok_group || ok_depthwise)) {
        return;
    }
    size_t pad = kernel / 2;  // "same"-style padding
    size_t kernel_h = kernel;
    size_t kernel_w = kernel;
    param::Convolution param;
    param.format = param::Convolution::Format::NCHW88;
    param.stride_h = stride;
    param.stride_w = stride;
    param.pad_h = pad;
    param.pad_w = pad;
    // blocked source layout {n, ic/8, h, w, 8} ...
    auto src_tensor_shape = TensorShape{n, ic / 8, h, w, 8};
    if (ic == 3) {
        // ... except ic == 3, which uses plain NCHW (first-layer case)
        src_tensor_shape = TensorShape{n, ic, h, w};
    }
    // dense blocked filter {oc/8, ic/8, fh, fw, 8, 8} ...
    auto weight_tensor_shape = TensorShape{oc / 8, ic / 8, kernel_h, kernel_w, 8, 8};
    if (ic == 3) {
        // ... or the hybrid layout {oc/8, fh, fw, ic, 8} when ic == 3
        weight_tensor_shape = TensorShape{oc / 8, kernel_h, kernel_w, ic, 8};
    }
    if (group == 1) {
        param.sparse = param::Convolution::Sparse::DENSE;
    } else if (group > 1 && ic / group == 1 && oc / group == 1) {
        // depthwise: blocked filter layout {group/8, 1, 1, fh, fw, 8}
        param.sparse = param::Convolution::Sparse::GROUP;
        weight_tensor_shape = TensorShape{group / 8, 1, 1, kernel_h, kernel_w, 8};
    } else if (
            group > 1 && oc / group % 8 == 0 && oc / group > 0 && ic / group % 8 == 0 &&
            ic / group > 0) {
        // general grouped: filter layout {g, oc/g/8, ic/g/8, fh, fw, 8, 8}
        param.sparse = param::Convolution::Sparse::GROUP;
        weight_tensor_shape = TensorShape{
                group, oc / group / 8, ic / group / 8, kernel_h, kernel_w, 8, 8};
    }
    checker.set_dtype(0, dtype::Float32())
            .set_dtype(1, dtype::Float32())
            .set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_epsilon(1e-3)
            .set_param(param)
            .execs({src_tensor_shape, weight_tensor_shape, {}});
}
// Sweep batch/stride/kernel/channel/spatial/group combinations and run the
// MKL-DNN fp32 NCHW88 convolution correctness check on each supported one.
static void x86_correctness_fp32_mkldnn(Handle* handle) {
    Checker<Convolution> checker(handle);
    UniformIntRNG rng{-127, 127};
    // pin the algorithm under test
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionForward>("CONVOLUTION_DEFAULT_MKLDNN_CONV_FP32"));
    for (size_t n : {1, 2})
        for (size_t stride : {1, 2})
            for (size_t kernel : {3, 5, 7})
                for (size_t oc : {8, 16})
                    for (size_t ic : {3, 8, 16})
                        for (size_t h : {22, 33})
                            for (size_t w : {22, 33}) {
                                for (size_t group = 1; group <= std::min(oc, ic);
                                     ++group) {
                                    // unsupported combinations are skipped
                                    // inside the helper
                                    x86_correctness_fp32_mkldnn_run(
                                            checker, rng, handle, n, stride, kernel, oc,
                                            ic, h, w, group);
                                }
                            }
}
// Entry point for the fp32 NCHW88 MKL-DNN convolution sweep above.
TEST_F(X86, CONVOLUTION_DIRECT_MKLDNN_C8) {
    x86_correctness_fp32_mkldnn(handle());
}
  325. #endif
  326. #if MEGDNN_WITH_BENCHMARK
// Benchmark int8x8x16 convolution throughput (dense cases plus one
// grouped case), printing time and Gflops per case.
TEST_F(X86, BENCHMARK_CONVOLUTION_I8x8x16) {
    using namespace convolution;
    using Param = param::Convolution;
    std::vector<TestArg> args;
    // Queue one benchmark case; group > 1 switches to grouped sparse mode
    // with the 5-D {g, oc/g, ic/g, fh, fw} filter layout.
    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
                   size_t stride, size_t group = 1) {
        Param param;
        param.stride_h = stride;
        param.stride_w = stride;
        param.pad_h = kernel / 2;
        param.pad_w = kernel / 2;
        if (group > 1) {
            param.sparse = param::Convolution::Sparse::GROUP;
            args.emplace_back(
                    param, TensorShape{1, ic, h, w},
                    TensorShape{group, oc / group, ic / group, kernel, kernel});
        } else {
            param.sparse = param::Convolution::Sparse::DENSE;
            args.emplace_back(
                    param, TensorShape{1, ic, h, w},
                    TensorShape{oc, ic, kernel, kernel});
        }
    };
    run(48, 96, 15, 15, 1, 1);
    run(64, 64, 60, 60, 3, 1);
    run(64, 64, 60, 60, 3, 1, 64);  // grouped case, 64 groups
    constexpr size_t RUN = 30;  // iterations per measurement
    Benchmarker<Convolution> benchmark(handle());
    benchmark.set_dtype(0, dtype::Int8())
            .set_dtype(1, dtype::Int8())
            .set_dtype(2, dtype::Int16());
    benchmark.set_before_exec_callback(AlgoChecker<Convolution>(".*"));
    benchmark.set_display(false);
    benchmark.set_times(RUN);
    for (auto&& arg : args) {
        TensorLayout dst_layout;
        auto opr = handle()->create_operator<Convolution>();
        opr->param() = arg.param;
        opr->deduce_layout(
                {arg.src, dtype::Float32()}, {arg.filter, dtype::Float32()},
                dst_layout);
        //! dst.nr_elems * IC * FH * FW * 2
        // dense filters are 4-D, grouped filters are 5-D — index accordingly
        float icpg = arg.filter.ndim == 4 ? arg.filter[1] : arg.filter[2];
        float filter = arg.filter.ndim == 4 ? arg.filter[2] : arg.filter[3];
        // scaled so that dividing by milliseconds yields Gflops
        float computations = dst_layout.total_nr_elems() * icpg * filter * filter *
                             2.0 / (1024 * 1024 * 1024) * 1e3;
        auto used_int =
                benchmark.set_param(arg.param).exec({arg.src, arg.filter, {}}) / RUN;
        printf("%s %s: int: %f ms %f Gflops \n", arg.src.to_string().c_str(),
               arg.filter.to_string().c_str(), used_int, computations / used_int);
    }
}
  379. #if MEGDNN_X86_WITH_MKL_DNN
// Benchmark the int8x8x32 convolution path against the default float path
// over a grid of kernel sizes, channel counts and spatial sizes, printing
// time, Gflops and int-vs-float speedup per case.
TEST_F(X86, BENCHMARK_CONVOLUTION_I8x8x32_MKLDNN) {
    using namespace convolution;
    using Param = param::Convolution;
    std::vector<TestArg> args;
    // Queue one dense benchmark case with "same"-style padding.
    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
                   size_t stride) {
        Param param;
        param.stride_h = stride;
        param.stride_w = stride;
        param.pad_h = kernel / 2;
        param.pad_w = kernel / 2;
        args.emplace_back(
                param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel});
    };
    for (size_t kernel : {2, 3, 5, 7}) {
        for (size_t ic : {1, 8, 16, 32, 64}) {
            for (size_t oc : {1, 8, 16, 32, 64}) {
                run(oc, ic, 56, 56, kernel, 1);
                run(oc, ic, 128, 128, kernel, 1);
                run(oc, ic, 256, 256, kernel, 1);
            }
        }
    }
    constexpr size_t RUN = 50;  // iterations per measurement
    Benchmarker<Convolution> benchmark(handle());
    benchmark.set_dtype(0, dtype::Int8())
            .set_dtype(1, dtype::Int8())
            .set_dtype(2, dtype::Int32());
    benchmark.set_display(false);
    benchmark.set_times(RUN);
    // float baseline keeps the default dtypes and algorithm selection
    Benchmarker<Convolution> benchmark_float(handle());
    benchmark_float.set_display(false);
    benchmark_float.set_times(RUN);
    for (auto&& arg : args) {
        TensorLayout dst_layout;
        auto opr = handle()->create_operator<Convolution>();
        opr->param() = arg.param;
        opr->deduce_layout(
                {arg.src, dtype::Float32()}, {arg.filter, dtype::Float32()},
                dst_layout);
        //! dst.nr_elems * IC * FH * FW * 2
        // all cases here are dense, so the filter is {oc, ic, fh, fw};
        // scaled so that dividing by milliseconds yields Gflops
        float computations = dst_layout.total_nr_elems() * arg.filter[1] *
                             arg.filter[2] * arg.filter[3] * 2.0 /
                             (1024 * 1024 * 1024) * 1e3;
        auto used_int =
                benchmark.set_param(arg.param).exec({arg.src, arg.filter, {}}) / RUN;
        auto used_float =
                benchmark_float.set_param(arg.param).exec({arg.src, arg.filter, {}}) /
                RUN;
        printf("%s %s: int: %f ms %f Gflops float: %f ms %f GFlops speedup: "
               "%f\n",
               arg.src.to_string().c_str(), arg.filter.to_string().c_str(), used_int,
               computations / used_int, used_float, computations / used_float,
               used_float / used_int);
    }
}
  436. #endif
  437. #endif
  438. } // namespace test
  439. } // namespace megdnn
  440. // vim: syntax=cpp.doxygen