
convolution.cpp 19 kB

/**
 * \file dnn/test/x86/convolution.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */
#include "test/x86/fixture.h"

#include "megdnn/opr_param_defs.h"
#include "megdnn/oprs.h"
#include "test/common/benchmarker.h"
#include "test/common/checker.h"
#include "test/common/convolution.h"
#include "test/common/rng.h"
#include "test/common/tensor.h"
#include "test/common/workspace_wrapper.h"

namespace {
#if MEGDNN_X86_WITH_MKL_DNN
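// One MKL-DNN convolution test case: batch size, filter size (fh), stride
// (sh), padding (ph), input channels / height / width (ic, ih, iw), output
// channels (oc) and group count.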
struct ConvArg {
    size_t batch_size, fh, sh, ph, ic, ih, iw, oc, groups;
};

std::vector<ConvArg> get_dense_conv_args() {
    std::vector<ConvArg> args;
    for (size_t batch_size : {1}) {
        for (size_t fh : {3, 5, 7}) {
            for (size_t sh : {1, 2}) {
                for (size_t ph : std::vector<size_t>{0, fh / 2}) {
                    for (size_t oc : {3, 4}) {
                        args.emplace_back(ConvArg{batch_size, fh, sh, ph, 2, 7,
                                                  15, oc, 1});
                        args.emplace_back(ConvArg{batch_size, fh, sh, ph, 2, 7,
                                                  14, oc, 1});
                        args.emplace_back(ConvArg{batch_size, fh, sh, ph, 2, 7,
                                                  13, oc, 1});
                        args.emplace_back(ConvArg{batch_size, fh, sh, ph, 2, 7,
                                                  12, oc, 1});
                        args.emplace_back(ConvArg{batch_size, fh, sh, ph, 2, 7,
                                                  11, oc, 1});
                        args.emplace_back(ConvArg{batch_size, fh, sh, ph, 2, 7,
                                                  10, oc, 1});
                        args.emplace_back(ConvArg{batch_size, fh, sh, ph, 2, 7,
                                                  9, oc, 1});
                        args.emplace_back(ConvArg{batch_size, fh, sh, ph, 2, 7,
                                                  8, oc, 1});
                        args.emplace_back(ConvArg{batch_size, fh, sh, ph, 4, 7,
                                                  8, oc, 1});
                    }  // end oc
                }  // end ph
            }  // end sh
        }  // end fh
    }  // end batch_size
    return args;
}
std::vector<ConvArg> get_group_conv_args() {
    std::vector<ConvArg> args;
    for (size_t batch_size : {1}) {
        for (size_t fh : {3, 5, 7}) {
            for (size_t sh : {1, 2}) {
                for (size_t ph : std::vector<size_t>{0, fh / 2}) {
                    for (size_t oc : {3}) {
                        args.emplace_back(ConvArg{batch_size, fh, sh, ph, 2, 7,
                                                  15, oc, 2});
                        args.emplace_back(ConvArg{batch_size, fh, sh, ph, 2, 7,
                                                  14, oc, 2});
                        args.emplace_back(ConvArg{batch_size, fh, sh, ph, 2, 7,
                                                  13, oc, 2});
                        args.emplace_back(ConvArg{batch_size, fh, sh, ph, 2, 7,
                                                  12, oc, 2});
                        args.emplace_back(ConvArg{batch_size, fh, sh, ph, 2, 7,
                                                  11, oc, 2});
                        args.emplace_back(ConvArg{batch_size, fh, sh, ph, 2, 7,
                                                  10, oc, 2});
                        args.emplace_back(ConvArg{batch_size, fh, sh, ph, 2, 7,
                                                  9, oc, 2});
                        args.emplace_back(ConvArg{batch_size, fh, sh, ph, 2, 7,
                                                  8, oc, 2});
                    }  // end oc
                }  // end ph
            }  // end sh
        }  // end fh
    }  // end batch_size
    args.emplace_back(ConvArg{2, 1, 1, 0, 6, 18, 18, 9, 3});
    return args;
}
#endif
}  // namespace
namespace megdnn {
namespace test {
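// The direct-convolution tests below sweep kernel size, channel counts,
// padding and spatial size; integer inputs in a small range keep the fp32
// accumulation exact, so an epsilon of 1 is safe.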
TEST_F(X86, DEFAULT_CONV_DIRECT_STRIDE1) {
    using namespace convolution;
    std::vector<TestArg> args;
    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
                   size_t p) {
        if (w + 2 * p < kernel || h + 2 * p < kernel)
            return;
        param::Convolution param;
        param.stride_h = 1;
        param.stride_w = 1;
        param.pad_h = p;
        param.pad_w = p;
        args.emplace_back(param, TensorShape{1, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel});
    };
    for (size_t kernel : {1, 2, 3, 4, 5, 6, 7})
        for (size_t ic : {1, 4, 8, 16})
            for (size_t oc : {1, 4, 8})
                for (size_t p : {0, 2})
                    for (size_t size : {20, 21, 24})
                        run(oc, ic, size, size, kernel, p);
    Checker<ConvolutionForward> checker(handle());
    checker.set_before_exec_callback(AlgoChecker<ConvolutionForward>(
            "CONVOLUTION_DEFAULT_X86_CONV_BIAS_DIRECT_STRIDE1_LARGE_GROUP"));
    checker.set_epsilon(1);
    UniformIntRNG rng{-50, 50};
    checker.set_dtype(0, dtype::Float32())
            .set_dtype(1, dtype::Float32())
            .set_dtype(2, dtype::Float32())
            .set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &rng);
    for (auto&& arg : args) {
        checker.set_param(arg.param).exec({arg.src, arg.filter, {}});
    }
}
TEST_F(X86, DEFAULT_CONV_DIRECT_STRIDE2) {
    using namespace convolution;
    std::vector<TestArg> args;
    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
                   size_t p) {
        if (w + 2 * p < kernel || h + 2 * p < kernel)
            return;
        param::Convolution param;
        param.stride_h = 2;
        param.stride_w = 2;
        param.pad_h = p;
        param.pad_w = p;
        args.emplace_back(param, TensorShape{1, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel});
    };
    for (size_t kernel : {2, 3, 5, 7})
        for (size_t ic : {1, 4, 8, 16})
            for (size_t oc : {1, 4, 8})
                for (size_t p : {0, 2})
                    for (size_t size : {20, 21, 24})
                        run(oc, ic, size, size, kernel, p);
    Checker<ConvolutionForward> checker(handle());
    checker.set_before_exec_callback(AlgoChecker<ConvolutionForward>(
            "CONVOLUTION_DEFAULT_X86_CONV_BIAS_DIRECT_STRIDE2_LARGE_GROUP"));
    checker.set_epsilon(1);
    UniformIntRNG rng{-50, 50};
    checker.set_dtype(0, dtype::Float32())
            .set_dtype(1, dtype::Float32())
            .set_dtype(2, dtype::Float32())
            .set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &rng);
    for (auto&& arg : args) {
        checker.set_param(arg.param).exec({arg.src, arg.filter, {}});
    }
}
TEST_F(X86, DEFAULT_CONV_MATMUL) {
    using namespace convolution;
    std::vector<TestArg> args;
    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
                   size_t p) {
        if (w + 2 * p < kernel || h + 2 * p < kernel)
            return;
        param::Convolution param;
        param.stride_h = 1;
        param.stride_w = 1;
        param.pad_h = p;
        param.pad_w = p;
        //! no bias
        args.emplace_back(param, TensorShape{1, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel});
    };
    for (size_t kernel : {2, 3, 5, 7})
        for (size_t ic : {1, 2, 3, 4})
            for (size_t oc : {1, 2, 3, 4})
                for (size_t p : {0, 2})
                    for (size_t size : {20, 21, 22, 23, 24}) {
                        run(oc, ic, size, size, kernel, p);
                    }
    Checker<ConvolutionForward> checker(handle());
    checker.set_before_exec_callback(AlgoChecker<ConvolutionForward>(
            "CONVOLUTION_DEFAULT_X86_CONV_BIAS_MATMUL"));
    UniformIntRNG rng{-50, 50};
    checker.set_dtype(0, dtype::Float32())
            .set_dtype(1, dtype::Float32())
            .set_dtype(2, dtype::Float32())
            .set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &rng);
    for (auto&& arg : args) {
        checker.set_param(arg.param).exec({arg.src, arg.filter, {}});
    }
}
#if MEGDNN_X86_WITH_MKL_DNN
TEST_F(X86, CONVOLUTION_FORWARD_INT8) {
    Checker<ConvolutionForward> checker(handle());
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionForward>("CONVOLUTION_DEFAULT_MKLDNN_INT8"));
    param::Convolution param;
    param.sparse = param::Convolution::Sparse::GROUP;
    UniformIntRNG rng{-128, 127};
    std::vector<ConvArg> args = get_group_conv_args();
    for (auto&& arg : args) {
        param.stride_h = param.stride_w = arg.sh;
        param.pad_h = param.pad_w = arg.ph;
        checker.set_dtype(0, dtype::Int8())
                .set_dtype(1, dtype::Int8())
                .set_dtype(2, dtype::Int32())
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_param(param)
                .execs({{arg.batch_size, arg.ic * arg.groups, arg.ih, arg.iw},
                        {arg.groups, arg.oc, arg.ic, arg.fh, arg.fh},
                        {}});
    }
    args = get_dense_conv_args();
    param.sparse = param::Convolution::Sparse::DENSE;
    for (auto&& arg : args) {
        param.stride_h = param.stride_w = arg.sh;
        param.pad_h = param.pad_w = arg.ph;
        checker.set_dtype(0, dtype::Int8())
                .set_dtype(1, dtype::Int8())
                .set_dtype(2, dtype::Int32())
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_param(param)
                .execs({{arg.batch_size, arg.ic, arg.ih, arg.iw},
                        {arg.oc, arg.ic, arg.fh, arg.fh},
                        {}});
    }
}
TEST_F(X86, CONVOLUTION_FORWARD_MATMUL_INT8) {
    std::vector<ConvArg> args = get_dense_conv_args();
    Checker<ConvolutionForward> checker(handle());
    checker.set_before_exec_callback(AlgoChecker<ConvolutionForward>(
            "CONVOLUTION_DEFAULT_MKLDNN_MATMUL_INT8"));
    param::Convolution param;
    param.sparse = param::Convolution::Sparse::DENSE;
    UniformIntRNG rng{-128, 127};
    for (auto&& arg : args) {
        param.stride_h = param.stride_w = arg.sh;
        param.pad_h = param.pad_w = arg.ph;
        checker.set_dtype(0, dtype::Int8())
                .set_dtype(1, dtype::Int8())
                .set_dtype(2, dtype::Int32())
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_param(param)
                .execs({{arg.batch_size, arg.ic, arg.ih, arg.iw},
                        {arg.oc, arg.ic, arg.fh, arg.fh},
                        {}});
    }
}
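// Correctness check for the MKL-DNN fp32 kernels, which compute in the
// blocked NCHW88 layout (channels packed eight at a time); a plain NCHW
// source is used only for the ic == 3 first-layer case.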
static void x86_correctness_fp32_mkldnn_run(Checker<Convolution>& checker,
                                            UniformIntRNG& rng, Handle* handle,
                                            size_t n, size_t stride,
                                            size_t kernel, size_t oc, size_t ic,
                                            size_t h, size_t w, size_t group) {
    auto oc_per_group = oc / group;
    auto ic_per_group = ic / group;
    bool ok_group = oc_per_group % 8 == 0 && oc_per_group > 0 &&
                    (ic_per_group % 8 == 0 || ic_per_group == 3) &&
                    ic_per_group > 0;
    bool ok_depthwise = oc == ic && oc == group;
    if (!(ok_group || ok_depthwise)) {
        return;
    }
    size_t pad = kernel / 2;
    size_t kernel_h = kernel;
    size_t kernel_w = kernel;
    param::Convolution param;
    param.format = param::Convolution::Format::NCHW88;
    param.stride_h = stride;
    param.stride_w = stride;
    param.pad_h = pad;
    param.pad_w = pad;
    auto src_tensor_shape = TensorShape{n, ic / 8, h, w, 8};
    if (ic == 3) {
        src_tensor_shape = TensorShape{n, ic, h, w};
    }
    auto weight_tensor_shape =
            TensorShape{oc / 8, ic / 8, kernel_h, kernel_w, 8, 8};
    if (ic == 3) {
        weight_tensor_shape = TensorShape{oc / 8, kernel_h, kernel_w, ic, 8};
    }
    if (group == 1) {
        param.sparse = param::Convolution::Sparse::DENSE;
    } else if (group > 1 && ic / group == 1 && oc / group == 1) {
        param.sparse = param::Convolution::Sparse::GROUP;
        weight_tensor_shape =
                TensorShape{group / 8, 1, 1, kernel_h, kernel_w, 8};
    } else if (group > 1 && oc / group % 8 == 0 && oc / group > 0 &&
               ic / group % 8 == 0 && ic / group > 0) {
        param.sparse = param::Convolution::Sparse::GROUP;
        weight_tensor_shape = TensorShape{
                group, oc / group / 8, ic / group / 8, kernel_h, kernel_w, 8,
                8};
    }
    checker.set_dtype(0, dtype::Float32())
            .set_dtype(1, dtype::Float32())
            .set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_epsilon(1e-3)
            .set_param(param)
            .execs({src_tensor_shape, weight_tensor_shape, {}});
}
static void x86_correctness_fp32_mkldnn(Handle* handle) {
    Checker<Convolution> checker(handle);
    UniformIntRNG rng{-127, 127};
    checker.set_before_exec_callback(AlgoChecker<ConvolutionForward>(
            "CONVOLUTION_DEFAULT_MKLDNN_CONV_FP32"));
    for (size_t n : {1, 2})
        for (size_t stride : {1, 2})
            for (size_t kernel : {3, 5, 7})
                for (size_t oc : {8, 16})
                    for (size_t ic : {3, 8, 16})
                        for (size_t h : {22, 33})
                            for (size_t w : {22, 33}) {
                                for (size_t group = 1;
                                     group <= std::min(oc, ic); ++group) {
                                    x86_correctness_fp32_mkldnn_run(
                                            checker, rng, handle, n, stride,
                                            kernel, oc, ic, h, w, group);
                                }
                            }
}

TEST_F(X86, CONVOLUTION_DIRECT_MKLDNN_C8) {
    x86_correctness_fp32_mkldnn(handle());
}
#endif

#if MEGDNN_WITH_BENCHMARK
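// The benchmarks below time each case RUN times and convert the deduced
// output layout into effective Gflops (dst elements * IC_per_group * FH * FW
// * 2 ops); the MKL-DNN int8 benchmark also reports speedup over float.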
TEST_F(X86, BENCHMARK_CONVOLUTION_I8x8x16) {
    using namespace convolution;
    using Param = param::Convolution;
    std::vector<TestArg> args;
    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
                   size_t stride, size_t group = 1) {
        Param param;
        param.stride_h = stride;
        param.stride_w = stride;
        param.pad_h = kernel / 2;
        param.pad_w = kernel / 2;
        if (group > 1) {
            param.sparse = param::Convolution::Sparse::GROUP;
            args.emplace_back(
                    param, TensorShape{1, ic, h, w},
                    TensorShape{group, oc / group, ic / group, kernel, kernel});
        } else {
            param.sparse = param::Convolution::Sparse::DENSE;
            args.emplace_back(param, TensorShape{1, ic, h, w},
                              TensorShape{oc, ic, kernel, kernel});
        }
    };
    run(48, 96, 15, 15, 1, 1);
    run(64, 64, 60, 60, 3, 1);
    run(64, 64, 60, 60, 3, 1, 64);
    constexpr size_t RUN = 30;
    Benchmarker<Convolution> benchmark(handle());
    benchmark.set_dtype(0, dtype::Int8())
            .set_dtype(1, dtype::Int8())
            .set_dtype(2, dtype::Int16());
    benchmark.set_before_exec_callback(AlgoChecker<Convolution>(".*"));
    benchmark.set_display(false);
    benchmark.set_times(RUN);
    for (auto&& arg : args) {
        TensorLayout dst_layout;
        auto opr = handle()->create_operator<Convolution>();
        opr->param() = arg.param;
        opr->deduce_layout({arg.src, dtype::Float32()},
                           {arg.filter, dtype::Float32()}, dst_layout);
        //! dst.nr_elems * IC * FH * FW * 2
        float icpg = arg.filter.ndim == 4 ? arg.filter[1] : arg.filter[2];
        float filter = arg.filter.ndim == 4 ? arg.filter[2] : arg.filter[3];
        float computations = dst_layout.total_nr_elems() * icpg * filter *
                             filter * 2.0 / (1024 * 1024 * 1024) * 1e3;
        auto used_int =
                benchmark.set_param(arg.param).exec({arg.src, arg.filter, {}}) /
                RUN;
        printf("%s %s: int: %f ms %f Gflops \n", arg.src.to_string().c_str(),
               arg.filter.to_string().c_str(), used_int,
               computations / used_int);
    }
}
#if MEGDNN_X86_WITH_MKL_DNN
TEST_F(X86, BENCHMARK_CONVOLUTION_I8x8x32_MKLDNN) {
    using namespace convolution;
    using Param = param::Convolution;
    std::vector<TestArg> args;
    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
                   size_t stride) {
        Param param;
        param.stride_h = stride;
        param.stride_w = stride;
        param.pad_h = kernel / 2;
        param.pad_w = kernel / 2;
        args.emplace_back(param, TensorShape{1, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel});
    };
    for (size_t kernel : {2, 3, 5, 7}) {
        for (size_t ic : {1, 8, 16, 32, 64}) {
            for (size_t oc : {1, 8, 16, 32, 64}) {
                run(oc, ic, 56, 56, kernel, 1);
                run(oc, ic, 128, 128, kernel, 1);
                run(oc, ic, 256, 256, kernel, 1);
            }
        }
    }
    constexpr size_t RUN = 50;
    Benchmarker<Convolution> benchmark(handle());
    benchmark.set_dtype(0, dtype::Int8())
            .set_dtype(1, dtype::Int8())
            .set_dtype(2, dtype::Int32());
    benchmark.set_display(false);
    benchmark.set_times(RUN);
    Benchmarker<Convolution> benchmark_float(handle());
    benchmark_float.set_display(false);
    benchmark_float.set_times(RUN);
    for (auto&& arg : args) {
        TensorLayout dst_layout;
        auto opr = handle()->create_operator<Convolution>();
        opr->param() = arg.param;
        opr->deduce_layout({arg.src, dtype::Float32()},
                           {arg.filter, dtype::Float32()}, dst_layout);
        //! dst.nr_elems * IC * FH * FW * 2
        float computations = dst_layout.total_nr_elems() * arg.filter[1] *
                             arg.filter[2] * arg.filter[3] * 2.0 /
                             (1024 * 1024 * 1024) * 1e3;
        auto used_int =
                benchmark.set_param(arg.param).exec({arg.src, arg.filter, {}}) /
                RUN;
        auto used_float = benchmark_float.set_param(arg.param).exec(
                                  {arg.src, arg.filter, {}}) /
                          RUN;
        printf("%s %s: int: %f ms %f Gflops float: %f ms %f GFlops speedup: "
               "%f\n",
               arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
               used_int, computations / used_int, used_float,
               computations / used_float, used_float / used_int);
    }
}
#endif
#endif

}  // namespace test
}  // namespace megdnn

// vim: syntax=cpp.doxygen

The MegEngine installation package bundles the CUDA environment needed to run code on a GPU, so there is no separate CPU and GPU build. To run GPU programs, make sure the machine has a GPU device and that its driver is installed. If you would like to try deep-learning development on a cloud GPU platform, you are welcome to visit MegStudio.
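As a quick sanity check that the bundled CUDA environment can actually reach a GPU, a minimal Python sketch like the following can be used (it assumes only the public megengine.is_cuda_available() query):

    import megengine

    # Ask the bundled CUDA runtime whether a usable GPU is visible.
    if megengine.is_cuda_available():
        print("GPU detected; MegEngine will prefer it by default")
    else:
        print("no usable GPU found; MegEngine falls back to the CPU")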