
convolution.cpp

#include "megdnn/dtype.h"
#include "test/fallback/fixture.h"

#include "test/common/benchmarker.h"
#include "test/common/checker.h"
#include "test/common/convolution.h"
#include "test/common/rng.h"
#include "test/common/task_record_check.h"

using namespace megdnn;
using namespace test;

namespace megdnn {
namespace test {
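// Checks that a convolution still computes correctly when its kernels are
// recorded once and replayed by TaskRecordChecker.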
TEST_F(FALLBACK, CONVOLUTION_MATRIX_MUL_RECORD) {
    using Param = Convolution::Param;
    TaskRecordChecker<Convolution> checker(1);
    NormalRNG default_rng;
    UniformIntRNG int_rng{-50, 50};
    Param param;
    param.stride_h = 2;
    param.stride_w = 2;
    // Padding is disabled; the earlier kernel/2 assignments were dead stores
    // that were immediately overwritten, so only the final values are kept.
    param.pad_h = 0;
    param.pad_w = 0;
    checker.set_dtype(0, dtype::Float32())
            .set_dtype(1, dtype::Float32())
            .set_rng(0, &default_rng)
            .set_rng(1, &default_rng)
            .set_param(param)
            .execs({{1, 3, 20, 40}, {24, 3, 3, 3}, {}});
}
} // namespace test
} // namespace megdnn

#if MEGDNN_WITH_BENCHMARK
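// Benchmarks the fp32 convolution. Throughput is derived from the deduced
// output layout: flops = 2 * IC * FH * FW * (output elements), printed as
// mflops = flops / (ms_per_run * 1000).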
TEST_F(FALLBACK, BENCHMARK_CONVOLUTION_MATRIX_MUL) {
    using Param = Convolution::Param;
    auto run = [&](const TensorShapeArray& shapes, Param param) {
        Benchmarker<Convolution> benchmarker_float(handle());
        size_t RUN = 50;
        auto tfloat = benchmarker_float.set_display(false)
                              .set_dtype(0, dtype::Float32{})
                              .set_dtype(1, dtype::Float32{})
                              .set_times(RUN)
                              .set_param(param)
                              .exec(shapes);
        size_t IC = shapes[1][1];
        size_t FH = shapes[1][2];
        size_t FW = shapes[1][3];
        TensorLayout dst_layout;
        auto opr = handle()->create_operator<Convolution>();
        opr->param() = param;
        opr->deduce_layout(
                {shapes[0], dtype::Float32()}, {shapes[1], dtype::Float32()},
                dst_layout);
        printf("fp32 flops: %.3f mflops\n",
               (IC * dst_layout.total_nr_elems() * FH * FW * 2) /
                       (tfloat / RUN * 1000));
    };
    auto profile = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
                       size_t stride) {
        Param param;
        param.stride_h = stride;
        param.stride_w = stride;
        // Padding disabled; the kernel/2 assignments were dead stores.
        param.pad_h = 0;
        param.pad_w = 0;
        printf("oc: %zu ic: %zu w: %zu h: %zu stride: %zu kernel_size: %zu\n", oc,
               ic, w, h, stride, kernel);
        run({{1, ic, h, w}, {oc, ic, kernel, kernel}, {}}, param);
    };
    profile(48, 128, 56, 88, 1, 1);
    profile(56, 128, 64, 80, 1, 1);
    profile(24, 3, 256, 320, 3, 2);
    profile(16, 3, 224, 352, 5, 2);
    profile(16, 3, 256, 320, 7, 2);
    profile(8, 8, 56, 88, 3, 1);
    profile(8, 8, 7, 11, 3, 1);
    profile(4, 4, 64, 80, 3, 1);
    profile(108, 108, 7, 7, 3, 1);
    profile(54, 54, 7, 7, 3, 1);
    profile(3, 3, 128, 128, 3, 1);
    profile(3, 3, 112, 112, 3, 1);
}
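// The same benchmark wired for int8 inputs with int32 accumulation. The
// output layout is still deduced with fp32 dtypes, which is harmless here
// because only the element count feeds the flops estimate.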
TEST_F(FALLBACK, BENCHMARK_CONVOLUTION_MATRIX_MUL_8832) {
    using Param = Convolution::Param;
    auto run = [&](const TensorShapeArray& shapes, Param param) {
        Benchmarker<Convolution> benchmarker_float(handle());
        size_t RUN = 50;
        auto tfloat = benchmarker_float.set_display(false)
                              .set_dtype(0, dtype::Int8{})
                              .set_dtype(1, dtype::Int8{})
                              .set_dtype(2, dtype::Int32{})
                              .set_times(RUN)
                              .set_param(param)
                              .exec(shapes);
        size_t IC = shapes[1][1];
        size_t FH = shapes[1][2];
        size_t FW = shapes[1][3];
        TensorLayout dst_layout;
        auto opr = handle()->create_operator<Convolution>();
        opr->param() = param;
        opr->deduce_layout(
                {shapes[0], dtype::Float32()}, {shapes[1], dtype::Float32()},
                dst_layout);
        printf("int8x8x32 flops: %.3f mflops\n",
               (IC * dst_layout.total_nr_elems() * FH * FW * 2) /
                       (tfloat / RUN * 1000));
    };
    auto profile = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
                       size_t stride) {
        Param param;
        param.stride_h = stride;
        param.stride_w = stride;
        // Padding disabled; the kernel/2 assignments were dead stores.
        param.pad_h = 0;
        param.pad_w = 0;
        printf("oc: %zu ic: %zu w: %zu h: %zu stride: %zu kernel_size: %zu\n", oc,
               ic, w, h, stride, kernel);
        run({{1, ic, h, w}, {oc, ic, kernel, kernel}, {}}, param);
    };
    profile(48, 128, 56, 88, 1, 1);
    profile(56, 128, 64, 80, 3, 1);
    profile(24, 3, 256, 320, 3, 2);
}
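// int8 inputs with int16 accumulation; otherwise identical to the 8832 run.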
TEST_F(FALLBACK, BENCHMARK_CONVOLUTION_MATRIX_MUL_8816) {
    using Param = Convolution::Param;
    auto run = [&](const TensorShapeArray& shapes, Param param) {
        Benchmarker<Convolution> benchmarker_float(handle());
        size_t RUN = 50;
        auto tfloat = benchmarker_float.set_display(false)
                              .set_dtype(0, dtype::Int8{})
                              .set_dtype(1, dtype::Int8{})
                              .set_dtype(2, dtype::Int16{})
                              .set_times(RUN)
                              .set_param(param)
                              .exec(shapes);
        size_t IC = shapes[1][1];
        size_t FH = shapes[1][2];
        size_t FW = shapes[1][3];
        TensorLayout dst_layout;
        auto opr = handle()->create_operator<Convolution>();
        opr->param() = param;
        opr->deduce_layout(
                {shapes[0], dtype::Float32()}, {shapes[1], dtype::Float32()},
                dst_layout);
        printf("int8x8x16 flops: %.3f mflops\n",
               (IC * dst_layout.total_nr_elems() * FH * FW * 2) /
                       (tfloat / RUN * 1000));
    };
    auto profile = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
                       size_t stride) {
        Param param;
        param.stride_h = stride;
        param.stride_w = stride;
        // Padding disabled; the kernel/2 assignments were dead stores.
        param.pad_h = 0;
        param.pad_w = 0;
        printf("oc: %zu ic: %zu w: %zu h: %zu stride: %zu kernel_size: %zu\n", oc,
               ic, w, h, stride, kernel);
        run({{1, ic, h, w}, {oc, ic, kernel, kernel}, {}}, param);
    };
    profile(48, 128, 56, 88, 1, 1);
    profile(48, 128, 56, 88, 1, 2);
    profile(56, 128, 64, 80, 3, 1);
    profile(24, 3, 256, 320, 3, 2);
}
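// Compares the two fallback deconvolution algorithms (DeconvDirect vs.
// DeconvMatmul) on identical layouts and reports the direct/matmul speedup.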
TEST_F(FALLBACK, BENCHMARK_CONVOLUTION_BACKWARD_DATA) {
    using Param = ConvolutionBackwardData::Param;
    auto run = [&](const TensorLayoutArray& tensors, Param param) {
        Benchmarker<ConvolutionBackwardData> benchmarker_fallback(handle());
        size_t RUN = 500;
        benchmarker_fallback.set_display(false)
                .set_dtype(0, dtype::Float32{})
                .set_dtype(1, dtype::Float32{})
                .set_times(RUN)
                .set_param(param);
        auto tmatmul =
                benchmarker_fallback
                        .set_before_exec_callback(
                                AlgoChecker<ConvolutionBackwardData>("DeconvMatmul"))
                        .exec(tensors);
        auto tdirect =
                benchmarker_fallback
                        .set_before_exec_callback(
                                AlgoChecker<ConvolutionBackwardData>("DeconvDirect"))
                        .exec(tensors);
        size_t IC = tensors[0][1];
        size_t FH = tensors[0][2];
        size_t FW = tensors[0][3];
        size_t total_flops = IC * tensors[1].total_nr_elems() * FH * FW * 2;
        printf("Direct_time: %.3f ms Direct_flops: %.3f mflops\n", tdirect,
               total_flops / (tdirect / RUN * 1000));
        printf("Matmul_time: %.3f ms Matmul_flops: %.3f mflops\n", tmatmul,
               total_flops / (tmatmul / RUN * 1000));
        printf("speedup: %.3f\n", tdirect / tmatmul);
    };
    auto profile = [&](size_t n, size_t ic, size_t oh, size_t ow, size_t oc, size_t fh,
                       size_t fw, size_t stride = 1, size_t padding = 0) {
        Param param;
        param.pad_h = param.pad_w = padding;
        param.stride_h = param.stride_w = stride;
        printf("oc: %zu ic: %zu w: %zu h: %zu stride: %zu kernel_size: %zu\n", oc,
               ic, ow, oh, stride, fh);
        TensorLayout diff = TensorLayout{{n, oc, oh, ow}, dtype::Float32()};
        TensorLayout filter = TensorLayout{{oc, ic, fh, fw}, dtype::Float32()};
        TensorLayout grad;
        {
            auto opr = handle()->create_operator<ConvolutionBackwardData>();
            opr->param() = param;
            opr->deduce_layout(filter, diff, grad);
        }
        run(TensorLayoutArray{filter, diff, grad}, param);
    };
    profile(1, 1, 3, 3, 1, 2, 2);
    profile(1, 2, 3, 3, 2, 2, 2);
    profile(1, 4, 3, 3, 4, 2, 2);
    profile(1, 4, 3, 3, 8, 2, 2);
    profile(1, 8, 3, 3, 4, 2, 2);
    profile(1, 8, 3, 3, 8, 2, 2);
}
#endif
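// Functional check of dense fp32 convolution against the reference
// implementation across a spread of shapes.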
TEST_F(FALLBACK, CONVOLUTION_MATRIX_MUL) {
    Checker<Convolution> checker(handle());
    using Param = Convolution::Param;
    Param param;
    param.sparse = param::Convolution::Sparse::DENSE;
    auto run = [&](size_t n, size_t ic, size_t ih, size_t iw, size_t oc, size_t fh,
                   size_t fw) {
        param.pad_h = param.pad_w = 1;
        param.stride_h = param.stride_w = 1;
        checker.set_param(param);
        checker.execs({{n, ic, ih, iw}, {oc, ic, fh, fw}, {}});
    };
    run(1, 3, 128, 128, 5, 3, 3);
    run(1, 56, 128, 64, 80, 1, 1);
    run(1, 8, 8, 7, 11, 3, 1);
    run(1, 54, 54, 7, 7, 3, 1);
    run(1, 3, 3, 128, 128, 3, 1);
    run(1, 3, 3, 112, 112, 3, 1);
    run(1, 1, 1, 1, 1, 3, 3);
}

#if MEGDNN_X86
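// x86-only: pins algorithms matching FB_GEMV and exercises int8x8x16
// convolution over dense and grouped (ic == oc) cases across a grid of
// shapes, strides and paddings.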
TEST_F(FALLBACK_MULTI_THREADS, CONVOLUTION_8816) {
    Checker<Convolution> checker(handle());
    using Param = Convolution::Param;
    checker.set_before_exec_callback(AlgoChecker<Convolution>(".+FB_GEMV.+"));
    auto run = [&](size_t n, size_t ic, size_t ih, size_t iw, size_t oc, size_t fh,
                   size_t fw, size_t pad, size_t stride, size_t group) {
        Param param;
        param.sparse = group > 1 ? param::Convolution::Sparse::GROUP
                                 : param::Convolution::Sparse::DENSE;
        param.pad_h = param.pad_w = pad;
        param.stride_h = param.stride_w = stride;
        checker.set_param(param);
        if (group > 1) {
            checker.execl(
                    {{{n, ic, ih, iw}, dtype::Int8()},
                     {{group, oc / group, ic / group, fh, fw}, dtype::Int8()},
                     {{}, dtype::Int16()}});
        } else {
            checker.execl(
                    {{{n, ic, ih, iw}, dtype::Int8()},
                     {{oc, ic, fh, fw}, dtype::Int8()},
                     {{}, dtype::Int16()}});
        }
    };
    for (auto n : {1, 2})
        for (auto ic : {3, 4, 8, 12, 16})
            for (auto oc : {4, 8, 16, 32})
                for (auto ih : {7, 14, 15, 22})
                    for (auto iw : {7, 13, 11, 32})
                        for (auto filter : {1, 2, 3, 5, 7})
                            for (auto stride : {1, 2})
                                for (auto pad : {0, filter / 2}) {
                                    run(n, ic, ih, iw, oc, filter, filter, pad,
                                        stride, 1);
                                    if (ic == oc) {
                                        run(n, ic, ih, iw, oc, filter, filter, pad,
                                            stride, ic);
                                    }
                                }
}
#endif
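// Runs the NAIVE_ALGO path on fp16 inputs under both DEFAULT and FLOAT32
// compute modes; the output dtype is left empty so it is inferred.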
TEST_F(FALLBACK, CONVOLUTION_NAIVE_ALGO_FP16) {
    Checker<Convolution> checker(handle());
    using Param = Convolution::Param;
    checker.set_before_exec_callback(AlgoChecker<ConvolutionForward>("NAIVE_ALGO"));
    Param param;
    param.sparse = param::Convolution::Sparse::DENSE;
    auto run = [&](size_t n, size_t ic, size_t ih, size_t iw, size_t oc, size_t fh,
                   size_t fw) {
        param.pad_h = param.pad_w = 1;
        param.stride_h = param.stride_w = 1;
        for (auto cmode : std::vector<Param::ComputeMode>{
                     Param::ComputeMode::DEFAULT, Param::ComputeMode::FLOAT32}) {
            param.compute_mode = cmode;
            checker.set_param(param)
                    .set_dtype(0, dtype::Float16())
                    .set_dtype(1, dtype::Float16())
                    // Use inferred output dtype.
                    .set_dtype(2, {});
            checker.execs({{n, ic, ih, iw}, {oc, ic, fh, fw}, {}});
        }
    };
    run(1, 3, 128, 128, 5, 3, 3);
    run(1, 8, 8, 7, 11, 3, 1);
}
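// Grouped fp32 convolutions routed through the FALLBACK_ALGO implementation.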
TEST_F(FALLBACK_MULTI_THREADS, CONVOLUTION_NAIVE_FALLBACK) {
    Checker<Convolution> checker(handle());
    using Param = Convolution::Param;
    checker.set_before_exec_callback(AlgoChecker<ConvolutionForward>("FALLBACK_ALGO"));
    Param param;
    auto run = [&](size_t n, size_t group, size_t ic, size_t ih, size_t iw, size_t oc,
                   size_t fh, size_t fw) {
        param.sparse = param::Convolution::Sparse::GROUP;
        param.pad_h = param.pad_w = 1;
        param.stride_h = param.stride_w = 1;
        TensorShape src{n, ic, ih, iw}, filter{group, oc / group, ic / group, fh, fw};
        checker.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, {});
        checker.execs({src, filter, {}});
    };
    run(4, 1, 3, 21, 15, 5, 3, 3);
    run(1, 8, 56, 24, 31, 56, 1, 1);
    run(4, 8, 8, 8, 7, 8, 3, 1);
    run(8, 1, 54, 54, 7, 7, 3, 1);
    run(100, 1, 1, 1, 1, 1, 3, 3);
}
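// Same grouped shapes as above, but cycling NAIVE_ALGO through float32,
// float16, QuantizedS8 and Quantized8Asymm input dtypes.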
TEST_F(FALLBACK_MULTI_THREADS, CONVOLUTION_NAIVE_ALGO) {
    Checker<Convolution> checker(handle());
    using Param = Convolution::Param;
    checker.set_before_exec_callback(AlgoChecker<ConvolutionForward>("NAIVE_ALGO"));
    Param param;
    auto run = [&](size_t n, size_t group, size_t ic, size_t ih, size_t iw, size_t oc,
                   size_t fh, size_t fw) {
        param.sparse = param::Convolution::Sparse::GROUP;
        param.pad_h = param.pad_w = 1;
        param.stride_h = param.stride_w = 1;
        TensorShape src{n, ic, ih, iw}, filter{group, oc / group, ic / group, fh, fw};
        checker.set_param(param).set_dtype(2, {});
        //! float32
        checker.set_dtype(0, dtype::Float32()).set_dtype(1, dtype::Float32());
        checker.execs({src, filter, {}});
        //! float16
        checker.set_dtype(0, dtype::Float16()).set_dtype(1, dtype::Float16());
        checker.execs({src, filter, {}});
        //! Qint8
        checker.set_dtype(0, dtype::QuantizedS8(3.34f))
                .set_dtype(1, dtype::QuantizedS8(0.32f));
        checker.execs({src, filter, {}});
        //! Quint8
        checker.set_dtype(0, dtype::Quantized8Asymm(3.34f, static_cast<uint8_t>(21)))
                .set_dtype(1, dtype::Quantized8Asymm(0.32f, static_cast<uint8_t>(15)));
        checker.execs({src, filter, {}});
    };
    run(4, 1, 3, 21, 15, 5, 3, 3);
    run(1, 8, 56, 24, 31, 56, 1, 1);
    run(4, 8, 8, 8, 7, 8, 3, 1);
    run(8, 1, 54, 54, 7, 7, 3, 1);
    run(100, 1, 1, 1, 1, 1, 3, 3);
}
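// Dense convolution on QuantizedS8 data; the output dtype is inferred from
// the input scales.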
TEST_F(FALLBACK, CONVOLUTION_MATRIX_MUL_SINT8) {
    Checker<Convolution> checker(handle());
    using Param = Convolution::Param;
    Param param;
    param.sparse = param::Convolution::Sparse::DENSE;
    auto run = [&](size_t n, size_t ic, size_t ih, size_t iw, size_t oc, size_t fh,
                   size_t fw) {
        param.pad_h = param.pad_w = 1;
        param.stride_h = param.stride_w = 1;
        checker.set_param(param)
                .set_dtype(0, dtype::QuantizedS8(0.2f))
                .set_dtype(1, dtype::QuantizedS8(0.2f))
                // Use inferred output dtype.
                .set_dtype(2, {});
        checker.execs({{n, ic, ih, iw}, {oc, ic, fh, fw}, {}});
    };
    run(1, 3, 128, 128, 5, 3, 3);
    run(1, 56, 128, 64, 80, 1, 1);
    run(1, 8, 8, 7, 11, 3, 1);
    run(1, 54, 54, 7, 7, 3, 1);
    run(1, 3, 3, 128, 128, 3, 1);
    run(1, 3, 3, 112, 112, 3, 1);
    run(1, 1, 1, 1, 1, 3, 3);
}
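// Backward-data (deconvolution) correctness in fp32. The gradient layout is
// deduced from filter and diff before each check, covering dense, grouped and
// dilated cases in both CONVOLUTION and CROSS_CORRELATION modes.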
TEST_F(FALLBACK, CONVOLUTION_BACKWARD_DATA) {
    Checker<ConvolutionBackwardData> checker(handle());
    using Param = ConvolutionBackwardData::Param;
    Param param;
    auto run = [&](size_t n, size_t ic, size_t oh, size_t ow, size_t oc, size_t fh,
                   size_t fw, size_t stride, size_t padding, size_t dilate = 1,
                   size_t group = 1) {
        param.pad_h = param.pad_w = padding;
        param.stride_h = param.stride_w = stride;
        param.dilate_h = param.dilate_w = dilate;
        TensorLayout diff = TensorLayout{{n, oc * group, oh, ow}, dtype::Float32()};
        TensorLayout grad;
        TensorLayout filter;
        if (group == 1) {
            param.sparse = Param::Sparse::DENSE;
            filter = {{oc, ic, fh, fw}, dtype::Float32()};
        } else {
            param.sparse = Param::Sparse::GROUP;
            filter = {{group, oc, ic, fh, fw}, dtype::Float32()};
        }
        {
            auto opr = handle()->create_operator<ConvolutionBackwardData>();
            opr->param() = param;
            opr->deduce_layout(filter, diff, grad);
        }
        checker.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32());
        checker.exec(TensorLayoutArray{filter, diff, grad});
    };
    for (auto mode : {Param::Mode::CONVOLUTION, Param::Mode::CROSS_CORRELATION}) {
        param.mode = mode;
        run(4, 3, 10, 13, 5, 1, 1, 1, 0, 1, 1);
        run(5, 5, 24, 43, 11, 9, 3, 3, 12, 1, 2);
        run(4, 3, 10, 45, 2, 1, 1, 1, 0, 4, 3);
        run(2, 3, 9, 12, 2, 4, 6, 1, 0, 1, 2);
        run(3, 4, 17, 32, 2, 3, 2, 5, 4, 4, 3);
        run(5, 5, 24, 43, 11, 9, 3, 3, 12, 2, 2);
        run(2, 3, 20, 33, 3, 5, 7, 4, 15, 2, 3);
        run(4, 4, 6, 7, 9, 3, 2, 2, 1, 3, 2);
    }
}
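// Same backward-data check in the blocked NCHW44 layout, where channels are
// packed four at a time and the filter gains two trailing 4x4 block dims.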
TEST_F(FALLBACK, CONVOLUTION_BACKWARD_DATA_NCHW44) {
    Checker<ConvolutionBackwardData> checker(handle());
    using Param = ConvolutionBackwardData::Param;
    Param param;
    param.format = Param::Format::NCHW44;
    auto run = [&](size_t n, size_t ic, size_t oh, size_t ow, size_t oc, size_t fh,
                   size_t fw, size_t stride, size_t padding, size_t dilate = 1,
                   size_t group = 1) {
        param.pad_h = param.pad_w = padding;
        param.stride_h = param.stride_w = stride;
        param.dilate_h = param.dilate_w = dilate;
        TensorLayout diff =
                TensorLayout{{n, oc / 4 * group, oh, ow, 4}, dtype::Float32()};
        TensorLayout grad;
        TensorLayout filter;
        if (group == 1) {
            param.sparse = Param::Sparse::DENSE;
            filter = {{oc / 4, ic / 4, fh, fw, 4, 4}, dtype::Float32()};
        } else {
            param.sparse = Param::Sparse::GROUP;
            filter = {{group, oc / 4, ic / 4, fh, fw, 4, 4}, dtype::Float32()};
        }
        {
            auto opr = handle()->create_operator<ConvolutionBackwardData>();
            opr->param() = param;
            opr->deduce_layout(filter, diff, grad);
        }
        checker.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32());
        checker.exec(TensorLayoutArray{filter, diff, grad});
    };
    for (auto mode : {Param::Mode::CONVOLUTION, Param::Mode::CROSS_CORRELATION}) {
        param.mode = mode;
        run(1, 4, 2, 2, 4, 1, 1, 1, 0, 1, 1);
        run(1, 4, 2, 2, 4, 3, 3, 1, 0, 1, 1);
        run(1, 4, 2, 2, 4, 3, 3, 1, 1, 1, 1);
        run(4, 16, 10, 13, 16, 1, 1, 1, 0, 1, 1);
        run(4, 16, 10, 13, 16, 3, 3, 1, 0, 1, 1);
        run(4, 16, 10, 13, 16, 3, 3, 1, 1, 1, 1);
        run(4, 32, 11, 23, 32, 1, 1, 1, 0, 1, 4);
        run(4, 16, 11, 23, 8, 3, 3, 1, 0, 1, 4);
        run(4, 16, 11, 23, 8, 3, 3, 1, 1, 1, 4);
        run(4, 16, 11, 23, 8, 3, 3, 2, 1, 1, 4);
    }
}
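// Record/replay variant of the fp32 backward-data check above.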
TEST_F(FALLBACK, CONVOLUTION_BACKWARD_DATA_RECORD) {
    TaskRecordChecker<ConvolutionBackwardData> checker(1);
    using Param = ConvolutionBackwardData::Param;
    Param param;
    auto run = [&](size_t n, size_t ic, size_t oh, size_t ow, size_t oc, size_t fh,
                   size_t fw, size_t stride, size_t padding, size_t dilate = 1,
                   size_t group = 1) {
        param.pad_h = param.pad_w = padding;
        param.stride_h = param.stride_w = stride;
        param.dilate_h = param.dilate_w = dilate;
        TensorLayout diff = TensorLayout{{n, oc * group, oh, ow}, dtype::Float32()};
        TensorLayout grad;
        TensorLayout filter;
        if (group == 1) {
            param.sparse = Param::Sparse::DENSE;
            filter = {{oc, ic, fh, fw}, dtype::Float32()};
        } else {
            param.sparse = Param::Sparse::GROUP;
            filter = {{group, oc, ic, fh, fw}, dtype::Float32()};
        }
        {
            auto opr = handle()->create_operator<ConvolutionBackwardData>();
            opr->param() = param;
            opr->deduce_layout(filter, diff, grad);
        }
        checker.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32());
        checker.exec(TensorLayoutArray{filter, diff, grad});
    };
    for (auto mode : {Param::Mode::CONVOLUTION, Param::Mode::CROSS_CORRELATION}) {
        param.mode = mode;
        run(4, 3, 10, 13, 5, 1, 1, 1, 0, 1, 1);
        run(5, 5, 24, 43, 11, 9, 3, 3, 12, 1, 2);
        run(4, 3, 10, 45, 2, 1, 1, 1, 0, 4, 3);
        run(2, 3, 9, 12, 2, 4, 6, 1, 0, 1, 2);
        run(3, 4, 17, 32, 2, 3, 2, 5, 4, 4, 3);
        run(5, 5, 24, 43, 11, 9, 3, 3, 12, 2, 2);
        run(2, 3, 20, 33, 3, 5, 7, 4, 15, 2, 3);
        run(4, 4, 6, 7, 9, 3, 2, 2, 1, 3, 2);
    }
}
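// Backward data with plain Int8 inputs accumulating into Int32.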
TEST_F(FALLBACK, CONVOLUTION_BACKWARD_DATA_INT8_INT8_INT32) {
    Checker<ConvolutionBackwardData> checker(handle());
    using Param = ConvolutionBackwardData::Param;
    Param param;
    auto run = [&](size_t n, size_t ic, size_t oh, size_t ow, size_t oc, size_t fh,
                   size_t fw, size_t stride, size_t padding, size_t dilate = 1,
                   size_t group = 1) {
        param.pad_h = param.pad_w = padding;
        param.stride_h = param.stride_w = stride;
        param.dilate_h = param.dilate_w = dilate;
        TensorLayout diff = TensorLayout{{n, oc * group, oh, ow}, dtype::Int8()};
        TensorLayout grad;
        TensorLayout filter;
        if (group == 1) {
            param.sparse = Param::Sparse::DENSE;
            filter = {{oc, ic, fh, fw}, dtype::Int8()};
        } else {
            param.sparse = Param::Sparse::GROUP;
            filter = {{group, oc, ic, fh, fw}, dtype::Int8()};
        }
        {
            auto opr = handle()->create_operator<ConvolutionBackwardData>();
            opr->param() = param;
            opr->deduce_layout(filter, diff, grad);
        }
        checker.set_param(param)
                .set_dtype(0, dtype::Int8())
                .set_dtype(1, dtype::Int8())
                .set_dtype(2, dtype::Int32());
        checker.exec(TensorLayoutArray{filter, diff, grad});
    };
    for (auto mode : {Param::Mode::CONVOLUTION, Param::Mode::CROSS_CORRELATION}) {
        param.mode = mode;
        run(4, 3, 10, 13, 5, 1, 1, 1, 0, 1, 1);
        run(5, 5, 24, 43, 11, 9, 3, 3, 12, 1, 2);
        run(4, 3, 10, 45, 2, 1, 1, 1, 0, 4, 3);
        run(2, 3, 9, 12, 2, 4, 6, 1, 0, 1, 2);
        run(3, 4, 17, 32, 2, 3, 2, 5, 4, 4, 3);
        run(5, 5, 24, 43, 11, 9, 3, 3, 12, 2, 2);
        run(2, 3, 20, 33, 3, 5, 7, 4, 15, 2, 3);
        run(4, 4, 6, 7, 9, 3, 2, 2, 1, 3, 2);
    }
}
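// Backward data on QuantizedS8; the gradient dtype is inferred.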
TEST_F(FALLBACK, CONVOLUTION_BACKWARD_DATA_SINT8) {
    Checker<ConvolutionBackwardData> checker(handle());
    using Param = ConvolutionBackwardData::Param;
    Param param;
    auto run = [&](size_t n, size_t ic, size_t oh, size_t ow, size_t oc, size_t fh,
                   size_t fw, size_t stride, size_t padding, size_t dilate = 1,
                   size_t group = 1) {
        param.pad_h = param.pad_w = padding;
        param.stride_h = param.stride_w = stride;
        param.dilate_h = param.dilate_w = dilate;
        TensorLayout diff =
                TensorLayout{{n, oc * group, oh, ow}, dtype::QuantizedS8(0.2f)};
        TensorLayout grad;
        TensorLayout filter;
        if (group == 1) {
            param.sparse = Param::Sparse::DENSE;
            filter = {{oc, ic, fh, fw}, dtype::QuantizedS8(0.2f)};
        } else {
            param.sparse = Param::Sparse::GROUP;
            filter = {{group, oc, ic, fh, fw}, dtype::QuantizedS8(0.2f)};
        }
        {
            auto opr = handle()->create_operator<ConvolutionBackwardData>();
            opr->param() = param;
            opr->deduce_layout(filter, diff, grad);
        }
        checker.set_param(param)
                .set_dtype(0, dtype::QuantizedS8(0.2f))
                .set_dtype(1, dtype::QuantizedS8(0.2f))
                .set_dtype(2, {});
        checker.exec(TensorLayoutArray{filter, diff, grad});
    };
    for (auto mode : {Param::Mode::CONVOLUTION, Param::Mode::CROSS_CORRELATION}) {
        param.mode = mode;
        run(4, 3, 10, 13, 5, 1, 1, 1, 0, 1, 1);
        run(5, 5, 24, 43, 11, 9, 3, 3, 12, 1, 2);
        run(4, 3, 10, 45, 2, 1, 1, 1, 0, 4, 3);
        run(2, 3, 9, 12, 2, 4, 6, 1, 0, 1, 2);
        run(3, 4, 17, 32, 2, 3, 2, 5, 4, 4, 3);
        run(5, 5, 24, 43, 11, 9, 3, 3, 12, 2, 2);
        run(2, 3, 20, 33, 3, 5, 7, 4, 15, 2, 3);
        run(4, 4, 6, 7, 9, 3, 2, 2, 1, 3, 2);
    }
}
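// Backward data on Quantized8Asymm with distinct scales and zero points for
// filter and diff; NormalRNG centered at 128 keeps samples in uint8 range.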
TEST_F(FALLBACK, CONVOLUTION_BACKWARD_DATA_QUINT8) {
    Checker<ConvolutionBackwardData> checker(handle());
    using Param = ConvolutionBackwardData::Param;
    Param param;
    auto run = [&](size_t n, size_t ic, size_t oh, size_t ow, size_t oc, size_t fh,
                   size_t fw, size_t stride, size_t padding, size_t dilate = 1,
                   size_t group = 1) {
        param.pad_h = param.pad_w = padding;
        param.stride_h = param.stride_w = stride;
        param.dilate_h = param.dilate_w = dilate;
        TensorLayout diff = TensorLayout{
                {n, oc * group, oh, ow}, dtype::Quantized8Asymm(1.3f, (uint8_t)129)};
        TensorLayout grad;
        TensorLayout filter;
        if (group == 1) {
            param.sparse = Param::Sparse::DENSE;
            filter = {{oc, ic, fh, fw}, dtype::Quantized8Asymm(1.2f, (uint8_t)127)};
        } else {
            param.sparse = Param::Sparse::GROUP;
            filter = {
                    {group, oc, ic, fh, fw},
                    dtype::Quantized8Asymm(1.2f, (uint8_t)127)};
        }
        {
            auto opr = handle()->create_operator<ConvolutionBackwardData>();
            opr->param() = param;
            opr->deduce_layout(filter, diff, grad);
        }
        NormalRNG rng(128.f);
        checker.set_param(param)
                .set_dtype(0, dtype::Quantized8Asymm(1.2f, (uint8_t)127))
                .set_dtype(1, dtype::Quantized8Asymm(1.3f, (uint8_t)129))
                .set_dtype(2, {});
        checker.set_rng(0, &rng).set_rng(1, &rng);
        checker.exec(TensorLayoutArray{filter, diff, grad});
    };
    for (auto mode : {Param::Mode::CONVOLUTION, Param::Mode::CROSS_CORRELATION}) {
        param.mode = mode;
        run(4, 3, 10, 13, 5, 1, 1, 1, 0, 1, 1);
        run(5, 5, 24, 43, 11, 9, 3, 3, 12, 1, 2);
        run(4, 3, 10, 45, 2, 1, 1, 1, 0, 4, 3);
        run(2, 3, 9, 12, 2, 4, 6, 1, 0, 1, 2);
        run(3, 4, 17, 32, 2, 3, 2, 5, 4, 4, 3);
        run(5, 5, 24, 43, 11, 9, 3, 3, 12, 2, 2);
        run(2, 3, 20, 33, 3, 5, 7, 4, 15, 2, 3);
        run(4, 4, 6, 7, 9, 3, 2, 2, 1, 3, 2);
    }
}
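// Pins the DeconvNaive algorithm and re-runs the fp32 backward-data shapes.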
TEST_F(FALLBACK, CONVOLUTION_BACKWARD_DATA_NAIVE_ALGO) {
    Checker<ConvolutionBackwardData> checker(handle());
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("DeconvNaive"));
    using Param = ConvolutionBackwardData::Param;
    Param param;
    auto run = [&](size_t n, size_t ic, size_t oh, size_t ow, size_t oc, size_t fh,
                   size_t fw, size_t stride, size_t padding, size_t dilate = 1,
                   size_t group = 1) {
        param.pad_h = param.pad_w = padding;
        param.stride_h = param.stride_w = stride;
        param.dilate_h = param.dilate_w = dilate;
        TensorLayout diff = TensorLayout{{n, oc * group, oh, ow}, dtype::Float32()};
        TensorLayout grad;
        TensorLayout filter;
        if (group == 1) {
            param.sparse = Param::Sparse::DENSE;
            filter = {{oc, ic, fh, fw}, dtype::Float32()};
        } else {
            param.sparse = Param::Sparse::GROUP;
            filter = {{group, oc, ic, fh, fw}, dtype::Float32()};
        }
        {
            auto opr = handle()->create_operator<ConvolutionBackwardData>();
            opr->param() = param;
            opr->deduce_layout(filter, diff, grad);
        }
        checker.set_param(param);
        checker.exec(TensorLayoutArray{filter, diff, grad});
    };
    for (auto mode : {Param::Mode::CONVOLUTION, Param::Mode::CROSS_CORRELATION}) {
        param.mode = mode;
        run(4, 3, 10, 13, 5, 1, 1, 1, 0, 1, 1);
        run(5, 5, 24, 43, 11, 9, 3, 3, 12, 1, 2);
        run(4, 3, 10, 45, 2, 1, 1, 1, 0, 4, 3);
        run(2, 3, 9, 12, 2, 4, 6, 1, 0, 1, 2);
        run(3, 4, 17, 32, 2, 3, 2, 5, 4, 4, 3);
        run(5, 5, 24, 43, 11, 9, 3, 3, 12, 2, 2);
        run(2, 3, 20, 33, 3, 5, 7, 4, 15, 2, 3);
        run(4, 4, 6, 7, 9, 3, 2, 2, 1, 3, 2);
    }
}

#if MEGDNN_WITH_BENCHMARK
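// Benchmarks backward data in NCHW vs. NCHW44 on identical problems and
// prints per-layout Gflops (flops / (ms_per_run * 1e6)) plus the speedup.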
TEST_F(FALLBACK, BENCHMARK_CONVOLUTION_BACKWARD_DATA_NCHW44) {
    using Param = ConvolutionBackwardData::Param;
    auto run = [&](size_t n, size_t ic, size_t oh, size_t ow, size_t oc, size_t fh,
                   size_t fw, size_t stride, size_t padding, size_t dilate = 1,
                   size_t group = 1) {
        Param param;
        param.pad_h = param.pad_w = padding;
        param.stride_h = param.stride_w = stride;
        param.dilate_h = param.dilate_w = dilate;
        TensorLayout diff_nchw44 =
                TensorLayout{{n, oc / 4 * group, oh, ow, 4}, dtype::Float32()};
        TensorLayout diff = TensorLayout{{n, oc * group, oh, ow}, dtype::Float32()};
        TensorLayout grad;
        TensorLayout grad_nchw44;
        TensorLayout filter;
        TensorLayout filter_nchw44;
        if (group == 1) {
            param.sparse = Param::Sparse::DENSE;
            filter_nchw44 = {{oc / 4, ic / 4, fh, fw, 4, 4}, dtype::Float32()};
            filter = {{oc, ic, fh, fw}, dtype::Float32()};
        } else {
            param.sparse = Param::Sparse::GROUP;
            filter_nchw44 = {{group, oc / 4, ic / 4, fh, fw, 4, 4}, dtype::Float32()};
            filter = {{group, oc, ic, fh, fw}, dtype::Float32()};
        }
        {
            auto opr = handle()->create_operator<ConvolutionBackwardData>();
            opr->param() = param;
            opr->deduce_layout(filter, diff, grad);
            opr->param().format = Param::Format::NCHW44;
            opr->deduce_layout(filter_nchw44, diff_nchw44, grad_nchw44);
        }
        Benchmarker<ConvolutionBackwardData> benchmarker_fallback(handle());
        size_t RUN = 50;
        benchmarker_fallback.set_display(false)
                .set_dtype(0, dtype::Float32{})
                .set_dtype(1, dtype::Float32{})
                .set_dtype(2, dtype::Float32{})
                .set_times(RUN);
        auto tnchw = benchmarker_fallback.set_param(param).exec(
                TensorLayoutArray{filter, diff, grad});
        param.format = Param::Format::NCHW44;
        auto tnchw44 = benchmarker_fallback.set_param(param).exec(
                TensorLayoutArray{filter_nchw44, diff_nchw44, grad_nchw44});
        size_t IC = ic;
        size_t FH = fh;
        size_t FW = fw;
        size_t total_flops = IC * diff.total_nr_elems() * FH * FW * 2;
        printf("nchw_time: %.3f ms nchw_flops: %.3f Gflops\n", tnchw,
               total_flops / (tnchw / RUN * 1e6));
        printf("nchw44_time: %.3f ms nchw44_flops: %.3f Gflops\n", tnchw44,
               total_flops / (tnchw44 / RUN * 1e6));
        printf("speedup: %.3f\n", tnchw / tnchw44);
    };
    run(1, 16, 14, 14, 16, 3, 3, 1, 1, 1, 1);
    run(1, 32, 28, 28, 16, 3, 3, 1, 1, 1, 1);
    run(1, 48, 28, 28, 48, 2, 2, 1, 0, 1, 1);
    run(1, 32, 26, 26, 32, 3, 3, 1, 0, 1, 1);
    run(2, 32, 64, 64, 32, 3, 3, 1, 0, 1, 1);
    run(2, 16, 112, 112, 16, 3, 3, 1, 0, 1, 1);
}
#endif
// vim: syntax=cpp.doxygen