
convolution.cpp

/**
 * \file dnn/test/naive/convolution.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#include "test/naive/fixture.h"

#include "test/common/benchmarker.h"
#include "megdnn/oprs/nn.h"
#include "test/common/checker.h"
#include "test/common/random_state.h"
#include "test/common/convolution.h"
#include "test/common/extra_impl_helper.h"

using namespace megdnn;
using namespace test;

#if MEGDNN_WITH_BENCHMARK
// Benchmark the naive ConvolutionBackwardData kernel over several spatial
// sizes and report the achieved fp32 mflops.
TEST_F(NAIVE, BENCHMARK_CONVOLUTION_BACKWARD_DATA) {
    using Param = ConvolutionBackwardData::Param;
    auto run = [&](const TensorLayoutArray& tensors, Param param) {
        Benchmarker<ConvolutionBackwardData> benchmarker_naive(handle());
        size_t RUN = 500;
        auto tfloat = benchmarker_naive.set_display(false)
                              .set_dtype(0, dtype::Float32{})
                              .set_dtype(1, dtype::Float32{})
                              .set_times(RUN)
                              .set_param(param)
                              .exec(tensors);
        size_t IC = tensors[0][1];
        size_t FH = tensors[0][2];
        size_t FW = tensors[0][3];
        printf("fp32 flops: %.3f mflops\n",
               (IC * tensors[1].total_nr_elems() * FH * FW * 2) /
                       (tfloat / RUN * 1000));
    };
    auto profile = [&](size_t n, size_t ic, size_t oh, size_t ow, size_t oc,
                       size_t fh, size_t fw, size_t stride = 1,
                       size_t padding = 0) {
        Param param;
        param.pad_h = param.pad_w = padding;
        param.stride_h = param.stride_w = stride;
        printf("oc: %zd ic: %zd w: %zd h: %zd stride: %zd kernel_size: %zd\n",
               oc, ic, ow, oh, stride, fh);
        TensorLayout diff = TensorLayout{{n, oc, oh, ow}, dtype::Float32()};
        TensorLayout filter = TensorLayout{{oc, ic, fh, fw}, dtype::Float32()};
        TensorLayout grad;
        {
            auto opr = handle()->create_operator<ConvolutionBackwardData>();
            opr->param() = param;
            opr->deduce_layout(filter, diff, grad);
        }
        run(TensorLayoutArray{filter, diff, grad}, param);
    };
    profile(1, 1, 2, 2, 1, 3, 3);
    profile(1, 1, 4, 4, 1, 3, 3);
    profile(1, 1, 8, 8, 1, 3, 3);
    profile(1, 1, 16, 16, 1, 3, 3);
    profile(1, 1, 32, 32, 1, 3, 3);
    profile(1, 1, 64, 64, 1, 3, 3);
    profile(1, 1, 128, 128, 1, 3, 3);
}
#endif

// Forward convolution with Quantized8Asymm inputs and a QuantizedS32 output,
// checked against precomputed values.
TEST_F(NAIVE, CONVOLUTION_QUANTIZED8x8x32) {
    Checker<Convolution> checker(handle(), /* check_dispatch */ false);
    Convolution::Param param;
    param.format = Convolution::Param::Format::NCHW;
    checker.set_param(param).exect(
            Testcase{
                    TensorValue(
                            {1, 1, 4, 4}, dtype::Quantized8Asymm(0.1f, (uint8_t)128),
                            {90, 136, 85, 204,
                             48, 9, 226, 25,
                             118, 109, 87, 132,
                             104, 163, 25, 90}),
                    TensorValue(
                            {3, 1, 3, 3}, dtype::Quantized8Asymm(0.2f, (uint8_t)124),
                            {153, 170, 102,
                             103, 23, 213,
                             116, 195, 191,
                             44, 50, 247,
                             172, 42, 32,
                             233, 163, 247,
                             120, 241, 209,
                             83, 201, 115,
                             32, 140, 147}),
                    {}},
            Testcase{
                    {},
                    {},
                    TensorValue(
                            {1, 3, 2, 2}, dtype::QuantizedS32(0.1f * 0.2f),
                            {18617, -22475,
                             -15694, -1920,
                             -12813, 4440,
                             18190, -13195,
                             -9659, 15933,
                             -5558, -4969})});
}

// ConvolutionBackwardData ("deconvolution") with Quantized8Asymm inputs and a
// QuantizedS32 output, checked against precomputed values.
TEST_F(NAIVE, DECONVOLUTION_QUANTIZED8x8x32) {
    Checker<ConvolutionBackwardData> checker(handle(), /* check_dispatch */ false);
    ConvolutionBackwardData::Param param;
    param.format = ConvolutionBackwardData::Param::Format::NCHW;
    checker.set_param(param).exect(
            Testcase{
                    TensorValue(
                            {1, 3, 3, 3}, dtype::Quantized8Asymm(0.0084f, (uint8_t)135),
                            {131, 155, 190,
                             255, 43, 155,
                             97, 238, 127,
                             157, 72, 161,
                             157, 0, 69,
                             204, 167, 180,
                             108, 47, 203,
                             179, 136, 83,
                             143, 182, 105}),
                    TensorValue(
                            {1, 1, 4, 4}, dtype::Quantized8Asymm(0.1f, (uint8_t)157),
                            {126, 49, 99, 0,
                             173, 19, 129, 19,
                             161, 180, 32, 255,
                             203, 120, 208, 96}),
                    {}},
            Testcase{
                    {},
                    {},
                    TensorValue(
                            {1, 3, 6, 6}, dtype::QuantizedS32(0.1f * 0.0084f),
                            {124, -188, -3633, -6472, -6330, -8635,
                             -3784, -9236, 588, -23262, 8984, -10730,
                             3082, -17133, 2164, -17515, -8486, 3886,
                             -312, 10352, -28728, 26413, -23921, -291,
                             5368, -9134, 17531, -29535, 17726, -2004,
                             -1748, 6144, -6117, 7867, -6691, 488,
                             -682, -423, 4722, -2608, 8383, -4082,
                             -330, -2235, 23844, 6644, 32989, 6774,
                             -1699, -13386, 4010, 2932, 3420, 4591,
                             2204, -12756, -7098, -4632, -5487, -14264,
                             1288, -5309, -4628, -1988, 2380, 8436,
                             3174, -1081, 4405, -4242, 343, -2745,
                             837, 5644, 8962, 1999, 9872, -10676,
                             -1796, -2465, 12940, -4544, 13099, -1220,
                             348, -9350, -5189, 10252, -21445, 18550,
                             -938, -2385, -7868, -646, 9788, -5104,
                             2056, -1210, -224, -6490, 5643, 232,
                             368, 1866, -2711, 3019, -4397, 1830})});
}

// NCHW4 (dense and grouped) convolution, checked by relayouting the tensors to
// NCHW and running the plain NCHW kernel as the reference implementation.
TEST_F(NAIVE, CONVOLUTION_WITH_NCHW4) {
    Checker<Convolution> checker(handle());
    Convolution::Param param;
    param.format = Convolution::Param::Format::NCHW4;
    auto convert_true_format = [](const TensorLayout& layout) {
        if (layout.ndim == 4)
            return layout
                    .reshape({layout[0], layout[1] / 4, layout[2], layout[3], 4})
                    .dimshuffle({0, 1, 4, 2, 3});
        else
            return layout
                    .reshape({layout[0], layout[1], layout[2] / 4, layout[3],
                              layout[4], 4})
                    .dimshuffle({0, 1, 2, 5, 3, 4});
    };
    auto extra_impl = [&, this](const TensorNDArray& tensors) {
        auto conv = handle()->create_operator<Convolution>();
        conv->param() = param;
        conv->param().format = Convolution::Param::Format::NCHW;
        TensorNDArray nchw_tensors;
        for (size_t i = 0; i < tensors.size(); ++i) {
            auto layout = tensors[i].layout;
            if (layout.ndim == 5) {
                layout = layout.reshape(
                        {layout[0], layout[1] * layout[4], layout[2], layout[3]});
            } else {
                megdnn_assert(
                        layout.ndim == 6 &&
                        param.sparse == Convolution::Param::Sparse::GROUP);
                layout = layout.reshape(
                        {layout[0], layout[1], layout[2] * layout[5],
                         layout[3], layout[4]});
            }
            nchw_tensors.emplace_back(malloc(layout.span().dist_byte()), layout);
        }
        TensorNDArray nchw4_tensors;
        for (size_t i = 0; i < tensors.size(); ++i) {
            auto layout = convert_true_format(nchw_tensors[i].layout);
            nchw4_tensors.emplace_back(tensors[i].raw_ptr, std::move(layout));
        }
        auto workspace_size = conv->get_workspace_in_bytes(
                tensors[0].layout, tensors[1].layout, tensors[2].layout, nullptr);
        dt_byte* workspace_ptr = static_cast<dt_byte*>(malloc(workspace_size));
        Workspace workspace{workspace_ptr, workspace_size};
        auto relayout = handle()->create_operator<RelayoutForward>();
        relayout->exec(nchw4_tensors[0], nchw_tensors[0]);
        relayout->exec(nchw4_tensors[1], nchw_tensors[1]);
        conv->exec(nchw_tensors[0], nchw_tensors[1], nchw_tensors[2], nullptr,
                   workspace);
        relayout->exec(nchw_tensors[2], nchw4_tensors[2]);
        free(workspace_ptr);
        for (auto&& tensor : nchw_tensors) {
            free(tensor.raw_ptr);
        }
    };
    UniformIntRNG rng{0, 4};
    ConstValue filter_rng{1};
    checker.set_extra_opr_impl(extra_impl)
            .set_rng(0, &filter_rng)
            .set_rng(1, &filter_rng);
    checker.set_param(param)
            .execs({{1, 2, 2, 2, 4}, {4, 2, 1, 1, 4}, {}})
            .execs({{20, 3, 30, 30, 4}, {4, 3, 1, 1, 4}, {}})
            .execs({{20, 2, 30, 30, 4}, {4, 2, 3, 3, 4}, {}});
    param.sparse = Convolution::Param::Sparse::GROUP;
    checker.set_param(param)
            .execs({{20, 15, 30, 30, 4}, {5, 4, 3, 3, 3, 4}, {}})
            .execs({{20, 25, 30, 30, 4}, {5, 4, 5, 1, 1, 4}, {}})
            .execs({{20, 27, 30, 30, 4}, {3, 4, 9, 1, 1, 4}, {}});
}

// BFloat16 forward convolution compared against a reference implementation
// generated by extra_impl_helper, for DEFAULT and FLOAT32 compute modes.
TEST_F(NAIVE, CONVOLUTION_BFLOAT16) {
    Checker<Convolution> checker(handle(), false);
    using Param = Convolution::Param;
    Param param;
    param.sparse = param::Convolution::Sparse::DENSE;
    Param impl_param = param;
    auto run = [&](size_t n, size_t ic, size_t ih, size_t iw, size_t oc,
                   size_t fh, size_t fw) {
        float scale = 1.0f / sqrt(ic * fh * fw);
        UniformFloatRNG rng(scale, 2 * scale);
        param.pad_h = param.pad_w = 1;
        param.stride_h = param.stride_w = 1;
        impl_param.pad_h = impl_param.pad_w = 1;
        impl_param.stride_h = impl_param.stride_w = 1;
        auto extra_impl = extra_impl_helper<Convolution>(handle(), impl_param);
        for (auto cmode : std::vector<Param::ComputeMode>{
                     Param::ComputeMode::DEFAULT, Param::ComputeMode::FLOAT32}) {
            param.compute_mode = cmode;
            checker.set_param(param)
                    .set_dtype(0, dtype::BFloat16())
                    .set_dtype(1, dtype::BFloat16())
                    // Use inferred output dtype.
                    .set_dtype(2, {})
                    .set_rng(0, &rng)
                    .set_rng(1, &rng)
                    .set_extra_opr_impl(extra_impl)
                    .set_epsilon(1e-1)
                    .execs({{n, ic, ih, iw}, {oc, ic, fh, fw}, {}});
        }
    };
    run(1, 1, 20, 20, 5, 3, 3);
    run(1, 2, 8, 7, 11, 3, 1);
}

// BFloat16 ConvolutionBackwardData compared against a reference generated by
// extra_impl_helper.
TEST_F(NAIVE, CONVOLUTION_BACKWARD_DATA_BFLOAT16) {
    Checker<ConvolutionBackwardData> checker(handle(), false);
    using Param = ConvolutionBackwardData::Param;
    Param param, impl_param;
    param.sparse = Param::Sparse::DENSE;
    auto run = [&](size_t n, size_t ic, size_t oh, size_t ow, size_t oc,
                   size_t fh, size_t fw, size_t stride, size_t padding,
                   const Param::ComputeMode& cmode = Param::ComputeMode::DEFAULT) {
        param.pad_h = param.pad_w = padding;
        param.stride_h = param.stride_w = stride;
        param.dilate_h = param.dilate_w = 1;
        param.compute_mode = cmode;
        TensorLayout diff = TensorLayout{{n, oc, oh, ow}, dtype::BFloat16()};
        TensorLayout filter{{oc, ic, fh, fw}, dtype::BFloat16()};
        TensorLayout grad;
        {
            auto opr = handle()->create_operator<ConvolutionBackwardData>();
            opr->param() = param;
            opr->deduce_layout(filter, diff, grad);
        }
        impl_param = param;
        impl_param.compute_mode = Param::ComputeMode::DEFAULT;
        auto extra_impl =
                extra_impl_helper<ConvolutionBackwardData>(handle(), impl_param);
        checker.set_param(param)
                .set_extra_opr_impl(extra_impl)
                .set_epsilon(1e-1)
                .set_dtype(0, dtype::BFloat16())
                .set_dtype(1, dtype::BFloat16());
        checker.exec(TensorLayoutArray{filter, diff, grad});
    };
    run(4, 3, 10, 13, 5, 1, 1, 1, 0);
    run(2, 1, 24, 43, 11, 3, 3, 2, 1, Param::ComputeMode::FLOAT32);
}

// BFloat16 ConvolutionBackwardFilter compared against a reference generated by
// extra_impl_helper.
TEST_F(NAIVE, CONVOLUTION_BACKWARD_FILTER_BFLOAT16) {
    using namespace convolution;
    Checker<ConvolutionBackwardFilter> checker(handle(), false);
    using Param = ConvolutionBackwardFilter::Param;
    Param param;
    Param impl_param = param;
    auto run = [&](size_t n, size_t ic, size_t ih, size_t iw, size_t oc,
                   size_t fh, size_t fw,
                   const Param::ComputeMode& cmode = Param::ComputeMode::DEFAULT) {
        auto src = TensorLayout({n, ic, ih, iw}, dtype::BFloat16());
        auto filter = TensorLayout({oc, ic, fh, fw}, dtype::BFloat16());
        TensorLayout dst;
        {
            auto opr = handle()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        float scale = 1.0f / sqrt(dst[2] * dst[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        src.dtype = dst.dtype = filter.dtype = dtype::BFloat16();
        param.compute_mode = cmode;
        impl_param = param;
        impl_param.compute_mode = Param::ComputeMode::DEFAULT;
        auto extra_impl =
                extra_impl_helper<ConvolutionBackwardFilter>(handle(), impl_param);
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_dtype(0, dtype::BFloat16())
                .set_dtype(1, dtype::BFloat16())
                .set_epsilon(1e-1)
                .set_extra_opr_impl(extra_impl)
                .set_param(param)
                .exec(TensorLayoutArray{src, dst, filter});
    };
    run(1, 2, 8, 7, 11, 3, 1);
    run(1, 1, 20, 20, 5, 3, 3, Param::ComputeMode::FLOAT32);
}

// vim: syntax=cpp.doxygen

The MegEngine installation package already bundles the CUDA environment needed to run code on a GPU, so there is no separate CPU or GPU build to choose between. To run GPU programs, make sure the machine has a GPU and that the driver is installed. If you would like to try deep learning development on a cloud GPU platform, you are welcome to visit the MegStudio platform.
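
The device check described above takes only a few lines with the MegEngine Python API; the following is a minimal sketch (the helpers is_cuda_available, set_default_device and get_default_device are assumed from the public Python package, so verify them against the version you have installed):

# Minimal sketch: pick a GPU when hardware and driver are present, otherwise fall back to CPU.
# Assumes megengine exposes is_cuda_available / set_default_device / get_default_device.
import megengine as mge

if mge.is_cuda_available():
    mge.set_default_device("gpu0")  # first GPU device
else:
    mge.set_default_device("cpu0")  # CPU fallback

print("running on:", mge.get_default_device())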