You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

convolution.cpp 14 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338
  1. #include "test/naive/fixture.h"
  2. #include "megdnn/oprs/nn.h"
  3. #include "test/common/benchmarker.h"
  4. #include "test/common/checker.h"
  5. #include "test/common/convolution.h"
  6. #include "test/common/extra_impl_helper.h"
  7. #include "test/common/random_state.h"
  8. using namespace megdnn;
  9. using namespace test;
  10. #if MEGDNN_WITH_BENCHMARK
  11. TEST_F(NAIVE, BENCHMARK_CONVOLUTION_BACKWARD_DATA) {
  12. using Param = ConvolutionBackwardData::Param;
  13. auto run = [&](const TensorLayoutArray& tensors, Param param) {
  14. Benchmarker<ConvolutionBackwardData> benchmarker_naive(handle());
  15. size_t RUN = 500;
  16. auto tfloat = benchmarker_naive.set_display(false)
  17. .set_dtype(0, dtype::Float32{})
  18. .set_dtype(1, dtype::Float32{})
  19. .set_times(RUN)
  20. .set_param(param)
  21. .exec(tensors);
  22. size_t IC = tensors[0][1];
  23. size_t FH = tensors[0][2];
  24. size_t FW = tensors[0][3];
  25. printf("fp32 flops: %.3f mflops\n",
  26. (IC * tensors[1].total_nr_elems() * FH * FW * 2) /
  27. (tfloat / RUN * 1000));
  28. };
  29. auto profile = [&](size_t n, size_t ic, size_t oh, size_t ow, size_t oc, size_t fh,
  30. size_t fw, size_t stride = 1, size_t padding = 0) {
  31. Param param;
  32. param.pad_h = param.pad_w = padding;
  33. param.stride_h = param.stride_w = stride;
  34. printf("oc: %zd ic: %zd w: %zd h: %zd stride: %zd kernel_size: %zd\n", oc, ic,
  35. ow, oh, stride, fh);
  36. TensorLayout diff = TensorLayout{{n, oc, oh, ow}, dtype::Float32()};
  37. TensorLayout filter = TensorLayout{{oc, ic, fh, fw}, dtype::Float32()};
  38. TensorLayout grad;
  39. {
  40. auto opr = handle()->create_operator<ConvolutionBackwardData>();
  41. opr->param() = param;
  42. opr->deduce_layout(filter, diff, grad);
  43. }
  44. run(TensorLayoutArray{filter, diff, grad}, param);
  45. };
  46. profile(1, 1, 2, 2, 1, 3, 3);
  47. profile(1, 1, 4, 4, 1, 3, 3);
  48. profile(1, 1, 8, 8, 1, 3, 3);
  49. profile(1, 1, 16, 16, 1, 3, 3);
  50. profile(1, 1, 32, 32, 1, 3, 3);
  51. profile(1, 1, 64, 64, 1, 3, 3);
  52. profile(1, 1, 128, 128, 1, 3, 3);
  53. }
  54. #endif
TEST_F(NAIVE, CONVOLUTION_QUANTIZED8x8x32) {
    // Forward convolution on asymmetric-quantized uint8 inputs, checked
    // against precomputed int32 golden values; dispatch checking is disabled
    // for the naive handle.
    Checker<Convolution> checker(handle(), /* check_dispatch */ false);
    Convolution::Param param;
    param.format = Convolution::Param::Format::NCHW;
    checker.set_param(param).exect(
            // Inputs: src 1x1x4x4 (scale 0.1, zero point 128) and filter
            // 3x1x3x3 (scale 0.2, zero point 124); output slot left empty so
            // its layout is deduced.
            Testcase{
                    TensorValue(
                            {1, 1, 4, 4}, dtype::Quantized8Asymm(0.1f, (uint8_t)128),
                            {90, 136, 85, 204, 48, 9, 226, 25, 118, 109, 87, 132, 104,
                             163, 25, 90}),
                    TensorValue(
                            {3, 1, 3, 3}, dtype::Quantized8Asymm(0.2f, (uint8_t)124),
                            {153, 170, 102, 103, 23, 213, 116, 195, 191,
                             44, 50, 247, 172, 42, 32, 233, 163, 247,
                             120, 241, 209, 83, 201, 115, 32, 140, 147}),
                    {}},
            // Expected output: QuantizedS32 whose scale is the product of the
            // two input scales (0.1 * 0.2).
            Testcase{
                    {},
                    {},
                    TensorValue(
                            {1, 3, 2, 2}, dtype::QuantizedS32(0.1f * 0.2f),
                            {18617, -22475, -15694, -1920,
                             -12813, 4440, 18190, -13195,
                             -9659, 15933, -5558, -4969})});
}
TEST_F(NAIVE, DECONVOLUTION_QUANTIZED8x8x32) {
    // ConvolutionBackwardData ("deconvolution") on asymmetric-quantized uint8
    // filter/diff, checked against precomputed int32 golden values; dispatch
    // checking is disabled for the naive handle.
    Checker<ConvolutionBackwardData> checker(handle(), /* check_dispatch */ false);
    ConvolutionBackwardData::Param param;
    param.format = ConvolutionBackwardData::Param::Format::NCHW;
    checker.set_param(param).exect(
            // Inputs: filter 1x3x3x3 (scale 0.0084, zero point 135) and diff
            // 1x1x4x4 (scale 0.1, zero point 157); grad slot left empty so
            // its layout is deduced.
            Testcase{
                    TensorValue(
                            {1, 3, 3, 3}, dtype::Quantized8Asymm(0.0084f, (uint8_t)135),
                            {131, 155, 190, 255, 43, 155, 97, 238, 127,
                             157, 72, 161, 157, 0, 69, 204, 167, 180,
                             108, 47, 203, 179, 136, 83, 143, 182, 105}),
                    TensorValue(
                            {1, 1, 4, 4}, dtype::Quantized8Asymm(0.1f, (uint8_t)157),
                            {126, 49, 99, 0, 173, 19, 129, 19, 161, 180, 32, 255, 203,
                             120, 208, 96}),
                    {}},
            // Expected grad: QuantizedS32 whose scale is the product of the
            // two input scales (0.1 * 0.0084), one 6x6 plane per channel.
            Testcase{
                    {},
                    {},
                    TensorValue(
                            {1, 3, 6, 6}, dtype::QuantizedS32(0.1f * 0.0084f),
                            {124, -188, -3633, -6472, -6330, -8635,
                             -3784, -9236, 588, -23262, 8984, -10730,
                             3082, -17133, 2164, -17515, -8486, 3886,
                             -312, 10352, -28728, 26413, -23921, -291,
                             5368, -9134, 17531, -29535, 17726, -2004,
                             -1748, 6144, -6117, 7867, -6691, 488,
                             -682, -423, 4722, -2608, 8383, -4082,
                             -330, -2235, 23844, 6644, 32989, 6774,
                             -1699, -13386, 4010, 2932, 3420, 4591,
                             2204, -12756, -7098, -4632, -5487, -14264,
                             1288, -5309, -4628, -1988, 2380, 8436,
                             3174, -1081, 4405, -4242, 343, -2745,
                             837, 5644, 8962, 1999, 9872, -10676,
                             -1796, -2465, 12940, -4544, 13099, -1220,
                             348, -9350, -5189, 10252, -21445, 18550,
                             -938, -2385, -7868, -646, 9788, -5104,
                             2056, -1210, -224, -6490, 5643, 232,
                             368, 1866, -2711, 3019, -4397, 1830})});
}
TEST_F(NAIVE, CONVOLUTION_WITH_NCHW4) {
    // Check NCHW4-format convolution against a reference built from a plain
    // NCHW convolution plus relayouts (extra_impl below).
    Checker<Convolution> checker(handle());
    Convolution::Param param;
    param.format = Convolution::Param::Format::NCHW4;
    // Build the NCHW4 view of an NCHW layout:
    //   4-d (N, C, H, W)          -> reshape to (N, C/4, 4, H, W), then
    //                                dimshuffle to (N, C/4, H, W, 4)
    //   5-d grouped filter        -> same idea with the channel axis at
    //                                index 2 and the "4" moved to the tail.
    auto convert_true_format = [](const TensorLayout& layout) {
        if (layout.ndim == 4)
            return layout.reshape({layout[0], layout[1] / 4, layout[2], layout[3], 4})
                    .dimshuffle({0, 1, 4, 2, 3});
        else
            return layout
                    .reshape(
                            {layout[0], layout[1], layout[2] / 4, layout[3], layout[4],
                             4})
                    .dimshuffle({0, 1, 2, 5, 3, 4});
    };
    // Reference implementation: relayout NCHW4 src/filter into NCHW buffers,
    // run an NCHW convolution, then relayout the result back into the NCHW4
    // output tensor.
    auto extra_impl = [&, this](const TensorNDArray& tensors) {
        auto conv = handle()->create_operator<Convolution>();
        conv->param() = param;
        conv->param().format = Convolution::Param::Format::NCHW;
        // Allocate NCHW-shaped scratch buffers by folding the trailing "4"
        // axis back into the channel dimension. Freed at the end of this
        // lambda (see the final loop).
        TensorNDArray nchw_tensors;
        for (size_t i = 0; i < tensors.size(); ++i) {
            auto layout = tensors[i].layout;
            if (layout.ndim == 5) {
                layout = layout.reshape(
                        {layout[0], layout[1] * layout[4], layout[2], layout[3]});
            } else {
                // 6-d layouts only occur for grouped filters.
                megdnn_assert(
                        layout.ndim == 6 &&
                        param.sparse == Convolution::Param::Sparse::GROUP);
                layout = layout.reshape(
                        {layout[0], layout[1], layout[2] * layout[5], layout[3],
                         layout[4]});
            }
            nchw_tensors.emplace_back(malloc(layout.span().dist_byte()), layout);
        }
        // Non-contiguous NCHW4 views over the ORIGINAL buffers; these act as
        // source/destination of the relayouts below.
        TensorNDArray nchw4_tensors;
        for (size_t i = 0; i < tensors.size(); ++i) {
            auto layout = convert_true_format(nchw_tensors[i].layout);
            nchw4_tensors.emplace_back(tensors[i].raw_ptr(), std::move(layout));
        }
        // NOTE(review): workspace size is queried with the NCHW4 layouts
        // while the operator actually runs on the NCHW ones — presumably an
        // adequate upper bound for the naive impl; confirm.
        auto workspace_size = conv->get_workspace_in_bytes(
                tensors[0].layout, tensors[1].layout, tensors[2].layout, nullptr);
        dt_byte* workspace_ptr = static_cast<dt_byte*>(malloc(workspace_size));
        Workspace workspace{workspace_ptr, workspace_size};
        auto relayout = handle()->create_operator<RelayoutForward>();
        relayout->exec(nchw4_tensors[0], nchw_tensors[0]);
        relayout->exec(nchw4_tensors[1], nchw_tensors[1]);
        conv->exec(
                nchw_tensors[0], nchw_tensors[1], nchw_tensors[2], nullptr, workspace);
        // Copy the NCHW result back into the caller's NCHW4 output buffer.
        relayout->exec(nchw_tensors[2], nchw4_tensors[2]);
        free(workspace_ptr);
        for (auto&& tensor : nchw_tensors) {
            free(tensor.raw_ptr());
        }
    };
    // NOTE(review): `rng` is declared but never attached to the checker;
    // both inputs use the constant-1 RNG below.
    UniformIntRNG rng{0, 4};
    ConstValue filter_rng{1};
    checker.set_extra_opr_impl(extra_impl)
            .set_rng(0, &filter_rng)
            .set_rng(1, &filter_rng);
    // Dense cases: src {N, C/4, H, W, 4}, filter {OC, IC/4, FH, FW, 4}.
    checker.set_param(param)
            .execs({{1, 2, 2, 2, 4}, {4, 2, 1, 1, 4}, {}})
            .execs({{20, 3, 30, 30, 4}, {4, 3, 1, 1, 4}, {}})
            .execs({{20, 2, 30, 30, 4}, {4, 2, 3, 3, 4}, {}});
    // Grouped cases: filter {G, OCg, ICg/4, FH, FW, 4}.
    param.sparse = Convolution::Param::Sparse::GROUP;
    checker.set_param(param)
            .execs({{20, 15, 30, 30, 4}, {5, 4, 3, 3, 3, 4}, {}})
            .execs({{20, 25, 30, 30, 4}, {5, 4, 5, 1, 1, 4}, {}})
            .execs({{20, 27, 30, 30, 4}, {3, 4, 9, 1, 1, 4}, {}});
}
  190. TEST_F(NAIVE, CONVOLUTION_BFLOAT16) {
  191. Checker<Convolution> checker(handle(), false);
  192. using Param = Convolution::Param;
  193. Param param;
  194. param.sparse = param::Convolution::Sparse::DENSE;
  195. Param impl_param = param;
  196. auto run = [&](size_t n, size_t ic, size_t ih, size_t iw, size_t oc, size_t fh,
  197. size_t fw) {
  198. float scale = 1.0f / sqrt(ic * fh * fw);
  199. UniformFloatRNG rng(scale, 2 * scale);
  200. param.pad_h = param.pad_w = 1;
  201. param.stride_h = param.stride_w = 1;
  202. impl_param.pad_h = impl_param.pad_w = 1;
  203. impl_param.stride_h = impl_param.stride_w = 1;
  204. auto extra_impl = extra_impl_helper<Convolution>(handle(), impl_param);
  205. for (auto cmode : std::vector<Param::ComputeMode>{
  206. Param::ComputeMode::DEFAULT, Param::ComputeMode::FLOAT32}) {
  207. param.compute_mode = cmode;
  208. checker.set_param(param)
  209. .set_dtype(0, dtype::BFloat16())
  210. .set_dtype(1, dtype::BFloat16())
  211. // Use inferred output dtype.
  212. .set_dtype(2, {})
  213. .set_rng(0, &rng)
  214. .set_rng(1, &rng)
  215. .set_extra_opr_impl(extra_impl)
  216. .set_epsilon(1e-1)
  217. .execs({{n, ic, ih, iw}, {oc, ic, fh, fw}, {}});
  218. }
  219. };
  220. run(1, 1, 20, 20, 5, 3, 3);
  221. run(1, 2, 8, 7, 11, 3, 1);
  222. }
  223. TEST_F(NAIVE, CONVOLUTION_BACKWARD_DATA_BFLOAT16) {
  224. Checker<ConvolutionBackwardData> checker(handle(), false);
  225. using Param = ConvolutionBackwardData::Param;
  226. Param param, impl_param;
  227. param.sparse = Param::Sparse::DENSE;
  228. auto run = [&](size_t n, size_t ic, size_t oh, size_t ow, size_t oc, size_t fh,
  229. size_t fw, size_t stride, size_t padding,
  230. const Param::ComputeMode& cmode = Param::ComputeMode::DEFAULT) {
  231. param.pad_h = param.pad_w = padding;
  232. param.stride_h = param.stride_w = stride;
  233. param.dilate_h = param.dilate_w = 1;
  234. param.compute_mode = cmode;
  235. TensorLayout diff = TensorLayout{{n, oc, oh, ow}, dtype::BFloat16()};
  236. TensorLayout grad;
  237. TensorLayout filter;
  238. filter = {{oc, ic, fh, fw}, dtype::BFloat16()};
  239. // TensorLayout grad;
  240. {
  241. auto opr = handle()->create_operator<ConvolutionBackwardData>();
  242. opr->param() = param;
  243. opr->deduce_layout(filter, diff, grad);
  244. }
  245. impl_param = param;
  246. impl_param.compute_mode = Param::ComputeMode::DEFAULT;
  247. auto extra_impl =
  248. extra_impl_helper<ConvolutionBackwardData>(handle(), impl_param);
  249. checker.set_param(param)
  250. .set_extra_opr_impl(extra_impl)
  251. .set_epsilon(1e-1)
  252. .set_dtype(0, dtype::BFloat16())
  253. .set_dtype(1, dtype::BFloat16());
  254. checker.exec(TensorLayoutArray{filter, diff, grad});
  255. };
  256. run(4, 3, 10, 13, 5, 1, 1, 1, 0);
  257. run(2, 1, 24, 43, 11, 3, 3, 2, 1, Param::ComputeMode::FLOAT32);
  258. }
  259. TEST_F(NAIVE, CONVOLUTION_BACKWARD_FILTER_BFLOAT16) {
  260. using namespace convolution;
  261. Checker<ConvolutionBackwardFilter> checker(handle(), false);
  262. using Param = ConvolutionBackwardFilter::Param;
  263. Param param;
  264. Param impl_param = param;
  265. auto run = [&](size_t n, size_t ic, size_t ih, size_t iw, size_t oc, size_t fh,
  266. size_t fw,
  267. const Param::ComputeMode& cmode = Param::ComputeMode::DEFAULT) {
  268. auto src = TensorLayout({n, ic, ih, iw}, dtype::BFloat16());
  269. auto filter = TensorLayout({oc, ic, fh, fw}, dtype::BFloat16());
  270. TensorLayout dst;
  271. {
  272. auto opr = handle()->create_operator<Convolution>();
  273. opr->param() = param;
  274. opr->deduce_layout(src, filter, dst);
  275. }
  276. float scale = 1.0f / sqrt(dst[2] * dst[3]);
  277. UniformFloatRNG rng(scale, 2 * scale);
  278. src.dtype = dst.dtype = filter.dtype = dtype::BFloat16();
  279. param.compute_mode = cmode;
  280. impl_param = param;
  281. impl_param.compute_mode = Param::ComputeMode::DEFAULT;
  282. auto extra_impl =
  283. extra_impl_helper<ConvolutionBackwardFilter>(handle(), impl_param);
  284. checker.set_rng(0, &rng)
  285. .set_rng(1, &rng)
  286. .set_dtype(0, dtype::BFloat16())
  287. .set_dtype(1, dtype::BFloat16())
  288. .set_epsilon(1e-1)
  289. .set_extra_opr_impl(extra_impl)
  290. .set_param(param)
  291. .exec(TensorLayoutArray{src, dst, filter});
  292. };
  293. run(1, 2, 8, 7, 11, 3, 1);
  294. run(1, 1, 20, 20, 5, 3, 3, Param::ComputeMode::FLOAT32);
  295. }
  296. // vim: syntax=cpp.doxygen