
conv_bias.cpp 44 kB

/**
 * \file dnn/test/fallback/conv_bias.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#include "test/common/conv_bias.h"
#include "megdnn/opr_param_defs.h"
#include "megdnn/oprs.h"
#include "test/common/benchmarker.h"
#include "test/common/checker.h"
#include "test/common/rng.h"
#include "test/common/task_record_check.h"
#include "test/common/tensor.h"
#include "test/fallback/fixture.h"
#if MEGDNN_X86
#include "src/x86/utils.h"
#endif

namespace megdnn {
namespace test {
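// Correctness of the fallback conv_bias: a dense NHWC case, the generic
// argument set restricted to the FALLBACK_NAIVE algorithm, and grouped NCHW
// cases with both per-channel and full-tensor bias.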
TEST_F(FALLBACK, CONV_BIAS_FORWARD) {
    using namespace conv_bias;
    std::vector<TestArg> args = get_args();
    Checker<ConvBiasForward> checker(handle());
    NormalRNG default_rng;
    UniformIntRNG int_rng{-50, 50};
    param::ConvBias param;
    {
        param.format = param::ConvBias::Format::NHWC;
        auto src_shape = TensorShape{2, 16, 32, 24};
        auto filter_shape = TensorShape{4, 3, 3, 24};
        auto bias_shape_channel = TensorShape{1, 1, 1, 4};
        checker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_rng(2, &default_rng)
                .set_param(param)
                .execs({src_shape, filter_shape, bias_shape_channel, {}, {}});
    }

    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBias>("FALLBACK_NAIVE"));
    for (auto&& arg : args) {
        checker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_rng(2, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, arg.bias, {}, {}});
    }

    {
        param.format = param::ConvBias::Format::NCHW;
        param.sparse = ConvBias::Param::Sparse::GROUP;
        auto src_shape = TensorShape{2, 16, 32, 24};
        auto filter_shape = TensorShape{4, 4, 4, 1, 1};
        auto bias_shape_channel = TensorShape{1, 16, 1, 1};
        auto bias_shape = TensorShape{2, 16, 32, 24};
        checker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_rng(2, &default_rng)
                .set_param(param)
                .execs({src_shape, filter_shape, bias_shape, {}, {}})
                .execs({src_shape, filter_shape, bias_shape_channel, {}, {}});
    }
}
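// Same NHWC and grouped NCHW cases as above, run through TaskRecordChecker to
// exercise the record-and-replay execution path.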
TEST_F(FALLBACK, CONV_BIAS_FORWARD_RECORD) {
    using namespace conv_bias;
    TaskRecordChecker<ConvBiasForward> checker(1);
    NormalRNG default_rng;
    UniformIntRNG int_rng{-50, 50};
    param::ConvBias param;
    {
        param.format = param::ConvBias::Format::NHWC;
        auto src_shape = TensorShape{2, 16, 32, 24};
        auto filter_shape = TensorShape{4, 3, 3, 24};
        auto bias_shape_channel = TensorShape{1, 1, 1, 4};
        checker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_rng(2, &default_rng)
                .set_param(param)
                .execs({src_shape, filter_shape, bias_shape_channel, {}, {}});
    }

    {
        param.format = param::ConvBias::Format::NCHW;
        param.sparse = ConvBias::Param::Sparse::GROUP;
        auto src_shape = TensorShape{2, 16, 32, 24};
        auto filter_shape = TensorShape{4, 4, 4, 1, 1};
        auto bias_shape_channel = TensorShape{1, 16, 1, 1};
        auto bias_shape = TensorShape{2, 16, 32, 24};
        checker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_rng(2, &default_rng)
                .set_param(param)
                .execs({src_shape, filter_shape, bias_shape, {}, {}})
                .execs({src_shape, filter_shape, bias_shape_channel, {}, {}});
    }
}
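// GEMV in MK4 layout: A is packed as (M/4, K/4, 4, 4) and B as (K/4, 1, 4),
// i.e. a matrix-vector product (N == 1), so M and K must be multiples of 4.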
TEST_F(FALLBACK, FP32_GEMV_MK4_GI) {
    Checker<MatrixMul> checker(handle());
    using Param = MatrixMul::Param;
    checker.set_before_exec_callback(AlgoChecker<MatrixMul>("FB_GI_F32_GEMV_MK4"));
    checker.set_epsilon(1e-2);
    auto run = [&](size_t M, size_t K) {
        Param param;
        param.format = param::MatrixMul::Format::MK4;
        param.transposeA = false;
        param.transposeB = false;
        TensorShape A, B;
        A = TensorShape{M / 4, K / 4, 4, 4};
        B = TensorShape{K / 4, 1, 4};
        checker.set_param(param).execs({A, B, {}});
    };
    // N = 1
    for (size_t M : {4, 16, 128, 1024})
        for (size_t K : {4, 8, 12, 128, 256, 4096})
            run(M, K);
}
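// Builds dense and grouped NCHW test cases for every combination of the given
// kernel sizes, paddings, nonlinearities and strides. Unless no_bias is set,
// each case is repeated with a broadcast (1, OC, 1, 1) bias, and unless
// only_broadbias is set, also with a full output-shaped bias.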
std::vector<conv_bias::TestArg> get_conv_bias_args(
        std::vector<size_t> kernel, std::vector<size_t> padv,
        std::vector<param::ConvBias::NonlineMode> nlmodev, std::vector<size_t> stridev,
        bool no_bias, bool only_broadbias) {
    using namespace conv_bias;
    using Param = param::ConvBias;
    using NLMode = param::ConvBias::NonlineMode;

    std::vector<TestArg> args;
    auto pack = [&](size_t n, size_t oc, size_t ic, size_t w, size_t h, size_t pad,
                    size_t kernel, size_t stride, NLMode nonlinemode) {
        Param param;
        param.stride_h = stride;
        param.stride_w = stride;
        param.pad_h = pad;
        param.pad_w = pad;
        param.nonlineMode = nonlinemode;
        args.emplace_back(
                param, TensorShape{n, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
                TensorShape{});
        if (!no_bias) {
            args.emplace_back(
                    param, TensorShape{n, ic, h, w},
                    TensorShape{oc, ic, kernel, kernel}, TensorShape{1, oc, 1, 1});
            if (!only_broadbias) {
                args.emplace_back(
                        param, TensorShape{n, ic, h, w},
                        TensorShape{oc, ic, kernel, kernel},
                        TensorShape{
                                n, oc, (h + 2 * param.pad_h - kernel) / stride + 1,
                                (w + 2 * param.pad_w - kernel) / stride + 1});
            }
        }
    };
    auto pack_group = [&](size_t n, size_t oc, size_t ic, size_t w, size_t h,
                          size_t pad, size_t kernel, size_t stride,
                          NLMode nonlinemode) {
        Param param;
        param.stride_h = stride;
        param.stride_w = stride;
        param.pad_h = pad;
        param.pad_w = pad;
        param.nonlineMode = nonlinemode;
        param.sparse = param::ConvBias::Sparse::GROUP;
        args.emplace_back(
                param, TensorShape{n, 2 * ic, h, w},
                TensorShape{2, oc, ic, kernel, kernel}, TensorShape{});
        if (!no_bias) {
            args.emplace_back(
                    param, TensorShape{n, 2 * ic, h, w},
                    TensorShape{2, oc, ic, kernel, kernel},
                    TensorShape{1, oc * 2, 1, 1});
            if (!only_broadbias) {
                args.emplace_back(
                        param, TensorShape{n, 2 * ic, h, w},
                        TensorShape{2, oc, ic, kernel, kernel},
                        TensorShape{
                                n, 2 * oc, (h + 2 * param.pad_h - kernel) / stride + 1,
                                (w + 2 * param.pad_w - kernel) / stride + 1});
            }
        }
    };
    for (size_t n : {1, 2}) {
        for (auto nlmode : nlmodev) {
            for (auto pad : padv) {
                for (auto stride : stridev) {
                    for (size_t ic : {1, 5}) {
                        for (size_t oc : {1, 11}) {
                            for (size_t size : {9, 30}) {
                                for (size_t kern : kernel) {
                                    pack(n, oc, ic, size + 4, size + 4, pad, kern,
                                         stride, nlmode);
                                    pack_group(
                                            n, oc, ic, size, size, pad, kern, stride,
                                            nlmode);
                                }
                            }
                        }
                    }
                }
            }
        }
    }
    return args;
}
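// Shared checker: runs every TestArg against the named algorithm with the
// given dtypes (index 4 is the output) and tolerance; a non-null RNG is
// applied to all inputs.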
void checker_conv_bias(
        std::vector<conv_bias::TestArg> args, Handle* handle, RNG* rng, float epsilon,
        DType type0, DType type1, DType type2, DType type3, const char* algo_name) {
    using namespace conv_bias;

    Checker<ConvBias> checker(handle);
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name));
    checker.set_dtype(0, type0);
    checker.set_dtype(1, type1);
    checker.set_dtype(2, type2);
    checker.set_dtype(4, type3);
    checker.set_epsilon(epsilon);
    if (rng != nullptr) {
        checker.set_rng(0, rng).set_rng(1, rng).set_rng(2, rng).set_rng(3, rng);
    }
    for (auto&& arg : args) {
        checker.set_param(arg.param).execs({arg.src, arg.filter, arg.bias, {}, {}});
    }
}
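// 8x8x16 im2col: int8 src/filter with int16 bias and output; integer results
// should compare exactly, so no epsilon is set here.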
TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_FORWARD_IM2COL_8X8X16) {
    using namespace conv_bias;
    param::ConvBias cur_param;
    using NLMode = param::ConvBias::NonlineMode;
    std::vector<conv_bias::TestArg> args = get_conv_bias_args(
            {1, 3}, {0}, {NLMode::IDENTITY, NLMode::RELU}, {1}, false, true);

    NormalRNG default_rng;
    Checker<ConvBias> checker(handle());
    checker.set_dtype(0, dtype::Int8{});
    checker.set_dtype(1, dtype::Int8{});
    checker.set_dtype(2, dtype::Int16{});
    checker.set_dtype(4, dtype::Int16{});
    for (auto&& arg : args) {
        checker.set_param(arg.param).execs({arg.src, arg.filter, arg.bias, {}, {}});
    }
}

TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_FORWARD) {
    using namespace conv_bias;
    param::ConvBias cur_param;
    using NLMode = param::ConvBias::NonlineMode;
    std::vector<conv_bias::TestArg> args = get_conv_bias_args(
            {1, 3, 5}, {0, 3},
            {NLMode::IDENTITY, NLMode::H_SWISH, NLMode::SIGMOID, NLMode::RELU}, {1, 2},
            false, false);
    NormalRNG default_rng;
    checker_conv_bias(
            args, handle(), &default_rng, 1e-3, dtype::Float32{}, dtype::Float32{},
            dtype::Float32{}, dtype::Float32{}, "FALLBACK_NAIVE");
}

TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_NCHW_NCHW44_F32_S2) {
    check_conv_bias(
            conv_bias::get_nchw44_conv_bias_args(
                    {2, 3, 5, 7}, ONLY_IDENTITY_NLMODE, ONLY_BR_BIASMODE, 2, false,
                    true),
            handle(), "F32_CONV_NCHW_NCHW44");
}

TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_NCHW_NCHW44_F32_S1) {
    check_conv_bias(
            conv_bias::get_nchw44_conv_bias_args(
                    {2, 3, 5, 7}, ONLY_IDENTITY_NLMODE, ONLY_BR_BIASMODE, 1, false,
                    true),
            handle(), "F32_CONV_NCHW_NCHW44");
}
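// Channel-wise (depthwise) NCHW44 cases: each group convolves a single
// 4-channel block, filter shape (group, 1, 1, kh, kw, 4). Bias variants
// mirror get_conv_bias_args.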
std::vector<conv_bias::TestArg> get_nchw44_channel_wise_args(
        std::vector<size_t> kernel, size_t stride, bool no_bias, bool no_nonlinemode,
        bool no_full_bias) {
    using namespace conv_bias;
    using Param = param::ConvBias;
    using NLMode = param::ConvBias::NonlineMode;
    std::vector<TestArg> args;
    auto pack = [&](size_t n, size_t group, size_t w, size_t h, size_t kernel,
                    size_t stride, NLMode nlmode, bool pad) {
        Param param;
        param.stride_h = stride;
        param.stride_w = stride;
        if (pad) {
            param.pad_h = kernel / 2;
            param.pad_w = kernel / 2;
        } else {
            param.pad_h = 0;
            param.pad_w = 0;
        }
        param.nonlineMode = nlmode;
        param.format = param::ConvBias::Format::NCHW44;
        param.sparse = param::ConvBias::Sparse::GROUP;
        args.emplace_back(
                param, TensorShape{n, group, h, w, 4},
                TensorShape{group, 1, 1, kernel, kernel, 4}, TensorShape{});
        if (!no_bias) {
            args.emplace_back(
                    param, TensorShape{n, group, h, w, 4},
                    TensorShape{group, 1, 1, kernel, kernel, 4},
                    TensorShape{1, group, 1, 1, 4});
        }
        if (!no_full_bias) {
            args.emplace_back(
                    param, TensorShape{n, group, h, w, 4},
                    TensorShape{group, 1, 1, kernel, kernel, 4},
                    TensorShape{
                            n, group, (h + 2 * param.pad_h - kernel) / stride + 1,
                            (w + 2 * param.pad_w - kernel) / stride + 1, 4});
        }
    };
    std::vector<NLMode> nonlinemode = {NLMode::IDENTITY};
    if (!no_nonlinemode) {
        nonlinemode.emplace_back(NLMode::RELU);
        nonlinemode.emplace_back(NLMode::H_SWISH);
    }
    for (size_t n : {1, 2}) {
        for (auto nlmode : nonlinemode) {
            for (bool pad : {true}) {
                for (size_t group : {1, 2, 4, 7, 16}) {
                    for (size_t size : {4, 6, 7, 9, 20}) {
                        for (size_t kern : kernel) {
                            pack(n, group, size, size, kern, stride, nlmode, pad);
                        }
                    }
                }
            }
            for (bool pad : {false}) {
                for (size_t group : {1, 2, 7, 16}) {
                    for (size_t size : {7, 9, 20}) {
                        for (size_t kern : kernel) {
                            pack(n, group, size, size, kern, stride, nlmode, pad);
                        }
                    }
                }
            }
        }
    }
    return args;
}
TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_CHANNEL_WISE_STRIDE1_FP32_NCHW44_1) {
    check_conv_bias(
            get_nchw44_channel_wise_args({2, 3}, 1, false, false, false), handle(),
            "F32_CHANNEL_WISE_NCHW44");
}

TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_CHANNEL_WISE_STRIDE1_FP32_NCHW44_2) {
    check_conv_bias(
            get_nchw44_channel_wise_args({5}, 1, false, false, false), handle(),
            "F32_CHANNEL_WISE_NCHW44");
}

TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_CHANNEL_WISE_STRIDE2_FP32_NCHW44) {
    check_conv_bias(
            get_nchw44_channel_wise_args({2, 3, 5}, 2, false, false, false), handle(),
            "F32_CHANNEL_WISE_NCHW44");
}

TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_DIRECT_FP32_NCHW44_S1_K7) {
    //! k=7 s=1
    check_conv_bias(
            conv_bias::get_nchw44_conv_bias_args(
                    {7}, ONLY_IDENTITY_NLMODE, BR_AND_NO_BIASMODE, 1),
            handle(), "F32_CONV_NCHW44_DIRECT");
}

TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_DIRECT_FP32_NCHW44_S1_K2K3) {
    check_conv_bias(
            conv_bias::get_nchw44_conv_bias_args(
                    {2, 3}, FULL_NLMODE, ONLY_BR_BIASMODE, 1),
            handle(), "F32_CONV_NCHW44_DIRECT");
}

TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_DIRECT_FP32_NCHW44_S1_K5) {
    check_conv_bias(
            conv_bias::get_nchw44_conv_bias_args({5}, FULL_NLMODE, ONLY_BR_BIASMODE, 1),
            handle(), "F32_CONV_NCHW44_DIRECT");
}

TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_DIRECT_FP32_NCHW44_S2) {
    check_conv_bias(
            conv_bias::get_nchw44_conv_bias_args(
                    {2, 3, 5, 7}, FULL_NLMODE, ONLY_BR_BIASMODE, 2),
            handle(), "F32_CONV_NCHW44_DIRECT");
}

TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_DIRECT_FP32) {
    check_conv_bias(
            conv_bias::get_conv_bias_args(
                    {1, 2, 3, 4, 5, 6, 7}, 1, false, false, false),
            handle(), "F32DIRECT");
}

TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_DIRECT_FP32_STR2) {
    check_conv_bias(
            conv_bias::get_conv_bias_args({2, 3, 5, 7}, 2, false, false, false),
            handle(), "F32STRD2");
}

TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_DIRECT_FP32_STR1) {
    check_conv_bias(
            conv_bias::get_conv_bias_args({2, 3, 5, 7}, 1, false, false, false),
            handle(), "F32STRD1");
}
TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_PREPROCESS_NCHW44) {
    using namespace conv_bias;
    std::vector<TestArg> nchw44_args = conv_bias::get_nchw44_conv_bias_args(
            {3}, QUAN_NLMODE, BR_AND_NO_BIASMODE, 1);

    Checker<ConvBiasForward> checker(handle());
    auto run = [&checker](
                       const std::vector<TestArg>& args, DType A_dtype, DType B_dtype,
                       DType C_dtype, DType D_dtype, const float eps) {
        for (auto&& arg : args) {
            checker.set_dtype(0, A_dtype)
                    .set_dtype(1, B_dtype)
                    .set_dtype(2, C_dtype)
                    .set_dtype(4, D_dtype)
                    .set_epsilon(eps)
                    .set_param(arg.param)
                    .execs({arg.src, arg.filter, arg.bias, {}, {}});
        }
    };
    //! uncomment this when low precision mode is ok
    // run(handle(), nchw44_args, {2, 6, 7}, dtype::Float32(), dtype::Float32(),
    //     dtype::Float32(), dtype::Float32(), 1e-2f);
    //! remove this when low precision mode is ok
    run(nchw44_args, dtype::Float32(), dtype::Float32(), dtype::Float32(),
        dtype::Float32(), 1e-3f);
}
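// Quantized int8 conv_bias: qint8 src/filter, qint32 bias, requantized qint8
// output, checked against the naive fallback algorithm.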
TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_FORWARD_QUANTIZED) {
    using namespace conv_bias;
    param::ConvBias cur_param;
    using NLMode = param::ConvBias::NonlineMode;
    std::vector<conv_bias::TestArg> args = get_conv_bias_args(
            {1, 3, 5, 7}, {0, 3}, {NLMode::IDENTITY, NLMode::H_SWISH, NLMode::RELU},
            {1, 2}, false, false);
    UniformIntRNG int_rng{-50, 50};
    float epsilon = 1e-3;
    checker_conv_bias(
            args, handle(), &int_rng, epsilon, dtype::QuantizedS8(2.5f),
            dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f),
            dtype::QuantizedS8(60.25f), "FALLBACK_NAIVE");
}

#if MEGDNN_WITH_BENCHMARK
namespace {
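// Runs the same shape set on a multi-threaded and a single-threaded CPU
// handle, restricted to algo_name, and prints per-shape GFLOPS together with
// the multi-thread speedup and speedup-per-core.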
void benchmark_impl(
        const param::ConvBias param,
        std::vector<std::pair<SmallVector<TensorShape>, float>>& shapes_and_computation,
        const std::string algo_name, size_t RUNS,
        TaskExecutorConfig&& multi_thread_config,
        TaskExecutorConfig&& single_thread_config, std::vector<DType>& data_type) {
    std::vector<float> multi_thread_times, single_thread_times;
    {
        auto multi_thread_handle = create_cpu_handle(0, true, &multi_thread_config);
        auto benchmarker = Benchmarker<ConvBias>(multi_thread_handle.get());
        benchmarker.set_times(RUNS)
                .set_display(false)
                .set_param(param)
                .set_dtype(0, data_type[0])
                .set_dtype(1, data_type[1])
                .set_dtype(2, data_type[2])
                .set_dtype(4, data_type[3])
                .set_before_exec_callback(
                        conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name.c_str()));
        for (auto shape : shapes_and_computation) {
            multi_thread_times.push_back(benchmarker.exec(shape.first) / RUNS);
        }
    }
    {
        auto single_thread_handle = create_cpu_handle(0, true, &single_thread_config);
        auto benchmarker = Benchmarker<ConvBias>(single_thread_handle.get());
        benchmarker.set_times(RUNS)
                .set_display(false)
                .set_param(param)
                .set_dtype(0, data_type[0])
                .set_dtype(1, data_type[1])
                .set_dtype(2, data_type[2])
                .set_dtype(4, data_type[3])
                .set_before_exec_callback(
                        conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name.c_str()));
        for (auto shape : shapes_and_computation) {
            single_thread_times.push_back(benchmarker.exec(shape.first) / RUNS);
        }
    }
    printf("Benchmark : Multi threads %zu, ", multi_thread_config.nr_thread);
    printf("core_ids:");
    for (size_t i = 0; i < multi_thread_config.affinity_core_set.size(); i++) {
        printf("%zu ", multi_thread_config.affinity_core_set[i]);
    }
    printf(", Single thread core_id %zu\n", single_thread_config.affinity_core_set[0]);
    for (size_t i = 0; i < shapes_and_computation.size(); i++) {
        auto shapes = shapes_and_computation[i];
        printf("Bench case: ");
        for (auto&& shape : shapes.first) {
            printf("%s ", shape.to_string().c_str());
        }
        float computations = shapes.second;
        printf("%zu threads gflops: %f,\n single thread gflops: "
               "%f. speed up = %f, speedup/cores=%f\n",
               multi_thread_config.nr_thread, computations / multi_thread_times[i],
               computations / single_thread_times[i],
               single_thread_times[i] / multi_thread_times[i],
               single_thread_times[i] / multi_thread_times[i] /
                       multi_thread_config.nr_thread);
    }
}
}  // namespace
TEST_F(FALLBACK_MULTI_THREADS, BENCHMARK_GI_CONVBIAS_DIRECTF32) {
    constexpr size_t RUNS = 50;

    param::ConvBias param;
    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
    param.pad_h = 1;
    param.pad_w = 1;
    param.stride_h = 1;
    param.stride_w = 1;
    param.sparse = param::ConvBias::Sparse::GROUP;

    std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
                          size_t group) {
        SmallVector<TensorShape> shapes{
                {N, IC, H, W},
                {group, OC / group, IC / group, FS, FS},
                {1, OC, 1, 1},
                {},
                {N, OC, H, W}};
        TensorShape dst{N, OC, H, W};
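        // 2 ops per filter tap (multiply-add) plus one add for the bias,
        // scaled by 1e-6: MFLOPs divided by a time in milliseconds gives GFLOPS.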
        float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
                              dst.total_nr_elems()) *
                             1e-6;
        shapes_and_computation.push_back(std::make_pair(shapes, computations));
    };
    bench_case(1, 32, 32, 200, 200, 3, 4);
    bench_case(1, 32, 32, 200, 200, 3, 32);
    bench_case(1, 32, 32, 128, 128, 3, 4);
    bench_case(1, 32, 32, 128, 128, 3, 32);
    bench_case(1, 32, 32, 100, 100, 3, 4);
    bench_case(1, 32, 32, 100, 100, 3, 32);
    bench_case(1, 32, 32, 80, 80, 3, 4);
    bench_case(1, 32, 32, 80, 80, 3, 32);

    std::string algo_name = "F32DIRECT";
    printf("Benchmark F32DIRECT_LARGE_GROUP algo\n");
    std::vector<DType> data_type = {
            dtype::Float32(), dtype::Float32(), dtype::Float32(), dtype::Float32()};
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
    shapes_and_computation.clear();

    algo_name = "F32DIRECT";
    printf("Benchmark F32DIRECT_SMALL_GROUP algo\n");
    bench_case(1, 32, 32, 200, 200, 3, 1);
    bench_case(1, 32, 32, 128, 128, 3, 1);
    bench_case(1, 32, 32, 100, 100, 3, 1);
    bench_case(1, 32, 32, 80, 80, 3, 1);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
}
TEST_F(FALLBACK_MULTI_THREADS, BENCHMARK_GI_CONVBIAS_DIRECTF32_STR1) {
    constexpr size_t RUNS = 50;

    param::ConvBias param;
    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
    param.pad_h = 1;
    param.pad_w = 1;
    param.stride_h = 1;
    param.stride_w = 1;
    param.sparse = param::ConvBias::Sparse::GROUP;

    std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
                          size_t group) {
        SmallVector<TensorShape> shapes{
                {N, IC, H, W},
                {group, OC / group, IC / group, FS, FS},
                {1, OC, 1, 1},
                {},
                {N, OC, H, W}};
        TensorShape dst{N, OC, H, W};
        float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
                              dst.total_nr_elems()) *
                             1e-6;
        shapes_and_computation.push_back(std::make_pair(shapes, computations));
    };
    bench_case(1, 32, 32, 200, 200, 3, 4);
    bench_case(1, 32, 32, 200, 200, 3, 32);
    bench_case(1, 32, 32, 128, 128, 3, 4);
    bench_case(1, 32, 32, 128, 128, 3, 32);
    bench_case(1, 32, 32, 100, 100, 3, 4);
    bench_case(1, 32, 32, 100, 100, 3, 32);
    bench_case(1, 32, 32, 80, 80, 3, 4);
    bench_case(1, 32, 32, 80, 80, 3, 32);

    std::string algo_name = "F32STRD1";
    printf("Benchmark F32STRD1_LARGE_GROUP algo\n");
    std::vector<DType> data_type = {
            dtype::Float32(), dtype::Float32(), dtype::Float32(), dtype::Float32()};
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
    shapes_and_computation.clear();

    algo_name = "F32STRD1";
    printf("Benchmark F32STRD1_SMALL_GROUP algo\n");
    bench_case(1, 32, 32, 200, 200, 3, 1);
    bench_case(1, 32, 32, 128, 128, 3, 1);
    bench_case(1, 32, 32, 100, 100, 3, 1);
    bench_case(1, 32, 32, 80, 80, 3, 1);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
}
TEST_F(FALLBACK_MULTI_THREADS, BENCHMARK_GI_CONVBIAS_DIRECTF32_STR2) {
    constexpr size_t RUNS = 50;

    param::ConvBias param;
    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
    param.pad_h = 1;
    param.pad_w = 1;
    param.stride_h = 2;
    param.stride_w = 2;
    param.sparse = param::ConvBias::Sparse::GROUP;

    std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
                          size_t group, size_t P, size_t S) {
        SmallVector<TensorShape> shapes{
                {N, IC, H, W},
                {group, OC / group, IC / group, FS, FS},
                {1, OC, 1, 1},
                {},
                {N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1}};
        //! use the strided output shape for the FLOP count as well
        TensorShape dst{N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1};
        float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
                              dst.total_nr_elems()) *
                             1e-6;
        shapes_and_computation.push_back(std::make_pair(shapes, computations));
    };
    bench_case(1, 32, 32, 200, 200, 3, 4, 1, 2);
    bench_case(1, 32, 32, 200, 200, 3, 32, 1, 2);
    bench_case(1, 32, 32, 128, 128, 3, 4, 1, 2);
    bench_case(1, 32, 32, 128, 128, 3, 32, 1, 2);
    bench_case(1, 32, 32, 100, 100, 3, 4, 1, 2);
    bench_case(1, 32, 32, 100, 100, 3, 32, 1, 2);
    bench_case(1, 32, 32, 80, 80, 3, 4, 1, 2);
    bench_case(1, 32, 32, 80, 80, 3, 32, 1, 2);

    std::string algo_name = "F32STRD2";
    printf("Benchmark F32STRD2_LARGE_GROUP algo\n");
    std::vector<DType> data_type = {
            dtype::Float32(), dtype::Float32(), dtype::Float32(), dtype::Float32()};
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
    shapes_and_computation.clear();

    algo_name = "F32STRD2";
    printf("Benchmark F32STRD2_SMALL_GROUP algo\n");
    bench_case(1, 32, 32, 200, 200, 3, 1, 1, 2);
    bench_case(1, 32, 32, 128, 128, 3, 1, 1, 2);
    bench_case(1, 32, 32, 100, 100, 3, 1, 1, 2);
    bench_case(1, 32, 32, 80, 80, 3, 1, 1, 2);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
}
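// Compares the plain NCHW stride-1 direct algorithm against the NCHW44
// channel-wise kernel on equivalent depthwise shapes.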
TEST_F(FALLBACK, BENCHMARK_GI_CHANNEL_WISE_F32_STRIDE1_NCHW44) {
    // have to remove the preferred restriction in the usable function before
    // running the benchmark
    using namespace conv_bias;
    param::ConvBias param;
    param.stride_h = 1;
    param.stride_w = 1;
    param.pad_h = 1;
    param.pad_w = 1;
    param.nonlineMode = NonlineMode::RELU;
    param.sparse = param::ConvBias::Sparse::GROUP;

    constexpr size_t RUN = 50;
    Benchmarker<ConvBias> benchmark0(handle());
    benchmark0.set_display(false);
    benchmark0.set_param(param);
    benchmark0.set_times(RUN);
    benchmark0.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("F32STRD1"));

    auto opr = handle()->create_operator<ConvBias>();
    opr->param() = param;

    param.format = param::ConvBias::Format::NCHW44;
    Benchmarker<ConvBias> benchmark1(handle());
    benchmark1.set_display(false);
    benchmark1.set_param(param);
    benchmark1.set_times(RUN);
    benchmark1.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("F32_CHANNEL_WISE_NCHW44"));

    auto run = [&](size_t group, size_t w, size_t h, size_t kernel) {
        TensorLayout dst_layout;
        opr->deduce_layout(
                {{1, group * 4, h, w}, dtype::Int8()},
                {{group * 4, 1, 1, kernel, kernel}, dtype::Int8()},
                {{1, group * 4, 1, 1}, dtype::Int32()}, {}, dst_layout);
        //! dst.nr_elems * IC * FH * FW * 2
        float computations = dst_layout.total_nr_elems() * kernel * kernel * 2.0 /
                             (1024 * 1024 * 1024) * 1e3;

        auto used0 = benchmark0.exec(
                             {{1, group * 4, h, w},
                              {group * 4, 1, 1, kernel, kernel},
                              {1, group * 4, 1, 1},
                              {},
                              {}}) /
                     RUN;
        auto used1 = benchmark1.exec(
                             {{1, group, h, w, 4},
                              {group, 1, 1, kernel, kernel, 4},
                              {1, group, 1, 1, 4},
                              {},
                              {}}) /
                     RUN;
        printf("group/h/w/kernel:%zu,%zu,%zu,%zu: nchw: %f ms %f Gflops "
               "nchw44: "
               "%f ms %f Gflops "
               "speedup: %f\n",
               group, h, w, kernel, used0, computations / used0, used1,
               computations / used1, used0 / used1);
    };
    for (size_t group : {8, 16, 32, 64}) {
        for (size_t kernel : {2, 3, 5}) {
            run(group, 112, 112, kernel);
            run(group, 56, 56, kernel);
            run(group, 48, 48, kernel);
            run(group, 28, 28, kernel);
            run(group, 14, 14, kernel);
        }
    }
    run(8, 112, 112, 3);
    run(32, 56, 56, 3);
    run(64, 28, 28, 3);
    run(128, 14, 14, 3);
}
TEST_F(FALLBACK, BENCHMARK_GI_CHANNEL_WISE_F32_STRIDE2_NCHW44) {
    // have to remove the preferred restriction in the usable function before
    // running the benchmark
    using namespace conv_bias;
    param::ConvBias param;
    param.stride_h = 2;
    param.stride_w = 2;
    param.pad_h = 1;
    param.pad_w = 1;
    param.nonlineMode = NonlineMode::RELU;
    param.sparse = param::ConvBias::Sparse::GROUP;

    constexpr size_t RUN = 50;
    Benchmarker<ConvBias> benchmark0(handle());
    benchmark0.set_display(false);
    benchmark0.set_param(param);
    benchmark0.set_times(RUN);
    benchmark0.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("F32STRD2"));

    auto opr = handle()->create_operator<ConvBias>();
    opr->param() = param;

    param.format = param::ConvBias::Format::NCHW44;
    Benchmarker<ConvBias> benchmark1(handle());
    benchmark1.set_display(false);
    benchmark1.set_param(param);
    benchmark1.set_times(RUN);
    benchmark1.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("F32_CHANNEL_WISE_NCHW44"));

    auto run = [&](size_t group, size_t w, size_t h, size_t kernel) {
        TensorLayout dst_layout;
        opr->deduce_layout(
                {{1, group * 4, h, w}, dtype::Int8()},
                {{group * 4, 1, 1, kernel, kernel}, dtype::Int8()},
                {{1, group * 4, 1, 1}, dtype::Int32()}, {}, dst_layout);
        //! dst.nr_elems * IC * FH * FW * 2
        float computations = dst_layout.total_nr_elems() * kernel * kernel * 2.0 /
                             (1024 * 1024 * 1024) * 1e3;

        auto used0 = benchmark0.exec(
                             {{1, group * 4, h, w},
                              {group * 4, 1, 1, kernel, kernel},
                              {1, group * 4, 1, 1},
                              {},
                              {}}) /
                     RUN;
        auto used1 = benchmark1.exec(
                             {{1, group, h, w, 4},
                              {group, 1, 1, kernel, kernel, 4},
                              {1, group, 1, 1, 4},
                              {},
                              {}}) /
                     RUN;
        printf("group/h/w/kernel:%zu,%zu,%zu,%zu: nchw: %f ms %f Gflops "
               "nchw44: "
               "%f ms %f Gflops "
               "speedup: %f\n",
               group, h, w, kernel, used0, computations / used0, used1,
               computations / used1, used0 / used1);
    };
    for (size_t group : {8, 16, 32, 64}) {
        for (size_t kernel : {2, 3, 5}) {
            run(group, 112, 112, kernel);
            run(group, 56, 56, kernel);
            run(group, 48, 48, kernel);
            run(group, 28, 28, kernel);
            run(group, 14, 14, kernel);
        }
    }
    run(8, 112, 112, 3);
    run(32, 56, 56, 3);
    run(64, 28, 28, 3);
    run(128, 14, 14, 3);
}
TEST_F(FALLBACK, BENCHMARK_CONVBIAS) {
    constexpr size_t RUNS = 10;
    param::ConvBias param;
    param.stride_h = 1;
    param.stride_w = 1;
    Benchmarker<ConvBias> benchmarker_int(handle());
    benchmarker_int.set_times(RUNS)
            .set_dtype(0, dtype::QuantizedS8(2.5f))
            .set_dtype(1, dtype::QuantizedS8(2.5f))
            .set_dtype(2, dtype::QuantizedS32(6.25f))
            .set_dtype(4, dtype::QuantizedS8(40.25f))
            .set_display(false);
    Benchmarker<ConvBias> benchmarker_float(handle());
    benchmarker_float.set_display(false).set_times(RUNS);

    auto run = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS) {
        TensorShape src({N, IC, H, W}), filter({OC, IC, FS, FS}), bias({N, OC, 1, 1}),
                z({}), dst({N, OC, H, W});
        param.pad_h = FS / 2;
        param.pad_w = FS / 2;
        auto int_used =
                benchmarker_int.set_param(param).exec({src, filter, bias, z, dst}) /
                RUNS;
        auto float_used =
                benchmarker_float.set_param(param).exec({src, filter, bias, z, dst}) /
                RUNS;
        float computations = IC * (FS * FS + 1) * dst.total_nr_elems() * 2 * 1e-6;
        printf("run: %s %s %s->%s \nfloat: %f ms %f Gflops int: %f ms "
               "%f Gflops speedup: %f\n",
               src.to_string().c_str(), filter.to_string().c_str(),
               bias.to_string().c_str(), dst.to_string().c_str(), float_used,
               computations / float_used, int_used, computations / int_used,
               float_used / int_used);
    };
    run(1, 128, 128, 32, 32, 3);
    for (size_t IC : {32, 64, 128}) {
        for (size_t OC : {32, 64, 128}) {
            for (size_t size : {28, 56}) {
                for (size_t FS : {3, 5}) {
                    run(1, IC, OC, size, size, FS);
                }
            }
        }
    }
}
TEST_F(FALLBACK, BENCHMARK_GI_CONVBIAS_WINOGRAD_F23_4x4) {
#if MEGDNN_AARCH64
    conv_bias::benchmark_winograd("WINOGRAD:AARCH64_F32_MK4_4x16:4:2", handle(), 3, 4);
#elif MEGDNN_ARMV7
    conv_bias::benchmark_winograd("WINOGRAD:ARMV7_F32_MK4_4x8:4:2", handle(), 3, 4);
#else
    conv_bias::benchmark_winograd("WINOGRAD:FB_GI_F32_MK4_4x8:4:2", handle(), 3, 4);
#endif
}
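// Benchmarks the same winograd matmul algorithm through the NCHW and NCHW44
// conv_bias formats on a fixed set of 3x3 stride-1 shapes and prints the
// NCHW44 speedup.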
void benchmark_winograd_nchw_vs_nchw44(
        const char* algo_name0, const char* algo_name1, Handle* handle) {
    using namespace conv_bias;
    using NLMode = param::ConvBias::NonlineMode;
    std::vector<conv_bias::TestArg> args_nchw44;
    std::vector<conv_bias::TestArg> args_nchw;

    auto pack = [&](size_t n, size_t oc, size_t ic, size_t h, size_t w, size_t group,
                    NLMode nlmode) {
        param::ConvBias param;
        param.format = param::ConvBias::Format::NCHW44;
        param.stride_h = 1;
        param.stride_w = 1;
        param.pad_h = 1;
        param.pad_w = 1;
        param.nonlineMode = nlmode;
        if (group == 1) {
            param.sparse = param::ConvBias::Sparse::DENSE;
            args_nchw44.emplace_back(
                    param, TensorShape{n, ic / 4, h, w, 4},
                    TensorShape{oc / 4, ic / 4, 3, 3, 4, 4}, TensorShape{});
            param.format = param::ConvBias::Format::NCHW;
            args_nchw.emplace_back(
                    param, TensorShape{n, ic, h, w}, TensorShape{oc, ic, 3, 3},
                    TensorShape{});
        } else {
            auto oc_per_group = oc / group;
            auto ic_per_group = ic / group;
            param.sparse = param::ConvBias::Sparse::GROUP;
            args_nchw44.emplace_back(
                    param, TensorShape{n, ic_per_group / 4, h, w, 4},
                    TensorShape{group, oc_per_group / 4, ic_per_group / 4, 3, 3, 4, 4},
                    TensorShape{});
            param.format = param::ConvBias::Format::NCHW;
            args_nchw.emplace_back(
                    param, TensorShape{n, ic, h, w},
                    TensorShape{group, oc_per_group, ic_per_group, 3, 3},
                    TensorShape{});
        }
    };
    std::vector<NLMode> nonlinemode = {NLMode::IDENTITY};
    for (auto nlmode : nonlinemode)
        for (size_t n : {1})
            for (size_t group = 1; group <= 1; ++group) {
                pack(n, 512, 512, 15, 15, group, nlmode);
                pack(n, 512, 256, 15, 15, group, nlmode);
                pack(n, 256, 256, 29, 29, group, nlmode);
                pack(n, 256, 128, 29, 29, group, nlmode);
                pack(n, 128, 128, 57, 57, group, nlmode);
                pack(n, 128, 64, 57, 57, group, nlmode);
                pack(n, 24, 24, 224, 224, group, nlmode);
                pack(n, 64, 24, 123, 123, group, nlmode);
                pack(n, 64, 64, 56, 56, group, nlmode);
                pack(n, 128, 128, 28, 28, group, nlmode);
                pack(n, 256, 256, 14, 14, group, nlmode);
                pack(n, 512, 512, 7, 7, group, nlmode);
            }

    constexpr size_t RUN = 10;
    Benchmarker<ConvBias> benchmark_winograd_nchw(handle);
    benchmark_winograd_nchw.set_display(false);
    benchmark_winograd_nchw.set_times(RUN);
    Benchmarker<ConvBias> benchmark_winograd_nchw44(handle);
    benchmark_winograd_nchw44.set_display(false);
    benchmark_winograd_nchw44.set_times(RUN);

    std::string winograd_nchw_algo_name = ssprintf("WINOGRAD:%s", algo_name0);
    std::string winograd_nchw44_algo_name = ssprintf("WINOGRAD_NCHW44:%s", algo_name1);
    for (size_t i = 0; i < args_nchw.size(); ++i) {
        auto arg_nchw = args_nchw[i];
        auto arg_nchw44 = args_nchw44[i];

        TensorLayout dst_layout;
        auto opr = handle->create_operator<ConvBias>();
        opr->param() = arg_nchw.param;
        opr->deduce_layout(
                {arg_nchw.src, dtype::Float32()}, {arg_nchw.filter, dtype::Float32()},
                {arg_nchw.bias, dtype::Float32()}, {}, dst_layout);
        //! dst.nr_elems * IC * FH * FW * 2
        float computations = dst_layout.total_nr_elems() * arg_nchw.filter[1] *
                             arg_nchw.filter[2] * arg_nchw.filter[3] * 2.0 /
                             (1024 * 1024 * 1024) * 1e3;

        benchmark_winograd_nchw.set_param(arg_nchw.param);
        auto nchw_used = algo_benchmark<ConvBias>(
                                 benchmark_winograd_nchw,
                                 {arg_nchw.src, arg_nchw.filter, {}, {}, {}},
                                 winograd_nchw_algo_name.c_str()) /
                         RUN;

        benchmark_winograd_nchw44.set_param(arg_nchw44.param);
        auto nchw44_used = algo_benchmark<ConvBias>(
                                   benchmark_winograd_nchw44,
                                   {arg_nchw44.src, arg_nchw44.filter, {}, {}, {}},
                                   winograd_nchw44_algo_name.c_str()) /
                           RUN;

        printf("%s %s: nchw: %f ms %f Gflops nchw44: %f ms %f Gflops "
               "speedup: "
               "%f\n",
               arg_nchw.src.to_string().c_str(), arg_nchw.filter.to_string().c_str(),
               nchw_used, computations / nchw_used, nchw44_used,
               computations / nchw44_used, nchw_used / nchw44_used);
    }
}
TEST_F(FALLBACK, BENCHMARK_GI_CONVBIAS_WINOGRAD_F23_MK4_NCHW_VS_NCHW44) {
#if MEGDNN_AARCH64
    benchmark_winograd_nchw_vs_nchw44(
            "AARCH64_F32_MK4_4x16:4:2", "AARCH64_F32_MK4_4x16:4:2", handle());
#elif MEGDNN_ARMV7
    benchmark_winograd_nchw_vs_nchw44(
            "ARMV7_F32_MK4_4x8:4:2", "ARMV7_F32_MK4_4x8:4:2", handle());
#else
    benchmark_winograd_nchw_vs_nchw44(
            "FB_GI_F32_MK4_4x8:4:2", "FB_GI_F32_MK4_4x8:4:2", handle());
#endif
}

TEST_F(FALLBACK, BENCHMARK_GI_CONVBIAS_WINOGRAD_F63_4x4) {
#if MEGDNN_AARCH64
    conv_bias::benchmark_winograd("WINOGRAD:AARCH64_F32_MK4_4x16:4:6", handle(), 3, 4);
#elif MEGDNN_ARMV7
    conv_bias::benchmark_winograd("WINOGRAD:ARMV7_F32_MK4_4x8:4:6", handle(), 3, 4);
#else
    conv_bias::benchmark_winograd("WINOGRAD:FB_GI_F32_MK4_4x8:4:6", handle(), 3, 4);
#endif
}

TEST_F(FALLBACK, BENCHMARK_GI_CONVBIAS_WINOGRAD_F63_MK4_NCHW_VS_NCHW44) {
#if MEGDNN_AARCH64
    benchmark_winograd_nchw_vs_nchw44(
            "AARCH64_F32_MK4_4x16:4:6", "AARCH64_F32_MK4_4x16:4:6", handle());
#elif MEGDNN_ARMV7
    benchmark_winograd_nchw_vs_nchw44(
            "ARMV7_F32_MK4_4x8:4:6", "ARMV7_F32_MK4_4x8:4:6", handle());
#else
    benchmark_winograd_nchw_vs_nchw44(
            "FB_GI_F32_MK4_4x8:4:6", "FB_GI_F32_MK4_4x8:4:6", handle());
#endif
}
#endif

}  // namespace test
}  // namespace megdnn
// vim: syntax=cpp.doxygen