You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

conv_bias.cpp 44 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098
  1. #include "test/common/conv_bias.h"
  2. #include "megdnn/opr_param_defs.h"
  3. #include "megdnn/oprs.h"
  4. #include "test/common/benchmarker.h"
  5. #include "test/common/checker.h"
  6. #include "test/common/rng.h"
  7. #include "test/common/task_record_check.h"
  8. #include "test/common/tensor.h"
  9. #include "test/fallback/fixture.h"
  10. #if MEGDNN_X86
  11. #include "src/x86/utils.h"
  12. #endif
  13. namespace megdnn {
  14. namespace test {
  15. TEST_F(FALLBACK, CONV_BIAS_FORWARD) {
  16. using namespace conv_bias;
  17. std::vector<TestArg> args = get_args();
  18. Checker<ConvBiasForward> checker(handle());
  19. NormalRNG default_rng;
  20. UniformIntRNG int_rng{-50, 50};
  21. param::ConvBias param;
  22. {
  23. param.format = param::ConvBias::Format::NHWC;
  24. auto src_shape = TensorShape{2, 16, 32, 24};
  25. auto filter_shape = TensorShape{4, 3, 3, 24};
  26. auto bias_shape_channel = TensorShape{1, 1, 1, 4};
  27. checker.set_dtype(0, dtype::Float32())
  28. .set_dtype(1, dtype::Float32())
  29. .set_dtype(2, dtype::Float32())
  30. .set_rng(0, &default_rng)
  31. .set_rng(1, &default_rng)
  32. .set_rng(2, &default_rng)
  33. .set_param(param)
  34. .execs({src_shape, filter_shape, bias_shape_channel, {}, {}});
  35. }
  36. checker.set_before_exec_callback(
  37. conv_bias::ConvBiasAlgoChecker<ConvBias>("FALLBACK_NAIVE"));
  38. for (auto&& arg : args) {
  39. checker.set_dtype(0, dtype::Float32())
  40. .set_dtype(1, dtype::Float32())
  41. .set_dtype(2, dtype::Float32())
  42. .set_rng(0, &default_rng)
  43. .set_rng(1, &default_rng)
  44. .set_rng(2, &default_rng)
  45. .set_epsilon(1e-3)
  46. .set_param(arg.param)
  47. .execs({arg.src, arg.filter, arg.bias, {}, {}});
  48. }
  49. {
  50. param.format = param::ConvBias::Format::NCHW;
  51. param.sparse = ConvBias::Param::Sparse::GROUP;
  52. auto src_shape = TensorShape{2, 16, 32, 24};
  53. auto filter_shape = TensorShape{4, 4, 4, 1, 1};
  54. auto bias_shape_channel = TensorShape{1, 16, 1, 1};
  55. auto bias_shape = TensorShape{2, 16, 32, 24};
  56. checker.set_dtype(0, dtype::Float32())
  57. .set_dtype(1, dtype::Float32())
  58. .set_dtype(2, dtype::Float32())
  59. .set_rng(0, &default_rng)
  60. .set_rng(1, &default_rng)
  61. .set_rng(2, &default_rng)
  62. .set_param(param)
  63. .execs({src_shape, filter_shape, bias_shape, {}, {}})
  64. .execs({src_shape, filter_shape, bias_shape_channel, {}, {}});
  65. }
  66. }
  67. TEST_F(FALLBACK, CONV_BIAS_FORWARD_RECORD) {
  68. using namespace conv_bias;
  69. TaskRecordChecker<ConvBiasForward> checker(1);
  70. NormalRNG default_rng;
  71. UniformIntRNG int_rng{-50, 50};
  72. param::ConvBias param;
  73. {
  74. param.format = param::ConvBias::Format::NHWC;
  75. auto src_shape = TensorShape{2, 16, 32, 24};
  76. auto filter_shape = TensorShape{4, 3, 3, 24};
  77. auto bias_shape_channel = TensorShape{1, 1, 1, 4};
  78. checker.set_dtype(0, dtype::Float32())
  79. .set_dtype(1, dtype::Float32())
  80. .set_dtype(2, dtype::Float32())
  81. .set_rng(0, &default_rng)
  82. .set_rng(1, &default_rng)
  83. .set_rng(2, &default_rng)
  84. .set_param(param)
  85. .execs({src_shape, filter_shape, bias_shape_channel, {}, {}});
  86. }
  87. {
  88. param.format = param::ConvBias::Format::NCHW;
  89. param.sparse = ConvBias::Param::Sparse::GROUP;
  90. auto src_shape = TensorShape{2, 16, 32, 24};
  91. auto filter_shape = TensorShape{4, 4, 4, 1, 1};
  92. auto bias_shape_channel = TensorShape{1, 16, 1, 1};
  93. auto bias_shape = TensorShape{2, 16, 32, 24};
  94. checker.set_dtype(0, dtype::Float32())
  95. .set_dtype(1, dtype::Float32())
  96. .set_dtype(2, dtype::Float32())
  97. .set_rng(0, &default_rng)
  98. .set_rng(1, &default_rng)
  99. .set_rng(2, &default_rng)
  100. .set_param(param)
  101. .execs({src_shape, filter_shape, bias_shape, {}, {}})
  102. .execs({src_shape, filter_shape, bias_shape_channel, {}, {}});
  103. }
  104. }
  105. TEST_F(FALLBACK, FP32_GEMV_MK4_GI) {
  106. Checker<MatrixMul> checker(handle());
  107. using Param = MatrixMul::Param;
  108. checker.set_before_exec_callback(AlgoChecker<MatrixMul>("FB_GI_F32_GEMV_MK4"));
  109. checker.set_epsilon(1e-2);
  110. auto run = [&](size_t M, size_t K) {
  111. Param param;
  112. param.format = param::MatrixMul::Format::MK4;
  113. param.transposeA = false;
  114. param.transposeB = false;
  115. TensorShape A, B;
  116. A = TensorShape{M / 4, K / 4, 4, 4};
  117. B = TensorShape{K / 4, 1, 4};
  118. checker.set_param(param).execs({A, B, {}});
  119. };
  120. // N = 1
  121. for (size_t M : {4, 16, 128, 1024})
  122. for (size_t K : {4, 8, 12, 128, 256, 4096})
  123. run(M, K);
  124. }
  125. std::vector<conv_bias::TestArg> get_conv_bias_args(
  126. std::vector<size_t> kernel, std::vector<size_t> padv,
  127. std::vector<param::ConvBias::NonlineMode> nlmodev, std::vector<size_t> stridev,
  128. bool no_bias, bool only_broadbias) {
  129. using namespace conv_bias;
  130. using Param = param::ConvBias;
  131. using NLMode = param::ConvBias::NonlineMode;
  132. std::vector<TestArg> args;
  133. auto pack = [&](size_t n, size_t oc, size_t ic, size_t w, size_t h, size_t pad,
  134. size_t kernel, size_t stride, NLMode nonlinemode) {
  135. Param param;
  136. param.stride_h = stride;
  137. param.stride_w = stride;
  138. param.pad_h = pad;
  139. param.pad_w = pad;
  140. param.nonlineMode = nonlinemode;
  141. args.emplace_back(
  142. param, TensorShape{n, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
  143. TensorShape{});
  144. if (!no_bias) {
  145. args.emplace_back(
  146. param, TensorShape{n, ic, h, w},
  147. TensorShape{oc, ic, kernel, kernel}, TensorShape{1, oc, 1, 1});
  148. if (!only_broadbias) {
  149. args.emplace_back(
  150. param, TensorShape{n, ic, h, w},
  151. TensorShape{oc, ic, kernel, kernel},
  152. TensorShape{
  153. n, oc, (h + 2 * param.pad_h - kernel) / stride + 1,
  154. (w + 2 * param.pad_h - kernel) / stride + 1});
  155. }
  156. }
  157. };
  158. auto pack_group = [&](size_t n, size_t oc, size_t ic, size_t w, size_t h,
  159. size_t pad, size_t kernel, size_t stride,
  160. NLMode nonlinemode) {
  161. Param param;
  162. param.stride_h = stride;
  163. param.stride_w = stride;
  164. param.pad_h = pad;
  165. param.pad_w = pad;
  166. param.nonlineMode = nonlinemode;
  167. param.sparse = param::ConvBias::Sparse::GROUP;
  168. args.emplace_back(
  169. param, TensorShape{n, 2 * ic, h, w},
  170. TensorShape{2, oc, ic, kernel, kernel}, TensorShape{});
  171. if (!no_bias) {
  172. args.emplace_back(
  173. param, TensorShape{n, 2 * ic, h, w},
  174. TensorShape{2, oc, ic, kernel, kernel},
  175. TensorShape{1, oc * 2, 1, 1});
  176. if (!only_broadbias) {
  177. args.emplace_back(
  178. param, TensorShape{n, 2 * ic, h, w},
  179. TensorShape{2, oc, ic, kernel, kernel},
  180. TensorShape{
  181. n, 2 * oc, (h + 2 * param.pad_h - kernel) / stride + 1,
  182. (w + 2 * param.pad_h - kernel) / stride + 1});
  183. }
  184. }
  185. };
  186. for (size_t n : {1, 2}) {
  187. for (auto nlmode : nlmodev) {
  188. for (auto pad : padv) {
  189. for (auto stride : stridev) {
  190. for (size_t ic : {1, 5}) {
  191. for (size_t oc : {1, 11}) {
  192. for (size_t size : {9, 30}) {
  193. for (size_t kern : kernel) {
  194. pack(n, oc, ic, size + 4, size + 4, pad, kern,
  195. stride, nlmode);
  196. pack_group(
  197. n, oc, ic, size, size, pad, kern, stride,
  198. nlmode);
  199. }
  200. }
  201. }
  202. }
  203. }
  204. }
  205. }
  206. }
  207. return args;
  208. }
  209. void checker_conv_bias(
  210. std::vector<conv_bias::TestArg> args, Handle* handle, RNG* rng, float epsilon,
  211. DType type0, DType type1, DType type2, DType type3, const char* algo_name) {
  212. using namespace conv_bias;
  213. Checker<ConvBias> checker(handle);
  214. checker.set_before_exec_callback(
  215. conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name));
  216. checker.set_dtype(0, type0);
  217. checker.set_dtype(1, type1);
  218. checker.set_dtype(2, type2);
  219. checker.set_dtype(4, type3);
  220. checker.set_epsilon(epsilon);
  221. if (NULL != rng) {
  222. checker.set_rng(0, rng).set_rng(1, rng).set_rng(2, rng).set_rng(3, rng);
  223. }
  224. for (auto&& arg : args) {
  225. checker.set_param(arg.param).execs({arg.src, arg.filter, arg.bias, {}, {}});
  226. }
  227. }
  228. TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_FORWARD_IM2COL_8X8X16) {
  229. using namespace conv_bias;
  230. param::ConvBias cur_param;
  231. using NLMode = param::ConvBias::NonlineMode;
  232. std::vector<conv_bias::TestArg> args = get_conv_bias_args(
  233. {1, 3}, {0}, {NLMode::IDENTITY, NLMode::RELU}, {1}, false, true);
  234. NormalRNG default_rng;
  235. Checker<ConvBias> checker(handle());
  236. checker.set_dtype(0, dtype::Int8{});
  237. checker.set_dtype(1, dtype::Int8{});
  238. checker.set_dtype(2, dtype::Int16{});
  239. checker.set_dtype(4, dtype::Int16{});
  240. for (auto&& arg : args) {
  241. checker.set_param(arg.param).execs({arg.src, arg.filter, arg.bias, {}, {}});
  242. }
  243. }
  244. TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_FORWARD) {
  245. using namespace conv_bias;
  246. param::ConvBias cur_param;
  247. using NLMode = param::ConvBias::NonlineMode;
  248. std::vector<conv_bias::TestArg> args = get_conv_bias_args(
  249. {1, 3, 5}, {0, 3},
  250. {NLMode::IDENTITY, NLMode::H_SWISH, NLMode::SIGMOID, NLMode::RELU}, {1, 2},
  251. false, false);
  252. NormalRNG default_rng;
  253. checker_conv_bias(
  254. args, handle(), &default_rng, 1e-3, dtype::Float32{}, dtype::Float32{},
  255. dtype::Float32{}, dtype::Float32{}, "FALLBACK_NAIVE");
  256. }
  257. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_NCHW_NCHW44_F32_S2) {
  258. check_conv_bias(
  259. conv_bias::get_nchw44_conv_bias_args(
  260. {2, 3, 5, 7}, ONLY_IDENTITY_NLMODE, ONLY_BR_BIASMODE, 2, false,
  261. true),
  262. handle(), "F32_CONV_NCHW_NCHW44");
  263. }
  264. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_NCHW_NCHW44_F32_S1) {
  265. check_conv_bias(
  266. conv_bias::get_nchw44_conv_bias_args(
  267. {2, 3, 5, 7}, ONLY_IDENTITY_NLMODE, ONLY_BR_BIASMODE, 1, false,
  268. true),
  269. handle(), "F32_CONV_NCHW_NCHW44");
  270. }
  271. std::vector<conv_bias::TestArg> get_nchw44_channel_wise_args(
  272. std::vector<size_t> kernel, size_t stride, bool no_bias, bool no_nonlinemode,
  273. bool no_full_bias) {
  274. using namespace conv_bias;
  275. using Param = param::ConvBias;
  276. using NLMode = param::ConvBias::NonlineMode;
  277. std::vector<TestArg> args;
  278. auto pack = [&](size_t n, size_t group, size_t w, size_t h, size_t kernel,
  279. size_t stride, NLMode nlmode, bool pad) {
  280. Param param;
  281. param.stride_h = stride;
  282. param.stride_w = stride;
  283. if (pad) {
  284. param.pad_h = kernel / 2;
  285. param.pad_w = kernel / 2;
  286. } else {
  287. param.pad_h = 0;
  288. param.pad_w = 0;
  289. }
  290. param.nonlineMode = nlmode;
  291. param.format = param::ConvBias::Format::NCHW44;
  292. param.sparse = param::ConvBias::Sparse::GROUP;
  293. args.emplace_back(
  294. param, TensorShape{n, group, h, w, 4},
  295. TensorShape{group, 1, 1, kernel, kernel, 4}, TensorShape{});
  296. if (!no_bias) {
  297. args.emplace_back(
  298. param, TensorShape{n, group, h, w, 4},
  299. TensorShape{group, 1, 1, kernel, kernel, 4},
  300. TensorShape{1, group, 1, 1, 4});
  301. }
  302. if (!no_full_bias) {
  303. args.emplace_back(
  304. param, TensorShape{n, group, h, w, 4},
  305. TensorShape{group, 1, 1, kernel, kernel, 4},
  306. TensorShape{
  307. n, group, (h + 2 * param.pad_w - kernel) / stride + 1,
  308. (w + 2 * param.pad_w - kernel) / stride + 1, 4});
  309. }
  310. };
  311. std::vector<NLMode> nonlinemode = {NLMode::IDENTITY};
  312. if (!no_nonlinemode) {
  313. nonlinemode.emplace_back(NLMode::RELU);
  314. nonlinemode.emplace_back(NLMode::H_SWISH);
  315. }
  316. for (size_t n : {1, 2}) {
  317. for (auto nlmode : nonlinemode) {
  318. for (bool pad : {true}) {
  319. for (size_t group : {1, 2, 4, 7, 16}) {
  320. for (size_t size : {4, 6, 7, 9, 20}) {
  321. for (size_t kern : kernel) {
  322. pack(n, group, size, size, kern, stride, nlmode, pad);
  323. }
  324. }
  325. }
  326. }
  327. for (bool pad : {false}) {
  328. for (size_t group : {1, 2, 7, 16}) {
  329. for (size_t size : {7, 9, 20}) {
  330. for (size_t kern : kernel) {
  331. pack(n, group, size, size, kern, stride, nlmode, pad);
  332. }
  333. }
  334. }
  335. }
  336. }
  337. }
  338. return args;
  339. }
  340. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_CHANNEL_WISE_STRIDE1_FP32_NCHW44_1) {
  341. check_conv_bias(
  342. get_nchw44_channel_wise_args({2, 3}, 1, false, false, false), handle(),
  343. "F32_CHANNEL_WISE_NCHW44");
  344. }
  345. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_CHANNEL_WISE_STRIDE1_FP32_NCHW44_2) {
  346. check_conv_bias(
  347. get_nchw44_channel_wise_args({5}, 1, false, false, false), handle(),
  348. "F32_CHANNEL_WISE_NCHW44");
  349. }
  350. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_CHANNEL_WISE_STRIDE2_FP32_NCHW44) {
  351. check_conv_bias(
  352. get_nchw44_channel_wise_args({2, 3, 5}, 2, false, false, false), handle(),
  353. "F32_CHANNEL_WISE_NCHW44");
  354. }
  355. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_DIRECT_FP32_NCHW44_S1_K7) {
  356. //! k=7 s=1
  357. check_conv_bias(
  358. conv_bias::get_nchw44_conv_bias_args(
  359. {7}, ONLY_IDENTITY_NLMODE, BR_AND_NO_BIASMODE, 1),
  360. handle(), "F32_CONV_NCHW44_DIRECT");
  361. }
  362. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_DIRECT_FP32_NCHW44_S1_K2K3) {
  363. check_conv_bias(
  364. conv_bias::get_nchw44_conv_bias_args(
  365. {2, 3}, FULL_NLMODE, ONLY_BR_BIASMODE, 1),
  366. handle(), "F32_CONV_NCHW44_DIRECT");
  367. }
  368. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_DIRECT_FP32_NCHW44_S1_K5) {
  369. check_conv_bias(
  370. conv_bias::get_nchw44_conv_bias_args({5}, FULL_NLMODE, ONLY_BR_BIASMODE, 1),
  371. handle(), "F32_CONV_NCHW44_DIRECT");
  372. }
  373. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_DIRECT_FP32_NCHW44_S2) {
  374. check_conv_bias(
  375. conv_bias::get_nchw44_conv_bias_args(
  376. {2, 3, 5, 7}, FULL_NLMODE, ONLY_BR_BIASMODE, 2),
  377. handle(), "F32_CONV_NCHW44_DIRECT");
  378. }
  379. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_DIRECT_FP32) {
  380. check_conv_bias(
  381. conv_bias::get_conv_bias_args(
  382. {1, 2, 3, 4, 5, 6, 7}, 1, false, false, false),
  383. handle(), "F32DIRECT");
  384. }
  385. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_DIRECT_FP32_STR2) {
  386. check_conv_bias(
  387. conv_bias::get_conv_bias_args({2, 3, 5, 7}, 2, false, false, false),
  388. handle(), "F32STRD2");
  389. }
  390. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_DIRECT_FP32_STR1) {
  391. check_conv_bias(
  392. conv_bias::get_conv_bias_args({2, 3, 5, 7}, 1, false, false, false),
  393. handle(), "F32STRD1");
  394. }
// Winograd weight-preprocess path on NCHW44, 3x3 stride-1 args.
// Currently runs in full precision only; the low-precision variant is kept
// below as commented-out code until that mode is supported.
TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_PREPROCESS_NCHW44) {
    using namespace conv_bias;
    std::vector<TestArg> nchw44_args = conv_bias::get_nchw44_conv_bias_args(
            {3}, QUAN_NLMODE, BR_AND_NO_BIASMODE, 1);
    Checker<ConvBiasForward> checker(handle());
    // Runs every arg with dtypes for src(0)/filter(1)/bias(2)/dst(4) and the
    // given epsilon; index 3 (the z tensor) keeps its default dtype.
    auto run = [&checker](
                       const std::vector<TestArg>& args, DType A_dtype, DType B_dtype,
                       DType C_dtype, DType D_dtype, const float eps) {
        for (auto&& arg : args) {
            checker.set_dtype(0, A_dtype)
                    .set_dtype(1, B_dtype)
                    .set_dtype(2, C_dtype)
                    .set_dtype(4, D_dtype)
                    .set_epsilon(eps)
                    .set_param(arg.param)
                    .execs({arg.src, arg.filter, arg.bias, {}, {}});
        }
    };
    //! uncomment this when low precision mode is ok
    // run(handle(), nchw44_args, {2, 6, 7}, dtype::Float32(), dtype::Float32(),
    // dtype::Float32(), dtype::Float32(), 1e-2f);
    //! remove this when low precision mode is ok
    run(nchw44_args, dtype::Float32(), dtype::Float32(), dtype::Float32(),
        dtype::Float32(), 1e-3f);
}
  420. TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_FORWARD_QUANTIZED) {
  421. using namespace conv_bias;
  422. param::ConvBias cur_param;
  423. using NLMode = param::ConvBias::NonlineMode;
  424. std::vector<conv_bias::TestArg> args = get_conv_bias_args(
  425. {1, 3, 5, 7}, {0, 3}, {NLMode::IDENTITY, NLMode::H_SWISH, NLMode::RELU},
  426. {1, 2}, false, false);
  427. UniformIntRNG int_rng{-50, 50};
  428. float epsilon = 1e-3;
  429. checker_conv_bias(
  430. args, handle(), &int_rng, epsilon, dtype::QuantizedS8(2.5f),
  431. dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f),
  432. dtype::QuantizedS8(60.25f), "FALLBACK_NAIVE");
  433. }
  434. #if MEGDNN_WITH_BENCHMARK
  435. namespace {
  436. void benchmark_impl(
  437. const param::ConvBias param,
  438. std::vector<std::pair<SmallVector<TensorShape>, float>>& shapes_and_computation,
  439. const std::string algo_name, size_t RUNS,
  440. TaskExecutorConfig&& multi_thread_config,
  441. TaskExecutorConfig&& single_thread_config, std::vector<DType>& data_type) {
  442. std::vector<float> multi_thread_times, single_thread_times;
  443. {
  444. auto multi_thread_hanle = create_cpu_handle(0, true, &multi_thread_config);
  445. auto benchmarker = Benchmarker<ConvBias>(multi_thread_hanle.get());
  446. benchmarker.set_times(RUNS)
  447. .set_display(false)
  448. .set_param(param)
  449. .set_dtype(0, data_type[0])
  450. .set_dtype(1, data_type[1])
  451. .set_dtype(2, data_type[2])
  452. .set_dtype(4, data_type[3])
  453. .set_before_exec_callback(
  454. conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name.c_str()));
  455. for (auto shape : shapes_and_computation) {
  456. multi_thread_times.push_back(benchmarker.exec(shape.first) / RUNS);
  457. }
  458. }
  459. {
  460. auto single_thread_handle = create_cpu_handle(0, true, &single_thread_config);
  461. auto benchmarker = Benchmarker<ConvBias>(single_thread_handle.get());
  462. benchmarker.set_times(RUNS)
  463. .set_display(false)
  464. .set_param(param)
  465. .set_dtype(0, data_type[0])
  466. .set_dtype(1, data_type[1])
  467. .set_dtype(2, data_type[2])
  468. .set_dtype(4, data_type[3])
  469. .set_before_exec_callback(
  470. conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name.c_str()));
  471. for (auto shape : shapes_and_computation) {
  472. single_thread_times.push_back(benchmarker.exec(shape.first) / RUNS);
  473. }
  474. }
  475. printf("Benchmark : Multi threads %zu, ", multi_thread_config.nr_thread);
  476. printf("core_ids:");
  477. for (size_t i = 0; i < multi_thread_config.affinity_core_set.size(); i++) {
  478. printf("%zu ", multi_thread_config.affinity_core_set[i]);
  479. }
  480. printf(", Single thread core_id %zu\n", single_thread_config.affinity_core_set[0]);
  481. for (size_t i = 0; i < shapes_and_computation.size(); i++) {
  482. auto shapes = shapes_and_computation[i];
  483. printf("Bench case: ");
  484. for (auto&& shape : shapes.first) {
  485. printf("%s ", shape.to_string().c_str());
  486. }
  487. float computations = shapes.second;
  488. printf("%zu threads gflops: %f,\n single thread gflops: "
  489. "%f. spead up = %f, speedup/cores=%f\n",
  490. multi_thread_config.nr_thread, computations / multi_thread_times[i],
  491. computations / single_thread_times[i],
  492. single_thread_times[i] / multi_thread_times[i],
  493. single_thread_times[i] / multi_thread_times[i] /
  494. multi_thread_config.nr_thread);
  495. }
  496. }
  497. } // namespace
// Benchmark F32DIRECT: stride-1 3x3 grouped conv in large-group and
// small-group (group=1) configurations, across several thread/core setups.
TEST_F(FALLBACK_MULTI_THREADS, BENCHMARK_GI_CONVBIAS_DIRECTF32) {
    constexpr size_t RUNS = 50;
    param::ConvBias param;
    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
    param.pad_h = 1;
    param.pad_w = 1;
    param.stride_h = 1;
    param.stride_w = 1;
    param.sparse = param::ConvBias::Sparse::GROUP;
    std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
    // Registers one case: {src, filter, bias, z, dst} shapes plus an
    // estimated computation count (MACs*2 + bias add, scaled by 1e-6).
    // Output shape equals input H/W here because pad=1, stride=1, FS=3.
    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
                          size_t group) {
        SmallVector<TensorShape> shapes{
                {N, IC, H, W},
                {group, OC / group, IC / group, FS, FS},
                {1, OC, 1, 1},
                {},
                {N, OC, H, W}};
        TensorShape dst{N, OC, H, W};
        float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
                              dst.total_nr_elems()) *
                             1e-6;
        shapes_and_computation.push_back(std::make_pair(shapes, computations));
    };
    // Large-group sweep (group = 4 or 32).
    bench_case(1, 32, 32, 200, 200, 3, 4);
    bench_case(1, 32, 32, 200, 200, 3, 32);
    bench_case(1, 32, 32, 128, 128, 3, 4);
    bench_case(1, 32, 32, 128, 128, 3, 32);
    bench_case(1, 32, 32, 100, 100, 3, 4);
    bench_case(1, 32, 32, 100, 100, 3, 32);
    bench_case(1, 32, 32, 80, 80, 3, 4);
    bench_case(1, 32, 32, 80, 80, 3, 32);
    std::string algo_name = "F32DIRECT";
    printf("Benchmark F32DIRECT_LARGE_GROUP algo\n");
    std::vector<DType> data_type = {
            dtype::Float32(), dtype::Float32(), dtype::Float32(), dtype::Float32()};
    // 4 threads vs 1 thread on different core affinities, then 2 vs 1.
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
    shapes_and_computation.clear();
    // Small-group sweep (group = 1) with the same algorithm.
    algo_name = "F32DIRECT";
    printf("Benchmark F32DIRECT_SMALL_GROUP algo\n");
    bench_case(1, 32, 32, 200, 200, 3, 1);
    bench_case(1, 32, 32, 128, 128, 3, 1);
    bench_case(1, 32, 32, 100, 100, 3, 1);
    bench_case(1, 32, 32, 80, 80, 3, 1);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
}
// Benchmark F32STRD1: identical shape sweep to the F32DIRECT benchmark above
// but pinned to the stride-1 specialised algorithm.
TEST_F(FALLBACK_MULTI_THREADS, BENCHMARK_GI_CONVBIAS_DIRECTF32_STR1) {
    constexpr size_t RUNS = 50;
    param::ConvBias param;
    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
    param.pad_h = 1;
    param.pad_w = 1;
    param.stride_h = 1;
    param.stride_w = 1;
    param.sparse = param::ConvBias::Sparse::GROUP;
    std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
    // Output H/W equal input H/W (pad=1, stride=1, FS=3); computations is
    // MACs*2 + bias adds, scaled by 1e-6.
    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
                          size_t group) {
        SmallVector<TensorShape> shapes{
                {N, IC, H, W},
                {group, OC / group, IC / group, FS, FS},
                {1, OC, 1, 1},
                {},
                {N, OC, H, W}};
        TensorShape dst{N, OC, H, W};
        float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
                              dst.total_nr_elems()) *
                             1e-6;
        shapes_and_computation.push_back(std::make_pair(shapes, computations));
    };
    // Large-group sweep (group = 4 or 32).
    bench_case(1, 32, 32, 200, 200, 3, 4);
    bench_case(1, 32, 32, 200, 200, 3, 32);
    bench_case(1, 32, 32, 128, 128, 3, 4);
    bench_case(1, 32, 32, 128, 128, 3, 32);
    bench_case(1, 32, 32, 100, 100, 3, 4);
    bench_case(1, 32, 32, 100, 100, 3, 32);
    bench_case(1, 32, 32, 80, 80, 3, 4);
    bench_case(1, 32, 32, 80, 80, 3, 32);
    std::string algo_name = "F32STRD1";
    printf("Benchmark F32STRD1_LARGE_GROUP algo\n");
    std::vector<DType> data_type = {
            dtype::Float32(), dtype::Float32(), dtype::Float32(), dtype::Float32()};
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
    shapes_and_computation.clear();
    // Small-group sweep (group = 1).
    algo_name = "F32STRD1";
    printf("Benchmark F32STRD1_SMALL_GROUP algo\n");
    bench_case(1, 32, 32, 200, 200, 3, 1);
    bench_case(1, 32, 32, 128, 128, 3, 1);
    bench_case(1, 32, 32, 100, 100, 3, 1);
    bench_case(1, 32, 32, 80, 80, 3, 1);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
}
  622. TEST_F(FALLBACK_MULTI_THREADS, BENCHMARK_GI_CONVBIAS_DIRECTF32_STR2) {
  623. constexpr size_t RUNS = 50;
  624. param::ConvBias param;
  625. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  626. param.pad_h = 1;
  627. param.pad_w = 1;
  628. param.stride_h = 2;
  629. param.stride_w = 2;
  630. param.sparse = param::ConvBias::Sparse::GROUP;
  631. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  632. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  633. size_t group, size_t P, size_t S) {
  634. SmallVector<TensorShape> shapes{
  635. {N, IC, H, W},
  636. {group, OC / group, IC / group, FS, FS},
  637. {1, OC, 1, 1},
  638. {},
  639. {N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1}};
  640. TensorShape dst{N, OC, H, W};
  641. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  642. dst.total_nr_elems()) *
  643. 1e-6;
  644. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  645. };
  646. bench_case(1, 32, 32, 200, 200, 3, 4, 1, 2);
  647. bench_case(1, 32, 32, 200, 200, 3, 32, 1, 2);
  648. bench_case(1, 32, 32, 128, 128, 3, 4, 1, 2);
  649. bench_case(1, 32, 32, 128, 128, 3, 32, 1, 2);
  650. bench_case(1, 32, 32, 100, 100, 3, 4, 1, 2);
  651. bench_case(1, 32, 32, 100, 100, 3, 32, 1, 2);
  652. bench_case(1, 32, 32, 80, 80, 3, 4, 1, 2);
  653. bench_case(1, 32, 32, 80, 80, 3, 32, 1, 2);
  654. std::string algo_name = "F32STRD2";
  655. printf("Benchmark F32STRD2_LARGE_GROUP algo\n");
  656. std::vector<DType> data_type = {
  657. dtype::Float32(), dtype::Float32(), dtype::Float32(), dtype::Float32()};
  658. benchmark_impl(
  659. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  660. data_type);
  661. benchmark_impl(
  662. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  663. data_type);
  664. benchmark_impl(
  665. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  666. data_type);
  667. shapes_and_computation.clear();
  668. algo_name = "F32STRD2";
  669. printf("Benchmark F32STRD2_SMALL_GROUP algo\n");
  670. bench_case(1, 32, 32, 200, 200, 3, 1, 1, 2);
  671. bench_case(1, 32, 32, 128, 128, 3, 1, 1, 2);
  672. bench_case(1, 32, 32, 100, 100, 3, 1, 1, 2);
  673. bench_case(1, 32, 32, 80, 80, 3, 1, 1, 2);
  674. benchmark_impl(
  675. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  676. data_type);
  677. benchmark_impl(
  678. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  679. data_type);
  680. benchmark_impl(
  681. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  682. data_type);
  683. }
// Compare channel-wise stride-1 conv: generic NCHW F32STRD1 (benchmark0) vs
// the NCHW44-specialised F32_CHANNEL_WISE_NCHW44 kernel (benchmark1), and
// print the per-case speedup.
TEST_F(FALLBACK, BENCHMARK_GI_CHANNEL_WISE_F32_STRIDE1_NCHW44) {
    // have to remove preferred restrict in usable func before run the benchmark
    using namespace conv_bias;
    param::ConvBias param;
    param.stride_h = 1;
    param.stride_w = 1;
    param.pad_h = 1;
    param.pad_w = 1;
    param.nonlineMode = NonlineMode::RELU;
    param.sparse = param::ConvBias::Sparse::GROUP;
    constexpr size_t RUN = 50;
    // Baseline: generic NCHW stride-1 kernel.
    Benchmarker<ConvBias> benchmark0(handle());
    benchmark0.set_display(false);
    benchmark0.set_param(param);
    benchmark0.set_times(RUN);
    benchmark0.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("F32STRD1"));
    // Operator used only to deduce the output layout for the FLOP estimate.
    // NOTE(review): the deduce call below passes Int8/Int32 dtypes although the
    // benchmark runs float32 — layout deduction only uses shapes, but confirm.
    auto opr = handle()->create_operator<ConvBias>();
    opr->param() = param;
    // Candidate: NCHW44 channel-wise kernel (same param except format).
    param.format = param::ConvBias::Format::NCHW44;
    Benchmarker<ConvBias> benchmark1(handle());
    benchmark1.set_display(false);
    benchmark1.set_param(param);
    benchmark1.set_times(RUN);
    benchmark1.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("F32_CHANNEL_WISE_NCHW44"));
    auto run = [&](size_t group, size_t w, size_t h, size_t kernel) {
        TensorLayout dst_layout;
        opr->deduce_layout(
                {{1, group * 4, h, w}, dtype::Int8()},
                {{group * 4, 1, 1, kernel, kernel}, dtype::Int8()},
                {{1, group * 4, 1, 1}, dtype::Int32()}, {}, dst_layout);
        //! dst.nr_elems * IC * FH * FW * 2
        float computations = dst_layout.total_nr_elems() * kernel * kernel * 2.0 /
                             (1024 * 1024 * 1024) * 1e3;
        // NCHW shapes for the baseline; equivalent NCHW44 shapes below.
        auto used0 = benchmark0.exec(
                             {{1, group * 4, h, w},
                              {group * 4, 1, 1, kernel, kernel},
                              {1, group * 4, 1, 1},
                              {},
                              {}}) /
                     RUN;
        auto used1 = benchmark1.exec(
                             {{1, group, h, w, 4},
                              {group, 1, 1, kernel, kernel, 4},
                              {1, group, 1, 1, 4},
                              {},
                              {}}) /
                     RUN;
        printf("group/h/w/kernel:%zu,%zu,%zu,%zu: nchw: %f ms %f Gflops "
               "nchw44: "
               "%f ms %f GFlops "
               "speedup: %f\n",
               group, h, w, kernel, used0, computations / used0, used1,
               computations / used1, used0 / used1);
    };
    // NOTE(review): loop variable "kerenl" is a typo for "kernel".
    for (size_t group : {8, 16, 32, 64}) {
        for (size_t kerenl : {2, 3, 5}) {
            run(group, 112, 112, kerenl);
            run(group, 56, 56, kerenl);
            run(group, 48, 48, kerenl);
            run(group, 28, 28, kerenl);
            run(group, 14, 14, kerenl);
        }
    }
    // A few extra MobileNet-like cases.
    run(8, 112, 112, 3);
    run(32, 56, 56, 3);
    run(64, 28, 28, 3);
    run(128, 14, 14, 3);
}
  754. TEST_F(FALLBACK, BENCHMARK_GI_CHANNEL_WISE_F32_STRIDE2_NCHW44) {
  755. // have to remove preferred restrict in usable func before run the benchmark
  756. using namespace conv_bias;
  757. param::ConvBias param;
  758. param.stride_h = 2;
  759. param.stride_w = 2;
  760. param.pad_h = 1;
  761. param.pad_w = 1;
  762. param.nonlineMode = NonlineMode::RELU;
  763. param.sparse = param::ConvBias::Sparse::GROUP;
  764. constexpr size_t RUN = 50;
  765. Benchmarker<ConvBias> benchmark0(handle());
  766. benchmark0.set_display(false);
  767. benchmark0.set_param(param);
  768. benchmark0.set_times(RUN);
  769. benchmark0.set_before_exec_callback(
  770. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("F32STRD2"));
  771. auto opr = handle()->create_operator<ConvBias>();
  772. opr->param() = param;
  773. param.format = param::ConvBias::Format::NCHW44;
  774. Benchmarker<ConvBias> benchmark1(handle());
  775. benchmark1.set_display(false);
  776. benchmark1.set_param(param);
  777. benchmark1.set_times(RUN);
  778. benchmark1.set_before_exec_callback(
  779. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("F32_CHANNEL_WISE_NCHW44"));
  780. auto run = [&](size_t group, size_t w, size_t h, size_t kernel) {
  781. TensorLayout dst_layout;
  782. opr->deduce_layout(
  783. {{1, group * 4, h, w}, dtype::Int8()},
  784. {{group * 4, 1, 1, kernel, kernel}, dtype::Int8()},
  785. {{1, group * 4, 1, 1}, dtype::Int32()}, {}, dst_layout);
  786. //! dst.nr_elems * IC * FH * FW * 2
  787. float computations = dst_layout.total_nr_elems() * kernel * kernel * 2.0 /
  788. (1024 * 1024 * 1024) * 1e3;
  789. auto used0 = benchmark0.exec(
  790. {{1, group * 4, h, w},
  791. {group * 4, 1, 1, kernel, kernel},
  792. {1, group * 4, 1, 1},
  793. {},
  794. {}}) /
  795. RUN;
  796. auto used1 = benchmark1.exec(
  797. {{1, group, h, w, 4},
  798. {group, 1, 1, kernel, kernel, 4},
  799. {1, group, 1, 1, 4},
  800. {},
  801. {}}) /
  802. RUN;
  803. printf("group/h/w/kernel:%zu,%zu,%zu,%zu: nchw: %f ms %f Gflops "
  804. "nchw44: "
  805. "%f ms %f GFlops "
  806. "speedup: %f\n",
  807. group, h, w, kernel, used0, computations / used0, used1,
  808. computations / used1, used0 / used1);
  809. };
  810. for (size_t group : {8, 16, 32, 64}) {
  811. for (size_t kerenl : {2, 3, 5}) {
  812. run(group, 112, 112, kerenl);
  813. run(group, 56, 56, kerenl);
  814. run(group, 48, 48, kerenl);
  815. run(group, 28, 28, kerenl);
  816. run(group, 14, 14, kerenl);
  817. }
  818. }
  819. run(8, 112, 112, 3);
  820. run(32, 56, 56, 3);
  821. run(64, 28, 28, 3);
  822. run(128, 14, 14, 3);
  823. }
  824. TEST_F(FALLBACK, BENCHMARK_CONVBIAS) {
  825. constexpr size_t RUNS = 10;
  826. param::ConvBias param;
  827. param.stride_h = 1;
  828. param.stride_w = 1;
  829. Benchmarker<ConvBias> benchmarker_int(handle());
  830. benchmarker_int.set_times(RUNS)
  831. .set_dtype(0, dtype::QuantizedS8(2.5f))
  832. .set_dtype(1, dtype::QuantizedS8(2.5f))
  833. .set_dtype(2, dtype::QuantizedS32(6.25f))
  834. .set_dtype(4, dtype::QuantizedS8(40.25f))
  835. .set_display(false);
  836. Benchmarker<ConvBias> benchmarker_float(handle());
  837. benchmarker_float.set_display(false).set_times(RUNS);
  838. auto run = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS) {
  839. TensorShape src({N, IC, H, W}), filter({OC, IC, FS, FS}), bias({N, OC, 1, 1}),
  840. z({}), dst({N, OC, H, W});
  841. param.pad_h = FS / 2;
  842. param.pad_w = FS / 2;
  843. auto int_used =
  844. benchmarker_int.set_param(param).exec({src, filter, bias, z, dst}) /
  845. RUNS;
  846. auto float_used =
  847. benchmarker_float.set_param(param).exec({src, filter, bias, z, dst}) /
  848. RUNS;
  849. float computations = IC * (FS * FS + 1) * dst.total_nr_elems() * 2 * 1e-6;
  850. printf("run: %s %s %s->%s \nfloat: %f ms %f Gflops int: %f ms "
  851. "%f Gflops speedup: %f\n",
  852. src.to_string().c_str(), filter.to_string().c_str(),
  853. bias.to_string().c_str(), dst.to_string().c_str(), float_used,
  854. computations / float_used, int_used, computations / int_used,
  855. float_used / int_used);
  856. };
  857. run(1, 128, 128, 32, 32, 3);
  858. for (size_t IC : {32, 64, 128}) {
  859. for (size_t OC : {32, 64, 128}) {
  860. for (size_t size : {28, 56}) {
  861. for (size_t FS : {3, 5}) {
  862. run(1, IC, OC, size, size, FS);
  863. }
  864. }
  865. }
  866. }
  867. }
  868. TEST_F(FALLBACK, BENCHMARK_GI_CONVBIAS_WINOGRAD_F23_4x4) {
  869. #if MEGDNN_AARCH64
  870. conv_bias::benchmark_winograd("WINOGRAD:AARCH64_F32_MK4_4x16:4:2", handle(), 3, 4);
  871. #elif MEGDNN_ARMV7
  872. conv_bias::benchmark_winograd("WINOGRAD:ARMV7_F32_MK4_4x8:4:2", handle(), 3, 4);
  873. #else
  874. conv_bias::benchmark_winograd("WINOGRAD:FB_GI_F32_MK4_4x8:4:2", handle(), 3, 4);
  875. #endif
  876. }
  877. void benchmark_winograd_nchw_vs_nchw44(
  878. const char* algo_name0, const char* algo_name1, Handle* handle) {
  879. using namespace conv_bias;
  880. using NLMode = param::ConvBias::NonlineMode;
  881. std::vector<conv_bias::TestArg> args_nchw44;
  882. std::vector<conv_bias::TestArg> args_nchw;
  883. auto pack = [&](size_t n, size_t oc, size_t ic, size_t h, size_t w, size_t group,
  884. NLMode nlmode) {
  885. param::ConvBias param;
  886. param.format = param::ConvBias::Format::NCHW44;
  887. param.stride_h = 1;
  888. param.stride_w = 1;
  889. param.pad_h = 1;
  890. param.pad_w = 1;
  891. param.nonlineMode = nlmode;
  892. if (group == 1) {
  893. param.sparse = param::ConvBias::Sparse::DENSE;
  894. args_nchw44.emplace_back(
  895. param, TensorShape{n, ic / 4, h, w, 4},
  896. TensorShape{oc / 4, ic / 4, 3, 3, 4, 4}, TensorShape{});
  897. param.format = param::ConvBias::Format::NCHW;
  898. args_nchw.emplace_back(
  899. param, TensorShape{n, ic, h, w}, TensorShape{oc, ic, 3, 3},
  900. TensorShape{});
  901. } else {
  902. auto oc_per_group = oc / group;
  903. auto ic_per_group = ic / group;
  904. param.sparse = param::ConvBias::Sparse::GROUP;
  905. args_nchw44.emplace_back(
  906. param, TensorShape{n, ic_per_group / 4, h, w, 4},
  907. TensorShape{group, oc_per_group / 4, ic_per_group / 4, 3, 3, 4, 4},
  908. TensorShape{});
  909. param.format = param::ConvBias::Format::NCHW;
  910. args_nchw.emplace_back(
  911. param, TensorShape{n, ic, h, w},
  912. TensorShape{group, oc_per_group, ic_per_group, 3, 3},
  913. TensorShape{});
  914. }
  915. };
  916. std::vector<NLMode> nonlinemode = {NLMode::IDENTITY};
  917. for (auto nlmode : nonlinemode)
  918. for (size_t n : {1})
  919. for (size_t group = 1; group <= 1; ++group) {
  920. pack(n, 512, 512, 15, 15, group, nlmode);
  921. pack(n, 512, 256, 15, 15, group, nlmode);
  922. pack(n, 256, 256, 29, 29, group, nlmode);
  923. pack(n, 256, 128, 29, 29, group, nlmode);
  924. pack(n, 128, 128, 57, 57, group, nlmode);
  925. pack(n, 128, 64, 57, 57, group, nlmode);
  926. pack(n, 24, 24, 224, 224, group, nlmode);
  927. pack(n, 64, 24, 123, 123, group, nlmode);
  928. pack(n, 64, 64, 56, 56, group, nlmode);
  929. pack(n, 128, 128, 28, 28, group, nlmode);
  930. pack(n, 256, 256, 14, 14, group, nlmode);
  931. pack(n, 512, 512, 7, 7, group, nlmode);
  932. }
  933. using namespace conv_bias;
  934. constexpr size_t RUN = 10;
  935. Benchmarker<ConvBias> benchmark_winograd_nchw(handle);
  936. benchmark_winograd_nchw.set_display(false);
  937. benchmark_winograd_nchw.set_times(RUN);
  938. Benchmarker<ConvBias> benchmark_winograd_nchw44(handle);
  939. benchmark_winograd_nchw44.set_display(false);
  940. benchmark_winograd_nchw44.set_times(RUN);
  941. std::string winograd_nchw_algo_name = ssprintf("WINOGRAD:%s", algo_name0);
  942. std::string winograd_nchw44_algo_name = ssprintf("WINOGRAD_NCHW44:%s", algo_name1);
  943. for (size_t i = 0; i < args_nchw.size(); ++i) {
  944. auto arg_nchw = args_nchw[i];
  945. auto arg_nchw44 = args_nchw44[i];
  946. TensorLayout dst_layout;
  947. auto opr = handle->create_operator<ConvBias>();
  948. opr->param() = arg_nchw.param;
  949. opr->deduce_layout(
  950. {arg_nchw.src, dtype::Float32()}, {arg_nchw.filter, dtype::Float32()},
  951. {arg_nchw.bias, dtype::Float32()}, {}, dst_layout);
  952. //! dst.nr_elems * IC * FH * FW * 2
  953. float computations = dst_layout.total_nr_elems() * arg_nchw.filter[1] *
  954. arg_nchw.filter[2] * arg_nchw.filter[3] * 2.0 /
  955. (1024 * 1024 * 1024) * 1e3;
  956. benchmark_winograd_nchw.set_param(arg_nchw.param);
  957. auto nchw_used = algo_benchmark<ConvBias>(
  958. benchmark_winograd_nchw,
  959. {arg_nchw.src, arg_nchw.filter, {}, {}, {}},
  960. winograd_nchw_algo_name.c_str()) /
  961. RUN;
  962. benchmark_winograd_nchw44.set_param(arg_nchw44.param);
  963. auto nchw44_used = algo_benchmark<ConvBias>(
  964. benchmark_winograd_nchw44,
  965. {arg_nchw44.src, arg_nchw44.filter, {}, {}, {}},
  966. winograd_nchw44_algo_name.c_str()) /
  967. RUN;
  968. printf("%s %s: nchw: %f ms %f Gflops nchw44: %f ms %f GFlops "
  969. "speedup: "
  970. "%f\n",
  971. arg_nchw.src.to_string().c_str(), arg_nchw.filter.to_string().c_str(),
  972. nchw_used, computations / nchw_used, nchw44_used,
  973. computations / nchw44_used, nchw_used / nchw44_used);
  974. }
  975. }
  976. TEST_F(FALLBACK, BENCHMARK_GI_CONVBIAS_WINOGRAD_F23_MK4_NCHW_VS_NCHW44) {
  977. #if MEGDNN_AARCH64
  978. benchmark_winograd_nchw_vs_nchw44(
  979. "AARCH64_F32_MK4_4x16:4:2", "AARCH64_F32_MK4_4x16:4:2", handle());
  980. #elif MEGDNN_ARMV7
  981. benchmark_winograd_nchw_vs_nchw44(
  982. "ARMV7_F32_MK4_4x8:4:2", "ARMV7_F32_MK4_4x8:4:2", handle());
  983. #else
  984. benchmark_winograd_nchw_vs_nchw44(
  985. "FB_GI_F32_MK4_4x8:4:2", "FB_GI_F32_MK4_4x8:4:2", handle());
  986. #endif
  987. }
  988. TEST_F(FALLBACK, BENCHMARK_GI_CONVBIAS_WINOGRAD_F63_4x4) {
  989. #if MEGDNN_AARCH64
  990. conv_bias::benchmark_winograd("WINOGRAD:AARCH64_F32_MK4_4x16:4:6", handle(), 3, 4);
  991. #elif MEGDNN_ARMV7
  992. conv_bias::benchmark_winograd("WINOGRAD:ARMV7_F32_MK4_4x8:4:6", handle(), 3, 4);
  993. #else
  994. conv_bias::benchmark_winograd("WINOGRAD:FB_GI_F32_MK4_4x8:4:6", handle(), 3, 4);
  995. #endif
  996. }
  997. TEST_F(FALLBACK, BENCHMARK_GI_CONVBIAS_WINOGRAD_F63_MK4_NCHW_VS_NCHW44) {
  998. #if MEGDNN_AARCH64
  999. benchmark_winograd_nchw_vs_nchw44(
  1000. "AARCH64_F32_MK4_4x16:4:6", "AARCH64_F32_MK4_4x16:4:6", handle());
  1001. #elif MEGDNN_ARMV7
  1002. benchmark_winograd_nchw_vs_nchw44(
  1003. "ARMV7_F32_MK4_4x8:4:6", "ARMV7_F32_MK4_4x8:4:6", handle());
  1004. #else
  1005. benchmark_winograd_nchw_vs_nchw44(
  1006. "FB_GI_F32_MK4_4x8:4:6", "FB_GI_F32_MK4_4x8:4:6", handle());
  1007. #endif
  1008. }
  1009. #endif
  1010. } // namespace test
  1011. } // namespace megdnn
  1012. // vim: syntax=cpp.doxygen