You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

conv_bias.cpp 52 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306
  1. #include "test/common/conv_bias.h"
  2. #include "megdnn/opr_param_defs.h"
  3. #include "megdnn/oprs.h"
  4. #include "test/common/benchmarker.h"
  5. #include "test/common/checker.h"
  6. #include "test/common/rng.h"
  7. #include "test/common/task_record_check.h"
  8. #include "test/common/tensor.h"
  9. #include "test/fallback/fixture.h"
  10. #if MEGDNN_X86
  11. #include "src/x86/utils.h"
  12. #endif
  13. namespace megdnn {
  14. namespace test {
  15. TEST_F(FALLBACK, CONV_BIAS_FORWARD) {
  16. using namespace conv_bias;
  17. std::vector<TestArg> args = get_args();
  18. Checker<ConvBiasForward> checker(handle());
  19. NormalRNG default_rng;
  20. UniformIntRNG int_rng{-50, 50};
  21. param::ConvBias param;
  22. {
  23. param.format = param::ConvBias::Format::NHWC;
  24. auto src_shape = TensorShape{2, 16, 32, 24};
  25. auto filter_shape = TensorShape{4, 3, 3, 24};
  26. auto bias_shape_channel = TensorShape{1, 1, 1, 4};
  27. checker.set_dtype(0, dtype::Float32())
  28. .set_dtype(1, dtype::Float32())
  29. .set_dtype(2, dtype::Float32())
  30. .set_rng(0, &default_rng)
  31. .set_rng(1, &default_rng)
  32. .set_rng(2, &default_rng)
  33. .set_param(param)
  34. .execs({src_shape, filter_shape, bias_shape_channel, {}, {}});
  35. }
  36. checker.set_before_exec_callback(
  37. conv_bias::ConvBiasAlgoChecker<ConvBias>("FALLBACK_NAIVE"));
  38. for (auto&& arg : args) {
  39. checker.set_dtype(0, dtype::Float32())
  40. .set_dtype(1, dtype::Float32())
  41. .set_dtype(2, dtype::Float32())
  42. .set_rng(0, &default_rng)
  43. .set_rng(1, &default_rng)
  44. .set_rng(2, &default_rng)
  45. .set_epsilon(1e-3)
  46. .set_param(arg.param)
  47. .execs({arg.src, arg.filter, arg.bias, {}, {}});
  48. }
  49. {
  50. param.format = param::ConvBias::Format::NCHW;
  51. param.sparse = ConvBias::Param::Sparse::GROUP;
  52. auto src_shape = TensorShape{2, 16, 32, 24};
  53. auto filter_shape = TensorShape{4, 4, 4, 1, 1};
  54. auto bias_shape_channel = TensorShape{1, 16, 1, 1};
  55. auto bias_shape = TensorShape{2, 16, 32, 24};
  56. checker.set_dtype(0, dtype::Float32())
  57. .set_dtype(1, dtype::Float32())
  58. .set_dtype(2, dtype::Float32())
  59. .set_rng(0, &default_rng)
  60. .set_rng(1, &default_rng)
  61. .set_rng(2, &default_rng)
  62. .set_param(param)
  63. .execs({src_shape, filter_shape, bias_shape, {}, {}})
  64. .execs({src_shape, filter_shape, bias_shape_channel, {}, {}});
  65. }
  66. }
  67. TEST_F(FALLBACK, CONV_BIAS_FORWARD_RECORD) {
  68. using namespace conv_bias;
  69. TaskRecordChecker<ConvBiasForward> checker(1);
  70. NormalRNG default_rng;
  71. UniformIntRNG int_rng{-50, 50};
  72. param::ConvBias param;
  73. {
  74. param.format = param::ConvBias::Format::NHWC;
  75. auto src_shape = TensorShape{2, 16, 32, 24};
  76. auto filter_shape = TensorShape{4, 3, 3, 24};
  77. auto bias_shape_channel = TensorShape{1, 1, 1, 4};
  78. checker.set_dtype(0, dtype::Float32())
  79. .set_dtype(1, dtype::Float32())
  80. .set_dtype(2, dtype::Float32())
  81. .set_rng(0, &default_rng)
  82. .set_rng(1, &default_rng)
  83. .set_rng(2, &default_rng)
  84. .set_param(param)
  85. .execs({src_shape, filter_shape, bias_shape_channel, {}, {}});
  86. }
  87. {
  88. param.format = param::ConvBias::Format::NCHW;
  89. param.sparse = ConvBias::Param::Sparse::GROUP;
  90. auto src_shape = TensorShape{2, 16, 32, 24};
  91. auto filter_shape = TensorShape{4, 4, 4, 1, 1};
  92. auto bias_shape_channel = TensorShape{1, 16, 1, 1};
  93. auto bias_shape = TensorShape{2, 16, 32, 24};
  94. checker.set_dtype(0, dtype::Float32())
  95. .set_dtype(1, dtype::Float32())
  96. .set_dtype(2, dtype::Float32())
  97. .set_rng(0, &default_rng)
  98. .set_rng(1, &default_rng)
  99. .set_rng(2, &default_rng)
  100. .set_param(param)
  101. .execs({src_shape, filter_shape, bias_shape, {}, {}})
  102. .execs({src_shape, filter_shape, bias_shape_channel, {}, {}});
  103. }
  104. }
  105. TEST_F(FALLBACK, FP32_GEMV_MK4_GI) {
  106. Checker<MatrixMul> checker(handle());
  107. using Param = MatrixMul::Param;
  108. checker.set_before_exec_callback(AlgoChecker<MatrixMul>("FB_GI_F32_GEMV_MK4"));
  109. checker.set_epsilon(1e-2);
  110. auto run = [&](size_t M, size_t K) {
  111. Param param;
  112. param.format = param::MatrixMul::Format::MK4;
  113. param.transposeA = false;
  114. param.transposeB = false;
  115. TensorShape A, B;
  116. A = TensorShape{M / 4, K / 4, 4, 4};
  117. B = TensorShape{K / 4, 1, 4};
  118. checker.set_param(param).execs({A, B, {}});
  119. };
  120. // N = 1
  121. for (size_t M : {4, 16, 128, 1024})
  122. for (size_t K : {4, 8, 12, 128, 256, 4096})
  123. run(M, K);
  124. }
  125. std::vector<conv_bias::TestArg> get_conv_bias_args(
  126. std::vector<size_t> kernel, std::vector<size_t> padv,
  127. std::vector<param::ConvBias::NonlineMode> nlmodev, std::vector<size_t> stridev,
  128. bool no_bias, bool only_broadbias) {
  129. using namespace conv_bias;
  130. using Param = param::ConvBias;
  131. using NLMode = param::ConvBias::NonlineMode;
  132. std::vector<TestArg> args;
  133. auto pack = [&](size_t n, size_t oc, size_t ic, size_t w, size_t h, size_t pad,
  134. size_t kernel, size_t stride, NLMode nonlinemode) {
  135. Param param;
  136. param.stride_h = stride;
  137. param.stride_w = stride;
  138. param.pad_h = pad;
  139. param.pad_w = pad;
  140. param.nonlineMode = nonlinemode;
  141. args.emplace_back(
  142. param, TensorShape{n, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
  143. TensorShape{});
  144. if (!no_bias) {
  145. args.emplace_back(
  146. param, TensorShape{n, ic, h, w},
  147. TensorShape{oc, ic, kernel, kernel}, TensorShape{1, oc, 1, 1});
  148. if (!only_broadbias) {
  149. args.emplace_back(
  150. param, TensorShape{n, ic, h, w},
  151. TensorShape{oc, ic, kernel, kernel},
  152. TensorShape{
  153. n, oc, (h + 2 * param.pad_h - kernel) / stride + 1,
  154. (w + 2 * param.pad_h - kernel) / stride + 1});
  155. }
  156. }
  157. };
  158. auto pack_group = [&](size_t n, size_t oc, size_t ic, size_t w, size_t h,
  159. size_t pad, size_t kernel, size_t stride,
  160. NLMode nonlinemode) {
  161. Param param;
  162. param.stride_h = stride;
  163. param.stride_w = stride;
  164. param.pad_h = pad;
  165. param.pad_w = pad;
  166. param.nonlineMode = nonlinemode;
  167. param.sparse = param::ConvBias::Sparse::GROUP;
  168. args.emplace_back(
  169. param, TensorShape{n, 2 * ic, h, w},
  170. TensorShape{2, oc, ic, kernel, kernel}, TensorShape{});
  171. if (!no_bias) {
  172. args.emplace_back(
  173. param, TensorShape{n, 2 * ic, h, w},
  174. TensorShape{2, oc, ic, kernel, kernel},
  175. TensorShape{1, oc * 2, 1, 1});
  176. if (!only_broadbias) {
  177. args.emplace_back(
  178. param, TensorShape{n, 2 * ic, h, w},
  179. TensorShape{2, oc, ic, kernel, kernel},
  180. TensorShape{
  181. n, 2 * oc, (h + 2 * param.pad_h - kernel) / stride + 1,
  182. (w + 2 * param.pad_h - kernel) / stride + 1});
  183. }
  184. }
  185. };
  186. for (size_t n : {1, 2}) {
  187. for (auto nlmode : nlmodev) {
  188. for (auto pad : padv) {
  189. for (auto stride : stridev) {
  190. for (size_t ic : {1, 5}) {
  191. for (size_t oc : {1, 11}) {
  192. for (size_t size : {9, 30}) {
  193. for (size_t kern : kernel) {
  194. pack(n, oc, ic, size + 4, size + 4, pad, kern,
  195. stride, nlmode);
  196. pack_group(
  197. n, oc, ic, size, size, pad, kern, stride,
  198. nlmode);
  199. }
  200. }
  201. }
  202. }
  203. }
  204. }
  205. }
  206. }
  207. return args;
  208. }
  209. void checker_conv_bias(
  210. std::vector<conv_bias::TestArg> args, Handle* handle, RNG* rng, float epsilon,
  211. DType type0, DType type1, DType type2, DType type3, const char* algo_name) {
  212. using namespace conv_bias;
  213. Checker<ConvBias> checker(handle);
  214. checker.set_before_exec_callback(
  215. conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name));
  216. checker.set_dtype(0, type0);
  217. checker.set_dtype(1, type1);
  218. checker.set_dtype(2, type2);
  219. checker.set_dtype(4, type3);
  220. checker.set_epsilon(epsilon);
  221. if (NULL != rng) {
  222. checker.set_rng(0, rng).set_rng(1, rng).set_rng(2, rng).set_rng(3, rng);
  223. }
  224. for (auto&& arg : args) {
  225. checker.set_param(arg.param).execs({arg.src, arg.filter, arg.bias, {}, {}});
  226. }
  227. }
  228. TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_GI_1X1_S1_MK4_PACK_F32) {
  229. using namespace conv_bias;
  230. std::vector<conv_bias::TestArg> args =
  231. get_nchw44_conv_bias_args({1}, FULL_NLMODE, ALL_BIASMODE, 1, true);
  232. check_conv_bias(args, handle(), "CONV1x1:FB_GI_F32_MK4_PACK_4x12:24");
  233. }
  234. TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_GI_IM2COL_S1_MK4_PACK_F32_PREPROCESS) {
  235. using namespace conv_bias;
  236. std::vector<conv_bias::TestArg> args =
  237. get_nchw44_conv_bias_args({2, 4, 7}, FULL_NLMODE, BR_AND_NO_BIASMODE, 1);
  238. #define cb(name) \
  239. check_conv_bias_preprocess( \
  240. args, handle(), nullptr, 0.001, dtype::Float32(), dtype::Float32(), \
  241. dtype::Float32(), dtype::Float32(), name);
  242. cb("IM2COLMATMUL:FB_GI_F32_MK4_PACK_4x12");
  243. #undef cb
  244. }
  245. TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_GI_IM2COL_S2_MK4_PACK_F32_FUSE_PREPROCESS) {
  246. using namespace conv_bias;
  247. std::vector<conv_bias::TestArg> args =
  248. get_nchw44_conv_bias_args({3}, FULL_NLMODE, BR_AND_BIAS_BIASMODE, 2);
  249. #define cb(name) \
  250. check_conv_bias_preprocess( \
  251. args, handle(), nullptr, 0.001, dtype::Float32(), dtype::Float32(), \
  252. dtype::Float32(), dtype::Float32(), name);
  253. cb("IM2COLMATMUL:FB_GI_F32_MK4_PACK_4x12");
  254. #undef cb
  255. }
  256. TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_GI_1X1_S1_MK4_PACK_F32_PREPROCESS) {
  257. using namespace conv_bias;
  258. std::vector<conv_bias::TestArg> args =
  259. get_nchw44_conv_bias_args({1}, FULL_NLMODE, ALL_BIASMODE, 1, true);
  260. #define cb(name) \
  261. check_conv_bias_preprocess( \
  262. args, handle(), nullptr, 0.001, dtype::Float32(), dtype::Float32(), \
  263. dtype::Float32(), dtype::Float32(), name);
  264. cb("CONV1x1:FB_GI_F32_MK4_PACK_4x12:24");
  265. #undef cb
  266. }
  267. TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_GI_IM2COL_S1_MK4_PACK_F32) {
  268. using namespace conv_bias;
  269. std::vector<conv_bias::TestArg> args =
  270. get_nchw44_conv_bias_args({2, 4, 7}, FULL_NLMODE, BR_AND_BIAS_BIASMODE, 1);
  271. check_conv_bias(args, handle(), "IM2COLMATMUL:FB_GI_F32_MK4_PACK_4x12");
  272. }
  273. TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_GI_IM2COL_S2_MK4_PACK_F32) {
  274. using namespace conv_bias;
  275. std::vector<conv_bias::TestArg> args =
  276. get_nchw44_conv_bias_args({3, 5, 6}, FULL_NLMODE, BR_AND_BIAS_BIASMODE, 2);
  277. #define cb(name) check_conv_bias(args, handle(), name);
  278. cb("IM2COLMATMUL:FB_GI_F32_MK4_PACK_4x12");
  279. #undef cb
  280. }
  281. TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_GI_IM2COL_S2_MK4_PACK_F32_FUSE) {
  282. using namespace conv_bias;
  283. std::vector<conv_bias::TestArg> args =
  284. get_nchw44_conv_bias_args({3}, FULL_NLMODE, ALL_BIASMODE, 2);
  285. #define cb(name) check_conv_bias(args, handle(), name);
  286. cb("IM2COLMATMUL:FB_GI_F32_MK4_PACK_4x12");
  287. #undef cb
  288. }
  289. TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_FORWARD_IM2COL_8X8X16) {
  290. using namespace conv_bias;
  291. param::ConvBias cur_param;
  292. using NLMode = param::ConvBias::NonlineMode;
  293. std::vector<conv_bias::TestArg> args = get_conv_bias_args(
  294. {1, 3}, {0}, {NLMode::IDENTITY, NLMode::RELU}, {1}, false, true);
  295. NormalRNG default_rng;
  296. Checker<ConvBias> checker(handle());
  297. checker.set_dtype(0, dtype::Int8{});
  298. checker.set_dtype(1, dtype::Int8{});
  299. checker.set_dtype(2, dtype::Int16{});
  300. checker.set_dtype(4, dtype::Int16{});
  301. for (auto&& arg : args) {
  302. checker.set_param(arg.param).execs({arg.src, arg.filter, arg.bias, {}, {}});
  303. }
  304. }
  305. TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_FORWARD) {
  306. using namespace conv_bias;
  307. param::ConvBias cur_param;
  308. using NLMode = param::ConvBias::NonlineMode;
  309. std::vector<conv_bias::TestArg> args = get_conv_bias_args(
  310. {1, 3, 5}, {0, 3},
  311. {NLMode::IDENTITY, NLMode::H_SWISH, NLMode::SIGMOID, NLMode::RELU}, {1, 2},
  312. false, false);
  313. NormalRNG default_rng;
  314. checker_conv_bias(
  315. args, handle(), &default_rng, 1e-3, dtype::Float32{}, dtype::Float32{},
  316. dtype::Float32{}, dtype::Float32{}, "FALLBACK_NAIVE");
  317. }
  318. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_NCHW_NCHW44_F32_S2) {
  319. check_conv_bias(
  320. conv_bias::get_nchw44_conv_bias_args(
  321. {2, 3, 5, 7}, ONLY_IDENTITY_NLMODE, ONLY_BR_BIASMODE, 2, false,
  322. true),
  323. handle(), "F32_CONV_NCHW_NCHW44");
  324. }
  325. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_NCHW_NCHW44_F32_S1) {
  326. check_conv_bias(
  327. conv_bias::get_nchw44_conv_bias_args(
  328. {2, 3, 5, 7}, ONLY_IDENTITY_NLMODE, ONLY_BR_BIASMODE, 1, false,
  329. true),
  330. handle(), "F32_CONV_NCHW_NCHW44");
  331. }
  332. std::vector<conv_bias::TestArg> get_nchw44_channel_wise_args(
  333. std::vector<size_t> kernel, size_t stride, bool no_bias, bool no_nonlinemode,
  334. bool no_full_bias) {
  335. using namespace conv_bias;
  336. using Param = param::ConvBias;
  337. using NLMode = param::ConvBias::NonlineMode;
  338. std::vector<TestArg> args;
  339. auto pack = [&](size_t n, size_t group, size_t w, size_t h, size_t kernel,
  340. size_t stride, NLMode nlmode, bool pad) {
  341. Param param;
  342. param.stride_h = stride;
  343. param.stride_w = stride;
  344. if (pad) {
  345. param.pad_h = kernel / 2;
  346. param.pad_w = kernel / 2;
  347. } else {
  348. param.pad_h = 0;
  349. param.pad_w = 0;
  350. }
  351. param.nonlineMode = nlmode;
  352. param.format = param::ConvBias::Format::NCHW44;
  353. param.sparse = param::ConvBias::Sparse::GROUP;
  354. args.emplace_back(
  355. param, TensorShape{n, group, h, w, 4},
  356. TensorShape{group, 1, 1, kernel, kernel, 4}, TensorShape{});
  357. if (!no_bias) {
  358. args.emplace_back(
  359. param, TensorShape{n, group, h, w, 4},
  360. TensorShape{group, 1, 1, kernel, kernel, 4},
  361. TensorShape{1, group, 1, 1, 4});
  362. }
  363. if (!no_full_bias) {
  364. args.emplace_back(
  365. param, TensorShape{n, group, h, w, 4},
  366. TensorShape{group, 1, 1, kernel, kernel, 4},
  367. TensorShape{
  368. n, group, (h + 2 * param.pad_w - kernel) / stride + 1,
  369. (w + 2 * param.pad_w - kernel) / stride + 1, 4});
  370. }
  371. };
  372. std::vector<NLMode> nonlinemode = {NLMode::IDENTITY};
  373. if (!no_nonlinemode) {
  374. nonlinemode.emplace_back(NLMode::RELU);
  375. nonlinemode.emplace_back(NLMode::H_SWISH);
  376. }
  377. for (size_t n : {1, 2}) {
  378. for (auto nlmode : nonlinemode) {
  379. for (bool pad : {true}) {
  380. for (size_t group : {1, 2, 4, 7, 16}) {
  381. for (size_t size : {4, 6, 7, 9, 20}) {
  382. for (size_t kern : kernel) {
  383. pack(n, group, size, size, kern, stride, nlmode, pad);
  384. }
  385. }
  386. }
  387. }
  388. for (bool pad : {false}) {
  389. for (size_t group : {1, 2, 7, 16}) {
  390. for (size_t size : {7, 9, 20}) {
  391. for (size_t kern : kernel) {
  392. pack(n, group, size, size, kern, stride, nlmode, pad);
  393. }
  394. }
  395. }
  396. }
  397. }
  398. }
  399. return args;
  400. }
  401. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_CHANNEL_WISE_STRIDE1_FP32_NCHW44_1) {
  402. check_conv_bias(
  403. get_nchw44_channel_wise_args({2, 3}, 1, false, false, false), handle(),
  404. "F32_CHANNEL_WISE_NCHW44");
  405. }
  406. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_CHANNEL_WISE_STRIDE1_FP32_NCHW44_2) {
  407. check_conv_bias(
  408. get_nchw44_channel_wise_args({5}, 1, false, false, false), handle(),
  409. "F32_CHANNEL_WISE_NCHW44");
  410. }
  411. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_CHANNEL_WISE_STRIDE2_FP32_NCHW44) {
  412. check_conv_bias(
  413. get_nchw44_channel_wise_args({2, 3, 5}, 2, false, false, false), handle(),
  414. "F32_CHANNEL_WISE_NCHW44");
  415. }
  416. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_DIRECT_FP32_NCHW44_S1_K7) {
  417. //! k=7 s=1
  418. check_conv_bias(
  419. conv_bias::get_nchw44_conv_bias_args(
  420. {7}, ONLY_IDENTITY_NLMODE, BR_AND_NO_BIASMODE, 1),
  421. handle(), "F32_CONV_NCHW44_DIRECT");
  422. }
  423. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_DIRECT_FP32_NCHW44_S1_K2K3) {
  424. check_conv_bias(
  425. conv_bias::get_nchw44_conv_bias_args(
  426. {2, 3}, FULL_NLMODE, ONLY_BR_BIASMODE, 1),
  427. handle(), "F32_CONV_NCHW44_DIRECT");
  428. }
  429. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_DIRECT_FP32_NCHW44_S1_K5) {
  430. check_conv_bias(
  431. conv_bias::get_nchw44_conv_bias_args({5}, FULL_NLMODE, ONLY_BR_BIASMODE, 1),
  432. handle(), "F32_CONV_NCHW44_DIRECT");
  433. }
  434. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_DIRECT_FP32_NCHW44_S2) {
  435. check_conv_bias(
  436. conv_bias::get_nchw44_conv_bias_args(
  437. {2, 3, 5, 7}, FULL_NLMODE, ONLY_BR_BIASMODE, 2),
  438. handle(), "F32_CONV_NCHW44_DIRECT");
  439. }
  440. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_DIRECT_FP32) {
  441. check_conv_bias(
  442. conv_bias::get_conv_bias_args(
  443. {1, 2, 3, 4, 5, 6, 7}, 1, false, false, false),
  444. handle(), "F32DIRECT");
  445. }
  446. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_DIRECT_FP32_STR2) {
  447. check_conv_bias(
  448. conv_bias::get_conv_bias_args({2, 3, 5, 7}, 2, false, false, false),
  449. handle(), "F32STRD2");
  450. }
  451. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_DIRECT_FP32_STR1) {
  452. check_conv_bias(
  453. conv_bias::get_conv_bias_args({2, 3, 5, 7}, 1, false, false, false),
  454. handle(), "F32STRD1");
  455. }
  456. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F23_4) {
  457. using namespace conv_bias;
  458. std::vector<TestArg> args = get_winograd_mk_packed_args();
  459. Checker<ConvBiasForward> checker(handle());
  460. check_winograd("4:2:32", checker, args, param::MatrixMul::Format::MK4);
  461. }
  462. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F23_4_NCHW44) {
  463. using namespace conv_bias;
  464. std::vector<TestArg> args =
  465. get_nchw44_conv_bias_args({3}, QUAN_NLMODE, BR_AND_NO_BIASMODE, 1);
  466. Checker<ConvBiasForward> checker(handle());
  467. check_winograd(
  468. "4:2:32", checker, args, param::MatrixMul::Format::MK4,
  469. param::ConvBias::Format::NCHW44);
  470. }
  471. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F23_4_WEIGHT_PREPROCESS) {
  472. using namespace conv_bias;
  473. std::vector<TestArg> args = get_winograd_mk_packed_args();
  474. Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
  475. handle());
  476. check_winograd("4:2:32", checker, args, param::MatrixMul::Format::MK4);
  477. }
  478. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F23_4_NCHW44_WEIGHT_PREPROCESS) {
  479. using namespace conv_bias;
  480. std::vector<TestArg> args =
  481. get_nchw44_conv_bias_args({3}, QUAN_NLMODE, BR_AND_NO_BIASMODE, 1);
  482. Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
  483. handle());
  484. check_winograd(
  485. "4:2:32", checker, args, param::MatrixMul::Format::MK4,
  486. param::ConvBias::Format::NCHW44);
  487. }
  488. TEST_F(FALLBACK, CONVBIAS_GI_WINOGRAD_F63_4) {
  489. using namespace conv_bias;
  490. std::vector<TestArg> args = get_winograd_mk_packed_args();
  491. Checker<ConvBiasForward> checker(handle());
  492. check_winograd("4:6:16", checker, args, param::MatrixMul::Format::MK4);
  493. }
  494. TEST_F(FALLBACK, CONVBIAS_GI_WINOGRAD_F63_4_WEIGHT_PREPROCESS) {
  495. using namespace conv_bias;
  496. std::vector<TestArg> args = get_winograd_mk_packed_args();
  497. Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
  498. handle());
  499. check_winograd("4:6:16", checker, args, param::MatrixMul::Format::MK4);
  500. }
  501. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F63) {
  502. using namespace conv_bias;
  503. std::vector<TestArg> args = get_winograd_args(3);
  504. Checker<ConvBiasForward> checker(handle());
  505. check_winograd("1:6:32", checker, args);
  506. }
  507. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F63_4) {
  508. using namespace conv_bias;
  509. std::vector<TestArg> args = get_winograd_mk_packed_args();
  510. Checker<ConvBiasForward> checker(handle());
  511. check_winograd("4:6:16", checker, args, param::MatrixMul::Format::MK4);
  512. }
  513. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F63_4_NCHW44) {
  514. using namespace conv_bias;
  515. std::vector<TestArg> args =
  516. get_nchw44_conv_bias_args({3}, QUAN_NLMODE, BR_AND_NO_BIASMODE, 1);
  517. Checker<ConvBiasForward> checker(handle());
  518. check_winograd(
  519. "4:6:16", checker, args, param::MatrixMul::Format::MK4,
  520. param::ConvBias::Format::NCHW44);
  521. }
  522. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F54) {
  523. using namespace conv_bias;
  524. std::vector<TestArg> args = get_winograd_args(4);
  525. Checker<ConvBiasForward> checker(handle());
  526. check_winograd("1:5:32", checker, args);
  527. }
  528. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F45) {
  529. using namespace conv_bias;
  530. std::vector<TestArg> args = get_winograd_args(5);
  531. Checker<ConvBiasForward> checker(handle());
  532. check_winograd("1:4:32", checker, args);
  533. }
  534. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F63_WEIGHT_PREPROCESS) {
  535. using namespace conv_bias;
  536. std::vector<TestArg> args = get_winograd_args(3);
  537. Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
  538. handle());
  539. check_winograd("1:6:32", checker, args);
  540. }
  541. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F63_4_WEIGHT_PREPROCESS) {
  542. using namespace conv_bias;
  543. std::vector<TestArg> args = get_winograd_mk_packed_args();
  544. Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
  545. handle());
  546. check_winograd("4:6:16", checker, args, param::MatrixMul::Format::MK4);
  547. }
  548. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F63_4_NCHW44_WEIGHT_PREPROCESS) {
  549. using namespace conv_bias;
  550. std::vector<TestArg> args =
  551. get_nchw44_conv_bias_args({3}, QUAN_NLMODE, BR_AND_NO_BIASMODE, 1);
  552. Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
  553. handle());
  554. check_winograd(
  555. "4:6:16", checker, args, param::MatrixMul::Format::MK4,
  556. param::ConvBias::Format::NCHW44);
  557. }
  558. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F54_WEIGHT_PREPROCESS) {
  559. using namespace conv_bias;
  560. std::vector<TestArg> args = get_winograd_args(4);
  561. Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
  562. handle());
  563. check_winograd("1:5:32", checker, args);
  564. }
  565. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F45_WEIGHT_PREPROCESS) {
  566. using namespace conv_bias;
  567. std::vector<TestArg> args = get_winograd_args(5);
  568. Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
  569. handle());
  570. check_winograd("1:4:32", checker, args);
  571. }
  572. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_PREPROCESS_NCHW44) {
  573. using namespace conv_bias;
  574. std::vector<TestArg> nchw44_args = conv_bias::get_nchw44_conv_bias_args(
  575. {3}, QUAN_NLMODE, BR_AND_NO_BIASMODE, 1);
  576. Checker<ConvBiasForward> checker(handle());
  577. auto run = [&checker](
  578. const std::vector<TestArg>& args, DType A_dtype, DType B_dtype,
  579. DType C_dtype, DType D_dtype, const float eps) {
  580. for (auto&& arg : args) {
  581. checker.set_dtype(0, A_dtype)
  582. .set_dtype(1, B_dtype)
  583. .set_dtype(2, C_dtype)
  584. .set_dtype(4, D_dtype)
  585. .set_epsilon(eps)
  586. .set_param(arg.param)
  587. .execs({arg.src, arg.filter, arg.bias, {}, {}});
  588. }
  589. };
  590. //! uncomment this when low precision mode is ok
  591. // run(handle(), nchw44_args, {2, 6, 7}, dtype::Float32(), dtype::Float32(),
  592. // dtype::Float32(), dtype::Float32(), 1e-2f);
  593. //! remove this when low precision mode is ok
  594. run(nchw44_args, dtype::Float32(), dtype::Float32(), dtype::Float32(),
  595. dtype::Float32(), 1e-3f);
  596. }
  597. TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_FORWARD_QUANTIZED) {
  598. using namespace conv_bias;
  599. param::ConvBias cur_param;
  600. using NLMode = param::ConvBias::NonlineMode;
  601. std::vector<conv_bias::TestArg> args = get_conv_bias_args(
  602. {1, 3, 5, 7}, {0, 3}, {NLMode::IDENTITY, NLMode::H_SWISH, NLMode::RELU},
  603. {1, 2}, false, false);
  604. UniformIntRNG int_rng{-50, 50};
  605. float epsilon = 1e-3;
  606. checker_conv_bias(
  607. args, handle(), &int_rng, epsilon, dtype::QuantizedS8(2.5f),
  608. dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f),
  609. dtype::QuantizedS8(60.25f), "FALLBACK_NAIVE");
  610. }
  611. #if MEGDNN_WITH_BENCHMARK
  612. namespace {
  613. void benchmark_impl(
  614. const param::ConvBias param,
  615. std::vector<std::pair<SmallVector<TensorShape>, float>>& shapes_and_computation,
  616. const std::string algo_name, size_t RUNS,
  617. TaskExecutorConfig&& multi_thread_config,
  618. TaskExecutorConfig&& single_thread_config, std::vector<DType>& data_type) {
  619. std::vector<float> multi_thread_times, single_thread_times;
  620. {
  621. auto multi_thread_hanle = create_cpu_handle(0, true, &multi_thread_config);
  622. auto benchmarker = Benchmarker<ConvBias>(multi_thread_hanle.get());
  623. benchmarker.set_times(RUNS)
  624. .set_display(false)
  625. .set_param(param)
  626. .set_dtype(0, data_type[0])
  627. .set_dtype(1, data_type[1])
  628. .set_dtype(2, data_type[2])
  629. .set_dtype(4, data_type[3])
  630. .set_before_exec_callback(
  631. conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name.c_str()));
  632. for (auto shape : shapes_and_computation) {
  633. multi_thread_times.push_back(benchmarker.exec(shape.first) / RUNS);
  634. }
  635. }
  636. {
  637. auto single_thread_handle = create_cpu_handle(0, true, &single_thread_config);
  638. auto benchmarker = Benchmarker<ConvBias>(single_thread_handle.get());
  639. benchmarker.set_times(RUNS)
  640. .set_display(false)
  641. .set_param(param)
  642. .set_dtype(0, data_type[0])
  643. .set_dtype(1, data_type[1])
  644. .set_dtype(2, data_type[2])
  645. .set_dtype(4, data_type[3])
  646. .set_before_exec_callback(
  647. conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name.c_str()));
  648. for (auto shape : shapes_and_computation) {
  649. single_thread_times.push_back(benchmarker.exec(shape.first) / RUNS);
  650. }
  651. }
  652. printf("Benchmark : Multi threads %zu, ", multi_thread_config.nr_thread);
  653. printf("core_ids:");
  654. for (size_t i = 0; i < multi_thread_config.affinity_core_set.size(); i++) {
  655. printf("%zu ", multi_thread_config.affinity_core_set[i]);
  656. }
  657. printf(", Single thread core_id %zu\n", single_thread_config.affinity_core_set[0]);
  658. for (size_t i = 0; i < shapes_and_computation.size(); i++) {
  659. auto shapes = shapes_and_computation[i];
  660. printf("Bench case: ");
  661. for (auto&& shape : shapes.first) {
  662. printf("%s ", shape.to_string().c_str());
  663. }
  664. float computations = shapes.second;
  665. printf("%zu threads gflops: %f,\n single thread gflops: "
  666. "%f. spead up = %f, speedup/cores=%f\n",
  667. multi_thread_config.nr_thread, computations / multi_thread_times[i],
  668. computations / single_thread_times[i],
  669. single_thread_times[i] / multi_thread_times[i],
  670. single_thread_times[i] / multi_thread_times[i] /
  671. multi_thread_config.nr_thread);
  672. }
  673. }
  674. } // namespace
  675. TEST_F(FALLBACK_MULTI_THREADS, BENCHMARK_GI_CONVBIAS_DIRECTF32) {
  676. constexpr size_t RUNS = 50;
  677. param::ConvBias param;
  678. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  679. param.pad_h = 1;
  680. param.pad_w = 1;
  681. param.stride_h = 1;
  682. param.stride_w = 1;
  683. param.sparse = param::ConvBias::Sparse::GROUP;
  684. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  685. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  686. size_t group) {
  687. SmallVector<TensorShape> shapes{
  688. {N, IC, H, W},
  689. {group, OC / group, IC / group, FS, FS},
  690. {1, OC, 1, 1},
  691. {},
  692. {N, OC, H, W}};
  693. TensorShape dst{N, OC, H, W};
  694. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  695. dst.total_nr_elems()) *
  696. 1e-6;
  697. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  698. };
  699. bench_case(1, 32, 32, 200, 200, 3, 4);
  700. bench_case(1, 32, 32, 200, 200, 3, 32);
  701. bench_case(1, 32, 32, 128, 128, 3, 4);
  702. bench_case(1, 32, 32, 128, 128, 3, 32);
  703. bench_case(1, 32, 32, 100, 100, 3, 4);
  704. bench_case(1, 32, 32, 100, 100, 3, 32);
  705. bench_case(1, 32, 32, 80, 80, 3, 4);
  706. bench_case(1, 32, 32, 80, 80, 3, 32);
  707. std::string algo_name = "F32DIRECT";
  708. printf("Benchmark F32DIRECT_LARGE_GROUP algo\n");
  709. std::vector<DType> data_type = {
  710. dtype::Float32(), dtype::Float32(), dtype::Float32(), dtype::Float32()};
  711. benchmark_impl(
  712. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  713. data_type);
  714. benchmark_impl(
  715. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  716. data_type);
  717. benchmark_impl(
  718. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  719. data_type);
  720. shapes_and_computation.clear();
  721. algo_name = "F32DIRECT";
  722. printf("Benchmark F32DIRECT_SMALL_GROUP algo\n");
  723. bench_case(1, 32, 32, 200, 200, 3, 1);
  724. bench_case(1, 32, 32, 128, 128, 3, 1);
  725. bench_case(1, 32, 32, 100, 100, 3, 1);
  726. bench_case(1, 32, 32, 80, 80, 3, 1);
  727. benchmark_impl(
  728. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  729. data_type);
  730. benchmark_impl(
  731. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  732. data_type);
  733. benchmark_impl(
  734. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  735. data_type);
  736. }
  737. TEST_F(FALLBACK_MULTI_THREADS, BENCHMARK_GI_CONVBIAS_DIRECTF32_STR1) {
  738. constexpr size_t RUNS = 50;
  739. param::ConvBias param;
  740. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  741. param.pad_h = 1;
  742. param.pad_w = 1;
  743. param.stride_h = 1;
  744. param.stride_w = 1;
  745. param.sparse = param::ConvBias::Sparse::GROUP;
  746. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  747. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  748. size_t group) {
  749. SmallVector<TensorShape> shapes{
  750. {N, IC, H, W},
  751. {group, OC / group, IC / group, FS, FS},
  752. {1, OC, 1, 1},
  753. {},
  754. {N, OC, H, W}};
  755. TensorShape dst{N, OC, H, W};
  756. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  757. dst.total_nr_elems()) *
  758. 1e-6;
  759. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  760. };
  761. bench_case(1, 32, 32, 200, 200, 3, 4);
  762. bench_case(1, 32, 32, 200, 200, 3, 32);
  763. bench_case(1, 32, 32, 128, 128, 3, 4);
  764. bench_case(1, 32, 32, 128, 128, 3, 32);
  765. bench_case(1, 32, 32, 100, 100, 3, 4);
  766. bench_case(1, 32, 32, 100, 100, 3, 32);
  767. bench_case(1, 32, 32, 80, 80, 3, 4);
  768. bench_case(1, 32, 32, 80, 80, 3, 32);
  769. std::string algo_name = "F32STRD1";
  770. printf("Benchmark F32STRD1_LARGE_GROUP algo\n");
  771. std::vector<DType> data_type = {
  772. dtype::Float32(), dtype::Float32(), dtype::Float32(), dtype::Float32()};
  773. benchmark_impl(
  774. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  775. data_type);
  776. benchmark_impl(
  777. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  778. data_type);
  779. benchmark_impl(
  780. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  781. data_type);
  782. shapes_and_computation.clear();
  783. algo_name = "F32STRD1";
  784. printf("Benchmark F32STRD1_SMALL_GROUP algo\n");
  785. bench_case(1, 32, 32, 200, 200, 3, 1);
  786. bench_case(1, 32, 32, 128, 128, 3, 1);
  787. bench_case(1, 32, 32, 100, 100, 3, 1);
  788. bench_case(1, 32, 32, 80, 80, 3, 1);
  789. benchmark_impl(
  790. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  791. data_type);
  792. benchmark_impl(
  793. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  794. data_type);
  795. benchmark_impl(
  796. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  797. data_type);
  798. }
  799. TEST_F(FALLBACK_MULTI_THREADS, BENCHMARK_GI_CONVBIAS_DIRECTF32_STR2) {
  800. constexpr size_t RUNS = 50;
  801. param::ConvBias param;
  802. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  803. param.pad_h = 1;
  804. param.pad_w = 1;
  805. param.stride_h = 2;
  806. param.stride_w = 2;
  807. param.sparse = param::ConvBias::Sparse::GROUP;
  808. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  809. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  810. size_t group, size_t P, size_t S) {
  811. SmallVector<TensorShape> shapes{
  812. {N, IC, H, W},
  813. {group, OC / group, IC / group, FS, FS},
  814. {1, OC, 1, 1},
  815. {},
  816. {N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1}};
  817. TensorShape dst{N, OC, H, W};
  818. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  819. dst.total_nr_elems()) *
  820. 1e-6;
  821. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  822. };
  823. bench_case(1, 32, 32, 200, 200, 3, 4, 1, 2);
  824. bench_case(1, 32, 32, 200, 200, 3, 32, 1, 2);
  825. bench_case(1, 32, 32, 128, 128, 3, 4, 1, 2);
  826. bench_case(1, 32, 32, 128, 128, 3, 32, 1, 2);
  827. bench_case(1, 32, 32, 100, 100, 3, 4, 1, 2);
  828. bench_case(1, 32, 32, 100, 100, 3, 32, 1, 2);
  829. bench_case(1, 32, 32, 80, 80, 3, 4, 1, 2);
  830. bench_case(1, 32, 32, 80, 80, 3, 32, 1, 2);
  831. std::string algo_name = "F32STRD2";
  832. printf("Benchmark F32STRD2_LARGE_GROUP algo\n");
  833. std::vector<DType> data_type = {
  834. dtype::Float32(), dtype::Float32(), dtype::Float32(), dtype::Float32()};
  835. benchmark_impl(
  836. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  837. data_type);
  838. benchmark_impl(
  839. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  840. data_type);
  841. benchmark_impl(
  842. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  843. data_type);
  844. shapes_and_computation.clear();
  845. algo_name = "F32STRD2";
  846. printf("Benchmark F32STRD2_SMALL_GROUP algo\n");
  847. bench_case(1, 32, 32, 200, 200, 3, 1, 1, 2);
  848. bench_case(1, 32, 32, 128, 128, 3, 1, 1, 2);
  849. bench_case(1, 32, 32, 100, 100, 3, 1, 1, 2);
  850. bench_case(1, 32, 32, 80, 80, 3, 1, 1, 2);
  851. benchmark_impl(
  852. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  853. data_type);
  854. benchmark_impl(
  855. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  856. data_type);
  857. benchmark_impl(
  858. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  859. data_type);
  860. }
  861. TEST_F(FALLBACK, BENCHMARK_GI_CHANNEL_WISE_F32_STRIDE1_NCHW44) {
  862. // have to remove preferred restrict in usable func before run the benchmark
  863. using namespace conv_bias;
  864. param::ConvBias param;
  865. param.stride_h = 1;
  866. param.stride_w = 1;
  867. param.pad_h = 1;
  868. param.pad_w = 1;
  869. param.nonlineMode = NonlineMode::RELU;
  870. param.sparse = param::ConvBias::Sparse::GROUP;
  871. constexpr size_t RUN = 50;
  872. Benchmarker<ConvBias> benchmark0(handle());
  873. benchmark0.set_display(false);
  874. benchmark0.set_param(param);
  875. benchmark0.set_times(RUN);
  876. benchmark0.set_before_exec_callback(
  877. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("F32STRD1"));
  878. auto opr = handle()->create_operator<ConvBias>();
  879. opr->param() = param;
  880. param.format = param::ConvBias::Format::NCHW44;
  881. Benchmarker<ConvBias> benchmark1(handle());
  882. benchmark1.set_display(false);
  883. benchmark1.set_param(param);
  884. benchmark1.set_times(RUN);
  885. benchmark1.set_before_exec_callback(
  886. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("F32_CHANNEL_WISE_NCHW44"));
  887. auto run = [&](size_t group, size_t w, size_t h, size_t kernel) {
  888. TensorLayout dst_layout;
  889. opr->deduce_layout(
  890. {{1, group * 4, h, w}, dtype::Int8()},
  891. {{group * 4, 1, 1, kernel, kernel}, dtype::Int8()},
  892. {{1, group * 4, 1, 1}, dtype::Int32()}, {}, dst_layout);
  893. //! dst.nr_elems * IC * FH * FW * 2
  894. float computations = dst_layout.total_nr_elems() * kernel * kernel * 2.0 /
  895. (1024 * 1024 * 1024) * 1e3;
  896. auto used0 = benchmark0.exec(
  897. {{1, group * 4, h, w},
  898. {group * 4, 1, 1, kernel, kernel},
  899. {1, group * 4, 1, 1},
  900. {},
  901. {}}) /
  902. RUN;
  903. auto used1 = benchmark1.exec(
  904. {{1, group, h, w, 4},
  905. {group, 1, 1, kernel, kernel, 4},
  906. {1, group, 1, 1, 4},
  907. {},
  908. {}}) /
  909. RUN;
  910. printf("group/h/w/kernel:%zu,%zu,%zu,%zu: nchw: %f ms %f Gflops "
  911. "nchw44: "
  912. "%f ms %f GFlops "
  913. "speedup: %f\n",
  914. group, h, w, kernel, used0, computations / used0, used1,
  915. computations / used1, used0 / used1);
  916. };
  917. for (size_t group : {8, 16, 32, 64}) {
  918. for (size_t kerenl : {2, 3, 5}) {
  919. run(group, 112, 112, kerenl);
  920. run(group, 56, 56, kerenl);
  921. run(group, 48, 48, kerenl);
  922. run(group, 28, 28, kerenl);
  923. run(group, 14, 14, kerenl);
  924. }
  925. }
  926. run(8, 112, 112, 3);
  927. run(32, 56, 56, 3);
  928. run(64, 28, 28, 3);
  929. run(128, 14, 14, 3);
  930. }
  931. TEST_F(FALLBACK, BENCHMARK_GI_CHANNEL_WISE_F32_STRIDE2_NCHW44) {
  932. // have to remove preferred restrict in usable func before run the benchmark
  933. using namespace conv_bias;
  934. param::ConvBias param;
  935. param.stride_h = 2;
  936. param.stride_w = 2;
  937. param.pad_h = 1;
  938. param.pad_w = 1;
  939. param.nonlineMode = NonlineMode::RELU;
  940. param.sparse = param::ConvBias::Sparse::GROUP;
  941. constexpr size_t RUN = 50;
  942. Benchmarker<ConvBias> benchmark0(handle());
  943. benchmark0.set_display(false);
  944. benchmark0.set_param(param);
  945. benchmark0.set_times(RUN);
  946. benchmark0.set_before_exec_callback(
  947. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("F32STRD2"));
  948. auto opr = handle()->create_operator<ConvBias>();
  949. opr->param() = param;
  950. param.format = param::ConvBias::Format::NCHW44;
  951. Benchmarker<ConvBias> benchmark1(handle());
  952. benchmark1.set_display(false);
  953. benchmark1.set_param(param);
  954. benchmark1.set_times(RUN);
  955. benchmark1.set_before_exec_callback(
  956. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("F32_CHANNEL_WISE_NCHW44"));
  957. auto run = [&](size_t group, size_t w, size_t h, size_t kernel) {
  958. TensorLayout dst_layout;
  959. opr->deduce_layout(
  960. {{1, group * 4, h, w}, dtype::Int8()},
  961. {{group * 4, 1, 1, kernel, kernel}, dtype::Int8()},
  962. {{1, group * 4, 1, 1}, dtype::Int32()}, {}, dst_layout);
  963. //! dst.nr_elems * IC * FH * FW * 2
  964. float computations = dst_layout.total_nr_elems() * kernel * kernel * 2.0 /
  965. (1024 * 1024 * 1024) * 1e3;
  966. auto used0 = benchmark0.exec(
  967. {{1, group * 4, h, w},
  968. {group * 4, 1, 1, kernel, kernel},
  969. {1, group * 4, 1, 1},
  970. {},
  971. {}}) /
  972. RUN;
  973. auto used1 = benchmark1.exec(
  974. {{1, group, h, w, 4},
  975. {group, 1, 1, kernel, kernel, 4},
  976. {1, group, 1, 1, 4},
  977. {},
  978. {}}) /
  979. RUN;
  980. printf("group/h/w/kernel:%zu,%zu,%zu,%zu: nchw: %f ms %f Gflops "
  981. "nchw44: "
  982. "%f ms %f GFlops "
  983. "speedup: %f\n",
  984. group, h, w, kernel, used0, computations / used0, used1,
  985. computations / used1, used0 / used1);
  986. };
  987. for (size_t group : {8, 16, 32, 64}) {
  988. for (size_t kerenl : {2, 3, 5}) {
  989. run(group, 112, 112, kerenl);
  990. run(group, 56, 56, kerenl);
  991. run(group, 48, 48, kerenl);
  992. run(group, 28, 28, kerenl);
  993. run(group, 14, 14, kerenl);
  994. }
  995. }
  996. run(8, 112, 112, 3);
  997. run(32, 56, 56, 3);
  998. run(64, 28, 28, 3);
  999. run(128, 14, 14, 3);
  1000. }
  1001. TEST_F(FALLBACK, BENCHMARK_CONVBIAS) {
  1002. constexpr size_t RUNS = 10;
  1003. param::ConvBias param;
  1004. param.stride_h = 1;
  1005. param.stride_w = 1;
  1006. Benchmarker<ConvBias> benchmarker_int(handle());
  1007. benchmarker_int.set_times(RUNS)
  1008. .set_dtype(0, dtype::QuantizedS8(2.5f))
  1009. .set_dtype(1, dtype::QuantizedS8(2.5f))
  1010. .set_dtype(2, dtype::QuantizedS32(6.25f))
  1011. .set_dtype(4, dtype::QuantizedS8(40.25f))
  1012. .set_display(false);
  1013. Benchmarker<ConvBias> benchmarker_float(handle());
  1014. benchmarker_float.set_display(false).set_times(RUNS);
  1015. auto run = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS) {
  1016. TensorShape src({N, IC, H, W}), filter({OC, IC, FS, FS}), bias({N, OC, 1, 1}),
  1017. z({}), dst({N, OC, H, W});
  1018. param.pad_h = FS / 2;
  1019. param.pad_w = FS / 2;
  1020. auto int_used =
  1021. benchmarker_int.set_param(param).exec({src, filter, bias, z, dst}) /
  1022. RUNS;
  1023. auto float_used =
  1024. benchmarker_float.set_param(param).exec({src, filter, bias, z, dst}) /
  1025. RUNS;
  1026. float computations = IC * (FS * FS + 1) * dst.total_nr_elems() * 2 * 1e-6;
  1027. printf("run: %s %s %s->%s \nfloat: %f ms %f Gflops int: %f ms "
  1028. "%f Gflops speedup: %f\n",
  1029. src.to_string().c_str(), filter.to_string().c_str(),
  1030. bias.to_string().c_str(), dst.to_string().c_str(), float_used,
  1031. computations / float_used, int_used, computations / int_used,
  1032. float_used / int_used);
  1033. };
  1034. run(1, 128, 128, 32, 32, 3);
  1035. for (size_t IC : {32, 64, 128}) {
  1036. for (size_t OC : {32, 64, 128}) {
  1037. for (size_t size : {28, 56}) {
  1038. for (size_t FS : {3, 5}) {
  1039. run(1, IC, OC, size, size, FS);
  1040. }
  1041. }
  1042. }
  1043. }
  1044. }
//! Winograd F(2,3) benchmark on 3x3 filters with a 4-channel MK4 block;
//! the matmul kernel in the algo string is chosen per build-target ISA.
TEST_F(FALLBACK, BENCHMARK_GI_CONVBIAS_WINOGRAD_F23_4x4) {
#if MEGDNN_AARCH64
    conv_bias::benchmark_winograd("WINOGRAD:AARCH64_F32_MK4_4x16:4:2", handle(), 3, 4);
#elif MEGDNN_ARMV7
    conv_bias::benchmark_winograd("WINOGRAD:ARMV7_F32_MK4_4x8:4:2", handle(), 3, 4);
#else
    //! generic fallback (GI = generic intrinsics) kernel
    conv_bias::benchmark_winograd("WINOGRAD:FB_GI_F32_MK4_4x8:4:2", handle(), 3, 4);
#endif
}
//! Benchmark the same winograd convolution in NCHW vs NCHW44 layout and
//! print per-case times, gflops and the NCHW44 speedup.
//! \param algo_name0 matmul algo suffix for the NCHW "WINOGRAD:" algorithm
//! \param algo_name1 matmul algo suffix for the "WINOGRAD_NCHW44:" algorithm
//! \param handle     handle the benchmarkers and layout deduction run on
void benchmark_winograd_nchw_vs_nchw44(
        const char* algo_name0, const char* algo_name1, Handle* handle) {
    using namespace conv_bias;
    using NLMode = param::ConvBias::NonlineMode;
    std::vector<conv_bias::TestArg> args_nchw44;
    std::vector<conv_bias::TestArg> args_nchw;
    //! append one matched pair of cases (same n/oc/ic/h/w) to both arg lists;
    //! filters are fixed 3x3, stride 1, pad 1
    auto pack = [&](size_t n, size_t oc, size_t ic, size_t h, size_t w, size_t group,
                    NLMode nlmode) {
        param::ConvBias param;
        param.format = param::ConvBias::Format::NCHW44;
        param.stride_h = 1;
        param.stride_w = 1;
        param.pad_h = 1;
        param.pad_w = 1;
        param.nonlineMode = nlmode;
        if (group == 1) {
            param.sparse = param::ConvBias::Sparse::DENSE;
            args_nchw44.emplace_back(
                    param, TensorShape{n, ic / 4, h, w, 4},
                    TensorShape{oc / 4, ic / 4, 3, 3, 4, 4}, TensorShape{});
            param.format = param::ConvBias::Format::NCHW;
            args_nchw.emplace_back(
                    param, TensorShape{n, ic, h, w}, TensorShape{oc, ic, 3, 3},
                    TensorShape{});
        } else {
            auto oc_per_group = oc / group;
            auto ic_per_group = ic / group;
            param.sparse = param::ConvBias::Sparse::GROUP;
            args_nchw44.emplace_back(
                    param, TensorShape{n, ic_per_group / 4, h, w, 4},
                    TensorShape{group, oc_per_group / 4, ic_per_group / 4, 3, 3, 4, 4},
                    TensorShape{});
            param.format = param::ConvBias::Format::NCHW;
            args_nchw.emplace_back(
                    param, TensorShape{n, ic, h, w},
                    TensorShape{group, oc_per_group, ic_per_group, 3, 3},
                    TensorShape{});
        }
    };
    std::vector<NLMode> nonlinemode = {NLMode::IDENTITY};
    //! NOTE: only group == 1 (dense) is exercised; the flop formula below
    //! indexes filter[1..3] and is only valid for the 4-dim dense filter
    for (auto nlmode : nonlinemode)
        for (size_t n : {1})
            for (size_t group = 1; group <= 1; ++group) {
                pack(n, 512, 512, 15, 15, group, nlmode);
                pack(n, 512, 256, 15, 15, group, nlmode);
                pack(n, 256, 256, 29, 29, group, nlmode);
                pack(n, 256, 128, 29, 29, group, nlmode);
                pack(n, 128, 128, 57, 57, group, nlmode);
                pack(n, 128, 64, 57, 57, group, nlmode);
                pack(n, 24, 24, 224, 224, group, nlmode);
                pack(n, 64, 24, 123, 123, group, nlmode);
                pack(n, 64, 64, 56, 56, group, nlmode);
                pack(n, 128, 128, 28, 28, group, nlmode);
                pack(n, 256, 256, 14, 14, group, nlmode);
                pack(n, 512, 512, 7, 7, group, nlmode);
            }
    using namespace conv_bias;
    constexpr size_t RUN = 10;
    Benchmarker<ConvBias> benchmark_winograd_nchw(handle);
    benchmark_winograd_nchw.set_display(false);
    benchmark_winograd_nchw.set_times(RUN);
    Benchmarker<ConvBias> benchmark_winograd_nchw44(handle);
    benchmark_winograd_nchw44.set_display(false);
    benchmark_winograd_nchw44.set_times(RUN);
    std::string winograd_nchw_algo_name = ssprintf("WINOGRAD:%s", algo_name0);
    std::string winograd_nchw44_algo_name = ssprintf("WINOGRAD_NCHW44:%s", algo_name1);
    for (size_t i = 0; i < args_nchw.size(); ++i) {
        auto arg_nchw = args_nchw[i];
        auto arg_nchw44 = args_nchw44[i];
        //! deduce the output layout to size the flop estimate
        TensorLayout dst_layout;
        auto opr = handle->create_operator<ConvBias>();
        opr->param() = arg_nchw.param;
        opr->deduce_layout(
                {arg_nchw.src, dtype::Float32()}, {arg_nchw.filter, dtype::Float32()},
                {arg_nchw.bias, dtype::Float32()}, {}, dst_layout);
        //! dst.nr_elems * IC * FH * FW * 2
        float computations = dst_layout.total_nr_elems() * arg_nchw.filter[1] *
                             arg_nchw.filter[2] * arg_nchw.filter[3] * 2.0 /
                             (1024 * 1024 * 1024) * 1e3;
        benchmark_winograd_nchw.set_param(arg_nchw.param);
        auto nchw_used = algo_benchmark<ConvBias>(
                                 benchmark_winograd_nchw,
                                 {arg_nchw.src, arg_nchw.filter, {}, {}, {}},
                                 winograd_nchw_algo_name.c_str()) /
                         RUN;
        benchmark_winograd_nchw44.set_param(arg_nchw44.param);
        auto nchw44_used = algo_benchmark<ConvBias>(
                                   benchmark_winograd_nchw44,
                                   {arg_nchw44.src, arg_nchw44.filter, {}, {}, {}},
                                   winograd_nchw44_algo_name.c_str()) /
                           RUN;
        printf("%s %s: nchw: %f ms %f Gflops nchw44: %f ms %f GFlops "
               "speedup: "
               "%f\n",
               arg_nchw.src.to_string().c_str(), arg_nchw.filter.to_string().c_str(),
               nchw_used, computations / nchw_used, nchw44_used,
               computations / nchw44_used, nchw_used / nchw44_used);
    }
}
//! Winograd F(2,3) MK4: compare NCHW vs NCHW44 layout with the matmul
//! kernel chosen per build-target ISA.
TEST_F(FALLBACK, BENCHMARK_GI_CONVBIAS_WINOGRAD_F23_MK4_NCHW_VS_NCHW44) {
#if MEGDNN_AARCH64
    benchmark_winograd_nchw_vs_nchw44(
            "AARCH64_F32_MK4_4x16:4:2", "AARCH64_F32_MK4_4x16:4:2", handle());
#elif MEGDNN_ARMV7
    benchmark_winograd_nchw_vs_nchw44(
            "ARMV7_F32_MK4_4x8:4:2", "ARMV7_F32_MK4_4x8:4:2", handle());
#else
    //! generic fallback (GI = generic intrinsics) kernel
    benchmark_winograd_nchw_vs_nchw44(
            "FB_GI_F32_MK4_4x8:4:2", "FB_GI_F32_MK4_4x8:4:2", handle());
#endif
}
//! Winograd F(6,3) benchmark on 3x3 filters with a 4-channel MK4 block;
//! the matmul kernel in the algo string is chosen per build-target ISA.
TEST_F(FALLBACK, BENCHMARK_GI_CONVBIAS_WINOGRAD_F63_4x4) {
#if MEGDNN_AARCH64
    conv_bias::benchmark_winograd("WINOGRAD:AARCH64_F32_MK4_4x16:4:6", handle(), 3, 4);
#elif MEGDNN_ARMV7
    conv_bias::benchmark_winograd("WINOGRAD:ARMV7_F32_MK4_4x8:4:6", handle(), 3, 4);
#else
    //! generic fallback (GI = generic intrinsics) kernel
    conv_bias::benchmark_winograd("WINOGRAD:FB_GI_F32_MK4_4x8:4:6", handle(), 3, 4);
#endif
}
//! Winograd F(6,3) MK4: compare NCHW vs NCHW44 layout with the matmul
//! kernel chosen per build-target ISA.
TEST_F(FALLBACK, BENCHMARK_GI_CONVBIAS_WINOGRAD_F63_MK4_NCHW_VS_NCHW44) {
#if MEGDNN_AARCH64
    benchmark_winograd_nchw_vs_nchw44(
            "AARCH64_F32_MK4_4x16:4:6", "AARCH64_F32_MK4_4x16:4:6", handle());
#elif MEGDNN_ARMV7
    benchmark_winograd_nchw_vs_nchw44(
            "ARMV7_F32_MK4_4x8:4:6", "ARMV7_F32_MK4_4x8:4:6", handle());
#else
    //! generic fallback (GI = generic intrinsics) kernel
    benchmark_winograd_nchw_vs_nchw44(
            "FB_GI_F32_MK4_4x8:4:6", "FB_GI_F32_MK4_4x8:4:6", handle());
#endif
}
  1186. #endif
  1187. } // namespace test
  1188. } // namespace megdnn
  1189. // vim: syntax=cpp.doxygen