You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

conv_bias.cpp 54 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350
  1. #include "test/common/conv_bias.h"
  2. #include "megdnn/opr_param_defs.h"
  3. #include "megdnn/oprs.h"
  4. #include "test/common/benchmarker.h"
  5. #include "test/common/checker.h"
  6. #include "test/common/rng.h"
  7. #include "test/common/task_record_check.h"
  8. #include "test/common/tensor.h"
  9. #include "test/fallback/fixture.h"
  10. #if MEGDNN_X86
  11. #include "src/x86/utils.h"
  12. #endif
  13. namespace megdnn {
  14. namespace test {
  15. TEST_F(FALLBACK, CONV_BIAS_FORWARD) {
  16. using namespace conv_bias;
  17. std::vector<TestArg> args = get_args();
  18. Checker<ConvBiasForward> checker(handle());
  19. NormalRNG default_rng;
  20. UniformIntRNG int_rng{-50, 50};
  21. param::ConvBias param;
  22. {
  23. param.format = param::ConvBias::Format::NHWC;
  24. auto src_shape = TensorShape{2, 16, 32, 24};
  25. auto filter_shape = TensorShape{4, 3, 3, 24};
  26. auto bias_shape_channel = TensorShape{1, 1, 1, 4};
  27. checker.set_dtype(0, dtype::Float32())
  28. .set_dtype(1, dtype::Float32())
  29. .set_dtype(2, dtype::Float32())
  30. .set_rng(0, &default_rng)
  31. .set_rng(1, &default_rng)
  32. .set_rng(2, &default_rng)
  33. .set_param(param)
  34. .execs({src_shape, filter_shape, bias_shape_channel, {}, {}});
  35. }
  36. checker.set_before_exec_callback(
  37. conv_bias::ConvBiasAlgoChecker<ConvBias>("FALLBACK_NAIVE"));
  38. for (auto&& arg : args) {
  39. checker.set_dtype(0, dtype::Float32())
  40. .set_dtype(1, dtype::Float32())
  41. .set_dtype(2, dtype::Float32())
  42. .set_rng(0, &default_rng)
  43. .set_rng(1, &default_rng)
  44. .set_rng(2, &default_rng)
  45. .set_epsilon(1e-3)
  46. .set_param(arg.param)
  47. .execs({arg.src, arg.filter, arg.bias, {}, {}});
  48. }
  49. {
  50. param.format = param::ConvBias::Format::NCHW;
  51. param.sparse = ConvBias::Param::Sparse::GROUP;
  52. auto src_shape = TensorShape{2, 16, 32, 24};
  53. auto filter_shape = TensorShape{4, 4, 4, 1, 1};
  54. auto bias_shape_channel = TensorShape{1, 16, 1, 1};
  55. auto bias_shape = TensorShape{2, 16, 32, 24};
  56. checker.set_dtype(0, dtype::Float32())
  57. .set_dtype(1, dtype::Float32())
  58. .set_dtype(2, dtype::Float32())
  59. .set_rng(0, &default_rng)
  60. .set_rng(1, &default_rng)
  61. .set_rng(2, &default_rng)
  62. .set_param(param)
  63. .execs({src_shape, filter_shape, bias_shape, {}, {}})
  64. .execs({src_shape, filter_shape, bias_shape_channel, {}, {}});
  65. }
  66. }
  67. TEST_F(FALLBACK, CONV_BIAS_FORWARD_RECORD) {
  68. using namespace conv_bias;
  69. TaskRecordChecker<ConvBiasForward> checker(1);
  70. NormalRNG default_rng;
  71. UniformIntRNG int_rng{-50, 50};
  72. param::ConvBias param;
  73. {
  74. param.format = param::ConvBias::Format::NHWC;
  75. auto src_shape = TensorShape{2, 16, 32, 24};
  76. auto filter_shape = TensorShape{4, 3, 3, 24};
  77. auto bias_shape_channel = TensorShape{1, 1, 1, 4};
  78. checker.set_dtype(0, dtype::Float32())
  79. .set_dtype(1, dtype::Float32())
  80. .set_dtype(2, dtype::Float32())
  81. .set_rng(0, &default_rng)
  82. .set_rng(1, &default_rng)
  83. .set_rng(2, &default_rng)
  84. .set_param(param)
  85. .execs({src_shape, filter_shape, bias_shape_channel, {}, {}});
  86. }
  87. {
  88. param.format = param::ConvBias::Format::NCHW;
  89. param.sparse = ConvBias::Param::Sparse::GROUP;
  90. auto src_shape = TensorShape{2, 16, 32, 24};
  91. auto filter_shape = TensorShape{4, 4, 4, 1, 1};
  92. auto bias_shape_channel = TensorShape{1, 16, 1, 1};
  93. auto bias_shape = TensorShape{2, 16, 32, 24};
  94. checker.set_dtype(0, dtype::Float32())
  95. .set_dtype(1, dtype::Float32())
  96. .set_dtype(2, dtype::Float32())
  97. .set_rng(0, &default_rng)
  98. .set_rng(1, &default_rng)
  99. .set_rng(2, &default_rng)
  100. .set_param(param)
  101. .execs({src_shape, filter_shape, bias_shape, {}, {}})
  102. .execs({src_shape, filter_shape, bias_shape_channel, {}, {}});
  103. }
  104. }
  105. TEST_F(FALLBACK, FP32_GEMV_MK4_GI) {
  106. Checker<MatrixMul> checker(handle());
  107. using Param = MatrixMul::Param;
  108. checker.set_before_exec_callback(AlgoChecker<MatrixMul>("FB_GI_F32_GEMV_MK4"));
  109. checker.set_epsilon(1e-2);
  110. auto run = [&](size_t M, size_t K) {
  111. Param param;
  112. param.format = param::MatrixMul::Format::MK4;
  113. param.transposeA = false;
  114. param.transposeB = false;
  115. TensorShape A, B;
  116. A = TensorShape{M / 4, K / 4, 4, 4};
  117. B = TensorShape{K / 4, 1, 4};
  118. checker.set_param(param).execs({A, B, {}});
  119. };
  120. // N = 1
  121. for (size_t M : {4, 16, 128, 1024})
  122. for (size_t K : {4, 8, 12, 128, 256, 4096})
  123. run(M, K);
  124. }
  125. std::vector<conv_bias::TestArg> get_conv_bias_args(
  126. std::vector<size_t> kernel, std::vector<size_t> padv,
  127. std::vector<param::ConvBias::NonlineMode> nlmodev, std::vector<size_t> stridev,
  128. bool no_bias, bool only_broadbias) {
  129. using namespace conv_bias;
  130. using Param = param::ConvBias;
  131. using NLMode = param::ConvBias::NonlineMode;
  132. std::vector<TestArg> args;
  133. auto pack = [&](size_t n, size_t oc, size_t ic, size_t w, size_t h, size_t pad,
  134. size_t kernel, size_t stride, NLMode nonlinemode) {
  135. Param param;
  136. param.stride_h = stride;
  137. param.stride_w = stride;
  138. param.pad_h = pad;
  139. param.pad_w = pad;
  140. param.nonlineMode = nonlinemode;
  141. args.emplace_back(
  142. param, TensorShape{n, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
  143. TensorShape{});
  144. if (!no_bias) {
  145. args.emplace_back(
  146. param, TensorShape{n, ic, h, w},
  147. TensorShape{oc, ic, kernel, kernel}, TensorShape{1, oc, 1, 1});
  148. if (!only_broadbias) {
  149. args.emplace_back(
  150. param, TensorShape{n, ic, h, w},
  151. TensorShape{oc, ic, kernel, kernel},
  152. TensorShape{
  153. n, oc, (h + 2 * param.pad_h - kernel) / stride + 1,
  154. (w + 2 * param.pad_h - kernel) / stride + 1});
  155. }
  156. }
  157. };
  158. auto pack_group = [&](size_t n, size_t oc, size_t ic, size_t w, size_t h,
  159. size_t pad, size_t kernel, size_t stride,
  160. NLMode nonlinemode) {
  161. Param param;
  162. param.stride_h = stride;
  163. param.stride_w = stride;
  164. param.pad_h = pad;
  165. param.pad_w = pad;
  166. param.nonlineMode = nonlinemode;
  167. param.sparse = param::ConvBias::Sparse::GROUP;
  168. args.emplace_back(
  169. param, TensorShape{n, 2 * ic, h, w},
  170. TensorShape{2, oc, ic, kernel, kernel}, TensorShape{});
  171. if (!no_bias) {
  172. args.emplace_back(
  173. param, TensorShape{n, 2 * ic, h, w},
  174. TensorShape{2, oc, ic, kernel, kernel},
  175. TensorShape{1, oc * 2, 1, 1});
  176. if (!only_broadbias) {
  177. args.emplace_back(
  178. param, TensorShape{n, 2 * ic, h, w},
  179. TensorShape{2, oc, ic, kernel, kernel},
  180. TensorShape{
  181. n, 2 * oc, (h + 2 * param.pad_h - kernel) / stride + 1,
  182. (w + 2 * param.pad_h - kernel) / stride + 1});
  183. }
  184. }
  185. };
  186. for (size_t n : {1, 2}) {
  187. for (auto nlmode : nlmodev) {
  188. for (auto pad : padv) {
  189. for (auto stride : stridev) {
  190. for (size_t ic : {1, 5}) {
  191. for (size_t oc : {1, 11}) {
  192. for (size_t size : {9, 30}) {
  193. for (size_t kern : kernel) {
  194. pack(n, oc, ic, size + 4, size + 4, pad, kern,
  195. stride, nlmode);
  196. pack_group(
  197. n, oc, ic, size, size, pad, kern, stride,
  198. nlmode);
  199. }
  200. }
  201. }
  202. }
  203. }
  204. }
  205. }
  206. }
  207. return args;
  208. }
  209. void checker_conv_bias(
  210. std::vector<conv_bias::TestArg> args, Handle* handle, RNG* rng, float epsilon,
  211. DType type0, DType type1, DType type2, DType type3, const char* algo_name) {
  212. using namespace conv_bias;
  213. Checker<ConvBias> checker(handle);
  214. checker.set_before_exec_callback(
  215. conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name));
  216. checker.set_dtype(0, type0);
  217. checker.set_dtype(1, type1);
  218. checker.set_dtype(2, type2);
  219. checker.set_dtype(4, type3);
  220. checker.set_epsilon(epsilon);
  221. if (NULL != rng) {
  222. checker.set_rng(0, rng).set_rng(1, rng).set_rng(2, rng).set_rng(3, rng);
  223. }
  224. for (auto&& arg : args) {
  225. checker.set_param(arg.param).execs({arg.src, arg.filter, arg.bias, {}, {}});
  226. }
  227. }
  228. TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_GI_1X1_S1_MK4_PACK_F32) {
  229. using namespace conv_bias;
  230. std::vector<conv_bias::TestArg> args =
  231. get_nchw44_conv_bias_args({1}, FULL_NLMODE, ALL_BIASMODE, 1, true);
  232. check_conv_bias(args, handle(), "CONV1x1:FB_GI_F32_MK4_PACK_4x12:24");
  233. }
  234. TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_GI_IM2COL_S1_MK4_PACK_F32_PREPROCESS) {
  235. using namespace conv_bias;
  236. std::vector<conv_bias::TestArg> args =
  237. get_nchw44_conv_bias_args({2, 4, 7}, FULL_NLMODE, BR_AND_NO_BIASMODE, 1);
  238. #define cb(name) \
  239. check_conv_bias_preprocess( \
  240. args, handle(), nullptr, 0.001, dtype::Float32(), dtype::Float32(), \
  241. dtype::Float32(), dtype::Float32(), name);
  242. cb("IM2COLMATMUL:FB_GI_F32_MK4_PACK_4x12");
  243. #undef cb
  244. }
  245. TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_GI_IM2COL_S2_MK4_PACK_F32_FUSE_PREPROCESS) {
  246. using namespace conv_bias;
  247. std::vector<conv_bias::TestArg> args =
  248. get_nchw44_conv_bias_args({3}, FULL_NLMODE, BR_AND_BIAS_BIASMODE, 2);
  249. #define cb(name) \
  250. check_conv_bias_preprocess( \
  251. args, handle(), nullptr, 0.001, dtype::Float32(), dtype::Float32(), \
  252. dtype::Float32(), dtype::Float32(), name);
  253. cb("IM2COLMATMUL:FB_GI_F32_MK4_PACK_4x12");
  254. #undef cb
  255. }
  256. TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_GI_1X1_S1_MK4_PACK_F32_PREPROCESS) {
  257. using namespace conv_bias;
  258. std::vector<conv_bias::TestArg> args =
  259. get_nchw44_conv_bias_args({1}, FULL_NLMODE, ALL_BIASMODE, 1, true);
  260. #define cb(name) \
  261. check_conv_bias_preprocess( \
  262. args, handle(), nullptr, 0.001, dtype::Float32(), dtype::Float32(), \
  263. dtype::Float32(), dtype::Float32(), name);
  264. cb("CONV1x1:FB_GI_F32_MK4_PACK_4x12:24");
  265. #undef cb
  266. }
  267. TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_GI_IM2COL_S1_MK4_PACK_F32) {
  268. using namespace conv_bias;
  269. std::vector<conv_bias::TestArg> args =
  270. get_nchw44_conv_bias_args({2, 4, 7}, FULL_NLMODE, BR_AND_BIAS_BIASMODE, 1);
  271. check_conv_bias(args, handle(), "IM2COLMATMUL:FB_GI_F32_MK4_PACK_4x12");
  272. }
  273. TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_GI_IM2COL_S2_MK4_PACK_F32) {
  274. using namespace conv_bias;
  275. std::vector<conv_bias::TestArg> args =
  276. get_nchw44_conv_bias_args({3, 5, 6}, FULL_NLMODE, BR_AND_BIAS_BIASMODE, 2);
  277. #define cb(name) check_conv_bias(args, handle(), name);
  278. cb("IM2COLMATMUL:FB_GI_F32_MK4_PACK_4x12");
  279. #undef cb
  280. }
  281. TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_GI_IM2COL_S2_MK4_PACK_F32_FUSE) {
  282. using namespace conv_bias;
  283. std::vector<conv_bias::TestArg> args =
  284. get_nchw44_conv_bias_args({3}, FULL_NLMODE, ALL_BIASMODE, 2);
  285. #define cb(name) check_conv_bias(args, handle(), name);
  286. cb("IM2COLMATMUL:FB_GI_F32_MK4_PACK_4x12");
  287. #undef cb
  288. }
  289. TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_FORWARD_IM2COL_8X8X16) {
  290. using namespace conv_bias;
  291. param::ConvBias cur_param;
  292. using NLMode = param::ConvBias::NonlineMode;
  293. std::vector<conv_bias::TestArg> args = get_conv_bias_args(
  294. {1, 3}, {0}, {NLMode::IDENTITY, NLMode::RELU}, {1}, false, true);
  295. NormalRNG default_rng;
  296. Checker<ConvBias> checker(handle());
  297. checker.set_dtype(0, dtype::Int8{});
  298. checker.set_dtype(1, dtype::Int8{});
  299. checker.set_dtype(2, dtype::Int16{});
  300. checker.set_dtype(4, dtype::Int16{});
  301. for (auto&& arg : args) {
  302. checker.set_param(arg.param).execs({arg.src, arg.filter, arg.bias, {}, {}});
  303. }
  304. }
  305. TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_FORWARD) {
  306. using namespace conv_bias;
  307. param::ConvBias cur_param;
  308. using NLMode = param::ConvBias::NonlineMode;
  309. std::vector<conv_bias::TestArg> args = get_conv_bias_args(
  310. {1, 3, 5}, {0, 3},
  311. {NLMode::IDENTITY, NLMode::H_SWISH, NLMode::SIGMOID, NLMode::RELU}, {1, 2},
  312. false, false);
  313. NormalRNG default_rng;
  314. checker_conv_bias(
  315. args, handle(), &default_rng, 1e-3, dtype::Float32{}, dtype::Float32{},
  316. dtype::Float32{}, dtype::Float32{}, "FALLBACK_NAIVE");
  317. }
  318. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_NCHW_NCHW44_F32_S2) {
  319. check_conv_bias(
  320. conv_bias::get_nchw44_conv_bias_args(
  321. {2, 3, 5, 7}, ONLY_IDENTITY_NLMODE, ONLY_BR_BIASMODE, 2, false,
  322. true),
  323. handle(), "F32_CONV_NCHW_NCHW44");
  324. }
  325. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_NCHW_NCHW44_F32_S1) {
  326. check_conv_bias(
  327. conv_bias::get_nchw44_conv_bias_args(
  328. {2, 3, 5, 7}, ONLY_IDENTITY_NLMODE, ONLY_BR_BIASMODE, 1, false,
  329. true),
  330. handle(), "F32_CONV_NCHW_NCHW44");
  331. }
//! Instantiate one NCHW->NCHW44 agent stride-2 test per bias mode; the test
//! name suffix is token-pasted from _SUFFIX. Kept as a macro because the
//! instantiations below expand to distinct TEST_F definitions.
#define CB(_MODE, _SUFFIX)                                                            \
    TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_NCHW_NCHW44_F32_S2_AGENT_##_SUFFIX) {  \
        check_conv_bias(                                                              \
                conv_bias::get_nchw44_conv_bias_args(                                 \
                        {2, 3, 5, 7}, ONLY_IDENTITY_NLMODE, {_MODE}, 2, false, true), \
                handle(), "F32_CONV_AGENT_NCHW_NCHW44");                              \
    }
CB(megdnn::BiasMode::NO_BIAS, NO_BIAS);
CB(megdnn::BiasMode::BROADCAST_CHANNEL_BIAS, BROADCAST_CHANNEL_BIAS);
#undef CB
//! Instantiate one NCHW->NCHW44 agent stride-1 test per nonlinearity mode;
//! the test name suffix is token-pasted from _SUFFIX.
#define CB(_MODE, _SUFFIX)                                                        \
    TEST_F(FALLBACK_MULTI_THREADS,                                                \
           CONVBIAS_GI_NCHW_NCHW44_F32_S1_AGENT_IDENTITY_##_SUFFIX) {             \
        check_conv_bias(                                                          \
                conv_bias::get_nchw44_conv_bias_args(                             \
                        {2, 3, 5, 7}, {_MODE}, ONLY_BR_BIASMODE, 1, false, true), \
                handle(), "F32_CONV_AGENT_NCHW_NCHW44");                          \
    }
CB(param::ConvBias::NonlineMode::IDENTITY, IDENTITY);
CB(param::ConvBias::NonlineMode::RELU, RELU);
CB(param::ConvBias::NonlineMode::H_SWISH, H_SWISH);
CB(param::ConvBias::NonlineMode::SIGMOID, SIGMOID);
#undef CB
  355. std::vector<conv_bias::TestArg> get_nchw44_channel_wise_args(
  356. std::vector<size_t> kernel, size_t stride, bool no_bias, bool no_nonlinemode,
  357. bool no_full_bias) {
  358. using namespace conv_bias;
  359. using Param = param::ConvBias;
  360. using NLMode = param::ConvBias::NonlineMode;
  361. std::vector<TestArg> args;
  362. auto pack = [&](size_t n, size_t group, size_t w, size_t h, size_t kernel,
  363. size_t stride, NLMode nlmode, bool pad) {
  364. Param param;
  365. param.stride_h = stride;
  366. param.stride_w = stride;
  367. if (pad) {
  368. param.pad_h = kernel / 2;
  369. param.pad_w = kernel / 2;
  370. } else {
  371. param.pad_h = 0;
  372. param.pad_w = 0;
  373. }
  374. param.nonlineMode = nlmode;
  375. param.format = param::ConvBias::Format::NCHW44;
  376. param.sparse = param::ConvBias::Sparse::GROUP;
  377. args.emplace_back(
  378. param, TensorShape{n, group, h, w, 4},
  379. TensorShape{group, 1, 1, kernel, kernel, 4}, TensorShape{});
  380. if (!no_bias) {
  381. args.emplace_back(
  382. param, TensorShape{n, group, h, w, 4},
  383. TensorShape{group, 1, 1, kernel, kernel, 4},
  384. TensorShape{1, group, 1, 1, 4});
  385. }
  386. if (!no_full_bias) {
  387. args.emplace_back(
  388. param, TensorShape{n, group, h, w, 4},
  389. TensorShape{group, 1, 1, kernel, kernel, 4},
  390. TensorShape{
  391. n, group, (h + 2 * param.pad_w - kernel) / stride + 1,
  392. (w + 2 * param.pad_w - kernel) / stride + 1, 4});
  393. }
  394. };
  395. std::vector<NLMode> nonlinemode = {NLMode::IDENTITY};
  396. if (!no_nonlinemode) {
  397. nonlinemode.emplace_back(NLMode::RELU);
  398. nonlinemode.emplace_back(NLMode::H_SWISH);
  399. }
  400. for (size_t n : {1, 2}) {
  401. for (auto nlmode : nonlinemode) {
  402. for (bool pad : {true}) {
  403. for (size_t group : {1, 2, 4, 7, 16}) {
  404. for (size_t size : {4, 6, 7, 9, 20}) {
  405. for (size_t kern : kernel) {
  406. pack(n, group, size, size, kern, stride, nlmode, pad);
  407. }
  408. }
  409. }
  410. }
  411. for (bool pad : {false}) {
  412. for (size_t group : {1, 2, 7, 16}) {
  413. for (size_t size : {7, 9, 20}) {
  414. for (size_t kern : kernel) {
  415. pack(n, group, size, size, kern, stride, nlmode, pad);
  416. }
  417. }
  418. }
  419. }
  420. }
  421. }
  422. return args;
  423. }
  424. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_CHANNEL_WISE_STRIDE1_FP32_NCHW44_1) {
  425. check_conv_bias(
  426. get_nchw44_channel_wise_args({2, 3}, 1, false, false, false), handle(),
  427. "F32_CHANNEL_WISE_NCHW44");
  428. }
  429. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_CHANNEL_WISE_STRIDE1_FP32_NCHW44_2) {
  430. check_conv_bias(
  431. get_nchw44_channel_wise_args({5}, 1, false, false, false), handle(),
  432. "F32_CHANNEL_WISE_NCHW44");
  433. }
  434. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_CHANNEL_WISE_STRIDE2_FP32_NCHW44) {
  435. check_conv_bias(
  436. get_nchw44_channel_wise_args({2, 3, 5}, 2, false, false, false), handle(),
  437. "F32_CHANNEL_WISE_NCHW44");
  438. }
  439. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_DIRECT_FP32_NCHW44_S1_K7) {
  440. //! k=7 s=1
  441. check_conv_bias(
  442. conv_bias::get_nchw44_conv_bias_args(
  443. {7}, ONLY_IDENTITY_NLMODE, BR_AND_NO_BIASMODE, 1),
  444. handle(), "F32_CONV_NCHW44_DIRECT");
  445. }
  446. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_DIRECT_FP32_NCHW44_S1_K2K3) {
  447. check_conv_bias(
  448. conv_bias::get_nchw44_conv_bias_args(
  449. {2, 3}, FULL_NLMODE, ONLY_BR_BIASMODE, 1),
  450. handle(), "F32_CONV_NCHW44_DIRECT");
  451. }
  452. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_DIRECT_FP32_NCHW44_S1_K5) {
  453. check_conv_bias(
  454. conv_bias::get_nchw44_conv_bias_args({5}, FULL_NLMODE, ONLY_BR_BIASMODE, 1),
  455. handle(), "F32_CONV_NCHW44_DIRECT");
  456. }
  457. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_DIRECT_FP32_NCHW44_S2) {
  458. check_conv_bias(
  459. conv_bias::get_nchw44_conv_bias_args(
  460. {2, 3, 5, 7}, FULL_NLMODE, ONLY_BR_BIASMODE, 2),
  461. handle(), "F32_CONV_NCHW44_DIRECT");
  462. }
  463. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_DIRECT_FP32) {
  464. check_conv_bias(
  465. conv_bias::get_conv_bias_args(
  466. {1, 2, 3, 4, 5, 6, 7}, 1, false, false, false),
  467. handle(), "F32DIRECT");
  468. }
  469. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_DIRECT_FP32_STR2) {
  470. check_conv_bias(
  471. conv_bias::get_conv_bias_args({2, 3, 5, 7}, 2, false, false, false),
  472. handle(), "F32STRD2");
  473. }
  474. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_DIRECT_FP32_STR1) {
  475. check_conv_bias(
  476. conv_bias::get_conv_bias_args({2, 3, 5, 7}, 1, false, false, false),
  477. handle(), "F32STRD1");
  478. }
  479. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F23_4) {
  480. using namespace conv_bias;
  481. std::vector<TestArg> args = get_winograd_mk_packed_args();
  482. Checker<ConvBiasForward> checker(handle());
  483. check_winograd("4:2:32", checker, args, param::MatrixMul::Format::MK4);
  484. }
  485. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F23_4_NCHW44) {
  486. using namespace conv_bias;
  487. std::vector<TestArg> args =
  488. get_nchw44_conv_bias_args({3}, QUAN_NLMODE, BR_AND_NO_BIASMODE, 1);
  489. Checker<ConvBiasForward> checker(handle());
  490. check_winograd(
  491. "4:2:32", checker, args, param::MatrixMul::Format::MK4,
  492. param::ConvBias::Format::NCHW44);
  493. }
  494. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F23_4_WEIGHT_PREPROCESS) {
  495. using namespace conv_bias;
  496. std::vector<TestArg> args = get_winograd_mk_packed_args();
  497. Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
  498. handle());
  499. check_winograd("4:2:32", checker, args, param::MatrixMul::Format::MK4);
  500. }
  501. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F23_4_NCHW44_WEIGHT_PREPROCESS) {
  502. using namespace conv_bias;
  503. std::vector<TestArg> args =
  504. get_nchw44_conv_bias_args({3}, QUAN_NLMODE, BR_AND_NO_BIASMODE, 1);
  505. Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
  506. handle());
  507. check_winograd(
  508. "4:2:32", checker, args, param::MatrixMul::Format::MK4,
  509. param::ConvBias::Format::NCHW44);
  510. }
  511. TEST_F(FALLBACK, CONVBIAS_GI_WINOGRAD_F63_4) {
  512. using namespace conv_bias;
  513. std::vector<TestArg> args = get_winograd_mk_packed_args();
  514. Checker<ConvBiasForward> checker(handle());
  515. check_winograd("4:6:16", checker, args, param::MatrixMul::Format::MK4);
  516. }
  517. TEST_F(FALLBACK, CONVBIAS_GI_WINOGRAD_F63_4_WEIGHT_PREPROCESS) {
  518. using namespace conv_bias;
  519. std::vector<TestArg> args = get_winograd_mk_packed_args();
  520. Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
  521. handle());
  522. check_winograd("4:6:16", checker, args, param::MatrixMul::Format::MK4);
  523. }
  524. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F63) {
  525. using namespace conv_bias;
  526. std::vector<TestArg> args = get_winograd_args(3);
  527. Checker<ConvBiasForward> checker(handle());
  528. check_winograd("1:6:32", checker, args);
  529. }
  530. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F63_4) {
  531. using namespace conv_bias;
  532. std::vector<TestArg> args = get_winograd_mk_packed_args();
  533. Checker<ConvBiasForward> checker(handle());
  534. check_winograd("4:6:16", checker, args, param::MatrixMul::Format::MK4);
  535. }
  536. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F63_4_NCHW44) {
  537. using namespace conv_bias;
  538. std::vector<TestArg> args =
  539. get_nchw44_conv_bias_args({3}, QUAN_NLMODE, BR_AND_NO_BIASMODE, 1);
  540. Checker<ConvBiasForward> checker(handle());
  541. check_winograd(
  542. "4:6:16", checker, args, param::MatrixMul::Format::MK4,
  543. param::ConvBias::Format::NCHW44);
  544. }
  545. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F43_4_NCHW44) {
  546. using namespace conv_bias;
  547. std::vector<TestArg> args =
  548. get_nchw44_conv_bias_args({3}, QUAN_NLMODE, BR_AND_NO_BIASMODE, 1);
  549. Checker<ConvBiasForward> checker(handle());
  550. check_winograd(
  551. "4:4:16", checker, args, param::MatrixMul::Format::MK4,
  552. param::ConvBias::Format::NCHW44);
  553. }
  554. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F43_4_WEIGHT_PREPROCESS) {
  555. using namespace conv_bias;
  556. std::vector<TestArg> args = get_winograd_mk_packed_args();
  557. Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
  558. handle());
  559. check_winograd("4:4:16", checker, args, param::MatrixMul::Format::MK4);
  560. }
  561. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F54) {
  562. using namespace conv_bias;
  563. std::vector<TestArg> args = get_winograd_args(4);
  564. Checker<ConvBiasForward> checker(handle());
  565. check_winograd("1:5:32", checker, args);
  566. }
  567. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F45) {
  568. using namespace conv_bias;
  569. std::vector<TestArg> args = get_winograd_args(5);
  570. Checker<ConvBiasForward> checker(handle());
  571. check_winograd("1:4:32", checker, args);
  572. }
  573. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F63_WEIGHT_PREPROCESS) {
  574. using namespace conv_bias;
  575. std::vector<TestArg> args = get_winograd_args(3);
  576. Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
  577. handle());
  578. check_winograd("1:6:32", checker, args);
  579. }
  580. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F63_4_WEIGHT_PREPROCESS) {
  581. using namespace conv_bias;
  582. std::vector<TestArg> args = get_winograd_mk_packed_args();
  583. Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
  584. handle());
  585. check_winograd("4:6:16", checker, args, param::MatrixMul::Format::MK4);
  586. }
  587. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F63_4_NCHW44_WEIGHT_PREPROCESS) {
  588. using namespace conv_bias;
  589. std::vector<TestArg> args =
  590. get_nchw44_conv_bias_args({3}, QUAN_NLMODE, BR_AND_NO_BIASMODE, 1);
  591. Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
  592. handle());
  593. check_winograd(
  594. "4:6:16", checker, args, param::MatrixMul::Format::MK4,
  595. param::ConvBias::Format::NCHW44);
  596. }
  597. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F54_WEIGHT_PREPROCESS) {
  598. using namespace conv_bias;
  599. std::vector<TestArg> args = get_winograd_args(4);
  600. Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
  601. handle());
  602. check_winograd("1:5:32", checker, args);
  603. }
  604. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F45_WEIGHT_PREPROCESS) {
  605. using namespace conv_bias;
  606. std::vector<TestArg> args = get_winograd_args(5);
  607. Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
  608. handle());
  609. check_winograd("1:4:32", checker, args);
  610. }
  611. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_PREPROCESS_NCHW44) {
  612. using namespace conv_bias;
  613. std::vector<TestArg> nchw44_args = conv_bias::get_nchw44_conv_bias_args(
  614. {3}, QUAN_NLMODE, BR_AND_NO_BIASMODE, 1);
  615. Checker<ConvBiasForward> checker(handle());
  616. auto run = [&checker](
  617. const std::vector<TestArg>& args, DType A_dtype, DType B_dtype,
  618. DType C_dtype, DType D_dtype, const float eps) {
  619. for (auto&& arg : args) {
  620. checker.set_dtype(0, A_dtype)
  621. .set_dtype(1, B_dtype)
  622. .set_dtype(2, C_dtype)
  623. .set_dtype(4, D_dtype)
  624. .set_epsilon(eps)
  625. .set_param(arg.param)
  626. .execs({arg.src, arg.filter, arg.bias, {}, {}});
  627. }
  628. };
  629. //! uncomment this when low precision mode is ok
  630. // run(handle(), nchw44_args, {2, 6, 7}, dtype::Float32(), dtype::Float32(),
  631. // dtype::Float32(), dtype::Float32(), 1e-2f);
  632. //! remove this when low precision mode is ok
  633. run(nchw44_args, dtype::Float32(), dtype::Float32(), dtype::Float32(),
  634. dtype::Float32(), 1e-3f);
  635. }
  636. TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_FORWARD_QUANTIZED) {
  637. using namespace conv_bias;
  638. param::ConvBias cur_param;
  639. using NLMode = param::ConvBias::NonlineMode;
  640. std::vector<conv_bias::TestArg> args = get_conv_bias_args(
  641. {1, 3, 5, 7}, {0, 3}, {NLMode::IDENTITY, NLMode::H_SWISH, NLMode::RELU},
  642. {1, 2}, false, false);
  643. UniformIntRNG int_rng{-50, 50};
  644. float epsilon = 1e-3;
  645. checker_conv_bias(
  646. args, handle(), &int_rng, epsilon, dtype::QuantizedS8(2.5f),
  647. dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f),
  648. dtype::QuantizedS8(60.25f), "FALLBACK_NAIVE");
  649. }
  650. #if MEGDNN_WITH_BENCHMARK
  651. namespace {
  652. void benchmark_impl(
  653. const param::ConvBias param,
  654. std::vector<std::pair<SmallVector<TensorShape>, float>>& shapes_and_computation,
  655. const std::string algo_name, size_t RUNS,
  656. TaskExecutorConfig&& multi_thread_config,
  657. TaskExecutorConfig&& single_thread_config, std::vector<DType>& data_type) {
  658. std::vector<float> multi_thread_times, single_thread_times;
  659. {
  660. auto multi_thread_hanle = create_cpu_handle(0, true, &multi_thread_config);
  661. auto benchmarker = Benchmarker<ConvBias>(multi_thread_hanle.get());
  662. benchmarker.set_times(RUNS)
  663. .set_display(false)
  664. .set_param(param)
  665. .set_dtype(0, data_type[0])
  666. .set_dtype(1, data_type[1])
  667. .set_dtype(2, data_type[2])
  668. .set_dtype(4, data_type[3])
  669. .set_before_exec_callback(
  670. conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name.c_str()));
  671. for (auto shape : shapes_and_computation) {
  672. multi_thread_times.push_back(benchmarker.exec(shape.first) / RUNS);
  673. }
  674. }
  675. {
  676. auto single_thread_handle = create_cpu_handle(0, true, &single_thread_config);
  677. auto benchmarker = Benchmarker<ConvBias>(single_thread_handle.get());
  678. benchmarker.set_times(RUNS)
  679. .set_display(false)
  680. .set_param(param)
  681. .set_dtype(0, data_type[0])
  682. .set_dtype(1, data_type[1])
  683. .set_dtype(2, data_type[2])
  684. .set_dtype(4, data_type[3])
  685. .set_before_exec_callback(
  686. conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name.c_str()));
  687. for (auto shape : shapes_and_computation) {
  688. single_thread_times.push_back(benchmarker.exec(shape.first) / RUNS);
  689. }
  690. }
  691. printf("Benchmark : Multi threads %zu, ", multi_thread_config.nr_thread);
  692. printf("core_ids:");
  693. for (size_t i = 0; i < multi_thread_config.affinity_core_set.size(); i++) {
  694. printf("%zu ", multi_thread_config.affinity_core_set[i]);
  695. }
  696. printf(", Single thread core_id %zu\n", single_thread_config.affinity_core_set[0]);
  697. for (size_t i = 0; i < shapes_and_computation.size(); i++) {
  698. auto shapes = shapes_and_computation[i];
  699. printf("Bench case: ");
  700. for (auto&& shape : shapes.first) {
  701. printf("%s ", shape.to_string().c_str());
  702. }
  703. float computations = shapes.second;
  704. printf("%zu threads gflops: %f,\n single thread gflops: "
  705. "%f. spead up = %f, speedup/cores=%f\n",
  706. multi_thread_config.nr_thread, computations / multi_thread_times[i],
  707. computations / single_thread_times[i],
  708. single_thread_times[i] / multi_thread_times[i],
  709. single_thread_times[i] / multi_thread_times[i] /
  710. multi_thread_config.nr_thread);
  711. }
  712. }
  713. } // namespace
  714. TEST_F(FALLBACK_MULTI_THREADS, BENCHMARK_GI_CONVBIAS_DIRECTF32) {
  715. constexpr size_t RUNS = 50;
  716. param::ConvBias param;
  717. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  718. param.pad_h = 1;
  719. param.pad_w = 1;
  720. param.stride_h = 1;
  721. param.stride_w = 1;
  722. param.sparse = param::ConvBias::Sparse::GROUP;
  723. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  724. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  725. size_t group) {
  726. SmallVector<TensorShape> shapes{
  727. {N, IC, H, W},
  728. {group, OC / group, IC / group, FS, FS},
  729. {1, OC, 1, 1},
  730. {},
  731. {N, OC, H, W}};
  732. TensorShape dst{N, OC, H, W};
  733. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  734. dst.total_nr_elems()) *
  735. 1e-6;
  736. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  737. };
  738. bench_case(1, 32, 32, 200, 200, 3, 4);
  739. bench_case(1, 32, 32, 200, 200, 3, 32);
  740. bench_case(1, 32, 32, 128, 128, 3, 4);
  741. bench_case(1, 32, 32, 128, 128, 3, 32);
  742. bench_case(1, 32, 32, 100, 100, 3, 4);
  743. bench_case(1, 32, 32, 100, 100, 3, 32);
  744. bench_case(1, 32, 32, 80, 80, 3, 4);
  745. bench_case(1, 32, 32, 80, 80, 3, 32);
  746. std::string algo_name = "F32DIRECT";
  747. printf("Benchmark F32DIRECT_LARGE_GROUP algo\n");
  748. std::vector<DType> data_type = {
  749. dtype::Float32(), dtype::Float32(), dtype::Float32(), dtype::Float32()};
  750. benchmark_impl(
  751. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  752. data_type);
  753. benchmark_impl(
  754. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  755. data_type);
  756. benchmark_impl(
  757. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  758. data_type);
  759. shapes_and_computation.clear();
  760. algo_name = "F32DIRECT";
  761. printf("Benchmark F32DIRECT_SMALL_GROUP algo\n");
  762. bench_case(1, 32, 32, 200, 200, 3, 1);
  763. bench_case(1, 32, 32, 128, 128, 3, 1);
  764. bench_case(1, 32, 32, 100, 100, 3, 1);
  765. bench_case(1, 32, 32, 80, 80, 3, 1);
  766. benchmark_impl(
  767. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  768. data_type);
  769. benchmark_impl(
  770. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  771. data_type);
  772. benchmark_impl(
  773. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  774. data_type);
  775. }
  776. TEST_F(FALLBACK_MULTI_THREADS, BENCHMARK_GI_CONVBIAS_DIRECTF32_STR1) {
  777. constexpr size_t RUNS = 50;
  778. param::ConvBias param;
  779. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  780. param.pad_h = 1;
  781. param.pad_w = 1;
  782. param.stride_h = 1;
  783. param.stride_w = 1;
  784. param.sparse = param::ConvBias::Sparse::GROUP;
  785. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  786. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  787. size_t group) {
  788. SmallVector<TensorShape> shapes{
  789. {N, IC, H, W},
  790. {group, OC / group, IC / group, FS, FS},
  791. {1, OC, 1, 1},
  792. {},
  793. {N, OC, H, W}};
  794. TensorShape dst{N, OC, H, W};
  795. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  796. dst.total_nr_elems()) *
  797. 1e-6;
  798. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  799. };
  800. bench_case(1, 32, 32, 200, 200, 3, 4);
  801. bench_case(1, 32, 32, 200, 200, 3, 32);
  802. bench_case(1, 32, 32, 128, 128, 3, 4);
  803. bench_case(1, 32, 32, 128, 128, 3, 32);
  804. bench_case(1, 32, 32, 100, 100, 3, 4);
  805. bench_case(1, 32, 32, 100, 100, 3, 32);
  806. bench_case(1, 32, 32, 80, 80, 3, 4);
  807. bench_case(1, 32, 32, 80, 80, 3, 32);
  808. std::string algo_name = "F32STRD1";
  809. printf("Benchmark F32STRD1_LARGE_GROUP algo\n");
  810. std::vector<DType> data_type = {
  811. dtype::Float32(), dtype::Float32(), dtype::Float32(), dtype::Float32()};
  812. benchmark_impl(
  813. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  814. data_type);
  815. benchmark_impl(
  816. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  817. data_type);
  818. benchmark_impl(
  819. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  820. data_type);
  821. shapes_and_computation.clear();
  822. algo_name = "F32STRD1";
  823. printf("Benchmark F32STRD1_SMALL_GROUP algo\n");
  824. bench_case(1, 32, 32, 200, 200, 3, 1);
  825. bench_case(1, 32, 32, 128, 128, 3, 1);
  826. bench_case(1, 32, 32, 100, 100, 3, 1);
  827. bench_case(1, 32, 32, 80, 80, 3, 1);
  828. benchmark_impl(
  829. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  830. data_type);
  831. benchmark_impl(
  832. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  833. data_type);
  834. benchmark_impl(
  835. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  836. data_type);
  837. }
  838. TEST_F(FALLBACK_MULTI_THREADS, BENCHMARK_GI_CONVBIAS_DIRECTF32_STR2) {
  839. constexpr size_t RUNS = 50;
  840. param::ConvBias param;
  841. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  842. param.pad_h = 1;
  843. param.pad_w = 1;
  844. param.stride_h = 2;
  845. param.stride_w = 2;
  846. param.sparse = param::ConvBias::Sparse::GROUP;
  847. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  848. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  849. size_t group, size_t P, size_t S) {
  850. SmallVector<TensorShape> shapes{
  851. {N, IC, H, W},
  852. {group, OC / group, IC / group, FS, FS},
  853. {1, OC, 1, 1},
  854. {},
  855. {N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1}};
  856. TensorShape dst{N, OC, H, W};
  857. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  858. dst.total_nr_elems()) *
  859. 1e-6;
  860. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  861. };
  862. bench_case(1, 32, 32, 200, 200, 3, 4, 1, 2);
  863. bench_case(1, 32, 32, 200, 200, 3, 32, 1, 2);
  864. bench_case(1, 32, 32, 128, 128, 3, 4, 1, 2);
  865. bench_case(1, 32, 32, 128, 128, 3, 32, 1, 2);
  866. bench_case(1, 32, 32, 100, 100, 3, 4, 1, 2);
  867. bench_case(1, 32, 32, 100, 100, 3, 32, 1, 2);
  868. bench_case(1, 32, 32, 80, 80, 3, 4, 1, 2);
  869. bench_case(1, 32, 32, 80, 80, 3, 32, 1, 2);
  870. std::string algo_name = "F32STRD2";
  871. printf("Benchmark F32STRD2_LARGE_GROUP algo\n");
  872. std::vector<DType> data_type = {
  873. dtype::Float32(), dtype::Float32(), dtype::Float32(), dtype::Float32()};
  874. benchmark_impl(
  875. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  876. data_type);
  877. benchmark_impl(
  878. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  879. data_type);
  880. benchmark_impl(
  881. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  882. data_type);
  883. shapes_and_computation.clear();
  884. algo_name = "F32STRD2";
  885. printf("Benchmark F32STRD2_SMALL_GROUP algo\n");
  886. bench_case(1, 32, 32, 200, 200, 3, 1, 1, 2);
  887. bench_case(1, 32, 32, 128, 128, 3, 1, 1, 2);
  888. bench_case(1, 32, 32, 100, 100, 3, 1, 1, 2);
  889. bench_case(1, 32, 32, 80, 80, 3, 1, 1, 2);
  890. benchmark_impl(
  891. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  892. data_type);
  893. benchmark_impl(
  894. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  895. data_type);
  896. benchmark_impl(
  897. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  898. data_type);
  899. }
  900. TEST_F(FALLBACK, BENCHMARK_GI_CHANNEL_WISE_F32_STRIDE1_NCHW44) {
  901. // have to remove preferred restrict in usable func before run the benchmark
  902. using namespace conv_bias;
  903. param::ConvBias param;
  904. param.stride_h = 1;
  905. param.stride_w = 1;
  906. param.pad_h = 1;
  907. param.pad_w = 1;
  908. param.nonlineMode = NonlineMode::RELU;
  909. param.sparse = param::ConvBias::Sparse::GROUP;
  910. constexpr size_t RUN = 50;
  911. Benchmarker<ConvBias> benchmark0(handle());
  912. benchmark0.set_display(false);
  913. benchmark0.set_param(param);
  914. benchmark0.set_times(RUN);
  915. benchmark0.set_before_exec_callback(
  916. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("F32STRD1"));
  917. auto opr = handle()->create_operator<ConvBias>();
  918. opr->param() = param;
  919. param.format = param::ConvBias::Format::NCHW44;
  920. Benchmarker<ConvBias> benchmark1(handle());
  921. benchmark1.set_display(false);
  922. benchmark1.set_param(param);
  923. benchmark1.set_times(RUN);
  924. benchmark1.set_before_exec_callback(
  925. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("F32_CHANNEL_WISE_NCHW44"));
  926. auto run = [&](size_t group, size_t w, size_t h, size_t kernel) {
  927. TensorLayout dst_layout;
  928. opr->deduce_layout(
  929. {{1, group * 4, h, w}, dtype::Int8()},
  930. {{group * 4, 1, 1, kernel, kernel}, dtype::Int8()},
  931. {{1, group * 4, 1, 1}, dtype::Int32()}, {}, dst_layout);
  932. //! dst.nr_elems * IC * FH * FW * 2
  933. float computations = dst_layout.total_nr_elems() * kernel * kernel * 2.0 /
  934. (1024 * 1024 * 1024) * 1e3;
  935. auto used0 = benchmark0.exec(
  936. {{1, group * 4, h, w},
  937. {group * 4, 1, 1, kernel, kernel},
  938. {1, group * 4, 1, 1},
  939. {},
  940. {}}) /
  941. RUN;
  942. auto used1 = benchmark1.exec(
  943. {{1, group, h, w, 4},
  944. {group, 1, 1, kernel, kernel, 4},
  945. {1, group, 1, 1, 4},
  946. {},
  947. {}}) /
  948. RUN;
  949. printf("group/h/w/kernel:%zu,%zu,%zu,%zu: nchw: %f ms %f Gflops "
  950. "nchw44: "
  951. "%f ms %f GFlops "
  952. "speedup: %f\n",
  953. group, h, w, kernel, used0, computations / used0, used1,
  954. computations / used1, used0 / used1);
  955. };
  956. for (size_t group : {8, 16, 32, 64}) {
  957. for (size_t kerenl : {2, 3, 5}) {
  958. run(group, 112, 112, kerenl);
  959. run(group, 56, 56, kerenl);
  960. run(group, 48, 48, kerenl);
  961. run(group, 28, 28, kerenl);
  962. run(group, 14, 14, kerenl);
  963. }
  964. }
  965. run(8, 112, 112, 3);
  966. run(32, 56, 56, 3);
  967. run(64, 28, 28, 3);
  968. run(128, 14, 14, 3);
  969. }
  970. TEST_F(FALLBACK, BENCHMARK_GI_CHANNEL_WISE_F32_STRIDE2_NCHW44) {
  971. // have to remove preferred restrict in usable func before run the benchmark
  972. using namespace conv_bias;
  973. param::ConvBias param;
  974. param.stride_h = 2;
  975. param.stride_w = 2;
  976. param.pad_h = 1;
  977. param.pad_w = 1;
  978. param.nonlineMode = NonlineMode::RELU;
  979. param.sparse = param::ConvBias::Sparse::GROUP;
  980. constexpr size_t RUN = 50;
  981. Benchmarker<ConvBias> benchmark0(handle());
  982. benchmark0.set_display(false);
  983. benchmark0.set_param(param);
  984. benchmark0.set_times(RUN);
  985. benchmark0.set_before_exec_callback(
  986. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("F32STRD2"));
  987. auto opr = handle()->create_operator<ConvBias>();
  988. opr->param() = param;
  989. param.format = param::ConvBias::Format::NCHW44;
  990. Benchmarker<ConvBias> benchmark1(handle());
  991. benchmark1.set_display(false);
  992. benchmark1.set_param(param);
  993. benchmark1.set_times(RUN);
  994. benchmark1.set_before_exec_callback(
  995. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("F32_CHANNEL_WISE_NCHW44"));
  996. auto run = [&](size_t group, size_t w, size_t h, size_t kernel) {
  997. TensorLayout dst_layout;
  998. opr->deduce_layout(
  999. {{1, group * 4, h, w}, dtype::Int8()},
  1000. {{group * 4, 1, 1, kernel, kernel}, dtype::Int8()},
  1001. {{1, group * 4, 1, 1}, dtype::Int32()}, {}, dst_layout);
  1002. //! dst.nr_elems * IC * FH * FW * 2
  1003. float computations = dst_layout.total_nr_elems() * kernel * kernel * 2.0 /
  1004. (1024 * 1024 * 1024) * 1e3;
  1005. auto used0 = benchmark0.exec(
  1006. {{1, group * 4, h, w},
  1007. {group * 4, 1, 1, kernel, kernel},
  1008. {1, group * 4, 1, 1},
  1009. {},
  1010. {}}) /
  1011. RUN;
  1012. auto used1 = benchmark1.exec(
  1013. {{1, group, h, w, 4},
  1014. {group, 1, 1, kernel, kernel, 4},
  1015. {1, group, 1, 1, 4},
  1016. {},
  1017. {}}) /
  1018. RUN;
  1019. printf("group/h/w/kernel:%zu,%zu,%zu,%zu: nchw: %f ms %f Gflops "
  1020. "nchw44: "
  1021. "%f ms %f GFlops "
  1022. "speedup: %f\n",
  1023. group, h, w, kernel, used0, computations / used0, used1,
  1024. computations / used1, used0 / used1);
  1025. };
  1026. for (size_t group : {8, 16, 32, 64}) {
  1027. for (size_t kerenl : {2, 3, 5}) {
  1028. run(group, 112, 112, kerenl);
  1029. run(group, 56, 56, kerenl);
  1030. run(group, 48, 48, kerenl);
  1031. run(group, 28, 28, kerenl);
  1032. run(group, 14, 14, kerenl);
  1033. }
  1034. }
  1035. run(8, 112, 112, 3);
  1036. run(32, 56, 56, 3);
  1037. run(64, 28, 28, 3);
  1038. run(128, 14, 14, 3);
  1039. }
  1040. TEST_F(FALLBACK, BENCHMARK_CONVBIAS) {
  1041. constexpr size_t RUNS = 10;
  1042. param::ConvBias param;
  1043. param.stride_h = 1;
  1044. param.stride_w = 1;
  1045. Benchmarker<ConvBias> benchmarker_int(handle());
  1046. benchmarker_int.set_times(RUNS)
  1047. .set_dtype(0, dtype::QuantizedS8(2.5f))
  1048. .set_dtype(1, dtype::QuantizedS8(2.5f))
  1049. .set_dtype(2, dtype::QuantizedS32(6.25f))
  1050. .set_dtype(4, dtype::QuantizedS8(40.25f))
  1051. .set_display(false);
  1052. Benchmarker<ConvBias> benchmarker_float(handle());
  1053. benchmarker_float.set_display(false).set_times(RUNS);
  1054. auto run = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS) {
  1055. TensorShape src({N, IC, H, W}), filter({OC, IC, FS, FS}), bias({N, OC, 1, 1}),
  1056. z({}), dst({N, OC, H, W});
  1057. param.pad_h = FS / 2;
  1058. param.pad_w = FS / 2;
  1059. auto int_used =
  1060. benchmarker_int.set_param(param).exec({src, filter, bias, z, dst}) /
  1061. RUNS;
  1062. auto float_used =
  1063. benchmarker_float.set_param(param).exec({src, filter, bias, z, dst}) /
  1064. RUNS;
  1065. float computations = IC * (FS * FS + 1) * dst.total_nr_elems() * 2 * 1e-6;
  1066. printf("run: %s %s %s->%s \nfloat: %f ms %f Gflops int: %f ms "
  1067. "%f Gflops speedup: %f\n",
  1068. src.to_string().c_str(), filter.to_string().c_str(),
  1069. bias.to_string().c_str(), dst.to_string().c_str(), float_used,
  1070. computations / float_used, int_used, computations / int_used,
  1071. float_used / int_used);
  1072. };
  1073. run(1, 128, 128, 32, 32, 3);
  1074. for (size_t IC : {32, 64, 128}) {
  1075. for (size_t OC : {32, 64, 128}) {
  1076. for (size_t size : {28, 56}) {
  1077. for (size_t FS : {3, 5}) {
  1078. run(1, IC, OC, size, size, FS);
  1079. }
  1080. }
  1081. }
  1082. }
  1083. }
//! Benchmark the MK4-packed winograd conv_bias, selecting the matmul backend
//! per target ISA at compile time (aarch64 / armv7 / generic fallback).
//! NOTE(review): the trailing ":4:2" in the algo string presumably encodes
//! pack size 4 and output tile 2 ("F23") — confirm against the winograd
//! algo-name parser.
TEST_F(FALLBACK, BENCHMARK_GI_CONVBIAS_WINOGRAD_F23_4x4) {
#if MEGDNN_AARCH64
    conv_bias::benchmark_winograd("WINOGRAD:AARCH64_F32_MK4_4x16:4:2", handle(), 3, 4);
#elif MEGDNN_ARMV7
    conv_bias::benchmark_winograd("WINOGRAD:ARMV7_F32_MK4_4x8:4:2", handle(), 3, 4);
#else
    conv_bias::benchmark_winograd("WINOGRAD:FB_GI_F32_MK4_4x8:4:2", handle(), 3, 4);
#endif
}
  1093. void benchmark_winograd_nchw_vs_nchw44(
  1094. const char* algo_name0, const char* algo_name1, Handle* handle) {
  1095. using namespace conv_bias;
  1096. using NLMode = param::ConvBias::NonlineMode;
  1097. std::vector<conv_bias::TestArg> args_nchw44;
  1098. std::vector<conv_bias::TestArg> args_nchw;
  1099. auto pack = [&](size_t n, size_t oc, size_t ic, size_t h, size_t w, size_t group,
  1100. NLMode nlmode) {
  1101. param::ConvBias param;
  1102. param.format = param::ConvBias::Format::NCHW44;
  1103. param.stride_h = 1;
  1104. param.stride_w = 1;
  1105. param.pad_h = 1;
  1106. param.pad_w = 1;
  1107. param.nonlineMode = nlmode;
  1108. if (group == 1) {
  1109. param.sparse = param::ConvBias::Sparse::DENSE;
  1110. args_nchw44.emplace_back(
  1111. param, TensorShape{n, ic / 4, h, w, 4},
  1112. TensorShape{oc / 4, ic / 4, 3, 3, 4, 4}, TensorShape{});
  1113. param.format = param::ConvBias::Format::NCHW;
  1114. args_nchw.emplace_back(
  1115. param, TensorShape{n, ic, h, w}, TensorShape{oc, ic, 3, 3},
  1116. TensorShape{});
  1117. } else {
  1118. auto oc_per_group = oc / group;
  1119. auto ic_per_group = ic / group;
  1120. param.sparse = param::ConvBias::Sparse::GROUP;
  1121. args_nchw44.emplace_back(
  1122. param, TensorShape{n, ic_per_group / 4, h, w, 4},
  1123. TensorShape{group, oc_per_group / 4, ic_per_group / 4, 3, 3, 4, 4},
  1124. TensorShape{});
  1125. param.format = param::ConvBias::Format::NCHW;
  1126. args_nchw.emplace_back(
  1127. param, TensorShape{n, ic, h, w},
  1128. TensorShape{group, oc_per_group, ic_per_group, 3, 3},
  1129. TensorShape{});
  1130. }
  1131. };
  1132. std::vector<NLMode> nonlinemode = {NLMode::IDENTITY};
  1133. for (auto nlmode : nonlinemode)
  1134. for (size_t n : {1})
  1135. for (size_t group = 1; group <= 1; ++group) {
  1136. pack(n, 512, 512, 15, 15, group, nlmode);
  1137. pack(n, 512, 256, 15, 15, group, nlmode);
  1138. pack(n, 256, 256, 29, 29, group, nlmode);
  1139. pack(n, 256, 128, 29, 29, group, nlmode);
  1140. pack(n, 128, 128, 57, 57, group, nlmode);
  1141. pack(n, 128, 64, 57, 57, group, nlmode);
  1142. pack(n, 24, 24, 224, 224, group, nlmode);
  1143. pack(n, 64, 24, 123, 123, group, nlmode);
  1144. pack(n, 64, 64, 56, 56, group, nlmode);
  1145. pack(n, 128, 128, 28, 28, group, nlmode);
  1146. pack(n, 256, 256, 14, 14, group, nlmode);
  1147. pack(n, 512, 512, 7, 7, group, nlmode);
  1148. }
  1149. using namespace conv_bias;
  1150. constexpr size_t RUN = 10;
  1151. Benchmarker<ConvBias> benchmark_winograd_nchw(handle);
  1152. benchmark_winograd_nchw.set_display(false);
  1153. benchmark_winograd_nchw.set_times(RUN);
  1154. Benchmarker<ConvBias> benchmark_winograd_nchw44(handle);
  1155. benchmark_winograd_nchw44.set_display(false);
  1156. benchmark_winograd_nchw44.set_times(RUN);
  1157. std::string winograd_nchw_algo_name = ssprintf("WINOGRAD:%s", algo_name0);
  1158. std::string winograd_nchw44_algo_name = ssprintf("WINOGRAD_NCHW44:%s", algo_name1);
  1159. for (size_t i = 0; i < args_nchw.size(); ++i) {
  1160. auto arg_nchw = args_nchw[i];
  1161. auto arg_nchw44 = args_nchw44[i];
  1162. TensorLayout dst_layout;
  1163. auto opr = handle->create_operator<ConvBias>();
  1164. opr->param() = arg_nchw.param;
  1165. opr->deduce_layout(
  1166. {arg_nchw.src, dtype::Float32()}, {arg_nchw.filter, dtype::Float32()},
  1167. {arg_nchw.bias, dtype::Float32()}, {}, dst_layout);
  1168. //! dst.nr_elems * IC * FH * FW * 2
  1169. float computations = dst_layout.total_nr_elems() * arg_nchw.filter[1] *
  1170. arg_nchw.filter[2] * arg_nchw.filter[3] * 2.0 /
  1171. (1024 * 1024 * 1024) * 1e3;
  1172. benchmark_winograd_nchw.set_param(arg_nchw.param);
  1173. auto nchw_used = algo_benchmark<ConvBias>(
  1174. benchmark_winograd_nchw,
  1175. {arg_nchw.src, arg_nchw.filter, {}, {}, {}},
  1176. winograd_nchw_algo_name.c_str()) /
  1177. RUN;
  1178. benchmark_winograd_nchw44.set_param(arg_nchw44.param);
  1179. auto nchw44_used = algo_benchmark<ConvBias>(
  1180. benchmark_winograd_nchw44,
  1181. {arg_nchw44.src, arg_nchw44.filter, {}, {}, {}},
  1182. winograd_nchw44_algo_name.c_str()) /
  1183. RUN;
  1184. printf("%s %s: nchw: %f ms %f Gflops nchw44: %f ms %f GFlops "
  1185. "speedup: "
  1186. "%f\n",
  1187. arg_nchw.src.to_string().c_str(), arg_nchw.filter.to_string().c_str(),
  1188. nchw_used, computations / nchw_used, nchw44_used,
  1189. computations / nchw44_used, nchw_used / nchw44_used);
  1190. }
  1191. }
//! Compare winograd conv_bias in NCHW vs NCHW44 layout on the MK4 matmul
//! backend selected per target ISA at compile time.
//! NOTE(review): ":4:2" presumably encodes pack size 4 / output tile 2
//! ("F23") — confirm against the winograd algo-name parser.
TEST_F(FALLBACK, BENCHMARK_GI_CONVBIAS_WINOGRAD_F23_MK4_NCHW_VS_NCHW44) {
#if MEGDNN_AARCH64
    benchmark_winograd_nchw_vs_nchw44(
            "AARCH64_F32_MK4_4x16:4:2", "AARCH64_F32_MK4_4x16:4:2", handle());
#elif MEGDNN_ARMV7
    benchmark_winograd_nchw_vs_nchw44(
            "ARMV7_F32_MK4_4x8:4:2", "ARMV7_F32_MK4_4x8:4:2", handle());
#else
    benchmark_winograd_nchw_vs_nchw44(
            "FB_GI_F32_MK4_4x8:4:2", "FB_GI_F32_MK4_4x8:4:2", handle());
#endif
}
//! Benchmark the MK4-packed winograd conv_bias with the larger output tile,
//! selecting the matmul backend per target ISA at compile time.
//! NOTE(review): ":4:6" presumably encodes pack size 4 / output tile 6
//! ("F63") — confirm against the winograd algo-name parser.
TEST_F(FALLBACK, BENCHMARK_GI_CONVBIAS_WINOGRAD_F63_4x4) {
#if MEGDNN_AARCH64
    conv_bias::benchmark_winograd("WINOGRAD:AARCH64_F32_MK4_4x16:4:6", handle(), 3, 4);
#elif MEGDNN_ARMV7
    conv_bias::benchmark_winograd("WINOGRAD:ARMV7_F32_MK4_4x8:4:6", handle(), 3, 4);
#else
    conv_bias::benchmark_winograd("WINOGRAD:FB_GI_F32_MK4_4x8:4:6", handle(), 3, 4);
#endif
}
//! Compare winograd conv_bias in NCHW vs NCHW44 layout on the MK4 matmul
//! backend selected per target ISA at compile time.
//! NOTE(review): ":4:6" presumably encodes pack size 4 / output tile 6
//! ("F63") — confirm against the winograd algo-name parser.
TEST_F(FALLBACK, BENCHMARK_GI_CONVBIAS_WINOGRAD_F63_MK4_NCHW_VS_NCHW44) {
#if MEGDNN_AARCH64
    benchmark_winograd_nchw_vs_nchw44(
            "AARCH64_F32_MK4_4x16:4:6", "AARCH64_F32_MK4_4x16:4:6", handle());
#elif MEGDNN_ARMV7
    benchmark_winograd_nchw_vs_nchw44(
            "ARMV7_F32_MK4_4x8:4:6", "ARMV7_F32_MK4_4x8:4:6", handle());
#else
    benchmark_winograd_nchw_vs_nchw44(
            "FB_GI_F32_MK4_4x8:4:6", "FB_GI_F32_MK4_4x8:4:6", handle());
#endif
}
  1225. #endif
  1226. } // namespace test
  1227. } // namespace megdnn
  1228. // vim: syntax=cpp.doxygen