You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

conv_bias.cpp 93 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305
  1. /**
  2. * \file dnn/test/x86/conv_bias.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. */
  11. #include "src/x86/utils.h"
  12. #include "test/x86/fixture.h"
  13. #include "megdnn/opr_param_defs.h"
  14. #include "megdnn/oprs.h"
  15. #include "test/common/benchmarker.h"
  16. #include "test/common/checker.h"
  17. #include "test/common/conv_bias.h"
  18. #include "test/common/rng.h"
  19. #include "test/common/tensor.h"
  20. #include "test/common/workspace_wrapper.h"
  21. namespace megdnn {
  22. namespace test {
  23. TEST_F(X86, CONV_BIAS_FORWARD) {
  24. using namespace conv_bias;
  25. std::vector<TestArg> args = get_args();
  26. Checker<ConvBiasForward> checker(handle());
  27. NormalRNG default_rng;
  28. ConstValue const_val;
  29. for (auto&& arg : args) {
  30. checker.set_dtype(0, dtype::Float32())
  31. .set_dtype(1, dtype::Float32())
  32. .set_dtype(2, dtype::Float32())
  33. .set_rng(0, &default_rng)
  34. .set_rng(1, &default_rng)
  35. .set_rng(2, &default_rng)
  36. .set_epsilon(1e-3)
  37. .set_param(arg.param)
  38. .execs({arg.src, arg.filter, arg.bias, {}, {}});
  39. }
  40. }
  41. TEST_F(X86_MULTI_THREADS, AVX2_CHANWISE_DIRECT_STRIDE1_INT8x8x32) {
  42. using namespace conv_bias;
  43. std::vector<TestArg> args;
  44. auto run = [&](size_t ic, size_t w, size_t h, size_t kernel, size_t p,
  45. NonlineMode nonline_mode) {
  46. if (w + 2 * p < kernel || h + 2 * p < kernel)
  47. return;
  48. param::ConvBias param;
  49. param.stride_h = 1;
  50. param.stride_w = 1;
  51. param.pad_h = p;
  52. param.pad_w = p;
  53. param.nonlineMode = nonline_mode;
  54. param.sparse = param::ConvBias::Sparse::GROUP;
  55. //! no bias
  56. args.emplace_back(param, TensorShape{2, ic, h, w},
  57. TensorShape{ic, 1, 1, kernel, kernel}, TensorShape{});
  58. //! bias channel
  59. args.emplace_back(param, TensorShape{2, ic, h, w},
  60. TensorShape{ic, 1, 1, kernel, kernel},
  61. TensorShape{1, ic, 1, 1});
  62. };
  63. for (size_t kernel : {2, 3, 5, 7})
  64. for (size_t pad : {0, 1})
  65. for (size_t ic : {1, 5, 17, 20})
  66. for (size_t h : {7, 16, 38, 40})
  67. for (size_t w : {16, 25, 40, 55})
  68. for (NonlineMode nonline_mode : {NonlineMode::IDENTITY})
  69. run(ic, w, h, kernel, pad, nonline_mode);
  70. Checker<ConvBias> checker(handle());
  71. UniformIntRNG rng{-50, 50};
  72. checker.set_dtype(0, dtype::Int8())
  73. .set_dtype(1, dtype::Int8())
  74. .set_dtype(2, dtype::Int32())
  75. .set_dtype(4, dtype::Int32())
  76. .set_rng(0, &rng)
  77. .set_rng(1, &rng)
  78. .set_rng(2, &rng)
  79. .set_epsilon(1e-3);
  80. checker.set_before_exec_callback(
  81. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  82. "X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1"));
  83. for (auto&& arg : args) {
  84. checker.set_param(arg.param).exec(
  85. {arg.src, arg.filter, arg.bias, {}, {}});
  86. }
  87. }
  88. TEST_F(X86_MULTI_THREADS, AVX2_CHANWISE_DIRECT_STRIDE1_QuantizedS32) {
  89. using namespace conv_bias;
  90. std::vector<TestArg> args;
  91. auto run = [&](size_t ic, size_t w, size_t h, size_t kernel, size_t p,
  92. NonlineMode nonline_mode) {
  93. if (w + 2 * p < kernel || h + 2 * p < kernel)
  94. return;
  95. param::ConvBias param;
  96. param.stride_h = 1;
  97. param.stride_w = 1;
  98. param.pad_h = p;
  99. param.pad_w = p;
  100. param.nonlineMode = nonline_mode;
  101. param.sparse = param::ConvBias::Sparse::GROUP;
  102. //! no bias
  103. args.emplace_back(param, TensorShape{2, ic, h, w},
  104. TensorShape{ic, 1, 1, kernel, kernel}, TensorShape{});
  105. //! bias channel
  106. args.emplace_back(param, TensorShape{2, ic, h, w},
  107. TensorShape{ic, 1, 1, kernel, kernel},
  108. TensorShape{1, ic, 1, 1});
  109. };
  110. for (size_t kernel : {2, 3, 5, 7})
  111. for (size_t pad : {0, 1})
  112. for (size_t ic : {1, 3, 5, 7, 17})
  113. for (size_t h : {10, 17, 25, 30})
  114. for (size_t w : {19, 28, 58, 168})
  115. for (NonlineMode nonline_mode : {NonlineMode::IDENTITY})
  116. run(ic, w, h, kernel, pad, nonline_mode);
  117. Checker<ConvBias> checker(handle());
  118. UniformIntRNG rng{-50, 50};
  119. checker.set_dtype(0, dtype::QuantizedS8(2.5f))
  120. .set_dtype(1, dtype::QuantizedS8(2.5f))
  121. .set_dtype(2, dtype::QuantizedS32(6.25f))
  122. .set_dtype(4, {})
  123. .set_rng(0, &rng)
  124. .set_rng(1, &rng)
  125. .set_rng(2, &rng)
  126. .set_epsilon(1e-3);
  127. checker.set_before_exec_callback(
  128. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  129. "X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1"));
  130. for (auto&& arg : args) {
  131. checker.set_param(arg.param).exec(
  132. {arg.src, arg.filter, arg.bias, {}, {}});
  133. }
  134. }
  135. TEST_F(X86_MULTI_THREADS, AVX2_CHANWISE_DIRECT_STRIDE1_QuantizedS8x8x8) {
  136. using namespace conv_bias;
  137. std::vector<TestArg> args;
  138. auto run = [&](size_t ic, size_t w, size_t h, size_t kernel, size_t p,
  139. NonlineMode nonline_mode) {
  140. if (w + 2 * p < kernel || h + 2 * p < kernel)
  141. return;
  142. param::ConvBias param;
  143. param.stride_h = 1;
  144. param.stride_w = 1;
  145. param.pad_h = p;
  146. param.pad_w = p;
  147. param.nonlineMode = nonline_mode;
  148. param.sparse = param::ConvBias::Sparse::GROUP;
  149. //! no bias
  150. args.emplace_back(param, TensorShape{2, ic, h, w},
  151. TensorShape{ic, 1, 1, kernel, kernel}, TensorShape{});
  152. //! bias channel
  153. args.emplace_back(param, TensorShape{2, ic, h, w},
  154. TensorShape{ic, 1, 1, kernel, kernel},
  155. TensorShape{1, ic, 1, 1});
  156. };
  157. for (size_t kernel : {2, 3, 5, 7})
  158. for (size_t pad : {0, 1})
  159. for (size_t ic : {1, 3, 5, 7, 17})
  160. for (size_t h : {10, 15, 17, 30})
  161. for (size_t w : {19, 28, 58, 168})
  162. for (NonlineMode nonline_mode :
  163. {NonlineMode::IDENTITY, NonlineMode::H_SWISH,
  164. NonlineMode::RELU})
  165. run(ic, w, h, kernel, pad, nonline_mode);
  166. Checker<ConvBias> checker(handle());
  167. UniformIntRNG rng{-50, 50};
  168. checker.set_dtype(0, dtype::QuantizedS8(2.5f))
  169. .set_dtype(1, dtype::QuantizedS8(2.5f))
  170. .set_dtype(2, dtype::QuantizedS32(6.25f))
  171. .set_dtype(4, dtype::QuantizedS8(60.25f))
  172. .set_rng(0, &rng)
  173. .set_rng(1, &rng)
  174. .set_rng(2, &rng)
  175. .set_epsilon(1e-3);
  176. checker.set_before_exec_callback(
  177. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  178. "X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1"));
  179. for (auto&& arg : args) {
  180. checker.set_param(arg.param).exec(
  181. {arg.src, arg.filter, arg.bias, {}, {}});
  182. }
  183. }
  184. TEST_F(X86_MULTI_THREADS, AVX2_CONV_BIAS_DIRECT_STRIDE1_INT8x8x32) {
  185. using namespace conv_bias;
  186. std::vector<TestArg> args;
  187. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  188. size_t p, NonlineMode nonline_mode) {
  189. if (w + 2 * p < kernel || h + 2 * p < kernel)
  190. return;
  191. param::ConvBias param;
  192. param.stride_h = 1;
  193. param.stride_w = 1;
  194. param.pad_h = p;
  195. param.pad_w = p;
  196. param.nonlineMode = nonline_mode;
  197. param.sparse = param::ConvBias::Sparse::DENSE;
  198. //! no bias
  199. args.emplace_back(param, TensorShape{2, ic, h, w},
  200. TensorShape{oc, ic, kernel, kernel}, TensorShape{});
  201. param.sparse = param::ConvBias::Sparse::GROUP;
  202. //! no bias
  203. args.emplace_back(param, TensorShape{2, 2 * ic, h, w},
  204. TensorShape{2, oc / 2, ic, kernel, kernel},
  205. TensorShape{});
  206. };
  207. for (size_t kernel : {2, 3, 5, 7})
  208. for (size_t pad : {0, 1})
  209. for (size_t oc : {4, 8, 13, 16, 24})
  210. for (size_t ic : {2, 3, 7, 10})
  211. for (size_t h : {10, 11})
  212. for (size_t w : {8, 10})
  213. for (NonlineMode nonline_mode :
  214. {NonlineMode::IDENTITY})
  215. run(oc, ic, w, h, kernel, pad, nonline_mode);
  216. Checker<ConvBias> checker(handle());
  217. UniformIntRNG rng{-50, 50};
  218. checker.set_dtype(0, dtype::Int8())
  219. .set_dtype(1, dtype::Int8())
  220. .set_dtype(2, dtype::Int32())
  221. .set_dtype(4, dtype::Int32())
  222. .set_rng(0, &rng)
  223. .set_rng(1, &rng)
  224. .set_rng(2, &rng)
  225. .set_epsilon(1e-3);
  226. checker.set_before_exec_callback(
  227. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  228. "X86_CONV_BIAS_DIRECT_AVX2_INT8_STRIDE1"));
  229. for (auto&& arg : args) {
  230. checker.set_param(arg.param).exec(
  231. {arg.src, arg.filter, arg.bias, {}, {}});
  232. }
  233. }
  234. TEST_F(X86_MULTI_THREADS, AVX2_CONV_BIAS_DIRECT_STRIDE1_QuantizedS32) {
  235. using namespace conv_bias;
  236. std::vector<TestArg> args;
  237. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  238. size_t p, NonlineMode nonline_mode) {
  239. if (w + 2 * p < kernel || h + 2 * p < kernel)
  240. return;
  241. param::ConvBias param;
  242. param.stride_h = 1;
  243. param.stride_w = 1;
  244. param.pad_h = p;
  245. param.pad_w = p;
  246. param.nonlineMode = nonline_mode;
  247. param.sparse = param::ConvBias::Sparse::DENSE;
  248. //! no bias
  249. args.emplace_back(param, TensorShape{2, ic, h, w},
  250. TensorShape{oc, ic, kernel, kernel}, TensorShape{});
  251. param.sparse = param::ConvBias::Sparse::GROUP;
  252. //! no bias
  253. args.emplace_back(param, TensorShape{2, 2 * ic, h, w},
  254. TensorShape{2, oc / 2, ic, kernel, kernel},
  255. TensorShape{});
  256. };
  257. for (size_t kernel : {2, 3, 5, 7})
  258. for (size_t pad : {0, 1})
  259. for (size_t oc : {4, 8, 13, 16, 24})
  260. for (size_t ic : {2, 3, 7, 10})
  261. for (size_t h : {10, 11})
  262. for (size_t w : {8, 10})
  263. for (NonlineMode nonline_mode :
  264. {NonlineMode::IDENTITY})
  265. run(oc, ic, w, h, kernel, pad, nonline_mode);
  266. Checker<ConvBias> checker(handle());
  267. UniformIntRNG rng{-50, 50};
  268. checker.set_dtype(0, dtype::QuantizedS8(2.5f))
  269. .set_dtype(1, dtype::QuantizedS8(2.5f))
  270. .set_dtype(2, dtype::QuantizedS32(6.25f))
  271. .set_dtype(4, {})
  272. .set_rng(0, &rng)
  273. .set_rng(1, &rng)
  274. .set_rng(2, &rng)
  275. .set_epsilon(1e-3);
  276. checker.set_before_exec_callback(
  277. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  278. "X86_CONV_BIAS_DIRECT_AVX2_INT8_STRIDE1"));
  279. for (auto&& arg : args) {
  280. checker.set_param(arg.param).exec(
  281. {arg.src, arg.filter, arg.bias, {}, {}});
  282. }
  283. }
  284. TEST_F(X86_MULTI_THREADS, AVX2_CONV_BIAS_DIRECT_STRIDE1_S8S8S8) {
  285. using namespace conv_bias;
  286. std::vector<TestArg> args;
  287. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  288. size_t p, NonlineMode nonline_mode) {
  289. if (w + 2 * p < kernel || h + 2 * p < kernel)
  290. return;
  291. param::ConvBias param;
  292. param.stride_h = 1;
  293. param.stride_w = 1;
  294. param.pad_h = p;
  295. param.pad_w = p;
  296. param.nonlineMode = nonline_mode;
  297. param.sparse = param::ConvBias::Sparse::DENSE;
  298. //! no bias
  299. args.emplace_back(param, TensorShape{1, ic, h, w},
  300. TensorShape{oc, ic, kernel, kernel}, TensorShape{});
  301. //! bias channel
  302. args.emplace_back(param, TensorShape{1, ic, h, w},
  303. TensorShape{oc, ic, kernel, kernel},
  304. TensorShape{1, oc, 1, 1});
  305. param.sparse = param::ConvBias::Sparse::GROUP;
  306. //! no bias
  307. args.emplace_back(param, TensorShape{2, 2 * ic, h, w},
  308. TensorShape{2, oc / 2, ic, kernel, kernel},
  309. TensorShape{});
  310. //! bias channel
  311. args.emplace_back(param, TensorShape{2, 2 * ic, h, w},
  312. TensorShape{2, oc / 2, ic, kernel, kernel},
  313. TensorShape{1, oc, 1, 1});
  314. };
  315. for (size_t kernel : {2, 3, 5, 7})
  316. for (size_t pad : {0, 1})
  317. for (size_t oc : {4, 8, 14, 16, 24})
  318. for (size_t ic : {2, 3, 7, 10})
  319. for (size_t h : {10, 11})
  320. for (size_t w : {8, 10})
  321. for (NonlineMode nonline_mode :
  322. {NonlineMode::IDENTITY, NonlineMode::RELU,
  323. NonlineMode::H_SWISH})
  324. run(oc, ic, w, h, kernel, pad, nonline_mode);
  325. Checker<ConvBias> checker(handle());
  326. UniformIntRNG rng{-50, 50};
  327. checker.set_dtype(0, dtype::QuantizedS8(2.5f))
  328. .set_dtype(1, dtype::QuantizedS8(2.5f))
  329. .set_dtype(2, dtype::QuantizedS32(6.25f))
  330. .set_dtype(4, dtype::QuantizedS8(60.25f))
  331. .set_rng(0, &rng)
  332. .set_rng(1, &rng)
  333. .set_rng(2, &rng)
  334. .set_epsilon(1e-3);
  335. checker.set_before_exec_callback(
  336. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  337. "X86_CONV_BIAS_DIRECT_AVX2_INT8_STRIDE1"));
  338. for (auto&& arg : args) {
  339. checker.set_param(arg.param).exec(
  340. {arg.src, arg.filter, arg.bias, {}, {}});
  341. }
  342. }
  343. TEST_F(X86_MULTI_THREADS, AVX2_CONV_BIAS_DIRECT_STRIDE2_INT8x8x32) {
  344. using namespace conv_bias;
  345. std::vector<TestArg> args;
  346. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  347. size_t p, NonlineMode nonline_mode) {
  348. if (w + 2 * p < kernel || h + 2 * p < kernel)
  349. return;
  350. param::ConvBias param;
  351. param.stride_h = 2;
  352. param.stride_w = 2;
  353. param.pad_h = p;
  354. param.pad_w = p;
  355. param.nonlineMode = nonline_mode;
  356. param.sparse = param::ConvBias::Sparse::DENSE;
  357. //! no bias
  358. args.emplace_back(param, TensorShape{2, ic, h, w},
  359. TensorShape{oc, ic, kernel, kernel}, TensorShape{});
  360. param.sparse = param::ConvBias::Sparse::GROUP;
  361. //! no bias
  362. args.emplace_back(param, TensorShape{2, 2 * ic, h, w},
  363. TensorShape{2, oc / 2, ic, kernel, kernel},
  364. TensorShape{});
  365. };
  366. for (size_t kernel : {2, 3, 5, 7})
  367. for (size_t pad : {0, 1, 2, 5})
  368. for (size_t oc : {4, 8, 13, 16, 24})
  369. for (size_t ic : {2, 3, 7, 10})
  370. for (size_t h : {10, 11})
  371. for (size_t w : {8, 10, 20})
  372. for (NonlineMode nonline_mode :
  373. {NonlineMode::IDENTITY})
  374. run(oc, ic, w, h, kernel, pad, nonline_mode);
  375. Checker<ConvBias> checker(handle());
  376. UniformIntRNG rng{-50, 50};
  377. checker.set_dtype(0, dtype::Int8())
  378. .set_dtype(1, dtype::Int8())
  379. .set_dtype(2, dtype::Int32())
  380. .set_dtype(4, dtype::Int32())
  381. .set_rng(0, &rng)
  382. .set_rng(1, &rng)
  383. .set_rng(2, &rng)
  384. .set_epsilon(1e-3);
  385. checker.set_before_exec_callback(
  386. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  387. "X86_CONV_BIAS_DIRECT_AVX2_INT8_STRIDE2"));
  388. for (auto&& arg : args) {
  389. checker.set_param(arg.param).exec(
  390. {arg.src, arg.filter, arg.bias, {}, {}});
  391. }
  392. }
  393. TEST_F(X86_MULTI_THREADS, AVX2_CONV_BIAS_DIRECT_STRIDE2_QuantizedS32) {
  394. using namespace conv_bias;
  395. std::vector<TestArg> args;
  396. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  397. size_t p, NonlineMode nonline_mode) {
  398. if (w + 2 * p < kernel || h + 2 * p < kernel)
  399. return;
  400. param::ConvBias param;
  401. param.stride_h = 2;
  402. param.stride_w = 2;
  403. param.pad_h = p;
  404. param.pad_w = p;
  405. param.nonlineMode = nonline_mode;
  406. param.sparse = param::ConvBias::Sparse::DENSE;
  407. //! no bias
  408. args.emplace_back(param, TensorShape{2, ic, h, w},
  409. TensorShape{oc, ic, kernel, kernel}, TensorShape{});
  410. param.sparse = param::ConvBias::Sparse::GROUP;
  411. //! no bias
  412. args.emplace_back(param, TensorShape{2, 2 * ic, h, w},
  413. TensorShape{2, oc / 2, ic, kernel, kernel},
  414. TensorShape{});
  415. };
  416. for (size_t kernel : {2, 3, 5, 7})
  417. for (size_t pad : {0, 1, 3, 5})
  418. for (size_t oc : {4, 8, 13, 16, 24})
  419. for (size_t ic : {2, 3, 7, 10})
  420. for (size_t h : {10, 11})
  421. for (size_t w : {8, 10, 19})
  422. for (NonlineMode nonline_mode :
  423. {NonlineMode::IDENTITY})
  424. run(oc, ic, w, h, kernel, pad, nonline_mode);
  425. Checker<ConvBias> checker(handle());
  426. UniformIntRNG rng{-50, 50};
  427. checker.set_dtype(0, dtype::QuantizedS8(2.5f))
  428. .set_dtype(1, dtype::QuantizedS8(2.5f))
  429. .set_dtype(2, dtype::QuantizedS32(6.25f))
  430. .set_dtype(4, {})
  431. .set_rng(0, &rng)
  432. .set_rng(1, &rng)
  433. .set_rng(2, &rng)
  434. .set_epsilon(1e-3);
  435. checker.set_before_exec_callback(
  436. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  437. "X86_CONV_BIAS_DIRECT_AVX2_INT8_STRIDE2"));
  438. for (auto&& arg : args) {
  439. checker.set_param(arg.param).exec(
  440. {arg.src, arg.filter, arg.bias, {}, {}});
  441. }
  442. }
  443. TEST_F(X86_MULTI_THREADS, AVX2_CONV_BIAS_DIRECT_STRIDE2_S8S8S8) {
  444. using namespace conv_bias;
  445. std::vector<TestArg> args;
  446. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  447. size_t p, NonlineMode nonline_mode) {
  448. if (w + 2 * p < kernel || h + 2 * p < kernel)
  449. return;
  450. param::ConvBias param;
  451. param.stride_h = 2;
  452. param.stride_w = 2;
  453. param.pad_h = p;
  454. param.pad_w = p;
  455. param.nonlineMode = nonline_mode;
  456. param.sparse = param::ConvBias::Sparse::DENSE;
  457. //! no bias
  458. args.emplace_back(param, TensorShape{1, ic, h, w},
  459. TensorShape{oc, ic, kernel, kernel}, TensorShape{});
  460. //! bias channel
  461. args.emplace_back(param, TensorShape{1, ic, h, w},
  462. TensorShape{oc, ic, kernel, kernel},
  463. TensorShape{1, oc, 1, 1});
  464. param.sparse = param::ConvBias::Sparse::GROUP;
  465. //! no bias
  466. args.emplace_back(param, TensorShape{2, 2 * ic, h, w},
  467. TensorShape{2, oc / 2, ic, kernel, kernel},
  468. TensorShape{});
  469. //! bias channel
  470. args.emplace_back(param, TensorShape{2, 2 * ic, h, w},
  471. TensorShape{2, oc / 2, ic, kernel, kernel},
  472. TensorShape{1, oc, 1, 1});
  473. };
  474. for (size_t kernel : {2, 3, 5, 7})
  475. for (size_t pad : {0, 1, 3, 5})
  476. for (size_t oc : {4, 8, 14, 16, 24})
  477. for (size_t ic : {2, 3, 7, 10})
  478. for (size_t h : {10, 11})
  479. for (size_t w : {8, 10, 18})
  480. for (NonlineMode nonline_mode :
  481. {NonlineMode::IDENTITY, NonlineMode::RELU,
  482. NonlineMode::H_SWISH})
  483. run(oc, ic, w, h, kernel, pad, nonline_mode);
  484. Checker<ConvBias> checker(handle());
  485. UniformIntRNG rng{-50, 50};
  486. checker.set_dtype(0, dtype::QuantizedS8(2.5f))
  487. .set_dtype(1, dtype::QuantizedS8(2.5f))
  488. .set_dtype(2, dtype::QuantizedS32(6.25f))
  489. .set_dtype(4, dtype::QuantizedS8(60.25f))
  490. .set_rng(0, &rng)
  491. .set_rng(1, &rng)
  492. .set_rng(2, &rng)
  493. .set_epsilon(1e-3);
  494. checker.set_before_exec_callback(
  495. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  496. "X86_CONV_BIAS_DIRECT_AVX2_INT8_STRIDE2"));
  497. for (auto&& arg : args) {
  498. checker.set_param(arg.param).exec(
  499. {arg.src, arg.filter, arg.bias, {}, {}});
  500. }
  501. }
  502. TEST_F(X86_MULTI_THREADS, CONV_BIAS_DIRECT_STRIDE1_SMALL_GROUP) {
  503. using namespace conv_bias;
  504. std::vector<TestArg> args;
  505. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  506. size_t p, NonlineMode nonline_mode) {
  507. if (w + 2 * p < kernel || h + 2 * p < kernel)
  508. return;
  509. param::ConvBias param;
  510. param.stride_h = 1;
  511. param.stride_w = 1;
  512. param.pad_h = p;
  513. param.pad_w = p;
  514. param.nonlineMode = nonline_mode;
  515. //! no bias
  516. args.emplace_back(param, TensorShape{1, ic, h, w},
  517. TensorShape{oc, ic, kernel, kernel}, TensorShape{});
  518. //! bias channel
  519. args.emplace_back(param, TensorShape{2, ic, h, w},
  520. TensorShape{oc, ic, kernel, kernel},
  521. TensorShape{1, oc, 1, 1});
  522. //! bias
  523. args.emplace_back(param, TensorShape{2, ic, h, w},
  524. TensorShape{oc, ic, kernel, kernel},
  525. TensorShape{2, oc, (h + param.pad_h * 2 - kernel) + 1,
  526. (w + param.pad_w * 2 - kernel) + 1});
  527. };
  528. for (size_t kernel : {1, 2, 3, 4, 5, 6, 7})
  529. for (size_t ic : {1, 4, 8, 16})
  530. for (size_t oc : {1, 4, 8})
  531. for (size_t p : {0, 2})
  532. for (size_t size : {20, 21, 24})
  533. for (NonlineMode nonline_mode :
  534. {NonlineMode::RELU, NonlineMode::SIGMOID,
  535. NonlineMode::H_SWISH, NonlineMode::IDENTITY}) {
  536. run(oc, ic, size, size, kernel, p, nonline_mode);
  537. }
  538. Checker<ConvBias> checker(handle());
  539. UniformIntRNG rng{-50, 50};
  540. checker.set_dtype(0, dtype::Float32())
  541. .set_dtype(1, dtype::Float32())
  542. .set_dtype(2, dtype::Float32())
  543. .set_rng(0, &rng)
  544. .set_rng(1, &rng)
  545. .set_rng(2, &rng);
  546. checker.set_before_exec_callback(
  547. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  548. "X86_CONV_BIAS_DIRECT_STRIDE1_SMALL_GROUP"));
  549. for (auto&& arg : args) {
  550. checker.set_param(arg.param).exec(
  551. {arg.src, arg.filter, arg.bias, {}, {}});
  552. }
  553. }
  554. TEST_F(X86_MULTI_THREADS, CONV_BIAS_DIRECT_STRIDE1_LARGE_GROUP) {
  555. using namespace conv_bias;
  556. std::vector<TestArg> args;
  557. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  558. size_t p, NonlineMode nonline_mode) {
  559. if (w + 2 * p < kernel || h + 2 * p < kernel)
  560. return;
  561. param::ConvBias param;
  562. param.stride_h = 1;
  563. param.stride_w = 1;
  564. param.pad_h = p;
  565. param.pad_w = p;
  566. param.nonlineMode = nonline_mode;
  567. //! no bias
  568. args.emplace_back(param, TensorShape{1, ic, h, w},
  569. TensorShape{oc, ic, kernel, kernel}, TensorShape{});
  570. //! bias channel
  571. args.emplace_back(param, TensorShape{2, ic, h, w},
  572. TensorShape{oc, ic, kernel, kernel},
  573. TensorShape{1, oc, 1, 1});
  574. //! bias
  575. args.emplace_back(param, TensorShape{2, ic, h, w},
  576. TensorShape{oc, ic, kernel, kernel},
  577. TensorShape{2, oc, (h + param.pad_h * 2 - kernel) + 1,
  578. (w + param.pad_w * 2 - kernel) + 1});
  579. };
  580. for (size_t kernel : {1, 2, 3, 4, 5, 6, 7})
  581. for (size_t ic : {1, 4, 8, 16})
  582. for (size_t oc : {1, 4, 8})
  583. for (size_t p : {0, 2})
  584. for (size_t size : {20, 21, 24})
  585. for (NonlineMode nonline_mode :
  586. {NonlineMode::RELU, NonlineMode::SIGMOID,
  587. NonlineMode::H_SWISH, NonlineMode::IDENTITY}) {
  588. run(oc, ic, size, size, kernel, p, nonline_mode);
  589. }
  590. Checker<ConvBias> checker(handle());
  591. UniformIntRNG rng{-50, 50};
  592. checker.set_dtype(0, dtype::Float32())
  593. .set_dtype(1, dtype::Float32())
  594. .set_dtype(2, dtype::Float32())
  595. .set_rng(0, &rng)
  596. .set_rng(1, &rng)
  597. .set_rng(2, &rng);
  598. checker.set_before_exec_callback(
  599. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  600. "X86_CONV_BIAS_DIRECT_STRIDE1_LARGE_GROUP"));
  601. for (auto&& arg : args) {
  602. checker.set_param(arg.param).exec(
  603. {arg.src, arg.filter, arg.bias, {}, {}});
  604. }
  605. }
  606. TEST_F(X86_MULTI_THREADS, CONV_BIAS_DIRECT_STRIDE2) {
  607. using namespace conv_bias;
  608. std::vector<TestArg> args;
  609. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  610. size_t p, NonlineMode nonline_mode) {
  611. if (w + 2 * p < kernel || h + 2 * p < kernel)
  612. return;
  613. param::ConvBias param;
  614. param.stride_h = 2;
  615. param.stride_w = 2;
  616. param.pad_h = p;
  617. param.pad_w = p;
  618. param.nonlineMode = nonline_mode;
  619. //! no bias
  620. args.emplace_back(param, TensorShape{1, ic, h, w},
  621. TensorShape{oc, ic, kernel, kernel}, TensorShape{});
  622. };
  623. for (size_t kernel : {2, 3, 5, 7})
  624. for (size_t ic : {1, 4, 8, 16})
  625. for (size_t oc : {1, 4, 8})
  626. for (size_t p : {0, 2})
  627. for (size_t size : {20, 21, 24})
  628. for (NonlineMode nonline_mode :
  629. {NonlineMode::RELU, NonlineMode::SIGMOID,
  630. NonlineMode::H_SWISH, NonlineMode::IDENTITY}) {
  631. run(oc, ic, size, size, kernel, p, nonline_mode);
  632. }
  633. Checker<ConvBias> checker(handle());
  634. UniformIntRNG rng{-50, 50};
  635. checker.set_dtype(0, dtype::Float32())
  636. .set_dtype(1, dtype::Float32())
  637. .set_dtype(2, dtype::Float32())
  638. .set_rng(0, &rng)
  639. .set_rng(1, &rng)
  640. .set_rng(2, &rng);
  641. checker.set_before_exec_callback(
  642. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  643. "X86_CONV_BIAS_DIRECT_STRIDE2_SMALL_GROUP"));
  644. for (auto&& arg : args) {
  645. checker.set_param(arg.param).exec(
  646. {arg.src, arg.filter, arg.bias, {}, {}});
  647. }
  648. checker.set_before_exec_callback(
  649. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  650. "X86_CONV_BIAS_DIRECT_STRIDE2_LARGE_GROUP"));
  651. for (auto&& arg : args) {
  652. checker.set_param(arg.param).exec(
  653. {arg.src, arg.filter, arg.bias, {}, {}});
  654. }
  655. }
TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32) {
    using namespace conv_bias;
    std::vector<TestArg> args;
    //! collect stride-1 cases without bias; shapes where the padded input is
    //! smaller than the kernel are skipped
    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
                   size_t p, NonlineMode nonline_mode) {
        if (w + 2 * p < kernel || h + 2 * p < kernel)
            return;
        param::ConvBias param;
        param.stride_h = 1;
        param.stride_w = 1;
        param.pad_h = p;
        param.pad_w = p;
        param.nonlineMode = nonline_mode;
        //! no bias
        args.emplace_back(param, TensorShape{1, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel}, TensorShape{});
    };
    for (size_t kernel : {1, 2, 3, 4, 5, 6, 7})
        for (size_t ic : {1, 4, 8, 16})
            for (size_t oc : {1, 4, 8})
                for (size_t p : {0, 2})
                    for (size_t size : {20, 21, 24})
                        for (NonlineMode nonline_mode :
                             {NonlineMode::IDENTITY}) {
                            run(oc, ic, size, size, kernel, p, nonline_mode);
                        }
    //! test OC block
    run(2046, 1, 8, 8, 1, 0, NonlineMode::IDENTITY);
    Checker<ConvBias> checker(handle());
    UniformIntRNG rng{-50, 50};
//! run every case under the algo named by \p algo_name, first with plain
//! Int8 x Int8 -> Int32 dtypes and then with quantized s8/s32 dtypes
#define cb(algo_name)                                                  \
    checker.set_before_exec_callback(                                  \
            conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name));      \
    checker.set_dtype(0, dtype::Int8());                               \
    checker.set_dtype(1, dtype::Int8());                               \
    checker.set_dtype(2, dtype::Int32());                              \
    checker.set_dtype(4, dtype::Int32());                              \
    for (auto&& arg : args) {                                          \
        checker.set_param(arg.param).execs({arg.src, arg.filter, {}, {}, {}}); \
    }                                                                  \
    for (auto&& arg : args) {                                          \
        checker.set_dtype(0, dtype::QuantizedS8(2.5f))                 \
                .set_dtype(1, dtype::QuantizedS8(2.5f))                \
                .set_dtype(2, dtype::QuantizedS32(6.25f))              \
                .set_dtype(4, {})                                      \
                .set_rng(0, &rng)                                      \
                .set_rng(1, &rng)                                      \
                .set_rng(2, &rng)                                      \
                .set_param(arg.param)                                  \
                .execs({arg.src, arg.filter, {}, {}, {}});             \
    }
    //! each candidate algo runs only when its SIMD requirement is satisfied
    //! by the current CPU (and, for some, by the build configuration)
#if defined(MEGDNN_X86_WITH_MKL_DNN)
    if (megdnn::x86::is_supported(x86::SIMDType::VNNI)) {
        cb("IM2COLMATMUL:X86_INT8X8X32_MKLDNN");
    }
#endif
#if MEGDNN_X86_WITH_VNNI
    if (megdnn::x86::is_supported(x86::SIMDType::VNNI)) {
        cb("IM2COLMATMUL:X86_INT8X8X32_VNNI");
    }
#endif
    if (megdnn::x86::is_supported(x86::SIMDType::AVX2)) {
        cb("IM2COLMATMUL:X86_INT8X8X32_AVX2_2X4X16");
        cb("IM2COLMATMUL:X86_INT8X8X32_AVX2_4X16X2");
    }
    if (::megdnn::x86::is_supported(::megdnn::x86::SIMDType::SSE4_2)) {
        cb("IM2COLMATMUL:X86_INT8X8X32_SSE_4X8X2");
    }
#undef cb
}
TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP32) {
    using namespace conv_bias;
    std::vector<TestArg> args;
    //! collect stride-1 cases covering no-bias, per-channel bias and full
    //! (output-shaped) bias
    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
                   size_t p, NonlineMode nonline_mode) {
        //! skip shapes where the padded input is smaller than the kernel
        if (w + 2 * p < kernel || h + 2 * p < kernel)
            return;
        param::ConvBias param;
        param.stride_h = 1;
        param.stride_w = 1;
        param.pad_h = p;
        param.pad_w = p;
        param.nonlineMode = nonline_mode;
        //! no bias
        args.emplace_back(param, TensorShape{1, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel}, TensorShape{});
        //! per-channel (broadcast) bias
        args.emplace_back(param, TensorShape{1, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel},
                          TensorShape{1, oc, 1, 1});
        //! full bias with the same spatial size as the conv output
        args.emplace_back(
                param, TensorShape{1, ic, h, w},
                TensorShape{oc, ic, kernel, kernel},
                TensorShape{1, oc, (h + 2 * p - kernel) / param.stride_h + 1,
                            (w + 2 * p - kernel) / param.stride_w + 1});
    };
    for (size_t kernel : {1, 2, 3, 4, 5, 6, 7})
        for (size_t ic : {1, 4, 8, 16})
            for (size_t oc : {1, 4, 8, 16, 300})
                for (size_t p : {0, 2})
                    for (size_t size : {8, 24})
                        for (NonlineMode nonline_mode :
                             {NonlineMode::IDENTITY, NonlineMode::RELU}) {
                            run(oc, ic, size, size, kernel, p, nonline_mode);
                        }
    //! very large OC to exercise the OC-blocking path
    run(2046, 8, 20, 20, 3, 1, NonlineMode::IDENTITY);
    Checker<ConvBias> checker(handle());
//! run all collected cases under the algo named by \p algo_name
#define cb(algo_name)                                                  \
    checker.set_before_exec_callback(                                  \
            conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name));      \
    for (auto&& arg : args) {                                          \
        checker.set_param(arg.param).execs(                            \
                {arg.src, arg.filter, arg.bias, {}, {}});              \
    }
#if defined(MEGDNN_X86_WITH_MKL) || defined(MEGDNN_X86_WITH_OPENBLAS)
    cb("IM2COLMATMUL:X86_F32_BLAS");
#endif
#undef cb
}
#if defined(MEGDNN_X86_WITH_MKL)
TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP32_PACKA) {
    using namespace conv_bias;
    std::vector<TestArg> args;
    //! dense cases (three bias variants) plus 2-group cases with the same
    //! three bias variants
    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
                   size_t p, NonlineMode nonline_mode) {
        //! skip shapes where the padded input is smaller than the kernel
        if (w + 2 * p < kernel || h + 2 * p < kernel)
            return;
        param::ConvBias param;
        param.stride_h = 1;
        param.stride_w = 1;
        param.pad_h = p;
        param.pad_w = p;
        param.nonlineMode = nonline_mode;
        //! no bias
        args.emplace_back(param, TensorShape{1, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel}, TensorShape{});
        //! per-channel bias
        args.emplace_back(param, TensorShape{1, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel},
                          TensorShape{1, oc, 1, 1});
        //! full bias, shaped like the conv output
        args.emplace_back(
                param, TensorShape{1, ic, h, w},
                TensorShape{oc, ic, kernel, kernel},
                TensorShape{1, oc, (h + 2 * p - kernel) / param.stride_h + 1,
                            (w + 2 * p - kernel) / param.stride_w + 1});
        //! grouped (2 groups): filter gains a leading group dimension
        param.sparse = param::ConvBias::Sparse::GROUP;
        args.emplace_back(param, TensorShape{1, 2 * ic, h, w},
                          TensorShape{2, oc, ic, kernel, kernel},
                          TensorShape{});
        args.emplace_back(param, TensorShape{1, 2 * ic, h, w},
                          TensorShape{2, oc, ic, kernel, kernel},
                          TensorShape{1, oc * 2, 1, 1});
        args.emplace_back(
                param, TensorShape{1, 2 * ic, h, w},
                TensorShape{2, oc, ic, kernel, kernel},
                TensorShape{1, 2 * oc, (h + 2 * param.pad_h - kernel) / 1 + 1,
                            (w + 2 * param.pad_w - kernel) / 1 + 1});
    };
    for (size_t kernel : {1, 2, 3, 4, 5, 6, 7})
        for (size_t ic : {1, 4, 8, 16})
            for (size_t oc : {1, 4, 8, 16})
                for (size_t p : {0, 1})
                    for (size_t size : {8, 24})
                        for (NonlineMode nonline_mode :
                             {NonlineMode::IDENTITY, NonlineMode::RELU}) {
                            run(oc, ic, size, size, kernel, p, nonline_mode);
                        }
    //! very large OC to exercise the OC-blocking path
    run(2046, 8, 20, 20, 3, 1, NonlineMode::IDENTITY);
    Checker<ConvBias> checker(handle());
//! run all collected cases under the algo named by \p algo_name
#define cb(algo_name)                                                  \
    checker.set_before_exec_callback(                                  \
            conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name));      \
    for (auto&& arg : args) {                                          \
        checker.set_param(arg.param).execs(                            \
                {arg.src, arg.filter, arg.bias, {}, {}});              \
    }
    //! trailing ":192" is an algo parameter — presumably the pack-A block
    //! size; confirm against the algo's name-parsing code
    cb("IM2COLMATMUL:X86_F32_MKL_PACKA:192");
#undef cb
}
#endif
TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QINT8) {
    using namespace conv_bias;
    std::vector<TestArg> args;
    //! stride-1 cases: no bias and per-channel bias
    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
                   size_t p, NonlineMode nonline_mode) {
        //! skip shapes where the padded input is smaller than the kernel
        if (w + 2 * p < kernel || h + 2 * p < kernel)
            return;
        param::ConvBias param;
        param.stride_h = 1;
        param.stride_w = 1;
        param.pad_h = p;
        param.pad_w = p;
        param.nonlineMode = nonline_mode;
        //! no bias
        args.emplace_back(param, TensorShape{1, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel}, TensorShape{});
        //! bias channel
        args.emplace_back(param, TensorShape{2, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel},
                          TensorShape{1, oc, 1, 1});
    };
    for (size_t kernel : {1, 2, 3, 4, 5, 6, 7})
        for (size_t ic : {1, 4, 8, 16})
            for (size_t oc : {1, 4, 8})
                for (size_t p : {0, 2})
                    for (size_t size : {20, 21, 24})
                        for (NonlineMode nonline_mode :
                             {NonlineMode::IDENTITY, NonlineMode::RELU,
                              NonlineMode::H_SWISH}) {
                            run(oc, ic, size, size, kernel, p, nonline_mode);
                        }
    //! large OC to exercise the OC-blocking path
    run(2046, 8, 20, 20, 3, 1, NonlineMode::IDENTITY);
    Checker<ConvBias> checker(handle());
//! run all cases under \p algo_name with quantized s8 in / s32 accumulate /
//! s8 out dtypes
#define cb(algo_name)                                                  \
    checker.set_before_exec_callback(                                  \
            conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name));      \
    UniformIntRNG rng{-50, 50};                                        \
    for (auto&& arg : args) {                                          \
        checker.set_dtype(0, dtype::QuantizedS8(2.5f))                 \
                .set_dtype(1, dtype::QuantizedS8(2.5f))                \
                .set_dtype(2, dtype::QuantizedS32(6.25f))              \
                .set_dtype(4, dtype::QuantizedS8(60.25))               \
                .set_rng(0, &rng)                                      \
                .set_rng(1, &rng)                                      \
                .set_rng(2, &rng)                                      \
                .set_param(arg.param)                                  \
                .execs({arg.src, arg.filter, {}, {}, {}});             \
    }
    //! each candidate algo runs only when the CPU supports its SIMD level
#if defined(MEGDNN_X86_WITH_MKL_DNN)
    if (x86::is_supported(x86::SIMDType::VNNI)) {
        cb("IM2COLMATMUL:X86_INT8X8X32_MKLDNN");
    }
#endif
#if MEGDNN_X86_WITH_VNNI
    if (x86::is_supported(x86::SIMDType::VNNI)) {
        cb("IM2COLMATMUL:X86_INT8X8X32_VNNI");
    }
#endif
    if (x86::is_supported(x86::SIMDType::AVX2)) {
        cb("IM2COLMATMUL:X86_INT8X8X32_AVX2_2X4X16");
    }
#undef cb
}
TEST_F(X86, CONV_BIAS_MATMUL) {
    using namespace conv_bias;
    std::vector<TestArg> args;
    //! dense cases (no bias / channel bias / full bias) plus a grouped case
    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
                   size_t p, NonlineMode nonline_mode) {
        //! skip shapes where the padded input is smaller than the kernel
        if (w + 2 * p < kernel || h + 2 * p < kernel)
            return;
        param::ConvBias param;
        param.stride_h = 1;
        param.stride_w = 1;
        param.pad_h = p;
        param.pad_w = p;
        param.nonlineMode = nonline_mode;
        //! no bias
        param.sparse = param::ConvBias::Sparse::DENSE;
        args.emplace_back(param, TensorShape{1, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel}, TensorShape{});
        //! bias channel
        args.emplace_back(param, TensorShape{2, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel},
                          TensorShape{1, oc, 1, 1});
        //! bias
        args.emplace_back(param, TensorShape{2, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel},
                          TensorShape{2, oc, (h + param.pad_h * 2 - kernel) + 1,
                                      (w + param.pad_w * 2 - kernel) + 1});
        //! group
        param.sparse = param::ConvBias::Sparse::GROUP;
        args.emplace_back(
                param, TensorShape{2, 2 * ic, h, w},
                TensorShape{2, oc, ic, kernel, kernel},
                TensorShape{2, 2 * oc, (h + param.pad_h * 2 - kernel) + 1,
                            (w + param.pad_w * 2 - kernel) + 1});
    };
    for (size_t kernel : {2, 3, 5, 7})
        for (size_t ic : {1, 2, 3, 4})
            for (size_t oc : {1, 2, 3, 4})
                for (size_t p : {0, 2})
                    for (size_t size : {20, 21, 22, 23, 24})
                        for (NonlineMode nonline_mode :
                             {NonlineMode::RELU, NonlineMode::SIGMOID,
                              NonlineMode::H_SWISH, NonlineMode::IDENTITY}) {
                            run(oc, ic, size, size, kernel, p, nonline_mode);
                        }
    Checker<ConvBias> checker(handle());
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "X86_CONV_BIAS_MATMUL"));
    //! inputs are small integers, so a tolerance of 1 is enough
    checker.set_epsilon(1);
    UniformIntRNG rng{-50, 50};
    checker.set_dtype(0, dtype::Float32())
            .set_dtype(1, dtype::Float32())
            .set_dtype(2, dtype::Float32())
            .set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &rng);
    for (auto&& arg : args) {
        checker.set_param(arg.param).exec(
                {arg.src, arg.filter, arg.bias, {}, {}});
    }
}
  958. #if MEGDNN_WITH_BENCHMARK
  959. #if defined(MEGDNN_X86_WITH_MKL_DNN)
  960. static void x86_benchmark_fp32_mkldnn(Handle* handle) {
  961. constexpr size_t RUNS = 30;
  962. param::ConvBias param;
  963. Benchmarker<ConvBias> benchmarker_mkldnn(handle);
  964. benchmarker_mkldnn.set_display(false).set_times(RUNS);
  965. benchmarker_mkldnn.set_before_exec_callback(
  966. AlgoChecker<ConvBias>("MKLDNN_CONV_FP32"));
  967. Benchmarker<ConvBias> benchmarker_im2col(handle);
  968. benchmarker_im2col.set_display(false).set_times(RUNS);
  969. benchmarker_im2col.set_before_exec_callback(
  970. AlgoChecker<ConvBias>("IM2COLMATMUL.+"));
  971. auto run = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W,
  972. size_t FS, size_t SZ, size_t GROUP = 1) {
  973. TensorShape src({N, IC, H, W}), filter({OC, IC, FS, FS}),
  974. bias({1, OC, 1, 1}), z({}), dst({N, OC, H / SZ, W / SZ});
  975. param.pad_h = FS / 2;
  976. param.pad_w = FS / 2;
  977. param.stride_h = SZ;
  978. param.stride_w = SZ;
  979. param.format = param::ConvBias::Format::NCHW;
  980. param.sparse = param::ConvBias::Sparse::DENSE;
  981. if (GROUP > 1) {
  982. param.sparse = param::ConvBias::Sparse::GROUP;
  983. filter = {GROUP, OC / GROUP, IC / GROUP, FS, FS};
  984. }
  985. auto im2col_used = benchmarker_im2col.set_param(param).exec(
  986. {src, filter, bias, z, dst}) /
  987. RUNS;
  988. src = IC < 8 ? TensorShape{N, IC, H, W}
  989. : TensorShape{N, IC / 8, H, W, 8};
  990. filter = IC < 8 ? TensorShape{OC / 8, FS, FS, IC, 8}
  991. : TensorShape{OC / 8, IC / 8, FS, FS, 8, 8};
  992. if (GROUP > 1 && OC == GROUP && IC == GROUP) {
  993. filter = {GROUP / 8, 1, 1, FS, FS, 8};
  994. } else if (GROUP > 1 && OC / GROUP % 8 == 0 && IC / GROUP % 8 == 0) {
  995. filter = {GROUP, OC / GROUP / 8, IC / GROUP / 8, FS, FS, 8, 8};
  996. }
  997. bias = {1, OC / 8, 1, 1, 8};
  998. z = {};
  999. dst = {N, OC / 8, H / SZ, W / SZ, 8};
  1000. param.format = param::ConvBias::Format::NCHW88;
  1001. auto mkldnn_used = benchmarker_mkldnn.set_param(param).exec(
  1002. {src, filter, bias, z, dst}) /
  1003. RUNS;
  1004. float computations =
  1005. (IC / GROUP * FS * FS + 1) * dst.total_nr_elems() * 2 * 1e-6;
  1006. std::cout << "run " << src.to_string() << " " << filter.to_string()
  1007. << " " << bias.to_string() << " " << dst.to_string()
  1008. << std::endl;
  1009. std::cout << "im2col: " << im2col_used << " ms, "
  1010. << (computations / im2col_used) << " Gops, ";
  1011. std::cout << "mkldnn: " << mkldnn_used << " ms, "
  1012. << (computations / mkldnn_used) << " Gops, "
  1013. << "spped up: " << (im2col_used / mkldnn_used) << ", ";
  1014. std::cout << std::endl;
  1015. };
  1016. run(1, 64, 64, 56, 56, 3, 1);
  1017. run(1, 3, 64, 224, 224, 3, 1);
  1018. run(1, 3, 64, 224, 224, 7, 2);
  1019. run(1, 64, 64, 56, 56, 3, 1);
  1020. run(1, 128, 128, 28, 28, 3, 1);
  1021. run(1, 256, 256, 14, 14, 3, 1);
  1022. run(1, 512, 512, 7, 7, 3, 1);
  1023. run(1, 256, 64, 56, 56, 1, 1);
  1024. run(1, 512, 128, 28, 28, 1, 1);
  1025. run(1, 1024, 256, 14, 14, 1, 1);
  1026. run(1, 2048, 512, 7, 7, 1, 1);
  1027. run(1, 32, 32, 112, 112, 3, 1, 32);
  1028. run(1, 144, 144, 56, 56, 3, 1, 144);
  1029. run(1, 192, 192, 28, 28, 3, 1, 192);
  1030. run(1, 384, 384, 28, 28, 3, 1, 384);
  1031. run(1, 576, 576, 14, 14, 3, 1, 576);
  1032. run(1, 960, 960, 7, 7, 3, 1, 960);
  1033. run(1, 256, 128, 56, 56, 1, 2, 1);
  1034. run(1, 512, 256, 28, 28, 1, 2, 1);
  1035. run(1, 1024, 512, 14, 14, 1, 2, 1);
  1036. run(1, 96, 96, 112, 112, 3, 2, 96);
  1037. run(1, 144, 144, 56, 56, 3, 2, 144);
  1038. run(1, 384, 384, 28, 28, 3, 2, 384);
  1039. run(1, 576, 576, 14, 14, 3, 2, 576);
  1040. }
//! single-thread variant of the fp32 MKL-DNN benchmark
TEST_F(X86, BENCHMARK_CONVBIAS_FP32_MKLDNN) {
    x86_benchmark_fp32_mkldnn(handle());
}
//! multi-thread variant of the fp32 MKL-DNN benchmark
TEST_F(X86_MULTI_THREADS, BENCHMARK_CONVBIAS_FP32_MKLDNN) {
    x86_benchmark_fp32_mkldnn(handle());
}
  1047. #endif
  1048. #endif
  1049. /************************* Winograd ****************************/
namespace{
//! Build NCHW88 3x3 stride-1 conv_bias cases for the winograd tests: dense
//! only (group variants are kept commented out), with channel bias, no bias
//! and full bias, over all four nonlinearities.
std::vector<conv_bias::TestArg> get_winograd_mk_nchw88_args() {
    std::vector<conv_bias::TestArg> args;
    param::ConvBias cur_param;
    cur_param.format = param::ConvBias::Format::NCHW88;
    using NLMode = param::ConvBias::NonlineMode;
    // clang-format off
    for (auto nlmode :
        {NLMode::IDENTITY, NLMode::RELU, NLMode::SIGMOID, NLMode::H_SWISH}) {
    for (size_t ic : {1, 2}) {
    for (size_t oc : {1, 2}) {
    for (size_t i : {9, 63}) {
        cur_param.mode = param::ConvBias::Mode::CROSS_CORRELATION;
        cur_param.nonlineMode = nlmode;
        cur_param.sparse = param::ConvBias::Sparse::DENSE;
        cur_param.pad_h = cur_param.pad_w = 1;
        //! channel-broadcast bias
        args.emplace_back(cur_param, TensorShape{1, ic, i, i, 8},
                          TensorShape{oc, ic, 3, 3, 8, 8},
                          TensorShape{1, oc, 1, 1, 8});
        //! no bias
        args.emplace_back(cur_param, TensorShape{1, ic, i, i, 8},
                          TensorShape{oc, ic, 3, 3, 8, 8},TensorShape{});
        //! bias
        args.emplace_back(cur_param, TensorShape{2, ic, i, i, 8},
                          TensorShape{oc, ic, 3, 3, 8, 8}, TensorShape{2, oc, i, i, 8});
        /*cur_param.sparse = param::ConvBias::Sparse::GROUP;
        args.emplace_back(cur_param, TensorShape{2, 2 * ic, i, i, 8},
                          TensorShape{2, oc, ic, 3, 3, 8, 8},
                          TensorShape{1, 2 * oc, 1, 1, 8});*/
    }}}
    // clang-format on
    //! test for multi-thread OC parallel
    //! NOTE(review): the "}}}" above closes only the ic/oc/i loops, so this
    //! tail still sits inside the nlmode loop and is appended once per
    //! nonlinearity — confirm this duplication is intended
    cur_param.sparse = param::ConvBias::Sparse::DENSE;
    cur_param.pad_h = cur_param.pad_w = 1;
    args.emplace_back(cur_param, TensorShape{2, 1, 9, 9, 8},
                      TensorShape{128, 1, 3, 3, 8, 8},
                      TensorShape{1, 128, 1, 1, 8});
    /*cur_param.sparse = param::ConvBias::Sparse::GROUP;
    args.emplace_back(cur_param, TensorShape{2, 2, 9, 9, 8},
                      TensorShape{2, 128, 1, 3, 3, 8, 8},
                      TensorShape{1, 2 * 128, 1, 1, 8});*/
    }
    return args;
}
} // namespace
  1094. TEST_F(X86_MULTI_THREADS, CONV_BIAS_WINOGRAD_NCHW88_F63) {
  1095. using namespace conv_bias;
  1096. std::vector<TestArg> args = get_winograd_mk_nchw88_args();
  1097. Checker<ConvBiasForward> checker(handle());
  1098. checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBias>(
  1099. ssprintf("WINOGRAD:X86_F32MK8_8X8:8:6").c_str()));
  1100. for (auto&& arg : args) {
  1101. checker.set_param(arg.param).execs(
  1102. {arg.src, arg.filter, arg.bias, {}, {}});
  1103. }
  1104. }
  1105. TEST_F(X86_MULTI_THREADS, CONV_BIAS_WINOGRAD_NCHW88_F23) {
  1106. using namespace conv_bias;
  1107. std::vector<TestArg> args = get_winograd_mk_nchw88_args();
  1108. Checker<ConvBiasForward> checker(handle());
  1109. checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBias>(
  1110. ssprintf("WINOGRAD:X86_F32MK8_8X8:8:2").c_str()));
  1111. for (auto&& arg : args) {
  1112. checker.set_param(arg.param).execs(
  1113. {arg.src, arg.filter, arg.bias, {}, {}});
  1114. }
  1115. }
TEST_F(X86_MULTI_THREADS, CONV_BIAS_WINOGRAD_WEIGHT_PREPROCESS) {
    using namespace conv_bias;
    std::vector<TestArg> args = get_winograd_mk_nchw88_args();
    Checker<ConvBiasForward> checker(handle());
    //! reference implementation: run WinogradFilterPreprocess explicitly,
    //! then a conv_bias in NCHW88_WINOGRAD format on the transformed filter,
    //! so the checker can compare it against the fused algo
    auto extra_impl = [](const TensorNDArray& tensors, uint32_t m,
                         param::ConvBias param, Handle* handle) {
        megdnn_assert(param.format == param::ConvBias::Format::NCHW88);
        auto winograd_preprocess_opr =
                handle->create_operator<WinogradFilterPreprocess>();
        winograd_preprocess_opr->param().output_block_size = m;
        winograd_preprocess_opr->param().format = param::MatrixMul::Format::MK8;
        TensorLayout filter_transform_layout;
        winograd_preprocess_opr->deduce_layout(tensors[1].layout,
                                               filter_transform_layout);
        size_t winograd_preprocess_workspace_in_bytes =
                winograd_preprocess_opr->get_workspace_in_bytes(
                        tensors[1].layout, filter_transform_layout);
        auto conv_bias_opr = handle->create_operator<ConvBias>();
        conv_bias_opr->param() = param;
        conv_bias_opr->param().format = param::ConvBias::Format::NCHW88_WINOGRAD;
        conv_bias_opr->param().output_block_size = m;
        size_t conv_bias_workspace_in_bytes =
                conv_bias_opr->get_workspace_in_bytes(
                        tensors[0].layout, filter_transform_layout,
                        tensors[2].layout, tensors[3].layout,
                        tensors[4].layout);
        //! bundle slots: 0 = transformed filter, 1 = conv workspace,
        //! 2 = preprocess workspace; backed by one malloc, freed below
        WorkspaceBundle wb(nullptr, {filter_transform_layout.span().dist_byte(),
                                     conv_bias_workspace_in_bytes,
                                     winograd_preprocess_workspace_in_bytes});
        wb.set(malloc(wb.total_size_in_bytes()));
        TensorND filter_transform_tensor(wb.get(0),
                                         std::move(filter_transform_layout));
        winograd_preprocess_opr->exec(tensors[1], filter_transform_tensor,
                                      wb.get_workspace(2));
        conv_bias_opr->exec(tensors[0], filter_transform_tensor, tensors[2],
                            tensors[3], tensors[4], wb.get_workspace(1));
        free(wb.ptr());
    };
    //! run every arg for each output block size m, comparing the algo under
    //! test against the explicit two-step reference above
    auto run = [&checker, &extra_impl](
                       Handle* handle, const std::vector<TestArg>& args,
                       const std::vector<size_t>& out_size, DType A_dtype,
                       DType B_dtype, DType C_dtype, DType D_dtype,
                       const float eps) {
        for (auto&& arg : args) {
            for (uint32_t m : out_size) {
                checker.set_extra_opr_impl(std::bind(extra_impl,
                                                     std::placeholders::_1, m,
                                                     arg.param, handle));
                checker.set_dtype(0, A_dtype)
                        .set_dtype(1, B_dtype)
                        .set_dtype(2, C_dtype)
                        .set_dtype(4, D_dtype)
                        .set_epsilon(eps)
                        .set_param(arg.param)
                        .execs({arg.src, arg.filter, arg.bias, {}, {}});
            }
        }
    };
    //! m = 2 and m = 6 cover the F(2,3) and F(6,3) variants
    run(handle(), args, {2, 6}, dtype::Float32(), dtype::Float32(),
        dtype::Float32(), dtype::Float32(), 1e-3f);
}
  1177. /*********************************** End winograd ************************/
  1178. #if defined(MEGDNN_X86_WITH_MKL_DNN)
//! Run one MKL-DNN NCHW88 fp32 correctness case on \p checker.
//! Silently skips channel/group combinations the NCHW88 algo cannot
//! express (per-group channels not a multiple of 8, unless depthwise).
static void x86_correctness_fp32_mkldnn_run(
        Checker<ConvBias>& checker, UniformIntRNG& rng, Handle* handle,
        ConvBiasForward::BiasMode bias_mode,
        param::ConvBias::NonlineMode noline_mode, size_t n, size_t stride,
        size_t kernel, size_t oc, size_t ic, size_t h, size_t w, size_t group) {
    auto oc_per_group = oc / group;
    auto ic_per_group = ic / group;
    //! valid grouped conv: 8-aligned per-group channels (ic may also be 3
    //! for a first-layer conv); depthwise: one channel per group
    bool ok_group = oc_per_group % 8 == 0 && oc_per_group > 0 &&
                    (ic_per_group % 8 == 0 || ic_per_group == 3) &&
                    ic_per_group > 0;
    bool ok_depthwise = oc == ic && oc == group;
    if (!(ok_group || ok_depthwise)) {
        return;
    }
    //! "same" padding
    size_t pad = kernel / 2;
    size_t kernel_h = kernel;
    size_t kernel_w = kernel;
    param::ConvBias param;
    param.format = param::ConvBias::Format::NCHW88;
    param.stride_h = stride;
    param.stride_w = stride;
    param.pad_h = pad;
    param.pad_w = pad;
    param.nonlineMode = noline_mode;
    //! ic == 3 is the first-layer case: plain NCHW src with a
    //! channel-interleaved filter, otherwise fully blocked NCHW88
    auto src_tensor_shape = TensorShape{n, ic / 8, h, w, 8};
    if (ic == 3) {
        src_tensor_shape = TensorShape{n, ic, h, w};
    }
    auto weight_tensor_shape =
            TensorShape{oc / 8, ic / 8, kernel_h, kernel_w, 8, 8};
    if (ic == 3) {
        weight_tensor_shape = TensorShape{oc / 8, kernel_h, kernel_w, ic, 8};
    }
    auto bias_tensor_shape = TensorShape{};
    if (bias_mode == megdnn::BiasMode::BROADCAST_CHANNEL_BIAS) {
        bias_tensor_shape = {1, oc / 8, 1, 1, 8};
    } else if (bias_mode == megdnn::BiasMode::BIAS) {
        //! full bias must match the output layout, so deduce it first
        TensorLayout dst_layout;
        auto ConvBiasOp = handle->create_operator<ConvBias>();
        ConvBiasOp->param() = param;
        ConvBiasOp->deduce_layout({src_tensor_shape, dtype::Float32()},
                                  {weight_tensor_shape, dtype::Float32()}, {},
                                  {}, dst_layout);
        bias_tensor_shape = dst_layout;
    }
    //! pick sparse mode and the matching filter layout: dense, depthwise
    //! (one 8-block per group slice), or general grouped
    if (group == 1) {
        param.sparse = param::ConvBias::Sparse::DENSE;
    } else if (group > 1 && ic / group == 1 && oc / group == 1) {
        param.sparse = param::ConvBias::Sparse::GROUP;
        weight_tensor_shape =
                TensorShape{group / 8, 1, 1, kernel_h, kernel_w, 8};
    } else if (group > 1 && oc / group % 8 == 0 && oc / group > 0 &&
               ic / group % 8 == 0 && ic / group > 0) {
        param.sparse = param::ConvBias::Sparse::GROUP;
        weight_tensor_shape = TensorShape{
                group, oc / group / 8, ic / group / 8, kernel_h, kernel_w, 8,
                8};
    }
    checker.set_dtype(0, dtype::Float32())
            .set_dtype(1, dtype::Float32())
            .set_dtype(2, dtype::Float32())
            .set_dtype(4, dtype::Float32())
            .set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &rng)
            .set_epsilon(1e-3)
            .set_param(param)
            .execs({src_tensor_shape,
                    weight_tensor_shape,
                    bias_tensor_shape,
                    {},
                    {}});
}
//! Sweep bias modes, nonlinearities, batch, stride, kernel, channels,
//! spatial sizes and group counts through x86_correctness_fp32_mkldnn_run
//! with the algo pinned to MKLDNN_CONV_FP32.
static void x86_correctness_fp32_mkldnn(Handle* handle) {
    Checker<ConvBias> checker(handle);
    UniformIntRNG rng{-127, 127};
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "MKLDNN_CONV_FP32"));
    for (auto bias_mode :
         {megdnn::BiasMode::NO_BIAS, megdnn::BiasMode::BROADCAST_CHANNEL_BIAS,
          megdnn::BiasMode::BIAS})
        for (auto noline_mode : {param::ConvBias::NonlineMode::IDENTITY,
                                 param::ConvBias::NonlineMode::SIGMOID,
                                 param::ConvBias::NonlineMode::H_SWISH})
            for (size_t n : {1, 2})
                for (size_t stride : {1, 2})
                    for (size_t kernel : {3, 5, 7})
                        for (size_t oc : {8, 16})
                            for (size_t ic : {3, 8, 16})
                                for (size_t h : {22, 33})
                                    for (size_t w : {22, 33}) {
                                        //! invalid group splits are rejected
                                        //! inside the callee, not here
                                        for (size_t group = 1;
                                             group <= std::min(oc, ic);
                                             ++group) {
                                            x86_correctness_fp32_mkldnn_run(
                                                    checker, rng, handle,
                                                    bias_mode, noline_mode, n,
                                                    stride, kernel, oc, ic, h,
                                                    w, group);
                                        }
                                    }
}
//! single-thread variant of the MKL-DNN NCHW88 fp32 correctness sweep
TEST_F(X86, CONV_BIAS_DIRECT_MKLDNN_C8) {
    x86_correctness_fp32_mkldnn(handle());
}
//! multi-thread variant of the MKL-DNN NCHW88 fp32 correctness sweep
TEST_F(X86_MULTI_THREADS, CONV_BIAS_DIRECT_MKLDNN_C8) {
    x86_correctness_fp32_mkldnn(handle());
}
  1288. TEST_F(X86, CONV_BIAS_MKL_DNN_MATMUL_INT8) {
  1289. using namespace conv_bias;
  1290. std::vector<TestArg> args;
  1291. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  1292. size_t p, NonlineMode nonline_mode) {
  1293. if (w + 2 * p < kernel || h + 2 * p < kernel)
  1294. return;
  1295. param::ConvBias param;
  1296. param.stride_h = 1;
  1297. param.stride_w = 1;
  1298. param.pad_h = p;
  1299. param.pad_w = p;
  1300. param.nonlineMode = nonline_mode;
  1301. //! no bias
  1302. args.emplace_back(param, TensorShape{1, ic, h, w},
  1303. TensorShape{oc, ic, kernel, kernel}, TensorShape{});
  1304. };
  1305. for (size_t kernel : {2, 3, 5, 7})
  1306. for (size_t ic : {1, 2, 3, 4})
  1307. for (size_t oc : {1, 2, 4})
  1308. for (size_t p : {0, 2})
  1309. for (size_t size : {20, 21, 22, 23, 24})
  1310. for (NonlineMode nonline_mode :
  1311. {NonlineMode::IDENTITY}) {
  1312. run(oc, ic, size, size, kernel, p, nonline_mode);
  1313. }
  1314. Checker<ConvBias> checker(handle());
  1315. checker.set_before_exec_callback(
  1316. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  1317. "MKLDNN_MATMUL_INT8"));
  1318. checker.set_epsilon(1);
  1319. UniformIntRNG rng{-50, 50};
  1320. checker.set_dtype(0, dtype::Int8())
  1321. .set_dtype(1, dtype::Int8())
  1322. .set_dtype(2, dtype::Int32())
  1323. .set_dtype(4, dtype::Int32())
  1324. .set_rng(0, &rng)
  1325. .set_rng(1, &rng)
  1326. .set_rng(2, &rng);
  1327. for (auto&& arg : args) {
  1328. checker.set_param(arg.param).exec(
  1329. {arg.src, arg.filter, arg.bias, {}, {}});
  1330. }
  1331. }
  1332. TEST_F(X86, CONV_BIAS_MKL_DNN_INT8) {
  1333. using namespace conv_bias;
  1334. std::vector<TestArg> args;
  1335. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  1336. size_t p, NonlineMode nonline_mode) {
  1337. if (w + 2 * p < kernel || h + 2 * p < kernel)
  1338. return;
  1339. param::ConvBias param;
  1340. param.stride_h = 1;
  1341. param.stride_w = 1;
  1342. param.pad_h = p;
  1343. param.pad_w = p;
  1344. param.nonlineMode = nonline_mode;
  1345. //! no bias
  1346. args.emplace_back(param, TensorShape{1, ic, h, w},
  1347. TensorShape{oc, ic, kernel, kernel}, TensorShape{});
  1348. };
  1349. for (size_t kernel : {2, 3, 5, 7})
  1350. for (size_t ic : {1, 2, 3, 4})
  1351. for (size_t oc : {1, 2, 4})
  1352. for (size_t p : {0, 2})
  1353. for (size_t size : {20, 22, 24})
  1354. for (NonlineMode nonline_mode :
  1355. {NonlineMode::IDENTITY}) {
  1356. run(oc, ic, size, size, kernel, p, nonline_mode);
  1357. }
  1358. Checker<ConvBias> checker(handle());
  1359. checker.set_before_exec_callback(
  1360. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("MKLDNN_INT8"));
  1361. checker.set_epsilon(1);
  1362. UniformIntRNG rng{-50, 50};
  1363. checker.set_dtype(0, dtype::Int8())
  1364. .set_dtype(1, dtype::Int8())
  1365. .set_dtype(2, dtype::Int32())
  1366. .set_dtype(4, dtype::Int32())
  1367. .set_rng(0, &rng)
  1368. .set_rng(1, &rng)
  1369. .set_rng(2, &rng);
  1370. for (auto&& arg : args) {
  1371. checker.set_param(arg.param).exec(
  1372. {arg.src, arg.filter, arg.bias, {}, {}});
  1373. }
  1374. }
  1375. TEST_F(X86_MULTI_THREADS, CONV_BIAS_MKL_DNN_INT8) {
  1376. using namespace conv_bias;
  1377. std::vector<TestArg> args;
  1378. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  1379. size_t p, NonlineMode nonline_mode) {
  1380. if (w + 2 * p < kernel || h + 2 * p < kernel)
  1381. return;
  1382. param::ConvBias param;
  1383. param.stride_h = 1;
  1384. param.stride_w = 1;
  1385. param.pad_h = p;
  1386. param.pad_w = p;
  1387. param.nonlineMode = nonline_mode;
  1388. //! no bias
  1389. args.emplace_back(param, TensorShape{1, ic, h, w},
  1390. TensorShape{oc, ic, kernel, kernel}, TensorShape{});
  1391. };
  1392. for (size_t kernel : {2, 3, 5, 7})
  1393. for (size_t ic : {1, 2, 3, 4})
  1394. for (size_t oc : {1, 2, 4})
  1395. for (size_t p : {0, 2})
  1396. for (size_t size : {20, 22, 24})
  1397. for (NonlineMode nonline_mode :
  1398. {NonlineMode::IDENTITY}) {
  1399. run(oc, ic, size, size, kernel, p, nonline_mode);
  1400. }
  1401. Checker<ConvBias> checker(handle());
  1402. checker.set_before_exec_callback(
  1403. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("MKLDNN_INT8"));
  1404. checker.set_epsilon(1);
  1405. UniformIntRNG rng{-50, 50};
  1406. checker.set_dtype(0, dtype::Int8())
  1407. .set_dtype(1, dtype::Int8())
  1408. .set_dtype(2, dtype::Int32())
  1409. .set_dtype(4, dtype::Int32())
  1410. .set_rng(0, &rng)
  1411. .set_rng(1, &rng)
  1412. .set_rng(2, &rng);
  1413. for (auto&& arg : args) {
  1414. checker.set_param(arg.param).exec(
  1415. {arg.src, arg.filter, arg.bias, {}, {}});
  1416. }
  1417. }
  1418. #endif
  1419. #if MEGDNN_WITH_BENCHMARK
  1420. namespace {
  1421. void benchmark_impl(const param::ConvBias param,
  1422. std::vector<std::pair<SmallVector<TensorShape>, float>>&
  1423. shapes_and_computation,
  1424. const std::string algo_name, size_t RUNS,
  1425. TaskExecutorConfig&& multi_thread_config,
  1426. TaskExecutorConfig&& single_thread_config,
  1427. std::vector<DType> dtype_v) {
  1428. std::vector<DType> data_type = {dtype::Float32(), dtype::Float32(),
  1429. dtype::Float32(), dtype::Float32()};
  1430. std::vector<float> multi_thread_times, single_thread_times;
  1431. {
  1432. auto multi_thread_hanle =
  1433. create_cpu_handle(0, true, &multi_thread_config);
  1434. auto benchmarker = Benchmarker<ConvBias>(multi_thread_hanle.get());
  1435. benchmarker.set_times(RUNS)
  1436. .set_display(false)
  1437. .set_dtype(0, dtype_v[0])
  1438. .set_dtype(1, dtype_v[1])
  1439. .set_dtype(2, dtype_v[2])
  1440. .set_dtype(4, dtype_v[3])
  1441. .set_param(param)
  1442. .set_before_exec_callback(
  1443. conv_bias::ConvBiasAlgoChecker<ConvBias>(
  1444. algo_name.c_str()));
  1445. for (auto shape : shapes_and_computation) {
  1446. multi_thread_times.push_back(benchmarker.exec(shape.first) / RUNS);
  1447. }
  1448. }
  1449. {
  1450. auto single_thread_handle =
  1451. create_cpu_handle(0, true, &single_thread_config);
  1452. auto benchmarker = Benchmarker<ConvBias>(single_thread_handle.get());
  1453. benchmarker.set_times(RUNS)
  1454. .set_display(false)
  1455. .set_dtype(0, dtype_v[0])
  1456. .set_dtype(1, dtype_v[1])
  1457. .set_dtype(2, dtype_v[2])
  1458. .set_dtype(4, dtype_v[3])
  1459. .set_param(param)
  1460. .set_before_exec_callback(
  1461. conv_bias::ConvBiasAlgoChecker<ConvBias>(
  1462. algo_name.c_str()));
  1463. for (auto shape : shapes_and_computation) {
  1464. single_thread_times.push_back(benchmarker.exec(shape.first) / RUNS);
  1465. }
  1466. }
  1467. printf("Benchmark : Multi threads %zu, ", multi_thread_config.nr_thread);
  1468. printf("core_ids:");
  1469. for (size_t i = 0; i < multi_thread_config.affinity_core_set.size(); i++) {
  1470. printf("%zu ", multi_thread_config.affinity_core_set[i]);
  1471. }
  1472. printf(", Single thread core_id %zu\n",
  1473. single_thread_config.affinity_core_set[0]);
  1474. for (size_t i = 0; i < shapes_and_computation.size(); i++) {
  1475. auto shapes = shapes_and_computation[i];
  1476. printf("Bench case: ");
  1477. for (auto&& shape : shapes.first) {
  1478. printf("%s ", shape.to_string().c_str());
  1479. }
  1480. float computations = shapes.second;
  1481. printf("%zu threads gflops: %f,\n single thread gflops: "
  1482. "%f. spead up = %f, speedup/cores=%f\n",
  1483. multi_thread_config.nr_thread,
  1484. computations / multi_thread_times[i],
  1485. computations / single_thread_times[i],
  1486. single_thread_times[i] / multi_thread_times[i],
  1487. single_thread_times[i] / multi_thread_times[i] /
  1488. multi_thread_config.nr_thread);
  1489. }
  1490. }
  1491. void benchmark_impl_comp(const param::ConvBias param,
  1492. std::vector<std::pair<SmallVector<TensorShape>, float>>&
  1493. shapes_and_computation,
  1494. const std::string algo_name, const std::string algo_name1,size_t RUNS,
  1495. TaskExecutorConfig&& multi_thread_config,
  1496. TaskExecutorConfig&& single_thread_config,std::vector<DType> dtype_v) {
  1497. std::vector<DType> data_type = {dtype::Float32(), dtype::Float32(),
  1498. dtype::Float32(), dtype::Float32()};
  1499. std::vector<float> multi_thread_times, single_thread_times;
  1500. {
  1501. auto multi_thread_hanle =
  1502. create_cpu_handle(0, true, &multi_thread_config);
  1503. auto benchmarker = Benchmarker<ConvBias>(multi_thread_hanle.get());
  1504. benchmarker.set_times(RUNS)
  1505. .set_display(false)
  1506. .set_dtype(0,dtype_v[0])
  1507. .set_dtype(1,dtype_v[1])
  1508. .set_dtype(2,dtype_v[2])
  1509. .set_dtype(4,dtype_v[3])
  1510. .set_param(param)
  1511. .set_before_exec_callback(
  1512. conv_bias::ConvBiasAlgoChecker<ConvBias>(
  1513. algo_name.c_str()));
  1514. for (auto shape : shapes_and_computation) {
  1515. multi_thread_times.push_back(benchmarker.exec(shape.first) / RUNS);
  1516. }
  1517. }
  1518. {
  1519. auto single_thread_handle =
  1520. create_cpu_handle(0, true, &single_thread_config);
  1521. auto benchmarker = Benchmarker<ConvBias>(single_thread_handle.get());
  1522. benchmarker.set_times(RUNS)
  1523. .set_display(false)
  1524. .set_dtype(0,dtype_v[0])
  1525. .set_dtype(1,dtype_v[1])
  1526. .set_dtype(2,dtype_v[2])
  1527. .set_dtype(4,dtype_v[3])
  1528. .set_param(param)
  1529. .set_before_exec_callback(
  1530. conv_bias::ConvBiasAlgoChecker<ConvBias>(
  1531. algo_name1.c_str()));
  1532. for (auto shape : shapes_and_computation) {
  1533. single_thread_times.push_back(benchmarker.exec(shape.first) / RUNS);
  1534. }
  1535. }
  1536. printf("Benchmark : Multi threads %zu, ", multi_thread_config.nr_thread);
  1537. printf("core_ids:");
  1538. for (size_t i = 0; i < multi_thread_config.affinity_core_set.size(); i++) {
  1539. printf("%zu ", multi_thread_config.affinity_core_set[i]);
  1540. }
  1541. for (size_t i = 0; i < shapes_and_computation.size(); i++) {
  1542. auto shapes = shapes_and_computation[i];
  1543. printf("Bench case: ");
  1544. for (auto&& shape : shapes.first) {
  1545. printf("%s ", shape.to_string().c_str());
  1546. }
  1547. float computations = shapes.second;
  1548. printf("algo:%s gflops: %f,\n algo:%s gflops: "
  1549. "%f. spead up = %f\n",
  1550. algo_name.c_str(), computations / multi_thread_times[i],
  1551. algo_name1.c_str(), computations / single_thread_times[i],
  1552. single_thread_times[i] / multi_thread_times[i]);
  1553. }
  1554. }
  1555. } // namespace
  1556. TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_CHANWISE_AVX2_INT8) {
  1557. constexpr size_t RUNS = 50;
  1558. param::ConvBias param;
  1559. param.stride_h = 1;
  1560. param.stride_w = 1;
  1561. param.sparse = param::ConvBias::Sparse::GROUP;
  1562. std::vector<DType> data_type = {dtype::Int8(), dtype::Int8(),
  1563. dtype::Int32(), dtype::Int32()};
  1564. std::vector<std::pair<SmallVector<TensorShape>, float>>
  1565. shapes_and_computation;
  1566. auto bench_case = [&](size_t N, size_t IC, size_t H, size_t W, size_t FS) {
  1567. param.pad_h = FS / 2;
  1568. param.pad_w = FS / 2;
  1569. SmallVector<TensorShape> shapes{
  1570. {N, IC, H, W}, {IC, 1, 1, FS, FS}, {}, {}, {}};
  1571. TensorShape dst{N, IC, (H + 2 * param.pad_h - FS) + 1,
  1572. (W + 2 * param.pad_w - FS) + 1};
  1573. float computations = (FS * FS * dst.total_nr_elems() * 2) * 1e-6;
  1574. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  1575. };
  1576. bench_case(1, 32, 112, 112, 7);
  1577. bench_case(1, 144, 56, 56, 7);
  1578. bench_case(1, 192, 28, 28, 7);
  1579. bench_case(1, 384, 28, 28, 7);
  1580. bench_case(1, 576, 14, 14, 7);
  1581. bench_case(1, 960, 7, 7, 7);
  1582. bench_case(1, 32, 112, 112, 5);
  1583. bench_case(1, 144, 56, 56, 5);
  1584. bench_case(1, 192, 28, 28, 5);
  1585. bench_case(1, 384, 28, 28, 5);
  1586. bench_case(1, 576, 14, 14, 5);
  1587. bench_case(1, 960, 7, 7, 5);
  1588. bench_case(1, 32, 112, 112, 3);
  1589. bench_case(1, 144, 56, 56, 3);
  1590. bench_case(1, 192, 28, 28, 3);
  1591. bench_case(1, 384, 28, 28, 3);
  1592. bench_case(1, 576, 14, 14, 3);
  1593. bench_case(1, 960, 7, 7, 3);
  1594. bench_case(1, 32, 112, 112, 2);
  1595. bench_case(1, 144, 56, 56, 2);
  1596. bench_case(1, 192, 28, 28, 2);
  1597. bench_case(1, 384, 28, 28, 2);
  1598. bench_case(1, 576, 14, 14, 2);
  1599. bench_case(1, 960, 7, 7, 2);
  1600. std::string algo_name = "X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1";
  1601. printf("Benchmark X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1\n");
  1602. benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
  1603. {4, {4, 5, 6, 7}}, {1, {4}}, data_type);
  1604. benchmark_impl(param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}},
  1605. {1, {4}}, data_type);
  1606. shapes_and_computation.clear();
  1607. }
  1608. TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_DIRECT_AVX2_INT8) {
  1609. constexpr size_t RUNS = 50;
  1610. param::ConvBias param;
  1611. param.stride_h = 1;
  1612. param.stride_w = 1;
  1613. param.sparse = param::ConvBias::Sparse::DENSE;
  1614. std::vector<DType> data_type = {dtype::Int8(), dtype::Int8(),
  1615. dtype::Int32(), dtype::Int32()};
  1616. std::vector<std::pair<SmallVector<TensorShape>, float>>
  1617. shapes_and_computation;
  1618. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W,
  1619. size_t FS) {
  1620. param.pad_h = FS / 2;
  1621. param.pad_w = FS / 2;
  1622. SmallVector<TensorShape> shapes{
  1623. {N, IC, H, W}, {OC, IC, FS, FS}, {}, {}, {}};
  1624. TensorShape dst{N, OC, (H + 2 * param.pad_h - FS) + 1,
  1625. (W + 2 * param.pad_w - FS) + 1};
  1626. float computations = (IC * FS * FS * dst.total_nr_elems() * 2) * 1e-6;
  1627. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  1628. };
  1629. bench_case(1, 32, 32, 200, 200, 7);
  1630. bench_case(1, 32, 64, 200, 200, 7);
  1631. bench_case(1, 32, 32, 128, 128, 7);
  1632. bench_case(1, 32, 64, 128, 128, 7);
  1633. bench_case(1, 32, 32, 100, 100, 7);
  1634. bench_case(1, 32, 64, 100, 100, 7);
  1635. bench_case(1, 32, 32, 80, 80, 7);
  1636. bench_case(1, 32, 64, 80, 80, 7);
  1637. bench_case(1, 32, 32, 200, 200, 5);
  1638. bench_case(1, 32, 64, 200, 200, 5);
  1639. bench_case(1, 32, 32, 128, 128, 5);
  1640. bench_case(1, 32, 64, 128, 128, 5);
  1641. bench_case(1, 32, 32, 100, 100, 5);
  1642. bench_case(1, 32, 64, 100, 100, 5);
  1643. bench_case(1, 32, 32, 80, 80, 5);
  1644. bench_case(1, 32, 64, 80, 80, 5);
  1645. bench_case(1, 32, 32, 200, 200, 3);
  1646. bench_case(1, 32, 64, 200, 200, 3);
  1647. bench_case(1, 32, 32, 128, 128, 3);
  1648. bench_case(1, 32, 64, 128, 128, 3);
  1649. bench_case(1, 32, 32, 100, 100, 3);
  1650. bench_case(1, 32, 64, 100, 100, 3);
  1651. bench_case(1, 32, 32, 80, 80, 3);
  1652. bench_case(1, 32, 64, 80, 80, 3);
  1653. bench_case(1, 32, 32, 200, 200, 2);
  1654. bench_case(1, 32, 64, 200, 200, 2);
  1655. bench_case(1, 32, 32, 128, 128, 2);
  1656. bench_case(1, 32, 64, 128, 128, 2);
  1657. bench_case(1, 32, 32, 100, 100, 2);
  1658. bench_case(1, 32, 64, 100, 100, 2);
  1659. bench_case(1, 32, 32, 80, 80, 2);
  1660. bench_case(1, 32, 64, 80, 80, 2);
  1661. std::string algo_name = "X86_CONV_BIAS_DIRECT_AVX2_INT8_STRIDE1";
  1662. printf("Benchmark X86_CONV_BIAS_DIRECT_AVX2_INT8_STRIDE1 algo\n");
  1663. benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
  1664. {4, {4, 5, 6, 7}}, {1, {4}}, data_type);
  1665. benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
  1666. {4, {4, 5, 6, 7}}, {1, {7}}, data_type);
  1667. benchmark_impl(param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}},
  1668. {1, {4}}, data_type);
  1669. shapes_and_computation.clear();
  1670. }
  1671. TEST_F(X86_BENCHMARK_MULTI_THREADS,
  1672. BENCHMARK_CONVBIAS_DIRECT_AVX2_INT8_STRIDE2) {
  1673. constexpr size_t RUNS = 50;
  1674. param::ConvBias param;
  1675. param.stride_h = 2;
  1676. param.stride_w = 2;
  1677. param.sparse = param::ConvBias::Sparse::DENSE;
  1678. std::vector<DType> data_type = {dtype::Int8(), dtype::Int8(),
  1679. dtype::Int32(), dtype::Int32()};
  1680. std::vector<std::pair<SmallVector<TensorShape>, float>>
  1681. shapes_and_computation;
  1682. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W,
  1683. size_t FS) {
  1684. param.pad_h = FS / 2;
  1685. param.pad_w = FS / 2;
  1686. SmallVector<TensorShape> shapes{
  1687. {N, IC, H, W}, {OC, IC, FS, FS}, {}, {}, {}};
  1688. TensorShape dst{N, OC, (H + 2 * param.pad_h - FS) / param.stride_h + 1,
  1689. (W + 2 * param.pad_w - FS) / param.pad_w + 1};
  1690. float computations = (IC * FS * FS * dst.total_nr_elems() * 2) * 1e-6;
  1691. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  1692. };
  1693. bench_case(1, 32, 32, 200, 200, 7);
  1694. bench_case(1, 32, 64, 200, 200, 7);
  1695. bench_case(1, 32, 32, 128, 128, 7);
  1696. bench_case(1, 32, 64, 128, 128, 7);
  1697. bench_case(1, 32, 32, 100, 100, 7);
  1698. bench_case(1, 32, 64, 100, 100, 7);
  1699. bench_case(1, 32, 32, 80, 80, 7);
  1700. bench_case(1, 32, 64, 80, 80, 7);
  1701. bench_case(1, 32, 32, 200, 200, 5);
  1702. bench_case(1, 32, 64, 200, 200, 5);
  1703. bench_case(1, 32, 32, 128, 128, 5);
  1704. bench_case(1, 32, 64, 128, 128, 5);
  1705. bench_case(1, 32, 32, 100, 100, 5);
  1706. bench_case(1, 32, 64, 100, 100, 5);
  1707. bench_case(1, 32, 32, 80, 80, 5);
  1708. bench_case(1, 32, 64, 80, 80, 5);
  1709. bench_case(1, 32, 32, 200, 200, 3);
  1710. bench_case(1, 32, 64, 200, 200, 3);
  1711. bench_case(1, 32, 32, 128, 128, 3);
  1712. bench_case(1, 32, 64, 128, 128, 3);
  1713. bench_case(1, 32, 32, 100, 100, 3);
  1714. bench_case(1, 32, 64, 100, 100, 3);
  1715. bench_case(1, 32, 32, 80, 80, 3);
  1716. bench_case(1, 32, 64, 80, 80, 3);
  1717. bench_case(1, 32, 32, 200, 200, 2);
  1718. bench_case(1, 32, 64, 200, 200, 2);
  1719. bench_case(1, 32, 32, 128, 128, 2);
  1720. bench_case(1, 32, 64, 128, 128, 2);
  1721. bench_case(1, 32, 32, 100, 100, 2);
  1722. bench_case(1, 32, 64, 100, 100, 2);
  1723. bench_case(1, 32, 32, 80, 80, 2);
  1724. bench_case(1, 32, 64, 80, 80, 2);
  1725. std::string algo_name = "X86_CONV_BIAS_DIRECT_AVX2_INT8_STRIDE2";
  1726. printf("Benchmark X86_CONV_BIAS_DIRECT_AVX2_INT8_STRIDE2 algo\n");
  1727. benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
  1728. {4, {4, 5, 6, 7}}, {1, {4}}, data_type);
  1729. benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
  1730. {4, {4, 5, 6, 7}}, {1, {7}}, data_type);
  1731. benchmark_impl(param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}},
  1732. {1, {4}}, data_type);
  1733. shapes_and_computation.clear();
  1734. }
  1735. TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_DIRECTF32) {
  1736. constexpr size_t RUNS = 50;
  1737. param::ConvBias param;
  1738. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  1739. param.pad_h = 1;
  1740. param.pad_w = 1;
  1741. param.stride_h = 1;
  1742. param.stride_w = 1;
  1743. param.sparse = param::ConvBias::Sparse::GROUP;
  1744. std::vector<DType> data_type = {dtype::Float32(), dtype::Float32(),
  1745. dtype::Float32(), dtype::Float32()};
  1746. std::vector<std::pair<SmallVector<TensorShape>, float>>
  1747. shapes_and_computation;
  1748. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W,
  1749. size_t FS, size_t group) {
  1750. SmallVector<TensorShape> shapes{{N, IC, H, W},
  1751. {group, OC / group, IC / group, FS, FS},
  1752. {1, OC, 1, 1},
  1753. {},
  1754. {N, OC, H, W}};
  1755. TensorShape dst{N, OC, H, W};
  1756. float computations =
  1757. ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  1758. dst.total_nr_elems()) *
  1759. 1e-6;
  1760. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  1761. };
  1762. bench_case(1, 32, 32, 200, 200, 3, 4);
  1763. bench_case(1, 32, 32, 200, 200, 3, 32);
  1764. bench_case(1, 32, 32, 128, 128, 3, 4);
  1765. bench_case(1, 32, 32, 128, 128, 3, 32);
  1766. bench_case(1, 32, 32, 100, 100, 3, 4);
  1767. bench_case(1, 32, 32, 100, 100, 3, 32);
  1768. bench_case(1, 32, 32, 80, 80, 3, 4);
  1769. bench_case(1, 32, 32, 80, 80, 3, 32);
  1770. std::string algo_name = "X86_CONV_BIAS_DIRECT_STRIDE1_LARGE_GROUP";
  1771. printf("Benchmark X86_CONV_BIAS_DIRECT_STRIDE1_LARGE_GROUP algo\n");
  1772. benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
  1773. {4, {4, 5, 6, 7}}, {1, {4}}, data_type);
  1774. benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
  1775. {4, {4, 5, 6, 7}}, {1, {7}}, data_type);
  1776. benchmark_impl(param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}},
  1777. {1, {4}}, data_type);
  1778. shapes_and_computation.clear();
  1779. algo_name = "X86_CONV_BIAS_DIRECT_STRIDE1_SMALL_GROUP";
  1780. printf("Benchmark X86_CONV_BIAS_DIRECT_STRIDE1_SMALL_GROUP algo\n");
  1781. bench_case(1, 32, 32, 200, 200, 3, 1);
  1782. bench_case(1, 32, 32, 128, 128, 3, 1);
  1783. bench_case(1, 32, 32, 100, 100, 3, 1);
  1784. bench_case(1, 32, 32, 80, 80, 3, 1);
  1785. benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
  1786. {4, {4, 5, 6, 7}}, {1, {4}}, data_type);
  1787. benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
  1788. {4, {4, 5, 6, 7}}, {1, {7}}, data_type);
  1789. benchmark_impl(param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}},
  1790. {1, {4}}, data_type);
  1791. }
  1792. TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_IM2COL_F32) {
  1793. constexpr size_t RUNS = 50;
  1794. param::ConvBias param;
  1795. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  1796. param.pad_h = 1;
  1797. param.pad_w = 1;
  1798. param.stride_h = 1;
  1799. param.stride_w = 1;
  1800. std::vector<DType> data_type = {dtype::Float32(), dtype::Float32(),
  1801. dtype::Float32(), dtype::Float32()};
  1802. std::vector<std::pair<SmallVector<TensorShape>, float>>
  1803. shapes_and_computation;
  1804. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W,
  1805. size_t FS, size_t group) {
  1806. SmallVector<TensorShape> shapes{{N, IC, H, W},
  1807. {OC / group, IC / group, FS, FS},
  1808. {1, OC, 1, 1},
  1809. {},
  1810. {N, OC, H, W}};
  1811. TensorShape dst{N, OC, H, W};
  1812. float computations =
  1813. ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  1814. dst.total_nr_elems()) *
  1815. 1e-6;
  1816. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  1817. };
  1818. bench_case(1, 32, 32, 200, 200, 3, 1);
  1819. bench_case(1, 32, 32, 200, 200, 3, 1);
  1820. bench_case(1, 32, 32, 128, 128, 3, 1);
  1821. bench_case(1, 32, 32, 128, 128, 3, 1);
  1822. bench_case(1, 32, 32, 100, 100, 3, 1);
  1823. bench_case(1, 32, 32, 100, 100, 3, 1);
  1824. bench_case(1, 32, 32, 80, 80, 3, 1);
  1825. bench_case(1, 32, 32, 80, 80, 3, 1);
  1826. bench_case(1, 64, 32, 7, 7, 3, 1);
  1827. bench_case(1, 64, 64, 7, 7, 3, 1);
  1828. bench_case(1, 64, 128, 7, 7, 3, 1);
  1829. bench_case(1, 64, 256, 7, 7, 3, 1);
  1830. bench_case(1, 64, 512, 7, 7, 3, 1);
  1831. bench_case(1, 64, 1024, 7, 7, 3, 1);
  1832. bench_case(1, 64, 32, 14, 14, 3, 1);
  1833. bench_case(1, 64, 64, 14, 14, 3, 1);
  1834. bench_case(1, 64, 128, 14, 14, 3, 1);
  1835. bench_case(1, 64, 256, 14, 14, 3, 1);
  1836. bench_case(1, 64, 512, 14, 14, 3, 1);
  1837. bench_case(1, 64, 1024, 14, 14, 3, 1);
  1838. bench_case(1, 128, 128, 14, 14, 3, 1);
  1839. bench_case(1, 128, 256, 14, 14, 3, 1);
  1840. bench_case(1, 512, 512, 14, 14, 3, 1);
  1841. bench_case(1, 256, 512, 14, 14, 3, 1);
  1842. bench_case(1, 512, 1024, 14, 14, 3, 1);
  1843. bench_case(1, 1024, 1024, 14, 14, 3, 1);
  1844. std::string algo_name = "IM2COLMATMUL:X86_F32_BLAS:192";
  1845. printf("Benchmark IM2COLMATMUL:X86_F32_BLAS algo\n");
  1846. benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
  1847. {4, {4, 5, 6, 7}}, {1, {4}}, data_type);
  1848. benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
  1849. {4, {4, 5, 6, 7}}, {1, {7}}, data_type);
  1850. benchmark_impl(param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}},
  1851. {1, {4}}, data_type);
  1852. shapes_and_computation.clear();
  1853. }
  1854. TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_IM2COL_F32_single_thread) {
  1855. constexpr size_t RUNS = 50;
  1856. param::ConvBias param;
  1857. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  1858. param.pad_h = 1;
  1859. param.pad_w = 1;
  1860. param.stride_h = 1;
  1861. param.stride_w = 1;
  1862. std::vector<DType> data_type = {dtype::Float32(), dtype::Float32(),
  1863. dtype::Float32(), dtype::Float32()};
  1864. std::vector<std::pair<SmallVector<TensorShape>, float>>
  1865. shapes_and_computation;
  1866. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H,
  1867. size_t W, size_t FS,
  1868. size_t group) {
  1869. SmallVector<TensorShape> shapes{{N, IC, H, W},
  1870. {OC / group, IC / group, FS, FS},
  1871. {1, OC, 1, 1},
  1872. {},
  1873. {N, OC, H, W}};
  1874. TensorShape dst{N, OC, H, W};
  1875. float computations =
  1876. ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  1877. dst.total_nr_elems()) *
  1878. 1e-6;
  1879. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  1880. };
  1881. bench_case(1, 32, 32, 200, 200, 3, 1);
  1882. bench_case(1, 32, 32, 200, 200, 3, 1);
  1883. bench_case(1, 32, 32, 128, 128, 3, 1);
  1884. bench_case(1, 32, 32, 128, 128, 3, 1);
  1885. bench_case(1, 32, 32, 100, 100, 3, 1);
  1886. bench_case(1, 32, 32, 100, 100, 3, 1);
  1887. bench_case(1, 32, 32, 80, 80, 3, 1);
  1888. bench_case(1, 32, 32, 80, 80, 3, 1);
  1889. bench_case(1, 64, 32, 7, 7, 3, 1);
  1890. bench_case(1, 64, 64, 7, 7, 3, 1);
  1891. bench_case(1, 64, 128, 7, 7, 3, 1);
  1892. bench_case(1, 64, 256, 7, 7, 3, 1);
  1893. bench_case(1, 64, 512, 7, 7, 3, 1);
  1894. bench_case(1, 64, 1024, 7, 7, 3, 1);
  1895. bench_case(1, 64, 32, 14, 14, 3, 1);
  1896. bench_case(1, 64, 64, 14, 14, 3, 1);
  1897. bench_case(1, 64, 128, 14, 14, 3, 1);
  1898. bench_case(1, 64, 256, 14, 14, 3, 1);
  1899. bench_case(1, 64, 512, 14, 14, 3, 1);
  1900. bench_case(1, 64, 1024, 14, 14, 3, 1);
  1901. bench_case(1, 128, 128, 14, 14, 3, 1);
  1902. bench_case(1, 128, 256, 14, 14, 3, 1);
  1903. bench_case(1, 512, 512, 14, 14, 3, 1);
  1904. bench_case(1, 256, 512, 14, 14, 3, 1);
  1905. bench_case(1, 512, 1024, 14, 14, 3, 1);
  1906. bench_case(1, 1024, 1024, 14, 14, 3, 1);
  1907. std::string algo_name = "IM2COLMATMUL:X86_F32_MKL_PACKA:192";
  1908. std::string algo_name1 = "IM2COLMATMUL:X86_F32_BLAS:192";
  1909. printf("Benchmark IM2COLMATMUL:X86_F32_BLAS algo\n");
  1910. benchmark_impl_comp(param, shapes_and_computation, algo_name,algo_name1, RUNS,
  1911. {1, {4}}, {1, {4}},data_type);
  1912. benchmark_impl_comp(param, shapes_and_computation, algo_name,algo_name1, RUNS,
  1913. {1, {7}}, {1, {7}},data_type);
  1914. shapes_and_computation.clear();
  1915. }
  1916. TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_IM2COL_INT8X8X32) {
  1917. constexpr size_t RUNS = 50;
  1918. param::ConvBias param;
  1919. param.pad_h = 1;
  1920. param.pad_w = 1;
  1921. param.stride_h = 1;
  1922. param.stride_w = 1;
  1923. std::vector<std::pair<SmallVector<TensorShape>, float>>
  1924. shapes_and_computation;
  1925. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W,
  1926. size_t FS, size_t group) {
  1927. SmallVector<TensorShape> shapes{{N, IC, H, W},
  1928. {OC / group, IC / group, FS, FS},
  1929. {1, OC, 1, 1},
  1930. {},
  1931. {N, OC, H, W}};
  1932. TensorShape dst{N, OC, H, W};
  1933. float computations =
  1934. ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  1935. dst.total_nr_elems()) *
  1936. 1e-6;
  1937. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  1938. };
  1939. bench_case(1, 32, 32, 200, 200, 3, 1);
  1940. bench_case(1, 32, 32, 200, 200, 3, 1);
  1941. bench_case(1, 32, 32, 128, 128, 3, 1);
  1942. bench_case(1, 32, 32, 128, 128, 3, 1);
  1943. bench_case(1, 32, 32, 100, 100, 3, 1);
  1944. bench_case(1, 32, 32, 100, 100, 3, 1);
  1945. bench_case(1, 32, 32, 80, 80, 3, 1);
  1946. bench_case(1, 32, 32, 80, 80, 3, 1);
  1947. bench_case(1, 64, 32, 7, 7, 3, 1);
  1948. bench_case(1, 64, 64, 7, 7, 3, 1);
  1949. bench_case(1, 64, 128, 7, 7, 3, 1);
  1950. bench_case(1, 64, 256, 7, 7, 3, 1);
  1951. bench_case(1, 64, 512, 7, 7, 3, 1);
  1952. bench_case(1, 64, 1024, 7, 7, 3, 1);
  1953. bench_case(1, 64, 32, 14, 14, 3, 1);
  1954. bench_case(1, 64, 64, 14, 14, 3, 1);
  1955. bench_case(1, 64, 128, 14, 14, 3, 1);
  1956. bench_case(1, 64, 256, 14, 14, 3, 1);
  1957. bench_case(1, 64, 512, 14, 14, 3, 1);
  1958. bench_case(1, 64, 1024, 14, 14, 3, 1);
  1959. bench_case(1, 128, 128, 14, 14, 3, 1);
  1960. bench_case(1, 128, 256, 14, 14, 3, 1);
  1961. bench_case(1, 512, 512, 14, 14, 3, 1);
  1962. bench_case(1, 256, 512, 14, 14, 3, 1);
  1963. bench_case(1, 512, 1024, 14, 14, 3, 1);
  1964. bench_case(1, 1024, 1024, 14, 14, 3, 1);
  1965. std::vector<DType> data_type = {dtype::Int8(), dtype::Int8(),
  1966. dtype::Int32(), dtype::Int32()};
  1967. std::string algo_name = "IM2COLMATMUL:X86_INT8X8X32_AVX2_4X16X2";
  1968. // std::string algo_name = "IM2COLMATMUL:X86_INT8X8X32_AVX2_2X4X16";
  1969. // printf("Benchmark IM2COLMATMUL:X86_INT8X8X32_AVX2_4X16X2 algo\n");
  1970. benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
  1971. {4, {4, 5, 6, 7}}, {1, {4}}, data_type);
  1972. benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
  1973. {4, {4, 5, 6, 7}}, {1, {7}}, data_type);
  1974. benchmark_impl(param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}},
  1975. {1, {4}}, data_type);
  1976. shapes_and_computation.clear();
  1977. }
  1978. namespace{
  1979. std::vector<conv_bias::TestArg> get_winograd_benchmark_args(size_t kernel,
  1980. size_t pack_size) {
  1981. std::vector<conv_bias::TestArg> args;
  1982. auto pack = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  1983. size_t p) {
  1984. if (ic % pack_size != 0 || oc % pack_size != 0)
  1985. return;
  1986. if (w + 2 * p < kernel || h + 2 * p < kernel)
  1987. return;
  1988. param::ConvBias param;
  1989. param.mode = param::ConvBias::Mode::CROSS_CORRELATION;
  1990. param.format = param::ConvBias::Format::NCHW88;
  1991. param.sparse = param::ConvBias::Sparse::DENSE;
  1992. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  1993. param.stride_h = 1;
  1994. param.stride_w = 1;
  1995. param.pad_h = p;
  1996. param.pad_w = p;
  1997. args.push_back(conv_bias::TestArg{param,
  1998. TensorShape{1, ic/8, h, w, 8},
  1999. TensorShape{oc/8, ic/8, kernel, kernel, 8, 8},
  2000. {1, oc/8, 1, 1, 8}});
  2001. };
  2002. for (size_t ic : {64, 128, 256}) {
  2003. for (size_t oc : {64,128,256}) {
  2004. pack(oc, ic, 56, 56, kernel, kernel / 2);
  2005. pack(oc, ic, 14, 14, kernel, kernel / 2);
  2006. pack(oc, ic, 28, 28, kernel, kernel / 2);
  2007. }
  2008. }
  2009. //! conv in vgg16
  2010. pack(512, 512, 15, 15, kernel, kernel / 2);
  2011. pack(512, 256, 15, 15, kernel, kernel / 2);
  2012. pack(256, 256, 29, 29, kernel, kernel / 2);
  2013. pack(256, 128, 29, 29, kernel, kernel / 2);
  2014. pack(128, 128, 57, 57, kernel, kernel / 2);
  2015. pack(128, 64, 57, 57, kernel, kernel / 2);
  2016. pack(64, 64, 56, 56, kernel, kernel / 2);
  2017. pack(128, 128, 28, 28, kernel, kernel / 2);
  2018. pack(512, 512, 14, 14, kernel, kernel / 2);
  2019. return args;
  2020. }
//! Benchmark the winograd algorithm \p algo_name against the handle's
//! default algorithm choice on the cases produced by
//! get_winograd_benchmark_args, printing time, GFLOPS and speedup per case.
void benchmark_winograd(const char* algo_name, Handle* handle,
                        size_t kernel, size_t pack_size) {
    auto&& args = get_winograd_benchmark_args(kernel, pack_size);
    using namespace conv_bias;
    constexpr size_t RUN = 10;
    //! baseline benchmarker: no algorithm restriction is applied here, so
    //! the handle selects the algorithm on its own
    Benchmarker<ConvBias> benchmark(handle);
    benchmark.set_display(false);
    benchmark.set_times(RUN);
    //! winograd benchmarker: restricted to \p algo_name via algo_benchmark
    Benchmarker<ConvBias> benchmark_winograd(handle);
    benchmark_winograd.set_display(false);
    benchmark_winograd.set_times(RUN);
    for (auto&& arg : args) {
        TensorLayout dst_layout;
        auto opr = handle->create_operator<ConvBias>();
        opr->param() = arg.param;
        //! deduce the output layout so the FLOP count below can use the
        //! actual dst element count
        opr->deduce_layout({arg.src, dtype::Float32()},
                           {arg.filter, dtype::Float32()},
                           {arg.bias, dtype::Float32()}, {}, dst_layout);
        //! dst.nr_elems * IC * FH * FW * 2
        //! filter is NCHW88 {oc/8, ic/8, fh, fw, 8, 8} (see
        //! get_winograd_benchmark_args), so filter[1..3] are ic/8, fh, fw;
        //! the * 8.0 restores the packed input channels. Units: GFLOP,
        //! which divided by a time in ms below yields GFLOPS.
        float computations = dst_layout.total_nr_elems() * arg.filter[1] *
                             arg.filter[2] * arg.filter[3] * 2.0 * 8.0 /
                             (1024 * 1024 * 1024) * 1e3;
        //! average time (ms) of the default algorithm
        auto used = benchmark.set_param(arg.param).exec(
                            {arg.src, arg.filter, {}, {}, {}}) /
                    RUN;
        benchmark_winograd.set_param(arg.param);
        //! average time (ms) of the requested winograd algorithm
        auto used_winograd =
                algo_benchmark<ConvBias>(benchmark_winograd,
                                         {arg.src, arg.filter, {}, {}, {}},
                                         algo_name) /
                RUN;
        printf("%s %s: normal: %f ms %f Gflops winograd: %f ms %f GFlops "
               "speedup: "
               "%f\n",
               arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
               used, computations / used, used_winograd,
               computations / used_winograd, used / used_winograd);
    }
}
  2060. }
//! benchmark winograd F(6,3) with the f32 MK8 8x8 matmul (3x3 kernels,
//! channel pack size 8) against the default algorithm
TEST_F(X86, BENCHMARK_CONVBIAS_WINOGRAD_F63_8x8) {
    benchmark_winograd("WINOGRAD:X86_F32MK8_8X8:8:6:8", handle(), 3, 8);
}
//! benchmark winograd F(2,3) with the f32 MK8 8x8 matmul (3x3 kernels,
//! channel pack size 8) against the default algorithm
TEST_F(X86, BENCHMARK_CONVBIAS_WINOGRAD_F23_8x8) {
    benchmark_winograd("WINOGRAD:X86_F32MK8_8X8:8:2:8", handle(), 3, 8);
}
  2067. #endif
  2068. } // namespace test
  2069. } // namespace megdnn
  2070. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台