You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

conv_bias.cpp 96 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378
  1. /**
  2. * \file dnn/test/x86/conv_bias.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. */
  11. #include "src/x86/utils.h"
  12. #include "test/x86/fixture.h"
  13. #include "megdnn/opr_param_defs.h"
  14. #include "megdnn/oprs.h"
  15. #include "test/common/benchmarker.h"
  16. #include "test/common/checker.h"
  17. #include "test/common/conv_bias.h"
  18. #include "test/common/rng.h"
  19. #include "test/common/tensor.h"
  20. #include "test/common/workspace_wrapper.h"
  21. namespace megdnn {
  22. namespace test {
  23. TEST_F(X86, CONV_BIAS_FORWARD) {
  24. using namespace conv_bias;
  25. std::vector<TestArg> args = get_args();
  26. Checker<ConvBiasForward> checker(handle());
  27. NormalRNG default_rng;
  28. ConstValue const_val;
  29. for (auto&& arg : args) {
  30. checker.set_dtype(0, dtype::Float32())
  31. .set_dtype(1, dtype::Float32())
  32. .set_dtype(2, dtype::Float32())
  33. .set_rng(0, &default_rng)
  34. .set_rng(1, &default_rng)
  35. .set_rng(2, &default_rng)
  36. .set_epsilon(1e-3)
  37. .set_param(arg.param)
  38. .execs({arg.src, arg.filter, arg.bias, {}, {}});
  39. }
  40. }
  41. TEST_F(X86_MULTI_THREADS, AVX2_CHANWISE_DIRECT_STRIDE1_INT8x8x32) {
  42. using namespace conv_bias;
  43. std::vector<TestArg> args;
  44. auto run = [&](size_t ic, size_t w, size_t h, size_t kernel, size_t p,
  45. NonlineMode nonline_mode) {
  46. if (w + 2 * p < kernel || h + 2 * p < kernel)
  47. return;
  48. param::ConvBias param;
  49. param.stride_h = 1;
  50. param.stride_w = 1;
  51. param.pad_h = p;
  52. param.pad_w = p;
  53. param.nonlineMode = nonline_mode;
  54. param.sparse = param::ConvBias::Sparse::GROUP;
  55. //! no bias
  56. args.emplace_back(param, TensorShape{2, ic, h, w},
  57. TensorShape{ic, 1, 1, kernel, kernel}, TensorShape{});
  58. //! bias channel
  59. args.emplace_back(param, TensorShape{2, ic, h, w},
  60. TensorShape{ic, 1, 1, kernel, kernel},
  61. TensorShape{1, ic, 1, 1});
  62. };
  63. for (size_t kernel : {2, 3, 5, 7})
  64. for (size_t pad : {0, 1})
  65. for (size_t ic : {1, 5, 17, 20})
  66. for (size_t h : {7, 16, 38, 40})
  67. for (size_t w : {16, 25, 40, 55})
  68. for (NonlineMode nonline_mode : {NonlineMode::IDENTITY})
  69. run(ic, w, h, kernel, pad, nonline_mode);
  70. Checker<ConvBias> checker(handle());
  71. UniformIntRNG rng{-50, 50};
  72. checker.set_dtype(0, dtype::Int8())
  73. .set_dtype(1, dtype::Int8())
  74. .set_dtype(2, dtype::Int32())
  75. .set_dtype(4, dtype::Int32())
  76. .set_rng(0, &rng)
  77. .set_rng(1, &rng)
  78. .set_rng(2, &rng)
  79. .set_epsilon(1e-3);
  80. checker.set_before_exec_callback(
  81. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  82. "X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1"));
  83. for (auto&& arg : args) {
  84. checker.set_param(arg.param).exec(
  85. {arg.src, arg.filter, arg.bias, {}, {}});
  86. }
  87. }
  88. TEST_F(X86_MULTI_THREADS, AVX2_CHANWISE_DIRECT_STRIDE1_QuantizedS32) {
  89. using namespace conv_bias;
  90. std::vector<TestArg> args;
  91. auto run = [&](size_t ic, size_t w, size_t h, size_t kernel, size_t p,
  92. NonlineMode nonline_mode) {
  93. if (w + 2 * p < kernel || h + 2 * p < kernel)
  94. return;
  95. param::ConvBias param;
  96. param.stride_h = 1;
  97. param.stride_w = 1;
  98. param.pad_h = p;
  99. param.pad_w = p;
  100. param.nonlineMode = nonline_mode;
  101. param.sparse = param::ConvBias::Sparse::GROUP;
  102. //! no bias
  103. args.emplace_back(param, TensorShape{2, ic, h, w},
  104. TensorShape{ic, 1, 1, kernel, kernel}, TensorShape{});
  105. //! bias channel
  106. args.emplace_back(param, TensorShape{2, ic, h, w},
  107. TensorShape{ic, 1, 1, kernel, kernel},
  108. TensorShape{1, ic, 1, 1});
  109. };
  110. for (size_t kernel : {2, 3, 5, 7})
  111. for (size_t pad : {0, 1})
  112. for (size_t ic : {1, 3, 5, 7, 17})
  113. for (size_t h : {10, 17, 25, 30})
  114. for (size_t w : {19, 28, 58, 168})
  115. for (NonlineMode nonline_mode : {NonlineMode::IDENTITY})
  116. run(ic, w, h, kernel, pad, nonline_mode);
  117. Checker<ConvBias> checker(handle());
  118. UniformIntRNG rng{-50, 50};
  119. checker.set_dtype(0, dtype::QuantizedS8(2.5f))
  120. .set_dtype(1, dtype::QuantizedS8(2.5f))
  121. .set_dtype(2, dtype::QuantizedS32(6.25f))
  122. .set_dtype(4, {})
  123. .set_rng(0, &rng)
  124. .set_rng(1, &rng)
  125. .set_rng(2, &rng)
  126. .set_epsilon(1e-3);
  127. checker.set_before_exec_callback(
  128. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  129. "X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1"));
  130. for (auto&& arg : args) {
  131. checker.set_param(arg.param).exec(
  132. {arg.src, arg.filter, arg.bias, {}, {}});
  133. }
  134. }
  135. TEST_F(X86_MULTI_THREADS, AVX2_CHANWISE_DIRECT_STRIDE1_QuantizedS8x8x8) {
  136. using namespace conv_bias;
  137. std::vector<TestArg> args;
  138. auto run = [&](size_t ic, size_t w, size_t h, size_t kernel, size_t p,
  139. NonlineMode nonline_mode) {
  140. if (w + 2 * p < kernel || h + 2 * p < kernel)
  141. return;
  142. param::ConvBias param;
  143. param.stride_h = 1;
  144. param.stride_w = 1;
  145. param.pad_h = p;
  146. param.pad_w = p;
  147. param.nonlineMode = nonline_mode;
  148. param.sparse = param::ConvBias::Sparse::GROUP;
  149. //! no bias
  150. args.emplace_back(param, TensorShape{2, ic, h, w},
  151. TensorShape{ic, 1, 1, kernel, kernel}, TensorShape{});
  152. //! bias channel
  153. args.emplace_back(param, TensorShape{2, ic, h, w},
  154. TensorShape{ic, 1, 1, kernel, kernel},
  155. TensorShape{1, ic, 1, 1});
  156. };
  157. for (size_t kernel : {2, 3, 5, 7})
  158. for (size_t pad : {0, 1})
  159. for (size_t ic : {1, 3, 5, 7, 17})
  160. for (size_t h : {10, 15, 17, 30})
  161. for (size_t w : {19, 28, 58, 168})
  162. for (NonlineMode nonline_mode :
  163. {NonlineMode::IDENTITY, NonlineMode::H_SWISH,
  164. NonlineMode::RELU})
  165. run(ic, w, h, kernel, pad, nonline_mode);
  166. Checker<ConvBias> checker(handle());
  167. UniformIntRNG rng{-50, 50};
  168. checker.set_dtype(0, dtype::QuantizedS8(2.5f))
  169. .set_dtype(1, dtype::QuantizedS8(2.5f))
  170. .set_dtype(2, dtype::QuantizedS32(6.25f))
  171. .set_dtype(4, dtype::QuantizedS8(60.25f))
  172. .set_rng(0, &rng)
  173. .set_rng(1, &rng)
  174. .set_rng(2, &rng)
  175. .set_epsilon(1e-3);
  176. checker.set_before_exec_callback(
  177. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  178. "X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1"));
  179. for (auto&& arg : args) {
  180. checker.set_param(arg.param).exec(
  181. {arg.src, arg.filter, arg.bias, {}, {}});
  182. }
  183. }
  184. TEST_F(X86_MULTI_THREADS, AVX2_CONV_BIAS_DIRECT_STRIDE1_INT8x8x32) {
  185. using namespace conv_bias;
  186. std::vector<TestArg> args;
  187. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  188. size_t p, NonlineMode nonline_mode) {
  189. if (w + 2 * p < kernel || h + 2 * p < kernel)
  190. return;
  191. param::ConvBias param;
  192. param.stride_h = 1;
  193. param.stride_w = 1;
  194. param.pad_h = p;
  195. param.pad_w = p;
  196. param.nonlineMode = nonline_mode;
  197. param.sparse = param::ConvBias::Sparse::DENSE;
  198. //! no bias
  199. args.emplace_back(param, TensorShape{2, ic, h, w},
  200. TensorShape{oc, ic, kernel, kernel}, TensorShape{});
  201. param.sparse = param::ConvBias::Sparse::GROUP;
  202. //! no bias
  203. args.emplace_back(param, TensorShape{2, 2 * ic, h, w},
  204. TensorShape{2, oc / 2, ic, kernel, kernel},
  205. TensorShape{});
  206. };
  207. for (size_t kernel : {2, 3, 5, 7})
  208. for (size_t pad : {0, 1})
  209. for (size_t oc : {4, 8, 13, 16, 24})
  210. for (size_t ic : {2, 3, 7, 10})
  211. for (size_t h : {10, 11})
  212. for (size_t w : {8, 10})
  213. for (NonlineMode nonline_mode :
  214. {NonlineMode::IDENTITY})
  215. run(oc, ic, w, h, kernel, pad, nonline_mode);
  216. Checker<ConvBias> checker(handle());
  217. UniformIntRNG rng{-50, 50};
  218. checker.set_dtype(0, dtype::Int8())
  219. .set_dtype(1, dtype::Int8())
  220. .set_dtype(2, dtype::Int32())
  221. .set_dtype(4, dtype::Int32())
  222. .set_rng(0, &rng)
  223. .set_rng(1, &rng)
  224. .set_rng(2, &rng)
  225. .set_epsilon(1e-3);
  226. checker.set_before_exec_callback(
  227. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  228. "X86_CONV_BIAS_DIRECT_AVX2_INT8_STRIDE1"));
  229. for (auto&& arg : args) {
  230. checker.set_param(arg.param).exec(
  231. {arg.src, arg.filter, arg.bias, {}, {}});
  232. }
  233. }
  234. TEST_F(X86_MULTI_THREADS, AVX2_CONV_BIAS_DIRECT_STRIDE1_QuantizedS32) {
  235. using namespace conv_bias;
  236. std::vector<TestArg> args;
  237. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  238. size_t p, NonlineMode nonline_mode) {
  239. if (w + 2 * p < kernel || h + 2 * p < kernel)
  240. return;
  241. param::ConvBias param;
  242. param.stride_h = 1;
  243. param.stride_w = 1;
  244. param.pad_h = p;
  245. param.pad_w = p;
  246. param.nonlineMode = nonline_mode;
  247. param.sparse = param::ConvBias::Sparse::DENSE;
  248. //! no bias
  249. args.emplace_back(param, TensorShape{2, ic, h, w},
  250. TensorShape{oc, ic, kernel, kernel}, TensorShape{});
  251. param.sparse = param::ConvBias::Sparse::GROUP;
  252. //! no bias
  253. args.emplace_back(param, TensorShape{2, 2 * ic, h, w},
  254. TensorShape{2, oc / 2, ic, kernel, kernel},
  255. TensorShape{});
  256. };
  257. for (size_t kernel : {2, 3, 5, 7})
  258. for (size_t pad : {0, 1})
  259. for (size_t oc : {4, 8, 13, 16, 24})
  260. for (size_t ic : {2, 3, 7, 10})
  261. for (size_t h : {10, 11})
  262. for (size_t w : {8, 10})
  263. for (NonlineMode nonline_mode :
  264. {NonlineMode::IDENTITY})
  265. run(oc, ic, w, h, kernel, pad, nonline_mode);
  266. Checker<ConvBias> checker(handle());
  267. UniformIntRNG rng{-50, 50};
  268. checker.set_dtype(0, dtype::QuantizedS8(2.5f))
  269. .set_dtype(1, dtype::QuantizedS8(2.5f))
  270. .set_dtype(2, dtype::QuantizedS32(6.25f))
  271. .set_dtype(4, {})
  272. .set_rng(0, &rng)
  273. .set_rng(1, &rng)
  274. .set_rng(2, &rng)
  275. .set_epsilon(1e-3);
  276. checker.set_before_exec_callback(
  277. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  278. "X86_CONV_BIAS_DIRECT_AVX2_INT8_STRIDE1"));
  279. for (auto&& arg : args) {
  280. checker.set_param(arg.param).exec(
  281. {arg.src, arg.filter, arg.bias, {}, {}});
  282. }
  283. }
  284. TEST_F(X86_MULTI_THREADS, AVX2_CONV_BIAS_DIRECT_STRIDE1_S8S8S8) {
  285. using namespace conv_bias;
  286. std::vector<TestArg> args;
  287. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  288. size_t p, NonlineMode nonline_mode) {
  289. if (w + 2 * p < kernel || h + 2 * p < kernel)
  290. return;
  291. param::ConvBias param;
  292. param.stride_h = 1;
  293. param.stride_w = 1;
  294. param.pad_h = p;
  295. param.pad_w = p;
  296. param.nonlineMode = nonline_mode;
  297. param.sparse = param::ConvBias::Sparse::DENSE;
  298. //! no bias
  299. args.emplace_back(param, TensorShape{1, ic, h, w},
  300. TensorShape{oc, ic, kernel, kernel}, TensorShape{});
  301. //! bias channel
  302. args.emplace_back(param, TensorShape{1, ic, h, w},
  303. TensorShape{oc, ic, kernel, kernel},
  304. TensorShape{1, oc, 1, 1});
  305. param.sparse = param::ConvBias::Sparse::GROUP;
  306. //! no bias
  307. args.emplace_back(param, TensorShape{2, 2 * ic, h, w},
  308. TensorShape{2, oc / 2, ic, kernel, kernel},
  309. TensorShape{});
  310. //! bias channel
  311. args.emplace_back(param, TensorShape{2, 2 * ic, h, w},
  312. TensorShape{2, oc / 2, ic, kernel, kernel},
  313. TensorShape{1, oc, 1, 1});
  314. };
  315. for (size_t kernel : {2, 3, 5, 7})
  316. for (size_t pad : {0, 1})
  317. for (size_t oc : {4, 8, 14, 16, 24})
  318. for (size_t ic : {2, 3, 7, 10})
  319. for (size_t h : {10, 11})
  320. for (size_t w : {8, 10})
  321. for (NonlineMode nonline_mode :
  322. {NonlineMode::IDENTITY, NonlineMode::RELU,
  323. NonlineMode::H_SWISH})
  324. run(oc, ic, w, h, kernel, pad, nonline_mode);
  325. Checker<ConvBias> checker(handle());
  326. UniformIntRNG rng{-50, 50};
  327. checker.set_dtype(0, dtype::QuantizedS8(2.5f))
  328. .set_dtype(1, dtype::QuantizedS8(2.5f))
  329. .set_dtype(2, dtype::QuantizedS32(6.25f))
  330. .set_dtype(4, dtype::QuantizedS8(60.25f))
  331. .set_rng(0, &rng)
  332. .set_rng(1, &rng)
  333. .set_rng(2, &rng)
  334. .set_epsilon(1e-3);
  335. checker.set_before_exec_callback(
  336. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  337. "X86_CONV_BIAS_DIRECT_AVX2_INT8_STRIDE1"));
  338. for (auto&& arg : args) {
  339. checker.set_param(arg.param).exec(
  340. {arg.src, arg.filter, arg.bias, {}, {}});
  341. }
  342. }
  343. TEST_F(X86_MULTI_THREADS, AVX2_CONV_BIAS_DIRECT_STRIDE2_INT8x8x32) {
  344. using namespace conv_bias;
  345. std::vector<TestArg> args;
  346. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  347. size_t p, NonlineMode nonline_mode) {
  348. if (w + 2 * p < kernel || h + 2 * p < kernel)
  349. return;
  350. param::ConvBias param;
  351. param.stride_h = 2;
  352. param.stride_w = 2;
  353. param.pad_h = p;
  354. param.pad_w = p;
  355. param.nonlineMode = nonline_mode;
  356. param.sparse = param::ConvBias::Sparse::DENSE;
  357. //! no bias
  358. args.emplace_back(param, TensorShape{2, ic, h, w},
  359. TensorShape{oc, ic, kernel, kernel}, TensorShape{});
  360. param.sparse = param::ConvBias::Sparse::GROUP;
  361. //! no bias
  362. args.emplace_back(param, TensorShape{2, 2 * ic, h, w},
  363. TensorShape{2, oc / 2, ic, kernel, kernel},
  364. TensorShape{});
  365. };
  366. for (size_t kernel : {2, 3, 5, 7})
  367. for (size_t pad : {0, 1, 2, 5})
  368. for (size_t oc : {4, 8, 13, 16, 24})
  369. for (size_t ic : {2, 3, 7, 10})
  370. for (size_t h : {10, 11})
  371. for (size_t w : {8, 10, 20})
  372. for (NonlineMode nonline_mode :
  373. {NonlineMode::IDENTITY})
  374. run(oc, ic, w, h, kernel, pad, nonline_mode);
  375. Checker<ConvBias> checker(handle());
  376. UniformIntRNG rng{-50, 50};
  377. checker.set_dtype(0, dtype::Int8())
  378. .set_dtype(1, dtype::Int8())
  379. .set_dtype(2, dtype::Int32())
  380. .set_dtype(4, dtype::Int32())
  381. .set_rng(0, &rng)
  382. .set_rng(1, &rng)
  383. .set_rng(2, &rng)
  384. .set_epsilon(1e-3);
  385. checker.set_before_exec_callback(
  386. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  387. "X86_CONV_BIAS_DIRECT_AVX2_INT8_STRIDE2"));
  388. for (auto&& arg : args) {
  389. checker.set_param(arg.param).exec(
  390. {arg.src, arg.filter, arg.bias, {}, {}});
  391. }
  392. }
  393. TEST_F(X86_MULTI_THREADS, AVX2_CONV_BIAS_DIRECT_STRIDE2_QuantizedS32) {
  394. using namespace conv_bias;
  395. std::vector<TestArg> args;
  396. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  397. size_t p, NonlineMode nonline_mode) {
  398. if (w + 2 * p < kernel || h + 2 * p < kernel)
  399. return;
  400. param::ConvBias param;
  401. param.stride_h = 2;
  402. param.stride_w = 2;
  403. param.pad_h = p;
  404. param.pad_w = p;
  405. param.nonlineMode = nonline_mode;
  406. param.sparse = param::ConvBias::Sparse::DENSE;
  407. //! no bias
  408. args.emplace_back(param, TensorShape{2, ic, h, w},
  409. TensorShape{oc, ic, kernel, kernel}, TensorShape{});
  410. param.sparse = param::ConvBias::Sparse::GROUP;
  411. //! no bias
  412. args.emplace_back(param, TensorShape{2, 2 * ic, h, w},
  413. TensorShape{2, oc / 2, ic, kernel, kernel},
  414. TensorShape{});
  415. };
  416. for (size_t kernel : {2, 3, 5, 7})
  417. for (size_t pad : {0, 1, 3, 5})
  418. for (size_t oc : {4, 8, 13, 16, 24})
  419. for (size_t ic : {2, 3, 7, 10})
  420. for (size_t h : {10, 11})
  421. for (size_t w : {8, 10, 19})
  422. for (NonlineMode nonline_mode :
  423. {NonlineMode::IDENTITY})
  424. run(oc, ic, w, h, kernel, pad, nonline_mode);
  425. Checker<ConvBias> checker(handle());
  426. UniformIntRNG rng{-50, 50};
  427. checker.set_dtype(0, dtype::QuantizedS8(2.5f))
  428. .set_dtype(1, dtype::QuantizedS8(2.5f))
  429. .set_dtype(2, dtype::QuantizedS32(6.25f))
  430. .set_dtype(4, {})
  431. .set_rng(0, &rng)
  432. .set_rng(1, &rng)
  433. .set_rng(2, &rng)
  434. .set_epsilon(1e-3);
  435. checker.set_before_exec_callback(
  436. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  437. "X86_CONV_BIAS_DIRECT_AVX2_INT8_STRIDE2"));
  438. for (auto&& arg : args) {
  439. checker.set_param(arg.param).exec(
  440. {arg.src, arg.filter, arg.bias, {}, {}});
  441. }
  442. }
  443. TEST_F(X86_MULTI_THREADS, AVX2_CONV_BIAS_DIRECT_STRIDE2_S8S8S8) {
  444. using namespace conv_bias;
  445. std::vector<TestArg> args;
  446. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  447. size_t p, NonlineMode nonline_mode) {
  448. if (w + 2 * p < kernel || h + 2 * p < kernel)
  449. return;
  450. param::ConvBias param;
  451. param.stride_h = 2;
  452. param.stride_w = 2;
  453. param.pad_h = p;
  454. param.pad_w = p;
  455. param.nonlineMode = nonline_mode;
  456. param.sparse = param::ConvBias::Sparse::DENSE;
  457. //! no bias
  458. args.emplace_back(param, TensorShape{1, ic, h, w},
  459. TensorShape{oc, ic, kernel, kernel}, TensorShape{});
  460. //! bias channel
  461. args.emplace_back(param, TensorShape{1, ic, h, w},
  462. TensorShape{oc, ic, kernel, kernel},
  463. TensorShape{1, oc, 1, 1});
  464. param.sparse = param::ConvBias::Sparse::GROUP;
  465. //! no bias
  466. args.emplace_back(param, TensorShape{2, 2 * ic, h, w},
  467. TensorShape{2, oc / 2, ic, kernel, kernel},
  468. TensorShape{});
  469. //! bias channel
  470. args.emplace_back(param, TensorShape{2, 2 * ic, h, w},
  471. TensorShape{2, oc / 2, ic, kernel, kernel},
  472. TensorShape{1, oc, 1, 1});
  473. };
  474. for (size_t kernel : {2, 3, 5, 7})
  475. for (size_t pad : {0, 1, 3, 5})
  476. for (size_t oc : {4, 8, 14, 16, 24})
  477. for (size_t ic : {2, 3, 7, 10})
  478. for (size_t h : {10, 11})
  479. for (size_t w : {8, 10, 18})
  480. for (NonlineMode nonline_mode :
  481. {NonlineMode::IDENTITY, NonlineMode::RELU,
  482. NonlineMode::H_SWISH})
  483. run(oc, ic, w, h, kernel, pad, nonline_mode);
  484. Checker<ConvBias> checker(handle());
  485. UniformIntRNG rng{-50, 50};
  486. checker.set_dtype(0, dtype::QuantizedS8(2.5f))
  487. .set_dtype(1, dtype::QuantizedS8(2.5f))
  488. .set_dtype(2, dtype::QuantizedS32(6.25f))
  489. .set_dtype(4, dtype::QuantizedS8(60.25f))
  490. .set_rng(0, &rng)
  491. .set_rng(1, &rng)
  492. .set_rng(2, &rng)
  493. .set_epsilon(1e-3);
  494. checker.set_before_exec_callback(
  495. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  496. "X86_CONV_BIAS_DIRECT_AVX2_INT8_STRIDE2"));
  497. for (auto&& arg : args) {
  498. checker.set_param(arg.param).exec(
  499. {arg.src, arg.filter, arg.bias, {}, {}});
  500. }
  501. }
  502. TEST_F(X86_MULTI_THREADS, CONV_BIAS_DIRECT_STRIDE1_SMALL_GROUP) {
  503. using namespace conv_bias;
  504. std::vector<TestArg> args;
  505. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  506. size_t p, NonlineMode nonline_mode) {
  507. if (w + 2 * p < kernel || h + 2 * p < kernel)
  508. return;
  509. param::ConvBias param;
  510. param.stride_h = 1;
  511. param.stride_w = 1;
  512. param.pad_h = p;
  513. param.pad_w = p;
  514. param.nonlineMode = nonline_mode;
  515. //! no bias
  516. args.emplace_back(param, TensorShape{1, ic, h, w},
  517. TensorShape{oc, ic, kernel, kernel}, TensorShape{});
  518. //! bias channel
  519. args.emplace_back(param, TensorShape{2, ic, h, w},
  520. TensorShape{oc, ic, kernel, kernel},
  521. TensorShape{1, oc, 1, 1});
  522. //! bias
  523. args.emplace_back(param, TensorShape{2, ic, h, w},
  524. TensorShape{oc, ic, kernel, kernel},
  525. TensorShape{2, oc, (h + param.pad_h * 2 - kernel) + 1,
  526. (w + param.pad_w * 2 - kernel) + 1});
  527. };
  528. for (size_t kernel : {1, 2, 3, 4, 5, 6, 7})
  529. for (size_t ic : {1, 4, 8, 16})
  530. for (size_t oc : {1, 4, 8})
  531. for (size_t p : {0, 2})
  532. for (size_t size : {20, 21, 24})
  533. for (NonlineMode nonline_mode :
  534. {NonlineMode::RELU, NonlineMode::SIGMOID,
  535. NonlineMode::H_SWISH, NonlineMode::IDENTITY}) {
  536. run(oc, ic, size, size, kernel, p, nonline_mode);
  537. }
  538. Checker<ConvBias> checker(handle());
  539. UniformIntRNG rng{-50, 50};
  540. checker.set_dtype(0, dtype::Float32())
  541. .set_dtype(1, dtype::Float32())
  542. .set_dtype(2, dtype::Float32())
  543. .set_rng(0, &rng)
  544. .set_rng(1, &rng)
  545. .set_rng(2, &rng);
  546. checker.set_before_exec_callback(
  547. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  548. "X86_CONV_BIAS_DIRECT_STRIDE1_SMALL_GROUP"));
  549. for (auto&& arg : args) {
  550. checker.set_param(arg.param).exec(
  551. {arg.src, arg.filter, arg.bias, {}, {}});
  552. }
  553. }
  554. TEST_F(X86_MULTI_THREADS, CONV_BIAS_DIRECT_STRIDE1_LARGE_GROUP) {
  555. using namespace conv_bias;
  556. std::vector<TestArg> args;
  557. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  558. size_t p, NonlineMode nonline_mode) {
  559. if (w + 2 * p < kernel || h + 2 * p < kernel)
  560. return;
  561. param::ConvBias param;
  562. param.stride_h = 1;
  563. param.stride_w = 1;
  564. param.pad_h = p;
  565. param.pad_w = p;
  566. param.nonlineMode = nonline_mode;
  567. //! no bias
  568. args.emplace_back(param, TensorShape{1, ic, h, w},
  569. TensorShape{oc, ic, kernel, kernel}, TensorShape{});
  570. //! bias channel
  571. args.emplace_back(param, TensorShape{2, ic, h, w},
  572. TensorShape{oc, ic, kernel, kernel},
  573. TensorShape{1, oc, 1, 1});
  574. //! bias
  575. args.emplace_back(param, TensorShape{2, ic, h, w},
  576. TensorShape{oc, ic, kernel, kernel},
  577. TensorShape{2, oc, (h + param.pad_h * 2 - kernel) + 1,
  578. (w + param.pad_w * 2 - kernel) + 1});
  579. };
  580. for (size_t kernel : {1, 2, 3, 4, 5, 6, 7})
  581. for (size_t ic : {1, 4, 8, 16})
  582. for (size_t oc : {1, 4, 8})
  583. for (size_t p : {0, 2})
  584. for (size_t size : {20, 21, 24})
  585. for (NonlineMode nonline_mode :
  586. {NonlineMode::RELU, NonlineMode::SIGMOID,
  587. NonlineMode::H_SWISH, NonlineMode::IDENTITY}) {
  588. run(oc, ic, size, size, kernel, p, nonline_mode);
  589. }
  590. Checker<ConvBias> checker(handle());
  591. UniformIntRNG rng{-50, 50};
  592. checker.set_dtype(0, dtype::Float32())
  593. .set_dtype(1, dtype::Float32())
  594. .set_dtype(2, dtype::Float32())
  595. .set_rng(0, &rng)
  596. .set_rng(1, &rng)
  597. .set_rng(2, &rng);
  598. checker.set_before_exec_callback(
  599. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  600. "X86_CONV_BIAS_DIRECT_STRIDE1_LARGE_GROUP"));
  601. for (auto&& arg : args) {
  602. checker.set_param(arg.param).exec(
  603. {arg.src, arg.filter, arg.bias, {}, {}});
  604. }
  605. }
  606. TEST_F(X86_MULTI_THREADS, CONV_BIAS_DIRECT_STRIDE2) {
  607. using namespace conv_bias;
  608. std::vector<TestArg> args;
  609. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  610. size_t p, NonlineMode nonline_mode) {
  611. if (w + 2 * p < kernel || h + 2 * p < kernel)
  612. return;
  613. param::ConvBias param;
  614. param.stride_h = 2;
  615. param.stride_w = 2;
  616. param.pad_h = p;
  617. param.pad_w = p;
  618. param.nonlineMode = nonline_mode;
  619. //! no bias
  620. args.emplace_back(param, TensorShape{1, ic, h, w},
  621. TensorShape{oc, ic, kernel, kernel}, TensorShape{});
  622. };
  623. for (size_t kernel : {2, 3, 5, 7})
  624. for (size_t ic : {1, 4, 8, 16})
  625. for (size_t oc : {1, 4, 8})
  626. for (size_t p : {0, 2})
  627. for (size_t size : {20, 21, 24})
  628. for (NonlineMode nonline_mode :
  629. {NonlineMode::RELU, NonlineMode::SIGMOID,
  630. NonlineMode::H_SWISH, NonlineMode::IDENTITY}) {
  631. run(oc, ic, size, size, kernel, p, nonline_mode);
  632. }
  633. Checker<ConvBias> checker(handle());
  634. UniformIntRNG rng{-50, 50};
  635. checker.set_dtype(0, dtype::Float32())
  636. .set_dtype(1, dtype::Float32())
  637. .set_dtype(2, dtype::Float32())
  638. .set_rng(0, &rng)
  639. .set_rng(1, &rng)
  640. .set_rng(2, &rng);
  641. checker.set_before_exec_callback(
  642. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  643. "X86_CONV_BIAS_DIRECT_STRIDE2_SMALL_GROUP"));
  644. for (auto&& arg : args) {
  645. checker.set_param(arg.param).exec(
  646. {arg.src, arg.filter, arg.bias, {}, {}});
  647. }
  648. checker.set_before_exec_callback(
  649. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  650. "X86_CONV_BIAS_DIRECT_STRIDE2_LARGE_GROUP"));
  651. for (auto&& arg : args) {
  652. checker.set_param(arg.param).exec(
  653. {arg.src, arg.filter, arg.bias, {}, {}});
  654. }
  655. }
TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32) {
    using namespace conv_bias;
    std::vector<TestArg> args;
    //! append one bias-free stride-1 dense conv case; geometries where the
    //! padded input is smaller than the kernel are skipped
    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
                   size_t p, NonlineMode nonline_mode) {
        if (w + 2 * p < kernel || h + 2 * p < kernel)
            return;
        param::ConvBias param;
        param.stride_h = 1;
        param.stride_w = 1;
        param.pad_h = p;
        param.pad_w = p;
        param.nonlineMode = nonline_mode;
        //! no bias
        args.emplace_back(param, TensorShape{1, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel}, TensorShape{});
    };
    for (size_t kernel : {2, 3, 4, 5, 6, 7})
        for (size_t ic : {1, 4, 8, 16})
            for (size_t oc : {1, 4, 8})
                for (size_t p : {0, 2})
                    for (size_t size : {20, 21, 24})
                        for (NonlineMode nonline_mode :
                             {NonlineMode::IDENTITY}) {
                            run(oc, ic, size, size, kernel, p, nonline_mode);
                        }
    //! test OC block
    run(2046, 1, 8, 8, 2, 0, NonlineMode::IDENTITY);
    Checker<ConvBias> checker(handle());
    UniformIntRNG rng{-50, 50};
//! run all cases twice per backend: first as plain Int8 x Int8 -> Int32,
//! then as QuantizedS8 x QuantizedS8 -> QuantizedS32
#define cb(algo_name) \
    checker.set_before_exec_callback( \
            conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name)); \
    checker.set_dtype(0, dtype::Int8()); \
    checker.set_dtype(1, dtype::Int8()); \
    checker.set_dtype(2, dtype::Int32()); \
    checker.set_dtype(4, dtype::Int32()); \
    for (auto&& arg : args) { \
        checker.set_param(arg.param).execs({arg.src, arg.filter, {}, {}, {}}); \
    } \
    for (auto&& arg : args) { \
        checker.set_dtype(0, dtype::QuantizedS8(2.5f)) \
                .set_dtype(1, dtype::QuantizedS8(2.5f)) \
                .set_dtype(2, dtype::QuantizedS32(6.25f)) \
                .set_dtype(4, {}) \
                .set_rng(0, &rng) \
                .set_rng(1, &rng) \
                .set_rng(2, &rng) \
                .set_param(arg.param) \
                .execs({arg.src, arg.filter, {}, {}, {}}); \
    }
#if MEGDNN_X86_WITH_MKL_DNN
    //! NOTE(review): the MKL-DNN branch gates on VNNI support, same as the
    //! VNNI branch below — confirm this is intentional
    if (megdnn::x86::is_supported(x86::SIMDType::VNNI)) {
        cb("IM2COLMATMUL:X86_INT8X8X32_MKLDNN");
    }
#endif
#if MEGDNN_X86_WITH_VNNI
    if (megdnn::x86::is_supported(x86::SIMDType::VNNI)) {
        cb("IM2COLMATMUL:X86_INT8X8X32_VNNI");
    }
#endif
    if (megdnn::x86::is_supported(x86::SIMDType::AVX2)) {
        cb("IM2COLMATMUL:X86_INT8X8X32_AVX2_2X4X16");
        cb("IM2COLMATMUL:X86_INT8X8X32_AVX2_4X16X2");
    }
    if (::megdnn::x86::is_supported(::megdnn::x86::SIMDType::SSE4_2)) {
        cb("IM2COLMATMUL:X86_INT8X8X32_SSE_4X8X2");
    }
#undef cb
}
TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP32) {
    using namespace conv_bias;
    std::vector<TestArg> args;
    //! append three stride-1 dense conv cases: no bias, per-channel bias, and
    //! full bias shaped exactly like the deduced output; geometries where the
    //! padded input is smaller than the kernel are skipped
    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
                   size_t p, NonlineMode nonline_mode) {
        if (w + 2 * p < kernel || h + 2 * p < kernel)
            return;
        param::ConvBias param;
        param.stride_h = 1;
        param.stride_w = 1;
        param.pad_h = p;
        param.pad_w = p;
        param.nonlineMode = nonline_mode;
        //! no bias
        args.emplace_back(param, TensorShape{1, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel}, TensorShape{});
        //! per-channel (broadcast) bias
        args.emplace_back(param, TensorShape{1, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel},
                          TensorShape{1, oc, 1, 1});
        //! full bias, same shape as the conv output
        args.emplace_back(
                param, TensorShape{1, ic, h, w},
                TensorShape{oc, ic, kernel, kernel},
                TensorShape{1, oc, (h + 2 * p - kernel) / param.stride_h + 1,
                            (w + 2 * p - kernel) / param.stride_w + 1});
    };
    for (size_t kernel : {2, 3, 4, 5, 6, 7})
        for (size_t ic : {1, 4, 8, 16})
            for (size_t oc : {1, 4, 8, 16, 300})
                for (size_t p : {0, 2})
                    for (size_t size : {8, 24})
                        for (NonlineMode nonline_mode :
                             {NonlineMode::IDENTITY, NonlineMode::RELU}) {
                            run(oc, ic, size, size, kernel, p, nonline_mode);
                        }
    //! large OC case (presumably exercises OC blocking — see the INT8 variant)
    run(2046, 8, 20, 20, 3, 1, NonlineMode::IDENTITY);
    Checker<ConvBias> checker(handle());
//! run every collected case with the given algorithm
#define cb(algo_name) \
    checker.set_before_exec_callback( \
            conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name)); \
    for (auto&& arg : args) { \
        checker.set_param(arg.param).execs( \
                {arg.src, arg.filter, arg.bias, {}, {}}); \
    }
#if MEGDNN_X86_WITH_MKL || MEGDNN_X86_WITH_OPENBLAS
    cb("IM2COLMATMUL:X86_F32_BLAS");
#endif
#undef cb
}
  774. #if MEGDNN_X86_WITH_MKL
TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP32_PACKA) {
    using namespace conv_bias;
    std::vector<TestArg> args;
    //! append dense cases (no bias / channel bias / full bias), then the same
    //! three bias variants again as a 2-group grouped convolution; geometries
    //! where the padded input is smaller than the kernel are skipped
    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
                   size_t p, NonlineMode nonline_mode) {
        if (w + 2 * p < kernel || h + 2 * p < kernel)
            return;
        param::ConvBias param;
        param.stride_h = 1;
        param.stride_w = 1;
        param.pad_h = p;
        param.pad_w = p;
        param.nonlineMode = nonline_mode;
        //! no bias
        args.emplace_back(param, TensorShape{1, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel}, TensorShape{});
        //! per-channel bias
        args.emplace_back(param, TensorShape{1, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel},
                          TensorShape{1, oc, 1, 1});
        //! full bias, same shape as the conv output
        args.emplace_back(
                param, TensorShape{1, ic, h, w},
                TensorShape{oc, ic, kernel, kernel},
                TensorShape{1, oc, (h + 2 * p - kernel) / param.stride_h + 1,
                            (w + 2 * p - kernel) / param.stride_w + 1});
        //! grouped variants: 2 groups, filter gains a leading group dim
        param.sparse = param::ConvBias::Sparse::GROUP;
        args.emplace_back(param, TensorShape{1, 2 * ic, h, w},
                          TensorShape{2, oc, ic, kernel, kernel},
                          TensorShape{});
        args.emplace_back(param, TensorShape{1, 2 * ic, h, w},
                          TensorShape{2, oc, ic, kernel, kernel},
                          TensorShape{1, oc * 2, 1, 1});
        args.emplace_back(
                param, TensorShape{1, 2 * ic, h, w},
                TensorShape{2, oc, ic, kernel, kernel},
                TensorShape{1, 2 * oc, (h + 2 * param.pad_h - kernel) / 1 + 1,
                            (w + 2 * param.pad_w - kernel) / 1 + 1});
    };
    for (size_t kernel : {2, 3, 4, 5, 6, 7})
        for (size_t ic : {1, 4, 8, 16})
            for (size_t oc : {1, 4, 8, 16})
                for (size_t p : {0, 1})
                    for (size_t size : {8, 24})
                        for (NonlineMode nonline_mode :
                             {NonlineMode::IDENTITY, NonlineMode::RELU}) {
                            run(oc, ic, size, size, kernel, p, nonline_mode);
                        }
    //! large OC case (presumably exercises OC blocking — see the INT8 variant)
    run(2046, 8, 20, 20, 3, 1, NonlineMode::IDENTITY);
    Checker<ConvBias> checker(handle());
//! run every collected case with the given algorithm
#define cb(algo_name) \
    checker.set_before_exec_callback( \
            conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name)); \
    for (auto&& arg : args) { \
        checker.set_param(arg.param).execs( \
                {arg.src, arg.filter, arg.bias, {}, {}}); \
    }
    cb("IM2COLMATMUL:X86_F32_MKL_PACKA:192");
#undef cb
}
  833. /**************************** Conv1x1 PackA *************************/
  834. namespace {
  835. void checker_conv_bias(std::vector<conv_bias::TestArg> args, Handle* handle,
  836. RNG* rng, float epsilon, DType type0, DType type1,
  837. DType type2, DType type3, const char* algo_name) {
  838. using namespace conv_bias;
  839. Checker<ConvBias> checker(handle);
  840. checker.set_before_exec_callback(
  841. conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name));
  842. checker.set_dtype(0, type0);
  843. checker.set_dtype(1, type1);
  844. checker.set_dtype(2, type2);
  845. checker.set_dtype(4, type3);
  846. checker.set_epsilon(epsilon);
  847. if (NULL != rng) {
  848. checker.set_rng(0, rng).set_rng(1, rng).set_rng(2, rng).set_rng(3, rng);
  849. }
  850. for (auto&& arg : args) {
  851. checker.set_param(arg.param).execs(
  852. {arg.src, arg.filter, arg.bias, {}, {}});
  853. }
  854. }
  855. } // namespace
  856. #if MEGDNN_X86_WITH_MKL
  857. TEST_F(X86_MULTI_THREADS, CONV_BIAS_CONV1X1_S1_FP32_PACKA) {
  858. using namespace conv_bias;
  859. std::vector<conv_bias::TestArg> args = get_conv_bias_1x1_args(false, false);
  860. check_conv_bias(args, handle(), "CONV1x1:X86_F32_MKL_PACKA:24");
  861. }
  862. TEST_F(X86_MULTI_THREADS, CONV_BIAS_CONV1X1_S1_FP32_BLAS) {
  863. using namespace conv_bias;
  864. std::vector<conv_bias::TestArg> args = get_conv_bias_1x1_args(false, false);
  865. check_conv_bias(args, handle(), "CONV1x1:X86_F32_BLAS:48");
  866. }
  867. #endif
  868. TEST_F(X86_MULTI_THREADS, CONV_BIAS_CONV1X1_S1_INT8X8X32) {
  869. using namespace conv_bias;
  870. UniformIntRNG rng{-50, 50};
  871. float epsilon = 0.001;
  872. std::vector<conv_bias::TestArg> args = get_conv_bias_1x1_args(true, true);
  873. #if MEGDNN_X86_WITH_MKL_DNN
  874. if (x86::is_supported(x86::SIMDType::VNNI)) {
  875. checker_conv_bias(args, handle(), &rng, epsilon, dtype::Int8{},
  876. dtype::Int8{}, dtype::Int32{}, dtype::Int32{},
  877. "CONV1x1:X86_INT8X8X32_MKLDNN:24");
  878. }
  879. #endif
  880. #if MEGDNN_X86_WITH_VNNI
  881. if (x86::is_supported(x86::SIMDType::VNNI)) {
  882. checker_conv_bias(args, handle(), &rng, epsilon, dtype::Int8{},
  883. dtype::Int8{}, dtype::Int32{}, dtype::Int32{},
  884. "CONV1x1:X86_INT8X8X32_VNNI:24");
  885. }
  886. #endif
  887. if (x86::is_supported(x86::SIMDType::AVX2)) {
  888. checker_conv_bias(args, handle(), &rng, epsilon, dtype::Int8{},
  889. dtype::Int8{}, dtype::Int32{}, dtype::Int32{},
  890. "CONV1x1:X86_INT8X8X32_AVX2_4X16X2:24");
  891. checker_conv_bias(args, handle(), &rng, epsilon, dtype::Int8{},
  892. dtype::Int8{}, dtype::Int32{}, dtype::Int32{},
  893. "CONV1x1:X86_INT8X8X32_AVX2_2X4X16:24");
  894. }
  895. checker_conv_bias(args, handle(), &rng, epsilon, dtype::Int8{},
  896. dtype::Int8{}, dtype::Int32{}, dtype::Int32{},
  897. "CONV1x1:X86_INT8X8X32_SSE_4X8X2:48");
  898. }
  899. /************************* End Conv1x1 PackA ************************/
  900. #endif
  901. TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QINT8) {
  902. using namespace conv_bias;
  903. std::vector<TestArg> args;
  904. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  905. size_t p, NonlineMode nonline_mode) {
  906. if (w + 2 * p < kernel || h + 2 * p < kernel)
  907. return;
  908. param::ConvBias param;
  909. param.stride_h = 1;
  910. param.stride_w = 1;
  911. param.pad_h = p;
  912. param.pad_w = p;
  913. param.nonlineMode = nonline_mode;
  914. //! no bias
  915. args.emplace_back(param, TensorShape{1, ic, h, w},
  916. TensorShape{oc, ic, kernel, kernel}, TensorShape{});
  917. //! bias channel
  918. args.emplace_back(param, TensorShape{2, ic, h, w},
  919. TensorShape{oc, ic, kernel, kernel},
  920. TensorShape{1, oc, 1, 1});
  921. };
  922. for (size_t kernel : {2, 3, 4, 5, 6, 7})
  923. for (size_t ic : {1, 4, 8, 16})
  924. for (size_t oc : {1, 4, 8})
  925. for (size_t p : {0, 2})
  926. for (size_t size : {20, 21, 24})
  927. for (NonlineMode nonline_mode :
  928. {NonlineMode::IDENTITY, NonlineMode::RELU,
  929. NonlineMode::H_SWISH}) {
  930. run(oc, ic, size, size, kernel, p, nonline_mode);
  931. }
  932. run(2046, 8, 20, 20, 3, 1, NonlineMode::IDENTITY);
  933. Checker<ConvBias> checker(handle());
  934. #define cb(algo_name) \
  935. checker.set_before_exec_callback( \
  936. conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name)); \
  937. UniformIntRNG rng{-50, 50}; \
  938. for (auto&& arg : args) { \
  939. checker.set_dtype(0, dtype::QuantizedS8(2.5f)) \
  940. .set_dtype(1, dtype::QuantizedS8(2.5f)) \
  941. .set_dtype(2, dtype::QuantizedS32(6.25f)) \
  942. .set_dtype(4, dtype::QuantizedS8(60.25)) \
  943. .set_rng(0, &rng) \
  944. .set_rng(1, &rng) \
  945. .set_rng(2, &rng) \
  946. .set_param(arg.param) \
  947. .execs({arg.src, arg.filter, {}, {}, {}}); \
  948. }
  949. #if MEGDNN_X86_WITH_MKL_DNN
  950. if (x86::is_supported(x86::SIMDType::VNNI)) {
  951. cb("IM2COLMATMUL:X86_INT8X8X32_MKLDNN");
  952. }
  953. #endif
  954. #if MEGDNN_X86_WITH_VNNI
  955. if (x86::is_supported(x86::SIMDType::VNNI)) {
  956. cb("IM2COLMATMUL:X86_INT8X8X32_VNNI");
  957. }
  958. #endif
  959. if (x86::is_supported(x86::SIMDType::AVX2)) {
  960. cb("IM2COLMATMUL:X86_INT8X8X32_AVX2_2X4X16");
  961. }
  962. #undef cb
  963. }
TEST_F(X86, CONV_BIAS_MATMUL) {
    using namespace conv_bias;
    std::vector<TestArg> args;
    //! append stride-1 conv cases: dense no-bias / channel-bias / full-bias,
    //! plus one 2-group grouped case; geometries where the padded input is
    //! smaller than the kernel are skipped
    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
                   size_t p, NonlineMode nonline_mode) {
        if (w + 2 * p < kernel || h + 2 * p < kernel)
            return;
        param::ConvBias param;
        param.stride_h = 1;
        param.stride_w = 1;
        param.pad_h = p;
        param.pad_w = p;
        param.nonlineMode = nonline_mode;
        //! no bias
        param.sparse = param::ConvBias::Sparse::DENSE;
        args.emplace_back(param, TensorShape{1, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel}, TensorShape{});
        //! bias channel
        args.emplace_back(param, TensorShape{2, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel},
                          TensorShape{1, oc, 1, 1});
        //! bias
        args.emplace_back(param, TensorShape{2, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel},
                          TensorShape{2, oc, (h + param.pad_h * 2 - kernel) + 1,
                                      (w + param.pad_w * 2 - kernel) + 1});
        //! group
        param.sparse = param::ConvBias::Sparse::GROUP;
        args.emplace_back(
                param, TensorShape{2, 2 * ic, h, w},
                TensorShape{2, oc, ic, kernel, kernel},
                TensorShape{2, 2 * oc, (h + param.pad_h * 2 - kernel) + 1,
                            (w + param.pad_w * 2 - kernel) + 1});
    };
    for (size_t kernel : {2, 3, 5, 7})
        for (size_t ic : {1, 2, 3, 4})
            for (size_t oc : {1, 2, 3, 4})
                for (size_t p : {0, 2})
                    for (size_t size : {20, 21, 22, 23, 24})
                        for (NonlineMode nonline_mode :
                             {NonlineMode::RELU, NonlineMode::SIGMOID,
                              NonlineMode::H_SWISH, NonlineMode::IDENTITY}) {
                            run(oc, ic, size, size, kernel, p, nonline_mode);
                        }
    Checker<ConvBias> checker(handle());
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "X86_CONV_BIAS_MATMUL"));
    //! integer inputs, so an absolute tolerance of 1 suffices
    checker.set_epsilon(1);
    UniformIntRNG rng{-50, 50};
    checker.set_dtype(0, dtype::Float32())
            .set_dtype(1, dtype::Float32())
            .set_dtype(2, dtype::Float32())
            .set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &rng);
    for (auto&& arg : args) {
        checker.set_param(arg.param).exec(
                {arg.src, arg.filter, arg.bias, {}, {}});
    }
}
  1025. #if MEGDNN_WITH_BENCHMARK
  1026. #if MEGDNN_X86_WITH_MKL_DNN
  1027. static void x86_benchmark_fp32_mkldnn(Handle* handle) {
  1028. constexpr size_t RUNS = 30;
  1029. param::ConvBias param;
  1030. Benchmarker<ConvBias> benchmarker_mkldnn(handle);
  1031. benchmarker_mkldnn.set_display(false).set_times(RUNS);
  1032. benchmarker_mkldnn.set_before_exec_callback(
  1033. AlgoChecker<ConvBias>("MKLDNN_CONV_FP32"));
  1034. Benchmarker<ConvBias> benchmarker_im2col(handle);
  1035. benchmarker_im2col.set_display(false).set_times(RUNS);
  1036. benchmarker_im2col.set_before_exec_callback(
  1037. AlgoChecker<ConvBias>("IM2COLMATMUL.+"));
  1038. auto run = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W,
  1039. size_t FS, size_t SZ, size_t GROUP = 1) {
  1040. TensorShape src({N, IC, H, W}), filter({OC, IC, FS, FS}),
  1041. bias({1, OC, 1, 1}), z({}), dst({N, OC, H / SZ, W / SZ});
  1042. param.pad_h = FS / 2;
  1043. param.pad_w = FS / 2;
  1044. param.stride_h = SZ;
  1045. param.stride_w = SZ;
  1046. param.format = param::ConvBias::Format::NCHW;
  1047. param.sparse = param::ConvBias::Sparse::DENSE;
  1048. if (GROUP > 1) {
  1049. param.sparse = param::ConvBias::Sparse::GROUP;
  1050. filter = {GROUP, OC / GROUP, IC / GROUP, FS, FS};
  1051. }
  1052. auto im2col_used = benchmarker_im2col.set_param(param).exec(
  1053. {src, filter, bias, z, dst}) /
  1054. RUNS;
  1055. src = IC < 8 ? TensorShape{N, IC, H, W}
  1056. : TensorShape{N, IC / 8, H, W, 8};
  1057. filter = IC < 8 ? TensorShape{OC / 8, FS, FS, IC, 8}
  1058. : TensorShape{OC / 8, IC / 8, FS, FS, 8, 8};
  1059. if (GROUP > 1 && OC == GROUP && IC == GROUP) {
  1060. filter = {GROUP / 8, 1, 1, FS, FS, 8};
  1061. } else if (GROUP > 1 && OC / GROUP % 8 == 0 && IC / GROUP % 8 == 0) {
  1062. filter = {GROUP, OC / GROUP / 8, IC / GROUP / 8, FS, FS, 8, 8};
  1063. }
  1064. bias = {1, OC / 8, 1, 1, 8};
  1065. z = {};
  1066. dst = {N, OC / 8, H / SZ, W / SZ, 8};
  1067. param.format = param::ConvBias::Format::NCHW88;
  1068. auto mkldnn_used = benchmarker_mkldnn.set_param(param).exec(
  1069. {src, filter, bias, z, dst}) /
  1070. RUNS;
  1071. float computations =
  1072. (IC / GROUP * FS * FS + 1) * dst.total_nr_elems() * 2 * 1e-6;
  1073. std::cout << "run " << src.to_string() << " " << filter.to_string()
  1074. << " " << bias.to_string() << " " << dst.to_string()
  1075. << std::endl;
  1076. std::cout << "im2col: " << im2col_used << " ms, "
  1077. << (computations / im2col_used) << " Gops, ";
  1078. std::cout << "mkldnn: " << mkldnn_used << " ms, "
  1079. << (computations / mkldnn_used) << " Gops, "
  1080. << "spped up: " << (im2col_used / mkldnn_used) << ", ";
  1081. std::cout << std::endl;
  1082. };
  1083. run(1, 64, 64, 56, 56, 3, 1);
  1084. run(1, 3, 64, 224, 224, 3, 1);
  1085. run(1, 3, 64, 224, 224, 7, 2);
  1086. run(1, 64, 64, 56, 56, 3, 1);
  1087. run(1, 128, 128, 28, 28, 3, 1);
  1088. run(1, 256, 256, 14, 14, 3, 1);
  1089. run(1, 512, 512, 7, 7, 3, 1);
  1090. run(1, 256, 64, 56, 56, 1, 1);
  1091. run(1, 512, 128, 28, 28, 1, 1);
  1092. run(1, 1024, 256, 14, 14, 1, 1);
  1093. run(1, 2048, 512, 7, 7, 1, 1);
  1094. run(1, 32, 32, 112, 112, 3, 1, 32);
  1095. run(1, 144, 144, 56, 56, 3, 1, 144);
  1096. run(1, 192, 192, 28, 28, 3, 1, 192);
  1097. run(1, 384, 384, 28, 28, 3, 1, 384);
  1098. run(1, 576, 576, 14, 14, 3, 1, 576);
  1099. run(1, 960, 960, 7, 7, 3, 1, 960);
  1100. run(1, 256, 128, 56, 56, 1, 2, 1);
  1101. run(1, 512, 256, 28, 28, 1, 2, 1);
  1102. run(1, 1024, 512, 14, 14, 1, 2, 1);
  1103. run(1, 96, 96, 112, 112, 3, 2, 96);
  1104. run(1, 144, 144, 56, 56, 3, 2, 144);
  1105. run(1, 384, 384, 28, 28, 3, 2, 384);
  1106. run(1, 576, 576, 14, 14, 3, 2, 576);
  1107. }
//! single-thread run of the fp32 MKL-DNN vs im2col benchmark
TEST_F(X86, BENCHMARK_CONVBIAS_FP32_MKLDNN) {
    x86_benchmark_fp32_mkldnn(handle());
}
//! multi-thread run of the fp32 MKL-DNN vs im2col benchmark
TEST_F(X86_MULTI_THREADS, BENCHMARK_CONVBIAS_FP32_MKLDNN) {
    x86_benchmark_fp32_mkldnn(handle());
}
  1114. #endif
  1115. #endif
  1116. /************************* Winograd ****************************/
namespace{
//! Generate NCHW88 3x3 winograd test cases: pad 1, dense sparse mode, with
//! per-channel bias, no bias and full bias variants for each nonlinearity.
std::vector<conv_bias::TestArg> get_winograd_mk_nchw88_args() {
    std::vector<conv_bias::TestArg> args;
    param::ConvBias cur_param;
    cur_param.format = param::ConvBias::Format::NCHW88;
    using NLMode = param::ConvBias::NonlineMode;
    // clang-format off
    for (auto nlmode :
        {NLMode::IDENTITY, NLMode::RELU, NLMode::SIGMOID, NLMode::H_SWISH}) {
    for (size_t ic : {1, 2}) {
    for (size_t oc : {1, 2}) {
    for (size_t i : {9, 63}) {
        cur_param.mode = param::ConvBias::Mode::CROSS_CORRELATION;
        cur_param.nonlineMode = nlmode;
        cur_param.sparse = param::ConvBias::Sparse::DENSE;
        cur_param.pad_h = cur_param.pad_w = 1;
        args.emplace_back(cur_param, TensorShape{1, ic, i, i, 8},
                          TensorShape{oc, ic, 3, 3, 8, 8},
                          TensorShape{1, oc, 1, 1, 8});
        args.emplace_back(cur_param, TensorShape{1, ic, i, i, 8},
                          TensorShape{oc, ic, 3, 3, 8, 8},TensorShape{});
        //! bias
        args.emplace_back(cur_param, TensorShape{2, ic, i, i, 8},
                          TensorShape{oc, ic, 3, 3, 8, 8}, TensorShape{2, oc, i, i, 8});
        /*cur_param.sparse = param::ConvBias::Sparse::GROUP;
        args.emplace_back(cur_param, TensorShape{2, 2 * ic, i, i, 8},
                          TensorShape{2, oc, ic, 3, 3, 8, 8},
                          TensorShape{1, 2 * oc, 1, 1, 8});*/
    }}}
    // clang-format on
    //! test for multi-thread OC parallel
    //! NOTE(review): only the ic/oc/i loops are closed above, so this case is
    //! appended once per nlmode iteration (4 times) — confirm that is intended
    cur_param.sparse = param::ConvBias::Sparse::DENSE;
    cur_param.pad_h = cur_param.pad_w = 1;
    args.emplace_back(cur_param, TensorShape{2, 1, 9, 9, 8},
                      TensorShape{128, 1, 3, 3, 8, 8},
                      TensorShape{1, 128, 1, 1, 8});
    /*cur_param.sparse = param::ConvBias::Sparse::GROUP;
    args.emplace_back(cur_param, TensorShape{2, 2, 9, 9, 8},
                      TensorShape{2, 128, 1, 3, 3, 8, 8},
                      TensorShape{1, 2 * 128, 1, 1, 8});*/
    }
    return args;
}
} // namespace
  1161. TEST_F(X86_MULTI_THREADS, CONV_BIAS_WINOGRAD_NCHW88_F63) {
  1162. using namespace conv_bias;
  1163. std::vector<TestArg> args = get_winograd_mk_nchw88_args();
  1164. Checker<ConvBiasForward> checker(handle());
  1165. checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBias>(
  1166. ssprintf("WINOGRAD:X86_F32MK8_8X8:8:6").c_str()));
  1167. for (auto&& arg : args) {
  1168. checker.set_param(arg.param).execs(
  1169. {arg.src, arg.filter, arg.bias, {}, {}});
  1170. }
  1171. }
  1172. TEST_F(X86_MULTI_THREADS, CONV_BIAS_WINOGRAD_NCHW88_F23) {
  1173. using namespace conv_bias;
  1174. std::vector<TestArg> args = get_winograd_mk_nchw88_args();
  1175. Checker<ConvBiasForward> checker(handle());
  1176. checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBias>(
  1177. ssprintf("WINOGRAD:X86_F32MK8_8X8:8:2").c_str()));
  1178. for (auto&& arg : args) {
  1179. checker.set_param(arg.param).execs(
  1180. {arg.src, arg.filter, arg.bias, {}, {}});
  1181. }
  1182. }
TEST_F(X86_MULTI_THREADS, CONV_BIAS_WINOGRAD_WEIGHT_PREPROCESS) {
    using namespace conv_bias;
    std::vector<TestArg> args = get_winograd_mk_nchw88_args();
    Checker<ConvBiasForward> checker(handle());
    //! reference implementation: explicitly run WinogradFilterPreprocess to
    //! transform the filter, then run conv_bias in NCHW88_WINOGRAD format on
    //! the transformed filter
    auto extra_impl = [](const TensorNDArray& tensors, uint32_t m,
                         param::ConvBias param, Handle* handle) {
        megdnn_assert(param.format == param::ConvBias::Format::NCHW88);
        auto winograd_preprocess_opr =
                handle->create_operator<WinogradFilterPreprocess>();
        winograd_preprocess_opr->param().output_block_size = m;
        winograd_preprocess_opr->param().format = param::MatrixMul::Format::MK8;
        TensorLayout filter_transform_layout;
        winograd_preprocess_opr->deduce_layout(tensors[1].layout,
                                               filter_transform_layout);
        size_t winograd_preprocess_workspace_in_bytes =
                winograd_preprocess_opr->get_workspace_in_bytes(
                        tensors[1].layout, filter_transform_layout);
        auto conv_bias_opr = handle->create_operator<ConvBias>();
        conv_bias_opr->param() = param;
        conv_bias_opr->param().format = param::ConvBias::Format::NCHW88_WINOGRAD;
        conv_bias_opr->param().output_block_size = m;
        size_t conv_bias_workspace_in_bytes =
                conv_bias_opr->get_workspace_in_bytes(
                        tensors[0].layout, filter_transform_layout,
                        tensors[2].layout, tensors[3].layout,
                        tensors[4].layout);
        //! one allocation backs all three regions: 0 = transformed filter,
        //! 1 = conv_bias workspace, 2 = preprocess workspace; freed below
        WorkspaceBundle wb(nullptr, {filter_transform_layout.span().dist_byte(),
                                     conv_bias_workspace_in_bytes,
                                     winograd_preprocess_workspace_in_bytes});
        wb.set(malloc(wb.total_size_in_bytes()));
        TensorND filter_transform_tensor(wb.get(0),
                                         std::move(filter_transform_layout));
        winograd_preprocess_opr->exec(tensors[1], filter_transform_tensor,
                                      wb.get_workspace(2));
        conv_bias_opr->exec(tensors[0], filter_transform_tensor, tensors[2],
                            tensors[3], tensors[4], wb.get_workspace(1));
        free(wb.ptr());
    };
    //! for each case and each output block size m, compare the checker result
    //! against the explicit preprocess + conv reference above
    auto run = [&checker, &extra_impl](
                       Handle* handle, const std::vector<TestArg>& args,
                       const std::vector<size_t>& out_size, DType A_dtype,
                       DType B_dtype, DType C_dtype, DType D_dtype,
                       const float eps) {
        for (auto&& arg : args) {
            for (uint32_t m : out_size) {
                checker.set_extra_opr_impl(std::bind(extra_impl,
                                                     std::placeholders::_1, m,
                                                     arg.param, handle));
                checker.set_dtype(0, A_dtype)
                        .set_dtype(1, B_dtype)
                        .set_dtype(2, C_dtype)
                        .set_dtype(4, D_dtype)
                        .set_epsilon(eps)
                        .set_param(arg.param)
                        .execs({arg.src, arg.filter, arg.bias, {}, {}});
            }
        }
    };
    run(handle(), args, {2, 6}, dtype::Float32(), dtype::Float32(),
        dtype::Float32(), dtype::Float32(), 1e-3f);
}
  1244. /*********************************** End winograd ************************/
  1245. #if MEGDNN_X86_WITH_MKL_DNN
//! Check MKL-DNN NCHW88 fp32 convolution for one configuration.
//! Configurations the backend cannot express are silently skipped: either
//! every group needs a multiple-of-8 output channel count and a
//! multiple-of-8 (or exactly 3) input channel count, or the conv must be
//! purely depthwise (oc == ic == group).
static void x86_correctness_fp32_mkldnn_run(
        Checker<ConvBias>& checker, UniformIntRNG& rng, Handle* handle,
        ConvBiasForward::BiasMode bias_mode,
        param::ConvBias::NonlineMode noline_mode, size_t n, size_t stride,
        size_t kernel, size_t oc, size_t ic, size_t h, size_t w, size_t group) {
    auto oc_per_group = oc / group;
    auto ic_per_group = ic / group;
    bool ok_group = oc_per_group % 8 == 0 && oc_per_group > 0 &&
                    (ic_per_group % 8 == 0 || ic_per_group == 3) &&
                    ic_per_group > 0;
    bool ok_depthwise = oc == ic && oc == group;
    if (!(ok_group || ok_depthwise)) {
        return;
    }
    //! "same"-style padding derived from the kernel size
    size_t pad = kernel / 2;
    size_t kernel_h = kernel;
    size_t kernel_w = kernel;
    param::ConvBias param;
    param.format = param::ConvBias::Format::NCHW88;
    param.stride_h = stride;
    param.stride_w = stride;
    param.pad_h = pad;
    param.pad_w = pad;
    param.nonlineMode = noline_mode;
    //! ic == 3 uses a plain NCHW source with a hybrid-layout filter
    auto src_tensor_shape = TensorShape{n, ic / 8, h, w, 8};
    if (ic == 3) {
        src_tensor_shape = TensorShape{n, ic, h, w};
    }
    auto weight_tensor_shape =
            TensorShape{oc / 8, ic / 8, kernel_h, kernel_w, 8, 8};
    if (ic == 3) {
        weight_tensor_shape = TensorShape{oc / 8, kernel_h, kernel_w, ic, 8};
    }
    auto bias_tensor_shape = TensorShape{};
    if (bias_mode == megdnn::BiasMode::BROADCAST_CHANNEL_BIAS) {
        bias_tensor_shape = {1, oc / 8, 1, 1, 8};
    } else if (bias_mode == megdnn::BiasMode::BIAS) {
        //! full bias must match the deduced output layout exactly
        TensorLayout dst_layout;
        auto ConvBiasOp = handle->create_operator<ConvBias>();
        ConvBiasOp->param() = param;
        ConvBiasOp->deduce_layout({src_tensor_shape, dtype::Float32()},
                                  {weight_tensor_shape, dtype::Float32()}, {},
                                  {}, dst_layout);
        bias_tensor_shape = dst_layout;
    }
    if (group == 1) {
        param.sparse = param::ConvBias::Sparse::DENSE;
    } else if (group > 1 && ic / group == 1 && oc / group == 1) {
        //! depthwise: one input/output channel per group
        param.sparse = param::ConvBias::Sparse::GROUP;
        weight_tensor_shape =
                TensorShape{group / 8, 1, 1, kernel_h, kernel_w, 8};
    } else if (group > 1 && oc / group % 8 == 0 && oc / group > 0 &&
               ic / group % 8 == 0 && ic / group > 0) {
        param.sparse = param::ConvBias::Sparse::GROUP;
        weight_tensor_shape = TensorShape{
                group, oc / group / 8, ic / group / 8, kernel_h, kernel_w, 8,
                8};
    }
    checker.set_dtype(0, dtype::Float32())
            .set_dtype(1, dtype::Float32())
            .set_dtype(2, dtype::Float32())
            .set_dtype(4, dtype::Float32())
            .set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &rng)
            .set_epsilon(1e-3)
            .set_param(param)
            .execs({src_tensor_shape,
                    weight_tensor_shape,
                    bias_tensor_shape,
                    {},
                    {}});
}
//! Sweep bias modes, nonlinearities, batch, stride, kernel, channel counts,
//! spatial sizes and group counts, delegating each combination to
//! x86_correctness_fp32_mkldnn_run (which skips unsupported ones).
static void x86_correctness_fp32_mkldnn(Handle* handle) {
    Checker<ConvBias> checker(handle);
    UniformIntRNG rng{-127, 127};
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "MKLDNN_CONV_FP32"));
    for (auto bias_mode :
         {megdnn::BiasMode::NO_BIAS, megdnn::BiasMode::BROADCAST_CHANNEL_BIAS,
          megdnn::BiasMode::BIAS})
        for (auto noline_mode : {param::ConvBias::NonlineMode::IDENTITY,
                                 param::ConvBias::NonlineMode::SIGMOID,
                                 param::ConvBias::NonlineMode::H_SWISH})
            for (size_t n : {1, 2})
                for (size_t stride : {1, 2})
                    for (size_t kernel : {3, 5, 7})
                        for (size_t oc : {8, 16})
                            for (size_t ic : {3, 8, 16})
                                for (size_t h : {22, 33})
                                    for (size_t w : {22, 33}) {
                                        for (size_t group = 1;
                                             group <= std::min(oc, ic);
                                             ++group) {
                                            x86_correctness_fp32_mkldnn_run(
                                                    checker, rng, handle,
                                                    bias_mode, noline_mode, n,
                                                    stride, kernel, oc, ic, h,
                                                    w, group);
                                        }
                                    }
}
//! single-thread MKL-DNN NCHW88 fp32 correctness sweep
TEST_F(X86, CONV_BIAS_DIRECT_MKLDNN_C8) {
    x86_correctness_fp32_mkldnn(handle());
}
//! multi-thread MKL-DNN NCHW88 fp32 correctness sweep
TEST_F(X86_MULTI_THREADS, CONV_BIAS_DIRECT_MKLDNN_C8) {
    x86_correctness_fp32_mkldnn(handle());
}
  1355. TEST_F(X86, CONV_BIAS_MKL_DNN_MATMUL_INT8) {
  1356. using namespace conv_bias;
  1357. std::vector<TestArg> args;
  1358. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  1359. size_t p, NonlineMode nonline_mode) {
  1360. if (w + 2 * p < kernel || h + 2 * p < kernel)
  1361. return;
  1362. param::ConvBias param;
  1363. param.stride_h = 1;
  1364. param.stride_w = 1;
  1365. param.pad_h = p;
  1366. param.pad_w = p;
  1367. param.nonlineMode = nonline_mode;
  1368. //! no bias
  1369. args.emplace_back(param, TensorShape{1, ic, h, w},
  1370. TensorShape{oc, ic, kernel, kernel}, TensorShape{});
  1371. };
  1372. for (size_t kernel : {2, 3, 5, 7})
  1373. for (size_t ic : {1, 2, 3, 4})
  1374. for (size_t oc : {1, 2, 4})
  1375. for (size_t p : {0, 2})
  1376. for (size_t size : {20, 21, 22, 23, 24})
  1377. for (NonlineMode nonline_mode :
  1378. {NonlineMode::IDENTITY}) {
  1379. run(oc, ic, size, size, kernel, p, nonline_mode);
  1380. }
  1381. Checker<ConvBias> checker(handle());
  1382. checker.set_before_exec_callback(
  1383. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  1384. "MKLDNN_MATMUL_INT8"));
  1385. checker.set_epsilon(1);
  1386. UniformIntRNG rng{-50, 50};
  1387. checker.set_dtype(0, dtype::Int8())
  1388. .set_dtype(1, dtype::Int8())
  1389. .set_dtype(2, dtype::Int32())
  1390. .set_dtype(4, dtype::Int32())
  1391. .set_rng(0, &rng)
  1392. .set_rng(1, &rng)
  1393. .set_rng(2, &rng);
  1394. for (auto&& arg : args) {
  1395. checker.set_param(arg.param).exec(
  1396. {arg.src, arg.filter, arg.bias, {}, {}});
  1397. }
  1398. }
  1399. TEST_F(X86, CONV_BIAS_MKL_DNN_INT8) {
  1400. using namespace conv_bias;
  1401. std::vector<TestArg> args;
  1402. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  1403. size_t p, NonlineMode nonline_mode) {
  1404. if (w + 2 * p < kernel || h + 2 * p < kernel)
  1405. return;
  1406. param::ConvBias param;
  1407. param.stride_h = 1;
  1408. param.stride_w = 1;
  1409. param.pad_h = p;
  1410. param.pad_w = p;
  1411. param.nonlineMode = nonline_mode;
  1412. //! no bias
  1413. args.emplace_back(param, TensorShape{1, ic, h, w},
  1414. TensorShape{oc, ic, kernel, kernel}, TensorShape{});
  1415. };
  1416. for (size_t kernel : {2, 3, 5, 7})
  1417. for (size_t ic : {1, 2, 3, 4})
  1418. for (size_t oc : {1, 2, 4})
  1419. for (size_t p : {0, 2})
  1420. for (size_t size : {20, 22, 24})
  1421. for (NonlineMode nonline_mode :
  1422. {NonlineMode::IDENTITY}) {
  1423. run(oc, ic, size, size, kernel, p, nonline_mode);
  1424. }
  1425. Checker<ConvBias> checker(handle());
  1426. checker.set_before_exec_callback(
  1427. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("MKLDNN_INT8"));
  1428. checker.set_epsilon(1);
  1429. UniformIntRNG rng{-50, 50};
  1430. checker.set_dtype(0, dtype::Int8())
  1431. .set_dtype(1, dtype::Int8())
  1432. .set_dtype(2, dtype::Int32())
  1433. .set_dtype(4, dtype::Int32())
  1434. .set_rng(0, &rng)
  1435. .set_rng(1, &rng)
  1436. .set_rng(2, &rng);
  1437. for (auto&& arg : args) {
  1438. checker.set_param(arg.param).exec(
  1439. {arg.src, arg.filter, arg.bias, {}, {}});
  1440. }
  1441. }
  1442. TEST_F(X86_MULTI_THREADS, CONV_BIAS_MKL_DNN_INT8) {
  1443. using namespace conv_bias;
  1444. std::vector<TestArg> args;
  1445. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  1446. size_t p, NonlineMode nonline_mode) {
  1447. if (w + 2 * p < kernel || h + 2 * p < kernel)
  1448. return;
  1449. param::ConvBias param;
  1450. param.stride_h = 1;
  1451. param.stride_w = 1;
  1452. param.pad_h = p;
  1453. param.pad_w = p;
  1454. param.nonlineMode = nonline_mode;
  1455. //! no bias
  1456. args.emplace_back(param, TensorShape{1, ic, h, w},
  1457. TensorShape{oc, ic, kernel, kernel}, TensorShape{});
  1458. };
  1459. for (size_t kernel : {2, 3, 5, 7})
  1460. for (size_t ic : {1, 2, 3, 4})
  1461. for (size_t oc : {1, 2, 4})
  1462. for (size_t p : {0, 2})
  1463. for (size_t size : {20, 22, 24})
  1464. for (NonlineMode nonline_mode :
  1465. {NonlineMode::IDENTITY}) {
  1466. run(oc, ic, size, size, kernel, p, nonline_mode);
  1467. }
  1468. Checker<ConvBias> checker(handle());
  1469. checker.set_before_exec_callback(
  1470. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("MKLDNN_INT8"));
  1471. checker.set_epsilon(1);
  1472. UniformIntRNG rng{-50, 50};
  1473. checker.set_dtype(0, dtype::Int8())
  1474. .set_dtype(1, dtype::Int8())
  1475. .set_dtype(2, dtype::Int32())
  1476. .set_dtype(4, dtype::Int32())
  1477. .set_rng(0, &rng)
  1478. .set_rng(1, &rng)
  1479. .set_rng(2, &rng);
  1480. for (auto&& arg : args) {
  1481. checker.set_param(arg.param).exec(
  1482. {arg.src, arg.filter, arg.bias, {}, {}});
  1483. }
  1484. }
  1485. #endif
  1486. #if MEGDNN_WITH_BENCHMARK
  1487. namespace {
  1488. void benchmark_impl(const param::ConvBias param,
  1489. std::vector<std::pair<SmallVector<TensorShape>, float>>&
  1490. shapes_and_computation,
  1491. const std::string algo_name, size_t RUNS,
  1492. TaskExecutorConfig&& multi_thread_config,
  1493. TaskExecutorConfig&& single_thread_config,
  1494. std::vector<DType> dtype_v) {
  1495. std::vector<DType> data_type = {dtype::Float32(), dtype::Float32(),
  1496. dtype::Float32(), dtype::Float32()};
  1497. std::vector<float> multi_thread_times, single_thread_times;
  1498. {
  1499. auto multi_thread_hanle =
  1500. create_cpu_handle(0, true, &multi_thread_config);
  1501. auto benchmarker = Benchmarker<ConvBias>(multi_thread_hanle.get());
  1502. benchmarker.set_times(RUNS)
  1503. .set_display(false)
  1504. .set_dtype(0, dtype_v[0])
  1505. .set_dtype(1, dtype_v[1])
  1506. .set_dtype(2, dtype_v[2])
  1507. .set_dtype(4, dtype_v[3])
  1508. .set_param(param)
  1509. .set_before_exec_callback(
  1510. conv_bias::ConvBiasAlgoChecker<ConvBias>(
  1511. algo_name.c_str()));
  1512. for (auto shape : shapes_and_computation) {
  1513. multi_thread_times.push_back(benchmarker.exec(shape.first) / RUNS);
  1514. }
  1515. }
  1516. {
  1517. auto single_thread_handle =
  1518. create_cpu_handle(0, true, &single_thread_config);
  1519. auto benchmarker = Benchmarker<ConvBias>(single_thread_handle.get());
  1520. benchmarker.set_times(RUNS)
  1521. .set_display(false)
  1522. .set_dtype(0, dtype_v[0])
  1523. .set_dtype(1, dtype_v[1])
  1524. .set_dtype(2, dtype_v[2])
  1525. .set_dtype(4, dtype_v[3])
  1526. .set_param(param)
  1527. .set_before_exec_callback(
  1528. conv_bias::ConvBiasAlgoChecker<ConvBias>(
  1529. algo_name.c_str()));
  1530. for (auto shape : shapes_and_computation) {
  1531. single_thread_times.push_back(benchmarker.exec(shape.first) / RUNS);
  1532. }
  1533. }
  1534. printf("Benchmark : Multi threads %zu, ", multi_thread_config.nr_thread);
  1535. printf("core_ids:");
  1536. for (size_t i = 0; i < multi_thread_config.affinity_core_set.size(); i++) {
  1537. printf("%zu ", multi_thread_config.affinity_core_set[i]);
  1538. }
  1539. printf(", Single thread core_id %zu\n",
  1540. single_thread_config.affinity_core_set[0]);
  1541. for (size_t i = 0; i < shapes_and_computation.size(); i++) {
  1542. auto shapes = shapes_and_computation[i];
  1543. printf("Bench case: ");
  1544. for (auto&& shape : shapes.first) {
  1545. printf("%s ", shape.to_string().c_str());
  1546. }
  1547. float computations = shapes.second;
  1548. printf("%zu threads gflops: %f,\n single thread gflops: "
  1549. "%f. spead up = %f, speedup/cores=%f\n",
  1550. multi_thread_config.nr_thread,
  1551. computations / multi_thread_times[i],
  1552. computations / single_thread_times[i],
  1553. single_thread_times[i] / multi_thread_times[i],
  1554. single_thread_times[i] / multi_thread_times[i] /
  1555. multi_thread_config.nr_thread);
  1556. }
  1557. }
  1558. void benchmark_impl_comp(const param::ConvBias param,
  1559. std::vector<std::pair<SmallVector<TensorShape>, float>>&
  1560. shapes_and_computation,
  1561. const std::string algo_name, const std::string algo_name1,size_t RUNS,
  1562. TaskExecutorConfig&& multi_thread_config,
  1563. TaskExecutorConfig&& single_thread_config,std::vector<DType> dtype_v) {
  1564. std::vector<DType> data_type = {dtype::Float32(), dtype::Float32(),
  1565. dtype::Float32(), dtype::Float32()};
  1566. std::vector<float> multi_thread_times, single_thread_times;
  1567. {
  1568. auto multi_thread_hanle =
  1569. create_cpu_handle(0, true, &multi_thread_config);
  1570. auto benchmarker = Benchmarker<ConvBias>(multi_thread_hanle.get());
  1571. benchmarker.set_times(RUNS)
  1572. .set_display(false)
  1573. .set_dtype(0,dtype_v[0])
  1574. .set_dtype(1,dtype_v[1])
  1575. .set_dtype(2,dtype_v[2])
  1576. .set_dtype(4,dtype_v[3])
  1577. .set_param(param)
  1578. .set_before_exec_callback(
  1579. conv_bias::ConvBiasAlgoChecker<ConvBias>(
  1580. algo_name.c_str()));
  1581. for (auto shape : shapes_and_computation) {
  1582. multi_thread_times.push_back(benchmarker.exec(shape.first) / RUNS);
  1583. }
  1584. }
  1585. {
  1586. auto single_thread_handle =
  1587. create_cpu_handle(0, true, &single_thread_config);
  1588. auto benchmarker = Benchmarker<ConvBias>(single_thread_handle.get());
  1589. benchmarker.set_times(RUNS)
  1590. .set_display(false)
  1591. .set_dtype(0,dtype_v[0])
  1592. .set_dtype(1,dtype_v[1])
  1593. .set_dtype(2,dtype_v[2])
  1594. .set_dtype(4,dtype_v[3])
  1595. .set_param(param)
  1596. .set_before_exec_callback(
  1597. conv_bias::ConvBiasAlgoChecker<ConvBias>(
  1598. algo_name1.c_str()));
  1599. for (auto shape : shapes_and_computation) {
  1600. single_thread_times.push_back(benchmarker.exec(shape.first) / RUNS);
  1601. }
  1602. }
  1603. printf("Benchmark : Multi threads %zu, ", multi_thread_config.nr_thread);
  1604. printf("core_ids:");
  1605. for (size_t i = 0; i < multi_thread_config.affinity_core_set.size(); i++) {
  1606. printf("%zu ", multi_thread_config.affinity_core_set[i]);
  1607. }
  1608. for (size_t i = 0; i < shapes_and_computation.size(); i++) {
  1609. auto shapes = shapes_and_computation[i];
  1610. printf("Bench case: ");
  1611. for (auto&& shape : shapes.first) {
  1612. printf("%s ", shape.to_string().c_str());
  1613. }
  1614. float computations = shapes.second;
  1615. printf("algo:%s gflops: %f,\n algo:%s gflops: "
  1616. "%f. spead up = %f\n",
  1617. algo_name.c_str(), computations / multi_thread_times[i],
  1618. algo_name1.c_str(), computations / single_thread_times[i],
  1619. single_thread_times[i] / multi_thread_times[i]);
  1620. }
  1621. }
  1622. } // namespace
  1623. TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_CHANWISE_AVX2_INT8) {
  1624. constexpr size_t RUNS = 50;
  1625. param::ConvBias param;
  1626. param.stride_h = 1;
  1627. param.stride_w = 1;
  1628. param.sparse = param::ConvBias::Sparse::GROUP;
  1629. std::vector<DType> data_type = {dtype::Int8(), dtype::Int8(),
  1630. dtype::Int32(), dtype::Int32()};
  1631. std::vector<std::pair<SmallVector<TensorShape>, float>>
  1632. shapes_and_computation;
  1633. auto bench_case = [&](size_t N, size_t IC, size_t H, size_t W, size_t FS) {
  1634. param.pad_h = FS / 2;
  1635. param.pad_w = FS / 2;
  1636. SmallVector<TensorShape> shapes{
  1637. {N, IC, H, W}, {IC, 1, 1, FS, FS}, {}, {}, {}};
  1638. TensorShape dst{N, IC, (H + 2 * param.pad_h - FS) + 1,
  1639. (W + 2 * param.pad_w - FS) + 1};
  1640. float computations = (FS * FS * dst.total_nr_elems() * 2) * 1e-6;
  1641. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  1642. };
  1643. bench_case(1, 32, 112, 112, 7);
  1644. bench_case(1, 144, 56, 56, 7);
  1645. bench_case(1, 192, 28, 28, 7);
  1646. bench_case(1, 384, 28, 28, 7);
  1647. bench_case(1, 576, 14, 14, 7);
  1648. bench_case(1, 960, 7, 7, 7);
  1649. bench_case(1, 32, 112, 112, 5);
  1650. bench_case(1, 144, 56, 56, 5);
  1651. bench_case(1, 192, 28, 28, 5);
  1652. bench_case(1, 384, 28, 28, 5);
  1653. bench_case(1, 576, 14, 14, 5);
  1654. bench_case(1, 960, 7, 7, 5);
  1655. bench_case(1, 32, 112, 112, 3);
  1656. bench_case(1, 144, 56, 56, 3);
  1657. bench_case(1, 192, 28, 28, 3);
  1658. bench_case(1, 384, 28, 28, 3);
  1659. bench_case(1, 576, 14, 14, 3);
  1660. bench_case(1, 960, 7, 7, 3);
  1661. bench_case(1, 32, 112, 112, 2);
  1662. bench_case(1, 144, 56, 56, 2);
  1663. bench_case(1, 192, 28, 28, 2);
  1664. bench_case(1, 384, 28, 28, 2);
  1665. bench_case(1, 576, 14, 14, 2);
  1666. bench_case(1, 960, 7, 7, 2);
  1667. std::string algo_name = "X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1";
  1668. printf("Benchmark X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1\n");
  1669. benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
  1670. {4, {4, 5, 6, 7}}, {1, {4}}, data_type);
  1671. benchmark_impl(param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}},
  1672. {1, {4}}, data_type);
  1673. shapes_and_computation.clear();
  1674. }
  1675. TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_DIRECT_AVX2_INT8) {
  1676. constexpr size_t RUNS = 50;
  1677. param::ConvBias param;
  1678. param.stride_h = 1;
  1679. param.stride_w = 1;
  1680. param.sparse = param::ConvBias::Sparse::DENSE;
  1681. std::vector<DType> data_type = {dtype::Int8(), dtype::Int8(),
  1682. dtype::Int32(), dtype::Int32()};
  1683. std::vector<std::pair<SmallVector<TensorShape>, float>>
  1684. shapes_and_computation;
  1685. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W,
  1686. size_t FS) {
  1687. param.pad_h = FS / 2;
  1688. param.pad_w = FS / 2;
  1689. SmallVector<TensorShape> shapes{
  1690. {N, IC, H, W}, {OC, IC, FS, FS}, {}, {}, {}};
  1691. TensorShape dst{N, OC, (H + 2 * param.pad_h - FS) + 1,
  1692. (W + 2 * param.pad_w - FS) + 1};
  1693. float computations = (IC * FS * FS * dst.total_nr_elems() * 2) * 1e-6;
  1694. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  1695. };
  1696. bench_case(1, 32, 32, 200, 200, 7);
  1697. bench_case(1, 32, 64, 200, 200, 7);
  1698. bench_case(1, 32, 32, 128, 128, 7);
  1699. bench_case(1, 32, 64, 128, 128, 7);
  1700. bench_case(1, 32, 32, 100, 100, 7);
  1701. bench_case(1, 32, 64, 100, 100, 7);
  1702. bench_case(1, 32, 32, 80, 80, 7);
  1703. bench_case(1, 32, 64, 80, 80, 7);
  1704. bench_case(1, 32, 32, 200, 200, 5);
  1705. bench_case(1, 32, 64, 200, 200, 5);
  1706. bench_case(1, 32, 32, 128, 128, 5);
  1707. bench_case(1, 32, 64, 128, 128, 5);
  1708. bench_case(1, 32, 32, 100, 100, 5);
  1709. bench_case(1, 32, 64, 100, 100, 5);
  1710. bench_case(1, 32, 32, 80, 80, 5);
  1711. bench_case(1, 32, 64, 80, 80, 5);
  1712. bench_case(1, 32, 32, 200, 200, 3);
  1713. bench_case(1, 32, 64, 200, 200, 3);
  1714. bench_case(1, 32, 32, 128, 128, 3);
  1715. bench_case(1, 32, 64, 128, 128, 3);
  1716. bench_case(1, 32, 32, 100, 100, 3);
  1717. bench_case(1, 32, 64, 100, 100, 3);
  1718. bench_case(1, 32, 32, 80, 80, 3);
  1719. bench_case(1, 32, 64, 80, 80, 3);
  1720. bench_case(1, 32, 32, 200, 200, 2);
  1721. bench_case(1, 32, 64, 200, 200, 2);
  1722. bench_case(1, 32, 32, 128, 128, 2);
  1723. bench_case(1, 32, 64, 128, 128, 2);
  1724. bench_case(1, 32, 32, 100, 100, 2);
  1725. bench_case(1, 32, 64, 100, 100, 2);
  1726. bench_case(1, 32, 32, 80, 80, 2);
  1727. bench_case(1, 32, 64, 80, 80, 2);
  1728. std::string algo_name = "X86_CONV_BIAS_DIRECT_AVX2_INT8_STRIDE1";
  1729. printf("Benchmark X86_CONV_BIAS_DIRECT_AVX2_INT8_STRIDE1 algo\n");
  1730. benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
  1731. {4, {4, 5, 6, 7}}, {1, {4}}, data_type);
  1732. benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
  1733. {4, {4, 5, 6, 7}}, {1, {7}}, data_type);
  1734. benchmark_impl(param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}},
  1735. {1, {4}}, data_type);
  1736. shapes_and_computation.clear();
  1737. }
  1738. TEST_F(X86_BENCHMARK_MULTI_THREADS,
  1739. BENCHMARK_CONVBIAS_DIRECT_AVX2_INT8_STRIDE2) {
  1740. constexpr size_t RUNS = 50;
  1741. param::ConvBias param;
  1742. param.stride_h = 2;
  1743. param.stride_w = 2;
  1744. param.sparse = param::ConvBias::Sparse::DENSE;
  1745. std::vector<DType> data_type = {dtype::Int8(), dtype::Int8(),
  1746. dtype::Int32(), dtype::Int32()};
  1747. std::vector<std::pair<SmallVector<TensorShape>, float>>
  1748. shapes_and_computation;
  1749. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W,
  1750. size_t FS) {
  1751. param.pad_h = FS / 2;
  1752. param.pad_w = FS / 2;
  1753. SmallVector<TensorShape> shapes{
  1754. {N, IC, H, W}, {OC, IC, FS, FS}, {}, {}, {}};
  1755. TensorShape dst{N, OC, (H + 2 * param.pad_h - FS) / param.stride_h + 1,
  1756. (W + 2 * param.pad_w - FS) / param.pad_w + 1};
  1757. float computations = (IC * FS * FS * dst.total_nr_elems() * 2) * 1e-6;
  1758. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  1759. };
  1760. bench_case(1, 32, 32, 200, 200, 7);
  1761. bench_case(1, 32, 64, 200, 200, 7);
  1762. bench_case(1, 32, 32, 128, 128, 7);
  1763. bench_case(1, 32, 64, 128, 128, 7);
  1764. bench_case(1, 32, 32, 100, 100, 7);
  1765. bench_case(1, 32, 64, 100, 100, 7);
  1766. bench_case(1, 32, 32, 80, 80, 7);
  1767. bench_case(1, 32, 64, 80, 80, 7);
  1768. bench_case(1, 32, 32, 200, 200, 5);
  1769. bench_case(1, 32, 64, 200, 200, 5);
  1770. bench_case(1, 32, 32, 128, 128, 5);
  1771. bench_case(1, 32, 64, 128, 128, 5);
  1772. bench_case(1, 32, 32, 100, 100, 5);
  1773. bench_case(1, 32, 64, 100, 100, 5);
  1774. bench_case(1, 32, 32, 80, 80, 5);
  1775. bench_case(1, 32, 64, 80, 80, 5);
  1776. bench_case(1, 32, 32, 200, 200, 3);
  1777. bench_case(1, 32, 64, 200, 200, 3);
  1778. bench_case(1, 32, 32, 128, 128, 3);
  1779. bench_case(1, 32, 64, 128, 128, 3);
  1780. bench_case(1, 32, 32, 100, 100, 3);
  1781. bench_case(1, 32, 64, 100, 100, 3);
  1782. bench_case(1, 32, 32, 80, 80, 3);
  1783. bench_case(1, 32, 64, 80, 80, 3);
  1784. bench_case(1, 32, 32, 200, 200, 2);
  1785. bench_case(1, 32, 64, 200, 200, 2);
  1786. bench_case(1, 32, 32, 128, 128, 2);
  1787. bench_case(1, 32, 64, 128, 128, 2);
  1788. bench_case(1, 32, 32, 100, 100, 2);
  1789. bench_case(1, 32, 64, 100, 100, 2);
  1790. bench_case(1, 32, 32, 80, 80, 2);
  1791. bench_case(1, 32, 64, 80, 80, 2);
  1792. std::string algo_name = "X86_CONV_BIAS_DIRECT_AVX2_INT8_STRIDE2";
  1793. printf("Benchmark X86_CONV_BIAS_DIRECT_AVX2_INT8_STRIDE2 algo\n");
  1794. benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
  1795. {4, {4, 5, 6, 7}}, {1, {4}}, data_type);
  1796. benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
  1797. {4, {4, 5, 6, 7}}, {1, {7}}, data_type);
  1798. benchmark_impl(param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}},
  1799. {1, {4}}, data_type);
  1800. shapes_and_computation.clear();
  1801. }
  1802. TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_DIRECTF32) {
  1803. constexpr size_t RUNS = 50;
  1804. param::ConvBias param;
  1805. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  1806. param.pad_h = 1;
  1807. param.pad_w = 1;
  1808. param.stride_h = 1;
  1809. param.stride_w = 1;
  1810. param.sparse = param::ConvBias::Sparse::GROUP;
  1811. std::vector<DType> data_type = {dtype::Float32(), dtype::Float32(),
  1812. dtype::Float32(), dtype::Float32()};
  1813. std::vector<std::pair<SmallVector<TensorShape>, float>>
  1814. shapes_and_computation;
  1815. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W,
  1816. size_t FS, size_t group) {
  1817. SmallVector<TensorShape> shapes{{N, IC, H, W},
  1818. {group, OC / group, IC / group, FS, FS},
  1819. {1, OC, 1, 1},
  1820. {},
  1821. {N, OC, H, W}};
  1822. TensorShape dst{N, OC, H, W};
  1823. float computations =
  1824. ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  1825. dst.total_nr_elems()) *
  1826. 1e-6;
  1827. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  1828. };
  1829. bench_case(1, 32, 32, 200, 200, 3, 4);
  1830. bench_case(1, 32, 32, 200, 200, 3, 32);
  1831. bench_case(1, 32, 32, 128, 128, 3, 4);
  1832. bench_case(1, 32, 32, 128, 128, 3, 32);
  1833. bench_case(1, 32, 32, 100, 100, 3, 4);
  1834. bench_case(1, 32, 32, 100, 100, 3, 32);
  1835. bench_case(1, 32, 32, 80, 80, 3, 4);
  1836. bench_case(1, 32, 32, 80, 80, 3, 32);
  1837. std::string algo_name = "X86_CONV_BIAS_DIRECT_STRIDE1_LARGE_GROUP";
  1838. printf("Benchmark X86_CONV_BIAS_DIRECT_STRIDE1_LARGE_GROUP algo\n");
  1839. benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
  1840. {4, {4, 5, 6, 7}}, {1, {4}}, data_type);
  1841. benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
  1842. {4, {4, 5, 6, 7}}, {1, {7}}, data_type);
  1843. benchmark_impl(param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}},
  1844. {1, {4}}, data_type);
  1845. shapes_and_computation.clear();
  1846. algo_name = "X86_CONV_BIAS_DIRECT_STRIDE1_SMALL_GROUP";
  1847. printf("Benchmark X86_CONV_BIAS_DIRECT_STRIDE1_SMALL_GROUP algo\n");
  1848. bench_case(1, 32, 32, 200, 200, 3, 1);
  1849. bench_case(1, 32, 32, 128, 128, 3, 1);
  1850. bench_case(1, 32, 32, 100, 100, 3, 1);
  1851. bench_case(1, 32, 32, 80, 80, 3, 1);
  1852. benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
  1853. {4, {4, 5, 6, 7}}, {1, {4}}, data_type);
  1854. benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
  1855. {4, {4, 5, 6, 7}}, {1, {7}}, data_type);
  1856. benchmark_impl(param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}},
  1857. {1, {4}}, data_type);
  1858. }
  1859. TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_IM2COL_F32) {
  1860. constexpr size_t RUNS = 50;
  1861. param::ConvBias param;
  1862. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  1863. param.pad_h = 1;
  1864. param.pad_w = 1;
  1865. param.stride_h = 1;
  1866. param.stride_w = 1;
  1867. std::vector<DType> data_type = {dtype::Float32(), dtype::Float32(),
  1868. dtype::Float32(), dtype::Float32()};
  1869. std::vector<std::pair<SmallVector<TensorShape>, float>>
  1870. shapes_and_computation;
  1871. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W,
  1872. size_t FS, size_t group) {
  1873. SmallVector<TensorShape> shapes{{N, IC, H, W},
  1874. {OC / group, IC / group, FS, FS},
  1875. {1, OC, 1, 1},
  1876. {},
  1877. {N, OC, H, W}};
  1878. TensorShape dst{N, OC, H, W};
  1879. float computations =
  1880. ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  1881. dst.total_nr_elems()) *
  1882. 1e-6;
  1883. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  1884. };
  1885. bench_case(1, 32, 32, 200, 200, 3, 1);
  1886. bench_case(1, 32, 32, 200, 200, 3, 1);
  1887. bench_case(1, 32, 32, 128, 128, 3, 1);
  1888. bench_case(1, 32, 32, 128, 128, 3, 1);
  1889. bench_case(1, 32, 32, 100, 100, 3, 1);
  1890. bench_case(1, 32, 32, 100, 100, 3, 1);
  1891. bench_case(1, 32, 32, 80, 80, 3, 1);
  1892. bench_case(1, 32, 32, 80, 80, 3, 1);
  1893. bench_case(1, 64, 32, 7, 7, 3, 1);
  1894. bench_case(1, 64, 64, 7, 7, 3, 1);
  1895. bench_case(1, 64, 128, 7, 7, 3, 1);
  1896. bench_case(1, 64, 256, 7, 7, 3, 1);
  1897. bench_case(1, 64, 512, 7, 7, 3, 1);
  1898. bench_case(1, 64, 1024, 7, 7, 3, 1);
  1899. bench_case(1, 64, 32, 14, 14, 3, 1);
  1900. bench_case(1, 64, 64, 14, 14, 3, 1);
  1901. bench_case(1, 64, 128, 14, 14, 3, 1);
  1902. bench_case(1, 64, 256, 14, 14, 3, 1);
  1903. bench_case(1, 64, 512, 14, 14, 3, 1);
  1904. bench_case(1, 64, 1024, 14, 14, 3, 1);
  1905. bench_case(1, 128, 128, 14, 14, 3, 1);
  1906. bench_case(1, 128, 256, 14, 14, 3, 1);
  1907. bench_case(1, 512, 512, 14, 14, 3, 1);
  1908. bench_case(1, 256, 512, 14, 14, 3, 1);
  1909. bench_case(1, 512, 1024, 14, 14, 3, 1);
  1910. bench_case(1, 1024, 1024, 14, 14, 3, 1);
  1911. std::string algo_name = "IM2COLMATMUL:X86_F32_BLAS:192";
  1912. printf("Benchmark IM2COLMATMUL:X86_F32_BLAS algo\n");
  1913. benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
  1914. {4, {4, 5, 6, 7}}, {1, {4}}, data_type);
  1915. benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
  1916. {4, {4, 5, 6, 7}}, {1, {7}}, data_type);
  1917. benchmark_impl(param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}},
  1918. {1, {4}}, data_type);
  1919. shapes_and_computation.clear();
  1920. }
  1921. TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_IM2COL_F32_single_thread) {
  1922. constexpr size_t RUNS = 50;
  1923. param::ConvBias param;
  1924. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  1925. param.pad_h = 1;
  1926. param.pad_w = 1;
  1927. param.stride_h = 1;
  1928. param.stride_w = 1;
  1929. std::vector<DType> data_type = {dtype::Float32(), dtype::Float32(),
  1930. dtype::Float32(), dtype::Float32()};
  1931. std::vector<std::pair<SmallVector<TensorShape>, float>>
  1932. shapes_and_computation;
  1933. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H,
  1934. size_t W, size_t FS,
  1935. size_t group) {
  1936. SmallVector<TensorShape> shapes{{N, IC, H, W},
  1937. {OC / group, IC / group, FS, FS},
  1938. {1, OC, 1, 1},
  1939. {},
  1940. {N, OC, H, W}};
  1941. TensorShape dst{N, OC, H, W};
  1942. float computations =
  1943. ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  1944. dst.total_nr_elems()) *
  1945. 1e-6;
  1946. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  1947. };
  1948. bench_case(1, 32, 32, 200, 200, 3, 1);
  1949. bench_case(1, 32, 32, 200, 200, 3, 1);
  1950. bench_case(1, 32, 32, 128, 128, 3, 1);
  1951. bench_case(1, 32, 32, 128, 128, 3, 1);
  1952. bench_case(1, 32, 32, 100, 100, 3, 1);
  1953. bench_case(1, 32, 32, 100, 100, 3, 1);
  1954. bench_case(1, 32, 32, 80, 80, 3, 1);
  1955. bench_case(1, 32, 32, 80, 80, 3, 1);
  1956. bench_case(1, 64, 32, 7, 7, 3, 1);
  1957. bench_case(1, 64, 64, 7, 7, 3, 1);
  1958. bench_case(1, 64, 128, 7, 7, 3, 1);
  1959. bench_case(1, 64, 256, 7, 7, 3, 1);
  1960. bench_case(1, 64, 512, 7, 7, 3, 1);
  1961. bench_case(1, 64, 1024, 7, 7, 3, 1);
  1962. bench_case(1, 64, 32, 14, 14, 3, 1);
  1963. bench_case(1, 64, 64, 14, 14, 3, 1);
  1964. bench_case(1, 64, 128, 14, 14, 3, 1);
  1965. bench_case(1, 64, 256, 14, 14, 3, 1);
  1966. bench_case(1, 64, 512, 14, 14, 3, 1);
  1967. bench_case(1, 64, 1024, 14, 14, 3, 1);
  1968. bench_case(1, 128, 128, 14, 14, 3, 1);
  1969. bench_case(1, 128, 256, 14, 14, 3, 1);
  1970. bench_case(1, 512, 512, 14, 14, 3, 1);
  1971. bench_case(1, 256, 512, 14, 14, 3, 1);
  1972. bench_case(1, 512, 1024, 14, 14, 3, 1);
  1973. bench_case(1, 1024, 1024, 14, 14, 3, 1);
  1974. std::string algo_name = "IM2COLMATMUL:X86_F32_MKL_PACKA:192";
  1975. std::string algo_name1 = "IM2COLMATMUL:X86_F32_BLAS:192";
  1976. printf("Benchmark IM2COLMATMUL:X86_F32_BLAS algo\n");
  1977. benchmark_impl_comp(param, shapes_and_computation, algo_name,algo_name1, RUNS,
  1978. {1, {4}}, {1, {4}},data_type);
  1979. benchmark_impl_comp(param, shapes_and_computation, algo_name,algo_name1, RUNS,
  1980. {1, {7}}, {1, {7}},data_type);
  1981. shapes_and_computation.clear();
  1982. }
  1983. TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_IM2COL_INT8X8X32) {
  1984. constexpr size_t RUNS = 50;
  1985. param::ConvBias param;
  1986. param.pad_h = 1;
  1987. param.pad_w = 1;
  1988. param.stride_h = 1;
  1989. param.stride_w = 1;
  1990. std::vector<std::pair<SmallVector<TensorShape>, float>>
  1991. shapes_and_computation;
  1992. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W,
  1993. size_t FS, size_t group) {
  1994. SmallVector<TensorShape> shapes{{N, IC, H, W},
  1995. {OC / group, IC / group, FS, FS},
  1996. {1, OC, 1, 1},
  1997. {},
  1998. {N, OC, H, W}};
  1999. TensorShape dst{N, OC, H, W};
  2000. float computations =
  2001. ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  2002. dst.total_nr_elems()) *
  2003. 1e-6;
  2004. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  2005. };
  2006. bench_case(1, 32, 32, 200, 200, 3, 1);
  2007. bench_case(1, 32, 32, 200, 200, 3, 1);
  2008. bench_case(1, 32, 32, 128, 128, 3, 1);
  2009. bench_case(1, 32, 32, 128, 128, 3, 1);
  2010. bench_case(1, 32, 32, 100, 100, 3, 1);
  2011. bench_case(1, 32, 32, 100, 100, 3, 1);
  2012. bench_case(1, 32, 32, 80, 80, 3, 1);
  2013. bench_case(1, 32, 32, 80, 80, 3, 1);
  2014. bench_case(1, 64, 32, 7, 7, 3, 1);
  2015. bench_case(1, 64, 64, 7, 7, 3, 1);
  2016. bench_case(1, 64, 128, 7, 7, 3, 1);
  2017. bench_case(1, 64, 256, 7, 7, 3, 1);
  2018. bench_case(1, 64, 512, 7, 7, 3, 1);
  2019. bench_case(1, 64, 1024, 7, 7, 3, 1);
  2020. bench_case(1, 64, 32, 14, 14, 3, 1);
  2021. bench_case(1, 64, 64, 14, 14, 3, 1);
  2022. bench_case(1, 64, 128, 14, 14, 3, 1);
  2023. bench_case(1, 64, 256, 14, 14, 3, 1);
  2024. bench_case(1, 64, 512, 14, 14, 3, 1);
  2025. bench_case(1, 64, 1024, 14, 14, 3, 1);
  2026. bench_case(1, 128, 128, 14, 14, 3, 1);
  2027. bench_case(1, 128, 256, 14, 14, 3, 1);
  2028. bench_case(1, 512, 512, 14, 14, 3, 1);
  2029. bench_case(1, 256, 512, 14, 14, 3, 1);
  2030. bench_case(1, 512, 1024, 14, 14, 3, 1);
  2031. bench_case(1, 1024, 1024, 14, 14, 3, 1);
  2032. std::vector<DType> data_type = {dtype::Int8(), dtype::Int8(),
  2033. dtype::Int32(), dtype::Int32()};
  2034. std::string algo_name = "IM2COLMATMUL:X86_INT8X8X32_AVX2_4X16X2:192";
  2035. // std::string algo_name = "IM2COLMATMUL:X86_INT8X8X32_AVX2_2X4X16";
  2036. // printf("Benchmark IM2COLMATMUL:X86_INT8X8X32_AVX2_4X16X2 algo\n");
  2037. benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
  2038. {4, {4, 5, 6, 7}}, {1, {4}}, data_type);
  2039. benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
  2040. {4, {4, 5, 6, 7}}, {1, {7}}, data_type);
  2041. benchmark_impl(param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}},
  2042. {1, {4}}, data_type);
  2043. shapes_and_computation.clear();
  2044. }
  2045. namespace{
  2046. std::vector<conv_bias::TestArg> get_winograd_benchmark_args(size_t kernel,
  2047. size_t pack_size) {
  2048. std::vector<conv_bias::TestArg> args;
  2049. auto pack = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  2050. size_t p) {
  2051. if (ic % pack_size != 0 || oc % pack_size != 0)
  2052. return;
  2053. if (w + 2 * p < kernel || h + 2 * p < kernel)
  2054. return;
  2055. param::ConvBias param;
  2056. param.mode = param::ConvBias::Mode::CROSS_CORRELATION;
  2057. param.format = param::ConvBias::Format::NCHW88;
  2058. param.sparse = param::ConvBias::Sparse::DENSE;
  2059. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  2060. param.stride_h = 1;
  2061. param.stride_w = 1;
  2062. param.pad_h = p;
  2063. param.pad_w = p;
  2064. args.push_back(conv_bias::TestArg{param,
  2065. TensorShape{1, ic/8, h, w, 8},
  2066. TensorShape{oc/8, ic/8, kernel, kernel, 8, 8},
  2067. {1, oc/8, 1, 1, 8}});
  2068. };
  2069. for (size_t ic : {64, 128, 256}) {
  2070. for (size_t oc : {64,128,256}) {
  2071. pack(oc, ic, 56, 56, kernel, kernel / 2);
  2072. pack(oc, ic, 14, 14, kernel, kernel / 2);
  2073. pack(oc, ic, 28, 28, kernel, kernel / 2);
  2074. }
  2075. }
  2076. //! conv in vgg16
  2077. pack(512, 512, 15, 15, kernel, kernel / 2);
  2078. pack(512, 256, 15, 15, kernel, kernel / 2);
  2079. pack(256, 256, 29, 29, kernel, kernel / 2);
  2080. pack(256, 128, 29, 29, kernel, kernel / 2);
  2081. pack(128, 128, 57, 57, kernel, kernel / 2);
  2082. pack(128, 64, 57, 57, kernel, kernel / 2);
  2083. pack(64, 64, 56, 56, kernel, kernel / 2);
  2084. pack(128, 128, 28, 28, kernel, kernel / 2);
  2085. pack(512, 512, 14, 14, kernel, kernel / 2);
  2086. return args;
  2087. }
  2088. void benchmark_winograd(const char* algo_name, Handle* handle,
  2089. size_t kernel, size_t pack_size) {
  2090. auto&& args = get_winograd_benchmark_args(kernel, pack_size);
  2091. using namespace conv_bias;
  2092. constexpr size_t RUN = 10;
  2093. Benchmarker<ConvBias> benchmark(handle);
  2094. benchmark.set_display(false);
  2095. benchmark.set_times(RUN);
  2096. Benchmarker<ConvBias> benchmark_winograd(handle);
  2097. benchmark_winograd.set_display(false);
  2098. benchmark_winograd.set_times(RUN);
  2099. for (auto&& arg : args) {
  2100. TensorLayout dst_layout;
  2101. auto opr = handle->create_operator<ConvBias>();
  2102. opr->param() = arg.param;
  2103. opr->deduce_layout({arg.src, dtype::Float32()},
  2104. {arg.filter, dtype::Float32()},
  2105. {arg.bias, dtype::Float32()}, {}, dst_layout);
  2106. //! dst.nr_elems * IC * FH * FW * 2
  2107. float computations = dst_layout.total_nr_elems() * arg.filter[1] *
  2108. arg.filter[2] * arg.filter[3] * 2.0 * 8.0 /
  2109. (1024 * 1024 * 1024) * 1e3;
  2110. auto used = benchmark.set_param(arg.param).exec(
  2111. {arg.src, arg.filter, {}, {}, {}}) /
  2112. RUN;
  2113. benchmark_winograd.set_param(arg.param);
  2114. auto used_winograd =
  2115. algo_benchmark<ConvBias>(benchmark_winograd,
  2116. {arg.src, arg.filter, {}, {}, {}},
  2117. algo_name) /
  2118. RUN;
  2119. printf("%s %s: normal: %f ms %f Gflops winograd: %f ms %f GFlops "
  2120. "speedup: "
  2121. "%f\n",
  2122. arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
  2123. used, computations / used, used_winograd,
  2124. computations / used_winograd, used / used_winograd);
  2125. }
  2126. }
  2127. }
//! benchmark winograd F(6,3) with the mk8 8x8 f32 matmul (NCHW88, 3x3 conv)
TEST_F(X86, BENCHMARK_CONVBIAS_WINOGRAD_F63_8x8) {
    benchmark_winograd("WINOGRAD:X86_F32MK8_8X8:8:6:8", handle(), 3, 8);
}
//! benchmark winograd F(2,3) with the mk8 8x8 f32 matmul (NCHW88, 3x3 conv)
TEST_F(X86, BENCHMARK_CONVBIAS_WINOGRAD_F23_8x8) {
    benchmark_winograd("WINOGRAD:X86_F32MK8_8X8:8:2:8", handle(), 3, 8);
}
  2134. #endif
  2135. } // namespace test
  2136. } // namespace megdnn
  2137. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台