You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

conv_bias.cpp 122 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
7277827792780278127822783278427852786278727882789279027912792279327942795279627972798279928002801280228032804280528062807280828092810281128122813281428152816281728182819282028212822282328242825282628272828282928302831283228332834283528362837283828392840284128422843284428452846284728482849285028512852285328542855285628572858285928602861286228632864286528662867286828692870287128722873287428752876287728782879288028812882288328842885288628872888288928902891289228932894289528962897289828992900290129022903290429052906290729082909291029112912291329142915291629172918291929202921292229232924292529262927292829292930293129322933293429352936293729382939294029412942294329442945294629472948294929502951295229532954295529562957295829592960296129622963296429652966296729682969297029712972297329742975297629772978297929802981298229832984298529862987298829892990299129922993299429952996299729982999300030013002300330043005300630073008300930103011301230133014301530163017301830193020302130223023302430253026302730283029
  1. /**
  2. * \file dnn/test/x86/conv_bias.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
  10. * implied.
  11. */
  12. #include "src/x86/utils.h"
  13. #include "test/x86/fixture.h"
  14. #include "megdnn/opr_param_defs.h"
  15. #include "megdnn/oprs.h"
  16. #include "test/common/benchmarker.h"
  17. #include "test/common/checker.h"
  18. #include "test/common/conv_bias.h"
  19. #include "test/common/rng.h"
  20. #include "test/common/tensor.h"
  21. #include "test/common/workspace_wrapper.h"
  22. namespace megdnn {
  23. namespace test {
  24. TEST_F(X86, CONV_BIAS_FORWARD) {
  25. using namespace conv_bias;
  26. std::vector<TestArg> args = get_args();
  27. Checker<ConvBiasForward> checker(handle());
  28. NormalRNG default_rng;
  29. ConstValue const_val;
  30. for (auto&& arg : args) {
  31. checker.set_dtype(0, dtype::Float32())
  32. .set_dtype(1, dtype::Float32())
  33. .set_dtype(2, dtype::Float32())
  34. .set_rng(0, &default_rng)
  35. .set_rng(1, &default_rng)
  36. .set_rng(2, &default_rng)
  37. .set_epsilon(1e-3)
  38. .set_param(arg.param)
  39. .execs({arg.src, arg.filter, arg.bias, {}, {}});
  40. }
  41. }
  42. static void avx2_chanwise_direct_int8x8x32(
  43. Handle* handle, uint32_t stride, const char* algo) {
  44. using namespace conv_bias;
  45. std::vector<TestArg> args;
  46. auto run = [&](size_t ic, size_t w, size_t h, size_t kernel, size_t p,
  47. NonlineMode nonline_mode) {
  48. if (w + 2 * p < kernel || h + 2 * p < kernel)
  49. return;
  50. param::ConvBias param;
  51. param.stride_h = stride;
  52. param.stride_w = stride;
  53. param.pad_h = p;
  54. param.pad_w = p;
  55. param.nonlineMode = nonline_mode;
  56. param.sparse = param::ConvBias::Sparse::GROUP;
  57. //! no bias
  58. args.emplace_back(
  59. param, TensorShape{2, ic, h, w}, TensorShape{ic, 1, 1, kernel, kernel},
  60. TensorShape{});
  61. //! bias channel
  62. args.emplace_back(
  63. param, TensorShape{2, ic, h, w}, TensorShape{ic, 1, 1, kernel, kernel},
  64. TensorShape{1, ic, 1, 1});
  65. };
  66. for (size_t kernel : {2, 3, 5, 7})
  67. for (size_t pad : {0, 1})
  68. for (size_t ic : {1, 5, 17, 20})
  69. for (size_t h : {7, 16, 38, 40})
  70. for (size_t w : {16, 25, 40, 55})
  71. for (NonlineMode nonline_mode : {NonlineMode::IDENTITY})
  72. run(ic, w, h, kernel, pad, nonline_mode);
  73. Checker<ConvBias> checker(handle);
  74. UniformIntRNG rng{-50, 50};
  75. checker.set_dtype(0, dtype::Int8())
  76. .set_dtype(1, dtype::Int8())
  77. .set_dtype(2, dtype::Int32())
  78. .set_dtype(4, dtype::Int32())
  79. .set_rng(0, &rng)
  80. .set_rng(1, &rng)
  81. .set_rng(2, &rng)
  82. .set_epsilon(1e-3);
  83. checker.set_before_exec_callback(
  84. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(algo));
  85. for (auto&& arg : args) {
  86. checker.set_param(arg.param).exec({arg.src, arg.filter, arg.bias, {}, {}});
  87. }
  88. }
  89. TEST_F(X86_MULTI_THREADS, AVX2_CHANWISE_DIRECT_STRIDE1_INT8x8x32) {
  90. avx2_chanwise_direct_int8x8x32(
  91. handle(), 1, "X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1");
  92. }
  93. TEST_F(X86_MULTI_THREADS, AVX2_CHANWISE_DIRECT_STRIDE2_INT8x8x32) {
  94. avx2_chanwise_direct_int8x8x32(
  95. handle(), 2, "X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE2");
  96. }
  97. static void avx2_chanwise_direct_quantizeds32(
  98. Handle* handle, uint32_t stride, const char* algo) {
  99. using namespace conv_bias;
  100. std::vector<TestArg> args;
  101. auto run = [&](size_t ic, size_t w, size_t h, size_t kernel, size_t p,
  102. NonlineMode nonline_mode) {
  103. if (w + 2 * p < kernel || h + 2 * p < kernel)
  104. return;
  105. param::ConvBias param;
  106. param.stride_h = stride;
  107. param.stride_w = stride;
  108. param.pad_h = p;
  109. param.pad_w = p;
  110. param.nonlineMode = nonline_mode;
  111. param.sparse = param::ConvBias::Sparse::GROUP;
  112. //! no bias
  113. args.emplace_back(
  114. param, TensorShape{2, ic, h, w}, TensorShape{ic, 1, 1, kernel, kernel},
  115. TensorShape{});
  116. //! bias channel
  117. args.emplace_back(
  118. param, TensorShape{2, ic, h, w}, TensorShape{ic, 1, 1, kernel, kernel},
  119. TensorShape{1, ic, 1, 1});
  120. };
  121. for (size_t kernel : {2, 3, 5, 7})
  122. for (size_t pad : {0, 1})
  123. for (size_t ic : {1, 3, 5, 7, 17})
  124. for (size_t h : {10, 17, 25, 30})
  125. for (size_t w : {19, 28, 58, 168})
  126. for (NonlineMode nonline_mode : {NonlineMode::IDENTITY})
  127. run(ic, w, h, kernel, pad, nonline_mode);
  128. Checker<ConvBias> checker(handle);
  129. UniformIntRNG rng{-50, 50};
  130. checker.set_dtype(0, dtype::QuantizedS8(2.5f))
  131. .set_dtype(1, dtype::QuantizedS8(2.5f))
  132. .set_dtype(2, dtype::QuantizedS32(6.25f))
  133. .set_dtype(4, {})
  134. .set_rng(0, &rng)
  135. .set_rng(1, &rng)
  136. .set_rng(2, &rng)
  137. .set_epsilon(1e-3);
  138. checker.set_before_exec_callback(
  139. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(algo));
  140. for (auto&& arg : args) {
  141. checker.set_param(arg.param).exec({arg.src, arg.filter, arg.bias, {}, {}});
  142. }
  143. }
  144. TEST_F(X86_MULTI_THREADS, AVX2_CHANWISE_DIRECT_STRIDE1_QuantizedS32) {
  145. avx2_chanwise_direct_quantizeds32(
  146. handle(), 1, "X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1");
  147. }
  148. TEST_F(X86_MULTI_THREADS, AVX2_CHANWISE_DIRECT_STRIDE2_QuantizedS32) {
  149. avx2_chanwise_direct_quantizeds32(
  150. handle(), 2, "X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE2");
  151. }
  152. static void avx2_chanwise_direct_quantizeds8x8x8(
  153. Handle* handle, uint32_t stride, const char* algo) {
  154. using namespace conv_bias;
  155. std::vector<TestArg> args;
  156. auto run = [&](size_t ic, size_t w, size_t h, size_t kernel, size_t p,
  157. NonlineMode nonline_mode) {
  158. if (w + 2 * p < kernel || h + 2 * p < kernel)
  159. return;
  160. param::ConvBias param;
  161. param.stride_h = stride;
  162. param.stride_w = stride;
  163. param.pad_h = p;
  164. param.pad_w = p;
  165. param.nonlineMode = nonline_mode;
  166. param.sparse = param::ConvBias::Sparse::GROUP;
  167. //! no bias
  168. args.emplace_back(
  169. param, TensorShape{2, ic, h, w}, TensorShape{ic, 1, 1, kernel, kernel},
  170. TensorShape{});
  171. //! bias channel
  172. args.emplace_back(
  173. param, TensorShape{2, ic, h, w}, TensorShape{ic, 1, 1, kernel, kernel},
  174. TensorShape{1, ic, 1, 1});
  175. };
  176. for (size_t kernel : {2, 3, 5, 7})
  177. for (size_t pad : {0, 1})
  178. for (size_t ic : {1, 3, 5, 7, 17})
  179. for (size_t h : {10, 15, 17, 30})
  180. for (size_t w : {19, 28, 58, 168})
  181. for (NonlineMode nonline_mode :
  182. {NonlineMode::IDENTITY, NonlineMode::H_SWISH,
  183. NonlineMode::RELU})
  184. run(ic, w, h, kernel, pad, nonline_mode);
  185. Checker<ConvBias> checker(handle);
  186. UniformIntRNG rng{-50, 50};
  187. checker.set_dtype(0, dtype::QuantizedS8(2.5f))
  188. .set_dtype(1, dtype::QuantizedS8(2.5f))
  189. .set_dtype(2, dtype::QuantizedS32(6.25f))
  190. .set_dtype(4, dtype::QuantizedS8(60.25f))
  191. .set_rng(0, &rng)
  192. .set_rng(1, &rng)
  193. .set_rng(2, &rng)
  194. .set_epsilon(1e-3);
  195. checker.set_before_exec_callback(
  196. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(algo));
  197. for (auto&& arg : args) {
  198. checker.set_param(arg.param).exec({arg.src, arg.filter, arg.bias, {}, {}});
  199. }
  200. }
  201. TEST_F(X86_MULTI_THREADS, AVX2_CHANWISE_DIRECT_STRIDE1_QuantizedS8x8x8) {
  202. avx2_chanwise_direct_quantizeds8x8x8(
  203. handle(), 1, "X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1");
  204. }
  205. TEST_F(X86_MULTI_THREADS, AVX2_CHANWISE_DIRECT_STRIDE2_QuantizedS8x8x8) {
  206. avx2_chanwise_direct_quantizeds8x8x8(
  207. handle(), 2, "X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE2");
  208. }
  209. TEST_F(X86_MULTI_THREADS, AVX2_CONV_BIAS_DIRECT_STRIDE1_INT8x8x32) {
  210. using namespace conv_bias;
  211. std::vector<TestArg> args;
  212. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, size_t p,
  213. NonlineMode nonline_mode) {
  214. if (w + 2 * p < kernel || h + 2 * p < kernel)
  215. return;
  216. param::ConvBias param;
  217. param.stride_h = 1;
  218. param.stride_w = 1;
  219. param.pad_h = p;
  220. param.pad_w = p;
  221. param.nonlineMode = nonline_mode;
  222. param.sparse = param::ConvBias::Sparse::DENSE;
  223. //! no bias
  224. args.emplace_back(
  225. param, TensorShape{2, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
  226. TensorShape{});
  227. param.sparse = param::ConvBias::Sparse::GROUP;
  228. //! no bias
  229. args.emplace_back(
  230. param, TensorShape{2, 2 * ic, h, w},
  231. TensorShape{2, oc / 2, ic, kernel, kernel}, TensorShape{});
  232. };
  233. for (size_t kernel : {2, 3, 5, 7})
  234. for (size_t pad : {0, 1})
  235. for (size_t oc : {4, 8, 13, 16, 24})
  236. for (size_t ic : {2, 3, 7, 10})
  237. for (size_t h : {10, 11})
  238. for (size_t w : {8, 10})
  239. for (NonlineMode nonline_mode : {NonlineMode::IDENTITY})
  240. run(oc, ic, w, h, kernel, pad, nonline_mode);
  241. Checker<ConvBias> checker(handle());
  242. UniformIntRNG rng{-50, 50};
  243. checker.set_dtype(0, dtype::Int8())
  244. .set_dtype(1, dtype::Int8())
  245. .set_dtype(2, dtype::Int32())
  246. .set_dtype(4, dtype::Int32())
  247. .set_rng(0, &rng)
  248. .set_rng(1, &rng)
  249. .set_rng(2, &rng)
  250. .set_epsilon(1e-3);
  251. checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  252. "X86_CONV_BIAS_DIRECT_AVX2_INT8_STRIDE1"));
  253. for (auto&& arg : args) {
  254. checker.set_param(arg.param).exec({arg.src, arg.filter, arg.bias, {}, {}});
  255. }
  256. }
  257. TEST_F(X86_MULTI_THREADS, AVX2_CONV_BIAS_DIRECT_STRIDE1_QuantizedS32) {
  258. using namespace conv_bias;
  259. std::vector<TestArg> args;
  260. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, size_t p,
  261. NonlineMode nonline_mode) {
  262. if (w + 2 * p < kernel || h + 2 * p < kernel)
  263. return;
  264. param::ConvBias param;
  265. param.stride_h = 1;
  266. param.stride_w = 1;
  267. param.pad_h = p;
  268. param.pad_w = p;
  269. param.nonlineMode = nonline_mode;
  270. param.sparse = param::ConvBias::Sparse::DENSE;
  271. //! no bias
  272. args.emplace_back(
  273. param, TensorShape{2, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
  274. TensorShape{});
  275. param.sparse = param::ConvBias::Sparse::GROUP;
  276. //! no bias
  277. args.emplace_back(
  278. param, TensorShape{2, 2 * ic, h, w},
  279. TensorShape{2, oc / 2, ic, kernel, kernel}, TensorShape{});
  280. };
  281. for (size_t kernel : {2, 3, 5, 7})
  282. for (size_t pad : {0, 1})
  283. for (size_t oc : {4, 8, 13, 16, 24})
  284. for (size_t ic : {2, 3, 7, 10})
  285. for (size_t h : {10, 11})
  286. for (size_t w : {8, 10})
  287. for (NonlineMode nonline_mode : {NonlineMode::IDENTITY})
  288. run(oc, ic, w, h, kernel, pad, nonline_mode);
  289. Checker<ConvBias> checker(handle());
  290. UniformIntRNG rng{-50, 50};
  291. checker.set_dtype(0, dtype::QuantizedS8(2.5f))
  292. .set_dtype(1, dtype::QuantizedS8(2.5f))
  293. .set_dtype(2, dtype::QuantizedS32(6.25f))
  294. .set_dtype(4, {})
  295. .set_rng(0, &rng)
  296. .set_rng(1, &rng)
  297. .set_rng(2, &rng)
  298. .set_epsilon(1e-3);
  299. checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  300. "X86_CONV_BIAS_DIRECT_AVX2_INT8_STRIDE1"));
  301. for (auto&& arg : args) {
  302. checker.set_param(arg.param).exec({arg.src, arg.filter, arg.bias, {}, {}});
  303. }
  304. }
  305. TEST_F(X86_MULTI_THREADS, AVX2_CONV_BIAS_DIRECT_STRIDE1_S8S8S8) {
  306. using namespace conv_bias;
  307. std::vector<TestArg> args;
  308. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, size_t p,
  309. NonlineMode nonline_mode) {
  310. if (w + 2 * p < kernel || h + 2 * p < kernel)
  311. return;
  312. param::ConvBias param;
  313. param.stride_h = 1;
  314. param.stride_w = 1;
  315. param.pad_h = p;
  316. param.pad_w = p;
  317. param.nonlineMode = nonline_mode;
  318. param.sparse = param::ConvBias::Sparse::DENSE;
  319. //! no bias
  320. args.emplace_back(
  321. param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
  322. TensorShape{});
  323. //! bias channel
  324. args.emplace_back(
  325. param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
  326. TensorShape{1, oc, 1, 1});
  327. param.sparse = param::ConvBias::Sparse::GROUP;
  328. //! no bias
  329. args.emplace_back(
  330. param, TensorShape{2, 2 * ic, h, w},
  331. TensorShape{2, oc / 2, ic, kernel, kernel}, TensorShape{});
  332. //! bias channel
  333. args.emplace_back(
  334. param, TensorShape{2, 2 * ic, h, w},
  335. TensorShape{2, oc / 2, ic, kernel, kernel}, TensorShape{1, oc, 1, 1});
  336. };
  337. for (size_t kernel : {2, 3, 5, 7})
  338. for (size_t pad : {0, 1})
  339. for (size_t oc : {4, 8, 14, 16, 24})
  340. for (size_t ic : {2, 3, 7, 10})
  341. for (size_t h : {10, 11})
  342. for (size_t w : {8, 10})
  343. for (NonlineMode nonline_mode :
  344. {NonlineMode::IDENTITY, NonlineMode::RELU,
  345. NonlineMode::H_SWISH})
  346. run(oc, ic, w, h, kernel, pad, nonline_mode);
  347. Checker<ConvBias> checker(handle());
  348. UniformIntRNG rng{-50, 50};
  349. checker.set_dtype(0, dtype::QuantizedS8(2.5f))
  350. .set_dtype(1, dtype::QuantizedS8(2.5f))
  351. .set_dtype(2, dtype::QuantizedS32(6.25f))
  352. .set_dtype(4, dtype::QuantizedS8(60.25f))
  353. .set_rng(0, &rng)
  354. .set_rng(1, &rng)
  355. .set_rng(2, &rng)
  356. .set_epsilon(1e-3);
  357. checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  358. "X86_CONV_BIAS_DIRECT_AVX2_INT8_STRIDE1"));
  359. for (auto&& arg : args) {
  360. checker.set_param(arg.param).exec({arg.src, arg.filter, arg.bias, {}, {}});
  361. }
  362. }
  363. TEST_F(X86_MULTI_THREADS, AVX2_CONV_BIAS_DIRECT_STRIDE2_INT8x8x32) {
  364. using namespace conv_bias;
  365. std::vector<TestArg> args;
  366. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, size_t p,
  367. NonlineMode nonline_mode) {
  368. if (w + 2 * p < kernel || h + 2 * p < kernel)
  369. return;
  370. param::ConvBias param;
  371. param.stride_h = 2;
  372. param.stride_w = 2;
  373. param.pad_h = p;
  374. param.pad_w = p;
  375. param.nonlineMode = nonline_mode;
  376. param.sparse = param::ConvBias::Sparse::DENSE;
  377. //! no bias
  378. args.emplace_back(
  379. param, TensorShape{2, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
  380. TensorShape{});
  381. param.sparse = param::ConvBias::Sparse::GROUP;
  382. //! no bias
  383. args.emplace_back(
  384. param, TensorShape{2, 2 * ic, h, w},
  385. TensorShape{2, oc / 2, ic, kernel, kernel}, TensorShape{});
  386. };
  387. for (size_t kernel : {2, 3, 5, 7})
  388. for (size_t pad : {0, 1, 2, 5})
  389. for (size_t oc : {4, 8, 13, 16, 24})
  390. for (size_t ic : {2, 3, 7, 10})
  391. for (size_t h : {10, 11})
  392. for (size_t w : {8, 10, 20})
  393. for (NonlineMode nonline_mode : {NonlineMode::IDENTITY})
  394. run(oc, ic, w, h, kernel, pad, nonline_mode);
  395. Checker<ConvBias> checker(handle());
  396. UniformIntRNG rng{-50, 50};
  397. checker.set_dtype(0, dtype::Int8())
  398. .set_dtype(1, dtype::Int8())
  399. .set_dtype(2, dtype::Int32())
  400. .set_dtype(4, dtype::Int32())
  401. .set_rng(0, &rng)
  402. .set_rng(1, &rng)
  403. .set_rng(2, &rng)
  404. .set_epsilon(1e-3);
  405. checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  406. "X86_CONV_BIAS_DIRECT_AVX2_INT8_STRIDE2"));
  407. for (auto&& arg : args) {
  408. checker.set_param(arg.param).exec({arg.src, arg.filter, arg.bias, {}, {}});
  409. }
  410. }
  411. TEST_F(X86_MULTI_THREADS, AVX2_CONV_BIAS_DIRECT_STRIDE2_QuantizedS32) {
  412. using namespace conv_bias;
  413. std::vector<TestArg> args;
  414. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, size_t p,
  415. NonlineMode nonline_mode) {
  416. if (w + 2 * p < kernel || h + 2 * p < kernel)
  417. return;
  418. param::ConvBias param;
  419. param.stride_h = 2;
  420. param.stride_w = 2;
  421. param.pad_h = p;
  422. param.pad_w = p;
  423. param.nonlineMode = nonline_mode;
  424. param.sparse = param::ConvBias::Sparse::DENSE;
  425. //! no bias
  426. args.emplace_back(
  427. param, TensorShape{2, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
  428. TensorShape{});
  429. param.sparse = param::ConvBias::Sparse::GROUP;
  430. //! no bias
  431. args.emplace_back(
  432. param, TensorShape{2, 2 * ic, h, w},
  433. TensorShape{2, oc / 2, ic, kernel, kernel}, TensorShape{});
  434. };
  435. for (size_t kernel : {2, 3, 5, 7})
  436. for (size_t pad : {0, 1, 3, 5})
  437. for (size_t oc : {4, 8, 13, 16, 24})
  438. for (size_t ic : {2, 3, 7, 10})
  439. for (size_t h : {10, 11})
  440. for (size_t w : {8, 10, 19})
  441. for (NonlineMode nonline_mode : {NonlineMode::IDENTITY})
  442. run(oc, ic, w, h, kernel, pad, nonline_mode);
  443. Checker<ConvBias> checker(handle());
  444. UniformIntRNG rng{-50, 50};
  445. checker.set_dtype(0, dtype::QuantizedS8(2.5f))
  446. .set_dtype(1, dtype::QuantizedS8(2.5f))
  447. .set_dtype(2, dtype::QuantizedS32(6.25f))
  448. .set_dtype(4, {})
  449. .set_rng(0, &rng)
  450. .set_rng(1, &rng)
  451. .set_rng(2, &rng)
  452. .set_epsilon(1e-3);
  453. checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  454. "X86_CONV_BIAS_DIRECT_AVX2_INT8_STRIDE2"));
  455. for (auto&& arg : args) {
  456. checker.set_param(arg.param).exec({arg.src, arg.filter, arg.bias, {}, {}});
  457. }
  458. }
  459. TEST_F(X86_MULTI_THREADS, AVX2_CONV_BIAS_DIRECT_STRIDE2_S8S8S8) {
  460. using namespace conv_bias;
  461. std::vector<TestArg> args;
  462. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, size_t p,
  463. NonlineMode nonline_mode) {
  464. if (w + 2 * p < kernel || h + 2 * p < kernel)
  465. return;
  466. param::ConvBias param;
  467. param.stride_h = 2;
  468. param.stride_w = 2;
  469. param.pad_h = p;
  470. param.pad_w = p;
  471. param.nonlineMode = nonline_mode;
  472. param.sparse = param::ConvBias::Sparse::DENSE;
  473. //! no bias
  474. args.emplace_back(
  475. param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
  476. TensorShape{});
  477. //! bias channel
  478. args.emplace_back(
  479. param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
  480. TensorShape{1, oc, 1, 1});
  481. param.sparse = param::ConvBias::Sparse::GROUP;
  482. //! no bias
  483. args.emplace_back(
  484. param, TensorShape{2, 2 * ic, h, w},
  485. TensorShape{2, oc / 2, ic, kernel, kernel}, TensorShape{});
  486. //! bias channel
  487. args.emplace_back(
  488. param, TensorShape{2, 2 * ic, h, w},
  489. TensorShape{2, oc / 2, ic, kernel, kernel}, TensorShape{1, oc, 1, 1});
  490. };
  491. for (size_t kernel : {2, 3, 5, 7})
  492. for (size_t pad : {0, 1, 3, 5})
  493. for (size_t oc : {4, 8, 14, 16, 24})
  494. for (size_t ic : {2, 3, 7, 10})
  495. for (size_t h : {10, 11})
  496. for (size_t w : {8, 10, 18})
  497. for (NonlineMode nonline_mode :
  498. {NonlineMode::IDENTITY, NonlineMode::RELU,
  499. NonlineMode::H_SWISH})
  500. run(oc, ic, w, h, kernel, pad, nonline_mode);
  501. Checker<ConvBias> checker(handle());
  502. UniformIntRNG rng{-50, 50};
  503. checker.set_dtype(0, dtype::QuantizedS8(2.5f))
  504. .set_dtype(1, dtype::QuantizedS8(2.5f))
  505. .set_dtype(2, dtype::QuantizedS32(6.25f))
  506. .set_dtype(4, dtype::QuantizedS8(60.25f))
  507. .set_rng(0, &rng)
  508. .set_rng(1, &rng)
  509. .set_rng(2, &rng)
  510. .set_epsilon(1e-3);
  511. checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  512. "X86_CONV_BIAS_DIRECT_AVX2_INT8_STRIDE2"));
  513. for (auto&& arg : args) {
  514. checker.set_param(arg.param).exec({arg.src, arg.filter, arg.bias, {}, {}});
  515. }
  516. }
  517. TEST_F(X86_MULTI_THREADS, CONV_BIAS_DIRECT_STRIDE1_DENSE) {
  518. using namespace conv_bias;
  519. std::vector<TestArg> args;
  520. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, size_t p,
  521. NonlineMode nonline_mode) {
  522. if (w + 2 * p < kernel || h + 2 * p < kernel)
  523. return;
  524. param::ConvBias param;
  525. param.stride_h = 1;
  526. param.stride_w = 1;
  527. param.pad_h = p;
  528. param.pad_w = p;
  529. param.nonlineMode = nonline_mode;
  530. //! no bias
  531. args.emplace_back(
  532. param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
  533. TensorShape{});
  534. //! bias channel
  535. args.emplace_back(
  536. param, TensorShape{2, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
  537. TensorShape{1, oc, 1, 1});
  538. //! bias
  539. args.emplace_back(
  540. param, TensorShape{2, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
  541. TensorShape{
  542. 2, oc, (h + param.pad_h * 2 - kernel) + 1,
  543. (w + param.pad_w * 2 - kernel) + 1});
  544. };
  545. for (size_t kernel : {1, 2, 3, 4, 5, 6, 7})
  546. for (size_t ic : {1, 4, 8, 16})
  547. for (size_t oc : {1, 4, 8})
  548. for (size_t p : {0, 2})
  549. for (size_t size : {20, 21, 24})
  550. for (NonlineMode nonline_mode :
  551. {NonlineMode::RELU, NonlineMode::SIGMOID,
  552. NonlineMode::H_SWISH, NonlineMode::IDENTITY}) {
  553. run(oc, ic, size, size, kernel, p, nonline_mode);
  554. }
  555. Checker<ConvBias> checker(handle());
  556. UniformIntRNG rng{-50, 50};
  557. checker.set_dtype(0, dtype::Float32())
  558. .set_dtype(1, dtype::Float32())
  559. .set_dtype(2, dtype::Float32())
  560. .set_rng(0, &rng)
  561. .set_rng(1, &rng)
  562. .set_rng(2, &rng);
  563. checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  564. "X86_CONV_BIAS_DIRECT_STRIDE1_LARGE_GROUP"));
  565. for (auto&& arg : args) {
  566. checker.set_param(arg.param).exec({arg.src, arg.filter, arg.bias, {}, {}});
  567. }
  568. }
  569. TEST_F(X86_MULTI_THREADS, CONV_BIAS_DIRECT_STRIDE1_GROUP) {
  570. using namespace conv_bias;
  571. std::vector<TestArg> args;
  572. auto run = [&](size_t group, size_t channel, size_t w, size_t h, size_t kernel,
  573. size_t p, NonlineMode nonline_mode) {
  574. if (w + 2 * p < kernel || h + 2 * p < kernel)
  575. return;
  576. param::ConvBias param;
  577. param.stride_h = 1;
  578. param.stride_w = 1;
  579. param.pad_h = p;
  580. param.pad_w = p;
  581. param.nonlineMode = nonline_mode;
  582. param.sparse = param::ConvBias::Sparse::GROUP;
  583. //! no bias
  584. args.emplace_back(
  585. param, TensorShape{1, channel, h, w},
  586. TensorShape{group, channel / group, channel / group, kernel, kernel},
  587. TensorShape{});
  588. //! bias channel
  589. args.emplace_back(
  590. param, TensorShape{2, channel, h, w},
  591. TensorShape{group, channel / group, channel / group, kernel, kernel},
  592. TensorShape{1, channel, 1, 1});
  593. //! bias
  594. args.emplace_back(
  595. param, TensorShape{2, channel, h, w},
  596. TensorShape{group, channel / group, channel / group, kernel, kernel},
  597. TensorShape{
  598. 2, channel, (h + param.pad_h * 2 - kernel) + 1,
  599. (w + param.pad_w * 2 - kernel) + 1});
  600. };
  601. for (size_t kernel : {1, 2, 3, 4, 5, 6, 7})
  602. for (size_t channel : {4, 8, 16})
  603. for (size_t group : {1, 2, 4})
  604. for (size_t p : {0, 2})
  605. for (size_t size : {20, 21, 24})
  606. for (NonlineMode nonline_mode :
  607. {NonlineMode::RELU, NonlineMode::SIGMOID,
  608. NonlineMode::H_SWISH, NonlineMode::IDENTITY}) {
  609. run(group, channel, size, size, kernel, p, nonline_mode);
  610. }
  611. Checker<ConvBias> checker(handle());
  612. UniformIntRNG rng{-50, 50};
  613. checker.set_dtype(0, dtype::Float32())
  614. .set_dtype(1, dtype::Float32())
  615. .set_dtype(2, dtype::Float32())
  616. .set_rng(0, &rng)
  617. .set_rng(1, &rng)
  618. .set_rng(2, &rng);
  619. checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  620. "X86_CONV_BIAS_DIRECT_STRIDE1_LARGE_GROUP"));
  621. for (auto&& arg : args) {
  622. checker.set_param(arg.param).exec({arg.src, arg.filter, arg.bias, {}, {}});
  623. }
  624. }
  625. TEST_F(X86_MULTI_THREADS, CONV_BIAS_DIRECT_STRIDE2_DENSE) {
  626. using namespace conv_bias;
  627. std::vector<TestArg> args;
  628. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, size_t p,
  629. NonlineMode nonline_mode) {
  630. if (w + 2 * p < kernel || h + 2 * p < kernel)
  631. return;
  632. param::ConvBias param;
  633. param.stride_h = 2;
  634. param.stride_w = 2;
  635. param.pad_h = p;
  636. param.pad_w = p;
  637. param.nonlineMode = nonline_mode;
  638. //! no bias
  639. args.emplace_back(
  640. param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
  641. TensorShape{});
  642. };
  643. for (size_t kernel : {2, 3, 5, 7})
  644. for (size_t ic : {1, 4, 8, 16})
  645. for (size_t oc : {1, 4, 8})
  646. for (size_t p : {0, 2})
  647. for (size_t size : {20, 21, 24})
  648. for (NonlineMode nonline_mode :
  649. {NonlineMode::RELU, NonlineMode::SIGMOID,
  650. NonlineMode::H_SWISH, NonlineMode::IDENTITY}) {
  651. run(oc, ic, size, size, kernel, p, nonline_mode);
  652. }
  653. Checker<ConvBias> checker(handle());
  654. UniformIntRNG rng{-50, 50};
  655. checker.set_dtype(0, dtype::Float32())
  656. .set_dtype(1, dtype::Float32())
  657. .set_dtype(2, dtype::Float32())
  658. .set_rng(0, &rng)
  659. .set_rng(1, &rng)
  660. .set_rng(2, &rng);
  661. checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  662. "X86_CONV_BIAS_DIRECT_STRIDE2_LARGE_GROUP"));
  663. for (auto&& arg : args) {
  664. checker.set_param(arg.param).exec({arg.src, arg.filter, arg.bias, {}, {}});
  665. }
  666. }
  667. TEST_F(X86_MULTI_THREADS, CONV_BIAS_DIRECT_STRIDE2_GROUP) {
  668. using namespace conv_bias;
  669. std::vector<TestArg> args;
  670. auto run = [&](size_t group, size_t channel, size_t w, size_t h, size_t kernel,
  671. size_t p, NonlineMode nonline_mode) {
  672. if (w + 2 * p < kernel || h + 2 * p < kernel)
  673. return;
  674. param::ConvBias param;
  675. param.stride_h = 2;
  676. param.stride_w = 2;
  677. param.pad_h = p;
  678. param.pad_w = p;
  679. param.nonlineMode = nonline_mode;
  680. param.sparse = param::ConvBias::Sparse::GROUP;
  681. //! no bias
  682. args.emplace_back(
  683. param, TensorShape{1, channel, h, w},
  684. TensorShape{group, channel / group, channel / group, kernel, kernel},
  685. TensorShape{});
  686. //! bias channel
  687. args.emplace_back(
  688. param, TensorShape{2, channel, h, w},
  689. TensorShape{group, channel / group, channel / group, kernel, kernel},
  690. TensorShape{1, channel, 1, 1});
  691. //! bias
  692. args.emplace_back(
  693. param, TensorShape{2, channel, h, w},
  694. TensorShape{group, channel / group, channel / group, kernel, kernel},
  695. TensorShape{
  696. 2, channel, (h + param.pad_h * 2 - kernel) / 2 + 1,
  697. (w + param.pad_w * 2 - kernel) / 2 + 1});
  698. };
  699. for (size_t kernel : {2, 3, 5, 7})
  700. for (size_t channel : {4, 8, 16})
  701. for (size_t group : {1, 2, 4})
  702. for (size_t p : {0, 2})
  703. for (size_t size : {20, 21, 24})
  704. for (NonlineMode nonline_mode :
  705. {NonlineMode::RELU, NonlineMode::SIGMOID,
  706. NonlineMode::H_SWISH, NonlineMode::IDENTITY}) {
  707. run(group, channel, size, size, kernel, p, nonline_mode);
  708. }
  709. Checker<ConvBias> checker(handle());
  710. UniformIntRNG rng{-50, 50};
  711. checker.set_dtype(0, dtype::Float32())
  712. .set_dtype(1, dtype::Float32())
  713. .set_dtype(2, dtype::Float32())
  714. .set_rng(0, &rng)
  715. .set_rng(1, &rng)
  716. .set_rng(2, &rng);
  717. checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  718. "X86_CONV_BIAS_DIRECT_STRIDE2_LARGE_GROUP"));
  719. for (auto&& arg : args) {
  720. checker.set_param(arg.param).exec({arg.src, arg.filter, arg.bias, {}, {}});
  721. }
  722. }
  723. TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8X8X32) {
  724. using namespace conv_bias;
  725. std::vector<TestArg> args;
  726. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, size_t p,
  727. NonlineMode nonline_mode) {
  728. if (w + 2 * p < kernel || h + 2 * p < kernel)
  729. return;
  730. param::ConvBias param;
  731. param.stride_h = 1;
  732. param.stride_w = 1;
  733. param.pad_h = p;
  734. param.pad_w = p;
  735. param.nonlineMode = nonline_mode;
  736. //! no bias
  737. args.emplace_back(
  738. param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
  739. TensorShape{});
  740. args.emplace_back(
  741. param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
  742. TensorShape{1, oc, 1, 1});
  743. args.emplace_back(
  744. param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
  745. TensorShape{1, oc, (h + 2 * p - kernel) + 1, (h + 2 * p - kernel) + 1});
  746. };
  747. for (size_t kernel : {2, 3, 4, 5, 6, 7})
  748. for (size_t ic : {1, 4, 8, 16})
  749. for (size_t oc : {1, 4, 8})
  750. for (size_t p : {0, 2})
  751. for (size_t size : {20, 21, 24})
  752. for (NonlineMode nonline_mode : {NonlineMode::IDENTITY}) {
  753. run(oc, ic, size, size, kernel, p, nonline_mode);
  754. }
  755. //! test OC block
  756. run(2046, 1, 8, 8, 2, 0, NonlineMode::IDENTITY);
  757. Checker<ConvBias> checker(handle());
  758. UniformIntRNG rng{-50, 50};
  759. #define cb(algo_name) \
  760. checker.set_before_exec_callback( \
  761. conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name)); \
  762. checker.set_dtype(0, dtype::Int8()); \
  763. checker.set_dtype(1, dtype::Int8()); \
  764. checker.set_dtype(2, dtype::Int32()); \
  765. checker.set_dtype(4, dtype::Int32()); \
  766. for (auto&& arg : args) { \
  767. checker.set_param(arg.param).execs({arg.src, arg.filter, {}, {}, {}}); \
  768. } \
  769. for (auto&& arg : args) { \
  770. checker.set_dtype(0, dtype::QuantizedS8(2.5f)) \
  771. .set_dtype(1, dtype::QuantizedS8(2.5f)) \
  772. .set_dtype(2, dtype::QuantizedS32(6.25f)) \
  773. .set_dtype(4, {}) \
  774. .set_rng(0, &rng) \
  775. .set_rng(1, &rng) \
  776. .set_rng(2, &rng) \
  777. .set_param(arg.param) \
  778. .execs({arg.src, arg.filter, {}, {}, {}}); \
  779. }
  780. #define cb2(algo_name) \
  781. checker.set_before_exec_callback( \
  782. conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name)); \
  783. checker.set_dtype(0, dtype::Int8()); \
  784. checker.set_dtype(1, dtype::Int8()); \
  785. checker.set_dtype(2, dtype::Int16()); \
  786. checker.set_dtype(4, dtype::Int16()); \
  787. for (auto&& arg : args) { \
  788. checker.set_param(arg.param).execs({arg.src, arg.filter, {}, {}, {}}); \
  789. }
  790. #if MEGDNN_X86_WITH_MKL_DNN
  791. if (megdnn::x86::is_supported(x86::SIMDType::VNNI)) {
  792. cb("IM2COLMATMUL:X86_INT8X8X32_MKLDNN");
  793. }
  794. #endif
  795. #if MEGDNN_X86_WITH_VNNI
  796. if (megdnn::x86::is_supported(x86::SIMDType::VNNI)) {
  797. cb("IM2COLMATMUL:X86_INT8X8X32_VNNI");
  798. }
  799. #endif
  800. if (megdnn::x86::is_supported(x86::SIMDType::AVX2)) {
  801. cb("IM2COLMATMUL:X86_INT8X8X32_AVX2_2X4X16");
  802. cb("IM2COLMATMUL:X86_INT8X8X32_AVX2_4X16X2");
  803. cb2("IM2COLMATMUL:X86_INT8X8X16_AVX2");
  804. }
  805. if (::megdnn::x86::is_supported(::megdnn::x86::SIMDType::SSE4_2)) {
  806. cb("IM2COLMATMUL:X86_INT8X8X32_SSE_4X8X2");
  807. cb2("IM2COLMATMUL:X86_INT8X8X16_SSE");
  808. }
  809. #undef cb
  810. #undef cb2
  811. }
//! Same int8x8x32 im2col coverage as above, but driven through the
//! weight-preprocess proxy so each algorithm's filter-preprocess path runs.
TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8X8X32_FILTER_PREPROCESS) {
    using namespace conv_bias;
    std::vector<TestArg> args;
    //! append one bias-less dense case; shapes where the padded input is
    //! smaller than the kernel are skipped
    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, size_t p,
                   NonlineMode nonline_mode) {
        if (w + 2 * p < kernel || h + 2 * p < kernel)
            return;
        param::ConvBias param;
        param.stride_h = 1;
        param.stride_w = 1;
        param.pad_h = p;
        param.pad_w = p;
        param.nonlineMode = nonline_mode;
        //! no bias
        args.emplace_back(
                param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
                TensorShape{});
    };
    for (size_t kernel : {2, 3, 4, 5, 6, 7})
        for (size_t ic : {1, 4, 8, 16})
            for (size_t oc : {1, 4, 8})
                for (size_t p : {0, 2})
                    for (size_t size : {20, 21, 24})
                        for (NonlineMode nonline_mode : {NonlineMode::IDENTITY}) {
                            run(oc, ic, size, size, kernel, p, nonline_mode);
                        }
    //! test OC block
    run(2046, 1, 8, 8, 2, 0, NonlineMode::IDENTITY);
    Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
            handle());
    UniformIntRNG rng{-50, 50};
//! run all cases as plain int8x8x32, then again as quantized-s8, for one algo
//! (comments must stay outside the macro body: they would swallow the
//! line-continuation backslashes)
#define cb(algo_name)                                                          \
    checker.set_before_exec_callback(                                          \
            conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name));              \
    checker.set_dtype(0, dtype::Int8());                                       \
    checker.set_dtype(1, dtype::Int8());                                       \
    checker.set_dtype(2, dtype::Int32());                                      \
    checker.set_dtype(4, dtype::Int32());                                      \
    for (auto&& arg : args) {                                                  \
        checker.set_param(arg.param).execs({arg.src, arg.filter, {}, {}, {}}); \
    }                                                                          \
    for (auto&& arg : args) {                                                  \
        checker.set_dtype(0, dtype::QuantizedS8(2.5f))                         \
                .set_dtype(1, dtype::QuantizedS8(2.5f))                        \
                .set_dtype(2, dtype::QuantizedS32(6.25f))                      \
                .set_dtype(4, {})                                              \
                .set_rng(0, &rng)                                              \
                .set_rng(1, &rng)                                              \
                .set_rng(2, &rng)                                              \
                .set_param(arg.param)                                          \
                .execs({arg.src, arg.filter, {}, {}, {}});                     \
    }
//! int8x8x16 variant of cb
#define cb2(algo_name)                                                         \
    checker.set_before_exec_callback(                                          \
            conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name));              \
    checker.set_dtype(0, dtype::Int8());                                       \
    checker.set_dtype(1, dtype::Int8());                                       \
    checker.set_dtype(2, dtype::Int16());                                      \
    checker.set_dtype(4, dtype::Int16());                                      \
    for (auto&& arg : args) {                                                  \
        checker.set_param(arg.param).execs({arg.src, arg.filter, {}, {}, {}}); \
    }
#if MEGDNN_X86_WITH_MKL_DNN
    if (megdnn::x86::is_supported(x86::SIMDType::VNNI)) {
        cb("IM2COLMATMUL");
    }
#endif
#if MEGDNN_X86_WITH_VNNI
    if (megdnn::x86::is_supported(x86::SIMDType::VNNI)) {
        cb("IM2COLMATMUL:X86_INT8X8X32_VNNI");
    }
#endif
    if (megdnn::x86::is_supported(x86::SIMDType::AVX2)) {
        cb("IM2COLMATMUL:X86_INT8X8X32_AVX2_2X4X16");
        cb("IM2COLMATMUL:X86_INT8X8X32_AVX2_4X16X2");
        cb2("IM2COLMATMUL:X86_INT8X8X16_AVX2");
    }
    if (::megdnn::x86::is_supported(::megdnn::x86::SIMDType::SSE4_2)) {
        cb("IM2COLMATMUL:X86_INT8X8X32_SSE_4X8X2");
        cb2("IM2COLMATMUL:X86_INT8X8X16_SSE");
    }
#undef cb
#undef cb2
}
  896. TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP32) {
  897. using namespace conv_bias;
  898. std::vector<TestArg> args;
  899. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, size_t p,
  900. NonlineMode nonline_mode) {
  901. if (w + 2 * p < kernel || h + 2 * p < kernel)
  902. return;
  903. param::ConvBias param;
  904. param.stride_h = 1;
  905. param.stride_w = 1;
  906. param.pad_h = p;
  907. param.pad_w = p;
  908. param.nonlineMode = nonline_mode;
  909. //! no bias
  910. args.emplace_back(
  911. param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
  912. TensorShape{});
  913. args.emplace_back(
  914. param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
  915. TensorShape{1, oc, 1, 1});
  916. args.emplace_back(
  917. param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
  918. TensorShape{
  919. 1, oc, (h + 2 * p - kernel) / param.stride_h + 1,
  920. (w + 2 * p - kernel) / param.stride_w + 1});
  921. };
  922. for (size_t kernel : {2, 3, 4, 5, 6, 7})
  923. for (size_t ic : {1, 4, 8, 16})
  924. for (size_t oc : {1, 4, 8, 16, 300})
  925. for (size_t p : {0, 2})
  926. for (size_t size : {8, 24})
  927. for (NonlineMode nonline_mode :
  928. {NonlineMode::IDENTITY, NonlineMode::RELU}) {
  929. run(oc, ic, size, size, kernel, p, nonline_mode);
  930. }
  931. run(2046, 8, 20, 20, 3, 1, NonlineMode::IDENTITY);
  932. Checker<ConvBias> checker(handle());
  933. #define cb(algo_name) \
  934. checker.set_before_exec_callback( \
  935. conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name)); \
  936. for (auto&& arg : args) { \
  937. checker.set_param(arg.param).execs({arg.src, arg.filter, arg.bias, {}, {}}); \
  938. }
  939. #if MEGDNN_X86_WITH_MKL || MEGDNN_X86_WITH_OPENBLAS
  940. cb("IM2COLMATMUL:X86_F32_BLAS");
  941. #endif
  942. #undef cb
  943. }
  944. #if MEGDNN_X86_WITH_MKL || MEGDNN_X86_WITH_OPENBLAS
  945. TEST_F(X86, CONV_BIAS_IM2COLMATMUL_FP32) {
  946. using namespace conv_bias;
  947. std::vector<TestArg> args;
  948. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, size_t p,
  949. NonlineMode nonline_mode) {
  950. if (w + 2 * p < kernel || h + 2 * p < kernel)
  951. return;
  952. param::ConvBias param;
  953. param.stride_h = 1;
  954. param.stride_w = 1;
  955. param.pad_h = p;
  956. param.pad_w = p;
  957. param.nonlineMode = nonline_mode;
  958. //! no bias
  959. args.emplace_back(
  960. param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
  961. TensorShape{});
  962. args.emplace_back(
  963. param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
  964. TensorShape{1, oc, 1, 1});
  965. args.emplace_back(
  966. param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
  967. TensorShape{
  968. 1, oc, (h + 2 * p - kernel) / param.stride_h + 1,
  969. (w + 2 * p - kernel) / param.stride_w + 1});
  970. };
  971. for (size_t kernel : {2, 3, 4, 5, 6, 7})
  972. for (size_t ic : {1, 4, 8, 16})
  973. for (size_t oc : {1, 4, 8, 16, 300})
  974. for (size_t p : {0, 2})
  975. for (size_t size : {8, 24})
  976. for (NonlineMode nonline_mode :
  977. {NonlineMode::IDENTITY, NonlineMode::RELU}) {
  978. run(oc, ic, size, size, kernel, p, nonline_mode);
  979. }
  980. run(2046, 8, 20, 20, 3, 1, NonlineMode::IDENTITY);
  981. Checker<ConvBias> checker(handle());
  982. #define cb(algo_name) \
  983. checker.set_before_exec_callback( \
  984. conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name)); \
  985. for (auto&& arg : args) { \
  986. checker.set_param(arg.param).execs({arg.src, arg.filter, arg.bias, {}, {}}); \
  987. }
  988. cb("IM2COLMATMUL:X86_F32_BLAS");
  989. #undef cb
  990. }
  991. TEST_F(X86, CONV_BIAS_IM2COLMATMUL_FP32_NOPACK_PREPROCESS) {
  992. using namespace conv_bias;
  993. std::vector<TestArg> args;
  994. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, size_t p,
  995. NonlineMode nonline_mode) {
  996. if (w + 2 * p < kernel || h + 2 * p < kernel)
  997. return;
  998. param::ConvBias param;
  999. param.stride_h = 1;
  1000. param.stride_w = 1;
  1001. param.pad_h = p;
  1002. param.pad_w = p;
  1003. param.nonlineMode = nonline_mode;
  1004. //! no bias
  1005. args.emplace_back(
  1006. param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
  1007. TensorShape{});
  1008. args.emplace_back(
  1009. param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
  1010. TensorShape{1, oc, 1, 1});
  1011. args.emplace_back(
  1012. param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
  1013. TensorShape{
  1014. 1, oc, (h + 2 * p - kernel) / param.stride_h + 1,
  1015. (w + 2 * p - kernel) / param.stride_w + 1});
  1016. };
  1017. for (size_t kernel : {2, 3, 4, 5, 6, 7})
  1018. for (size_t ic : {1, 4, 8, 16})
  1019. for (size_t oc : {1, 4, 8, 16, 300})
  1020. for (size_t p : {0, 2})
  1021. for (size_t size : {8, 24})
  1022. for (NonlineMode nonline_mode :
  1023. {NonlineMode::IDENTITY, NonlineMode::RELU}) {
  1024. run(oc, ic, size, size, kernel, p, nonline_mode);
  1025. }
  1026. run(2046, 8, 20, 20, 3, 1, NonlineMode::IDENTITY);
  1027. Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
  1028. handle());
  1029. #define cb(algo_name) \
  1030. checker.set_before_exec_callback( \
  1031. conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name)); \
  1032. for (auto&& arg : args) { \
  1033. checker.set_param(arg.param).execs({arg.src, arg.filter, arg.bias, {}, {}}); \
  1034. }
  1035. cb("IM2COLMATMUL:X86_F32_BLAS");
  1036. #undef cb
  1037. }
  1038. #endif
  1039. TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP32_6x16) {
  1040. using namespace conv_bias;
  1041. std::vector<TestArg> args;
  1042. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, size_t p,
  1043. NonlineMode nonline_mode) {
  1044. if (w + 2 * p < kernel || h + 2 * p < kernel)
  1045. return;
  1046. param::ConvBias param;
  1047. param.stride_h = 1;
  1048. param.stride_w = 1;
  1049. param.pad_h = p;
  1050. param.pad_w = p;
  1051. param.nonlineMode = nonline_mode;
  1052. //! no bias
  1053. args.emplace_back(
  1054. param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
  1055. TensorShape{});
  1056. args.emplace_back(
  1057. param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
  1058. TensorShape{1, oc, 1, 1});
  1059. args.emplace_back(
  1060. param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
  1061. TensorShape{
  1062. 1, oc, (h + 2 * p - kernel) / param.stride_h + 1,
  1063. (w + 2 * p - kernel) / param.stride_w + 1});
  1064. };
  1065. for (size_t kernel : {2, 3, 4, 5, 6, 7})
  1066. for (size_t ic : {1, 4, 8, 16})
  1067. for (size_t oc : {1, 4, 8, 16, 300})
  1068. for (size_t p : {0, 2})
  1069. for (size_t size : {8, 24})
  1070. for (NonlineMode nonline_mode :
  1071. {NonlineMode::IDENTITY, NonlineMode::RELU}) {
  1072. run(oc, ic, size, size, kernel, p, nonline_mode);
  1073. }
  1074. run(2046, 8, 20, 20, 3, 1, NonlineMode::IDENTITY);
  1075. Checker<ConvBias> checker(handle());
  1076. #define cb(algo_name) \
  1077. checker.set_before_exec_callback( \
  1078. conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name)); \
  1079. for (auto&& arg : args) { \
  1080. checker.set_param(arg.param).execs({arg.src, arg.filter, arg.bias, {}, {}}); \
  1081. }
  1082. cb("IM2COLMATMUL:X86_F32_6x16:192");
  1083. }
  1084. #if MEGDNN_X86_WITH_MKL && SUPPORT_MKL_PACKED_GEMM
  1085. TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP32_PACKA) {
  1086. using namespace conv_bias;
  1087. std::vector<TestArg> args;
  1088. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, size_t p,
  1089. NonlineMode nonline_mode) {
  1090. if (w + 2 * p < kernel || h + 2 * p < kernel)
  1091. return;
  1092. param::ConvBias param;
  1093. param.stride_h = 1;
  1094. param.stride_w = 1;
  1095. param.pad_h = p;
  1096. param.pad_w = p;
  1097. param.nonlineMode = nonline_mode;
  1098. //! no bias
  1099. args.emplace_back(
  1100. param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
  1101. TensorShape{});
  1102. args.emplace_back(
  1103. param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
  1104. TensorShape{1, oc, 1, 1});
  1105. args.emplace_back(
  1106. param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
  1107. TensorShape{
  1108. 1, oc, (h + 2 * p - kernel) / param.stride_h + 1,
  1109. (w + 2 * p - kernel) / param.stride_w + 1});
  1110. param.sparse = param::ConvBias::Sparse::GROUP;
  1111. args.emplace_back(
  1112. param, TensorShape{1, 2 * ic, h, w},
  1113. TensorShape{2, oc, ic, kernel, kernel}, TensorShape{});
  1114. args.emplace_back(
  1115. param, TensorShape{1, 2 * ic, h, w},
  1116. TensorShape{2, oc, ic, kernel, kernel}, TensorShape{1, oc * 2, 1, 1});
  1117. args.emplace_back(
  1118. param, TensorShape{1, 2 * ic, h, w},
  1119. TensorShape{2, oc, ic, kernel, kernel},
  1120. TensorShape{
  1121. 1, 2 * oc, (h + 2 * param.pad_h - kernel) / 1 + 1,
  1122. (w + 2 * param.pad_w - kernel) / 1 + 1});
  1123. };
  1124. for (size_t kernel : {2, 3, 4, 5, 6, 7})
  1125. for (size_t ic : {1, 4, 8, 16})
  1126. for (size_t oc : {1, 4, 8, 16})
  1127. for (size_t p : {0, 1})
  1128. for (size_t size : {8, 24})
  1129. for (NonlineMode nonline_mode :
  1130. {NonlineMode::IDENTITY, NonlineMode::RELU}) {
  1131. run(oc, ic, size, size, kernel, p, nonline_mode);
  1132. }
  1133. run(2046, 8, 20, 20, 3, 1, NonlineMode::IDENTITY);
  1134. Checker<ConvBias> checker(handle());
  1135. #define cb(algo_name) \
  1136. checker.set_before_exec_callback( \
  1137. conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name)); \
  1138. for (auto&& arg : args) { \
  1139. checker.set_param(arg.param).execs({arg.src, arg.filter, arg.bias, {}, {}}); \
  1140. }
  1141. cb("IM2COLMATMUL:X86_F32_MKL_PACKA:192");
  1142. #undef cb
  1143. }
  1144. TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP32_PACKA_FILTER_PREPROCESS) {
  1145. using namespace conv_bias;
  1146. std::vector<TestArg> args;
  1147. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, size_t p,
  1148. NonlineMode nonline_mode) {
  1149. if (w + 2 * p < kernel || h + 2 * p < kernel)
  1150. return;
  1151. param::ConvBias param;
  1152. param.stride_h = 1;
  1153. param.stride_w = 1;
  1154. param.pad_h = p;
  1155. param.pad_w = p;
  1156. param.nonlineMode = nonline_mode;
  1157. //! no bias
  1158. args.emplace_back(
  1159. param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
  1160. TensorShape{});
  1161. args.emplace_back(
  1162. param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
  1163. TensorShape{1, oc, 1, 1});
  1164. args.emplace_back(
  1165. param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
  1166. TensorShape{
  1167. 1, oc, (h + 2 * p - kernel) / param.stride_h + 1,
  1168. (w + 2 * p - kernel) / param.stride_w + 1});
  1169. param.sparse = param::ConvBias::Sparse::GROUP;
  1170. args.emplace_back(
  1171. param, TensorShape{1, 2 * ic, h, w},
  1172. TensorShape{2, oc, ic, kernel, kernel}, TensorShape{});
  1173. args.emplace_back(
  1174. param, TensorShape{1, 2 * ic, h, w},
  1175. TensorShape{2, oc, ic, kernel, kernel}, TensorShape{1, oc * 2, 1, 1});
  1176. args.emplace_back(
  1177. param, TensorShape{1, 2 * ic, h, w},
  1178. TensorShape{2, oc, ic, kernel, kernel},
  1179. TensorShape{
  1180. 1, 2 * oc, (h + 2 * param.pad_h - kernel) / 1 + 1,
  1181. (w + 2 * param.pad_w - kernel) / 1 + 1});
  1182. };
  1183. for (size_t kernel : {2, 3, 4, 5, 6, 7})
  1184. for (size_t ic : {1, 4, 8, 16})
  1185. for (size_t oc : {1, 4, 8, 16})
  1186. for (size_t p : {0, 1})
  1187. for (size_t size : {8, 24})
  1188. for (NonlineMode nonline_mode :
  1189. {NonlineMode::IDENTITY, NonlineMode::RELU}) {
  1190. run(oc, ic, size, size, kernel, p, nonline_mode);
  1191. }
  1192. run(2046, 8, 20, 20, 3, 1, NonlineMode::IDENTITY);
  1193. Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
  1194. handle());
  1195. #define cb(algo_name) \
  1196. checker.set_before_exec_callback( \
  1197. conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name)); \
  1198. for (auto&& arg : args) { \
  1199. checker.set_param(arg.param).execs({arg.src, arg.filter, arg.bias, {}, {}}); \
  1200. }
  1201. cb("IM2COLMATMUL:X86_F32_MKL_PACKA:192");
  1202. #undef cb
  1203. }
  1204. /**************************** Conv1x1 PackA *************************/
  1205. namespace {
  1206. void checker_conv_bias(
  1207. std::vector<conv_bias::TestArg> args, Handle* handle, RNG* rng, float epsilon,
  1208. DType type0, DType type1, DType type2, DType type3, const char* algo_name) {
  1209. using namespace conv_bias;
  1210. Checker<ConvBias> checker(handle);
  1211. checker.set_before_exec_callback(
  1212. conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name));
  1213. checker.set_dtype(0, type0);
  1214. checker.set_dtype(1, type1);
  1215. checker.set_dtype(2, type2);
  1216. checker.set_dtype(4, type3);
  1217. checker.set_epsilon(epsilon);
  1218. if (NULL != rng) {
  1219. checker.set_rng(0, rng).set_rng(1, rng).set_rng(2, rng).set_rng(3, rng);
  1220. }
  1221. for (auto&& arg : args) {
  1222. checker.set_param(arg.param).execs({arg.src, arg.filter, arg.bias, {}, {}});
  1223. }
  1224. }
  1225. void checker_conv_bias_preprocess(
  1226. std::vector<conv_bias::TestArg> args, Handle* handle, RNG* rng, float epsilon,
  1227. DType type0, DType type1, DType type2, DType type3, const char* algo_name) {
  1228. using namespace conv_bias;
  1229. Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(handle);
  1230. checker.set_before_exec_callback(
  1231. conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name));
  1232. checker.set_dtype(0, type0);
  1233. checker.set_dtype(1, type1);
  1234. checker.set_dtype(2, type2);
  1235. checker.set_dtype(4, type3);
  1236. checker.set_epsilon(epsilon);
  1237. if (NULL != rng) {
  1238. checker.set_rng(0, rng).set_rng(1, rng).set_rng(2, rng).set_rng(3, rng);
  1239. }
  1240. for (auto&& arg : args) {
  1241. checker.set_param(arg.param).execs({arg.src, arg.filter, arg.bias, {}, {}});
  1242. }
  1243. }
  1244. } // namespace
  1245. #if MEGDNN_X86_WITH_MKL
  1246. TEST_F(X86_MULTI_THREADS, CONV_BIAS_CONV1X1_S1_FP32_PACKA) {
  1247. using namespace conv_bias;
  1248. std::vector<conv_bias::TestArg> args = get_conv_bias_1x1_args(false, false);
  1249. check_conv_bias(args, handle(), "CONV1x1:X86_F32_MKL_PACKA:24");
  1250. }
  1251. TEST_F(X86_MULTI_THREADS, CONV_BIAS_CONV1X1_S1_FP32_PACKA_PREPROCESS) {
  1252. using namespace conv_bias;
  1253. std::vector<conv_bias::TestArg> args = get_conv_bias_1x1_args(false, false);
  1254. checker_conv_bias_preprocess(
  1255. args, handle(), nullptr, 0.001, dtype::Float32{}, dtype::Float32{},
  1256. dtype::Float32{}, dtype::Float32{}, "CONV1x1:X86_F32_MKL_PACKA:24");
  1257. }
  1258. TEST_F(X86_MULTI_THREADS, CONV_BIAS_CONV1X1_S1_FP32_BLAS) {
  1259. using namespace conv_bias;
  1260. std::vector<conv_bias::TestArg> args = get_conv_bias_1x1_args(false, false);
  1261. check_conv_bias(args, handle(), "CONV1x1:X86_F32_BLAS:48");
  1262. }
  1263. TEST_F(X86_MULTI_THREADS, CONV_BIAS_CONV1X1_S1_FP32_BLAS_NOPACK_REPROCESS) {
  1264. using namespace conv_bias;
  1265. std::vector<conv_bias::TestArg> args = get_conv_bias_1x1_args(false, false);
  1266. checker_conv_bias_preprocess(
  1267. args, handle(), nullptr, 0.001, dtype::Float32{}, dtype::Float32{},
  1268. dtype::Float32{}, dtype::Float32{}, "CONV1x1:X86_F32_BLAS:24");
  1269. }
  1270. #endif
  1271. TEST_F(X86_MULTI_THREADS, CONV_BIAS_CONV1X1_S1_INT8X8X32) {
  1272. using namespace conv_bias;
  1273. UniformIntRNG rng{-50, 50};
  1274. float epsilon = 0.001;
  1275. std::vector<conv_bias::TestArg> args = get_conv_bias_1x1_args(false, true);
  1276. #if MEGDNN_X86_WITH_MKL_DNN
  1277. if (x86::is_supported(x86::SIMDType::VNNI)) {
  1278. checker_conv_bias(
  1279. args, handle(), &rng, epsilon, dtype::Int8{}, dtype::Int8{},
  1280. dtype::Int32{}, dtype::Int32{}, "CONV1x1:X86_INT8X8X32_MKLDNN:24");
  1281. }
  1282. #endif
  1283. #if MEGDNN_X86_WITH_VNNI
  1284. if (x86::is_supported(x86::SIMDType::VNNI)) {
  1285. checker_conv_bias(
  1286. args, handle(), &rng, epsilon, dtype::Int8{}, dtype::Int8{},
  1287. dtype::Int32{}, dtype::Int32{}, "CONV1x1:X86_INT8X8X32_VNNI:24");
  1288. }
  1289. #endif
  1290. if (x86::is_supported(x86::SIMDType::AVX2)) {
  1291. checker_conv_bias(
  1292. args, handle(), &rng, epsilon, dtype::Int8{}, dtype::Int8{},
  1293. dtype::Int32{}, dtype::Int32{}, "CONV1x1:X86_INT8X8X32_AVX2_4X16X2:24");
  1294. checker_conv_bias(
  1295. args, handle(), &rng, epsilon, dtype::Int8{}, dtype::Int8{},
  1296. dtype::Int32{}, dtype::Int32{}, "CONV1x1:X86_INT8X8X32_AVX2_2X4X16:24");
  1297. checker_conv_bias(
  1298. args, handle(), &rng, epsilon, dtype::Int8{}, dtype::Int8{},
  1299. dtype::Int16{}, dtype::Int16{}, "CONV1x1:X86_INT8X8X16_AVX2");
  1300. }
  1301. checker_conv_bias(
  1302. args, handle(), &rng, epsilon, dtype::Int8{}, dtype::Int8{}, dtype::Int32{},
  1303. dtype::Int32{}, "CONV1x1:X86_INT8X8X32_SSE_4X8X2:48");
  1304. checker_conv_bias(
  1305. args, handle(), &rng, epsilon, dtype::Int8{}, dtype::Int8{}, dtype::Int16{},
  1306. dtype::Int16{}, "CONV1x1:X86_INT8X8X16_SSE");
  1307. }
TEST_F(X86_MULTI_THREADS, CONV_BIAS_CONV1X1_S1_INT8X8X32_PREPROCESS) {
    // Same algorithm coverage as CONV_BIAS_CONV1X1_S1_INT8X8X32, but through the
    // weight-preprocess path (checker_conv_bias_preprocess); the MKL-DNN
    // variant is not exercised here.
    using namespace conv_bias;
    UniformIntRNG rng{-50, 50};
    float epsilon = 0.001;
    std::vector<conv_bias::TestArg> args = get_conv_bias_1x1_args(false, true);
#if MEGDNN_X86_WITH_VNNI
    if (x86::is_supported(x86::SIMDType::VNNI)) {
        checker_conv_bias_preprocess(
                args, handle(), &rng, epsilon, dtype::Int8{}, dtype::Int8{},
                dtype::Int32{}, dtype::Int32{}, "CONV1x1:X86_INT8X8X32_VNNI:24");
    }
#endif
    if (x86::is_supported(x86::SIMDType::AVX2)) {
        checker_conv_bias_preprocess(
                args, handle(), &rng, epsilon, dtype::Int8{}, dtype::Int8{},
                dtype::Int32{}, dtype::Int32{}, "CONV1x1:X86_INT8X8X32_AVX2_4X16X2:24");
        checker_conv_bias_preprocess(
                args, handle(), &rng, epsilon, dtype::Int8{}, dtype::Int8{},
                dtype::Int32{}, dtype::Int32{}, "CONV1x1:X86_INT8X8X32_AVX2_2X4X16:24");
        checker_conv_bias_preprocess(
                args, handle(), &rng, epsilon, dtype::Int8{}, dtype::Int8{},
                dtype::Int16{}, dtype::Int16{}, "CONV1x1:X86_INT8X8X16_AVX2");
    }
    // SSE fallbacks run unconditionally.
    checker_conv_bias_preprocess(
            args, handle(), &rng, epsilon, dtype::Int8{}, dtype::Int8{}, dtype::Int32{},
            dtype::Int32{}, "CONV1x1:X86_INT8X8X32_SSE_4X8X2:48");
    checker_conv_bias_preprocess(
            args, handle(), &rng, epsilon, dtype::Int8{}, dtype::Int8{}, dtype::Int16{},
            dtype::Int16{}, "CONV1x1:X86_INT8X8X16_SSE");
}
  1338. /************************* End Conv1x1 PackA ************************/
  1339. #endif
  1340. TEST_F(X86_MULTI_THREADS, CONV_BIAS_CONV1X1_S1_FP32_6x16) {
  1341. using namespace conv_bias;
  1342. std::vector<conv_bias::TestArg> args = get_conv_bias_1x1_args(false, false);
  1343. check_conv_bias(args, handle(), "CONV1x1:X86_F32_6x16:48");
  1344. }
TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QINT8) {
    // Quantized-int8 im2col+matmul correctness: qint8 src/filter, qint32 bias,
    // qint8 output, over small kernel/channel sweeps plus one large-OC case.
    using namespace conv_bias;
    std::vector<TestArg> args;
    // Appends one bias-free and one channel-bias case; skips shapes where the
    // padded input is smaller than the kernel.
    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, size_t p,
                   NonlineMode nonline_mode) {
        if (w + 2 * p < kernel || h + 2 * p < kernel)
            return;
        param::ConvBias param;
        param.stride_h = 1;
        param.stride_w = 1;
        param.pad_h = p;
        param.pad_w = p;
        param.nonlineMode = nonline_mode;
        //! no bias
        args.emplace_back(
                param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
                TensorShape{});
        //! bias channel
        args.emplace_back(
                param, TensorShape{2, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
                TensorShape{1, oc, 1, 1});
    };
    for (size_t kernel : {2, 3, 4, 5, 6, 7})
        for (size_t ic : {1, 4, 8, 16})
            for (size_t oc : {1, 4, 8})
                for (size_t p : {0, 2})
                    for (size_t size : {20, 21, 24})
                        for (NonlineMode nonline_mode :
                             {NonlineMode::IDENTITY, NonlineMode::RELU,
                              NonlineMode::H_SWISH}) {
                            run(oc, ic, size, size, kernel, p, nonline_mode);
                        }
    // Large output-channel case to exercise multi-thread OC partitioning.
    run(2046, 8, 20, 20, 3, 1, NonlineMode::IDENTITY);
    Checker<ConvBias> checker(handle());
// Runs all cases against one named algorithm with quantized dtypes. Each
// invocation below sits inside its own braces, so the local rng cannot clash.
#define cb(algo_name)                                                 \
    checker.set_before_exec_callback(                                 \
            conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name));     \
    UniformIntRNG rng{-50, 50};                                       \
    for (auto&& arg : args) {                                         \
        checker.set_dtype(0, dtype::QuantizedS8(2.5f))                \
                .set_dtype(1, dtype::QuantizedS8(2.5f))               \
                .set_dtype(2, dtype::QuantizedS32(6.25f))             \
                .set_dtype(4, dtype::QuantizedS8(60.25))              \
                .set_rng(0, &rng)                                     \
                .set_rng(1, &rng)                                     \
                .set_rng(2, &rng)                                     \
                .set_param(arg.param)                                 \
                .execs({arg.src, arg.filter, {}, {}, {}});            \
    }
#if MEGDNN_X86_WITH_MKL_DNN
    if (x86::is_supported(x86::SIMDType::VNNI)) {
        cb("IM2COLMATMUL:X86_INT8X8X32_MKLDNN");
    }
#endif
#if MEGDNN_X86_WITH_VNNI
    if (x86::is_supported(x86::SIMDType::VNNI)) {
        cb("IM2COLMATMUL:X86_INT8X8X32_VNNI");
    }
#endif
    if (x86::is_supported(x86::SIMDType::AVX2)) {
        cb("IM2COLMATMUL:X86_INT8X8X32_AVX2_2X4X16");
    }
#undef cb
}
TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QINT8_FILTER_PREPROCESS) {
    // Twin of CONV_BIAS_IM2COLMATMUL_QINT8, driven through the
    // weight-preprocess proxy so the filter-reorder path is covered too.
    using namespace conv_bias;
    std::vector<TestArg> args;
    // Appends one bias-free and one channel-bias case; skips shapes where the
    // padded input is smaller than the kernel.
    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, size_t p,
                   NonlineMode nonline_mode) {
        if (w + 2 * p < kernel || h + 2 * p < kernel)
            return;
        param::ConvBias param;
        param.stride_h = 1;
        param.stride_w = 1;
        param.pad_h = p;
        param.pad_w = p;
        param.nonlineMode = nonline_mode;
        //! no bias
        args.emplace_back(
                param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
                TensorShape{});
        //! bias channel
        args.emplace_back(
                param, TensorShape{2, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
                TensorShape{1, oc, 1, 1});
    };
    for (size_t kernel : {2, 3, 4, 5, 6, 7})
        for (size_t ic : {1, 4, 8, 16})
            for (size_t oc : {1, 4, 8})
                for (size_t p : {0, 2})
                    for (size_t size : {20, 21, 24})
                        for (NonlineMode nonline_mode :
                             {NonlineMode::IDENTITY, NonlineMode::RELU,
                              NonlineMode::H_SWISH}) {
                            run(oc, ic, size, size, kernel, p, nonline_mode);
                        }
    // Large output-channel case to exercise multi-thread OC partitioning.
    run(2046, 8, 20, 20, 3, 1, NonlineMode::IDENTITY);
    Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
            handle());
// Runs all cases against one named algorithm with quantized dtypes. Each
// invocation below sits inside its own braces, so the local rng cannot clash.
#define cb(algo_name)                                                 \
    checker.set_before_exec_callback(                                 \
            conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name));     \
    UniformIntRNG rng{-50, 50};                                       \
    for (auto&& arg : args) {                                         \
        checker.set_dtype(0, dtype::QuantizedS8(2.5f))                \
                .set_dtype(1, dtype::QuantizedS8(2.5f))               \
                .set_dtype(2, dtype::QuantizedS32(6.25f))             \
                .set_dtype(4, dtype::QuantizedS8(60.25))              \
                .set_rng(0, &rng)                                     \
                .set_rng(1, &rng)                                     \
                .set_rng(2, &rng)                                     \
                .set_param(arg.param)                                 \
                .execs({arg.src, arg.filter, {}, {}, {}});            \
    }
#if MEGDNN_X86_WITH_MKL_DNN
    if (x86::is_supported(x86::SIMDType::VNNI)) {
        cb("IM2COLMATMUL:X86_INT8X8X32_MKLDNN");
    }
#endif
#if MEGDNN_X86_WITH_VNNI
    if (x86::is_supported(x86::SIMDType::VNNI)) {
        cb("IM2COLMATMUL:X86_INT8X8X32_VNNI");
    }
#endif
    if (x86::is_supported(x86::SIMDType::AVX2)) {
        cb("IM2COLMATMUL:X86_INT8X8X32_AVX2_2X4X16");
    }
#undef cb
}
  1474. #if MEGDNN_WITH_BENCHMARK
  1475. #if MEGDNN_X86_WITH_MKL_DNN
  1476. static void x86_benchmark_fp32_mkldnn(Handle* handle) {
  1477. constexpr size_t RUNS = 30;
  1478. param::ConvBias param;
  1479. Benchmarker<ConvBias> benchmarker_mkldnn(handle);
  1480. benchmarker_mkldnn.set_display(false).set_times(RUNS);
  1481. benchmarker_mkldnn.set_before_exec_callback(
  1482. AlgoChecker<ConvBias>("MKLDNN_CONV_FP32"));
  1483. Benchmarker<ConvBias> benchmarker_im2col(handle);
  1484. benchmarker_im2col.set_display(false).set_times(RUNS);
  1485. benchmarker_im2col.set_before_exec_callback(
  1486. AlgoChecker<ConvBias>("IM2COLMATMUL.+"));
  1487. auto run = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  1488. size_t SZ, size_t GROUP = 1) {
  1489. TensorShape src({N, IC, H, W}), filter({OC, IC, FS, FS}), bias({1, OC, 1, 1}),
  1490. z({}), dst({N, OC, H / SZ, W / SZ});
  1491. param.pad_h = FS / 2;
  1492. param.pad_w = FS / 2;
  1493. param.stride_h = SZ;
  1494. param.stride_w = SZ;
  1495. param.format = param::ConvBias::Format::NCHW;
  1496. param.sparse = param::ConvBias::Sparse::DENSE;
  1497. if (GROUP > 1) {
  1498. param.sparse = param::ConvBias::Sparse::GROUP;
  1499. filter = {GROUP, OC / GROUP, IC / GROUP, FS, FS};
  1500. }
  1501. auto im2col_used =
  1502. benchmarker_im2col.set_param(param).exec({src, filter, bias, z, dst}) /
  1503. RUNS;
  1504. src = IC < 8 ? TensorShape{N, IC, H, W} : TensorShape{N, IC / 8, H, W, 8};
  1505. filter = IC < 8 ? TensorShape{OC / 8, FS, FS, IC, 8}
  1506. : TensorShape{OC / 8, IC / 8, FS, FS, 8, 8};
  1507. if (GROUP > 1 && OC == GROUP && IC == GROUP) {
  1508. filter = {GROUP / 8, 1, 1, FS, FS, 8};
  1509. } else if (GROUP > 1 && OC / GROUP % 8 == 0 && IC / GROUP % 8 == 0) {
  1510. filter = {GROUP, OC / GROUP / 8, IC / GROUP / 8, FS, FS, 8, 8};
  1511. }
  1512. bias = {1, OC / 8, 1, 1, 8};
  1513. z = {};
  1514. dst = {N, OC / 8, H / SZ, W / SZ, 8};
  1515. param.format = param::ConvBias::Format::NCHW88;
  1516. auto mkldnn_used =
  1517. benchmarker_mkldnn.set_param(param).exec({src, filter, bias, z, dst}) /
  1518. RUNS;
  1519. float computations =
  1520. (IC / GROUP * FS * FS + 1) * dst.total_nr_elems() * 2 * 1e-6;
  1521. std::cout << "run " << src.to_string() << " " << filter.to_string() << " "
  1522. << bias.to_string() << " " << dst.to_string() << std::endl;
  1523. std::cout << "im2col: " << im2col_used << " ms, "
  1524. << (computations / im2col_used) << " Gops, ";
  1525. std::cout << "mkldnn: " << mkldnn_used << " ms, "
  1526. << (computations / mkldnn_used) << " Gops, "
  1527. << "spped up: " << (im2col_used / mkldnn_used) << ", ";
  1528. std::cout << std::endl;
  1529. };
  1530. run(1, 64, 64, 56, 56, 3, 1);
  1531. run(1, 3, 64, 224, 224, 3, 1);
  1532. run(1, 3, 64, 224, 224, 7, 2);
  1533. run(1, 64, 64, 56, 56, 3, 1);
  1534. run(1, 128, 128, 28, 28, 3, 1);
  1535. run(1, 256, 256, 14, 14, 3, 1);
  1536. run(1, 512, 512, 7, 7, 3, 1);
  1537. run(1, 256, 64, 56, 56, 1, 1);
  1538. run(1, 512, 128, 28, 28, 1, 1);
  1539. run(1, 1024, 256, 14, 14, 1, 1);
  1540. run(1, 2048, 512, 7, 7, 1, 1);
  1541. run(1, 32, 32, 112, 112, 3, 1, 32);
  1542. run(1, 144, 144, 56, 56, 3, 1, 144);
  1543. run(1, 192, 192, 28, 28, 3, 1, 192);
  1544. run(1, 384, 384, 28, 28, 3, 1, 384);
  1545. run(1, 576, 576, 14, 14, 3, 1, 576);
  1546. run(1, 960, 960, 7, 7, 3, 1, 960);
  1547. run(1, 256, 128, 56, 56, 1, 2, 1);
  1548. run(1, 512, 256, 28, 28, 1, 2, 1);
  1549. run(1, 1024, 512, 14, 14, 1, 2, 1);
  1550. run(1, 96, 96, 112, 112, 3, 2, 96);
  1551. run(1, 144, 144, 56, 56, 3, 2, 144);
  1552. run(1, 384, 384, 28, 28, 3, 2, 384);
  1553. run(1, 576, 576, 14, 14, 3, 2, 576);
  1554. }
TEST_F(X86, BENCHMARK_CONVBIAS_FP32_MKLDNN) {
    // Single-thread-handle run of the mkl-dnn fp32 benchmark.
    x86_benchmark_fp32_mkldnn(handle());
}
TEST_F(X86_MULTI_THREADS, BENCHMARK_CONVBIAS_FP32_MKLDNN) {
    // Multi-thread-handle run of the mkl-dnn fp32 benchmark.
    x86_benchmark_fp32_mkldnn(handle());
}
  1561. #endif
  1562. #endif
  1563. /************************* Winograd ****************************/
namespace {
//! Build NCHW88 3x3 stride-1 test cases (pad 1, 8-channel blocks) for the
//! winograd algorithms: every nonlinearity in {IDENTITY, RELU, SIGMOID,
//! H_SWISH} crossed with small ic/oc and two spatial sizes, each emitted with
//! channel-broadcast bias, no bias, and full per-element bias.
std::vector<conv_bias::TestArg> get_winograd_mk_nchw88_args() {
    std::vector<conv_bias::TestArg> args;
    param::ConvBias cur_param;
    cur_param.format = param::ConvBias::Format::NCHW88;
    using NLMode = param::ConvBias::NonlineMode;
    // clang-format off
    for (auto nlmode :
         {NLMode::IDENTITY, NLMode::RELU, NLMode::SIGMOID, NLMode::H_SWISH}) {
    for (size_t ic : {1, 2}) {
    for (size_t oc : {1, 2}) {
    for (size_t i : {9, 63}) {
        cur_param.mode = param::ConvBias::Mode::CROSS_CORRELATION;
        cur_param.nonlineMode = nlmode;
        cur_param.sparse = param::ConvBias::Sparse::DENSE;
        cur_param.pad_h = cur_param.pad_w = 1;
        // channel-broadcast bias
        args.emplace_back(cur_param, TensorShape{1, ic, i, i, 8},
                          TensorShape{oc, ic, 3, 3, 8, 8},
                          TensorShape{1, oc, 1, 1, 8});
        // no bias
        args.emplace_back(cur_param, TensorShape{1, ic, i, i, 8},
                          TensorShape{oc, ic, 3, 3, 8, 8},TensorShape{});
        //! bias
        args.emplace_back(cur_param, TensorShape{2, ic, i, i, 8},
                          TensorShape{oc, ic, 3, 3, 8, 8},
                          TensorShape{2, oc, i, i, 8});
        // group-conv cases were left disabled in the original:
        /*cur_param.sparse = param::ConvBias::Sparse::GROUP;
        args.emplace_back(cur_param, TensorShape{2, 2 * ic, i, i, 8},
                          TensorShape{2, oc, ic, 3, 3, 8, 8},
                          TensorShape{1, 2 * oc, 1, 1, 8});*/
    }}}
    // clang-format on
    //! test for multi-thread OC parallel
    // NOTE: still inside the nlmode loop, so this case is emitted once per
    // nonlinearity (with nonlineMode left at the loop's current value).
    cur_param.sparse = param::ConvBias::Sparse::DENSE;
    cur_param.pad_h = cur_param.pad_w = 1;
    args.emplace_back(
            cur_param, TensorShape{2, 1, 9, 9, 8}, TensorShape{128, 1, 3, 3, 8, 8},
            TensorShape{1, 128, 1, 1, 8});
    /*cur_param.sparse = param::ConvBias::Sparse::GROUP;
    args.emplace_back(cur_param, TensorShape{2, 2, 9, 9, 8},
                      TensorShape{2, 128, 1, 3, 3, 8, 8},
                      TensorShape{1, 2 * 128, 1, 1, 8});*/
    }
    return args;
}
}  // namespace
  1609. TEST_F(X86_MULTI_THREADS, CONV_BIAS_WINOGRAD_NCHW88_F63) {
  1610. using namespace conv_bias;
  1611. std::vector<TestArg> args = get_winograd_mk_nchw88_args();
  1612. Checker<ConvBiasForward> checker(handle());
  1613. checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBias>(
  1614. ssprintf("WINOGRAD:X86_F32MK8_8X8:8:6").c_str()));
  1615. for (auto&& arg : args) {
  1616. checker.set_param(arg.param).execs({arg.src, arg.filter, arg.bias, {}, {}});
  1617. }
  1618. }
  1619. TEST_F(X86_MULTI_THREADS, CONV_BIAS_WINOGRAD_NCHW88_F63_WEIGHT_PREPROCESS) {
  1620. using namespace conv_bias;
  1621. std::vector<TestArg> args = get_winograd_mk_nchw88_args();
  1622. Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
  1623. handle());
  1624. checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBias>(
  1625. ssprintf("WINOGRAD:X86_F32MK8_8X8:8:6").c_str()));
  1626. for (auto&& arg : args) {
  1627. checker.set_param(arg.param).execs({arg.src, arg.filter, arg.bias, {}, {}});
  1628. }
  1629. }
  1630. TEST_F(X86_MULTI_THREADS, CONV_BIAS_WINOGRAD_NCHW88_F23) {
  1631. using namespace conv_bias;
  1632. std::vector<TestArg> args = get_winograd_mk_nchw88_args();
  1633. Checker<ConvBiasForward> checker(handle());
  1634. checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBias>(
  1635. ssprintf("WINOGRAD:X86_F32MK8_8X8:8:2").c_str()));
  1636. for (auto&& arg : args) {
  1637. checker.set_param(arg.param).execs({arg.src, arg.filter, arg.bias, {}, {}});
  1638. }
  1639. }
  1640. TEST_F(X86_MULTI_THREADS, CONV_BIAS_WINOGRAD_NCHW88_F23_WEIGHT_PREPROCESS) {
  1641. using namespace conv_bias;
  1642. std::vector<TestArg> args = get_winograd_mk_nchw88_args();
  1643. Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
  1644. handle());
  1645. checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBias>(
  1646. ssprintf("WINOGRAD:X86_F32MK8_8X8:8:2").c_str()));
  1647. for (auto&& arg : args) {
  1648. checker.set_param(arg.param).execs({arg.src, arg.filter, arg.bias, {}, {}});
  1649. }
  1650. }
  1651. TEST_F(X86_MULTI_THREADS, CONV_BIAS_WINOGRAD_WEIGHT_PREPROCESS) {
  1652. using namespace conv_bias;
  1653. std::vector<TestArg> args = get_winograd_mk_nchw88_args();
  1654. Checker<ConvBiasForward> checker(handle());
  1655. auto run = [&checker](
  1656. const std::vector<TestArg>& args, DType A_dtype, DType B_dtype,
  1657. DType C_dtype, DType D_dtype, const float eps) {
  1658. for (auto&& arg : args) {
  1659. checker.set_dtype(0, A_dtype)
  1660. .set_dtype(1, B_dtype)
  1661. .set_dtype(2, C_dtype)
  1662. .set_dtype(4, D_dtype)
  1663. .set_epsilon(eps)
  1664. .set_param(arg.param)
  1665. .execs({arg.src, arg.filter, arg.bias, {}, {}});
  1666. }
  1667. };
  1668. run(args, dtype::Float32(), dtype::Float32(), dtype::Float32(), dtype::Float32(),
  1669. 1e-3f);
  1670. }
  1671. /*********************************** End winograd ************************/
  1672. #if MEGDNN_X86_WITH_MKL_DNN
//! Run one MKL-DNN NCHW88 fp32 correctness case. Cases whose channel counts
//! cannot be expressed in the NCHW88 layout (channels-per-group not a multiple
//! of 8, except the ic==3 first-layer layout and pure depthwise) are skipped.
static void x86_correctness_fp32_mkldnn_run(
        Checker<ConvBias>& checker, UniformIntRNG& rng, Handle* handle,
        ConvBiasForward::BiasMode bias_mode, param::ConvBias::NonlineMode noline_mode,
        size_t n, size_t stride, size_t kernel, size_t oc, size_t ic, size_t h,
        size_t w, size_t group) {
    auto oc_per_group = oc / group;
    auto ic_per_group = ic / group;
    // Valid grouped case: 8-aligned output channels per group and 8-aligned
    // (or exactly 3) input channels per group.
    bool ok_group = oc_per_group % 8 == 0 && oc_per_group > 0 &&
                    (ic_per_group % 8 == 0 || ic_per_group == 3) && ic_per_group > 0;
    bool ok_depthwise = oc == ic && oc == group;
    if (!(ok_group || ok_depthwise)) {
        return;
    }
    // SAME-style padding for the square kernel.
    size_t pad = kernel / 2;
    size_t kernel_h = kernel;
    size_t kernel_w = kernel;
    param::ConvBias param;
    param.format = param::ConvBias::Format::NCHW88;
    param.stride_h = stride;
    param.stride_w = stride;
    param.pad_h = pad;
    param.pad_w = pad;
    param.nonlineMode = noline_mode;
    // ic==3 (first conv layer) keeps plain NCHW input with a hybrid filter
    // layout; otherwise src/filter use 8-channel blocking.
    auto src_tensor_shape = TensorShape{n, ic / 8, h, w, 8};
    if (ic == 3) {
        src_tensor_shape = TensorShape{n, ic, h, w};
    }
    auto weight_tensor_shape = TensorShape{oc / 8, ic / 8, kernel_h, kernel_w, 8, 8};
    if (ic == 3) {
        weight_tensor_shape = TensorShape{oc / 8, kernel_h, kernel_w, ic, 8};
    }
    auto bias_tensor_shape = TensorShape{};
    if (bias_mode == megdnn::BiasMode::BROADCAST_CHANNEL_BIAS) {
        bias_tensor_shape = {1, oc / 8, 1, 1, 8};
    } else if (bias_mode == megdnn::BiasMode::BIAS) {
        // Full (per-element) bias must match the deduced output layout.
        TensorLayout dst_layout;
        auto ConvBiasOp = handle->create_operator<ConvBias>();
        ConvBiasOp->param() = param;
        ConvBiasOp->deduce_layout(
                {src_tensor_shape, dtype::Float32()},
                {weight_tensor_shape, dtype::Float32()}, {}, {}, dst_layout);
        bias_tensor_shape = dst_layout;
    }
    if (group == 1) {
        param.sparse = param::ConvBias::Sparse::DENSE;
    } else if (group > 1 && ic / group == 1 && oc / group == 1) {
        // Depthwise: filter layout {group/8, 1, 1, kh, kw, 8}.
        param.sparse = param::ConvBias::Sparse::GROUP;
        weight_tensor_shape = TensorShape{group / 8, 1, 1, kernel_h, kernel_w, 8};
    } else if (
            group > 1 && oc / group % 8 == 0 && oc / group > 0 && ic / group % 8 == 0 &&
            ic / group > 0) {
        // General grouped conv with 8x8 channel blocking.
        param.sparse = param::ConvBias::Sparse::GROUP;
        weight_tensor_shape = TensorShape{
                group, oc / group / 8, ic / group / 8, kernel_h, kernel_w, 8, 8};
    }
    checker.set_dtype(0, dtype::Float32())
            .set_dtype(1, dtype::Float32())
            .set_dtype(2, dtype::Float32())
            .set_dtype(4, dtype::Float32())
            .set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &rng)
            .set_epsilon(1e-3)
            .set_param(param)
            .execs({src_tensor_shape, weight_tensor_shape, bias_tensor_shape, {}, {}});
}
//! Sweep bias mode, nonlinearity, batch, stride, kernel size, channels,
//! spatial size and group count over the MKLDNN_CONV_FP32 algorithm; invalid
//! channel/group combinations are filtered inside the run helper.
static void x86_correctness_fp32_mkldnn(Handle* handle) {
    Checker<ConvBias> checker(handle);
    UniformIntRNG rng{-127, 127};
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("MKLDNN_CONV_FP32"));
    for (auto bias_mode :
         {megdnn::BiasMode::NO_BIAS, megdnn::BiasMode::BROADCAST_CHANNEL_BIAS,
          megdnn::BiasMode::BIAS})
        for (auto noline_mode :
             {param::ConvBias::NonlineMode::IDENTITY,
              param::ConvBias::NonlineMode::SIGMOID,
              param::ConvBias::NonlineMode::H_SWISH})
            for (size_t n : {1, 2})
                for (size_t stride : {1, 2})
                    for (size_t kernel : {3, 5, 7})
                        for (size_t oc : {8, 16})
                            for (size_t ic : {3, 8, 16})
                                for (size_t h : {22, 33})
                                    for (size_t w : {22, 33}) {
                                        for (size_t group = 1;
                                             group <= std::min(oc, ic); ++group) {
                                            x86_correctness_fp32_mkldnn_run(
                                                    checker, rng, handle, bias_mode,
                                                    noline_mode, n, stride, kernel, oc,
                                                    ic, h, w, group);
                                        }
                                    }
}
TEST_F(X86, CONV_BIAS_DIRECT_MKLDNN_C8) {
    // Single-thread MKL-DNN NCHW88 fp32 correctness sweep.
    x86_correctness_fp32_mkldnn(handle());
}
TEST_F(X86_MULTI_THREADS, CONV_BIAS_DIRECT_MKLDNN_C8) {
    // Multi-thread MKL-DNN NCHW88 fp32 correctness sweep.
    x86_correctness_fp32_mkldnn(handle());
}
  1773. TEST_F(X86, CONV_BIAS_MKL_DNN_MATMUL_INT8) {
  1774. using namespace conv_bias;
  1775. std::vector<TestArg> args;
  1776. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, size_t p,
  1777. NonlineMode nonline_mode) {
  1778. if (w + 2 * p < kernel || h + 2 * p < kernel)
  1779. return;
  1780. param::ConvBias param;
  1781. param.stride_h = 1;
  1782. param.stride_w = 1;
  1783. param.pad_h = p;
  1784. param.pad_w = p;
  1785. param.nonlineMode = nonline_mode;
  1786. //! no bias
  1787. args.emplace_back(
  1788. param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
  1789. TensorShape{});
  1790. };
  1791. for (size_t kernel : {2, 3, 5, 7})
  1792. for (size_t ic : {1, 2, 3, 4})
  1793. for (size_t oc : {1, 2, 4})
  1794. for (size_t p : {0, 2})
  1795. for (size_t size : {20, 21, 22, 23, 24})
  1796. for (NonlineMode nonline_mode : {NonlineMode::IDENTITY}) {
  1797. run(oc, ic, size, size, kernel, p, nonline_mode);
  1798. }
  1799. Checker<ConvBias> checker(handle());
  1800. checker.set_before_exec_callback(
  1801. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("MKLDNN_MATMUL_INT8"));
  1802. checker.set_epsilon(1);
  1803. UniformIntRNG rng{-50, 50};
  1804. checker.set_dtype(0, dtype::Int8())
  1805. .set_dtype(1, dtype::Int8())
  1806. .set_dtype(2, dtype::Int32())
  1807. .set_dtype(4, dtype::Int32())
  1808. .set_rng(0, &rng)
  1809. .set_rng(1, &rng)
  1810. .set_rng(2, &rng);
  1811. for (auto&& arg : args) {
  1812. checker.set_param(arg.param).exec({arg.src, arg.filter, arg.bias, {}, {}});
  1813. }
  1814. }
TEST_F(X86, CONV_BIAS_MKL_DNN_INT8) {
    // Plain-int8 correctness of the MKLDNN_INT8 direct algorithm
    // (epsilon 1 tolerates rounding differences).
    using namespace conv_bias;
    std::vector<TestArg> args;
    // Bias-free case generator; skips kernels larger than the padded input.
    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, size_t p,
                   NonlineMode nonline_mode) {
        if (w + 2 * p < kernel || h + 2 * p < kernel)
            return;
        param::ConvBias param;
        param.stride_h = 1;
        param.stride_w = 1;
        param.pad_h = p;
        param.pad_w = p;
        param.nonlineMode = nonline_mode;
        //! no bias
        args.emplace_back(
                param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
                TensorShape{});
    };
    for (size_t kernel : {2, 3, 5, 7})
        for (size_t ic : {1, 2, 3, 4})
            for (size_t oc : {1, 2, 4})
                for (size_t p : {0, 2})
                    for (size_t size : {20, 22, 24})
                        for (NonlineMode nonline_mode : {NonlineMode::IDENTITY}) {
                            run(oc, ic, size, size, kernel, p, nonline_mode);
                        }
    Checker<ConvBias> checker(handle());
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("MKLDNN_INT8"));
    checker.set_epsilon(1);
    UniformIntRNG rng{-50, 50};
    checker.set_dtype(0, dtype::Int8())
            .set_dtype(1, dtype::Int8())
            .set_dtype(2, dtype::Int32())
            .set_dtype(4, dtype::Int32())
            .set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &rng);
    for (auto&& arg : args) {
        checker.set_param(arg.param).exec({arg.src, arg.filter, arg.bias, {}, {}});
    }
}
TEST_F(X86_MULTI_THREADS, CONV_BIAS_MKL_DNN_INT8) {
    // Multi-thread twin of the single-thread MKLDNN_INT8 test above; the case
    // generation and checker configuration are identical.
    using namespace conv_bias;
    std::vector<TestArg> args;
    // Bias-free case generator; skips kernels larger than the padded input.
    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, size_t p,
                   NonlineMode nonline_mode) {
        if (w + 2 * p < kernel || h + 2 * p < kernel)
            return;
        param::ConvBias param;
        param.stride_h = 1;
        param.stride_w = 1;
        param.pad_h = p;
        param.pad_w = p;
        param.nonlineMode = nonline_mode;
        //! no bias
        args.emplace_back(
                param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
                TensorShape{});
    };
    for (size_t kernel : {2, 3, 5, 7})
        for (size_t ic : {1, 2, 3, 4})
            for (size_t oc : {1, 2, 4})
                for (size_t p : {0, 2})
                    for (size_t size : {20, 22, 24})
                        for (NonlineMode nonline_mode : {NonlineMode::IDENTITY}) {
                            run(oc, ic, size, size, kernel, p, nonline_mode);
                        }
    Checker<ConvBias> checker(handle());
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("MKLDNN_INT8"));
    checker.set_epsilon(1);
    UniformIntRNG rng{-50, 50};
    checker.set_dtype(0, dtype::Int8())
            .set_dtype(1, dtype::Int8())
            .set_dtype(2, dtype::Int32())
            .set_dtype(4, dtype::Int32())
            .set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &rng);
    for (auto&& arg : args) {
        checker.set_param(arg.param).exec({arg.src, arg.filter, arg.bias, {}, {}});
    }
}
  1899. #endif
  1900. #if MEGDNN_WITH_BENCHMARK
  1901. namespace {
  1902. void benchmark_impl(
  1903. const param::ConvBias param,
  1904. std::vector<std::pair<SmallVector<TensorShape>, float>>& shapes_and_computation,
  1905. const std::string algo_name, size_t RUNS,
  1906. TaskExecutorConfig&& multi_thread_config,
  1907. TaskExecutorConfig&& single_thread_config, std::vector<DType> dtype_v) {
  1908. std::vector<DType> data_type = {
  1909. dtype::Float32(), dtype::Float32(), dtype::Float32(), dtype::Float32()};
  1910. std::vector<float> multi_thread_times, single_thread_times;
  1911. {
  1912. auto multi_thread_hanle = create_cpu_handle(0, true, &multi_thread_config);
  1913. auto benchmarker = Benchmarker<ConvBias>(multi_thread_hanle.get());
  1914. benchmarker.set_times(RUNS)
  1915. .set_display(false)
  1916. .set_dtype(0, dtype_v[0])
  1917. .set_dtype(1, dtype_v[1])
  1918. .set_dtype(2, dtype_v[2])
  1919. .set_dtype(4, dtype_v[3])
  1920. .set_param(param)
  1921. .set_before_exec_callback(
  1922. conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name.c_str()));
  1923. for (auto shape : shapes_and_computation) {
  1924. multi_thread_times.push_back(benchmarker.exec(shape.first) / RUNS);
  1925. }
  1926. }
  1927. {
  1928. auto single_thread_handle = create_cpu_handle(0, true, &single_thread_config);
  1929. auto benchmarker = Benchmarker<ConvBias>(single_thread_handle.get());
  1930. benchmarker.set_times(RUNS)
  1931. .set_display(false)
  1932. .set_dtype(0, dtype_v[0])
  1933. .set_dtype(1, dtype_v[1])
  1934. .set_dtype(2, dtype_v[2])
  1935. .set_dtype(4, dtype_v[3])
  1936. .set_param(param)
  1937. .set_before_exec_callback(
  1938. conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name.c_str()));
  1939. for (auto shape : shapes_and_computation) {
  1940. single_thread_times.push_back(benchmarker.exec(shape.first) / RUNS);
  1941. }
  1942. }
  1943. printf("Benchmark : Multi threads %zu, ", multi_thread_config.nr_thread);
  1944. printf("core_ids:");
  1945. for (size_t i = 0; i < multi_thread_config.affinity_core_set.size(); i++) {
  1946. printf("%zu ", multi_thread_config.affinity_core_set[i]);
  1947. }
  1948. printf(", Single thread core_id %zu\n", single_thread_config.affinity_core_set[0]);
  1949. for (size_t i = 0; i < shapes_and_computation.size(); i++) {
  1950. auto shapes = shapes_and_computation[i];
  1951. printf("Bench case: ");
  1952. for (auto&& shape : shapes.first) {
  1953. printf("%s ", shape.to_string().c_str());
  1954. }
  1955. float computations = shapes.second;
  1956. printf("%zu threads gflops: %f,\n single thread gflops: "
  1957. "%f. spead up = %f, speedup/cores=%f\n",
  1958. multi_thread_config.nr_thread, computations / multi_thread_times[i],
  1959. computations / single_thread_times[i],
  1960. single_thread_times[i] / multi_thread_times[i],
  1961. single_thread_times[i] / multi_thread_times[i] /
  1962. multi_thread_config.nr_thread);
  1963. }
  1964. }
  1965. void benchmark_impl_comp(
  1966. const param::ConvBias param,
  1967. std::vector<std::pair<SmallVector<TensorShape>, float>>& shapes_and_computation,
  1968. const std::string algo_name, const std::string algo_name1, size_t RUNS,
  1969. TaskExecutorConfig&& multi_thread_config,
  1970. TaskExecutorConfig&& single_thread_config, std::vector<DType> dtype_v) {
  1971. std::vector<DType> data_type = {
  1972. dtype::Float32(), dtype::Float32(), dtype::Float32(), dtype::Float32()};
  1973. std::vector<float> multi_thread_times, single_thread_times;
  1974. {
  1975. auto multi_thread_hanle = create_cpu_handle(0, true, &multi_thread_config);
  1976. auto benchmarker = Benchmarker<ConvBias>(multi_thread_hanle.get());
  1977. benchmarker.set_times(RUNS)
  1978. .set_display(false)
  1979. .set_dtype(0, dtype_v[0])
  1980. .set_dtype(1, dtype_v[1])
  1981. .set_dtype(2, dtype_v[2])
  1982. .set_dtype(4, dtype_v[3])
  1983. .set_param(param)
  1984. .set_before_exec_callback(
  1985. conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name.c_str()));
  1986. for (auto shape : shapes_and_computation) {
  1987. multi_thread_times.push_back(benchmarker.exec(shape.first) / RUNS);
  1988. }
  1989. }
  1990. {
  1991. auto single_thread_handle = create_cpu_handle(0, true, &single_thread_config);
  1992. auto benchmarker = Benchmarker<ConvBias>(single_thread_handle.get());
  1993. benchmarker.set_times(RUNS)
  1994. .set_display(false)
  1995. .set_dtype(0, dtype_v[0])
  1996. .set_dtype(1, dtype_v[1])
  1997. .set_dtype(2, dtype_v[2])
  1998. .set_dtype(4, dtype_v[3])
  1999. .set_param(param)
  2000. .set_before_exec_callback(
  2001. conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name1.c_str()));
  2002. for (auto shape : shapes_and_computation) {
  2003. single_thread_times.push_back(benchmarker.exec(shape.first) / RUNS);
  2004. }
  2005. }
  2006. printf("Benchmark : Multi threads %zu, ", multi_thread_config.nr_thread);
  2007. printf("core_ids:");
  2008. for (size_t i = 0; i < multi_thread_config.affinity_core_set.size(); i++) {
  2009. printf("%zu ", multi_thread_config.affinity_core_set[i]);
  2010. }
  2011. for (size_t i = 0; i < shapes_and_computation.size(); i++) {
  2012. auto shapes = shapes_and_computation[i];
  2013. printf("Bench case: ");
  2014. for (auto&& shape : shapes.first) {
  2015. printf("%s ", shape.to_string().c_str());
  2016. }
  2017. float computations = shapes.second;
  2018. printf("algo:%s gflops: %f,\n algo:%s gflops: "
  2019. "%f. spead up = %f\n",
  2020. algo_name.c_str(), computations / multi_thread_times[i],
  2021. algo_name1.c_str(), computations / single_thread_times[i],
  2022. single_thread_times[i] / multi_thread_times[i]);
  2023. }
  2024. }
  2025. } // namespace
//! Benchmark a channel-wise (group == channels) int8 AVX2 conv-bias algorithm
//! on MobileNet-like shapes with kernel sizes 7/5/3/2, comparing 4-thread and
//! 2-thread runs against a single thread.
static void benchmark_convbias_chanwise_avx2_int8(uint32_t stride, const char* algo) {
    constexpr size_t RUNS = 50;
    param::ConvBias param;
    param.stride_h = stride;
    param.stride_w = stride;
    param.sparse = param::ConvBias::Sparse::GROUP;
    std::vector<DType> data_type = {
            dtype::Int8(), dtype::Int8(), dtype::Int32(), dtype::Int32()};
    std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
    // One grouped case: src {N,IC,H,W}, depthwise filter {IC,1,1,FS,FS},
    // SAME padding; the float is the case's gflop count.
    auto bench_case = [&](size_t N, size_t IC, size_t H, size_t W, size_t FS) {
        param.pad_h = FS / 2;
        param.pad_w = FS / 2;
        SmallVector<TensorShape> shapes{{N, IC, H, W}, {IC, 1, 1, FS, FS}, {}, {}, {}};
        TensorShape dst{
                N, IC, (H + 2 * param.pad_h - FS) + 1, (W + 2 * param.pad_w - FS) + 1};
        float computations = (FS * FS * dst.total_nr_elems() * 2) * 1e-6;
        shapes_and_computation.push_back(std::make_pair(shapes, computations));
    };
    bench_case(1, 32, 112, 112, 7);
    bench_case(1, 144, 56, 56, 7);
    bench_case(1, 192, 28, 28, 7);
    bench_case(1, 384, 28, 28, 7);
    bench_case(1, 576, 14, 14, 7);
    bench_case(1, 960, 7, 7, 7);
    bench_case(1, 32, 112, 112, 5);
    bench_case(1, 144, 56, 56, 5);
    bench_case(1, 192, 28, 28, 5);
    bench_case(1, 384, 28, 28, 5);
    bench_case(1, 576, 14, 14, 5);
    bench_case(1, 960, 7, 7, 5);
    bench_case(1, 32, 112, 112, 3);
    bench_case(1, 144, 56, 56, 3);
    bench_case(1, 192, 28, 28, 3);
    bench_case(1, 384, 28, 28, 3);
    bench_case(1, 576, 14, 14, 3);
    bench_case(1, 960, 7, 7, 3);
    bench_case(1, 32, 112, 112, 2);
    bench_case(1, 144, 56, 56, 2);
    bench_case(1, 192, 28, 28, 2);
    bench_case(1, 384, 28, 28, 2);
    bench_case(1, 576, 14, 14, 2);
    bench_case(1, 960, 7, 7, 2);
    std::string algo_name = algo;
    printf("Benchmark %s\n", algo);
    // 4 threads (cores 4-7) vs 1 thread on core 4, then 2 threads vs 1 thread.
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
    shapes_and_computation.clear();
}
//! channel-wise int8 direct conv benchmark, stride 1
TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_CHANWISE_AVX2_INT8_S1) {
    benchmark_convbias_chanwise_avx2_int8(
            1, "X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1");
}
//! channel-wise int8 direct conv benchmark, stride 2
TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_CHANWISE_AVX2_INT8_S2) {
    benchmark_convbias_chanwise_avx2_int8(
            2, "X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE2");
}
  2086. TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_DIRECT_AVX2_INT8) {
  2087. constexpr size_t RUNS = 50;
  2088. param::ConvBias param;
  2089. param.stride_h = 1;
  2090. param.stride_w = 1;
  2091. param.sparse = param::ConvBias::Sparse::DENSE;
  2092. std::vector<DType> data_type = {
  2093. dtype::Int8(), dtype::Int8(), dtype::Int32(), dtype::Int32()};
  2094. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  2095. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W,
  2096. size_t FS) {
  2097. param.pad_h = FS / 2;
  2098. param.pad_w = FS / 2;
  2099. SmallVector<TensorShape> shapes{{N, IC, H, W}, {OC, IC, FS, FS}, {}, {}, {}};
  2100. TensorShape dst{
  2101. N, OC, (H + 2 * param.pad_h - FS) + 1, (W + 2 * param.pad_w - FS) + 1};
  2102. float computations = (IC * FS * FS * dst.total_nr_elems() * 2) * 1e-6;
  2103. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  2104. };
  2105. bench_case(1, 32, 32, 200, 200, 7);
  2106. bench_case(1, 32, 64, 200, 200, 7);
  2107. bench_case(1, 32, 32, 128, 128, 7);
  2108. bench_case(1, 32, 64, 128, 128, 7);
  2109. bench_case(1, 32, 32, 100, 100, 7);
  2110. bench_case(1, 32, 64, 100, 100, 7);
  2111. bench_case(1, 32, 32, 80, 80, 7);
  2112. bench_case(1, 32, 64, 80, 80, 7);
  2113. bench_case(1, 32, 32, 200, 200, 5);
  2114. bench_case(1, 32, 64, 200, 200, 5);
  2115. bench_case(1, 32, 32, 128, 128, 5);
  2116. bench_case(1, 32, 64, 128, 128, 5);
  2117. bench_case(1, 32, 32, 100, 100, 5);
  2118. bench_case(1, 32, 64, 100, 100, 5);
  2119. bench_case(1, 32, 32, 80, 80, 5);
  2120. bench_case(1, 32, 64, 80, 80, 5);
  2121. bench_case(1, 32, 32, 200, 200, 3);
  2122. bench_case(1, 32, 64, 200, 200, 3);
  2123. bench_case(1, 32, 32, 128, 128, 3);
  2124. bench_case(1, 32, 64, 128, 128, 3);
  2125. bench_case(1, 32, 32, 100, 100, 3);
  2126. bench_case(1, 32, 64, 100, 100, 3);
  2127. bench_case(1, 32, 32, 80, 80, 3);
  2128. bench_case(1, 32, 64, 80, 80, 3);
  2129. bench_case(1, 32, 32, 200, 200, 2);
  2130. bench_case(1, 32, 64, 200, 200, 2);
  2131. bench_case(1, 32, 32, 128, 128, 2);
  2132. bench_case(1, 32, 64, 128, 128, 2);
  2133. bench_case(1, 32, 32, 100, 100, 2);
  2134. bench_case(1, 32, 64, 100, 100, 2);
  2135. bench_case(1, 32, 32, 80, 80, 2);
  2136. bench_case(1, 32, 64, 80, 80, 2);
  2137. std::string algo_name = "X86_CONV_BIAS_DIRECT_AVX2_INT8_STRIDE1";
  2138. printf("Benchmark X86_CONV_BIAS_DIRECT_AVX2_INT8_STRIDE1 algo\n");
  2139. benchmark_impl(
  2140. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  2141. data_type);
  2142. benchmark_impl(
  2143. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  2144. data_type);
  2145. benchmark_impl(
  2146. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  2147. data_type);
  2148. shapes_and_computation.clear();
  2149. }
  2150. TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_8816) {
  2151. constexpr size_t RUNS = 30;
  2152. param::ConvBias param;
  2153. param.stride_h = 1;
  2154. param.stride_w = 1;
  2155. param.sparse = param::ConvBias::Sparse::DENSE;
  2156. std::vector<DType> data_type = {
  2157. dtype::Int8(), dtype::Int8(), dtype::Int16(), dtype::Int16()};
  2158. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  2159. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W,
  2160. size_t FS) {
  2161. param.pad_h = FS / 2;
  2162. param.pad_w = FS / 2;
  2163. SmallVector<TensorShape> shapes{{N, IC, H, W}, {OC, IC, FS, FS}, {}, {}, {}};
  2164. TensorShape dst{
  2165. N, OC, (H + 2 * param.pad_h - FS) / param.stride_h + 1,
  2166. (W + 2 * param.pad_w - FS) / param.stride_w + 1};
  2167. float computations = (IC * FS * FS * dst.total_nr_elems() * 2) * 1e-6;
  2168. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  2169. };
  2170. bench_case(1, 48, 192, 15, 15, 1);
  2171. std::string algo_name = "IM2COLMATMUL:X86_INT8X8X16_AVX2";
  2172. benchmark_impl(
  2173. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  2174. data_type);
  2175. shapes_and_computation.clear();
  2176. }
  2177. TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_DIRECT_AVX2_INT8_STRIDE2) {
  2178. constexpr size_t RUNS = 50;
  2179. param::ConvBias param;
  2180. param.stride_h = 2;
  2181. param.stride_w = 2;
  2182. param.sparse = param::ConvBias::Sparse::DENSE;
  2183. std::vector<DType> data_type = {
  2184. dtype::Int8(), dtype::Int8(), dtype::Int32(), dtype::Int32()};
  2185. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  2186. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W,
  2187. size_t FS) {
  2188. param.pad_h = FS / 2;
  2189. param.pad_w = FS / 2;
  2190. SmallVector<TensorShape> shapes{{N, IC, H, W}, {OC, IC, FS, FS}, {}, {}, {}};
  2191. TensorShape dst{
  2192. N, OC, (H + 2 * param.pad_h - FS) / param.stride_h + 1,
  2193. (W + 2 * param.pad_w - FS) / param.stride_w + 1};
  2194. float computations = (IC * FS * FS * dst.total_nr_elems() * 2) * 1e-6;
  2195. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  2196. };
  2197. bench_case(1, 32, 32, 200, 200, 7);
  2198. bench_case(1, 32, 64, 200, 200, 7);
  2199. bench_case(1, 32, 32, 128, 128, 7);
  2200. bench_case(1, 32, 64, 128, 128, 7);
  2201. bench_case(1, 32, 32, 100, 100, 7);
  2202. bench_case(1, 32, 64, 100, 100, 7);
  2203. bench_case(1, 32, 32, 80, 80, 7);
  2204. bench_case(1, 32, 64, 80, 80, 7);
  2205. bench_case(1, 32, 32, 200, 200, 5);
  2206. bench_case(1, 32, 64, 200, 200, 5);
  2207. bench_case(1, 32, 32, 128, 128, 5);
  2208. bench_case(1, 32, 64, 128, 128, 5);
  2209. bench_case(1, 32, 32, 100, 100, 5);
  2210. bench_case(1, 32, 64, 100, 100, 5);
  2211. bench_case(1, 32, 32, 80, 80, 5);
  2212. bench_case(1, 32, 64, 80, 80, 5);
  2213. bench_case(1, 32, 32, 200, 200, 3);
  2214. bench_case(1, 32, 64, 200, 200, 3);
  2215. bench_case(1, 32, 32, 128, 128, 3);
  2216. bench_case(1, 32, 64, 128, 128, 3);
  2217. bench_case(1, 32, 32, 100, 100, 3);
  2218. bench_case(1, 32, 64, 100, 100, 3);
  2219. bench_case(1, 32, 32, 80, 80, 3);
  2220. bench_case(1, 32, 64, 80, 80, 3);
  2221. bench_case(1, 32, 32, 200, 200, 2);
  2222. bench_case(1, 32, 64, 200, 200, 2);
  2223. bench_case(1, 32, 32, 128, 128, 2);
  2224. bench_case(1, 32, 64, 128, 128, 2);
  2225. bench_case(1, 32, 32, 100, 100, 2);
  2226. bench_case(1, 32, 64, 100, 100, 2);
  2227. bench_case(1, 32, 32, 80, 80, 2);
  2228. bench_case(1, 32, 64, 80, 80, 2);
  2229. std::string algo_name = "X86_CONV_BIAS_DIRECT_AVX2_INT8_STRIDE2";
  2230. printf("Benchmark X86_CONV_BIAS_DIRECT_AVX2_INT8_STRIDE2 algo\n");
  2231. benchmark_impl(
  2232. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  2233. data_type);
  2234. benchmark_impl(
  2235. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  2236. data_type);
  2237. benchmark_impl(
  2238. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  2239. data_type);
  2240. shapes_and_computation.clear();
  2241. }
  2242. TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_DIRECTF32) {
  2243. constexpr size_t RUNS = 50;
  2244. param::ConvBias param;
  2245. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  2246. param.pad_h = 1;
  2247. param.pad_w = 1;
  2248. param.stride_h = 1;
  2249. param.stride_w = 1;
  2250. param.sparse = param::ConvBias::Sparse::GROUP;
  2251. std::vector<DType> data_type = {
  2252. dtype::Float32(), dtype::Float32(), dtype::Float32(), dtype::Float32()};
  2253. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  2254. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  2255. size_t group) {
  2256. SmallVector<TensorShape> shapes{
  2257. {N, IC, H, W},
  2258. {group, OC / group, IC / group, FS, FS},
  2259. {1, OC, 1, 1},
  2260. {},
  2261. {N, OC, H, W}};
  2262. TensorShape dst{N, OC, H, W};
  2263. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  2264. dst.total_nr_elems()) *
  2265. 1e-6;
  2266. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  2267. };
  2268. bench_case(1, 32, 32, 200, 200, 3, 4);
  2269. bench_case(1, 32, 32, 200, 200, 3, 32);
  2270. bench_case(1, 32, 32, 128, 128, 3, 4);
  2271. bench_case(1, 32, 32, 128, 128, 3, 32);
  2272. bench_case(1, 32, 32, 100, 100, 3, 4);
  2273. bench_case(1, 32, 32, 100, 100, 3, 32);
  2274. bench_case(1, 32, 32, 80, 80, 3, 4);
  2275. bench_case(1, 32, 32, 80, 80, 3, 32);
  2276. std::string algo_name = "X86_CONV_BIAS_DIRECT_STRIDE1_LARGE_GROUP";
  2277. printf("Benchmark X86_CONV_BIAS_DIRECT_STRIDE1_GROUP algo\n");
  2278. benchmark_impl(
  2279. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  2280. data_type);
  2281. benchmark_impl(
  2282. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  2283. data_type);
  2284. benchmark_impl(
  2285. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  2286. data_type);
  2287. shapes_and_computation.clear();
  2288. algo_name = "X86_CONV_BIAS_DIRECT_STRIDE1_LARGE_GROUP";
  2289. printf("Benchmark X86_CONV_BIAS_DIRECT_STRIDE1_DENSE algo\n");
  2290. bench_case(1, 32, 32, 200, 200, 3, 1);
  2291. bench_case(1, 32, 32, 128, 128, 3, 1);
  2292. bench_case(1, 32, 32, 100, 100, 3, 1);
  2293. bench_case(1, 32, 32, 80, 80, 3, 1);
  2294. benchmark_impl(
  2295. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  2296. data_type);
  2297. benchmark_impl(
  2298. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  2299. data_type);
  2300. benchmark_impl(
  2301. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  2302. data_type);
  2303. }
  2304. TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_IM2COL_F32) {
  2305. constexpr size_t RUNS = 50;
  2306. param::ConvBias param;
  2307. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  2308. param.pad_h = 1;
  2309. param.pad_w = 1;
  2310. param.stride_h = 1;
  2311. param.stride_w = 1;
  2312. std::vector<DType> data_type = {
  2313. dtype::Float32(), dtype::Float32(), dtype::Float32(), dtype::Float32()};
  2314. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  2315. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  2316. size_t group) {
  2317. SmallVector<TensorShape> shapes{
  2318. {N, IC, H, W},
  2319. {OC / group, IC / group, FS, FS},
  2320. {1, OC, 1, 1},
  2321. {},
  2322. {N, OC, H, W}};
  2323. TensorShape dst{N, OC, H, W};
  2324. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  2325. dst.total_nr_elems()) *
  2326. 1e-6;
  2327. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  2328. };
  2329. bench_case(1, 32, 32, 200, 200, 3, 1);
  2330. bench_case(1, 32, 32, 200, 200, 3, 1);
  2331. bench_case(1, 32, 32, 128, 128, 3, 1);
  2332. bench_case(1, 32, 32, 128, 128, 3, 1);
  2333. bench_case(1, 32, 32, 100, 100, 3, 1);
  2334. bench_case(1, 32, 32, 100, 100, 3, 1);
  2335. bench_case(1, 32, 32, 80, 80, 3, 1);
  2336. bench_case(1, 32, 32, 80, 80, 3, 1);
  2337. bench_case(1, 64, 32, 7, 7, 3, 1);
  2338. bench_case(1, 64, 64, 7, 7, 3, 1);
  2339. bench_case(1, 64, 128, 7, 7, 3, 1);
  2340. bench_case(1, 64, 256, 7, 7, 3, 1);
  2341. bench_case(1, 64, 512, 7, 7, 3, 1);
  2342. bench_case(1, 64, 1024, 7, 7, 3, 1);
  2343. bench_case(1, 64, 32, 14, 14, 3, 1);
  2344. bench_case(1, 64, 64, 14, 14, 3, 1);
  2345. bench_case(1, 64, 128, 14, 14, 3, 1);
  2346. bench_case(1, 64, 256, 14, 14, 3, 1);
  2347. bench_case(1, 64, 512, 14, 14, 3, 1);
  2348. bench_case(1, 64, 1024, 14, 14, 3, 1);
  2349. bench_case(1, 128, 128, 14, 14, 3, 1);
  2350. bench_case(1, 128, 256, 14, 14, 3, 1);
  2351. bench_case(1, 512, 512, 14, 14, 3, 1);
  2352. bench_case(1, 256, 512, 14, 14, 3, 1);
  2353. bench_case(1, 512, 1024, 14, 14, 3, 1);
  2354. bench_case(1, 1024, 1024, 14, 14, 3, 1);
  2355. std::string algo_name = "IM2COLMATMUL:X86_F32_BLAS:192";
  2356. printf("Benchmark IM2COLMATMUL:X86_F32_BLAS algo\n");
  2357. benchmark_impl(
  2358. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  2359. data_type);
  2360. benchmark_impl(
  2361. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  2362. data_type);
  2363. benchmark_impl(
  2364. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  2365. data_type);
  2366. shapes_and_computation.clear();
  2367. }
  2368. TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_IM2COL_F32_single_thread) {
  2369. constexpr size_t RUNS = 50;
  2370. param::ConvBias param;
  2371. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  2372. param.pad_h = 1;
  2373. param.pad_w = 1;
  2374. param.stride_h = 1;
  2375. param.stride_w = 1;
  2376. std::vector<DType> data_type = {
  2377. dtype::Float32(), dtype::Float32(), dtype::Float32(), dtype::Float32()};
  2378. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  2379. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  2380. size_t group) {
  2381. SmallVector<TensorShape> shapes{
  2382. {N, IC, H, W},
  2383. {OC / group, IC / group, FS, FS},
  2384. {1, OC, 1, 1},
  2385. {},
  2386. {N, OC, H, W}};
  2387. TensorShape dst{N, OC, H, W};
  2388. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  2389. dst.total_nr_elems()) *
  2390. 1e-6;
  2391. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  2392. };
  2393. bench_case(1, 32, 32, 200, 200, 3, 1);
  2394. bench_case(1, 32, 32, 200, 200, 3, 1);
  2395. bench_case(1, 32, 32, 128, 128, 3, 1);
  2396. bench_case(1, 32, 32, 128, 128, 3, 1);
  2397. bench_case(1, 32, 32, 100, 100, 3, 1);
  2398. bench_case(1, 32, 32, 100, 100, 3, 1);
  2399. bench_case(1, 32, 32, 80, 80, 3, 1);
  2400. bench_case(1, 32, 32, 80, 80, 3, 1);
  2401. bench_case(1, 64, 32, 7, 7, 3, 1);
  2402. bench_case(1, 64, 64, 7, 7, 3, 1);
  2403. bench_case(1, 64, 128, 7, 7, 3, 1);
  2404. bench_case(1, 64, 256, 7, 7, 3, 1);
  2405. bench_case(1, 64, 512, 7, 7, 3, 1);
  2406. bench_case(1, 64, 1024, 7, 7, 3, 1);
  2407. bench_case(1, 64, 32, 14, 14, 3, 1);
  2408. bench_case(1, 64, 64, 14, 14, 3, 1);
  2409. bench_case(1, 64, 128, 14, 14, 3, 1);
  2410. bench_case(1, 64, 256, 14, 14, 3, 1);
  2411. bench_case(1, 64, 512, 14, 14, 3, 1);
  2412. bench_case(1, 64, 1024, 14, 14, 3, 1);
  2413. bench_case(1, 128, 128, 14, 14, 3, 1);
  2414. bench_case(1, 128, 256, 14, 14, 3, 1);
  2415. bench_case(1, 512, 512, 14, 14, 3, 1);
  2416. bench_case(1, 256, 512, 14, 14, 3, 1);
  2417. bench_case(1, 512, 1024, 14, 14, 3, 1);
  2418. bench_case(1, 1024, 1024, 14, 14, 3, 1);
  2419. std::string algo_name = "IM2COLMATMUL:X86_F32_MKL_PACKA:192";
  2420. std::string algo_name1 = "IM2COLMATMUL:X86_F32_BLAS:192";
  2421. printf("Benchmark IM2COLMATMUL:X86_F32_BLAS algo\n");
  2422. benchmark_impl_comp(
  2423. param, shapes_and_computation, algo_name, algo_name1, RUNS, {1, {4}},
  2424. {1, {4}}, data_type);
  2425. benchmark_impl_comp(
  2426. param, shapes_and_computation, algo_name, algo_name1, RUNS, {1, {7}},
  2427. {1, {7}}, data_type);
  2428. shapes_and_computation.clear();
  2429. }
  2430. TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_IM2COL_F32_6x16) {
  2431. constexpr size_t RUNS = 50;
  2432. param::ConvBias param;
  2433. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  2434. param.pad_h = 1;
  2435. param.pad_w = 1;
  2436. param.stride_h = 1;
  2437. param.stride_w = 1;
  2438. std::vector<DType> data_type = {
  2439. dtype::Float32(), dtype::Float32(), dtype::Float32(), dtype::Float32()};
  2440. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  2441. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  2442. size_t group) {
  2443. SmallVector<TensorShape> shapes{
  2444. {N, IC, H, W},
  2445. {OC / group, IC / group, FS, FS},
  2446. {1, OC, 1, 1},
  2447. {},
  2448. {N, OC, H, W}};
  2449. TensorShape dst{N, OC, H, W};
  2450. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  2451. dst.total_nr_elems()) *
  2452. 1e-6;
  2453. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  2454. };
  2455. bench_case(1, 32, 32, 200, 200, 3, 1);
  2456. bench_case(1, 32, 32, 200, 200, 3, 1);
  2457. bench_case(1, 32, 32, 128, 128, 3, 1);
  2458. bench_case(1, 32, 32, 128, 128, 3, 1);
  2459. bench_case(1, 32, 32, 100, 100, 3, 1);
  2460. bench_case(1, 32, 32, 100, 100, 3, 1);
  2461. bench_case(1, 32, 32, 80, 80, 3, 1);
  2462. bench_case(1, 32, 32, 80, 80, 3, 1);
  2463. bench_case(1, 64, 32, 7, 7, 3, 1);
  2464. bench_case(1, 64, 64, 7, 7, 3, 1);
  2465. bench_case(1, 64, 128, 7, 7, 3, 1);
  2466. bench_case(1, 64, 256, 7, 7, 3, 1);
  2467. bench_case(1, 64, 512, 7, 7, 3, 1);
  2468. bench_case(1, 64, 1024, 7, 7, 3, 1);
  2469. bench_case(1, 64, 32, 14, 14, 3, 1);
  2470. bench_case(1, 64, 64, 14, 14, 3, 1);
  2471. bench_case(1, 64, 128, 14, 14, 3, 1);
  2472. bench_case(1, 64, 256, 14, 14, 3, 1);
  2473. bench_case(1, 64, 512, 14, 14, 3, 1);
  2474. bench_case(1, 64, 1024, 14, 14, 3, 1);
  2475. bench_case(1, 128, 128, 14, 14, 3, 1);
  2476. bench_case(1, 128, 256, 14, 14, 3, 1);
  2477. bench_case(1, 512, 512, 14, 14, 3, 1);
  2478. bench_case(1, 256, 512, 14, 14, 3, 1);
  2479. bench_case(1, 512, 1024, 14, 14, 3, 1);
  2480. bench_case(1, 1024, 1024, 14, 14, 3, 1);
  2481. std::string algo_name = "IM2COLMATMUL:X86_F32_6x16:192";
  2482. printf("Benchmark IM2COLMATMUL:X86_F32_6x16 algo\n");
  2483. benchmark_impl(
  2484. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  2485. data_type);
  2486. benchmark_impl(
  2487. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  2488. data_type);
  2489. benchmark_impl(
  2490. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  2491. data_type);
  2492. shapes_and_computation.clear();
  2493. }
  2494. TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_IM2COL_F32_6X16_single_thread) {
  2495. constexpr size_t RUNS = 50;
  2496. param::ConvBias param;
  2497. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  2498. param.pad_h = 1;
  2499. param.pad_w = 1;
  2500. param.stride_h = 1;
  2501. param.stride_w = 1;
  2502. std::vector<DType> data_type = {
  2503. dtype::Float32(), dtype::Float32(), dtype::Float32(), dtype::Float32()};
  2504. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  2505. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  2506. size_t group) {
  2507. SmallVector<TensorShape> shapes{
  2508. {N, IC, H, W},
  2509. {OC / group, IC / group, FS, FS},
  2510. {1, OC, 1, 1},
  2511. {},
  2512. {N, OC, H, W}};
  2513. TensorShape dst{N, OC, H, W};
  2514. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  2515. dst.total_nr_elems()) *
  2516. 1e-6;
  2517. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  2518. };
  2519. bench_case(1, 32, 32, 200, 200, 3, 1);
  2520. bench_case(1, 32, 32, 200, 200, 3, 1);
  2521. bench_case(1, 32, 32, 128, 128, 3, 1);
  2522. bench_case(1, 32, 32, 128, 128, 3, 1);
  2523. bench_case(1, 32, 32, 100, 100, 3, 1);
  2524. bench_case(1, 32, 32, 100, 100, 3, 1);
  2525. bench_case(1, 32, 32, 80, 80, 3, 1);
  2526. bench_case(1, 32, 32, 80, 80, 3, 1);
  2527. bench_case(1, 64, 32, 7, 7, 3, 1);
  2528. bench_case(1, 64, 64, 7, 7, 3, 1);
  2529. bench_case(1, 64, 128, 7, 7, 3, 1);
  2530. bench_case(1, 64, 256, 7, 7, 3, 1);
  2531. bench_case(1, 64, 512, 7, 7, 3, 1);
  2532. bench_case(1, 64, 1024, 7, 7, 3, 1);
  2533. bench_case(1, 64, 32, 14, 14, 3, 1);
  2534. bench_case(1, 64, 64, 14, 14, 3, 1);
  2535. bench_case(1, 64, 128, 14, 14, 3, 1);
  2536. bench_case(1, 64, 256, 14, 14, 3, 1);
  2537. bench_case(1, 64, 512, 14, 14, 3, 1);
  2538. bench_case(1, 64, 1024, 14, 14, 3, 1);
  2539. bench_case(1, 128, 128, 14, 14, 3, 1);
  2540. bench_case(1, 128, 256, 14, 14, 3, 1);
  2541. bench_case(1, 512, 512, 14, 14, 3, 1);
  2542. bench_case(1, 256, 512, 14, 14, 3, 1);
  2543. bench_case(1, 512, 1024, 14, 14, 3, 1);
  2544. bench_case(1, 1024, 1024, 14, 14, 3, 1);
  2545. std::string algo_name = "IM2COLMATMUL:X86_F32_MKL_PACKA:192";
  2546. std::string algo_name1 = "IM2COLMATMUL:X86_F32_6x16:192";
  2547. printf("Benchmark IM2COLMATMUL:X86_F32_6x16 algo\n");
  2548. benchmark_impl_comp(
  2549. param, shapes_and_computation, algo_name, algo_name1, RUNS, {1, {4}},
  2550. {1, {4}}, data_type);
  2551. benchmark_impl_comp(
  2552. param, shapes_and_computation, algo_name, algo_name1, RUNS, {1, {7}},
  2553. {1, {7}}, data_type);
  2554. shapes_and_computation.clear();
  2555. }
  2556. TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_IM2COL_INT8X8X32) {
  2557. constexpr size_t RUNS = 50;
  2558. param::ConvBias param;
  2559. param.pad_h = 1;
  2560. param.pad_w = 1;
  2561. param.stride_h = 1;
  2562. param.stride_w = 1;
  2563. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  2564. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  2565. size_t group) {
  2566. SmallVector<TensorShape> shapes{
  2567. {N, IC, H, W},
  2568. {OC / group, IC / group, FS, FS},
  2569. {1, OC, 1, 1},
  2570. {},
  2571. {N, OC, H, W}};
  2572. TensorShape dst{N, OC, H, W};
  2573. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  2574. dst.total_nr_elems()) *
  2575. 1e-6;
  2576. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  2577. };
  2578. bench_case(1, 32, 32, 200, 200, 3, 1);
  2579. bench_case(1, 32, 32, 200, 200, 3, 1);
  2580. bench_case(1, 32, 32, 128, 128, 3, 1);
  2581. bench_case(1, 32, 32, 128, 128, 3, 1);
  2582. bench_case(1, 32, 32, 100, 100, 3, 1);
  2583. bench_case(1, 32, 32, 100, 100, 3, 1);
  2584. bench_case(1, 32, 32, 80, 80, 3, 1);
  2585. bench_case(1, 32, 32, 80, 80, 3, 1);
  2586. bench_case(1, 64, 32, 7, 7, 3, 1);
  2587. bench_case(1, 64, 64, 7, 7, 3, 1);
  2588. bench_case(1, 64, 128, 7, 7, 3, 1);
  2589. bench_case(1, 64, 256, 7, 7, 3, 1);
  2590. bench_case(1, 64, 512, 7, 7, 3, 1);
  2591. bench_case(1, 64, 1024, 7, 7, 3, 1);
  2592. bench_case(1, 64, 32, 14, 14, 3, 1);
  2593. bench_case(1, 64, 64, 14, 14, 3, 1);
  2594. bench_case(1, 64, 128, 14, 14, 3, 1);
  2595. bench_case(1, 64, 256, 14, 14, 3, 1);
  2596. bench_case(1, 64, 512, 14, 14, 3, 1);
  2597. bench_case(1, 64, 1024, 14, 14, 3, 1);
  2598. bench_case(1, 128, 128, 14, 14, 3, 1);
  2599. bench_case(1, 128, 256, 14, 14, 3, 1);
  2600. bench_case(1, 512, 512, 14, 14, 3, 1);
  2601. bench_case(1, 256, 512, 14, 14, 3, 1);
  2602. bench_case(1, 512, 1024, 14, 14, 3, 1);
  2603. bench_case(1, 1024, 1024, 14, 14, 3, 1);
  2604. std::vector<DType> data_type = {
  2605. dtype::Int8(), dtype::Int8(), dtype::Int32(), dtype::Int32()};
  2606. std::string algo_name = "IM2COLMATMUL:X86_INT8X8X32_AVX2_4X16X2:192";
  2607. // std::string algo_name = "IM2COLMATMUL:X86_INT8X8X32_AVX2_2X4X16";
  2608. // printf("Benchmark IM2COLMATMUL:X86_INT8X8X32_AVX2_4X16X2 algo\n");
  2609. benchmark_impl(
  2610. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  2611. data_type);
  2612. benchmark_impl(
  2613. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  2614. data_type);
  2615. benchmark_impl(
  2616. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  2617. data_type);
  2618. shapes_and_computation.clear();
  2619. }
namespace {
//! Build the benchmark arg list for winograd conv: NCHW88 dense 3x3-style
//! convs with RELU, stride 1, "same" padding (p == kernel / 2).
//! Cases whose channels are not a multiple of pack_size, or whose padded
//! input is smaller than the kernel, are silently skipped.
std::vector<conv_bias::TestArg> get_winograd_benchmark_args(
        size_t kernel, size_t pack_size) {
    std::vector<conv_bias::TestArg> args;
    auto pack = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, size_t p) {
        //! channels must divide evenly into NCHW88 packs
        if (ic % pack_size != 0 || oc % pack_size != 0)
            return;
        //! padded input must be at least as large as the kernel
        if (w + 2 * p < kernel || h + 2 * p < kernel)
            return;
        param::ConvBias param;
        param.mode = param::ConvBias::Mode::CROSS_CORRELATION;
        param.format = param::ConvBias::Format::NCHW88;
        param.sparse = param::ConvBias::Sparse::DENSE;
        param.nonlineMode = param::ConvBias::NonlineMode::RELU;
        param.stride_h = 1;
        param.stride_w = 1;
        param.pad_h = p;
        param.pad_w = p;
        //! NCHW88 layouts: src {N, IC/8, H, W, 8},
        //! filter {OC/8, IC/8, KH, KW, 8, 8}, bias {1, OC/8, 1, 1, 8}
        args.push_back(conv_bias::TestArg{
                param,
                TensorShape{1, ic / 8, h, w, 8},
                TensorShape{oc / 8, ic / 8, kernel, kernel, 8, 8},
                {1, oc / 8, 1, 1, 8}});
    };
    //! channel grid at several spatial sizes
    for (size_t ic : {64, 128, 256}) {
        for (size_t oc : {64, 128, 256}) {
            pack(oc, ic, 56, 56, kernel, kernel / 2);
            pack(oc, ic, 14, 14, kernel, kernel / 2);
            pack(oc, ic, 28, 28, kernel, kernel / 2);
        }
    }
    //! conv in vgg16
    pack(512, 512, 15, 15, kernel, kernel / 2);
    pack(512, 256, 15, 15, kernel, kernel / 2);
    pack(256, 256, 29, 29, kernel, kernel / 2);
    pack(256, 128, 29, 29, kernel, kernel / 2);
    pack(128, 128, 57, 57, kernel, kernel / 2);
    pack(128, 64, 57, 57, kernel, kernel / 2);
    pack(64, 64, 56, 56, kernel, kernel / 2);
    pack(128, 128, 28, 28, kernel, kernel / 2);
    pack(512, 512, 14, 14, kernel, kernel / 2);
    return args;
}
//! Run each arg once with the default (heuristic-chosen) algorithm and once
//! pinned to \p algo_name, printing GFLOPS for both and the speedup ratio.
void benchmark_winograd(
        const char* algo_name, Handle* handle, size_t kernel, size_t pack_size) {
    auto&& args = get_winograd_benchmark_args(kernel, pack_size);
    using namespace conv_bias;
    constexpr size_t RUN = 10;
    Benchmarker<ConvBias> benchmark(handle);
    benchmark.set_display(false);
    benchmark.set_times(RUN);
    Benchmarker<ConvBias> benchmark_winograd(handle);
    benchmark_winograd.set_display(false);
    benchmark_winograd.set_times(RUN);
    for (auto&& arg : args) {
        //! deduce the output layout so total_nr_elems() can be used below
        TensorLayout dst_layout;
        auto opr = handle->create_operator<ConvBias>();
        opr->param() = arg.param;
        opr->deduce_layout(
                {arg.src, dtype::Float32()}, {arg.filter, dtype::Float32()},
                {arg.bias, dtype::Float32()}, {}, dst_layout);
        //! dst.nr_elems * IC * FH * FW * 2
        //! filter is {OC/8, IC/8, KH, KW, 8, 8}, so filter[1] == IC/8 and the
        //! trailing * 8.0 restores the full IC; result is GFLOP (ops/1e9),
        //! and * 1e3 makes ops/ms divisions below come out in GFLOPS
        float computations = dst_layout.total_nr_elems() * arg.filter[1] *
                             arg.filter[2] * arg.filter[3] * 2.0 * 8.0 /
                             (1024 * 1024 * 1024) * 1e3;
        auto used =
                benchmark.set_param(arg.param).exec({arg.src, arg.filter, {}, {}, {}}) /
                RUN;
        benchmark_winograd.set_param(arg.param);
        auto used_winograd = algo_benchmark<ConvBias>(
                                     benchmark_winograd,
                                     {arg.src, arg.filter, {}, {}, {}}, algo_name) /
                             RUN;
        printf("%s %s: normal: %f ms %f Gflops winograd: %f ms %f GFlops "
               "speedup: "
               "%f\n",
               arg.src.to_string().c_str(), arg.filter.to_string().c_str(), used,
               computations / used, used_winograd, computations / used_winograd,
               used / used_winograd);
    }
}
}  // namespace
//! winograd F(6,3) with the 8x8 MK8 f32 matmul, 3x3 kernels, pack size 8
TEST_F(X86, BENCHMARK_CONVBIAS_WINOGRAD_F63_8x8) {
    benchmark_winograd("WINOGRAD:X86_F32MK8_8X8:8:6:8", handle(), 3, 8);
}
//! winograd F(2,3) with the 8x8 MK8 f32 matmul, 3x3 kernels, pack size 8
TEST_F(X86, BENCHMARK_CONVBIAS_WINOGRAD_F23_8x8) {
    benchmark_winograd("WINOGRAD:X86_F32MK8_8X8:8:2:8", handle(), 3, 8);
}
  2708. #endif
  2709. } // namespace test
  2710. } // namespace megdnn
  2711. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台