You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

conv_bias.cpp 96 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
7227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379
  1. /**
  2. * \file dnn/test/x86/conv_bias.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. */
  11. #include "src/x86/utils.h"
  12. #include "test/x86/fixture.h"
  13. #include "megdnn/opr_param_defs.h"
  14. #include "megdnn/oprs.h"
  15. #include "test/common/benchmarker.h"
  16. #include "test/common/checker.h"
  17. #include "test/common/conv_bias.h"
  18. #include "test/common/rng.h"
  19. #include "test/common/tensor.h"
  20. #include "test/common/workspace_wrapper.h"
  21. namespace megdnn {
  22. namespace test {
  23. TEST_F(X86, CONV_BIAS_FORWARD) {
  24. using namespace conv_bias;
  25. std::vector<TestArg> args = get_args();
  26. Checker<ConvBiasForward> checker(handle());
  27. NormalRNG default_rng;
  28. ConstValue const_val;
  29. for (auto&& arg : args) {
  30. checker.set_dtype(0, dtype::Float32())
  31. .set_dtype(1, dtype::Float32())
  32. .set_dtype(2, dtype::Float32())
  33. .set_rng(0, &default_rng)
  34. .set_rng(1, &default_rng)
  35. .set_rng(2, &default_rng)
  36. .set_epsilon(1e-3)
  37. .set_param(arg.param)
  38. .execs({arg.src, arg.filter, arg.bias, {}, {}});
  39. }
  40. }
  41. TEST_F(X86_MULTI_THREADS, AVX2_CHANWISE_DIRECT_STRIDE1_INT8x8x32) {
  42. using namespace conv_bias;
  43. std::vector<TestArg> args;
  44. auto run = [&](size_t ic, size_t w, size_t h, size_t kernel, size_t p,
  45. NonlineMode nonline_mode) {
  46. if (w + 2 * p < kernel || h + 2 * p < kernel)
  47. return;
  48. param::ConvBias param;
  49. param.stride_h = 1;
  50. param.stride_w = 1;
  51. param.pad_h = p;
  52. param.pad_w = p;
  53. param.nonlineMode = nonline_mode;
  54. param.sparse = param::ConvBias::Sparse::GROUP;
  55. //! no bias
  56. args.emplace_back(param, TensorShape{2, ic, h, w},
  57. TensorShape{ic, 1, 1, kernel, kernel}, TensorShape{});
  58. //! bias channel
  59. args.emplace_back(param, TensorShape{2, ic, h, w},
  60. TensorShape{ic, 1, 1, kernel, kernel},
  61. TensorShape{1, ic, 1, 1});
  62. };
  63. for (size_t kernel : {2, 3, 5, 7})
  64. for (size_t pad : {0, 1})
  65. for (size_t ic : {1, 5, 17, 20})
  66. for (size_t h : {7, 16, 38, 40})
  67. for (size_t w : {16, 25, 40, 55})
  68. for (NonlineMode nonline_mode : {NonlineMode::IDENTITY})
  69. run(ic, w, h, kernel, pad, nonline_mode);
  70. Checker<ConvBias> checker(handle());
  71. UniformIntRNG rng{-50, 50};
  72. checker.set_dtype(0, dtype::Int8())
  73. .set_dtype(1, dtype::Int8())
  74. .set_dtype(2, dtype::Int32())
  75. .set_dtype(4, dtype::Int32())
  76. .set_rng(0, &rng)
  77. .set_rng(1, &rng)
  78. .set_rng(2, &rng)
  79. .set_epsilon(1e-3);
  80. checker.set_before_exec_callback(
  81. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  82. "X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1"));
  83. for (auto&& arg : args) {
  84. checker.set_param(arg.param).exec(
  85. {arg.src, arg.filter, arg.bias, {}, {}});
  86. }
  87. }
  88. TEST_F(X86_MULTI_THREADS, AVX2_CHANWISE_DIRECT_STRIDE1_QuantizedS32) {
  89. using namespace conv_bias;
  90. std::vector<TestArg> args;
  91. auto run = [&](size_t ic, size_t w, size_t h, size_t kernel, size_t p,
  92. NonlineMode nonline_mode) {
  93. if (w + 2 * p < kernel || h + 2 * p < kernel)
  94. return;
  95. param::ConvBias param;
  96. param.stride_h = 1;
  97. param.stride_w = 1;
  98. param.pad_h = p;
  99. param.pad_w = p;
  100. param.nonlineMode = nonline_mode;
  101. param.sparse = param::ConvBias::Sparse::GROUP;
  102. //! no bias
  103. args.emplace_back(param, TensorShape{2, ic, h, w},
  104. TensorShape{ic, 1, 1, kernel, kernel}, TensorShape{});
  105. //! bias channel
  106. args.emplace_back(param, TensorShape{2, ic, h, w},
  107. TensorShape{ic, 1, 1, kernel, kernel},
  108. TensorShape{1, ic, 1, 1});
  109. };
  110. for (size_t kernel : {2, 3, 5, 7})
  111. for (size_t pad : {0, 1})
  112. for (size_t ic : {1, 3, 5, 7, 17})
  113. for (size_t h : {10, 17, 25, 30})
  114. for (size_t w : {19, 28, 58, 168})
  115. for (NonlineMode nonline_mode : {NonlineMode::IDENTITY})
  116. run(ic, w, h, kernel, pad, nonline_mode);
  117. Checker<ConvBias> checker(handle());
  118. UniformIntRNG rng{-50, 50};
  119. checker.set_dtype(0, dtype::QuantizedS8(2.5f))
  120. .set_dtype(1, dtype::QuantizedS8(2.5f))
  121. .set_dtype(2, dtype::QuantizedS32(6.25f))
  122. .set_dtype(4, {})
  123. .set_rng(0, &rng)
  124. .set_rng(1, &rng)
  125. .set_rng(2, &rng)
  126. .set_epsilon(1e-3);
  127. checker.set_before_exec_callback(
  128. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  129. "X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1"));
  130. for (auto&& arg : args) {
  131. checker.set_param(arg.param).exec(
  132. {arg.src, arg.filter, arg.bias, {}, {}});
  133. }
  134. }
  135. TEST_F(X86_MULTI_THREADS, AVX2_CHANWISE_DIRECT_STRIDE1_QuantizedS8x8x8) {
  136. using namespace conv_bias;
  137. std::vector<TestArg> args;
  138. auto run = [&](size_t ic, size_t w, size_t h, size_t kernel, size_t p,
  139. NonlineMode nonline_mode) {
  140. if (w + 2 * p < kernel || h + 2 * p < kernel)
  141. return;
  142. param::ConvBias param;
  143. param.stride_h = 1;
  144. param.stride_w = 1;
  145. param.pad_h = p;
  146. param.pad_w = p;
  147. param.nonlineMode = nonline_mode;
  148. param.sparse = param::ConvBias::Sparse::GROUP;
  149. //! no bias
  150. args.emplace_back(param, TensorShape{2, ic, h, w},
  151. TensorShape{ic, 1, 1, kernel, kernel}, TensorShape{});
  152. //! bias channel
  153. args.emplace_back(param, TensorShape{2, ic, h, w},
  154. TensorShape{ic, 1, 1, kernel, kernel},
  155. TensorShape{1, ic, 1, 1});
  156. };
  157. for (size_t kernel : {2, 3, 5, 7})
  158. for (size_t pad : {0, 1})
  159. for (size_t ic : {1, 3, 5, 7, 17})
  160. for (size_t h : {10, 15, 17, 30})
  161. for (size_t w : {19, 28, 58, 168})
  162. for (NonlineMode nonline_mode :
  163. {NonlineMode::IDENTITY, NonlineMode::H_SWISH,
  164. NonlineMode::RELU})
  165. run(ic, w, h, kernel, pad, nonline_mode);
  166. Checker<ConvBias> checker(handle());
  167. UniformIntRNG rng{-50, 50};
  168. checker.set_dtype(0, dtype::QuantizedS8(2.5f))
  169. .set_dtype(1, dtype::QuantizedS8(2.5f))
  170. .set_dtype(2, dtype::QuantizedS32(6.25f))
  171. .set_dtype(4, dtype::QuantizedS8(60.25f))
  172. .set_rng(0, &rng)
  173. .set_rng(1, &rng)
  174. .set_rng(2, &rng)
  175. .set_epsilon(1e-3);
  176. checker.set_before_exec_callback(
  177. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  178. "X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1"));
  179. for (auto&& arg : args) {
  180. checker.set_param(arg.param).exec(
  181. {arg.src, arg.filter, arg.bias, {}, {}});
  182. }
  183. }
  184. TEST_F(X86_MULTI_THREADS, AVX2_CONV_BIAS_DIRECT_STRIDE1_INT8x8x32) {
  185. using namespace conv_bias;
  186. std::vector<TestArg> args;
  187. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  188. size_t p, NonlineMode nonline_mode) {
  189. if (w + 2 * p < kernel || h + 2 * p < kernel)
  190. return;
  191. param::ConvBias param;
  192. param.stride_h = 1;
  193. param.stride_w = 1;
  194. param.pad_h = p;
  195. param.pad_w = p;
  196. param.nonlineMode = nonline_mode;
  197. param.sparse = param::ConvBias::Sparse::DENSE;
  198. //! no bias
  199. args.emplace_back(param, TensorShape{2, ic, h, w},
  200. TensorShape{oc, ic, kernel, kernel}, TensorShape{});
  201. param.sparse = param::ConvBias::Sparse::GROUP;
  202. //! no bias
  203. args.emplace_back(param, TensorShape{2, 2 * ic, h, w},
  204. TensorShape{2, oc / 2, ic, kernel, kernel},
  205. TensorShape{});
  206. };
  207. for (size_t kernel : {2, 3, 5, 7})
  208. for (size_t pad : {0, 1})
  209. for (size_t oc : {4, 8, 13, 16, 24})
  210. for (size_t ic : {2, 3, 7, 10})
  211. for (size_t h : {10, 11})
  212. for (size_t w : {8, 10})
  213. for (NonlineMode nonline_mode :
  214. {NonlineMode::IDENTITY})
  215. run(oc, ic, w, h, kernel, pad, nonline_mode);
  216. Checker<ConvBias> checker(handle());
  217. UniformIntRNG rng{-50, 50};
  218. checker.set_dtype(0, dtype::Int8())
  219. .set_dtype(1, dtype::Int8())
  220. .set_dtype(2, dtype::Int32())
  221. .set_dtype(4, dtype::Int32())
  222. .set_rng(0, &rng)
  223. .set_rng(1, &rng)
  224. .set_rng(2, &rng)
  225. .set_epsilon(1e-3);
  226. checker.set_before_exec_callback(
  227. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  228. "X86_CONV_BIAS_DIRECT_AVX2_INT8_STRIDE1"));
  229. for (auto&& arg : args) {
  230. checker.set_param(arg.param).exec(
  231. {arg.src, arg.filter, arg.bias, {}, {}});
  232. }
  233. }
  234. TEST_F(X86_MULTI_THREADS, AVX2_CONV_BIAS_DIRECT_STRIDE1_QuantizedS32) {
  235. using namespace conv_bias;
  236. std::vector<TestArg> args;
  237. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  238. size_t p, NonlineMode nonline_mode) {
  239. if (w + 2 * p < kernel || h + 2 * p < kernel)
  240. return;
  241. param::ConvBias param;
  242. param.stride_h = 1;
  243. param.stride_w = 1;
  244. param.pad_h = p;
  245. param.pad_w = p;
  246. param.nonlineMode = nonline_mode;
  247. param.sparse = param::ConvBias::Sparse::DENSE;
  248. //! no bias
  249. args.emplace_back(param, TensorShape{2, ic, h, w},
  250. TensorShape{oc, ic, kernel, kernel}, TensorShape{});
  251. param.sparse = param::ConvBias::Sparse::GROUP;
  252. //! no bias
  253. args.emplace_back(param, TensorShape{2, 2 * ic, h, w},
  254. TensorShape{2, oc / 2, ic, kernel, kernel},
  255. TensorShape{});
  256. };
  257. for (size_t kernel : {2, 3, 5, 7})
  258. for (size_t pad : {0, 1})
  259. for (size_t oc : {4, 8, 13, 16, 24})
  260. for (size_t ic : {2, 3, 7, 10})
  261. for (size_t h : {10, 11})
  262. for (size_t w : {8, 10})
  263. for (NonlineMode nonline_mode :
  264. {NonlineMode::IDENTITY})
  265. run(oc, ic, w, h, kernel, pad, nonline_mode);
  266. Checker<ConvBias> checker(handle());
  267. UniformIntRNG rng{-50, 50};
  268. checker.set_dtype(0, dtype::QuantizedS8(2.5f))
  269. .set_dtype(1, dtype::QuantizedS8(2.5f))
  270. .set_dtype(2, dtype::QuantizedS32(6.25f))
  271. .set_dtype(4, {})
  272. .set_rng(0, &rng)
  273. .set_rng(1, &rng)
  274. .set_rng(2, &rng)
  275. .set_epsilon(1e-3);
  276. checker.set_before_exec_callback(
  277. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  278. "X86_CONV_BIAS_DIRECT_AVX2_INT8_STRIDE1"));
  279. for (auto&& arg : args) {
  280. checker.set_param(arg.param).exec(
  281. {arg.src, arg.filter, arg.bias, {}, {}});
  282. }
  283. }
  284. TEST_F(X86_MULTI_THREADS, AVX2_CONV_BIAS_DIRECT_STRIDE1_S8S8S8) {
  285. using namespace conv_bias;
  286. std::vector<TestArg> args;
  287. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  288. size_t p, NonlineMode nonline_mode) {
  289. if (w + 2 * p < kernel || h + 2 * p < kernel)
  290. return;
  291. param::ConvBias param;
  292. param.stride_h = 1;
  293. param.stride_w = 1;
  294. param.pad_h = p;
  295. param.pad_w = p;
  296. param.nonlineMode = nonline_mode;
  297. param.sparse = param::ConvBias::Sparse::DENSE;
  298. //! no bias
  299. args.emplace_back(param, TensorShape{1, ic, h, w},
  300. TensorShape{oc, ic, kernel, kernel}, TensorShape{});
  301. //! bias channel
  302. args.emplace_back(param, TensorShape{1, ic, h, w},
  303. TensorShape{oc, ic, kernel, kernel},
  304. TensorShape{1, oc, 1, 1});
  305. param.sparse = param::ConvBias::Sparse::GROUP;
  306. //! no bias
  307. args.emplace_back(param, TensorShape{2, 2 * ic, h, w},
  308. TensorShape{2, oc / 2, ic, kernel, kernel},
  309. TensorShape{});
  310. //! bias channel
  311. args.emplace_back(param, TensorShape{2, 2 * ic, h, w},
  312. TensorShape{2, oc / 2, ic, kernel, kernel},
  313. TensorShape{1, oc, 1, 1});
  314. };
  315. for (size_t kernel : {2, 3, 5, 7})
  316. for (size_t pad : {0, 1})
  317. for (size_t oc : {4, 8, 14, 16, 24})
  318. for (size_t ic : {2, 3, 7, 10})
  319. for (size_t h : {10, 11})
  320. for (size_t w : {8, 10})
  321. for (NonlineMode nonline_mode :
  322. {NonlineMode::IDENTITY, NonlineMode::RELU,
  323. NonlineMode::H_SWISH})
  324. run(oc, ic, w, h, kernel, pad, nonline_mode);
  325. Checker<ConvBias> checker(handle());
  326. UniformIntRNG rng{-50, 50};
  327. checker.set_dtype(0, dtype::QuantizedS8(2.5f))
  328. .set_dtype(1, dtype::QuantizedS8(2.5f))
  329. .set_dtype(2, dtype::QuantizedS32(6.25f))
  330. .set_dtype(4, dtype::QuantizedS8(60.25f))
  331. .set_rng(0, &rng)
  332. .set_rng(1, &rng)
  333. .set_rng(2, &rng)
  334. .set_epsilon(1e-3);
  335. checker.set_before_exec_callback(
  336. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  337. "X86_CONV_BIAS_DIRECT_AVX2_INT8_STRIDE1"));
  338. for (auto&& arg : args) {
  339. checker.set_param(arg.param).exec(
  340. {arg.src, arg.filter, arg.bias, {}, {}});
  341. }
  342. }
  343. TEST_F(X86_MULTI_THREADS, AVX2_CONV_BIAS_DIRECT_STRIDE2_INT8x8x32) {
  344. using namespace conv_bias;
  345. std::vector<TestArg> args;
  346. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  347. size_t p, NonlineMode nonline_mode) {
  348. if (w + 2 * p < kernel || h + 2 * p < kernel)
  349. return;
  350. param::ConvBias param;
  351. param.stride_h = 2;
  352. param.stride_w = 2;
  353. param.pad_h = p;
  354. param.pad_w = p;
  355. param.nonlineMode = nonline_mode;
  356. param.sparse = param::ConvBias::Sparse::DENSE;
  357. //! no bias
  358. args.emplace_back(param, TensorShape{2, ic, h, w},
  359. TensorShape{oc, ic, kernel, kernel}, TensorShape{});
  360. param.sparse = param::ConvBias::Sparse::GROUP;
  361. //! no bias
  362. args.emplace_back(param, TensorShape{2, 2 * ic, h, w},
  363. TensorShape{2, oc / 2, ic, kernel, kernel},
  364. TensorShape{});
  365. };
  366. for (size_t kernel : {2, 3, 5, 7})
  367. for (size_t pad : {0, 1, 2, 5})
  368. for (size_t oc : {4, 8, 13, 16, 24})
  369. for (size_t ic : {2, 3, 7, 10})
  370. for (size_t h : {10, 11})
  371. for (size_t w : {8, 10, 20})
  372. for (NonlineMode nonline_mode :
  373. {NonlineMode::IDENTITY})
  374. run(oc, ic, w, h, kernel, pad, nonline_mode);
  375. Checker<ConvBias> checker(handle());
  376. UniformIntRNG rng{-50, 50};
  377. checker.set_dtype(0, dtype::Int8())
  378. .set_dtype(1, dtype::Int8())
  379. .set_dtype(2, dtype::Int32())
  380. .set_dtype(4, dtype::Int32())
  381. .set_rng(0, &rng)
  382. .set_rng(1, &rng)
  383. .set_rng(2, &rng)
  384. .set_epsilon(1e-3);
  385. checker.set_before_exec_callback(
  386. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  387. "X86_CONV_BIAS_DIRECT_AVX2_INT8_STRIDE2"));
  388. for (auto&& arg : args) {
  389. checker.set_param(arg.param).exec(
  390. {arg.src, arg.filter, arg.bias, {}, {}});
  391. }
  392. }
  393. TEST_F(X86_MULTI_THREADS, AVX2_CONV_BIAS_DIRECT_STRIDE2_QuantizedS32) {
  394. using namespace conv_bias;
  395. std::vector<TestArg> args;
  396. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  397. size_t p, NonlineMode nonline_mode) {
  398. if (w + 2 * p < kernel || h + 2 * p < kernel)
  399. return;
  400. param::ConvBias param;
  401. param.stride_h = 2;
  402. param.stride_w = 2;
  403. param.pad_h = p;
  404. param.pad_w = p;
  405. param.nonlineMode = nonline_mode;
  406. param.sparse = param::ConvBias::Sparse::DENSE;
  407. //! no bias
  408. args.emplace_back(param, TensorShape{2, ic, h, w},
  409. TensorShape{oc, ic, kernel, kernel}, TensorShape{});
  410. param.sparse = param::ConvBias::Sparse::GROUP;
  411. //! no bias
  412. args.emplace_back(param, TensorShape{2, 2 * ic, h, w},
  413. TensorShape{2, oc / 2, ic, kernel, kernel},
  414. TensorShape{});
  415. };
  416. for (size_t kernel : {2, 3, 5, 7})
  417. for (size_t pad : {0, 1, 3, 5})
  418. for (size_t oc : {4, 8, 13, 16, 24})
  419. for (size_t ic : {2, 3, 7, 10})
  420. for (size_t h : {10, 11})
  421. for (size_t w : {8, 10, 19})
  422. for (NonlineMode nonline_mode :
  423. {NonlineMode::IDENTITY})
  424. run(oc, ic, w, h, kernel, pad, nonline_mode);
  425. Checker<ConvBias> checker(handle());
  426. UniformIntRNG rng{-50, 50};
  427. checker.set_dtype(0, dtype::QuantizedS8(2.5f))
  428. .set_dtype(1, dtype::QuantizedS8(2.5f))
  429. .set_dtype(2, dtype::QuantizedS32(6.25f))
  430. .set_dtype(4, {})
  431. .set_rng(0, &rng)
  432. .set_rng(1, &rng)
  433. .set_rng(2, &rng)
  434. .set_epsilon(1e-3);
  435. checker.set_before_exec_callback(
  436. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  437. "X86_CONV_BIAS_DIRECT_AVX2_INT8_STRIDE2"));
  438. for (auto&& arg : args) {
  439. checker.set_param(arg.param).exec(
  440. {arg.src, arg.filter, arg.bias, {}, {}});
  441. }
  442. }
  443. TEST_F(X86_MULTI_THREADS, AVX2_CONV_BIAS_DIRECT_STRIDE2_S8S8S8) {
  444. using namespace conv_bias;
  445. std::vector<TestArg> args;
  446. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  447. size_t p, NonlineMode nonline_mode) {
  448. if (w + 2 * p < kernel || h + 2 * p < kernel)
  449. return;
  450. param::ConvBias param;
  451. param.stride_h = 2;
  452. param.stride_w = 2;
  453. param.pad_h = p;
  454. param.pad_w = p;
  455. param.nonlineMode = nonline_mode;
  456. param.sparse = param::ConvBias::Sparse::DENSE;
  457. //! no bias
  458. args.emplace_back(param, TensorShape{1, ic, h, w},
  459. TensorShape{oc, ic, kernel, kernel}, TensorShape{});
  460. //! bias channel
  461. args.emplace_back(param, TensorShape{1, ic, h, w},
  462. TensorShape{oc, ic, kernel, kernel},
  463. TensorShape{1, oc, 1, 1});
  464. param.sparse = param::ConvBias::Sparse::GROUP;
  465. //! no bias
  466. args.emplace_back(param, TensorShape{2, 2 * ic, h, w},
  467. TensorShape{2, oc / 2, ic, kernel, kernel},
  468. TensorShape{});
  469. //! bias channel
  470. args.emplace_back(param, TensorShape{2, 2 * ic, h, w},
  471. TensorShape{2, oc / 2, ic, kernel, kernel},
  472. TensorShape{1, oc, 1, 1});
  473. };
  474. for (size_t kernel : {2, 3, 5, 7})
  475. for (size_t pad : {0, 1, 3, 5})
  476. for (size_t oc : {4, 8, 14, 16, 24})
  477. for (size_t ic : {2, 3, 7, 10})
  478. for (size_t h : {10, 11})
  479. for (size_t w : {8, 10, 18})
  480. for (NonlineMode nonline_mode :
  481. {NonlineMode::IDENTITY, NonlineMode::RELU,
  482. NonlineMode::H_SWISH})
  483. run(oc, ic, w, h, kernel, pad, nonline_mode);
  484. Checker<ConvBias> checker(handle());
  485. UniformIntRNG rng{-50, 50};
  486. checker.set_dtype(0, dtype::QuantizedS8(2.5f))
  487. .set_dtype(1, dtype::QuantizedS8(2.5f))
  488. .set_dtype(2, dtype::QuantizedS32(6.25f))
  489. .set_dtype(4, dtype::QuantizedS8(60.25f))
  490. .set_rng(0, &rng)
  491. .set_rng(1, &rng)
  492. .set_rng(2, &rng)
  493. .set_epsilon(1e-3);
  494. checker.set_before_exec_callback(
  495. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  496. "X86_CONV_BIAS_DIRECT_AVX2_INT8_STRIDE2"));
  497. for (auto&& arg : args) {
  498. checker.set_param(arg.param).exec(
  499. {arg.src, arg.filter, arg.bias, {}, {}});
  500. }
  501. }
  502. TEST_F(X86_MULTI_THREADS, CONV_BIAS_DIRECT_STRIDE1_SMALL_GROUP) {
  503. using namespace conv_bias;
  504. std::vector<TestArg> args;
  505. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  506. size_t p, NonlineMode nonline_mode) {
  507. if (w + 2 * p < kernel || h + 2 * p < kernel)
  508. return;
  509. param::ConvBias param;
  510. param.stride_h = 1;
  511. param.stride_w = 1;
  512. param.pad_h = p;
  513. param.pad_w = p;
  514. param.nonlineMode = nonline_mode;
  515. //! no bias
  516. args.emplace_back(param, TensorShape{1, ic, h, w},
  517. TensorShape{oc, ic, kernel, kernel}, TensorShape{});
  518. //! bias channel
  519. args.emplace_back(param, TensorShape{2, ic, h, w},
  520. TensorShape{oc, ic, kernel, kernel},
  521. TensorShape{1, oc, 1, 1});
  522. //! bias
  523. args.emplace_back(param, TensorShape{2, ic, h, w},
  524. TensorShape{oc, ic, kernel, kernel},
  525. TensorShape{2, oc, (h + param.pad_h * 2 - kernel) + 1,
  526. (w + param.pad_w * 2 - kernel) + 1});
  527. };
  528. for (size_t kernel : {1, 2, 3, 4, 5, 6, 7})
  529. for (size_t ic : {1, 4, 8, 16})
  530. for (size_t oc : {1, 4, 8})
  531. for (size_t p : {0, 2})
  532. for (size_t size : {20, 21, 24})
  533. for (NonlineMode nonline_mode :
  534. {NonlineMode::RELU, NonlineMode::SIGMOID,
  535. NonlineMode::H_SWISH, NonlineMode::IDENTITY}) {
  536. run(oc, ic, size, size, kernel, p, nonline_mode);
  537. }
  538. Checker<ConvBias> checker(handle());
  539. UniformIntRNG rng{-50, 50};
  540. checker.set_dtype(0, dtype::Float32())
  541. .set_dtype(1, dtype::Float32())
  542. .set_dtype(2, dtype::Float32())
  543. .set_rng(0, &rng)
  544. .set_rng(1, &rng)
  545. .set_rng(2, &rng);
  546. checker.set_before_exec_callback(
  547. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  548. "X86_CONV_BIAS_DIRECT_STRIDE1_SMALL_GROUP"));
  549. for (auto&& arg : args) {
  550. checker.set_param(arg.param).exec(
  551. {arg.src, arg.filter, arg.bias, {}, {}});
  552. }
  553. }
  554. TEST_F(X86_MULTI_THREADS, CONV_BIAS_DIRECT_STRIDE1_LARGE_GROUP) {
  555. using namespace conv_bias;
  556. std::vector<TestArg> args;
  557. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  558. size_t p, NonlineMode nonline_mode) {
  559. if (w + 2 * p < kernel || h + 2 * p < kernel)
  560. return;
  561. param::ConvBias param;
  562. param.stride_h = 1;
  563. param.stride_w = 1;
  564. param.pad_h = p;
  565. param.pad_w = p;
  566. param.nonlineMode = nonline_mode;
  567. //! no bias
  568. args.emplace_back(param, TensorShape{1, ic, h, w},
  569. TensorShape{oc, ic, kernel, kernel}, TensorShape{});
  570. //! bias channel
  571. args.emplace_back(param, TensorShape{2, ic, h, w},
  572. TensorShape{oc, ic, kernel, kernel},
  573. TensorShape{1, oc, 1, 1});
  574. //! bias
  575. args.emplace_back(param, TensorShape{2, ic, h, w},
  576. TensorShape{oc, ic, kernel, kernel},
  577. TensorShape{2, oc, (h + param.pad_h * 2 - kernel) + 1,
  578. (w + param.pad_w * 2 - kernel) + 1});
  579. };
  580. for (size_t kernel : {1, 2, 3, 4, 5, 6, 7})
  581. for (size_t ic : {1, 4, 8, 16})
  582. for (size_t oc : {1, 4, 8})
  583. for (size_t p : {0, 2})
  584. for (size_t size : {20, 21, 24})
  585. for (NonlineMode nonline_mode :
  586. {NonlineMode::RELU, NonlineMode::SIGMOID,
  587. NonlineMode::H_SWISH, NonlineMode::IDENTITY}) {
  588. run(oc, ic, size, size, kernel, p, nonline_mode);
  589. }
  590. Checker<ConvBias> checker(handle());
  591. UniformIntRNG rng{-50, 50};
  592. checker.set_dtype(0, dtype::Float32())
  593. .set_dtype(1, dtype::Float32())
  594. .set_dtype(2, dtype::Float32())
  595. .set_rng(0, &rng)
  596. .set_rng(1, &rng)
  597. .set_rng(2, &rng);
  598. checker.set_before_exec_callback(
  599. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  600. "X86_CONV_BIAS_DIRECT_STRIDE1_LARGE_GROUP"));
  601. for (auto&& arg : args) {
  602. checker.set_param(arg.param).exec(
  603. {arg.src, arg.filter, arg.bias, {}, {}});
  604. }
  605. }
//! Stride-2 fp32 direct conv_bias: runs the same argument set through both
//! the SMALL_GROUP and the LARGE_GROUP direct algorithms.
TEST_F(X86_MULTI_THREADS, CONV_BIAS_DIRECT_STRIDE2) {
    using namespace conv_bias;
    std::vector<TestArg> args;
    //! append one no-bias case; skip combinations where the padded input is
    //! smaller than the kernel (invalid convolution)
    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
                   size_t p, NonlineMode nonline_mode) {
        if (w + 2 * p < kernel || h + 2 * p < kernel)
            return;
        param::ConvBias param;
        param.stride_h = 2;
        param.stride_w = 2;
        param.pad_h = p;
        param.pad_w = p;
        param.nonlineMode = nonline_mode;
        //! no bias
        args.emplace_back(param, TensorShape{1, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel}, TensorShape{});
    };
    for (size_t kernel : {2, 3, 5, 7})
        for (size_t ic : {1, 4, 8, 16})
            for (size_t oc : {1, 4, 8})
                for (size_t p : {0, 2})
                    for (size_t size : {20, 21, 24})
                        for (NonlineMode nonline_mode :
                             {NonlineMode::RELU, NonlineMode::SIGMOID,
                              NonlineMode::H_SWISH, NonlineMode::IDENTITY}) {
                            run(oc, ic, size, size, kernel, p, nonline_mode);
                        }
    Checker<ConvBias> checker(handle());
    UniformIntRNG rng{-50, 50};
    checker.set_dtype(0, dtype::Float32())
            .set_dtype(1, dtype::Float32())
            .set_dtype(2, dtype::Float32())
            .set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &rng);
    //! first pass: force the small-group direct algorithm
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "X86_CONV_BIAS_DIRECT_STRIDE2_SMALL_GROUP"));
    for (auto&& arg : args) {
        checker.set_param(arg.param).exec(
                {arg.src, arg.filter, arg.bias, {}, {}});
    }
    //! second pass: force the large-group direct algorithm
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "X86_CONV_BIAS_DIRECT_STRIDE2_LARGE_GROUP"));
    for (auto&& arg : args) {
        checker.set_param(arg.param).exec(
                {arg.src, arg.filter, arg.bias, {}, {}});
    }
}
//! im2col+matmul int8x8x32 conv_bias: checks every available int8 matmul
//! backend (mkl-dnn / VNNI / AVX2 / SSE) in both plain-int and quantized
//! dtype configurations.
TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32) {
    using namespace conv_bias;
    std::vector<TestArg> args;
    //! append one no-bias case; skip invalid (kernel larger than padded
    //! input) combinations
    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
                   size_t p, NonlineMode nonline_mode) {
        if (w + 2 * p < kernel || h + 2 * p < kernel)
            return;
        param::ConvBias param;
        param.stride_h = 1;
        param.stride_w = 1;
        param.pad_h = p;
        param.pad_w = p;
        param.nonlineMode = nonline_mode;
        //! no bias
        args.emplace_back(param, TensorShape{1, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel}, TensorShape{});
    };
    for (size_t kernel : {2, 3, 4, 5, 6, 7})
        for (size_t ic : {1, 4, 8, 16})
            for (size_t oc : {1, 4, 8})
                for (size_t p : {0, 2})
                    for (size_t size : {20, 21, 24})
                        for (NonlineMode nonline_mode :
                             {NonlineMode::IDENTITY}) {
                            run(oc, ic, size, size, kernel, p, nonline_mode);
                        }
    //! test OC block
    run(2046, 1, 8, 8, 2, 0, NonlineMode::IDENTITY);
    Checker<ConvBias> checker(handle());
    UniformIntRNG rng{-50, 50};
//! run all args twice per algorithm: once with plain Int8/Int32 dtypes and
//! once with QuantizedS8/QuantizedS32 dtypes (no comments inside the macro:
//! a // comment before the continuation backslash would break it)
#define cb(algo_name)                                                          \
    checker.set_before_exec_callback(                                          \
            conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name));              \
    checker.set_dtype(0, dtype::Int8());                                       \
    checker.set_dtype(1, dtype::Int8());                                       \
    checker.set_dtype(2, dtype::Int32());                                      \
    checker.set_dtype(4, dtype::Int32());                                      \
    for (auto&& arg : args) {                                                  \
        checker.set_param(arg.param).execs({arg.src, arg.filter, {}, {}, {}}); \
    }                                                                          \
    for (auto&& arg : args) {                                                  \
        checker.set_dtype(0, dtype::QuantizedS8(2.5f))                         \
                .set_dtype(1, dtype::QuantizedS8(2.5f))                        \
                .set_dtype(2, dtype::QuantizedS32(6.25f))                      \
                .set_dtype(4, {})                                              \
                .set_rng(0, &rng)                                              \
                .set_rng(1, &rng)                                              \
                .set_rng(2, &rng)                                              \
                .set_param(arg.param)                                          \
                .execs({arg.src, arg.filter, {}, {}, {}});                     \
    }
#if MEGDNN_X86_WITH_MKL_DNN
    if (megdnn::x86::is_supported(x86::SIMDType::VNNI)) {
        cb("IM2COLMATMUL:X86_INT8X8X32_MKLDNN");
    }
#endif
#if MEGDNN_X86_WITH_VNNI
    if (megdnn::x86::is_supported(x86::SIMDType::VNNI)) {
        cb("IM2COLMATMUL:X86_INT8X8X32_VNNI");
    }
#endif
    if (megdnn::x86::is_supported(x86::SIMDType::AVX2)) {
        cb("IM2COLMATMUL:X86_INT8X8X32_AVX2_2X4X16");
        cb("IM2COLMATMUL:X86_INT8X8X32_AVX2_4X16X2");
    }
    if (::megdnn::x86::is_supported(::megdnn::x86::SIMDType::SSE4_2)) {
        cb("IM2COLMATMUL:X86_INT8X8X32_SSE_4X8X2");
    }
#undef cb
}
//! im2col+matmul fp32 conv_bias against the BLAS matmul backend, covering
//! no-bias, per-channel-bias and full-bias cases.
TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP32) {
    using namespace conv_bias;
    std::vector<TestArg> args;
    //! append no-bias, channel-bias and full-bias variants of one shape;
    //! skip invalid (kernel larger than padded input) combinations
    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
                   size_t p, NonlineMode nonline_mode) {
        if (w + 2 * p < kernel || h + 2 * p < kernel)
            return;
        param::ConvBias param;
        param.stride_h = 1;
        param.stride_w = 1;
        param.pad_h = p;
        param.pad_w = p;
        param.nonlineMode = nonline_mode;
        //! no bias
        args.emplace_back(param, TensorShape{1, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel}, TensorShape{});
        args.emplace_back(param, TensorShape{1, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel},
                          TensorShape{1, oc, 1, 1});
        args.emplace_back(
                param, TensorShape{1, ic, h, w},
                TensorShape{oc, ic, kernel, kernel},
                TensorShape{1, oc, (h + 2 * p - kernel) / param.stride_h + 1,
                            (w + 2 * p - kernel) / param.stride_w + 1});
    };
    for (size_t kernel : {2, 3, 4, 5, 6, 7})
        for (size_t ic : {1, 4, 8, 16})
            for (size_t oc : {1, 4, 8, 16, 300})
                for (size_t p : {0, 2})
                    for (size_t size : {8, 24})
                        for (NonlineMode nonline_mode :
                             {NonlineMode::IDENTITY, NonlineMode::RELU}) {
                            run(oc, ic, size, size, kernel, p, nonline_mode);
                        }
    //! large-OC case to exercise OC blocking
    run(2046, 8, 20, 20, 3, 1, NonlineMode::IDENTITY);
    Checker<ConvBias> checker(handle());
//! run all args with the given forced algorithm
#define cb(algo_name)                                             \
    checker.set_before_exec_callback(                             \
            conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name)); \
    for (auto&& arg : args) {                                     \
        checker.set_param(arg.param).execs(                       \
                {arg.src, arg.filter, arg.bias, {}, {}});         \
    }
#if MEGDNN_X86_WITH_MKL || MEGDNN_X86_WITH_OPENBLAS
    cb("IM2COLMATMUL:X86_F32_BLAS");
#endif
#undef cb
}
  774. #if MEGDNN_X86_WITH_MKL && SUPPORT_MKL_PACKED_GEMM
//! im2col+matmul fp32 with MKL packed-GEMM (pack-A) backend; also covers
//! GROUP sparse convolutions in addition to the dense cases.
TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP32_PACKA) {
    using namespace conv_bias;
    std::vector<TestArg> args;
    //! append dense (no/channel/full bias) and 2-group (no/channel/full
    //! bias) variants of one shape
    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
                   size_t p, NonlineMode nonline_mode) {
        if (w + 2 * p < kernel || h + 2 * p < kernel)
            return;
        param::ConvBias param;
        param.stride_h = 1;
        param.stride_w = 1;
        param.pad_h = p;
        param.pad_w = p;
        param.nonlineMode = nonline_mode;
        //! no bias
        args.emplace_back(param, TensorShape{1, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel}, TensorShape{});
        args.emplace_back(param, TensorShape{1, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel},
                          TensorShape{1, oc, 1, 1});
        args.emplace_back(
                param, TensorShape{1, ic, h, w},
                TensorShape{oc, ic, kernel, kernel},
                TensorShape{1, oc, (h + 2 * p - kernel) / param.stride_h + 1,
                            (w + 2 * p - kernel) / param.stride_w + 1});
        //! group = 2 variants of the same shape
        param.sparse = param::ConvBias::Sparse::GROUP;
        args.emplace_back(param, TensorShape{1, 2 * ic, h, w},
                          TensorShape{2, oc, ic, kernel, kernel},
                          TensorShape{});
        args.emplace_back(param, TensorShape{1, 2 * ic, h, w},
                          TensorShape{2, oc, ic, kernel, kernel},
                          TensorShape{1, oc * 2, 1, 1});
        args.emplace_back(
                param, TensorShape{1, 2 * ic, h, w},
                TensorShape{2, oc, ic, kernel, kernel},
                TensorShape{1, 2 * oc, (h + 2 * param.pad_h - kernel) / 1 + 1,
                            (w + 2 * param.pad_w - kernel) / 1 + 1});
    };
    for (size_t kernel : {2, 3, 4, 5, 6, 7})
        for (size_t ic : {1, 4, 8, 16})
            for (size_t oc : {1, 4, 8, 16})
                for (size_t p : {0, 1})
                    for (size_t size : {8, 24})
                        for (NonlineMode nonline_mode :
                             {NonlineMode::IDENTITY, NonlineMode::RELU}) {
                            run(oc, ic, size, size, kernel, p, nonline_mode);
                        }
    //! large-OC case to exercise OC blocking
    run(2046, 8, 20, 20, 3, 1, NonlineMode::IDENTITY);
    Checker<ConvBias> checker(handle());
//! run all args with the given forced algorithm
#define cb(algo_name)                                             \
    checker.set_before_exec_callback(                             \
            conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name)); \
    for (auto&& arg : args) {                                     \
        checker.set_param(arg.param).execs(                       \
                {arg.src, arg.filter, arg.bias, {}, {}});         \
    }
    cb("IM2COLMATMUL:X86_F32_MKL_PACKA:192");
#undef cb
}
  833. /**************************** Conv1x1 PackA *************************/
  834. namespace {
  835. void checker_conv_bias(std::vector<conv_bias::TestArg> args, Handle* handle,
  836. RNG* rng, float epsilon, DType type0, DType type1,
  837. DType type2, DType type3, const char* algo_name) {
  838. using namespace conv_bias;
  839. Checker<ConvBias> checker(handle);
  840. checker.set_before_exec_callback(
  841. conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name));
  842. checker.set_dtype(0, type0);
  843. checker.set_dtype(1, type1);
  844. checker.set_dtype(2, type2);
  845. checker.set_dtype(4, type3);
  846. checker.set_epsilon(epsilon);
  847. if (NULL != rng) {
  848. checker.set_rng(0, rng).set_rng(1, rng).set_rng(2, rng).set_rng(3, rng);
  849. }
  850. for (auto&& arg : args) {
  851. checker.set_param(arg.param).execs(
  852. {arg.src, arg.filter, arg.bias, {}, {}});
  853. }
  854. }
  855. } // namespace
  856. #if MEGDNN_X86_WITH_MKL
  857. TEST_F(X86_MULTI_THREADS, CONV_BIAS_CONV1X1_S1_FP32_PACKA) {
  858. using namespace conv_bias;
  859. std::vector<conv_bias::TestArg> args = get_conv_bias_1x1_args(false, false);
  860. check_conv_bias(args, handle(), "CONV1x1:X86_F32_MKL_PACKA:24");
  861. }
  862. TEST_F(X86_MULTI_THREADS, CONV_BIAS_CONV1X1_S1_FP32_BLAS) {
  863. using namespace conv_bias;
  864. std::vector<conv_bias::TestArg> args = get_conv_bias_1x1_args(false, false);
  865. check_conv_bias(args, handle(), "CONV1x1:X86_F32_BLAS:48");
  866. }
  867. #endif
//! 1x1 stride-1 int8x8x32 conv_bias: checks each int8 matmul backend that
//! the build configuration and the runtime CPU support.
TEST_F(X86_MULTI_THREADS, CONV_BIAS_CONV1X1_S1_INT8X8X32) {
    using namespace conv_bias;
    UniformIntRNG rng{-50, 50};
    float epsilon = 0.001;
    std::vector<conv_bias::TestArg> args = get_conv_bias_1x1_args(true, true);
#if MEGDNN_X86_WITH_MKL_DNN
    //! NOTE(review): gates the mkl-dnn backend on VNNI support, same as the
    //! IM2COLMATMUL_INT8x8x32 test above — confirm VNNI is really the
    //! intended requirement for the mkl-dnn path
    if (x86::is_supported(x86::SIMDType::VNNI)) {
        checker_conv_bias(args, handle(), &rng, epsilon, dtype::Int8{},
                          dtype::Int8{}, dtype::Int32{}, dtype::Int32{},
                          "CONV1x1:X86_INT8X8X32_MKLDNN:24");
    }
#endif
#if MEGDNN_X86_WITH_VNNI
    if (x86::is_supported(x86::SIMDType::VNNI)) {
        checker_conv_bias(args, handle(), &rng, epsilon, dtype::Int8{},
                          dtype::Int8{}, dtype::Int32{}, dtype::Int32{},
                          "CONV1x1:X86_INT8X8X32_VNNI:24");
    }
#endif
    if (x86::is_supported(x86::SIMDType::AVX2)) {
        checker_conv_bias(args, handle(), &rng, epsilon, dtype::Int8{},
                          dtype::Int8{}, dtype::Int32{}, dtype::Int32{},
                          "CONV1x1:X86_INT8X8X32_AVX2_4X16X2:24");
        checker_conv_bias(args, handle(), &rng, epsilon, dtype::Int8{},
                          dtype::Int8{}, dtype::Int32{}, dtype::Int32{},
                          "CONV1x1:X86_INT8X8X32_AVX2_2X4X16:24");
    }
    //! SSE fallback is always run (no runtime feature gate)
    checker_conv_bias(args, handle(), &rng, epsilon, dtype::Int8{},
                      dtype::Int8{}, dtype::Int32{}, dtype::Int32{},
                      "CONV1x1:X86_INT8X8X32_SSE_4X8X2:48");
}
  899. /************************* End Conv1x1 PackA ************************/
  900. #endif
//! im2col+matmul quantized-int8 conv_bias: QuantizedS8 in/out with
//! QuantizedS32 accumulation, over the available int8 matmul backends.
TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QINT8) {
    using namespace conv_bias;
    std::vector<TestArg> args;
    //! append a no-bias and a channel-bias case per shape; skip invalid
    //! (kernel larger than padded input) combinations
    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
                   size_t p, NonlineMode nonline_mode) {
        if (w + 2 * p < kernel || h + 2 * p < kernel)
            return;
        param::ConvBias param;
        param.stride_h = 1;
        param.stride_w = 1;
        param.pad_h = p;
        param.pad_w = p;
        param.nonlineMode = nonline_mode;
        //! no bias
        args.emplace_back(param, TensorShape{1, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel}, TensorShape{});
        //! bias channel
        args.emplace_back(param, TensorShape{2, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel},
                          TensorShape{1, oc, 1, 1});
    };
    for (size_t kernel : {2, 3, 4, 5, 6, 7})
        for (size_t ic : {1, 4, 8, 16})
            for (size_t oc : {1, 4, 8})
                for (size_t p : {0, 2})
                    for (size_t size : {20, 21, 24})
                        for (NonlineMode nonline_mode :
                             {NonlineMode::IDENTITY, NonlineMode::RELU,
                              NonlineMode::H_SWISH}) {
                            run(oc, ic, size, size, kernel, p, nonline_mode);
                        }
    //! large-OC case to exercise OC blocking
    run(2046, 8, 20, 20, 3, 1, NonlineMode::IDENTITY);
    Checker<ConvBias> checker(handle());
//! run all args with quantized dtypes and the given forced algorithm
#define cb(algo_name)                                             \
    checker.set_before_exec_callback(                             \
            conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name)); \
    UniformIntRNG rng{-50, 50};                                   \
    for (auto&& arg : args) {                                     \
        checker.set_dtype(0, dtype::QuantizedS8(2.5f))            \
                .set_dtype(1, dtype::QuantizedS8(2.5f))           \
                .set_dtype(2, dtype::QuantizedS32(6.25f))         \
                .set_dtype(4, dtype::QuantizedS8(60.25))          \
                .set_rng(0, &rng)                                 \
                .set_rng(1, &rng)                                 \
                .set_rng(2, &rng)                                 \
                .set_param(arg.param)                             \
                .execs({arg.src, arg.filter, {}, {}, {}});        \
    }
#if MEGDNN_X86_WITH_MKL_DNN
    if (x86::is_supported(x86::SIMDType::VNNI)) {
        cb("IM2COLMATMUL:X86_INT8X8X32_MKLDNN");
    }
#endif
#if MEGDNN_X86_WITH_VNNI
    if (x86::is_supported(x86::SIMDType::VNNI)) {
        cb("IM2COLMATMUL:X86_INT8X8X32_VNNI");
    }
#endif
    if (x86::is_supported(x86::SIMDType::AVX2)) {
        cb("IM2COLMATMUL:X86_INT8X8X32_AVX2_2X4X16");
    }
#undef cb
}
//! fp32 conv_bias via the plain matmul algorithm, covering dense (no bias /
//! channel bias / full bias) and 2-group sparse cases.
TEST_F(X86, CONV_BIAS_MATMUL) {
    using namespace conv_bias;
    std::vector<TestArg> args;
    //! append dense and group variants of one shape; skip invalid
    //! (kernel larger than padded input) combinations
    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
                   size_t p, NonlineMode nonline_mode) {
        if (w + 2 * p < kernel || h + 2 * p < kernel)
            return;
        param::ConvBias param;
        param.stride_h = 1;
        param.stride_w = 1;
        param.pad_h = p;
        param.pad_w = p;
        param.nonlineMode = nonline_mode;
        //! no bias
        param.sparse = param::ConvBias::Sparse::DENSE;
        args.emplace_back(param, TensorShape{1, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel}, TensorShape{});
        //! bias channel
        args.emplace_back(param, TensorShape{2, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel},
                          TensorShape{1, oc, 1, 1});
        //! bias
        args.emplace_back(param, TensorShape{2, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel},
                          TensorShape{2, oc, (h + param.pad_h * 2 - kernel) + 1,
                                      (w + param.pad_w * 2 - kernel) + 1});
        //! group
        param.sparse = param::ConvBias::Sparse::GROUP;
        args.emplace_back(
                param, TensorShape{2, 2 * ic, h, w},
                TensorShape{2, oc, ic, kernel, kernel},
                TensorShape{2, 2 * oc, (h + param.pad_h * 2 - kernel) + 1,
                            (w + param.pad_w * 2 - kernel) + 1});
    };
    for (size_t kernel : {2, 3, 5, 7})
        for (size_t ic : {1, 2, 3, 4})
            for (size_t oc : {1, 2, 3, 4})
                for (size_t p : {0, 2})
                    for (size_t size : {20, 21, 22, 23, 24})
                        for (NonlineMode nonline_mode :
                             {NonlineMode::RELU, NonlineMode::SIGMOID,
                              NonlineMode::H_SWISH, NonlineMode::IDENTITY}) {
                            run(oc, ic, size, size, kernel, p, nonline_mode);
                        }
    Checker<ConvBias> checker(handle());
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "X86_CONV_BIAS_MATMUL"));
    checker.set_epsilon(1);
    UniformIntRNG rng{-50, 50};
    checker.set_dtype(0, dtype::Float32())
            .set_dtype(1, dtype::Float32())
            .set_dtype(2, dtype::Float32())
            .set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &rng);
    for (auto&& arg : args) {
        checker.set_param(arg.param).exec(
                {arg.src, arg.filter, arg.bias, {}, {}});
    }
}
  1025. #if MEGDNN_WITH_BENCHMARK
  1026. #if MEGDNN_X86_WITH_MKL_DNN
  1027. static void x86_benchmark_fp32_mkldnn(Handle* handle) {
  1028. constexpr size_t RUNS = 30;
  1029. param::ConvBias param;
  1030. Benchmarker<ConvBias> benchmarker_mkldnn(handle);
  1031. benchmarker_mkldnn.set_display(false).set_times(RUNS);
  1032. benchmarker_mkldnn.set_before_exec_callback(
  1033. AlgoChecker<ConvBias>("MKLDNN_CONV_FP32"));
  1034. Benchmarker<ConvBias> benchmarker_im2col(handle);
  1035. benchmarker_im2col.set_display(false).set_times(RUNS);
  1036. benchmarker_im2col.set_before_exec_callback(
  1037. AlgoChecker<ConvBias>("IM2COLMATMUL.+"));
  1038. auto run = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W,
  1039. size_t FS, size_t SZ, size_t GROUP = 1) {
  1040. TensorShape src({N, IC, H, W}), filter({OC, IC, FS, FS}),
  1041. bias({1, OC, 1, 1}), z({}), dst({N, OC, H / SZ, W / SZ});
  1042. param.pad_h = FS / 2;
  1043. param.pad_w = FS / 2;
  1044. param.stride_h = SZ;
  1045. param.stride_w = SZ;
  1046. param.format = param::ConvBias::Format::NCHW;
  1047. param.sparse = param::ConvBias::Sparse::DENSE;
  1048. if (GROUP > 1) {
  1049. param.sparse = param::ConvBias::Sparse::GROUP;
  1050. filter = {GROUP, OC / GROUP, IC / GROUP, FS, FS};
  1051. }
  1052. auto im2col_used = benchmarker_im2col.set_param(param).exec(
  1053. {src, filter, bias, z, dst}) /
  1054. RUNS;
  1055. src = IC < 8 ? TensorShape{N, IC, H, W}
  1056. : TensorShape{N, IC / 8, H, W, 8};
  1057. filter = IC < 8 ? TensorShape{OC / 8, FS, FS, IC, 8}
  1058. : TensorShape{OC / 8, IC / 8, FS, FS, 8, 8};
  1059. if (GROUP > 1 && OC == GROUP && IC == GROUP) {
  1060. filter = {GROUP / 8, 1, 1, FS, FS, 8};
  1061. } else if (GROUP > 1 && OC / GROUP % 8 == 0 && IC / GROUP % 8 == 0) {
  1062. filter = {GROUP, OC / GROUP / 8, IC / GROUP / 8, FS, FS, 8, 8};
  1063. }
  1064. bias = {1, OC / 8, 1, 1, 8};
  1065. z = {};
  1066. dst = {N, OC / 8, H / SZ, W / SZ, 8};
  1067. param.format = param::ConvBias::Format::NCHW88;
  1068. auto mkldnn_used = benchmarker_mkldnn.set_param(param).exec(
  1069. {src, filter, bias, z, dst}) /
  1070. RUNS;
  1071. float computations =
  1072. (IC / GROUP * FS * FS + 1) * dst.total_nr_elems() * 2 * 1e-6;
  1073. std::cout << "run " << src.to_string() << " " << filter.to_string()
  1074. << " " << bias.to_string() << " " << dst.to_string()
  1075. << std::endl;
  1076. std::cout << "im2col: " << im2col_used << " ms, "
  1077. << (computations / im2col_used) << " Gops, ";
  1078. std::cout << "mkldnn: " << mkldnn_used << " ms, "
  1079. << (computations / mkldnn_used) << " Gops, "
  1080. << "spped up: " << (im2col_used / mkldnn_used) << ", ";
  1081. std::cout << std::endl;
  1082. };
  1083. run(1, 64, 64, 56, 56, 3, 1);
  1084. run(1, 3, 64, 224, 224, 3, 1);
  1085. run(1, 3, 64, 224, 224, 7, 2);
  1086. run(1, 64, 64, 56, 56, 3, 1);
  1087. run(1, 128, 128, 28, 28, 3, 1);
  1088. run(1, 256, 256, 14, 14, 3, 1);
  1089. run(1, 512, 512, 7, 7, 3, 1);
  1090. run(1, 256, 64, 56, 56, 1, 1);
  1091. run(1, 512, 128, 28, 28, 1, 1);
  1092. run(1, 1024, 256, 14, 14, 1, 1);
  1093. run(1, 2048, 512, 7, 7, 1, 1);
  1094. run(1, 32, 32, 112, 112, 3, 1, 32);
  1095. run(1, 144, 144, 56, 56, 3, 1, 144);
  1096. run(1, 192, 192, 28, 28, 3, 1, 192);
  1097. run(1, 384, 384, 28, 28, 3, 1, 384);
  1098. run(1, 576, 576, 14, 14, 3, 1, 576);
  1099. run(1, 960, 960, 7, 7, 3, 1, 960);
  1100. run(1, 256, 128, 56, 56, 1, 2, 1);
  1101. run(1, 512, 256, 28, 28, 1, 2, 1);
  1102. run(1, 1024, 512, 14, 14, 1, 2, 1);
  1103. run(1, 96, 96, 112, 112, 3, 2, 96);
  1104. run(1, 144, 144, 56, 56, 3, 2, 144);
  1105. run(1, 384, 384, 28, 28, 3, 2, 384);
  1106. run(1, 576, 576, 14, 14, 3, 2, 576);
  1107. }
//! single-thread run of the fp32 mkl-dnn benchmark
TEST_F(X86, BENCHMARK_CONVBIAS_FP32_MKLDNN) {
    x86_benchmark_fp32_mkldnn(handle());
}
//! multi-thread run of the fp32 mkl-dnn benchmark
TEST_F(X86_MULTI_THREADS, BENCHMARK_CONVBIAS_FP32_MKLDNN) {
    x86_benchmark_fp32_mkldnn(handle());
}
  1114. #endif
  1115. #endif
  1116. /************************* Winograd ****************************/
namespace {
//! Build NCHW88 3x3 winograd test cases (dense only; the GROUP variants are
//! kept commented out below) for the nonline-mode/ic/oc/size cross product,
//! plus one large-OC case for multi-thread OC-parallel coverage.
std::vector<conv_bias::TestArg> get_winograd_mk_nchw88_args() {
    std::vector<conv_bias::TestArg> args;
    param::ConvBias cur_param;
    cur_param.format = param::ConvBias::Format::NCHW88;
    using NLMode = param::ConvBias::NonlineMode;
    // clang-format off
    for (auto nlmode :
         {NLMode::IDENTITY, NLMode::RELU, NLMode::SIGMOID, NLMode::H_SWISH}) {
    for (size_t ic : {1, 2}) {
    for (size_t oc : {1, 2}) {
    for (size_t i : {9, 63}) {
        cur_param.mode = param::ConvBias::Mode::CROSS_CORRELATION;
        cur_param.nonlineMode = nlmode;
        cur_param.sparse = param::ConvBias::Sparse::DENSE;
        cur_param.pad_h = cur_param.pad_w = 1;
        //! channel-broadcast bias
        args.emplace_back(cur_param, TensorShape{1, ic, i, i, 8},
                          TensorShape{oc, ic, 3, 3, 8, 8},
                          TensorShape{1, oc, 1, 1, 8});
        //! no bias
        args.emplace_back(cur_param, TensorShape{1, ic, i, i, 8},
                          TensorShape{oc, ic, 3, 3, 8, 8},TensorShape{});
        //! bias
        args.emplace_back(cur_param, TensorShape{2, ic, i, i, 8},
                          TensorShape{oc, ic, 3, 3, 8, 8}, TensorShape{2, oc, i, i, 8});
        /*cur_param.sparse = param::ConvBias::Sparse::GROUP;
        args.emplace_back(cur_param, TensorShape{2, 2 * ic, i, i, 8},
                          TensorShape{2, oc, ic, 3, 3, 8, 8},
                          TensorShape{1, 2 * oc, 1, 1, 8});*/
    }}}
    // clang-format on
    //! test for multi-thread OC parallel
    cur_param.sparse = param::ConvBias::Sparse::DENSE;
    cur_param.pad_h = cur_param.pad_w = 1;
    args.emplace_back(cur_param, TensorShape{2, 1, 9, 9, 8},
                      TensorShape{128, 1, 3, 3, 8, 8},
                      TensorShape{1, 128, 1, 1, 8});
    /*cur_param.sparse = param::ConvBias::Sparse::GROUP;
    args.emplace_back(cur_param, TensorShape{2, 2, 9, 9, 8},
                      TensorShape{2, 128, 1, 3, 3, 8, 8},
                      TensorShape{1, 2 * 128, 1, 1, 8});*/
    }
    return args;
}
}  // namespace
  1161. TEST_F(X86_MULTI_THREADS, CONV_BIAS_WINOGRAD_NCHW88_F63) {
  1162. using namespace conv_bias;
  1163. std::vector<TestArg> args = get_winograd_mk_nchw88_args();
  1164. Checker<ConvBiasForward> checker(handle());
  1165. checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBias>(
  1166. ssprintf("WINOGRAD:X86_F32MK8_8X8:8:6").c_str()));
  1167. for (auto&& arg : args) {
  1168. checker.set_param(arg.param).execs(
  1169. {arg.src, arg.filter, arg.bias, {}, {}});
  1170. }
  1171. }
  1172. TEST_F(X86_MULTI_THREADS, CONV_BIAS_WINOGRAD_NCHW88_F23) {
  1173. using namespace conv_bias;
  1174. std::vector<TestArg> args = get_winograd_mk_nchw88_args();
  1175. Checker<ConvBiasForward> checker(handle());
  1176. checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBias>(
  1177. ssprintf("WINOGRAD:X86_F32MK8_8X8:8:2").c_str()));
  1178. for (auto&& arg : args) {
  1179. checker.set_param(arg.param).execs(
  1180. {arg.src, arg.filter, arg.bias, {}, {}});
  1181. }
  1182. }
//! Cross-checks the winograd path against an extra reference impl that does
//! an explicit WinogradFilterPreprocess followed by a NCHW88_WINOGRAD
//! conv_bias, for output block sizes 2 and 6.
TEST_F(X86_MULTI_THREADS, CONV_BIAS_WINOGRAD_WEIGHT_PREPROCESS) {
    using namespace conv_bias;
    std::vector<TestArg> args = get_winograd_mk_nchw88_args();
    Checker<ConvBiasForward> checker(handle());
    //! reference impl: preprocess the filter with output_block_size = m in
    //! MK8 format, then run conv_bias on the transformed filter; workspace
    //! bundle slot 0 holds the transformed filter, slot 1 the conv_bias
    //! workspace, slot 2 the preprocess workspace
    auto extra_impl = [](const TensorNDArray& tensors, uint32_t m,
                         param::ConvBias param, Handle* handle) {
        megdnn_assert(param.format == param::ConvBias::Format::NCHW88);
        auto winograd_preprocess_opr =
                handle->create_operator<WinogradFilterPreprocess>();
        winograd_preprocess_opr->param().output_block_size = m;
        winograd_preprocess_opr->param().format = param::MatrixMul::Format::MK8;
        TensorLayout filter_transform_layout;
        winograd_preprocess_opr->deduce_layout(tensors[1].layout,
                                               filter_transform_layout);
        size_t winograd_preprocess_workspace_in_bytes =
                winograd_preprocess_opr->get_workspace_in_bytes(
                        tensors[1].layout, filter_transform_layout);
        auto conv_bias_opr = handle->create_operator<ConvBias>();
        conv_bias_opr->param() = param;
        conv_bias_opr->param().format = param::ConvBias::Format::NCHW88_WINOGRAD;
        conv_bias_opr->param().output_block_size = m;
        size_t conv_bias_workspace_in_bytes =
                conv_bias_opr->get_workspace_in_bytes(
                        tensors[0].layout, filter_transform_layout,
                        tensors[2].layout, tensors[3].layout, tensors[4].layout,
                        nullptr);
        //! one malloc backs all three workspace slots; freed at the end
        WorkspaceBundle wb(nullptr, {filter_transform_layout.span().dist_byte(),
                                     conv_bias_workspace_in_bytes,
                                     winograd_preprocess_workspace_in_bytes});
        wb.set(malloc(wb.total_size_in_bytes()));
        TensorND filter_transform_tensor(wb.get(0),
                                         std::move(filter_transform_layout));
        winograd_preprocess_opr->exec(tensors[1], filter_transform_tensor,
                                      wb.get_workspace(2));
        conv_bias_opr->exec(tensors[0], filter_transform_tensor, tensors[2],
                            tensors[3], tensors[4], nullptr,
                            wb.get_workspace(1));
        free(wb.ptr());
    };
    //! run every arg against the reference impl for each output block size
    auto run = [&checker, &extra_impl](
                       Handle* handle, const std::vector<TestArg>& args,
                       const std::vector<size_t>& out_size, DType A_dtype,
                       DType B_dtype, DType C_dtype, DType D_dtype,
                       const float eps) {
        for (auto&& arg : args) {
            for (uint32_t m : out_size) {
                checker.set_extra_opr_impl(std::bind(extra_impl,
                                                     std::placeholders::_1, m,
                                                     arg.param, handle));
                checker.set_dtype(0, A_dtype)
                        .set_dtype(1, B_dtype)
                        .set_dtype(2, C_dtype)
                        .set_dtype(4, D_dtype)
                        .set_epsilon(eps)
                        .set_param(arg.param)
                        .execs({arg.src, arg.filter, arg.bias, {}, {}});
            }
        }
    };
    run(handle(), args, {2, 6}, dtype::Float32(), dtype::Float32(),
        dtype::Float32(), dtype::Float32(), 1e-3f);
}
  1245. /*********************************** End winograd ************************/
  1246. #if MEGDNN_X86_WITH_MKL_DNN
//! Execute one mkl-dnn NCHW88 fp32 correctness case.
//!
//! Silently skips combinations the NCHW88 layout cannot express: a group is
//! runnable either when each group's oc/ic are multiples of 8 (or ic == 3
//! for the first-layer case) or in the pure depthwise case oc == ic == group.
static void x86_correctness_fp32_mkldnn_run(
        Checker<ConvBias>& checker, UniformIntRNG& rng, Handle* handle,
        ConvBiasForward::BiasMode bias_mode,
        param::ConvBias::NonlineMode noline_mode, size_t n, size_t stride,
        size_t kernel, size_t oc, size_t ic, size_t h, size_t w, size_t group) {
    auto oc_per_group = oc / group;
    auto ic_per_group = ic / group;
    bool ok_group = oc_per_group % 8 == 0 && oc_per_group > 0 &&
                    (ic_per_group % 8 == 0 || ic_per_group == 3) &&
                    ic_per_group > 0;
    bool ok_depthwise = oc == ic && oc == group;
    if (!(ok_group || ok_depthwise)) {
        return;
    }
    //! SAME padding
    size_t pad = kernel / 2;
    size_t kernel_h = kernel;
    size_t kernel_w = kernel;
    param::ConvBias param;
    param.format = param::ConvBias::Format::NCHW88;
    param.stride_h = stride;
    param.stride_w = stride;
    param.pad_h = pad;
    param.pad_w = pad;
    param.nonlineMode = noline_mode;
    //! ic == 3 keeps a plain NCHW src (first-layer special case)
    auto src_tensor_shape = TensorShape{n, ic / 8, h, w, 8};
    if (ic == 3) {
        src_tensor_shape = TensorShape{n, ic, h, w};
    }
    auto weight_tensor_shape =
            TensorShape{oc / 8, ic / 8, kernel_h, kernel_w, 8, 8};
    if (ic == 3) {
        weight_tensor_shape = TensorShape{oc / 8, kernel_h, kernel_w, ic, 8};
    }
    auto bias_tensor_shape = TensorShape{};
    if (bias_mode == megdnn::BiasMode::BROADCAST_CHANNEL_BIAS) {
        bias_tensor_shape = {1, oc / 8, 1, 1, 8};
    } else if (bias_mode == megdnn::BiasMode::BIAS) {
        //! full bias must match the deduced dst layout
        TensorLayout dst_layout;
        auto ConvBiasOp = handle->create_operator<ConvBias>();
        ConvBiasOp->param() = param;
        ConvBiasOp->deduce_layout({src_tensor_shape, dtype::Float32()},
                                  {weight_tensor_shape, dtype::Float32()}, {},
                                  {}, dst_layout);
        bias_tensor_shape = dst_layout;
    }
    //! pick sparse mode and the matching blocked weight layout
    if (group == 1) {
        param.sparse = param::ConvBias::Sparse::DENSE;
    } else if (group > 1 && ic / group == 1 && oc / group == 1) {
        //! depthwise
        param.sparse = param::ConvBias::Sparse::GROUP;
        weight_tensor_shape =
                TensorShape{group / 8, 1, 1, kernel_h, kernel_w, 8};
    } else if (group > 1 && oc / group % 8 == 0 && oc / group > 0 &&
               ic / group % 8 == 0 && ic / group > 0) {
        param.sparse = param::ConvBias::Sparse::GROUP;
        weight_tensor_shape = TensorShape{
                group, oc / group / 8, ic / group / 8, kernel_h, kernel_w, 8,
                8};
    }
    checker.set_dtype(0, dtype::Float32())
            .set_dtype(1, dtype::Float32())
            .set_dtype(2, dtype::Float32())
            .set_dtype(4, dtype::Float32())
            .set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &rng)
            .set_epsilon(1e-3)
            .set_param(param)
            .execs({src_tensor_shape,
                    weight_tensor_shape,
                    bias_tensor_shape,
                    {},
                    {}});
}
//! Sweep bias mode, nonlinearity, batch, stride, kernel, oc/ic, spatial size
//! and group count through the MKLDNN_CONV_FP32 algorithm (invalid
//! combinations are filtered inside x86_correctness_fp32_mkldnn_run).
static void x86_correctness_fp32_mkldnn(Handle* handle) {
    Checker<ConvBias> checker(handle);
    UniformIntRNG rng{-127, 127};
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "MKLDNN_CONV_FP32"));
    for (auto bias_mode :
         {megdnn::BiasMode::NO_BIAS, megdnn::BiasMode::BROADCAST_CHANNEL_BIAS,
          megdnn::BiasMode::BIAS})
        for (auto noline_mode : {param::ConvBias::NonlineMode::IDENTITY,
                                 param::ConvBias::NonlineMode::SIGMOID,
                                 param::ConvBias::NonlineMode::H_SWISH})
            for (size_t n : {1, 2})
                for (size_t stride : {1, 2})
                    for (size_t kernel : {3, 5, 7})
                        for (size_t oc : {8, 16})
                            for (size_t ic : {3, 8, 16})
                                for (size_t h : {22, 33})
                                    for (size_t w : {22, 33}) {
                                        for (size_t group = 1;
                                             group <= std::min(oc, ic);
                                             ++group) {
                                            x86_correctness_fp32_mkldnn_run(
                                                    checker, rng, handle,
                                                    bias_mode, noline_mode, n,
                                                    stride, kernel, oc, ic, h,
                                                    w, group);
                                        }
                                    }
}
//! single-thread mkl-dnn NCHW88 fp32 correctness sweep
TEST_F(X86, CONV_BIAS_DIRECT_MKLDNN_C8) {
    x86_correctness_fp32_mkldnn(handle());
}
//! multi-thread mkl-dnn NCHW88 fp32 correctness sweep
TEST_F(X86_MULTI_THREADS, CONV_BIAS_DIRECT_MKLDNN_C8) {
    x86_correctness_fp32_mkldnn(handle());
}
//! int8 conv_bias through the mkl-dnn matmul algorithm (no-bias cases only).
TEST_F(X86, CONV_BIAS_MKL_DNN_MATMUL_INT8) {
    using namespace conv_bias;
    std::vector<TestArg> args;
    //! append one no-bias case; skip invalid (kernel larger than padded
    //! input) combinations
    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
                   size_t p, NonlineMode nonline_mode) {
        if (w + 2 * p < kernel || h + 2 * p < kernel)
            return;
        param::ConvBias param;
        param.stride_h = 1;
        param.stride_w = 1;
        param.pad_h = p;
        param.pad_w = p;
        param.nonlineMode = nonline_mode;
        //! no bias
        args.emplace_back(param, TensorShape{1, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel}, TensorShape{});
    };
    for (size_t kernel : {2, 3, 5, 7})
        for (size_t ic : {1, 2, 3, 4})
            for (size_t oc : {1, 2, 4})
                for (size_t p : {0, 2})
                    for (size_t size : {20, 21, 22, 23, 24})
                        for (NonlineMode nonline_mode :
                             {NonlineMode::IDENTITY}) {
                            run(oc, ic, size, size, kernel, p, nonline_mode);
                        }
    Checker<ConvBias> checker(handle());
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "MKLDNN_MATMUL_INT8"));
    //! int accumulation: allow off-by-one
    checker.set_epsilon(1);
    UniformIntRNG rng{-50, 50};
    checker.set_dtype(0, dtype::Int8())
            .set_dtype(1, dtype::Int8())
            .set_dtype(2, dtype::Int32())
            .set_dtype(4, dtype::Int32())
            .set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &rng);
    for (auto&& arg : args) {
        checker.set_param(arg.param).exec(
                {arg.src, arg.filter, arg.bias, {}, {}});
    }
}
//! Correctness of the "MKLDNN_INT8" direct algo: int8 in, int32 out, no
//! bias, stride 1, sweeping kernel/channels/padding/input sizes.
TEST_F(X86, CONV_BIAS_MKL_DNN_INT8) {
    using namespace conv_bias;
    std::vector<TestArg> args;
    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
                   size_t p, NonlineMode nonline_mode) {
        //! skip shapes where the padded input is smaller than the kernel
        if (w + 2 * p < kernel || h + 2 * p < kernel)
            return;
        param::ConvBias param;
        param.stride_h = 1;
        param.stride_w = 1;
        param.pad_h = p;
        param.pad_w = p;
        param.nonlineMode = nonline_mode;
        //! no bias
        args.emplace_back(param, TensorShape{1, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel}, TensorShape{});
    };
    for (size_t kernel : {2, 3, 5, 7})
        for (size_t ic : {1, 2, 3, 4})
            for (size_t oc : {1, 2, 4})
                for (size_t p : {0, 2})
                    for (size_t size : {20, 22, 24})
                        for (NonlineMode nonline_mode :
                             {NonlineMode::IDENTITY}) {
                            run(oc, ic, size, size, kernel, p, nonline_mode);
                        }
    Checker<ConvBias> checker(handle());
    //! require that exactly the MKLDNN_INT8 algorithm is dispatched
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("MKLDNN_INT8"));
    checker.set_epsilon(1);
    UniformIntRNG rng{-50, 50};
    checker.set_dtype(0, dtype::Int8())
            .set_dtype(1, dtype::Int8())
            .set_dtype(2, dtype::Int32())
            .set_dtype(4, dtype::Int32())
            .set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &rng);
    for (auto&& arg : args) {
        checker.set_param(arg.param).exec(
                {arg.src, arg.filter, arg.bias, {}, {}});
    }
}
  1443. TEST_F(X86_MULTI_THREADS, CONV_BIAS_MKL_DNN_INT8) {
  1444. using namespace conv_bias;
  1445. std::vector<TestArg> args;
  1446. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  1447. size_t p, NonlineMode nonline_mode) {
  1448. if (w + 2 * p < kernel || h + 2 * p < kernel)
  1449. return;
  1450. param::ConvBias param;
  1451. param.stride_h = 1;
  1452. param.stride_w = 1;
  1453. param.pad_h = p;
  1454. param.pad_w = p;
  1455. param.nonlineMode = nonline_mode;
  1456. //! no bias
  1457. args.emplace_back(param, TensorShape{1, ic, h, w},
  1458. TensorShape{oc, ic, kernel, kernel}, TensorShape{});
  1459. };
  1460. for (size_t kernel : {2, 3, 5, 7})
  1461. for (size_t ic : {1, 2, 3, 4})
  1462. for (size_t oc : {1, 2, 4})
  1463. for (size_t p : {0, 2})
  1464. for (size_t size : {20, 22, 24})
  1465. for (NonlineMode nonline_mode :
  1466. {NonlineMode::IDENTITY}) {
  1467. run(oc, ic, size, size, kernel, p, nonline_mode);
  1468. }
  1469. Checker<ConvBias> checker(handle());
  1470. checker.set_before_exec_callback(
  1471. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("MKLDNN_INT8"));
  1472. checker.set_epsilon(1);
  1473. UniformIntRNG rng{-50, 50};
  1474. checker.set_dtype(0, dtype::Int8())
  1475. .set_dtype(1, dtype::Int8())
  1476. .set_dtype(2, dtype::Int32())
  1477. .set_dtype(4, dtype::Int32())
  1478. .set_rng(0, &rng)
  1479. .set_rng(1, &rng)
  1480. .set_rng(2, &rng);
  1481. for (auto&& arg : args) {
  1482. checker.set_param(arg.param).exec(
  1483. {arg.src, arg.filter, arg.bias, {}, {}});
  1484. }
  1485. }
  1486. #endif
  1487. #if MEGDNN_WITH_BENCHMARK
  1488. namespace {
  1489. void benchmark_impl(const param::ConvBias param,
  1490. std::vector<std::pair<SmallVector<TensorShape>, float>>&
  1491. shapes_and_computation,
  1492. const std::string algo_name, size_t RUNS,
  1493. TaskExecutorConfig&& multi_thread_config,
  1494. TaskExecutorConfig&& single_thread_config,
  1495. std::vector<DType> dtype_v) {
  1496. std::vector<DType> data_type = {dtype::Float32(), dtype::Float32(),
  1497. dtype::Float32(), dtype::Float32()};
  1498. std::vector<float> multi_thread_times, single_thread_times;
  1499. {
  1500. auto multi_thread_hanle =
  1501. create_cpu_handle(0, true, &multi_thread_config);
  1502. auto benchmarker = Benchmarker<ConvBias>(multi_thread_hanle.get());
  1503. benchmarker.set_times(RUNS)
  1504. .set_display(false)
  1505. .set_dtype(0, dtype_v[0])
  1506. .set_dtype(1, dtype_v[1])
  1507. .set_dtype(2, dtype_v[2])
  1508. .set_dtype(4, dtype_v[3])
  1509. .set_param(param)
  1510. .set_before_exec_callback(
  1511. conv_bias::ConvBiasAlgoChecker<ConvBias>(
  1512. algo_name.c_str()));
  1513. for (auto shape : shapes_and_computation) {
  1514. multi_thread_times.push_back(benchmarker.exec(shape.first) / RUNS);
  1515. }
  1516. }
  1517. {
  1518. auto single_thread_handle =
  1519. create_cpu_handle(0, true, &single_thread_config);
  1520. auto benchmarker = Benchmarker<ConvBias>(single_thread_handle.get());
  1521. benchmarker.set_times(RUNS)
  1522. .set_display(false)
  1523. .set_dtype(0, dtype_v[0])
  1524. .set_dtype(1, dtype_v[1])
  1525. .set_dtype(2, dtype_v[2])
  1526. .set_dtype(4, dtype_v[3])
  1527. .set_param(param)
  1528. .set_before_exec_callback(
  1529. conv_bias::ConvBiasAlgoChecker<ConvBias>(
  1530. algo_name.c_str()));
  1531. for (auto shape : shapes_and_computation) {
  1532. single_thread_times.push_back(benchmarker.exec(shape.first) / RUNS);
  1533. }
  1534. }
  1535. printf("Benchmark : Multi threads %zu, ", multi_thread_config.nr_thread);
  1536. printf("core_ids:");
  1537. for (size_t i = 0; i < multi_thread_config.affinity_core_set.size(); i++) {
  1538. printf("%zu ", multi_thread_config.affinity_core_set[i]);
  1539. }
  1540. printf(", Single thread core_id %zu\n",
  1541. single_thread_config.affinity_core_set[0]);
  1542. for (size_t i = 0; i < shapes_and_computation.size(); i++) {
  1543. auto shapes = shapes_and_computation[i];
  1544. printf("Bench case: ");
  1545. for (auto&& shape : shapes.first) {
  1546. printf("%s ", shape.to_string().c_str());
  1547. }
  1548. float computations = shapes.second;
  1549. printf("%zu threads gflops: %f,\n single thread gflops: "
  1550. "%f. spead up = %f, speedup/cores=%f\n",
  1551. multi_thread_config.nr_thread,
  1552. computations / multi_thread_times[i],
  1553. computations / single_thread_times[i],
  1554. single_thread_times[i] / multi_thread_times[i],
  1555. single_thread_times[i] / multi_thread_times[i] /
  1556. multi_thread_config.nr_thread);
  1557. }
  1558. }
  1559. void benchmark_impl_comp(const param::ConvBias param,
  1560. std::vector<std::pair<SmallVector<TensorShape>, float>>&
  1561. shapes_and_computation,
  1562. const std::string algo_name, const std::string algo_name1,size_t RUNS,
  1563. TaskExecutorConfig&& multi_thread_config,
  1564. TaskExecutorConfig&& single_thread_config,std::vector<DType> dtype_v) {
  1565. std::vector<DType> data_type = {dtype::Float32(), dtype::Float32(),
  1566. dtype::Float32(), dtype::Float32()};
  1567. std::vector<float> multi_thread_times, single_thread_times;
  1568. {
  1569. auto multi_thread_hanle =
  1570. create_cpu_handle(0, true, &multi_thread_config);
  1571. auto benchmarker = Benchmarker<ConvBias>(multi_thread_hanle.get());
  1572. benchmarker.set_times(RUNS)
  1573. .set_display(false)
  1574. .set_dtype(0,dtype_v[0])
  1575. .set_dtype(1,dtype_v[1])
  1576. .set_dtype(2,dtype_v[2])
  1577. .set_dtype(4,dtype_v[3])
  1578. .set_param(param)
  1579. .set_before_exec_callback(
  1580. conv_bias::ConvBiasAlgoChecker<ConvBias>(
  1581. algo_name.c_str()));
  1582. for (auto shape : shapes_and_computation) {
  1583. multi_thread_times.push_back(benchmarker.exec(shape.first) / RUNS);
  1584. }
  1585. }
  1586. {
  1587. auto single_thread_handle =
  1588. create_cpu_handle(0, true, &single_thread_config);
  1589. auto benchmarker = Benchmarker<ConvBias>(single_thread_handle.get());
  1590. benchmarker.set_times(RUNS)
  1591. .set_display(false)
  1592. .set_dtype(0,dtype_v[0])
  1593. .set_dtype(1,dtype_v[1])
  1594. .set_dtype(2,dtype_v[2])
  1595. .set_dtype(4,dtype_v[3])
  1596. .set_param(param)
  1597. .set_before_exec_callback(
  1598. conv_bias::ConvBiasAlgoChecker<ConvBias>(
  1599. algo_name1.c_str()));
  1600. for (auto shape : shapes_and_computation) {
  1601. single_thread_times.push_back(benchmarker.exec(shape.first) / RUNS);
  1602. }
  1603. }
  1604. printf("Benchmark : Multi threads %zu, ", multi_thread_config.nr_thread);
  1605. printf("core_ids:");
  1606. for (size_t i = 0; i < multi_thread_config.affinity_core_set.size(); i++) {
  1607. printf("%zu ", multi_thread_config.affinity_core_set[i]);
  1608. }
  1609. for (size_t i = 0; i < shapes_and_computation.size(); i++) {
  1610. auto shapes = shapes_and_computation[i];
  1611. printf("Bench case: ");
  1612. for (auto&& shape : shapes.first) {
  1613. printf("%s ", shape.to_string().c_str());
  1614. }
  1615. float computations = shapes.second;
  1616. printf("algo:%s gflops: %f,\n algo:%s gflops: "
  1617. "%f. spead up = %f\n",
  1618. algo_name.c_str(), computations / multi_thread_times[i],
  1619. algo_name1.c_str(), computations / single_thread_times[i],
  1620. single_thread_times[i] / multi_thread_times[i]);
  1621. }
  1622. }
  1623. } // namespace
//! Benchmark the AVX2 int8 channel-wise (depthwise, GROUP sparse) conv over
//! MobileNet-like shapes for filter sizes 7/5/3/2, comparing 4-thread and
//! 2-thread runs against one thread.
TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_CHANWISE_AVX2_INT8) {
    constexpr size_t RUNS = 50;
    param::ConvBias param;
    param.stride_h = 1;
    param.stride_w = 1;
    param.sparse = param::ConvBias::Sparse::GROUP;
    std::vector<DType> data_type = {dtype::Int8(), dtype::Int8(),
                                    dtype::Int32(), dtype::Int32()};
    std::vector<std::pair<SmallVector<TensorShape>, float>>
            shapes_and_computation;
    auto bench_case = [&](size_t N, size_t IC, size_t H, size_t W, size_t FS) {
        //! SAME-style padding for this filter size
        param.pad_h = FS / 2;
        param.pad_w = FS / 2;
        //! channel-wise: group == IC, one in/out channel per group
        SmallVector<TensorShape> shapes{
                {N, IC, H, W}, {IC, 1, 1, FS, FS}, {}, {}, {}};
        TensorShape dst{N, IC, (H + 2 * param.pad_h - FS) + 1,
                        (W + 2 * param.pad_w - FS) + 1};
        //! 2 ops (mul+add) per filter tap per output element, in Mflo
        float computations = (FS * FS * dst.total_nr_elems() * 2) * 1e-6;
        shapes_and_computation.push_back(std::make_pair(shapes, computations));
    };
    bench_case(1, 32, 112, 112, 7);
    bench_case(1, 144, 56, 56, 7);
    bench_case(1, 192, 28, 28, 7);
    bench_case(1, 384, 28, 28, 7);
    bench_case(1, 576, 14, 14, 7);
    bench_case(1, 960, 7, 7, 7);
    bench_case(1, 32, 112, 112, 5);
    bench_case(1, 144, 56, 56, 5);
    bench_case(1, 192, 28, 28, 5);
    bench_case(1, 384, 28, 28, 5);
    bench_case(1, 576, 14, 14, 5);
    bench_case(1, 960, 7, 7, 5);
    bench_case(1, 32, 112, 112, 3);
    bench_case(1, 144, 56, 56, 3);
    bench_case(1, 192, 28, 28, 3);
    bench_case(1, 384, 28, 28, 3);
    bench_case(1, 576, 14, 14, 3);
    bench_case(1, 960, 7, 7, 3);
    bench_case(1, 32, 112, 112, 2);
    bench_case(1, 144, 56, 56, 2);
    bench_case(1, 192, 28, 28, 2);
    bench_case(1, 384, 28, 28, 2);
    bench_case(1, 576, 14, 14, 2);
    bench_case(1, 960, 7, 7, 2);
    std::string algo_name = "X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1";
    printf("Benchmark X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1\n");
    benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
                   {4, {4, 5, 6, 7}}, {1, {4}}, data_type);
    benchmark_impl(param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}},
                   {1, {4}}, data_type);
    shapes_and_computation.clear();
}
//! Benchmark the AVX2 int8 dense direct conv (stride 1) over a grid of
//! spatial sizes and filter sizes 7/5/3/2, with several thread/core configs.
TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_DIRECT_AVX2_INT8) {
    constexpr size_t RUNS = 50;
    param::ConvBias param;
    param.stride_h = 1;
    param.stride_w = 1;
    param.sparse = param::ConvBias::Sparse::DENSE;
    std::vector<DType> data_type = {dtype::Int8(), dtype::Int8(),
                                    dtype::Int32(), dtype::Int32()};
    std::vector<std::pair<SmallVector<TensorShape>, float>>
            shapes_and_computation;
    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W,
                          size_t FS) {
        //! SAME-style padding for this filter size
        param.pad_h = FS / 2;
        param.pad_w = FS / 2;
        SmallVector<TensorShape> shapes{
                {N, IC, H, W}, {OC, IC, FS, FS}, {}, {}, {}};
        TensorShape dst{N, OC, (H + 2 * param.pad_h - FS) + 1,
                        (W + 2 * param.pad_w - FS) + 1};
        //! 2 ops per MAC, summed over IC and the filter window, in Mflo
        float computations = (IC * FS * FS * dst.total_nr_elems() * 2) * 1e-6;
        shapes_and_computation.push_back(std::make_pair(shapes, computations));
    };
    bench_case(1, 32, 32, 200, 200, 7);
    bench_case(1, 32, 64, 200, 200, 7);
    bench_case(1, 32, 32, 128, 128, 7);
    bench_case(1, 32, 64, 128, 128, 7);
    bench_case(1, 32, 32, 100, 100, 7);
    bench_case(1, 32, 64, 100, 100, 7);
    bench_case(1, 32, 32, 80, 80, 7);
    bench_case(1, 32, 64, 80, 80, 7);
    bench_case(1, 32, 32, 200, 200, 5);
    bench_case(1, 32, 64, 200, 200, 5);
    bench_case(1, 32, 32, 128, 128, 5);
    bench_case(1, 32, 64, 128, 128, 5);
    bench_case(1, 32, 32, 100, 100, 5);
    bench_case(1, 32, 64, 100, 100, 5);
    bench_case(1, 32, 32, 80, 80, 5);
    bench_case(1, 32, 64, 80, 80, 5);
    bench_case(1, 32, 32, 200, 200, 3);
    bench_case(1, 32, 64, 200, 200, 3);
    bench_case(1, 32, 32, 128, 128, 3);
    bench_case(1, 32, 64, 128, 128, 3);
    bench_case(1, 32, 32, 100, 100, 3);
    bench_case(1, 32, 64, 100, 100, 3);
    bench_case(1, 32, 32, 80, 80, 3);
    bench_case(1, 32, 64, 80, 80, 3);
    bench_case(1, 32, 32, 200, 200, 2);
    bench_case(1, 32, 64, 200, 200, 2);
    bench_case(1, 32, 32, 128, 128, 2);
    bench_case(1, 32, 64, 128, 128, 2);
    bench_case(1, 32, 32, 100, 100, 2);
    bench_case(1, 32, 64, 100, 100, 2);
    bench_case(1, 32, 32, 80, 80, 2);
    bench_case(1, 32, 64, 80, 80, 2);
    std::string algo_name = "X86_CONV_BIAS_DIRECT_AVX2_INT8_STRIDE1";
    printf("Benchmark X86_CONV_BIAS_DIRECT_AVX2_INT8_STRIDE1 algo\n");
    benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
                   {4, {4, 5, 6, 7}}, {1, {4}}, data_type);
    benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
                   {4, {4, 5, 6, 7}}, {1, {7}}, data_type);
    benchmark_impl(param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}},
                   {1, {4}}, data_type);
    shapes_and_computation.clear();
}
  1739. TEST_F(X86_BENCHMARK_MULTI_THREADS,
  1740. BENCHMARK_CONVBIAS_DIRECT_AVX2_INT8_STRIDE2) {
  1741. constexpr size_t RUNS = 50;
  1742. param::ConvBias param;
  1743. param.stride_h = 2;
  1744. param.stride_w = 2;
  1745. param.sparse = param::ConvBias::Sparse::DENSE;
  1746. std::vector<DType> data_type = {dtype::Int8(), dtype::Int8(),
  1747. dtype::Int32(), dtype::Int32()};
  1748. std::vector<std::pair<SmallVector<TensorShape>, float>>
  1749. shapes_and_computation;
  1750. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W,
  1751. size_t FS) {
  1752. param.pad_h = FS / 2;
  1753. param.pad_w = FS / 2;
  1754. SmallVector<TensorShape> shapes{
  1755. {N, IC, H, W}, {OC, IC, FS, FS}, {}, {}, {}};
  1756. TensorShape dst{N, OC, (H + 2 * param.pad_h - FS) / param.stride_h + 1,
  1757. (W + 2 * param.pad_w - FS) / param.pad_w + 1};
  1758. float computations = (IC * FS * FS * dst.total_nr_elems() * 2) * 1e-6;
  1759. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  1760. };
  1761. bench_case(1, 32, 32, 200, 200, 7);
  1762. bench_case(1, 32, 64, 200, 200, 7);
  1763. bench_case(1, 32, 32, 128, 128, 7);
  1764. bench_case(1, 32, 64, 128, 128, 7);
  1765. bench_case(1, 32, 32, 100, 100, 7);
  1766. bench_case(1, 32, 64, 100, 100, 7);
  1767. bench_case(1, 32, 32, 80, 80, 7);
  1768. bench_case(1, 32, 64, 80, 80, 7);
  1769. bench_case(1, 32, 32, 200, 200, 5);
  1770. bench_case(1, 32, 64, 200, 200, 5);
  1771. bench_case(1, 32, 32, 128, 128, 5);
  1772. bench_case(1, 32, 64, 128, 128, 5);
  1773. bench_case(1, 32, 32, 100, 100, 5);
  1774. bench_case(1, 32, 64, 100, 100, 5);
  1775. bench_case(1, 32, 32, 80, 80, 5);
  1776. bench_case(1, 32, 64, 80, 80, 5);
  1777. bench_case(1, 32, 32, 200, 200, 3);
  1778. bench_case(1, 32, 64, 200, 200, 3);
  1779. bench_case(1, 32, 32, 128, 128, 3);
  1780. bench_case(1, 32, 64, 128, 128, 3);
  1781. bench_case(1, 32, 32, 100, 100, 3);
  1782. bench_case(1, 32, 64, 100, 100, 3);
  1783. bench_case(1, 32, 32, 80, 80, 3);
  1784. bench_case(1, 32, 64, 80, 80, 3);
  1785. bench_case(1, 32, 32, 200, 200, 2);
  1786. bench_case(1, 32, 64, 200, 200, 2);
  1787. bench_case(1, 32, 32, 128, 128, 2);
  1788. bench_case(1, 32, 64, 128, 128, 2);
  1789. bench_case(1, 32, 32, 100, 100, 2);
  1790. bench_case(1, 32, 64, 100, 100, 2);
  1791. bench_case(1, 32, 32, 80, 80, 2);
  1792. bench_case(1, 32, 64, 80, 80, 2);
  1793. std::string algo_name = "X86_CONV_BIAS_DIRECT_AVX2_INT8_STRIDE2";
  1794. printf("Benchmark X86_CONV_BIAS_DIRECT_AVX2_INT8_STRIDE2 algo\n");
  1795. benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
  1796. {4, {4, 5, 6, 7}}, {1, {4}}, data_type);
  1797. benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
  1798. {4, {4, 5, 6, 7}}, {1, {7}}, data_type);
  1799. benchmark_impl(param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}},
  1800. {1, {4}}, data_type);
  1801. shapes_and_computation.clear();
  1802. }
//! Benchmark the f32 direct stride-1 conv for large-group and small-group
//! algorithms (GROUP sparse, RELU, pad 1), over several spatial sizes.
TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_DIRECTF32) {
    constexpr size_t RUNS = 50;
    param::ConvBias param;
    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
    param.pad_h = 1;
    param.pad_w = 1;
    param.stride_h = 1;
    param.stride_w = 1;
    param.sparse = param::ConvBias::Sparse::GROUP;
    std::vector<DType> data_type = {dtype::Float32(), dtype::Float32(),
                                    dtype::Float32(), dtype::Float32()};
    std::vector<std::pair<SmallVector<TensorShape>, float>>
            shapes_and_computation;
    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W,
                          size_t FS, size_t group) {
        //! grouped filter layout {g, oc/g, ic/g, fh, fw}; dst assumes the
        //! padding keeps spatial size unchanged (pad 1, FS 3)
        SmallVector<TensorShape> shapes{{N, IC, H, W},
                                        {group, OC / group, IC / group, FS, FS},
                                        {1, OC, 1, 1},
                                        {},
                                        {N, OC, H, W}};
        TensorShape dst{N, OC, H, W};
        //! conv MACs (x2) plus one add per output element for bias, in Mflo
        float computations =
                ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
                 dst.total_nr_elems()) *
                1e-6;
        shapes_and_computation.push_back(std::make_pair(shapes, computations));
    };
    bench_case(1, 32, 32, 200, 200, 3, 4);
    bench_case(1, 32, 32, 200, 200, 3, 32);
    bench_case(1, 32, 32, 128, 128, 3, 4);
    bench_case(1, 32, 32, 128, 128, 3, 32);
    bench_case(1, 32, 32, 100, 100, 3, 4);
    bench_case(1, 32, 32, 100, 100, 3, 32);
    bench_case(1, 32, 32, 80, 80, 3, 4);
    bench_case(1, 32, 32, 80, 80, 3, 32);
    std::string algo_name = "X86_CONV_BIAS_DIRECT_STRIDE1_LARGE_GROUP";
    printf("Benchmark X86_CONV_BIAS_DIRECT_STRIDE1_LARGE_GROUP algo\n");
    benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
                   {4, {4, 5, 6, 7}}, {1, {4}}, data_type);
    benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
                   {4, {4, 5, 6, 7}}, {1, {7}}, data_type);
    benchmark_impl(param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}},
                   {1, {4}}, data_type);
    shapes_and_computation.clear();
    //! second phase: group == 1 cases for the small-group algorithm
    algo_name = "X86_CONV_BIAS_DIRECT_STRIDE1_SMALL_GROUP";
    printf("Benchmark X86_CONV_BIAS_DIRECT_STRIDE1_SMALL_GROUP algo\n");
    bench_case(1, 32, 32, 200, 200, 3, 1);
    bench_case(1, 32, 32, 128, 128, 3, 1);
    bench_case(1, 32, 32, 100, 100, 3, 1);
    bench_case(1, 32, 32, 80, 80, 3, 1);
    benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
                   {4, {4, 5, 6, 7}}, {1, {4}}, data_type);
    benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
                   {4, {4, 5, 6, 7}}, {1, {7}}, data_type);
    benchmark_impl(param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}},
                   {1, {4}}, data_type);
}
//! Benchmark im2col + BLAS f32 matmul conv (RELU, pad 1, stride 1) over
//! large-feature-map and small-feature-map (7x7 / 14x14) shapes.
TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_IM2COL_F32) {
    constexpr size_t RUNS = 50;
    param::ConvBias param;
    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
    param.pad_h = 1;
    param.pad_w = 1;
    param.stride_h = 1;
    param.stride_w = 1;
    std::vector<DType> data_type = {dtype::Float32(), dtype::Float32(),
                                    dtype::Float32(), dtype::Float32()};
    std::vector<std::pair<SmallVector<TensorShape>, float>>
            shapes_and_computation;
    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W,
                          size_t FS, size_t group) {
        //! dense filter layout; dst spatial size preserved by pad 1 / FS 3
        SmallVector<TensorShape> shapes{{N, IC, H, W},
                                        {OC / group, IC / group, FS, FS},
                                        {1, OC, 1, 1},
                                        {},
                                        {N, OC, H, W}};
        TensorShape dst{N, OC, H, W};
        //! conv MACs (x2) plus one add per output element for bias, in Mflo
        float computations =
                ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
                 dst.total_nr_elems()) *
                1e-6;
        shapes_and_computation.push_back(std::make_pair(shapes, computations));
    };
    bench_case(1, 32, 32, 200, 200, 3, 1);
    bench_case(1, 32, 32, 200, 200, 3, 1);
    bench_case(1, 32, 32, 128, 128, 3, 1);
    bench_case(1, 32, 32, 128, 128, 3, 1);
    bench_case(1, 32, 32, 100, 100, 3, 1);
    bench_case(1, 32, 32, 100, 100, 3, 1);
    bench_case(1, 32, 32, 80, 80, 3, 1);
    bench_case(1, 32, 32, 80, 80, 3, 1);
    bench_case(1, 64, 32, 7, 7, 3, 1);
    bench_case(1, 64, 64, 7, 7, 3, 1);
    bench_case(1, 64, 128, 7, 7, 3, 1);
    bench_case(1, 64, 256, 7, 7, 3, 1);
    bench_case(1, 64, 512, 7, 7, 3, 1);
    bench_case(1, 64, 1024, 7, 7, 3, 1);
    bench_case(1, 64, 32, 14, 14, 3, 1);
    bench_case(1, 64, 64, 14, 14, 3, 1);
    bench_case(1, 64, 128, 14, 14, 3, 1);
    bench_case(1, 64, 256, 14, 14, 3, 1);
    bench_case(1, 64, 512, 14, 14, 3, 1);
    bench_case(1, 64, 1024, 14, 14, 3, 1);
    bench_case(1, 128, 128, 14, 14, 3, 1);
    bench_case(1, 128, 256, 14, 14, 3, 1);
    bench_case(1, 512, 512, 14, 14, 3, 1);
    bench_case(1, 256, 512, 14, 14, 3, 1);
    bench_case(1, 512, 1024, 14, 14, 3, 1);
    bench_case(1, 1024, 1024, 14, 14, 3, 1);
    //! the ":192" suffix is presumably an im2col tiling parameter — confirm
    //! against the algo-name parser before changing it
    std::string algo_name = "IM2COLMATMUL:X86_F32_BLAS:192";
    printf("Benchmark IM2COLMATMUL:X86_F32_BLAS algo\n");
    benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
                   {4, {4, 5, 6, 7}}, {1, {4}}, data_type);
    benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
                   {4, {4, 5, 6, 7}}, {1, {7}}, data_type);
    benchmark_impl(param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}},
                   {1, {4}}, data_type);
    shapes_and_computation.clear();
}
//! Compare two im2col f32 matmul backends (MKL_PACKA vs BLAS) head-to-head
//! on the same single-threaded core via benchmark_impl_comp.
TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_IM2COL_F32_single_thread) {
    constexpr size_t RUNS = 50;
    param::ConvBias param;
    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
    param.pad_h = 1;
    param.pad_w = 1;
    param.stride_h = 1;
    param.stride_w = 1;
    std::vector<DType> data_type = {dtype::Float32(), dtype::Float32(),
                                    dtype::Float32(), dtype::Float32()};
    std::vector<std::pair<SmallVector<TensorShape>, float>>
            shapes_and_computation;
    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H,
                          size_t W, size_t FS,
                          size_t group) {
        //! dense filter layout; dst spatial size preserved by pad 1 / FS 3
        SmallVector<TensorShape> shapes{{N, IC, H, W},
                                        {OC / group, IC / group, FS, FS},
                                        {1, OC, 1, 1},
                                        {},
                                        {N, OC, H, W}};
        TensorShape dst{N, OC, H, W};
        //! conv MACs (x2) plus one add per output element for bias, in Mflo
        float computations =
                ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
                 dst.total_nr_elems()) *
                1e-6;
        shapes_and_computation.push_back(std::make_pair(shapes, computations));
    };
    bench_case(1, 32, 32, 200, 200, 3, 1);
    bench_case(1, 32, 32, 200, 200, 3, 1);
    bench_case(1, 32, 32, 128, 128, 3, 1);
    bench_case(1, 32, 32, 128, 128, 3, 1);
    bench_case(1, 32, 32, 100, 100, 3, 1);
    bench_case(1, 32, 32, 100, 100, 3, 1);
    bench_case(1, 32, 32, 80, 80, 3, 1);
    bench_case(1, 32, 32, 80, 80, 3, 1);
    bench_case(1, 64, 32, 7, 7, 3, 1);
    bench_case(1, 64, 64, 7, 7, 3, 1);
    bench_case(1, 64, 128, 7, 7, 3, 1);
    bench_case(1, 64, 256, 7, 7, 3, 1);
    bench_case(1, 64, 512, 7, 7, 3, 1);
    bench_case(1, 64, 1024, 7, 7, 3, 1);
    bench_case(1, 64, 32, 14, 14, 3, 1);
    bench_case(1, 64, 64, 14, 14, 3, 1);
    bench_case(1, 64, 128, 14, 14, 3, 1);
    bench_case(1, 64, 256, 14, 14, 3, 1);
    bench_case(1, 64, 512, 14, 14, 3, 1);
    bench_case(1, 64, 1024, 14, 14, 3, 1);
    bench_case(1, 128, 128, 14, 14, 3, 1);
    bench_case(1, 128, 256, 14, 14, 3, 1);
    bench_case(1, 512, 512, 14, 14, 3, 1);
    bench_case(1, 256, 512, 14, 14, 3, 1);
    bench_case(1, 512, 1024, 14, 14, 3, 1);
    bench_case(1, 1024, 1024, 14, 14, 3, 1);
    std::string algo_name = "IM2COLMATMUL:X86_F32_MKL_PACKA:192";
    std::string algo_name1 = "IM2COLMATMUL:X86_F32_BLAS:192";
    //! NOTE(review): banner names only BLAS although MKL_PACKA is the first
    //! algorithm compared — looks like a stale copy; confirm intent
    printf("Benchmark IM2COLMATMUL:X86_F32_BLAS algo\n");
    benchmark_impl_comp(param, shapes_and_computation, algo_name, algo_name1, RUNS,
                        {1, {4}}, {1, {4}}, data_type);
    benchmark_impl_comp(param, shapes_and_computation, algo_name, algo_name1, RUNS,
                        {1, {7}}, {1, {7}}, data_type);
    shapes_and_computation.clear();
}
//! Benchmark im2col + AVX2 int8x8x32 matmul conv (pad 1, stride 1) over the
//! same shape grid as the f32 im2col benchmark.
TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_IM2COL_INT8X8X32) {
    constexpr size_t RUNS = 50;
    param::ConvBias param;
    param.pad_h = 1;
    param.pad_w = 1;
    param.stride_h = 1;
    param.stride_w = 1;
    std::vector<std::pair<SmallVector<TensorShape>, float>>
            shapes_and_computation;
    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W,
                          size_t FS, size_t group) {
        //! dense filter layout; dst spatial size preserved by pad 1 / FS 3
        SmallVector<TensorShape> shapes{{N, IC, H, W},
                                        {OC / group, IC / group, FS, FS},
                                        {1, OC, 1, 1},
                                        {},
                                        {N, OC, H, W}};
        TensorShape dst{N, OC, H, W};
        //! conv MACs (x2) plus one add per output element for bias, in Mflo
        float computations =
                ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
                 dst.total_nr_elems()) *
                1e-6;
        shapes_and_computation.push_back(std::make_pair(shapes, computations));
    };
    bench_case(1, 32, 32, 200, 200, 3, 1);
    bench_case(1, 32, 32, 200, 200, 3, 1);
    bench_case(1, 32, 32, 128, 128, 3, 1);
    bench_case(1, 32, 32, 128, 128, 3, 1);
    bench_case(1, 32, 32, 100, 100, 3, 1);
    bench_case(1, 32, 32, 100, 100, 3, 1);
    bench_case(1, 32, 32, 80, 80, 3, 1);
    bench_case(1, 32, 32, 80, 80, 3, 1);
    bench_case(1, 64, 32, 7, 7, 3, 1);
    bench_case(1, 64, 64, 7, 7, 3, 1);
    bench_case(1, 64, 128, 7, 7, 3, 1);
    bench_case(1, 64, 256, 7, 7, 3, 1);
    bench_case(1, 64, 512, 7, 7, 3, 1);
    bench_case(1, 64, 1024, 7, 7, 3, 1);
    bench_case(1, 64, 32, 14, 14, 3, 1);
    bench_case(1, 64, 64, 14, 14, 3, 1);
    bench_case(1, 64, 128, 14, 14, 3, 1);
    bench_case(1, 64, 256, 14, 14, 3, 1);
    bench_case(1, 64, 512, 14, 14, 3, 1);
    bench_case(1, 64, 1024, 14, 14, 3, 1);
    bench_case(1, 128, 128, 14, 14, 3, 1);
    bench_case(1, 128, 256, 14, 14, 3, 1);
    bench_case(1, 512, 512, 14, 14, 3, 1);
    bench_case(1, 256, 512, 14, 14, 3, 1);
    bench_case(1, 512, 1024, 14, 14, 3, 1);
    bench_case(1, 1024, 1024, 14, 14, 3, 1);
    std::vector<DType> data_type = {dtype::Int8(), dtype::Int8(),
                                    dtype::Int32(), dtype::Int32()};
    std::string algo_name = "IM2COLMATMUL:X86_INT8X8X32_AVX2_4X16X2:192";
    // std::string algo_name = "IM2COLMATMUL:X86_INT8X8X32_AVX2_2X4X16";
    // printf("Benchmark IM2COLMATMUL:X86_INT8X8X32_AVX2_4X16X2 algo\n");
    benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
                   {4, {4, 5, 6, 7}}, {1, {4}}, data_type);
    benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
                   {4, {4, 5, 6, 7}}, {1, {7}}, data_type);
    benchmark_impl(param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}},
                   {1, {4}}, data_type);
    shapes_and_computation.clear();
}
  2046. namespace{
//! Build NCHW88 winograd benchmark cases (ResNet-like channel sweeps plus
//! VGG16 conv shapes) for the given kernel size.
//! NOTE(review): `pack_size` is only used for the divisibility filter; the
//! shapes below hard-code the 8-channel NCHW88 packing, so callers must pass
//! pack_size == 8 — confirm before reusing with another pack size.
std::vector<conv_bias::TestArg> get_winograd_benchmark_args(size_t kernel,
                                                            size_t pack_size) {
    std::vector<conv_bias::TestArg> args;
    auto pack = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
                    size_t p) {
        //! channels must be divisible by the pack size
        if (ic % pack_size != 0 || oc % pack_size != 0)
            return;
        //! padded input must fit the kernel
        if (w + 2 * p < kernel || h + 2 * p < kernel)
            return;
        param::ConvBias param;
        param.mode = param::ConvBias::Mode::CROSS_CORRELATION;
        param.format = param::ConvBias::Format::NCHW88;
        param.sparse = param::ConvBias::Sparse::DENSE;
        param.nonlineMode = param::ConvBias::NonlineMode::RELU;
        param.stride_h = 1;
        param.stride_w = 1;
        param.pad_h = p;
        param.pad_w = p;
        //! NCHW88 layouts: src {n, ic/8, h, w, 8},
        //! filter {oc/8, ic/8, kh, kw, 8, 8}, bias {1, oc/8, 1, 1, 8}
        args.push_back(conv_bias::TestArg{param,
                                          TensorShape{1, ic/8, h, w, 8},
                                          TensorShape{oc/8, ic/8, kernel, kernel, 8, 8},
                                          {1, oc/8, 1, 1, 8}});
    };
    for (size_t ic : {64, 128, 256}) {
        for (size_t oc : {64,128,256}) {
            pack(oc, ic, 56, 56, kernel, kernel / 2);
            pack(oc, ic, 14, 14, kernel, kernel / 2);
            pack(oc, ic, 28, 28, kernel, kernel / 2);
        }
    }
    //! conv in vgg16
    pack(512, 512, 15, 15, kernel, kernel / 2);
    pack(512, 256, 15, 15, kernel, kernel / 2);
    pack(256, 256, 29, 29, kernel, kernel / 2);
    pack(256, 128, 29, 29, kernel, kernel / 2);
    pack(128, 128, 57, 57, kernel, kernel / 2);
    pack(128, 64, 57, 57, kernel, kernel / 2);
    pack(64, 64, 56, 56, kernel, kernel / 2);
    pack(128, 128, 28, 28, kernel, kernel / 2);
    pack(512, 512, 14, 14, kernel, kernel / 2);
    return args;
}
//! Time the default-dispatched conv against the named winograd algorithm on
//! every case from get_winograd_benchmark_args and print gflops + speedup.
//! \param algo_name full winograd algo string (e.g. "WINOGRAD:...:8:6:8")
//! \param kernel filter size passed to the case generator
//! \param pack_size channel pack size for the case generator (8 for NCHW88)
void benchmark_winograd(const char* algo_name, Handle* handle,
                        size_t kernel, size_t pack_size) {
    auto&& args = get_winograd_benchmark_args(kernel, pack_size);
    using namespace conv_bias;
    constexpr size_t RUN = 10;
    //! `benchmark` runs whatever algo the heuristic picks; the second
    //! benchmarker is restricted to `algo_name` by algo_benchmark below
    Benchmarker<ConvBias> benchmark(handle);
    benchmark.set_display(false);
    benchmark.set_times(RUN);
    Benchmarker<ConvBias> benchmark_winograd(handle);
    benchmark_winograd.set_display(false);
    benchmark_winograd.set_times(RUN);
    for (auto&& arg : args) {
        TensorLayout dst_layout;
        auto opr = handle->create_operator<ConvBias>();
        opr->param() = arg.param;
        opr->deduce_layout({arg.src, dtype::Float32()},
                           {arg.filter, dtype::Float32()},
                           {arg.bias, dtype::Float32()}, {}, dst_layout);
        //! dst.nr_elems * IC * FH * FW * 2
        //! filter is NCHW88 {oc/8, ic/8, fh, fw, 8, 8}: filter[1] is ic/8,
        //! so the extra *8.0 restores the full input-channel count
        float computations = dst_layout.total_nr_elems() * arg.filter[1] *
                                     arg.filter[2] * arg.filter[3] * 2.0 * 8.0 /
                             (1024 * 1024 * 1024) * 1e3;
        auto used = benchmark.set_param(arg.param).exec(
                            {arg.src, arg.filter, {}, {}, {}}) /
                    RUN;
        benchmark_winograd.set_param(arg.param);
        auto used_winograd =
                algo_benchmark<ConvBias>(benchmark_winograd,
                                         {arg.src, arg.filter, {}, {}, {}},
                                         algo_name) /
                RUN;
        printf("%s %s: normal: %f ms %f Gflops winograd: %f ms %f GFlops "
               "speedup: "
               "%f\n",
               arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
               used, computations / used, used_winograd,
               computations / used_winograd, used / used_winograd);
    }
}
  2128. }
//! Winograd F(6,3) benchmark: 3x3 kernel, 8-channel (mk8) packing.
TEST_F(X86, BENCHMARK_CONVBIAS_WINOGRAD_F63_8x8) {
    benchmark_winograd("WINOGRAD:X86_F32MK8_8X8:8:6:8", handle(), 3, 8);
}
//! Winograd F(2,3) benchmark: 3x3 kernel, 8-channel (mk8) packing.
TEST_F(X86, BENCHMARK_CONVBIAS_WINOGRAD_F23_8x8) {
    benchmark_winograd("WINOGRAD:X86_F32MK8_8X8:8:2:8", handle(), 3, 8);
}
  2135. #endif
  2136. } // namespace test
  2137. } // namespace megdnn
  2138. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台