You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

conv_bias.cpp 104 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558
  1. /**
  2. * \file dnn/test/arm_common/conv_bias.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
  10. * implied.
  11. */
  12. #include "megdnn/dtype.h"
  13. #include "test/arm_common/fixture.h"
  14. #include "megdnn/opr_param_defs.h"
  15. #include "megdnn/oprs.h"
  16. #include "src/fallback/conv_bias/common.h"
  17. #include "test/common/benchmarker.h"
  18. #include "test/common/checker.h"
  19. #include "test/common/conv_bias.h"
  20. #include "test/common/rng.h"
  21. #include "test/common/tensor.h"
  22. #include "test/common/workspace_wrapper.h"
  23. using namespace megdnn;
  24. using namespace test;
  25. using namespace conv_bias;
  26. //! TODO: this algo currently does not support multithreading
  27. TEST_F(ARM_COMMON, CONVBIAS_INT8_INT8_INT16_STRIDE2F2) {
  28. checker_conv_bias_int8x8x16(get_conv_bias_args({2}, 2, true, true, true),
  29. handle(), "I8816STRD2F2");
  30. }
  31. TEST_F(ARM_COMMON, CONV_BIAS_MATMUL) {
  32. using namespace conv_bias;
  33. std::vector<TestArg> args = get_quantized_args();
  34. Checker<ConvBiasForward> checker(handle());
  35. checker.set_before_exec_callback(
  36. conv_bias::ConvBiasAlgoChecker<ConvBias>("S8MATMUL"));
  37. #if MEGDNN_ARMV7
  38. checker.set_epsilon(1);
  39. #endif
  40. UniformIntRNG rng{-50, 50};
  41. for (auto&& arg : args) {
  42. if (arg.bias.ndim == 4 && arg.bias[2] != 1 && arg.bias[3] != 1)
  43. continue;
  44. checker.set_dtype(0, dtype::QuantizedS8(0.41113496f))
  45. .set_dtype(1, dtype::QuantizedS8(0.01887994f))
  46. .set_dtype(2, dtype::QuantizedS32(0.41113496f * 0.01887994f))
  47. .set_dtype(4, dtype::QuantizedS8(0.49550694f))
  48. .set_rng(0, &rng)
  49. .set_rng(1, &rng)
  50. .set_rng(2, &rng)
  51. .set_param(arg.param)
  52. .execs({arg.src, arg.filter, arg.bias, {}, {}});
  53. }
  54. }
  55. TEST_F(ARM_COMMON, CONV_BIAS_WINOGRAD_F63_4) {
  56. using namespace conv_bias;
  57. std::vector<TestArg> args = get_winograd_mk_packed_args();
  58. Checker<ConvBiasForward> checker(handle());
  59. check_winograd("4:6:16", checker, args, param::MatrixMul::Format::MK4);
  60. }
  61. TEST_F(ARM_COMMON, CONV_BIAS_WINOGRAD_F63_4_WEIGHT_PREPROCESS) {
  62. using namespace conv_bias;
  63. std::vector<TestArg> args = get_winograd_mk_packed_args();
  64. Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
  65. handle());
  66. check_winograd("4:6:16", checker, args, param::MatrixMul::Format::MK4);
  67. }
  68. #define CONV_BIAS_MATMUL_QU8_MODE(MODE) \
  69. using namespace conv_bias; \
  70. std::vector<TestArg> args = get_quantized_args_with_nlmode(MODE); \
  71. Checker<ConvBiasForward> checker(handle()); \
  72. checker.set_before_exec_callback( \
  73. conv_bias::ConvBiasAlgoChecker<ConvBias>("QU8MATMUL")); \
  74. UniformIntRNG rng{0, 127}; \
  75. for (auto&& arg : args) { \
  76. if (arg.bias.ndim == 4 && arg.bias[2] != 1 && arg.bias[3] != 1) \
  77. continue; \
  78. checker.set_dtype(0, dtype::Quantized8Asymm( \
  79. 2.5f, static_cast<uint8_t>(127))) \
  80. .set_dtype(1, dtype::Quantized8Asymm( \
  81. 2.7f, static_cast<uint8_t>(126))) \
  82. .set_dtype(2, dtype::QuantizedS32(6.75f)) \
  83. .set_dtype(4, dtype::Quantized8Asymm( \
  84. 60.25f, static_cast<uint8_t>(125))) \
  85. .set_rng(0, &rng) \
  86. .set_rng(1, &rng) \
  87. .set_rng(2, &rng) \
  88. .set_param(arg.param) \
  89. .execs({arg.src, arg.filter, arg.bias, {}, {}}); \
  90. }
  91. #define MODE_STR(mode) param::ConvBias::NonlineMode::mode
  92. #define CB_TEST(MODE) \
  93. TEST_F(ARM_COMMON, CONV_BIAS_MATMUL_QU8_##MODE) { \
  94. CONV_BIAS_MATMUL_QU8_MODE(MODE_STR(MODE)); \
  95. }
  96. CB_TEST(IDENTITY);
  97. CB_TEST(RELU);
  98. CB_TEST(H_SWISH);
  99. #undef MODE_STR
  100. #undef CB_TEST
  101. #undef CONV_BIAS_MATMUL_QU8_MODE
  102. #if MEGDNN_WITH_BENCHMARK
  103. static void benchmark_convbias(Handle* handle, std::string int_name,
  104. std::string float_name, bool is_fp32 = false,
  105. bool is_8x8x16 = false) {
  106. constexpr size_t RUNS = 30;
  107. Benchmarker<ConvBias> benchmarker_int(handle);
  108. benchmarker_int.set_times(RUNS)
  109. .set_dtype(0, dtype::QuantizedS8(2.5))
  110. .set_dtype(1, dtype::QuantizedS8(2.5))
  111. .set_dtype(2, dtype::QuantizedS32(6.25))
  112. .set_dtype(4, dtype::QuantizedS8(60.25))
  113. .set_display(false);
  114. benchmarker_int.set_before_exec_callback(
  115. conv_bias::ConvBiasAlgoChecker<ConvBias>(int_name.c_str()));
  116. Benchmarker<ConvBias> benchmarker_float(handle);
  117. benchmarker_float.set_display(false).set_times(RUNS);
  118. benchmarker_float.set_before_exec_callback(
  119. conv_bias::ConvBiasAlgoChecker<ConvBias>(float_name.c_str()));
  120. Benchmarker<ConvBias> benchmarker_nchw44(handle);
  121. if (is_fp32) {
  122. benchmarker_nchw44.set_times(RUNS)
  123. .set_dtype(0, dtype::Float32())
  124. .set_dtype(1, dtype::Float32())
  125. .set_dtype(2, dtype::Float32())
  126. .set_dtype(4, dtype::Float32())
  127. .set_display(false);
  128. } else if (is_8x8x16) {
  129. benchmarker_nchw44.set_times(RUNS)
  130. .set_dtype(0, dtype::Int8())
  131. .set_dtype(1, dtype::Int8())
  132. .set_dtype(2, dtype::Int16())
  133. .set_dtype(4, dtype::Int16())
  134. .set_display(false);
  135. benchmarker_int.set_times(RUNS)
  136. .set_dtype(0, dtype::Int8())
  137. .set_dtype(1, dtype::Int8())
  138. .set_dtype(2, dtype::Int16())
  139. .set_dtype(4, dtype::Int16())
  140. .set_display(false);
  141. } else {
  142. benchmarker_nchw44.set_times(RUNS)
  143. .set_dtype(0, dtype::QuantizedS8(2.5))
  144. .set_dtype(1, dtype::QuantizedS8(2.5))
  145. .set_dtype(2, dtype::QuantizedS32(6.25))
  146. .set_dtype(4, dtype::QuantizedS8(60.25))
  147. .set_display(false);
  148. }
  149. auto nchw44_algo_regx = ".*(DIRECT|NCHW_NCHW44).*";
  150. #if __ARM_FEATURE_DOTPROD
  151. if (!is_fp32) {
  152. nchw44_algo_regx = ".*DOT.*";
  153. }
  154. #endif
  155. benchmarker_nchw44.set_before_exec_callback(
  156. conv_bias::ConvBiasAlgoChecker<ConvBias>(nchw44_algo_regx));
  157. auto run = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W,
  158. size_t FS, size_t stride, bool input_nchw = false) {
  159. param::ConvBias param;
  160. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  161. if (is_8x8x16) {
  162. param.nonlineMode = param::ConvBias::NonlineMode::IDENTITY;
  163. }
  164. param.stride_h = stride;
  165. param.stride_w = stride;
  166. param.pad_h = FS / 2;
  167. param.pad_w = FS / 2;
  168. auto OH = (H + 2 * param.pad_h - FS) /
  169. static_cast<size_t>(param.stride_h) +
  170. 1;
  171. auto OW = (W + 2 * param.pad_w - FS) /
  172. static_cast<size_t>(param.stride_w) +
  173. 1;
  174. TensorShape src({N, IC, H, W}), filter({OC, IC, FS, FS}),
  175. bias({1, OC, 1, 1}), dst({N, OC, OH, OW});
  176. if (is_8x8x16) {
  177. bias = {};
  178. }
  179. param.format = param::ConvBias::Format::NCHW;
  180. auto int_used = benchmarker_int.set_param(param).exec(
  181. {src, filter, bias, {}, dst}) /
  182. RUNS;
  183. auto float_used = benchmarker_float.set_param(param).exec(
  184. {src, filter, bias, {}, dst}) /
  185. RUNS;
  186. param.format = param::ConvBias::Format::NCHW44;
  187. src = {N, IC / 4, H, W, 4};
  188. filter = {OC / 4, IC / 4, FS, FS, 4, 4};
  189. if (input_nchw) {
  190. src = {N, IC, H, W};
  191. filter = {OC / 4, FS, FS, IC, 4};
  192. }
  193. bias = {1, OC / 4, 1, 1, 4};
  194. if (is_8x8x16) {
  195. bias = {};
  196. }
  197. dst = {N, OC / 4, OH, OW, 4};
  198. auto int_nchw44_used = benchmarker_nchw44.set_param(param).exec(
  199. {src, filter, bias, {}, dst}) /
  200. RUNS;
  201. float computations = IC * (FS * FS) * dst.total_nr_elems() * 2 * 1e-6;
  202. printf("run: %s %s %s->%s \n", src.to_string().c_str(),
  203. filter.to_string().c_str(), bias.to_string().c_str(),
  204. dst.to_string().c_str());
  205. printf("float: %f ms %f Gflops, ", float_used,
  206. computations / float_used);
  207. printf("int_nchw: %f ms %f Gflops, ", int_used,
  208. computations / int_used);
  209. auto speed_up = int_used / int_nchw44_used;
  210. if (is_fp32) {
  211. speed_up = float_used / int_nchw44_used;
  212. printf("fp32_nchw44: %f ms %f Gflops %f speedup, ", int_nchw44_used,
  213. computations / int_nchw44_used, speed_up);
  214. } else {
  215. printf("int_nchw44: %f ms %f Gflops %f speedup, ", int_nchw44_used,
  216. computations / int_nchw44_used, speed_up);
  217. }
  218. printf("\n");
  219. };
  220. if (is_fp32) {
  221. run(1, 1, 4, 112, 112, 2, 2, true);
  222. run(1, 3, 32, 224, 224, 3, 2, true);
  223. run(1, 3, 64, 224, 224, 7, 2, true);
  224. run(1, 1, 4, 112, 112, 2, 1, true);
  225. run(1, 3, 32, 224, 224, 3, 1, true);
  226. run(1, 3, 64, 224, 224, 3, 1, true);
  227. run(1, 3, 64, 224, 224, 7, 1, true);
  228. run(1, 64, 128, 56, 56, 3, 2, false);
  229. run(1, 128, 256, 28, 28, 3, 2, false);
  230. run(1, 256, 512, 14, 14, 3, 2, false);
  231. run(1, 128, 128, 28, 28, 3, 1, false);
  232. run(1, 256, 256, 14, 14, 3, 1, false);
  233. run(1, 512, 512, 7, 7, 3, 1, false);
  234. } else {
  235. run(1, 1, 4, 112, 112, 2, 2, true);
  236. run(1, 3, 8, 224, 224, 3, 2, true);
  237. run(1, 3, 32, 224, 224, 3, 2, true);
  238. run(1, 3, 32, 224, 224, 5, 2, true);
  239. run(1, 3, 64, 224, 224, 7, 2, true);
  240. run(1, 1, 4, 112, 112, 2, 1, true);
  241. run(1, 3, 32, 224, 224, 3, 1, true);
  242. run(1, 3, 32, 224, 224, 5, 1, true);
  243. run(1, 3, 64, 224, 224, 7, 1, true);
  244. run(1, 64, 128, 56, 56, 3, 2, false);
  245. run(1, 128, 256, 28, 28, 3, 2, false);
  246. run(1, 256, 512, 14, 14, 3, 2, false);
  247. run(1, 128, 128, 28, 28, 3, 1, false);
  248. run(1, 256, 256, 14, 14, 3, 1, false);
  249. run(1, 512, 512, 7, 7, 3, 1, false);
  250. for (size_t stride : {1}) {
  251. printf("stride %zu\n", stride);
  252. for (size_t filter_size : {2, 3, 5, 7}) {
  253. for (size_t img_size : {32}) {
  254. for (size_t channel : {8, 16, 32, 64, 128, 256}) {
  255. run(1, channel, channel, img_size, img_size,
  256. filter_size, stride, false);
  257. }
  258. }
  259. }
  260. }
  261. }
  262. }
  263. TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_NCHW44) {
  264. #if MEGDNN_AARCH64
  265. benchmark_convbias(handle(), "IM2COLMATMUL:AARCH64_INT8X8X32_K4X4X16:384",
  266. "IM2COLMATMUL:AARCH64_F32K8X12X1:192", true);
  267. benchmark_convbias(handle(), "IM2COLMATMUL:AARCH64_INT8X8X32_K4X4X16:384",
  268. "IM2COLMATMUL:AARCH64_F32K8X12X1:192", false);
  269. benchmark_convbias(handle(), "IM2COLMATMUL:AARCH64_INT8X8X16_K4X4X16:192",
  270. "IM2COLMATMUL:AARCH64_F32K8X12X1:192", false, true);
  271. #else
  272. benchmark_convbias(handle(), "IM2COLMATMUL:ARMV7_INT8X8X32_K4X8X8:384",
  273. "IM2COLMATMUL:ARMV7_F32:192", true);
  274. benchmark_convbias(handle(), "IM2COLMATMUL:ARMV7_INT8X8X32_K4X8X8:384",
  275. "IM2COLMATMUL:ARMV7_F32:192", false);
  276. benchmark_convbias(handle(), "IM2COLMATMUL:ARMV7_INT8X8X16_K4X8X8:384",
  277. "IM2COLMATMUL:ARMV7_F32:192", false, true);
  278. #endif
  279. }
  280. TEST_F(ARM_COMMON_MULTI_THREADS, BENCHMARK_CONVBIAS_NCHW44) {
  281. #if MEGDNN_AARCH64
  282. benchmark_convbias(handle(), "IM2COLMATMUL:AARCH64_INT8X8X32_K4X4X16:384",
  283. "IM2COLMATMUL:AARCH64_F32K8X12X1:192", true);
  284. benchmark_convbias(handle(), "IM2COLMATMUL:AARCH64_INT8X8X32_K4X4X16:384",
  285. "IM2COLMATMUL:AARCH64_F32K8X12X1:192", false);
  286. #else
  287. benchmark_convbias(handle(), "IM2COLMATMUL:ARMV7_INT8X8X32_K4X8X8:384",
  288. "IM2COLMATMUL:ARMV7_F32:192", true);
  289. benchmark_convbias(handle(), "IM2COLMATMUL:AARCH64_INT8X8X32_K4X4X16:384",
  290. "IM2COLMATMUL:ARMV7_F32:192", false);
  291. #endif
  292. }
  293. #endif
  294. TEST_F(ARM_COMMON, CONV_BIAS_MATMUL_QS8) {
  295. using namespace conv_bias;
  296. std::vector<TestArg> args = get_quantized_args();
  297. Checker<ConvBiasForward> checker(handle());
  298. checker.set_before_exec_callback(
  299. conv_bias::ConvBiasAlgoChecker<ConvBias>("S8MATMUL"));
  300. #if MEGDNN_ARMV7
  301. checker.set_epsilon(1);
  302. #endif
  303. UniformIntRNG rng{0, 255};
  304. for (auto&& arg : args) {
  305. if (arg.bias.ndim == 4 && arg.bias[2] != 1 && arg.bias[3] != 1)
  306. continue;
  307. checker.set_dtype(0, dtype::QuantizedS8(2.5f))
  308. .set_dtype(1, dtype::QuantizedS8(2.7f))
  309. .set_dtype(2, dtype::QuantizedS32(6.75f))
  310. .set_dtype(4, dtype::QuantizedS8(60.25f))
  311. .set_rng(0, &rng)
  312. .set_rng(1, &rng)
  313. .set_rng(2, &rng)
  314. .set_param(arg.param)
  315. .execs({arg.src, arg.filter, arg.bias, {}, {}});
  316. }
  317. }
  #if MEGDNN_ARMV7
  //! ARMv7 only. Runs "S8MATMUL" on a {2, 1, 128, 128} input with a
  //! {16, 1, 2, 2} filter (stride 1, no padding, IDENTITY nonlinearity), once
  //! with an empty bias and once with a {1, 16, 1, 1} bias.
  TEST_F(ARM_COMMON, CONV_BIAS_RESCALE_OP) {
      using namespace conv_bias;
      constexpr float kSrcScale = 0.41113496f;
      constexpr float kFilterScale = 0.01887994f;
      constexpr float kDstScale = 0.49550694f;

      Checker<ConvBias> checker(handle());
      checker.set_before_exec_callback(
              conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("S8MATMUL"));
      checker.set_epsilon(1).set_max_avg_error(1e-2).set_max_avg_biased_error(
              1e-3);

      UniformIntRNG rng{-128, 127};
      checker.set_dtype(0, dtype::QuantizedS8(kSrcScale))
              .set_dtype(1, dtype::QuantizedS8(kFilterScale))
              .set_dtype(2, dtype::QuantizedS32(kSrcScale * kFilterScale))
              .set_dtype(4, dtype::QuantizedS8(kDstScale))
              .set_rng(0, &rng)
              .set_rng(1, &rng)
              .set_rng(2, &rng);

      param::ConvBias conv_param;
      conv_param.stride_h = 1;
      conv_param.stride_w = 1;
      conv_param.pad_h = 0;
      conv_param.pad_w = 0;
      conv_param.nonlineMode = NonlineMode::IDENTITY;

      const TensorShape src{2, 1, 128, 128};
      const TensorShape filter{16, 1, 2, 2};
      const TensorShape bias{1, 16, 1, 1};

      //! Unary op: empty bias
      checker.set_param(conv_param)
              .exec({src, filter, TensorShape{}, TensorShape{}, {}});
      //! Binary op: with bias
      checker.set_param(conv_param)
              .exec({src, filter, bias, TensorShape{}, {}});
  }
  #endif
  354. #if MEGDNN_WITH_BENCHMARK
  355. void benchmark_im2col(const char* algo_name, const char* im2col_name,
  356. Handle* handle, size_t kernel, size_t pack_size = 1) {
  357. auto&& args = get_winograd_benchmark_args(kernel, pack_size);
  358. using namespace conv_bias;
  359. constexpr size_t RUN = 10;
  360. Benchmarker<ConvBias> benchmark(handle);
  361. benchmark.set_display(false);
  362. benchmark.set_times(RUN);
  363. Benchmarker<ConvBias> benchmark_im2col(handle);
  364. benchmark_im2col.set_display(false);
  365. benchmark_im2col.set_times(RUN);
  366. for (auto&& arg : args) {
  367. TensorLayout dst_layout;
  368. auto opr = handle->create_operator<ConvBias>();
  369. opr->param() = arg.param;
  370. opr->deduce_layout({arg.src, dtype::Float32()},
  371. {arg.filter, dtype::Float32()},
  372. {arg.bias, dtype::Float32()}, {}, dst_layout);
  373. //! dst.nr_elems * IC * FH * FW * 2
  374. float computations = dst_layout.total_nr_elems() * arg.filter[1] *
  375. arg.filter[2] * arg.filter[3] * 2.0 /
  376. (1024 * 1024 * 1024) * 1e3;
  377. benchmark.set_param(arg.param);
  378. auto used = algo_benchmark<ConvBias>(benchmark,
  379. {arg.src, arg.filter, {}, {}, {}},
  380. algo_name) /
  381. RUN;
  382. benchmark_im2col.set_param(arg.param);
  383. auto used_im2col =
  384. algo_benchmark<ConvBias>(benchmark_im2col,
  385. {arg.src, arg.filter, {}, {}, {}},
  386. im2col_name) /
  387. RUN;
  388. printf("%s %s: normal: %f ms %f Gflops im2col: %f ms %f GFlops "
  389. "speedup: "
  390. "%f\n",
  391. arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
  392. used, computations / used, used_im2col,
  393. computations / used_im2col, used / used_im2col);
  394. }
  395. }
  396. void benchmark_im2col_single_algo(const char* im2col_name, Handle* handle,
  397. size_t kernel, size_t pack_size = 1) {
  398. std::vector<conv_bias::TestArg> args;
  399. auto pack = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  400. size_t p) {
  401. if (ic % pack_size != 0 || oc % pack_size != 0)
  402. return;
  403. if (w + 2 * p < kernel || h + 2 * p < kernel)
  404. return;
  405. param::ConvBias param;
  406. param.stride_h = 1;
  407. param.stride_w = 1;
  408. param.pad_h = p;
  409. param.pad_w = p;
  410. args.push_back(conv_bias::TestArg{param,
  411. TensorShape{1, ic, h, w},
  412. TensorShape{oc, ic, kernel, kernel},
  413. {1, oc, 1, 1}});
  414. };
  415. pack(1, 64, 100, 100, kernel, 1);
  416. pack(8, 64, 100, 100, kernel, 1);
  417. pack(16, 64, 100, 100, kernel, 1);
  418. pack(32, 64, 100, 100, kernel, 1);
  419. pack(64, 64, 100, 100, kernel, 1);
  420. pack(128, 64, 100, 100, kernel, 1);
  421. pack(256, 64, 100, 100, kernel, 1);
  422. pack(512, 64, 100, 100, kernel, 1);
  423. pack(1024, 64, 100, 100, kernel, 1);
  424. pack(1, 64, 10, 10, kernel, 1);
  425. pack(8, 64, 10, 10, kernel, 1);
  426. pack(16, 64, 10, 10, kernel, 1);
  427. pack(32, 64, 10, 10, kernel, 1);
  428. pack(64, 64, 10, 10, kernel, 1);
  429. pack(128, 64, 10, 10, kernel, 1);
  430. pack(256, 64, 10, 10, kernel, 1);
  431. pack(512, 64, 10, 10, kernel, 1);
  432. pack(1024, 64, 10, 10, kernel, 1);
  433. pack(1, 16, 10, 10, kernel, 1);
  434. pack(8, 16, 10, 10, kernel, 1);
  435. pack(16, 16, 10, 10, kernel, 1);
  436. pack(32, 16, 10, 10, kernel, 1);
  437. pack(64, 16, 10, 10, kernel, 1);
  438. pack(128, 16, 10, 10, kernel, 1);
  439. pack(256, 16, 10, 10, kernel, 1);
  440. pack(512, 16, 10, 10, kernel, 1);
  441. pack(1024, 16, 10, 10, kernel, 1);
  442. using namespace conv_bias;
  443. constexpr size_t RUN = 20;
  444. Benchmarker<ConvBias> benchmark_im2col(handle);
  445. benchmark_im2col.set_display(false);
  446. benchmark_im2col.set_times(RUN);
  447. for (auto&& arg : args) {
  448. TensorLayout dst_layout;
  449. auto opr = handle->create_operator<ConvBias>();
  450. opr->param() = arg.param;
  451. opr->deduce_layout({arg.src, dtype::Float32()},
  452. {arg.filter, dtype::Float32()},
  453. {arg.bias, dtype::Float32()}, {}, dst_layout);
  454. //! dst.nr_elems * IC * FH * FW * 2
  455. float computations = dst_layout.total_nr_elems() * arg.filter[1] *
  456. arg.filter[2] * arg.filter[3] * 2.0 /
  457. (1024 * 1024 * 1024) * 1e3;
  458. benchmark_im2col.set_param(arg.param);
  459. auto used_im2col =
  460. algo_benchmark<ConvBias>(benchmark_im2col,
  461. {arg.src, arg.filter, {}, {}, {}},
  462. im2col_name) /
  463. RUN;
  464. printf("%s %s: im2col: %f ms %f GFlops \n", arg.src.to_string().c_str(),
  465. arg.filter.to_string().c_str(), used_im2col,
  466. computations / used_im2col);
  467. }
  468. }
  469. void BENCHMARK_IM2COL_NCHW44_VS_NCHW(const char* algo_name,
  470. const char* im2col_name, Handle* handle,
  471. size_t kernel, DType src_type,
  472. DType dst_type) {
  473. auto&& args = get_winograd_benchmark_args(kernel, 4);
  474. using namespace conv_bias;
  475. constexpr size_t RUN = 10;
  476. Benchmarker<ConvBias> benchmark(handle);
  477. benchmark.set_display(false);
  478. benchmark.set_times(RUN);
  479. benchmark.set_dtype(0, src_type);
  480. benchmark.set_dtype(1, src_type);
  481. benchmark.set_dtype(2, dst_type);
  482. benchmark.set_dtype(4, dst_type);
  483. Benchmarker<ConvBias> benchmark_im2col(handle);
  484. benchmark_im2col.set_display(false);
  485. benchmark_im2col.set_times(RUN);
  486. benchmark_im2col.set_dtype(0, src_type);
  487. benchmark_im2col.set_dtype(1, src_type);
  488. benchmark_im2col.set_dtype(2, dst_type);
  489. benchmark_im2col.set_dtype(4, dst_type);
  490. for (auto&& arg : args) {
  491. TensorLayout dst_layout;
  492. auto opr = handle->create_operator<ConvBias>();
  493. opr->param() = arg.param;
  494. opr->deduce_layout({arg.src, dtype::Float32()},
  495. {arg.filter, dtype::Float32()},
  496. {arg.bias, dtype::Float32()}, {}, dst_layout);
  497. //! dst.nr_elems * IC * FH * FW * 2
  498. float computations = dst_layout.total_nr_elems() * arg.filter[1] *
  499. arg.filter[2] * arg.filter[3] * 2.0 /
  500. (1024 * 1024 * 1024) * 1e3;
  501. std::vector<conv_bias::TestArg> nchw44param;
  502. benchmark.set_param(arg.param);
  503. auto used = algo_benchmark<ConvBias>(benchmark,
  504. {arg.src, arg.filter, {}, {}, {}},
  505. algo_name) /
  506. RUN;
  507. arg.param.nonlineMode = param::ConvBias::NonlineMode::IDENTITY;
  508. arg.param.format = param::ConvBias::Format::NCHW44;
  509. benchmark_im2col.set_param(arg.param);
  510. nchw44param.push_back(conv_bias::TestArg{
  511. arg.param,
  512. TensorShape{arg.src.shape[0], arg.src.shape[1] / 4, arg.src[2],
  513. arg.src.shape[3], 4},
  514. TensorShape{arg.filter.shape[0] / 4, arg.filter.shape[1] / 4,
  515. kernel, kernel, 4, 4},
  516. TensorShape{}});
  517. auto used_im2col =
  518. algo_benchmark<ConvBias>(
  519. benchmark_im2col,
  520. {nchw44param[0].src, nchw44param[0].filter, {}, {}, {}},
  521. im2col_name) /
  522. RUN;
  523. printf("nchw44 shape src %s filter %s\n",
  524. nchw44param[0].src.to_string().c_str(),
  525. nchw44param[0].filter.to_string().c_str());
  526. printf("%s %s: normal: %f ms %f Gflops im2col: %f ms %f GFlops "
  527. "speedup: "
  528. "%f\n",
  529. arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
  530. used, computations / used, used_im2col,
  531. computations / used_im2col, used / used_im2col);
  532. }
  533. }
  534. std::vector<conv_bias::TestArg> get_nchw44_channel_wise_benchmark_args(
  535. std::vector<size_t> kernel, size_t stride, bool no_bias,
  536. bool no_nonlinemode, bool no_full_bias) {
  537. using namespace conv_bias;
  538. using Param = param::ConvBias;
  539. using NLMode = param::ConvBias::NonlineMode;
  540. std::vector<TestArg> args;
  541. auto pack = [&](size_t n, size_t group, size_t w, size_t h, size_t kernel,
  542. size_t stride, NLMode nlmode, bool pad) {
  543. Param param;
  544. param.stride_h = stride;
  545. param.stride_w = stride;
  546. if (pad) {
  547. param.pad_h = kernel / 2;
  548. param.pad_w = kernel / 2;
  549. } else {
  550. param.pad_h = 0;
  551. param.pad_w = 0;
  552. }
  553. param.nonlineMode = nlmode;
  554. param.format = param::ConvBias::Format::NCHW44;
  555. param.sparse = param::ConvBias::Sparse::GROUP;
  556. args.emplace_back(param, TensorShape{n, group, h, w, 4},
  557. TensorShape{group, 1, 1, kernel, kernel, 4},
  558. TensorShape{});
  559. if (!no_bias) {
  560. args.emplace_back(param, TensorShape{n, group, h, w, 4},
  561. TensorShape{group, 1, 1, kernel, kernel, 4},
  562. TensorShape{1, group, 1, 1, 4});
  563. }
  564. if (!no_full_bias) {
  565. args.emplace_back(
  566. param, TensorShape{n, group, h, w, 4},
  567. TensorShape{group, 1, 1, kernel, kernel, 4},
  568. TensorShape{n, group,
  569. (h + 2 * param.pad_w - kernel) / stride + 1,
  570. (w + 2 * param.pad_w - kernel) / stride + 1,
  571. 4});
  572. }
  573. };
  574. std::vector<NLMode> nonlinemode = {NLMode::IDENTITY};
  575. if (!no_nonlinemode) {
  576. nonlinemode.emplace_back(NLMode::RELU);
  577. nonlinemode.emplace_back(NLMode::H_SWISH);
  578. }
  579. for (size_t n : {1}) {
  580. for (auto nlmode : nonlinemode) {
  581. for (bool pad : {true}) {
  582. for (size_t group : {1, 2, 4, 128}) {
  583. for (size_t size : {40,89,100,200}) {
  584. for (size_t kern : kernel) {
  585. pack(n, group, size, size, kern, stride, nlmode,
  586. pad);
  587. }
  588. }
  589. }
  590. }
  591. for (bool pad : {false}) {
  592. for (size_t group : {1, 2, 4, 8, 16, 32, 64, 128}) {
  593. for (size_t size : {40, 89, 100}) {
  594. for (size_t kern : kernel) {
  595. pack(n, group, size, size, kern, stride, nlmode,
  596. pad);
  597. }
  598. }
  599. }
  600. }
  601. }
  602. }
  603. return args;
  604. }
  605. void BENCHMARK_GROUPCONV_NCHW44_int8x8x16VS_int8x8x32(const char* algo_name0,
  606. const char* algo_name1, Handle* handle,
  607. size_t kernel,size_t stride = 1, size_t pack_size = 1) {
  608. auto args = get_nchw44_channel_wise_benchmark_args({2, 3, 5}, stride, false, true, true);
  609. using namespace conv_bias;
  610. constexpr size_t RUN = 10;
  611. Benchmarker<ConvBias> benchmark(handle);
  612. benchmark.set_display(false);
  613. benchmark.set_times(RUN);
  614. benchmark.set_dtype(0, dtype::Int8());
  615. benchmark.set_dtype(1, dtype::Int8());
  616. benchmark.set_dtype(2, dtype::Int32());
  617. benchmark.set_dtype(4, dtype::Int32());
  618. Benchmarker<ConvBias> benchmark_algo1(handle);
  619. benchmark_algo1.set_display(false);
  620. benchmark_algo1.set_times(RUN);
  621. benchmark_algo1.set_dtype(0, dtype::Int8());
  622. benchmark_algo1.set_dtype(1, dtype::Int8());
  623. benchmark_algo1.set_dtype(2, dtype::Int16());
  624. benchmark_algo1.set_dtype(4, dtype::Int16());
  625. for (auto&& arg : args) {
  626. TensorLayout dst_layout;
  627. auto opr = handle->create_operator<ConvBias>();
  628. opr->param() = arg.param;
  629. opr->deduce_layout({arg.src, dtype::Float32()},
  630. {arg.filter, dtype::Float32()},
  631. {arg.bias, dtype::Float32()}, {}, dst_layout);
  632. //! dst.nr_elems * IC * FH * FW * 2
  633. float computations = dst_layout.total_nr_elems() * arg.filter[1] *
  634. arg.filter[2] * arg.filter[3] * 2.0 * pack_size/
  635. (1024 * 1024 * 1024) * 1e3;
  636. benchmark.set_param(arg.param);
  637. auto used = algo_benchmark<ConvBias>(benchmark,
  638. {arg.src, arg.filter, {}, {}, {}},
  639. algo_name0) /
  640. RUN;
  641. arg.param.nonlineMode = param::ConvBias::NonlineMode::IDENTITY;
  642. arg.param.format = param::ConvBias::Format::NCHW44;
  643. benchmark_algo1.set_param(arg.param);
  644. auto used_algo1 =
  645. algo_benchmark<ConvBias>(
  646. benchmark_algo1,
  647. {arg.src, arg.filter, {}, {}, {}},
  648. algo_name1) /
  649. RUN;
  650. printf("%s %s: normal: %f ms %f Gflops 8x8x16: %f ms %f GFlops "
  651. "speedup: "
  652. "%f\n",
  653. arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
  654. used, computations / used, used_algo1,
  655. computations / used_algo1, used / used_algo1);
  656. }
  657. }
  #if MEGDNN_AARCH64
  //! AArch64 only: im2col int8x8x32 benchmark, NCHW algorithm vs its MK4
  //! (NCHW44) counterpart, with a 3x3 filter.
  TEST_F(ARM_COMMON, BENCHMARK_NCHW_VS_NCHW44_INT8x8x32) {
      const char* nchw_algo = "IM2COLMATMUL:AARCH64_INT8X8X32_K4X4X16";
      const char* nchw44_algo = "IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16";
      constexpr size_t filter_size = 3;

      printf("=========================compare "
             "IM2COLMATMUL:AARCH64_INT8X8X32_K4X4X16, "
             "IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16 \n");
      BENCHMARK_IM2COL_NCHW44_VS_NCHW(nchw_algo, nchw44_algo, handle(),
                                      filter_size, dtype::Int8(),
                                      dtype::Int32());
  }
  #endif
  #if MEGDNN_ARMV7
  //! Armv7 only: im2col int8x8x16 benchmark, NCHW algorithm vs its MK4
  //! (NCHW44) counterpart, with a 3x3 filter.
  TEST_F(ARM_COMMON, BENCHMARK_NCHW_VS_NCHW44_INT8x8x16) {
      const char* nchw_algo = "IM2COLMATMUL:ARMV7_INT8X8X16_K4X8X8";
      const char* nchw44_algo = "IM2COLMATMUL:ARMV7_INT8X8X16_MK4_K8X8X4";
      constexpr size_t filter_size = 3;

      printf("compare %s vs %s \n", nchw_algo, nchw44_algo);
      BENCHMARK_IM2COL_NCHW44_VS_NCHW(nchw_algo, nchw44_algo, handle(),
                                      filter_size, dtype::Int8(),
                                      dtype::Int16());
  }
  #endif
  677. TEST_F(ARM_COMMON, BENCHMARK_GROUP_CONV_NCHW44_INT8x8x32_VS_INT8x8x16_STRIDE1) {
  678. BENCHMARK_GROUPCONV_NCHW44_int8x8x16VS_int8x8x32("S8_CHAN_WISE_STRD1_NCHW44",
  679. "S8x8x16_CHAN_WISE_STRD1_STRD2_NCHW44",
  680. handle(), 3,1,4);
  681. }
  682. TEST_F(ARM_COMMON, BENCHMARK_GROUP_CONV_NCHW44_INT8x8x32_VS_INT8x8x16_STRIDE2) {
  683. BENCHMARK_GROUPCONV_NCHW44_int8x8x16VS_int8x8x32("S8_CHAN_WISE_STRD2_NCHW44",
  684. "S8x8x16_CHAN_WISE_STRD1_STRD2_NCHW44",
  685. handle(), 3,2, 4);
  686. }
  687. TEST_F(ARM_COMMON, BENCHMARK_GROUP_CONVBIAS_QUANTIZED) {
  688. constexpr size_t RUNS = 50;
  689. param::ConvBias param;
  690. param.sparse = param::ConvBias::Sparse::GROUP;
  691. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  692. Benchmarker<ConvBias> benchmarker_int(handle());
  693. benchmarker_int.set_times(RUNS)
  694. .set_dtype(0, dtype::QuantizedS8(2.5f))
  695. .set_dtype(1, dtype::QuantizedS8(2.5f))
  696. .set_dtype(2, dtype::QuantizedS32(6.25f))
  697. .set_dtype(4, dtype::QuantizedS8(40.25f))
  698. .set_display(false);
  699. Benchmarker<ConvBias> benchmarker_float(handle());
  700. benchmarker_float.set_display(false).set_times(RUNS);
  701. auto run = [&](size_t N, size_t GROUP, size_t IC, size_t OC, size_t H,
  702. size_t W, size_t FS, size_t STRD) {
  703. megdnn_assert(IC % GROUP == 0 && OC % GROUP == 0);
  704. TensorShape src({N, IC, H, W}),
  705. filter({GROUP, OC / GROUP, IC / GROUP, FS, FS}),
  706. bias({1, OC, 1, 1}), dst({N, OC, H / STRD, W / STRD});
  707. param.pad_h = FS / 2;
  708. param.pad_w = FS / 2;
  709. param.stride_h = STRD;
  710. param.stride_w = STRD;
  711. auto int_used = benchmarker_int.set_param(param).exec(
  712. {src, filter, bias, {}, dst}) /
  713. RUNS;
  714. auto float_used = benchmarker_float.set_param(param).exec(
  715. {src, filter, bias, {}, dst}) /
  716. RUNS;
  717. float computations = (IC / GROUP * FS * FS * dst.total_nr_elems() * 2 +
  718. dst.total_nr_elems()) *
  719. 1e-6;
  720. printf("run: %s %s %s->%s \nfloat: %f ms %f Gflops int: %f ms "
  721. "%f Gflops speedup: %f\n",
  722. src.to_string().c_str(), filter.to_string().c_str(),
  723. bias.to_string().c_str(), dst.to_string().c_str(), float_used,
  724. computations / float_used, int_used, computations / int_used,
  725. float_used / int_used);
  726. };
  727. run(1, 1, 28, 28, 28, 28, 3, 1);
  728. run(1, 68, 68, 68, 14, 14, 3, 2);
  729. run(1, 96, 96, 96, 14, 14, 3, 2);
  730. run(1, 100, 100, 100, 7, 7, 3, 1);
  731. }
  732. #endif
  733. #if MEGDNN_WITH_BENCHMARK
  734. TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_MATMUL) {
  735. constexpr size_t RUNS = 10;
  736. param::ConvBias param;
  737. param.stride_h = 1;
  738. param.stride_w = 1;
  739. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  740. Benchmarker<ConvBias> benchmarker(handle()), benchmarker_fused(handle());
  741. benchmarker.set_times(RUNS)
  742. .set_dtype(0, dtype::QuantizedS8(2.5f))
  743. .set_dtype(1, dtype::QuantizedS8(2.5f))
  744. .set_dtype(2, dtype::QuantizedS32(6.25f))
  745. .set_dtype(4, dtype::QuantizedS8(40.25f))
  746. .set_display(false);
  747. benchmarker_fused.set_times(RUNS)
  748. .set_dtype(0, dtype::QuantizedS8(2.5f))
  749. .set_dtype(1, dtype::QuantizedS8(2.5f))
  750. .set_dtype(2, dtype::QuantizedS32(6.25f))
  751. .set_dtype(4, dtype::QuantizedS8(40.25f))
  752. .set_display(false);
  753. benchmarker_fused.set_before_exec_callback(
  754. conv_bias::ConvBiasAlgoChecker<ConvBias>("S8MATMUL"));
  755. auto run = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W,
  756. size_t FS) {
  757. TensorShape src({N, IC, H, W}), filter({OC, IC, FS, FS}),
  758. bias({1, OC, 1, 1}), dst({N, OC, H, W});
  759. param.pad_h = FS / 2;
  760. param.pad_w = FS / 2;
  761. auto default_used = benchmarker.set_param(param).exec(
  762. {src, filter, bias, {}, dst}) /
  763. RUNS;
  764. auto fused_used = benchmarker_fused.set_param(param).exec(
  765. {src, filter, bias, {}, dst}) /
  766. RUNS;
  767. float computations =
  768. IC * (FS * FS + 1) * dst.total_nr_elems() * 2 * 1e-6;
  769. printf("run: %s %s %s->%s \ndefault: %f ms %f Gflops fused: %f ms "
  770. "%f Gflops speedup: %f\n",
  771. src.to_string().c_str(), filter.to_string().c_str(),
  772. bias.to_string().c_str(), dst.to_string().c_str(), default_used,
  773. computations / default_used, fused_used,
  774. computations / fused_used, default_used / fused_used);
  775. };
  776. run(1, 128, 128, 32, 32, 3);
  777. for (size_t IC : {36, 48}) {
  778. for (size_t OC : {36, 48, 64}) {
  779. for (size_t size : {56, 128, 256}) {
  780. for (size_t FS : {1, 3, 5}) {
  781. run(1, IC, OC, size, size, FS);
  782. }
  783. }
  784. }
  785. }
  786. }
  787. #endif
  788. #if MEGDNN_WITH_BENCHMARK
  789. TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F23) {
  790. #if MEGDNN_AARCH64
  791. benchmark_winograd("WINOGRAD:AARCH64_F32:1:2", handle(), 3);
  792. #else
  793. benchmark_winograd("WINOGRAD:ARMV7_F32_:1:2", handle(), 3);
  794. #endif
  795. }
  796. TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F23_4x4) {
  797. #if MEGDNN_AARCH64
  798. benchmark_winograd("WINOGRAD:AARCH64_F32_MK4_4x16:4:2", handle(), 3, 4);
  799. #else
  800. benchmark_winograd("WINOGRAD:ARMV7_F32_MK4_4x8:4:2", handle(), 3, 4);
  801. #endif
  802. }
  803. TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F63) {
  804. #if MEGDNN_AARCH64
  805. benchmark_winograd("WINOGRAD:AARCH64_F32K8X12X1:1:6", handle(), 3);
  806. #else
  807. benchmark_winograd("WINOGRAD:ARMV7_F32:1:6", handle(), 3);
  808. #endif
  809. }
  810. TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F63_4x4) {
  811. #if MEGDNN_AARCH64
  812. benchmark_winograd("WINOGRAD:AARCH64_F32_MK4_4x16:4:6", handle(), 3, 4);
  813. #else
  814. benchmark_winograd("WINOGRAD:ARMV7_F32_MK4_4x8:4:6", handle(), 3, 4);
  815. #endif
  816. }
  817. TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F54) {
  818. #if MEGDNN_AARCH64
  819. benchmark_winograd("WINOGRAD:AARCH64_F32K8X12X1:1:5", handle(), 4);
  820. #else
  821. benchmark_winograd("WINOGRAD:ARMV7_F32:1:5", handle(), 4);
  822. #endif
  823. }
  824. TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F45) {
  825. #if MEGDNN_AARCH64
  826. benchmark_winograd("WINOGRAD:AARCH64_F32K8X12X1:1:4", handle(), 5);
  827. #else
  828. benchmark_winograd("WINOGRAD:ARMV7_F32:1:4", handle(), 5);
  829. #endif
  830. }
  831. #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
  832. TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F16_F23) {
  833. #if MEGDNN_AARCH64
  834. benchmark_winograd_fp16("WINOGRAD:AARCH64_F32_MK4_4x16:4:2",
  835. "WINOGRAD:AARCH64_F16_K8X24X1:1:6", handle(), 3, 4);
  836. #else
  837. benchmark_winograd_fp16("WINOGRAD:ARMV7_F32:1:2",
  838. "WINOGRAD:AARCH32_F16_K4X16X1:1:2", handle(), 3);
  839. #endif
  840. }
  841. TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F16_F45) {
  842. #if MEGDNN_AARCH64
  843. benchmark_winograd_fp16("WINOGRAD:AARCH64_F32K8X12X1:1:4",
  844. "WINOGRAD:AARCH64_F16_K8X24X1:1:4", handle(), 5);
  845. #else
  846. benchmark_winograd_fp16("WINOGRAD:ARMV7_F32:1:4",
  847. "WINOGRAD:AARCH32_F16_K4X16X1:1:4", handle(), 5);
  848. #endif
  849. }
  850. TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F16_F63) {
  851. #if MEGDNN_AARCH64
  852. benchmark_winograd_fp16("WINOGRAD:AARCH64_F32K8X12X1:1:6",
  853. "WINOGRAD:AARCH64_F16_K8X24X1:1:6", handle(), 3);
  854. #else
  855. benchmark_winograd_fp16("WINOGRAD:ARMV7_F32:1:6",
  856. "WINOGRAD:AARCH32_F16_K4X16X1:1:6", handle(), 3);
  857. #endif
  858. }
  859. TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F16_F23_8x8) {
  860. #if MEGDNN_AARCH64
  861. benchmark_winograd_fp16("WINOGRAD:AARCH64_F32_MK4_4x16:4:2",
  862. "WINOGRAD:AARCH64_F16_MK8_8X8:8:2", handle(), 3, 8);
  863. #else
  864. benchmark_winograd_fp16("WINOGRAD:ARMV7_F32_MK4_4x8:4:2",
  865. "WINOGRAD:AARCH32_F16_MK8_4X8:8:2", handle(), 3, 8);
  866. #endif
  867. }
  868. #endif
  869. void benchmark_winograd_nchw_vs_nchw44(const char* algo_name0,
  870. const char* algo_name1, Handle* handle) {
  871. using namespace conv_bias;
  872. using NLMode = param::ConvBias::NonlineMode;
  873. std::vector<conv_bias::TestArg> args_nchw44;
  874. std::vector<conv_bias::TestArg> args_nchw;
  875. auto pack = [&](size_t n, size_t oc, size_t ic, size_t h, size_t w,
  876. size_t group, NLMode nlmode) {
  877. param::ConvBias param;
  878. param.format = param::ConvBias::Format::NCHW44;
  879. param.stride_h = 1;
  880. param.stride_w = 1;
  881. param.pad_h = 1;
  882. param.pad_w = 1;
  883. param.nonlineMode = nlmode;
  884. if (group == 1) {
  885. param.sparse = param::ConvBias::Sparse::DENSE;
  886. args_nchw44.emplace_back(param, TensorShape{n, ic / 4, h, w, 4},
  887. TensorShape{oc / 4, ic / 4, 3, 3, 4, 4},
  888. TensorShape{});
  889. param.format = param::ConvBias::Format::NCHW;
  890. args_nchw.emplace_back(param, TensorShape{n, ic, h, w},
  891. TensorShape{oc, ic, 3, 3}, TensorShape{});
  892. } else {
  893. auto oc_per_group = oc / group;
  894. auto ic_per_group = ic / group;
  895. param.sparse = param::ConvBias::Sparse::GROUP;
  896. args_nchw44.emplace_back(param,
  897. TensorShape{n, ic_per_group / 4, h, w, 4},
  898. TensorShape{group, oc_per_group / 4,
  899. ic_per_group / 4, 3, 3, 4, 4},
  900. TensorShape{});
  901. param.format = param::ConvBias::Format::NCHW;
  902. args_nchw.emplace_back(
  903. param, TensorShape{n, ic, h, w},
  904. TensorShape{group, oc_per_group, ic_per_group, 3, 3},
  905. TensorShape{});
  906. }
  907. };
  908. std::vector<NLMode> nonlinemode = {NLMode::IDENTITY};
  909. for (auto nlmode : nonlinemode)
  910. for (size_t n : {1})
  911. for (size_t group = 1; group <= 1; ++group) {
  912. pack(n, 512, 512, 15, 15, group, nlmode);
  913. pack(n, 512, 256, 15, 15, group, nlmode);
  914. pack(n, 256, 256, 29, 29, group, nlmode);
  915. pack(n, 256, 128, 29, 29, group, nlmode);
  916. pack(n, 128, 128, 57, 57, group, nlmode);
  917. pack(n, 128, 64, 57, 57, group, nlmode);
  918. pack(n, 24, 24, 224, 224, group, nlmode);
  919. pack(n, 64, 24, 123, 123, group, nlmode);
  920. pack(n, 64, 64, 56, 56, group, nlmode);
  921. pack(n, 128, 128, 28, 28, group, nlmode);
  922. pack(n, 256, 256, 14, 14, group, nlmode);
  923. pack(n, 512, 512, 7, 7, group, nlmode);
  924. }
  925. using namespace conv_bias;
  926. constexpr size_t RUN = 10;
  927. Benchmarker<ConvBias> benchmark_winograd_nchw(handle);
  928. benchmark_winograd_nchw.set_display(false);
  929. benchmark_winograd_nchw.set_times(RUN);
  930. Benchmarker<ConvBias> benchmark_winograd_nchw44(handle);
  931. benchmark_winograd_nchw44.set_display(false);
  932. benchmark_winograd_nchw44.set_times(RUN);
  933. std::string winograd_nchw_algo_name = ssprintf("WINOGRAD:%s", algo_name0);
  934. std::string winograd_nchw44_algo_name =
  935. ssprintf("WINOGRAD_NCHW44:%s", algo_name1);
  936. for (size_t i = 0; i < args_nchw.size(); ++i) {
  937. auto arg_nchw = args_nchw[i];
  938. auto arg_nchw44 = args_nchw44[i];
  939. TensorLayout dst_layout;
  940. auto opr = handle->create_operator<ConvBias>();
  941. opr->param() = arg_nchw.param;
  942. opr->deduce_layout({arg_nchw.src, dtype::Float32()},
  943. {arg_nchw.filter, dtype::Float32()},
  944. {arg_nchw.bias, dtype::Float32()}, {}, dst_layout);
  945. //! dst.nr_elems * IC * FH * FW * 2
  946. float computations = dst_layout.total_nr_elems() * arg_nchw.filter[1] *
  947. arg_nchw.filter[2] * arg_nchw.filter[3] * 2.0 /
  948. (1024 * 1024 * 1024) * 1e3;
  949. benchmark_winograd_nchw.set_param(arg_nchw.param);
  950. auto nchw_used = algo_benchmark<ConvBias>(
  951. benchmark_winograd_nchw,
  952. {arg_nchw.src, arg_nchw.filter, {}, {}, {}},
  953. winograd_nchw_algo_name.c_str()) /
  954. RUN;
  955. benchmark_winograd_nchw44.set_param(arg_nchw44.param);
  956. auto nchw44_used =
  957. algo_benchmark<ConvBias>(
  958. benchmark_winograd_nchw44,
  959. {arg_nchw44.src, arg_nchw44.filter, {}, {}, {}},
  960. winograd_nchw44_algo_name.c_str()) /
  961. RUN;
  962. printf("%s %s: nchw: %f ms %f Gflops nchw44: %f ms %f GFlops "
  963. "speedup: "
  964. "%f\n",
  965. arg_nchw.src.to_string().c_str(),
  966. arg_nchw.filter.to_string().c_str(), nchw_used,
  967. computations / nchw_used, nchw44_used,
  968. computations / nchw44_used, nchw_used / nchw44_used);
  969. }
  970. }
  971. TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F23_MK4_NCHW_VS_NCHW44) {
  972. #if MEGDNN_AARCH64
  973. benchmark_winograd_nchw_vs_nchw44("AARCH64_F32_MK4_4x16:4:2",
  974. "AARCH64_F32_MK4_4x16:4:2", handle());
  975. #else
  976. benchmark_winograd_nchw_vs_nchw44("ARMV7_F32_MK4_4x8:4:2",
  977. "ARMV7_F32_MK4_4x8:4:2", handle());
  978. #endif
  979. }
  980. TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F63_MK4_NCHW_VS_NCHW44) {
  981. #if MEGDNN_AARCH64
  982. benchmark_winograd_nchw_vs_nchw44("AARCH64_F32_MK4_4x16:4:6",
  983. "AARCH64_F32_MK4_4x16:4:6", handle());
  984. #else
  985. benchmark_winograd_nchw_vs_nchw44("ARMV7_F32_MK4_4x8:4:6",
  986. "ARMV7_F32_MK4_4x8:4:6", handle());
  987. #endif
  988. }
  989. TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F73_MK4_NCHW_VS_NCHW44) {
  990. #if MEGDNN_AARCH64
  991. benchmark_winograd_nchw_vs_nchw44("AARCH64_F32_MK4_4x16:4:6",
  992. "ARM_COMMON_F32_GEMV_MK4:4:7", handle());
  993. #else
  994. benchmark_winograd_nchw_vs_nchw44("ARMV7_F32_MK4_4x8:4:6",
  995. "ARMV7_F32_MK4_4x8:4:7", handle());
  996. #endif
  997. }
  998. TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F23_8x8) {
  999. auto benchmark_winograd_quantized = [](const char* algo_name_fp32,
  1000. const char* algo_name_quantized,
  1001. Handle* handle, size_t kernel) {
  1002. auto&& args = get_winograd_benchmark_args(kernel);
  1003. using namespace conv_bias;
  1004. constexpr size_t RUN = 10;
  1005. Benchmarker<ConvBias> benchmark(handle);
  1006. benchmark.set_display(false);
  1007. benchmark.set_times(RUN);
  1008. Benchmarker<ConvBias> benchmark_winograd(handle);
  1009. benchmark_winograd.set_display(false).set_times(RUN);
  1010. benchmark_winograd.set_dtype(0, dtype::QuantizedS8(2.5f))
  1011. .set_dtype(1, dtype::QuantizedS8(2.5f))
  1012. .set_dtype(2, dtype::QuantizedS32(6.25f))
  1013. .set_dtype(4, dtype::QuantizedS8(60.25f));
  1014. for (auto&& arg : args) {
  1015. TensorLayout dst_layout;
  1016. auto opr = handle->create_operator<ConvBias>();
  1017. opr->param() = arg.param;
  1018. opr->deduce_layout({arg.src, dtype::Float32()},
  1019. {arg.filter, dtype::Float32()},
  1020. {arg.bias, dtype::Float32()}, {}, dst_layout);
  1021. //! dst.nr_elems * IC * FH * FW * 2
  1022. float computations = dst_layout.total_nr_elems() * arg.filter[1] *
  1023. arg.filter[2] * arg.filter[3] * 2.0 /
  1024. (1024 * 1024 * 1024) * 1e3;
  1025. benchmark.set_param(arg.param);
  1026. auto used = algo_benchmark<ConvBias>(
  1027. benchmark, {arg.src, arg.filter, {}, {}, {}},
  1028. algo_name_fp32) /
  1029. RUN;
  1030. benchmark_winograd.set_param(arg.param);
  1031. auto used_winograd =
  1032. algo_benchmark<ConvBias>(benchmark_winograd,
  1033. {arg.src, arg.filter, {}, {}, {}},
  1034. algo_name_quantized) /
  1035. RUN;
  1036. printf("%s %s: normal: %f ms %f Gflops winograd: %f ms %f GFlops "
  1037. "speedup: "
  1038. "%f\n",
  1039. arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
  1040. used, computations / used, used_winograd,
  1041. computations / used_winograd, used / used_winograd);
  1042. }
  1043. };
  1044. #if MEGDNN_AARCH64
  1045. benchmark_winograd_quantized("WINOGRAD:AARCH64_F32_MK4_4x16:4:2",
  1046. "WINOGRAD:AARCH64_INT16X16X32_MK8_8X8:8:2",
  1047. handle(), 3);
  1048. #else
  1049. benchmark_winograd_quantized("WINOGRAD:ARMV7_F32_MK4_4x8:4:2",
  1050. "WINOGRAD:ARMV7_INT16X16X32_MK8_4X8:8:2",
  1051. handle(), 3);
  1052. #endif
  1053. }
  1054. TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_INT8_STRIDE1) {
  1055. // have to remove preferred restrict in usable func before run the benchmark
  1056. using namespace conv_bias;
  1057. std::vector<TestArg> args;
  1058. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  1059. size_t p, NonlineMode nonline_mode) {
  1060. if (w + 2 * p < kernel || h + 2 * p < kernel)
  1061. return;
  1062. param::ConvBias param;
  1063. param.stride_h = 1;
  1064. param.stride_w = 1;
  1065. param.pad_h = p;
  1066. param.pad_w = p;
  1067. param.nonlineMode = nonline_mode;
  1068. //! channel bias
  1069. args.emplace_back(param, TensorShape{2, ic, h, w},
  1070. TensorShape{oc, ic, kernel, kernel},
  1071. TensorShape{1, oc, 1, 1});
  1072. };
  1073. for (size_t kernel : {2, 3, 5, 7})
  1074. for (size_t ic : {1, 8, 16, 32})
  1075. for (size_t oc : {1, 8, 16, 32})
  1076. for (size_t p : {1})
  1077. for (NonlineMode nonline_mode : {NonlineMode::RELU}) {
  1078. run(oc, ic, 56, 56, kernel, p, nonline_mode);
  1079. run(oc, ic, 128, 128, kernel, p, nonline_mode);
  1080. run(oc, ic, 256, 256, kernel, p, nonline_mode);
  1081. }
  1082. constexpr size_t RUN = 50;
  1083. Benchmarker<ConvBias> benchmark0(handle());
  1084. benchmark0.set_dtype(0, dtype::QuantizedS8(2.5f))
  1085. .set_dtype(1, dtype::QuantizedS8(2.5f))
  1086. .set_dtype(2, dtype::QuantizedS32(6.25f))
  1087. .set_dtype(4, dtype::QuantizedS8(60.25f));
  1088. benchmark0.set_display(false);
  1089. benchmark0.set_times(RUN);
  1090. benchmark0.set_before_exec_callback(
  1091. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("S8STRD1"));
  1092. Benchmarker<ConvBias> benchmark1(handle());
  1093. benchmark1.set_dtype(0, dtype::QuantizedS8(2.5f))
  1094. .set_dtype(1, dtype::QuantizedS8(2.5f))
  1095. .set_dtype(2, dtype::QuantizedS32(6.25f))
  1096. .set_dtype(4, dtype::QuantizedS8(60.25f));
  1097. benchmark1.set_display(false);
  1098. benchmark1.set_times(RUN);
  1099. for (auto&& arg : args) {
  1100. TensorLayout dst_layout;
  1101. auto opr = handle()->create_operator<ConvBias>();
  1102. opr->param() = arg.param;
  1103. opr->deduce_layout({arg.src, dtype::Int8()},
  1104. {arg.filter, dtype::Int8()},
  1105. {arg.bias, dtype::Int32()}, {}, dst_layout);
  1106. //! dst.nr_elems * IC * FH * FW * 2
  1107. float computations = dst_layout.total_nr_elems() * arg.filter[1] *
  1108. arg.filter[2] * arg.filter[3] * 2.0 /
  1109. (1024 * 1024 * 1024) * 1e3;
  1110. auto used0 = benchmark0.set_param(arg.param).exec(
  1111. {arg.src, arg.filter, arg.bias, {}, {}}) /
  1112. RUN;
  1113. auto used1 = benchmark1.set_param(arg.param).exec(
  1114. {arg.src, arg.filter, arg.bias, {}, {}}) /
  1115. RUN;
  1116. printf("%s %s: conv_bias: %f ms %f Gflops conv_elem: %f ms %f GFlops "
  1117. "speedup: %f\n",
  1118. arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
  1119. used0, computations / used0, used1, computations / used1,
  1120. used1 / used0);
  1121. }
  1122. }
  1123. TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_INT8_STRIDE2) {
  1124. // have to remove preferred restrict in usable func before run the benchmark
  1125. using namespace conv_bias;
  1126. std::vector<TestArg> args;
  1127. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  1128. size_t p, NonlineMode nonline_mode) {
  1129. if (w + 2 * p < kernel || h + 2 * p < kernel)
  1130. return;
  1131. param::ConvBias param;
  1132. param.stride_h = 2;
  1133. param.stride_w = 2;
  1134. param.pad_h = p;
  1135. param.pad_w = p;
  1136. param.nonlineMode = nonline_mode;
  1137. //! channel bias
  1138. args.emplace_back(param, TensorShape{2, ic, h, w},
  1139. TensorShape{oc, ic, kernel, kernel},
  1140. TensorShape{1, oc, 1, 1});
  1141. };
  1142. for (size_t kernel : {2, 3, 5, 7})
  1143. for (size_t ic : {1, 8, 16, 32})
  1144. for (size_t oc : {1, 8, 16, 32})
  1145. for (size_t p : {1})
  1146. for (NonlineMode nonline_mode : {NonlineMode::RELU}) {
  1147. run(oc, ic, 56, 56, kernel, p, nonline_mode);
  1148. run(oc, ic, 128, 128, kernel, p, nonline_mode);
  1149. run(oc, ic, 256, 256, kernel, p, nonline_mode);
  1150. }
  1151. constexpr size_t RUN = 50;
  1152. Benchmarker<ConvBias> benchmark0(handle());
  1153. benchmark0.set_dtype(0, dtype::QuantizedS8(2.5f))
  1154. .set_dtype(1, dtype::QuantizedS8(2.5f))
  1155. .set_dtype(2, dtype::QuantizedS32(6.25f))
  1156. .set_dtype(4, dtype::QuantizedS8(60.25f));
  1157. benchmark0.set_display(false);
  1158. benchmark0.set_times(RUN);
  1159. benchmark0.set_before_exec_callback(
  1160. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("S8STRD2"));
  1161. Benchmarker<ConvBias> benchmark1(handle());
  1162. benchmark1.set_dtype(0, dtype::QuantizedS8(2.5f))
  1163. .set_dtype(1, dtype::QuantizedS8(2.5f))
  1164. .set_dtype(2, dtype::QuantizedS32(6.25f))
  1165. .set_dtype(4, dtype::QuantizedS8(60.25f));
  1166. benchmark1.set_display(false);
  1167. benchmark1.set_times(RUN);
  1168. for (auto&& arg : args) {
  1169. TensorLayout dst_layout;
  1170. auto opr = handle()->create_operator<ConvBias>();
  1171. opr->param() = arg.param;
  1172. opr->deduce_layout({arg.src, dtype::Int8()},
  1173. {arg.filter, dtype::Int8()},
  1174. {arg.bias, dtype::Int32()}, {}, dst_layout);
  1175. //! dst.nr_elems * IC * FH * FW * 2
  1176. float computations = dst_layout.total_nr_elems() * arg.filter[1] *
  1177. arg.filter[2] * arg.filter[3] * 2.0 /
  1178. (1024 * 1024 * 1024) * 1e3;
  1179. auto used0 = benchmark0.set_param(arg.param).exec(
  1180. {arg.src, arg.filter, arg.bias, {}, {}}) /
  1181. RUN;
  1182. auto used1 = benchmark1.set_param(arg.param).exec(
  1183. {arg.src, arg.filter, arg.bias, {}, {}}) /
  1184. RUN;
  1185. printf("%s %s: conv_bias: %f ms %f Gflops conv_elem: %f ms %f GFlops "
  1186. "speedup: %f\n",
  1187. arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
  1188. used0, computations / used0, used1, computations / used1,
  1189. used1 / used0);
  1190. }
  1191. }
  1192. TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_QUINT8_STRIDE1) {
  1193. // have to remove preferred restrict in usable func before run the benchmark
  1194. using namespace conv_bias;
  1195. std::vector<TestArg> args;
  1196. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  1197. size_t p, NonlineMode nonline_mode) {
  1198. if (w + 2 * p < kernel || h + 2 * p < kernel)
  1199. return;
  1200. param::ConvBias param;
  1201. param.stride_h = 1;
  1202. param.stride_w = 1;
  1203. param.pad_h = p;
  1204. param.pad_w = p;
  1205. param.nonlineMode = nonline_mode;
  1206. //! channel bias
  1207. args.emplace_back(param, TensorShape{2, ic, h, w},
  1208. TensorShape{oc, ic, kernel, kernel},
  1209. TensorShape{1, oc, 1, 1});
  1210. };
  1211. for (size_t kernel : {2, 3, 5, 7})
  1212. for (size_t ic : {1, 8, 16, 32})
  1213. for (size_t oc : {1, 8, 16, 32})
  1214. for (size_t p : {1})
  1215. for (NonlineMode nonline_mode : {NonlineMode::RELU}) {
  1216. run(oc, ic, 56, 56, kernel, p, nonline_mode);
  1217. run(oc, ic, 128, 128, kernel, p, nonline_mode);
  1218. run(oc, ic, 256, 256, kernel, p, nonline_mode);
  1219. }
  1220. constexpr size_t RUN = 50;
  1221. Benchmarker<ConvBias> benchmark0(handle());
  1222. benchmark0
  1223. .set_dtype(0,
  1224. dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(100)))
  1225. .set_dtype(1,
  1226. dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(120)))
  1227. .set_dtype(2, dtype::QuantizedS32(0.04f))
  1228. .set_dtype(4,
  1229. dtype::Quantized8Asymm(1.4f, static_cast<uint8_t>(110)));
  1230. benchmark0.set_display(false);
  1231. benchmark0.set_times(RUN);
  1232. benchmark0.set_before_exec_callback(
  1233. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("QU8STRD1"));
  1234. Benchmarker<ConvBias> benchmark1(handle());
  1235. benchmark1
  1236. .set_dtype(0,
  1237. dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(100)))
  1238. .set_dtype(1,
  1239. dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(120)))
  1240. .set_dtype(2, dtype::QuantizedS32(0.04f))
  1241. .set_dtype(4,
  1242. dtype::Quantized8Asymm(1.4f, static_cast<uint8_t>(110)));
  1243. benchmark1.set_display(false);
  1244. benchmark1.set_times(RUN);
  1245. for (auto&& arg : args) {
  1246. TensorLayout dst_layout;
  1247. auto opr = handle()->create_operator<ConvBias>();
  1248. opr->param() = arg.param;
  1249. opr->deduce_layout({arg.src, dtype::Int8()},
  1250. {arg.filter, dtype::Int8()},
  1251. {arg.bias, dtype::Int32()}, {}, dst_layout);
  1252. //! dst.nr_elems * IC * FH * FW * 2
  1253. float computations = dst_layout.total_nr_elems() * arg.filter[1] *
  1254. arg.filter[2] * arg.filter[3] * 2.0 /
  1255. (1024 * 1024 * 1024) * 1e3;
  1256. auto used0 = benchmark0.set_param(arg.param).exec(
  1257. {arg.src, arg.filter, arg.bias, {}, {}}) /
  1258. RUN;
  1259. auto used1 = benchmark1.set_param(arg.param).exec(
  1260. {arg.src, arg.filter, arg.bias, {}, {}}) /
  1261. RUN;
  1262. printf("%s %s: conv_bias: %f ms %f Gflops conv_elem: %f ms %f GFlops "
  1263. "speedup: %f\n",
  1264. arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
  1265. used0, computations / used0, used1, computations / used1,
  1266. used1 / used0);
  1267. }
  1268. }
  1269. TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_QUINT8_STRIDE2) {
  1270. // have to remove preferred restrict in usable func before run the benchmark
  1271. using namespace conv_bias;
  1272. std::vector<TestArg> args;
  1273. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  1274. size_t p, NonlineMode nonline_mode) {
  1275. if (w + 2 * p < kernel || h + 2 * p < kernel)
  1276. return;
  1277. param::ConvBias param;
  1278. param.stride_h = 2;
  1279. param.stride_w = 2;
  1280. param.pad_h = p;
  1281. param.pad_w = p;
  1282. param.nonlineMode = nonline_mode;
  1283. //! channel bias
  1284. args.emplace_back(param, TensorShape{2, ic, h, w},
  1285. TensorShape{oc, ic, kernel, kernel},
  1286. TensorShape{1, oc, 1, 1});
  1287. };
  1288. for (size_t kernel : {2, 3, 5, 7})
  1289. for (size_t ic : {1, 8, 16, 32})
  1290. for (size_t oc : {1, 8, 16, 32})
  1291. for (size_t p : {1})
  1292. for (NonlineMode nonline_mode : {NonlineMode::RELU}) {
  1293. run(oc, ic, 56, 56, kernel, p, nonline_mode);
  1294. run(oc, ic, 128, 128, kernel, p, nonline_mode);
  1295. run(oc, ic, 256, 256, kernel, p, nonline_mode);
  1296. }
  1297. constexpr size_t RUN = 50;
  1298. Benchmarker<ConvBias> benchmark0(handle());
  1299. benchmark0
  1300. .set_dtype(0,
  1301. dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(100)))
  1302. .set_dtype(1,
  1303. dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(120)))
  1304. .set_dtype(2, dtype::QuantizedS32(0.04f))
  1305. .set_dtype(4,
  1306. dtype::Quantized8Asymm(1.4f, static_cast<uint8_t>(110)));
  1307. benchmark0.set_display(false);
  1308. benchmark0.set_times(RUN);
  1309. benchmark0.set_before_exec_callback(
  1310. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("QU8STRD2"));
  1311. Benchmarker<ConvBias> benchmark1(handle());
  1312. benchmark1
  1313. .set_dtype(0,
  1314. dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(100)))
  1315. .set_dtype(1,
  1316. dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(120)))
  1317. .set_dtype(2, dtype::QuantizedS32(0.04f))
  1318. .set_dtype(4,
  1319. dtype::Quantized8Asymm(1.4f, static_cast<uint8_t>(110)));
  1320. benchmark1.set_display(false);
  1321. benchmark1.set_times(RUN);
  1322. for (auto&& arg : args) {
  1323. TensorLayout dst_layout;
  1324. auto opr = handle()->create_operator<ConvBias>();
  1325. opr->param() = arg.param;
  1326. opr->deduce_layout({arg.src, dtype::Int8()},
  1327. {arg.filter, dtype::Int8()},
  1328. {arg.bias, dtype::Int32()}, {}, dst_layout);
  1329. //! dst.nr_elems * IC * FH * FW * 2
  1330. float computations = dst_layout.total_nr_elems() * arg.filter[1] *
  1331. arg.filter[2] * arg.filter[3] * 2.0 /
  1332. (1024 * 1024 * 1024) * 1e3;
  1333. auto used0 = benchmark0.set_param(arg.param).exec(
  1334. {arg.src, arg.filter, arg.bias, {}, {}}) /
  1335. RUN;
  1336. auto used1 = benchmark1.set_param(arg.param).exec(
  1337. {arg.src, arg.filter, arg.bias, {}, {}}) /
  1338. RUN;
  1339. printf("%s %s: conv_bias: %f ms %f Gflops conv_elem: %f ms %f GFlops "
  1340. "speedup: %f\n",
  1341. arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
  1342. used0, computations / used0, used1, computations / used1,
  1343. used1 / used0);
  1344. }
  1345. }
  1346. TEST_F(ARM_COMMON, BENCHMARK_CHANNEL_WISE_F32_STRIDE1_NCHW44) {
  1347. // have to remove preferred restrict in usable func before run the benchmark
  1348. using namespace conv_bias;
  1349. param::ConvBias param;
  1350. param.stride_h = 1;
  1351. param.stride_w = 1;
  1352. param.pad_h = 1;
  1353. param.pad_w = 1;
  1354. param.nonlineMode = NonlineMode::RELU;
  1355. param.sparse = param::ConvBias::Sparse::GROUP;
  1356. constexpr size_t RUN = 50;
  1357. Benchmarker<ConvBias> benchmark0(handle());
  1358. benchmark0.set_display(false);
  1359. benchmark0.set_param(param);
  1360. benchmark0.set_times(RUN);
  1361. benchmark0.set_before_exec_callback(
  1362. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("F32STRD1"));
  1363. auto opr = handle()->create_operator<ConvBias>();
  1364. opr->param() = param;
  1365. param.format = param::ConvBias::Format::NCHW44;
  1366. Benchmarker<ConvBias> benchmark1(handle());
  1367. benchmark1.set_display(false);
  1368. benchmark1.set_param(param);
  1369. benchmark1.set_times(RUN);
  1370. benchmark1.set_before_exec_callback(
  1371. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  1372. "F32_CHANNEL_WISE_NCHW44"));
  1373. auto run = [&](size_t group, size_t w, size_t h, size_t kernel) {
  1374. TensorLayout dst_layout;
  1375. opr->deduce_layout({{1, group * 4, h, w}, dtype::Int8()},
  1376. {{group * 4, 1, 1, kernel, kernel}, dtype::Int8()},
  1377. {{1, group * 4, 1, 1}, dtype::Int32()}, {},
  1378. dst_layout);
  1379. //! dst.nr_elems * IC * FH * FW * 2
  1380. float computations = dst_layout.total_nr_elems() * kernel * kernel *
  1381. 2.0 / (1024 * 1024 * 1024) * 1e3;
  1382. auto used0 = benchmark0.exec({{1, group * 4, h, w},
  1383. {group * 4, 1, 1, kernel, kernel},
  1384. {1, group * 4, 1, 1},
  1385. {},
  1386. {}}) /
  1387. RUN;
  1388. auto used1 = benchmark1.exec({{1, group, h, w, 4},
  1389. {group, 1, 1, kernel, kernel, 4},
  1390. {1, group, 1, 1, 4},
  1391. {},
  1392. {}}) /
  1393. RUN;
  1394. printf("group/h/w/kernel:%zu,%zu,%zu,%zu: nchw: %f ms %f Gflops "
  1395. "nchw44: "
  1396. "%f ms %f GFlops "
  1397. "speedup: %f\n",
  1398. group, h, w, kernel, used0, computations / used0, used1,
  1399. computations / used1, used0 / used1);
  1400. };
  1401. for (size_t group : {8, 16, 32, 64}) {
  1402. for (size_t kerenl : {2, 3, 5}) {
  1403. run(group, 112, 112, kerenl);
  1404. run(group, 56, 56, kerenl);
  1405. run(group, 48, 48, kerenl);
  1406. run(group, 28, 28, kerenl);
  1407. run(group, 14, 14, kerenl);
  1408. }
  1409. }
  1410. run(8, 112, 112, 3);
  1411. run(32, 56, 56, 3);
  1412. run(64, 28, 28, 3);
  1413. run(128, 14, 14, 3);
  1414. }
  1415. TEST_F(ARM_COMMON, BENCHMARK_CHANNEL_WISE_F32_STRIDE2_NCHW44) {
  1416. // have to remove preferred restrict in usable func before run the benchmark
  1417. using namespace conv_bias;
  1418. param::ConvBias param;
  1419. param.stride_h = 2;
  1420. param.stride_w = 2;
  1421. param.pad_h = 1;
  1422. param.pad_w = 1;
  1423. param.nonlineMode = NonlineMode::RELU;
  1424. param.sparse = param::ConvBias::Sparse::GROUP;
  1425. constexpr size_t RUN = 50;
  1426. Benchmarker<ConvBias> benchmark0(handle());
  1427. benchmark0.set_display(false);
  1428. benchmark0.set_param(param);
  1429. benchmark0.set_times(RUN);
  1430. benchmark0.set_before_exec_callback(
  1431. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("F32STRD2"));
  1432. auto opr = handle()->create_operator<ConvBias>();
  1433. opr->param() = param;
  1434. param.format = param::ConvBias::Format::NCHW44;
  1435. Benchmarker<ConvBias> benchmark1(handle());
  1436. benchmark1.set_display(false);
  1437. benchmark1.set_param(param);
  1438. benchmark1.set_times(RUN);
  1439. benchmark1.set_before_exec_callback(
  1440. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  1441. "F32_CHANNEL_WISE_NCHW44"));
  1442. auto run = [&](size_t group, size_t w, size_t h, size_t kernel) {
  1443. TensorLayout dst_layout;
  1444. opr->deduce_layout({{1, group * 4, h, w}, dtype::Int8()},
  1445. {{group * 4, 1, 1, kernel, kernel}, dtype::Int8()},
  1446. {{1, group * 4, 1, 1}, dtype::Int32()}, {},
  1447. dst_layout);
  1448. //! dst.nr_elems * IC * FH * FW * 2
  1449. float computations = dst_layout.total_nr_elems() * kernel * kernel *
  1450. 2.0 / (1024 * 1024 * 1024) * 1e3;
  1451. auto used0 = benchmark0.exec({{1, group * 4, h, w},
  1452. {group * 4, 1, 1, kernel, kernel},
  1453. {1, group * 4, 1, 1},
  1454. {},
  1455. {}}) /
  1456. RUN;
  1457. auto used1 = benchmark1.exec({{1, group, h, w, 4},
  1458. {group, 1, 1, kernel, kernel, 4},
  1459. {1, group, 1, 1, 4},
  1460. {},
  1461. {}}) /
  1462. RUN;
  1463. printf("group/h/w/kernel:%zu,%zu,%zu,%zu: nchw: %f ms %f Gflops "
  1464. "nchw44: "
  1465. "%f ms %f GFlops "
  1466. "speedup: %f\n",
  1467. group, h, w, kernel, used0, computations / used0, used1,
  1468. computations / used1, used0 / used1);
  1469. };
  1470. for (size_t group : {8, 16, 32, 64}) {
  1471. for (size_t kerenl : {2, 3, 5}) {
  1472. run(group, 112, 112, kerenl);
  1473. run(group, 56, 56, kerenl);
  1474. run(group, 48, 48, kerenl);
  1475. run(group, 28, 28, kerenl);
  1476. run(group, 14, 14, kerenl);
  1477. }
  1478. }
  1479. run(8, 112, 112, 3);
  1480. run(32, 56, 56, 3);
  1481. run(64, 28, 28, 3);
  1482. run(128, 14, 14, 3);
  1483. }
  1484. TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_QINT8_STRIDE1_NCHW44) {
  1485. // have to remove preferred restrict in usable func before run the benchmark
  1486. using namespace conv_bias;
  1487. param::ConvBias param;
  1488. param.stride_h = 1;
  1489. param.stride_w = 1;
  1490. param.pad_h = 1;
  1491. param.pad_w = 1;
  1492. param.nonlineMode = NonlineMode::RELU;
  1493. param.sparse = param::ConvBias::Sparse::GROUP;
  1494. constexpr size_t RUN = 50;
  1495. Benchmarker<ConvBias> benchmark0(handle());
  1496. benchmark0.set_dtype(0, dtype::QuantizedS8(0.2f))
  1497. .set_dtype(1, dtype::QuantizedS8(0.2f))
  1498. .set_dtype(2, dtype::QuantizedS32(0.04f))
  1499. .set_dtype(4, dtype::QuantizedS8(1.4f));
  1500. benchmark0.set_display(false);
  1501. benchmark0.set_param(param);
  1502. benchmark0.set_times(RUN);
  1503. benchmark0.set_before_exec_callback(
  1504. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("S8STRD1"));
  1505. auto opr = handle()->create_operator<ConvBias>();
  1506. opr->param() = param;
  1507. param.format = param::ConvBias::Format::NCHW44;
  1508. Benchmarker<ConvBias> benchmark1(handle());
  1509. benchmark1.set_dtype(0, dtype::QuantizedS8(0.2f))
  1510. .set_dtype(1, dtype::QuantizedS8(0.2f))
  1511. .set_dtype(2, dtype::QuantizedS32(0.04f))
  1512. .set_dtype(4, dtype::QuantizedS8(1.4f));
  1513. benchmark1.set_display(false);
  1514. benchmark1.set_param(param);
  1515. benchmark1.set_times(RUN);
  1516. benchmark1.set_before_exec_callback(
  1517. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  1518. "S8_CHAN_WISE_STRD1_NCHW44"));
  1519. auto run = [&](size_t group, size_t w, size_t h, size_t kernel) {
  1520. TensorLayout dst_layout;
  1521. opr->deduce_layout({{1, group * 4, h, w}, dtype::Int8()},
  1522. {{group * 4, 1, 1, kernel, kernel}, dtype::Int8()},
  1523. {{1, group * 4, 1, 1}, dtype::Int32()}, {},
  1524. dst_layout);
  1525. //! dst.nr_elems * IC * FH * FW * 2
  1526. float computations = dst_layout.total_nr_elems() * kernel * kernel *
  1527. 2.0 / (1024 * 1024 * 1024) * 1e3;
  1528. auto used0 = benchmark0.exec({{1, group * 4, h, w},
  1529. {group * 4, 1, 1, kernel, kernel},
  1530. {1, group * 4, 1, 1},
  1531. {},
  1532. {}}) /
  1533. RUN;
  1534. auto used1 = benchmark1.exec({{1, group, h, w, 4},
  1535. {group, 1, 1, kernel, kernel, 4},
  1536. {1, group, 1, 1, 4},
  1537. {},
  1538. {}}) /
  1539. RUN;
  1540. printf("group/h/w/kernel:%zu,%zu,%zu,%zu: nchw: %f ms %f Gflops "
  1541. "nchw44: "
  1542. "%f ms %f GFlops "
  1543. "speedup: %f\n",
  1544. group, h, w, kernel, used0, computations / used0, used1,
  1545. computations / used1, used0 / used1);
  1546. };
  1547. for (size_t group : {8, 16, 32, 64, 128}) {
  1548. for (size_t kerenl : {2, 3, 5}) {
  1549. run(group, 112, 112, kerenl);
  1550. run(group, 56, 56, kerenl);
  1551. run(group, 48, 48, kerenl);
  1552. run(group, 28, 28, kerenl);
  1553. run(group, 14, 14, kerenl);
  1554. }
  1555. }
  1556. }
  1557. #endif
  1558. #if __ARM_FEATURE_DOTPROD
  1559. #if MEGDNN_WITH_BENCHMARK
  1560. TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_INT8_STRIDE1_WITHDOTPROD) {
  1561. // have to remove preferred restrict in usable func before run the benchmark
  1562. using namespace conv_bias;
  1563. std::vector<TestArg> args;
  1564. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  1565. size_t p, NonlineMode nonline_mode) {
  1566. if (w + 2 * p < kernel || h + 2 * p < kernel)
  1567. return;
  1568. param::ConvBias param;
  1569. param.stride_h = 1;
  1570. param.stride_w = 1;
  1571. param.pad_h = p;
  1572. param.pad_w = p;
  1573. param.nonlineMode = nonline_mode;
  1574. //! channel bias
  1575. args.emplace_back(param, TensorShape{2, ic, h, w},
  1576. TensorShape{oc, ic, kernel, kernel},
  1577. TensorShape{1, oc, 1, 1});
  1578. };
  1579. for (size_t kernel : {2, 3, 5, 7})
  1580. for (size_t ic : {1, 8, 16, 32})
  1581. for (size_t oc : {1, 8, 16, 32})
  1582. for (size_t p : {1})
  1583. for (NonlineMode nonline_mode : {NonlineMode::RELU}) {
  1584. run(oc, ic, 56, 56, kernel, p, nonline_mode);
  1585. run(oc, ic, 128, 128, kernel, p, nonline_mode);
  1586. run(oc, ic, 256, 256, kernel, p, nonline_mode);
  1587. }
  1588. constexpr size_t RUN = 50;
  1589. Benchmarker<ConvBias> benchmark0(handle());
  1590. benchmark0.set_dtype(0, dtype::QuantizedS8(2.5f))
  1591. .set_dtype(1, dtype::QuantizedS8(2.5f))
  1592. .set_dtype(2, dtype::QuantizedS32(6.25f))
  1593. .set_dtype(4, dtype::QuantizedS8(60.25f));
  1594. benchmark0.set_display(false);
  1595. benchmark0.set_times(RUN);
  1596. benchmark0.set_before_exec_callback(
  1597. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("ARMDOTS8STRD1"));
  1598. Benchmarker<ConvBias> benchmark1(handle());
  1599. benchmark1.set_dtype(0, dtype::QuantizedS8(2.5f))
  1600. .set_dtype(1, dtype::QuantizedS8(2.5f))
  1601. .set_dtype(2, dtype::QuantizedS32(6.25f))
  1602. .set_dtype(4, dtype::QuantizedS8(60.25f));
  1603. benchmark1.set_display(false);
  1604. benchmark1.set_times(RUN);
  1605. for (auto&& arg : args) {
  1606. TensorLayout dst_layout;
  1607. auto opr = handle()->create_operator<ConvBias>();
  1608. opr->param() = arg.param;
  1609. opr->deduce_layout({arg.src, dtype::Int8()},
  1610. {arg.filter, dtype::Int8()},
  1611. {arg.bias, dtype::Int32()}, {}, dst_layout);
  1612. //! dst.nr_elems * IC * FH * FW * 2
  1613. float computations = dst_layout.total_nr_elems() * arg.filter[1] *
  1614. arg.filter[2] * arg.filter[3] * 2.0 /
  1615. (1024 * 1024 * 1024) * 1e3;
  1616. auto used0 = benchmark0.set_param(arg.param).exec(
  1617. {arg.src, arg.filter, arg.bias, {}, {}}) /
  1618. RUN;
  1619. auto used1 = benchmark1.set_param(arg.param).exec(
  1620. {arg.src, arg.filter, arg.bias, {}, {}}) /
  1621. RUN;
  1622. printf("%s %s: conv_bias: %f ms %f Gflops conv_elem: %f ms %f GFlops "
  1623. "speedup: %f\n",
  1624. arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
  1625. used0, computations / used0, used1, computations / used1,
  1626. used1 / used0);
  1627. }
  1628. }
  1629. TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_INT8_STRIDE2_WITHDOTPROD) {
  1630. // have to remove preferred restrict in usable func before run the benchmark
  1631. using namespace conv_bias;
  1632. std::vector<TestArg> args;
  1633. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  1634. size_t p, NonlineMode nonline_mode) {
  1635. if (w + 2 * p < kernel || h + 2 * p < kernel)
  1636. return;
  1637. param::ConvBias param;
  1638. param.stride_h = 2;
  1639. param.stride_w = 2;
  1640. param.pad_h = p;
  1641. param.pad_w = p;
  1642. param.nonlineMode = nonline_mode;
  1643. //! channel bias
  1644. args.emplace_back(param, TensorShape{2, ic, h, w},
  1645. TensorShape{oc, ic, kernel, kernel},
  1646. TensorShape{1, oc, 1, 1});
  1647. };
  1648. for (size_t kernel : {2, 3, 5, 7})
  1649. for (size_t ic : {1, 8, 16, 32})
  1650. for (size_t oc : {1, 8, 16, 32})
  1651. for (size_t p : {1})
  1652. for (NonlineMode nonline_mode : {NonlineMode::RELU}) {
  1653. run(oc, ic, 56, 56, kernel, p, nonline_mode);
  1654. run(oc, ic, 128, 128, kernel, p, nonline_mode);
  1655. run(oc, ic, 256, 256, kernel, p, nonline_mode);
  1656. }
  1657. constexpr size_t RUN = 50;
  1658. Benchmarker<ConvBias> benchmark0(handle());
  1659. benchmark0.set_dtype(0, dtype::QuantizedS8(2.5f))
  1660. .set_dtype(1, dtype::QuantizedS8(2.5f))
  1661. .set_dtype(2, dtype::QuantizedS32(6.25f))
  1662. .set_dtype(4, dtype::QuantizedS8(60.25f));
  1663. benchmark0.set_display(false);
  1664. benchmark0.set_times(RUN);
  1665. benchmark0.set_before_exec_callback(
  1666. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("ARMDOTS8STRD2"));
  1667. Benchmarker<ConvBias> benchmark1(handle());
  1668. benchmark1.set_dtype(0, dtype::QuantizedS8(2.5f))
  1669. .set_dtype(1, dtype::QuantizedS8(2.5f))
  1670. .set_dtype(2, dtype::QuantizedS32(6.25f))
  1671. .set_dtype(4, dtype::QuantizedS8(60.25f));
  1672. benchmark1.set_display(false);
  1673. benchmark1.set_times(RUN);
  1674. for (auto&& arg : args) {
  1675. TensorLayout dst_layout;
  1676. auto opr = handle()->create_operator<ConvBias>();
  1677. opr->param() = arg.param;
  1678. opr->deduce_layout({arg.src, dtype::Int8()},
  1679. {arg.filter, dtype::Int8()},
  1680. {arg.bias, dtype::Int32()}, {}, dst_layout);
  1681. //! dst.nr_elems * IC * FH * FW * 2
  1682. float computations = dst_layout.total_nr_elems() * arg.filter[1] *
  1683. arg.filter[2] * arg.filter[3] * 2.0 /
  1684. (1024 * 1024 * 1024) * 1e3;
  1685. auto used0 = benchmark0.set_param(arg.param).exec(
  1686. {arg.src, arg.filter, arg.bias, {}, {}}) /
  1687. RUN;
  1688. auto used1 = benchmark1.set_param(arg.param).exec(
  1689. {arg.src, arg.filter, arg.bias, {}, {}}) /
  1690. RUN;
  1691. printf("%s %s: conv_bias: %f ms %f Gflops conv_elem: %f ms %f GFlops "
  1692. "speedup: %f\n",
  1693. arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
  1694. used0, computations / used0, used1, computations / used1,
  1695. used1 / used0);
  1696. }
  1697. }
  1698. TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_QUINT8_STRIDE1_WITHDOTPROD) {
  1699. // have to remove preferred restrict in usable func before run the benchmark
  1700. using namespace conv_bias;
  1701. std::vector<TestArg> args;
  1702. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  1703. size_t p, NonlineMode nonline_mode) {
  1704. if (w + 2 * p < kernel || h + 2 * p < kernel)
  1705. return;
  1706. param::ConvBias param;
  1707. param.stride_h = 1;
  1708. param.stride_w = 1;
  1709. param.pad_h = p;
  1710. param.pad_w = p;
  1711. param.nonlineMode = nonline_mode;
  1712. //! channel bias
  1713. args.emplace_back(param, TensorShape{2, ic, h, w},
  1714. TensorShape{oc, ic, kernel, kernel},
  1715. TensorShape{1, oc, 1, 1});
  1716. };
  1717. // clang-format off
  1718. for (size_t kernel : {2, 3, 5, 7})
  1719. for (size_t ic : {1, 8, 16, 32})
  1720. for (size_t oc : {1, 8, 16, 32})
  1721. for (size_t p : {1})
  1722. for (NonlineMode nonline_mode : {NonlineMode::RELU}) {
  1723. run(oc, ic, 56, 56, kernel, p, nonline_mode);
  1724. run(oc, ic, 128, 128, kernel, p, nonline_mode);
  1725. run(oc, ic, 256, 256, kernel, p, nonline_mode);
  1726. }
  1727. // clang-format on
  1728. constexpr size_t RUN = 50;
  1729. Benchmarker<ConvBias> benchmark0(handle());
  1730. benchmark0
  1731. .set_dtype(0,
  1732. dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(100)))
  1733. .set_dtype(1,
  1734. dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(120)))
  1735. .set_dtype(2, dtype::QuantizedS32(0.04f))
  1736. .set_dtype(4,
  1737. dtype::Quantized8Asymm(1.4f, static_cast<uint8_t>(110)));
  1738. benchmark0.set_display(false);
  1739. benchmark0.set_times(RUN);
  1740. benchmark0.set_before_exec_callback(
  1741. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("ARMDOTU8STRD1"));
  1742. Benchmarker<ConvBias> benchmark1(handle());
  1743. benchmark1
  1744. .set_dtype(0,
  1745. dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(100)))
  1746. .set_dtype(1,
  1747. dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(120)))
  1748. .set_dtype(2, dtype::QuantizedS32(0.04f))
  1749. .set_dtype(4,
  1750. dtype::Quantized8Asymm(1.4f, static_cast<uint8_t>(110)));
  1751. benchmark1.set_display(false);
  1752. benchmark1.set_times(RUN);
  1753. for (auto&& arg : args) {
  1754. TensorLayout dst_layout;
  1755. auto opr = handle()->create_operator<ConvBias>();
  1756. opr->param() = arg.param;
  1757. opr->deduce_layout({arg.src, dtype::Int8()},
  1758. {arg.filter, dtype::Int8()},
  1759. {arg.bias, dtype::Int32()}, {}, dst_layout);
  1760. //! dst.nr_elems * IC * FH * FW * 2
  1761. float computations = dst_layout.total_nr_elems() * arg.filter[1] *
  1762. arg.filter[2] * arg.filter[3] * 2.0 /
  1763. (1024 * 1024 * 1024) * 1e3;
  1764. auto used0 = benchmark0.set_param(arg.param).exec(
  1765. {arg.src, arg.filter, arg.bias, {}, {}}) /
  1766. RUN;
  1767. auto used1 = benchmark1.set_param(arg.param).exec(
  1768. {arg.src, arg.filter, arg.bias, {}, {}}) /
  1769. RUN;
  1770. printf("%s %s: conv_bias: %f ms %f Gflops conv_elem: %f ms %f GFlops "
  1771. "speedup: %f\n",
  1772. arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
  1773. used0, computations / used0, used1, computations / used1,
  1774. used1 / used0);
  1775. }
  1776. }
  1777. TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_QUINT8_STRIDE2_WITHDOTPROD) {
  1778. // have to remove preferred restrict in usable func before run the benchmark
  1779. using namespace conv_bias;
  1780. std::vector<TestArg> args;
  1781. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  1782. size_t p, NonlineMode nonline_mode) {
  1783. if (w + 2 * p < kernel || h + 2 * p < kernel)
  1784. return;
  1785. param::ConvBias param;
  1786. param.stride_h = 2;
  1787. param.stride_w = 2;
  1788. param.pad_h = p;
  1789. param.pad_w = p;
  1790. param.nonlineMode = nonline_mode;
  1791. //! channel bias
  1792. args.emplace_back(param, TensorShape{2, ic, h, w},
  1793. TensorShape{oc, ic, kernel, kernel},
  1794. TensorShape{1, oc, 1, 1});
  1795. };
  1796. // clang-format off
  1797. for (size_t kernel : {2, 3, 5, 7})
  1798. for (size_t ic : {1, 8, 16, 32})
  1799. for (size_t oc : {1, 8, 16, 32})
  1800. for (size_t p : {1})
  1801. for (NonlineMode nonline_mode : {NonlineMode::RELU}) {
  1802. run(oc, ic, 56, 56, kernel, p, nonline_mode);
  1803. run(oc, ic, 128, 128, kernel, p, nonline_mode);
  1804. run(oc, ic, 256, 256, kernel, p, nonline_mode);
  1805. }
  1806. // clang-format on
  1807. constexpr size_t RUN = 50;
  1808. Benchmarker<ConvBias> benchmark0(handle());
  1809. benchmark0
  1810. .set_dtype(0,
  1811. dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(100)))
  1812. .set_dtype(1,
  1813. dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(120)))
  1814. .set_dtype(2, dtype::QuantizedS32(0.04f))
  1815. .set_dtype(4,
  1816. dtype::Quantized8Asymm(1.4f, static_cast<uint8_t>(110)));
  1817. benchmark0.set_display(false);
  1818. benchmark0.set_times(RUN);
  1819. benchmark0.set_before_exec_callback(
  1820. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("ARMDOTU8STRD2"));
  1821. Benchmarker<ConvBias> benchmark1(handle());
  1822. benchmark1
  1823. .set_dtype(0,
  1824. dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(100)))
  1825. .set_dtype(1,
  1826. dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(120)))
  1827. .set_dtype(2, dtype::QuantizedS32(0.04f))
  1828. .set_dtype(4,
  1829. dtype::Quantized8Asymm(1.4f, static_cast<uint8_t>(110)));
  1830. benchmark1.set_display(false);
  1831. benchmark1.set_times(RUN);
  1832. for (auto&& arg : args) {
  1833. TensorLayout dst_layout;
  1834. auto opr = handle()->create_operator<ConvBias>();
  1835. opr->param() = arg.param;
  1836. opr->deduce_layout({arg.src, dtype::Int8()},
  1837. {arg.filter, dtype::Int8()},
  1838. {arg.bias, dtype::Int32()}, {}, dst_layout);
  1839. //! dst.nr_elems * IC * FH * FW * 2
  1840. float computations = dst_layout.total_nr_elems() * arg.filter[1] *
  1841. arg.filter[2] * arg.filter[3] * 2.0 /
  1842. (1024 * 1024 * 1024) * 1e3;
  1843. auto used0 = benchmark0.set_param(arg.param).exec(
  1844. {arg.src, arg.filter, arg.bias, {}, {}}) /
  1845. RUN;
  1846. auto used1 = benchmark1.set_param(arg.param).exec(
  1847. {arg.src, arg.filter, arg.bias, {}, {}}) /
  1848. RUN;
  1849. printf("%s %s: conv_bias: %f ms %f Gflops conv_elem: %f ms %f GFlops "
  1850. "speedup: %f\n",
  1851. arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
  1852. used0, computations / used0, used1, computations / used1,
  1853. used1 / used0);
  1854. }
  1855. }
  1856. TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_INT8_STRIDE1_WITHDOTPROD_NCHW44_DOT) {
  1857. using namespace conv_bias;
  1858. std::vector<TestArg> args;
  1859. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  1860. size_t p, size_t stride, NonlineMode nonline_mode) {
  1861. if (w + 2 * p < kernel || h + 2 * p < kernel)
  1862. return;
  1863. param::ConvBias param;
  1864. param.stride_h = stride;
  1865. param.stride_w = stride;
  1866. param.pad_h = p;
  1867. param.pad_w = p;
  1868. param.nonlineMode = nonline_mode;
  1869. param.format = param::ConvBias::Format::NCHW44_DOT;
  1870. //! channel bias
  1871. args.emplace_back(param, TensorShape{1, ic / 4, h, w, 4},
  1872. TensorShape{oc / 4, ic / 4, kernel, kernel, 4, 4},
  1873. TensorShape{1, oc / 4, 1, 1, 4});
  1874. };
  1875. for (size_t stride : {1, 2})
  1876. for (size_t kernel : {2, 3, 5, 7})
  1877. for (size_t oc : {64})
  1878. for (NonlineMode nonline_mode : {NonlineMode::IDENTITY}) {
  1879. run(oc, oc, 56, 56, kernel, kernel / 2, stride,
  1880. nonline_mode);
  1881. }
  1882. constexpr size_t RUN = 50;
  1883. Benchmarker<ConvBias> benchmark0(handle());
  1884. benchmark0.set_dtype(0, dtype::QuantizedS8(2.5f))
  1885. .set_dtype(1, dtype::QuantizedS8(2.5f))
  1886. .set_dtype(2, dtype::QuantizedS32(6.25f))
  1887. .set_dtype(4, dtype::QuantizedS8(60.25f));
  1888. benchmark0.set_display(false);
  1889. benchmark0.set_times(RUN);
  1890. benchmark0.set_before_exec_callback(
  1891. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  1892. "ARMDOTS8DIRECT_NCHW44"));
  1893. Benchmarker<ConvBias> benchmark1(handle());
  1894. benchmark1.set_dtype(0, dtype::QuantizedS8(2.5f))
  1895. .set_dtype(1, dtype::QuantizedS8(2.5f))
  1896. .set_dtype(2, dtype::QuantizedS32(6.25f))
  1897. .set_dtype(4, dtype::QuantizedS8(60.25f));
  1898. benchmark1.set_display(false);
  1899. benchmark1.set_times(RUN);
  1900. for (auto&& arg : args) {
  1901. TensorLayout dst_layout;
  1902. auto opr = handle()->create_operator<ConvBias>();
  1903. opr->param() = arg.param;
  1904. opr->deduce_layout({arg.src, dtype::Int8()},
  1905. {arg.filter, dtype::Int8()},
  1906. {arg.bias, dtype::Int32()}, {}, dst_layout);
  1907. //! dst.nr_elems * IC * FH * FW * 2
  1908. float computations = dst_layout.total_nr_elems() * arg.filter[1] *
  1909. arg.filter[2] * arg.filter[3] * 8.0 /
  1910. (1024 * 1024 * 1024) * 1e3;
  1911. auto used0 = benchmark0.set_param(arg.param).exec(
  1912. {arg.src, arg.filter, arg.bias, {}, {}}) /
  1913. RUN;
  1914. auto used1 = benchmark1.set_param(arg.param).exec(
  1915. {arg.src, arg.filter, arg.bias, {}, {}}) /
  1916. RUN;
  1917. printf("%s %s: Direct use: %f ms %f Gflops normal: %f ms %f GFlops "
  1918. "speedup: %f\n",
  1919. arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
  1920. used0, computations / used0, used1, computations / used1,
  1921. used1 / used0);
  1922. }
  1923. }
  1924. #endif
  1925. #endif
  1926. /*====================== BENCHMARK CONV1X1 ===========================*/
  1927. #if MEGDNN_WITH_BENCHMARK
  1928. namespace {
  1929. std::vector<conv_bias::TestArg> get_conv_bias_1x1_benchmark_args(
  1930. size_t pack_size = 1) {
  1931. using namespace conv_bias;
  1932. std::vector<TestArg> args;
  1933. param::ConvBias param;
  1934. param.stride_h = 1;
  1935. param.stride_w = 1;
  1936. param.pad_h = 0;
  1937. param.pad_w = 0;
  1938. param.nonlineMode = param::ConvBias::NonlineMode::IDENTITY;
  1939. auto bench_case = [&](size_t OC, size_t IC, size_t H, size_t W) {
  1940. if (pack_size == 1)
  1941. args.emplace_back(param, TensorShape{1, IC, H, W},
  1942. TensorShape{OC, IC, 1, 1}, TensorShape{});
  1943. else {
  1944. if (pack_size == 4)
  1945. param.format = param::ConvBias::Format::NCHW44;
  1946. args.emplace_back(param,
  1947. TensorShape{1, IC / pack_size, H, W, pack_size},
  1948. TensorShape{OC / pack_size, IC / pack_size, 1, 1,
  1949. pack_size, pack_size},
  1950. TensorShape{});
  1951. }
  1952. };
  1953. //! MobileNetV1
  1954. bench_case(64, 32, 112, 112);
  1955. bench_case(128, 64, 56, 56);
  1956. bench_case(128, 128, 56, 56);
  1957. bench_case(256, 128, 28, 28);
  1958. bench_case(256, 256, 28, 28);
  1959. bench_case(512, 256, 14, 14);
  1960. bench_case(512, 512, 14, 14);
  1961. bench_case(1024, 512, 7, 7);
  1962. bench_case(1024, 1024, 7, 7);
  1963. //! MobileNetV2
  1964. bench_case(16, 32, 112, 112);
  1965. bench_case(96, 16, 112, 112);
  1966. bench_case(144, 24, 56, 56);
  1967. bench_case(192, 32, 28, 28);
  1968. bench_case(384, 64, 28, 28);
  1969. bench_case(576, 96, 14, 14);
  1970. bench_case(960, 160, 7, 7);
  1971. bench_case(320, 960, 7, 7);
  1972. bench_case(1280, 320, 7, 7);
  1973. //! MobileNetV3-Large
  1974. bench_case(64, 16, 112, 112);
  1975. bench_case(72, 24, 56, 56);
  1976. bench_case(120, 40, 28, 28);
  1977. bench_case(240, 40, 28, 28);
  1978. bench_case(200, 80, 14, 14);
  1979. bench_case(184, 80, 14, 14);
  1980. bench_case(480, 80, 14, 14);
  1981. bench_case(672, 112, 14, 14);
  1982. //! MobileNetV3-Small
  1983. bench_case(72, 16, 56, 56);
  1984. bench_case(88, 24, 28, 28);
  1985. bench_case(96, 24, 28, 28);
  1986. bench_case(240, 40, 14, 14);
  1987. bench_case(120, 40, 14, 14);
  1988. bench_case(144, 48, 14, 14);
  1989. bench_case(288, 48, 14, 14);
  1990. bench_case(576, 96, 7, 7);
  1991. //! resnet50
  1992. bench_case(256, 64, 56, 56);
  1993. bench_case(512, 128, 28, 28);
  1994. bench_case(1024, 256, 14, 14);
  1995. bench_case(2048, 512, 7, 7);
  1996. return args;
  1997. }
  1998. void benchmark_conv1x1(const char* matmul_algo_name, Handle* handle,
  1999. DType stype, DType matmul_dtype, DType bias_type,
  2000. DType conv_dtype, bool is_mk4 = false) {
  2001. using namespace conv_bias;
  2002. int pack_size = is_mk4 ? 4 : 1;
  2003. std::vector<TestArg> conv_bias_1x1_args =
  2004. get_conv_bias_1x1_benchmark_args(pack_size);
  2005. constexpr size_t RUNS = 50;
  2006. param::MatrixMul param;
  2007. param.transposeA = false;
  2008. param.transposeB = false;
  2009. if (is_mk4) {
  2010. param.format = MatrixMul::Param::Format::MK4;
  2011. }
  2012. Benchmarker<MatrixMul> benchmark_matmul(handle);
  2013. benchmark_matmul.set_before_exec_callback(
  2014. AlgoChecker<MatrixMul>(matmul_algo_name));
  2015. benchmark_matmul.set_times(RUNS)
  2016. .set_dtype(0, stype)
  2017. .set_dtype(1, stype)
  2018. .set_dtype(2, matmul_dtype)
  2019. .set_param(param)
  2020. .set_display(false);
  2021. std::string conv1x1_algo_name = ssprintf("CONV1x1:%s:24", matmul_algo_name);
  2022. Benchmarker<ConvBias> benchmark_conv1x1(handle);
  2023. benchmark_conv1x1.set_before_exec_callback(
  2024. conv_bias::ConvBiasAlgoChecker<ConvBias>(
  2025. conv1x1_algo_name.c_str()));
  2026. benchmark_conv1x1.set_times(RUNS)
  2027. .set_dtype(0, stype)
  2028. .set_dtype(1, stype)
  2029. .set_dtype(2, bias_type)
  2030. .set_dtype(4, conv_dtype)
  2031. .set_display(false);
  2032. for (auto&& arg : conv_bias_1x1_args) {
  2033. size_t IC = arg.src[1];
  2034. size_t OH = arg.src[2];
  2035. size_t OW = arg.src[3];
  2036. size_t OC = arg.filter[0];
  2037. size_t M = OC * pack_size;
  2038. size_t K = IC * pack_size;
  2039. size_t N = OH * OW;
  2040. float computations = M * N * K * 2.f / (1024 * 1024 * 1024) * 1e3;
  2041. TensorShape A, B;
  2042. A = TensorShape{M, K};
  2043. B = TensorShape{K, N};
  2044. if (is_mk4) {
  2045. A = TensorShape{M / 4, K / 4, 4, 4};
  2046. B = TensorShape{K / 4, N, 4};
  2047. }
  2048. auto conv1x1_used = benchmark_conv1x1.set_param(arg.param).exec(
  2049. {arg.src, arg.filter, arg.bias, {}, {}}) /
  2050. RUNS;
  2051. auto matmul_used = benchmark_matmul.exec({A, B, {}}) / RUNS;
  2052. printf("%s %s:\n matmul: %f ms %f Gflops\nconv1x1: %f ms %f GFlops "
  2053. "speedup: "
  2054. "%f\n",
  2055. arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
  2056. matmul_used, computations / matmul_used, conv1x1_used,
  2057. computations / conv1x1_used, matmul_used / conv1x1_used);
  2058. }
  2059. }
  2060. } // namespace
  2061. TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_CONV1X1_S1_F32) {
  2062. #if MEGDNN_AARCH64
  2063. benchmark_conv1x1("AARCH64_F32K8X12X1", handle(), dtype::Float32{},
  2064. dtype::Float32{}, dtype::Float32{}, dtype::Float32{});
  2065. #else
  2066. benchmark_conv1x1("ARMV7_F32", handle(), dtype::Float32{}, dtype::Float32{},
  2067. dtype::Float32{}, dtype::Float32{});
  2068. #endif
  2069. }
  2070. #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
  2071. TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_CONV1X1_S1_F16) {
  2072. #if MEGDNN_AARCH64
  2073. benchmark_conv1x1("AARCH64_F16_K8X24X1", handle(), dtype::Float16{},
  2074. dtype::Float16{}, dtype::Float16{}, dtype::Float16{});
  2075. #else
  2076. benchmark_conv1x1("AARCH32_F16_K4X16X1", handle(), dtype::Float16{},
  2077. dtype::Float16{}, dtype::Float16{}, dtype::Float16{});
  2078. #endif
  2079. }
  2080. #endif
  2081. TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_CONV1X1_S1_QUANTIZEDSYM) {
  2082. dtype::QuantizedS8 stype(2.5f);
  2083. dtype::QuantizedS32 dtype(6.25f);
  2084. #if MEGDNN_AARCH64
  2085. #if __ARM_FEATURE_DOTPROD
  2086. benchmark_conv1x1("AARCH64_INT8X8X32_K8X12X4_DOTPROD", handle(), stype,
  2087. dtype, dtype, dtype);
  2088. #else
  2089. benchmark_conv1x1("AARCH64_INT8X8X32_K8X8X8", handle(), stype, dtype, dtype,
  2090. dtype);
  2091. benchmark_conv1x1("AARCH64_INT8X8X32_K4X4X16", handle(), stype, dtype,
  2092. dtype, dtype);
  2093. #endif
  2094. #elif MEGDNN_ARMV7
  2095. benchmark_conv1x1("ARMV7_INT8X8X32_K4X8X8", handle(), stype, dtype, dtype,
  2096. dtype);
  2097. #endif
  2098. }
  2099. TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_CONV1X1_S1_QUANTIZEDASYM) {
  2100. dtype::Quantized8Asymm stype(1.2f, (uint8_t)125);
  2101. dtype::QuantizedS32 dtype(1.2 * 1.2);
  2102. #if MEGDNN_AARCH64
  2103. #if __ARM_FEATURE_DOTPROD
  2104. benchmark_conv1x1("AARCH64_QUINT8_K8X8X4_DOTPROD", handle(), stype, dtype,
  2105. dtype, dtype);
  2106. #else
  2107. benchmark_conv1x1("AARCH64_QUINT8_K8X8X8", handle(), stype, dtype, dtype,
  2108. dtype);
  2109. #endif
  2110. #elif MEGDNN_ARMV7
  2111. benchmark_conv1x1("ARMV7_QUINT8_K4X8X8", handle(), stype, dtype, dtype,
  2112. dtype);
  2113. #endif
  2114. }
  2115. TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_CONV1X1_S1_INT8x8x16) {
  2116. #if MEGDNN_AARCH64
  2117. benchmark_conv1x1("AARCH64_INT8X8X16_K8X8X8", handle(), dtype::Int8{},
  2118. dtype::Int16{}, dtype::Int16{}, dtype::Int16{});
  2119. benchmark_conv1x1("AARCH64_INT8X8X16_K4X4X16", handle(), dtype::Int8{},
  2120. dtype::Int16{}, dtype::Int16{}, dtype::Int16{});
  2121. #elif MEGDNN_ARMV7
  2122. benchmark_conv1x1("ARMV7_INT8X8X16_K4X8X8", handle(), dtype::Int8{},
  2123. dtype::Int16{}, dtype::Int16{}, dtype::Int16{});
  2124. benchmark_conv1x1("ARMV7_INT8X8X16_K4X2X16", handle(), dtype::Int8{},
  2125. dtype::Int16{}, dtype::Int16{}, dtype::Int16{});
  2126. benchmark_conv1x1("ARMV7_INT8X8X16_MK4_K8X8X4", handle(), dtype::Int8{},
  2127. dtype::Int16{}, dtype::Int16{}, dtype::Int16{}, true);
  2128. #endif
  2129. }
  2130. TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_CONV1X1_GEMV_FP32) {
  2131. using namespace conv_bias;
  2132. std::vector<conv_bias::TestArg> args;
  2133. param::ConvBias conv_param;
  2134. conv_param.stride_h = 1;
  2135. conv_param.stride_w = 1;
  2136. conv_param.pad_h = 0;
  2137. conv_param.pad_w = 0;
  2138. conv_param.nonlineMode = param::ConvBias::NonlineMode::IDENTITY;
  2139. auto run = [&](size_t M, size_t K) {
  2140. args.emplace_back(conv_param, TensorShape{1, K, 1, 1},
  2141. TensorShape{M, K, 1, 1}, TensorShape{});
  2142. };
  2143. for (size_t M : {4, 64, 1024, 4096})
  2144. for (size_t K : {128, 256, 1024, 4096})
  2145. run(M, K);
  2146. constexpr size_t RUNS = 50;
  2147. param::MatrixMul param;
  2148. param.transposeA = false;
  2149. param.transposeB = false;
  2150. Benchmarker<MatrixMul> benchmark_matmul(handle());
  2151. benchmark_matmul.set_before_exec_callback(
  2152. AlgoChecker<MatrixMul>("ARM_COMMON_F32_GEMV"));
  2153. benchmark_matmul.set_times(RUNS)
  2154. .set_dtype(0, dtype::Float32{})
  2155. .set_dtype(1, dtype::Float32{})
  2156. .set_dtype(2, dtype::Float32{})
  2157. .set_param(param)
  2158. .set_display(false);
  2159. Benchmarker<ConvBias> benchmark_conv1x1(handle());
  2160. benchmark_conv1x1.set_before_exec_callback(
  2161. conv_bias::ConvBiasAlgoChecker<ConvBias>("CONV1x1_GEMV"));
  2162. benchmark_conv1x1.set_times(RUNS)
  2163. .set_dtype(0, dtype::Float32{})
  2164. .set_dtype(1, dtype::Float32{})
  2165. .set_dtype(2, dtype::Float32{})
  2166. .set_dtype(4, dtype::Float32{})
  2167. .set_display(false);
  2168. std::cout << "warm up:\n";
  2169. for (int i = 0; i < 50; i++) {
  2170. benchmark_matmul.exec({{1, 1024}, {1024, 512}, {}});
  2171. benchmark_matmul.set_display(true);
  2172. }
  2173. for (auto&& arg : args) {
  2174. size_t IC = arg.src[1];
  2175. size_t OH = arg.src[2];
  2176. size_t OW = arg.src[3];
  2177. size_t OC = arg.filter[0];
  2178. size_t M = OC;
  2179. size_t K = IC;
  2180. size_t N = OH * OW;
  2181. float computations = M * N * K * 2.f / (1024 * 1024 * 1024) * 1e3;
  2182. TensorShape A, B;
  2183. A = TensorShape{M, K};
  2184. B = TensorShape{K, N};
  2185. auto conv1x1_used = benchmark_conv1x1.set_param(arg.param).exec(
  2186. {arg.src, arg.filter, arg.bias, {}, {}}) /
  2187. RUNS;
  2188. auto matmul_used = benchmark_matmul.exec({A, B, {}}) / RUNS;
  2189. printf("%s %s:\n gemv: %f ms %f Gflops\nconv1x1: %f ms %f GFlops "
  2190. "speedup: "
  2191. "%f\n",
  2192. arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
  2193. matmul_used, computations / matmul_used, conv1x1_used,
  2194. computations / conv1x1_used, matmul_used / conv1x1_used);
  2195. }
  2196. }
  2197. #ifndef __ARM_FEATURE_DOTPROD
  2198. TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_1X1_S1_NCHW_VS_NCHW44_INT8x8x32) {
  2199. std::vector<TestArg> conv_bias_1x1_args_nchw44 =
  2200. get_conv_bias_1x1_benchmark_args(4);
  2201. std::vector<TestArg> conv_bias_1x1_args_nchw =
  2202. get_conv_bias_1x1_benchmark_args(1);
  2203. constexpr size_t RUNS = 50;
  2204. Benchmarker<ConvBias> benchmark_conv1x1_nchw44(handle());
  2205. benchmark_conv1x1_nchw44.set_before_exec_callback(
  2206. conv_bias::ConvBiasAlgoChecker<ConvBias>(
  2207. "CONV1x1:AARCH64_INT8X8X32_MK4_4X4X16:24"));
  2208. benchmark_conv1x1_nchw44.set_times(RUNS)
  2209. .set_dtype(0, dtype::Int8())
  2210. .set_dtype(1, dtype::Int8())
  2211. .set_dtype(2, dtype::Int32())
  2212. .set_dtype(4, dtype::Int32())
  2213. .set_display(false);
  2214. Benchmarker<ConvBias> benchmark_conv1x1_nchw(handle());
  2215. benchmark_conv1x1_nchw.set_before_exec_callback(
  2216. conv_bias::ConvBiasAlgoChecker<ConvBias>(
  2217. "CONV1x1:AARCH64_INT8X8X32_K4X4X16:24"));
  2218. benchmark_conv1x1_nchw.set_times(RUNS)
  2219. .set_dtype(0, dtype::Int8())
  2220. .set_dtype(1, dtype::Int8())
  2221. .set_dtype(2, dtype::Int32())
  2222. .set_dtype(4, dtype::Int32())
  2223. .set_display(false);
  2224. for (size_t i = 0; i < conv_bias_1x1_args_nchw44.size(); ++i) {
  2225. auto&& arg_nchw = conv_bias_1x1_args_nchw[i];
  2226. auto&& arg_nchw44 = conv_bias_1x1_args_nchw44[i];
  2227. size_t IC = arg_nchw.src[1];
  2228. size_t OH = arg_nchw.src[2];
  2229. size_t OW = arg_nchw.src[3];
  2230. size_t OC = arg_nchw.filter[0];
  2231. size_t M = OC;
  2232. size_t K = IC;
  2233. size_t N = OH * OW;
  2234. float computations = M * N * K * 2.f / (1024 * 1024 * 1024) * 1e3;
  2235. auto conv1x1_nchw = benchmark_conv1x1_nchw.set_param(arg_nchw.param)
  2236. .exec({arg_nchw.src,
  2237. arg_nchw.filter,
  2238. arg_nchw.bias,
  2239. {},
  2240. {}}) /
  2241. RUNS;
  2242. auto conv1x1_nchw44 =
  2243. benchmark_conv1x1_nchw44.set_param(arg_nchw44.param)
  2244. .exec({arg_nchw44.src,
  2245. arg_nchw44.filter,
  2246. arg_nchw44.bias,
  2247. {},
  2248. {}}) /
  2249. RUNS;
  2250. printf("%s %s:\n conv_1x1_nchw: %f ms %f Gflops\nconv1x1_nchw44: %f ms "
  2251. "%f GFlops "
  2252. "speedup: "
  2253. "%f\n",
  2254. arg_nchw.src.to_string().c_str(),
  2255. arg_nchw.filter.to_string().c_str(), conv1x1_nchw,
  2256. computations / conv1x1_nchw, conv1x1_nchw44,
  2257. computations / conv1x1_nchw44, conv1x1_nchw / conv1x1_nchw44);
  2258. }
  2259. }
  2260. #endif
  2261. TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_WINOGRAD_VS_IM2COL_INT8) {
  2262. auto&& args = get_winograd_benchmark_args(3, 8);
  2263. using namespace conv_bias;
  2264. constexpr size_t RUN = 10;
  2265. Benchmarker<ConvBias> benchmark_im2col(handle());
  2266. benchmark_im2col.set_display(false);
  2267. benchmark_im2col.set_times(RUN);
  2268. benchmark_im2col.set_dtype(0, dtype::QuantizedS8(2.5f))
  2269. .set_dtype(1, dtype::QuantizedS8(2.5f))
  2270. .set_dtype(2, dtype::QuantizedS32(6.25f))
  2271. .set_dtype(4, dtype::QuantizedS8(60.25f));
  2272. Benchmarker<ConvBias> benchmark_winograd(handle());
  2273. benchmark_winograd.set_display(false);
  2274. benchmark_winograd.set_times(RUN);
  2275. benchmark_winograd.set_dtype(0, dtype::QuantizedS8(2.5f))
  2276. .set_dtype(1, dtype::QuantizedS8(2.5f))
  2277. .set_dtype(2, dtype::QuantizedS32(6.25f))
  2278. .set_dtype(4, dtype::QuantizedS8(60.25f));
  2279. for (auto&& arg : args) {
  2280. TensorLayout dst_layout;
  2281. auto opr = handle()->create_operator<ConvBias>();
  2282. opr->param() = arg.param;
  2283. opr->deduce_layout({arg.src, dtype::Float32()},
  2284. {arg.filter, dtype::Float32()},
  2285. {arg.bias, dtype::Float32()}, {}, dst_layout);
  2286. //! dst.nr_elems * IC * FH * FW * 2
  2287. float computations = dst_layout.total_nr_elems() * arg.filter[1] *
  2288. arg.filter[2] * arg.filter[3] * 2.0 /
  2289. (1024 * 1024 * 1024) * 1e3;
  2290. benchmark_im2col.set_param(arg.param);
  2291. auto im2col_used =
  2292. algo_benchmark<ConvBias>(
  2293. benchmark_im2col, {arg.src, arg.filter, {}, {}, {}},
  2294. "IM2COLMATMUL:AARCH64_INT8X8X32_K4X4X16") /
  2295. RUN;
  2296. benchmark_winograd.set_param(arg.param);
  2297. auto winograd_used =
  2298. algo_benchmark<ConvBias>(
  2299. benchmark_winograd, {arg.src, arg.filter, {}, {}, {}},
  2300. "WINOGRAD:AARCH64_INT16X16X32_MK8_8X8:8:2") /
  2301. RUN;
  2302. printf("%s %s: im2col: %f ms %f Gflops winograd: %f ms %f GFlops "
  2303. "speedup: "
  2304. "%f\n",
  2305. arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
  2306. im2col_used, computations / im2col_used, winograd_used,
  2307. computations / winograd_used, im2col_used / winograd_used);
  2308. }
  2309. }
  2310. #endif
  2311. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台