You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

conv_bias_multi_thread_benchmark.cpp 89 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511
661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542
05520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168
  1. /**
  2. * \file dnn/test/arm_common/conv_bias_multi_thread_benchmark.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
  10. * implied.
  11. */
  12. #include "test/arm_common/fixture.h"
  13. #include "test/common/benchmarker.h"
  14. #include "test/common/conv_bias.h"
  15. using namespace megdnn;
  16. using namespace test;
  17. using namespace conv_bias;
  18. #if MEGDNN_WITH_BENCHMARK
  19. namespace {
  20. void benchmark_impl(
  21. const param::ConvBias param,
  22. std::vector<std::pair<SmallVector<TensorShape>, float>>& shapes_and_computation,
  23. const std::string algo_name, size_t RUNS,
  24. TaskExecutorConfig&& multi_thread_config,
  25. TaskExecutorConfig&& single_thread_config, std::vector<DType>& data_type) {
  26. std::vector<float> multi_thread_times, single_thread_times;
  27. {
  28. auto multi_thread_hanle = create_cpu_handle(0, true, &multi_thread_config);
  29. auto benchmarker = Benchmarker<ConvBias>(multi_thread_hanle.get());
  30. benchmarker.set_times(RUNS)
  31. .set_display(false)
  32. .set_param(param)
  33. .set_dtype(0, data_type[0])
  34. .set_dtype(1, data_type[1])
  35. .set_dtype(2, data_type[2])
  36. .set_dtype(4, data_type[3])
  37. .set_before_exec_callback(
  38. conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name.c_str()));
  39. for (auto shape : shapes_and_computation) {
  40. multi_thread_times.push_back(benchmarker.exec(shape.first) / RUNS);
  41. }
  42. }
  43. {
  44. auto single_thread_handle = create_cpu_handle(0, true, &single_thread_config);
  45. auto benchmarker = Benchmarker<ConvBias>(single_thread_handle.get());
  46. benchmarker.set_times(RUNS)
  47. .set_display(false)
  48. .set_param(param)
  49. .set_dtype(0, data_type[0])
  50. .set_dtype(1, data_type[1])
  51. .set_dtype(2, data_type[2])
  52. .set_dtype(4, data_type[3])
  53. .set_before_exec_callback(
  54. conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name.c_str()));
  55. for (auto shape : shapes_and_computation) {
  56. single_thread_times.push_back(benchmarker.exec(shape.first) / RUNS);
  57. }
  58. }
  59. printf("Benchmark : Multi threads %zu, ", multi_thread_config.nr_thread);
  60. printf("core_ids:");
  61. for (size_t i = 0; i < multi_thread_config.affinity_core_set.size(); i++) {
  62. printf("%zu ", multi_thread_config.affinity_core_set[i]);
  63. }
  64. printf(", Single thread core_id %zu\n", single_thread_config.affinity_core_set[0]);
  65. for (size_t i = 0; i < shapes_and_computation.size(); i++) {
  66. auto shapes = shapes_and_computation[i];
  67. printf("Bench case: ");
  68. for (auto&& shape : shapes.first) {
  69. printf("%s ", shape.to_string().c_str());
  70. }
  71. float computations = shapes.second;
  72. printf("%zu threads gflops: %f,\n single thread gflops: "
  73. "%f. spead up = %f, speedup/cores=%f\n",
  74. multi_thread_config.nr_thread, computations / multi_thread_times[i],
  75. computations / single_thread_times[i],
  76. single_thread_times[i] / multi_thread_times[i],
  77. single_thread_times[i] / multi_thread_times[i] /
  78. multi_thread_config.nr_thread);
  79. }
  80. }
  81. } // namespace
  82. TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_DIRECTF32) {
  83. constexpr size_t RUNS = 50;
  84. param::ConvBias param;
  85. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  86. param.pad_h = 1;
  87. param.pad_w = 1;
  88. param.stride_h = 1;
  89. param.stride_w = 1;
  90. param.sparse = param::ConvBias::Sparse::GROUP;
  91. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  92. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  93. size_t group) {
  94. SmallVector<TensorShape> shapes{
  95. {N, IC, H, W},
  96. {group, OC / group, IC / group, FS, FS},
  97. {1, OC, 1, 1},
  98. {},
  99. {N, OC, H, W}};
  100. TensorShape dst{N, OC, H, W};
  101. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  102. dst.total_nr_elems()) *
  103. 1e-6;
  104. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  105. };
  106. bench_case(1, 32, 32, 200, 200, 3, 4);
  107. bench_case(1, 32, 32, 200, 200, 3, 32);
  108. bench_case(1, 32, 32, 128, 128, 3, 4);
  109. bench_case(1, 32, 32, 128, 128, 3, 32);
  110. bench_case(1, 32, 32, 100, 100, 3, 4);
  111. bench_case(1, 32, 32, 100, 100, 3, 32);
  112. bench_case(1, 32, 32, 80, 80, 3, 4);
  113. bench_case(1, 32, 32, 80, 80, 3, 32);
  114. std::string algo_name = "F32DIRECT";
  115. printf("Benchmark F32DIRECT_LARGE_GROUP algo\n");
  116. std::vector<DType> data_type = {
  117. dtype::Float32(), dtype::Float32(), dtype::Float32(), dtype::Float32()};
  118. benchmark_impl(
  119. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  120. data_type);
  121. benchmark_impl(
  122. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  123. data_type);
  124. benchmark_impl(
  125. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  126. data_type);
  127. shapes_and_computation.clear();
  128. algo_name = "F32DIRECT";
  129. printf("Benchmark F32DIRECT_SMALL_GROUP algo\n");
  130. bench_case(1, 32, 32, 200, 200, 3, 1);
  131. bench_case(1, 32, 32, 128, 128, 3, 1);
  132. bench_case(1, 32, 32, 100, 100, 3, 1);
  133. bench_case(1, 32, 32, 80, 80, 3, 1);
  134. benchmark_impl(
  135. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  136. data_type);
  137. benchmark_impl(
  138. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  139. data_type);
  140. benchmark_impl(
  141. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  142. data_type);
  143. }
  144. TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_DIRECTF32_STR1) {
  145. constexpr size_t RUNS = 50;
  146. param::ConvBias param;
  147. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  148. param.pad_h = 1;
  149. param.pad_w = 1;
  150. param.stride_h = 1;
  151. param.stride_w = 1;
  152. param.sparse = param::ConvBias::Sparse::GROUP;
  153. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  154. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  155. size_t group) {
  156. SmallVector<TensorShape> shapes{
  157. {N, IC, H, W},
  158. {group, OC / group, IC / group, FS, FS},
  159. {1, OC, 1, 1},
  160. {},
  161. {N, OC, H, W}};
  162. TensorShape dst{N, OC, H, W};
  163. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  164. dst.total_nr_elems()) *
  165. 1e-6;
  166. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  167. };
  168. bench_case(1, 32, 32, 200, 200, 3, 4);
  169. bench_case(1, 32, 32, 200, 200, 3, 32);
  170. bench_case(1, 32, 32, 128, 128, 3, 4);
  171. bench_case(1, 32, 32, 128, 128, 3, 32);
  172. bench_case(1, 32, 32, 100, 100, 3, 4);
  173. bench_case(1, 32, 32, 100, 100, 3, 32);
  174. bench_case(1, 32, 32, 80, 80, 3, 4);
  175. bench_case(1, 32, 32, 80, 80, 3, 32);
  176. std::string algo_name = "F32STRD1";
  177. printf("Benchmark F32STRD1_LARGE_GROUP algo\n");
  178. std::vector<DType> data_type = {
  179. dtype::Float32(), dtype::Float32(), dtype::Float32(), dtype::Float32()};
  180. benchmark_impl(
  181. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  182. data_type);
  183. benchmark_impl(
  184. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  185. data_type);
  186. benchmark_impl(
  187. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  188. data_type);
  189. shapes_and_computation.clear();
  190. algo_name = "F32STRD1";
  191. printf("Benchmark F32STRD1_SMALL_GROUP algo\n");
  192. bench_case(1, 32, 32, 200, 200, 3, 1);
  193. bench_case(1, 32, 32, 128, 128, 3, 1);
  194. bench_case(1, 32, 32, 100, 100, 3, 1);
  195. bench_case(1, 32, 32, 80, 80, 3, 1);
  196. benchmark_impl(
  197. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  198. data_type);
  199. benchmark_impl(
  200. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  201. data_type);
  202. benchmark_impl(
  203. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  204. data_type);
  205. }
  206. TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_DIRECTF32_STR2) {
  207. constexpr size_t RUNS = 50;
  208. param::ConvBias param;
  209. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  210. param.pad_h = 1;
  211. param.pad_w = 1;
  212. param.stride_h = 2;
  213. param.stride_w = 2;
  214. param.sparse = param::ConvBias::Sparse::GROUP;
  215. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  216. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  217. size_t group, size_t P, size_t S) {
  218. SmallVector<TensorShape> shapes{
  219. {N, IC, H, W},
  220. {group, OC / group, IC / group, FS, FS},
  221. {1, OC, 1, 1},
  222. {},
  223. {N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1}};
  224. TensorShape dst{N, OC, H, W};
  225. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  226. dst.total_nr_elems()) *
  227. 1e-6;
  228. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  229. };
  230. bench_case(1, 32, 32, 200, 200, 3, 4, 1, 2);
  231. bench_case(1, 32, 32, 200, 200, 3, 32, 1, 2);
  232. bench_case(1, 32, 32, 128, 128, 3, 4, 1, 2);
  233. bench_case(1, 32, 32, 128, 128, 3, 32, 1, 2);
  234. bench_case(1, 32, 32, 100, 100, 3, 4, 1, 2);
  235. bench_case(1, 32, 32, 100, 100, 3, 32, 1, 2);
  236. bench_case(1, 32, 32, 80, 80, 3, 4, 1, 2);
  237. bench_case(1, 32, 32, 80, 80, 3, 32, 1, 2);
  238. std::string algo_name = "F32STRD2";
  239. printf("Benchmark F32STRD2_LARGE_GROUP algo\n");
  240. std::vector<DType> data_type = {
  241. dtype::Float32(), dtype::Float32(), dtype::Float32(), dtype::Float32()};
  242. benchmark_impl(
  243. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  244. data_type);
  245. benchmark_impl(
  246. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  247. data_type);
  248. benchmark_impl(
  249. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  250. data_type);
  251. shapes_and_computation.clear();
  252. algo_name = "F32STRD2";
  253. printf("Benchmark F32STRD2_SMALL_GROUP algo\n");
  254. bench_case(1, 32, 32, 200, 200, 3, 1, 1, 2);
  255. bench_case(1, 32, 32, 128, 128, 3, 1, 1, 2);
  256. bench_case(1, 32, 32, 100, 100, 3, 1, 1, 2);
  257. bench_case(1, 32, 32, 80, 80, 3, 1, 1, 2);
  258. benchmark_impl(
  259. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  260. data_type);
  261. benchmark_impl(
  262. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  263. data_type);
  264. benchmark_impl(
  265. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  266. data_type);
  267. }
  268. #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
  269. TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_DIRECTF16) {
  270. constexpr size_t RUNS = 50;
  271. param::ConvBias param;
  272. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  273. param.pad_h = 1;
  274. param.pad_w = 1;
  275. param.stride_h = 1;
  276. param.stride_w = 1;
  277. param.sparse = param::ConvBias::Sparse::GROUP;
  278. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  279. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  280. size_t group) {
  281. SmallVector<TensorShape> shapes{
  282. {N, IC, H, W},
  283. {group, OC / group, IC / group, FS, FS},
  284. {1, OC, 1, 1},
  285. {},
  286. {N, OC, H, W}};
  287. TensorShape dst{N, OC, H, W};
  288. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  289. dst.total_nr_elems()) *
  290. 1e-6;
  291. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  292. };
  293. bench_case(1, 32, 32, 200, 200, 3, 4);
  294. bench_case(1, 32, 32, 200, 200, 3, 32);
  295. bench_case(1, 32, 32, 128, 128, 3, 4);
  296. bench_case(1, 32, 32, 128, 128, 3, 32);
  297. bench_case(1, 32, 32, 100, 100, 3, 4);
  298. bench_case(1, 32, 32, 100, 100, 3, 32);
  299. bench_case(1, 32, 32, 80, 80, 3, 4);
  300. bench_case(1, 32, 32, 80, 80, 3, 32);
  301. std::string algo_name = "F16DIRECT";
  302. printf("Benchmark F16DIRECT_LARGE_GROUP algo\n");
  303. std::vector<DType> data_type = {
  304. dtype::Float16(), dtype::Float16(), dtype::Float16(), dtype::Float16()};
  305. benchmark_impl(
  306. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  307. data_type);
  308. benchmark_impl(
  309. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  310. data_type);
  311. benchmark_impl(
  312. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  313. data_type);
  314. shapes_and_computation.clear();
  315. algo_name = "F16DIRECT";
  316. printf("Benchmark F16DIRECT_SMALL_GROUP algo\n");
  317. bench_case(1, 32, 32, 200, 200, 3, 1);
  318. bench_case(1, 32, 32, 128, 128, 3, 1);
  319. bench_case(1, 32, 32, 100, 100, 3, 1);
  320. bench_case(1, 32, 32, 80, 80, 3, 1);
  321. benchmark_impl(
  322. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  323. data_type);
  324. benchmark_impl(
  325. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  326. data_type);
  327. benchmark_impl(
  328. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  329. data_type);
  330. }
  331. TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_DIRECTF16_STR1) {
  332. constexpr size_t RUNS = 50;
  333. param::ConvBias param;
  334. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  335. param.pad_h = 1;
  336. param.pad_w = 1;
  337. param.stride_h = 1;
  338. param.stride_w = 1;
  339. param.sparse = param::ConvBias::Sparse::GROUP;
  340. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  341. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  342. size_t group) {
  343. SmallVector<TensorShape> shapes{
  344. {N, IC, H, W},
  345. {group, OC / group, IC / group, FS, FS},
  346. {1, OC, 1, 1},
  347. {},
  348. {N, OC, H, W}};
  349. TensorShape dst{N, OC, H, W};
  350. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  351. dst.total_nr_elems()) *
  352. 1e-6;
  353. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  354. };
  355. bench_case(1, 32, 32, 200, 200, 3, 4);
  356. bench_case(1, 32, 32, 200, 200, 3, 32);
  357. bench_case(1, 32, 32, 128, 128, 3, 4);
  358. bench_case(1, 32, 32, 128, 128, 3, 32);
  359. bench_case(1, 32, 32, 100, 100, 3, 4);
  360. bench_case(1, 32, 32, 100, 100, 3, 32);
  361. bench_case(1, 32, 32, 80, 80, 3, 4);
  362. bench_case(1, 32, 32, 80, 80, 3, 32);
  363. std::string algo_name = "F16STRD1";
  364. printf("Benchmark F16STRD1_LARGE_GROUP algo\n");
  365. std::vector<DType> data_type = {
  366. dtype::Float16(), dtype::Float16(), dtype::Float16(), dtype::Float16()};
  367. benchmark_impl(
  368. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  369. data_type);
  370. benchmark_impl(
  371. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  372. data_type);
  373. benchmark_impl(
  374. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  375. data_type);
  376. shapes_and_computation.clear();
  377. algo_name = "F16STRD1";
  378. printf("Benchmark F16STRD1_SMALL_GROUP algo\n");
  379. bench_case(1, 32, 32, 200, 200, 3, 1);
  380. bench_case(1, 32, 32, 128, 128, 3, 1);
  381. bench_case(1, 32, 32, 100, 100, 3, 1);
  382. bench_case(1, 32, 32, 80, 80, 3, 1);
  383. benchmark_impl(
  384. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  385. data_type);
  386. benchmark_impl(
  387. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  388. data_type);
  389. benchmark_impl(
  390. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  391. data_type);
  392. }
  393. TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CHANNEL_WISE_FP16_NCHW88) {
  394. constexpr size_t RUNS = 50;
  395. std::string algo_name = "F16_CHANNEL_WISE_NCHW88";
  396. printf("Benchmarker F16_CHANNEL_WISE_NCHW88 algo\n");
  397. std::vector<DType> data_type = {
  398. dtype::Float16(), dtype::Float16(), dtype::Float16(), dtype::Float16()};
  399. auto bench_case = [&](size_t N, size_t IC, size_t H, size_t W, size_t FS, size_t P,
  400. size_t S) {
  401. param::ConvBias param;
  402. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  403. param.pad_h = P;
  404. param.pad_w = P;
  405. param.stride_h = S;
  406. param.stride_w = S;
  407. param.sparse = param::ConvBias::Sparse::GROUP;
  408. param.format = param::ConvBias::Format::NCHW88;
  409. size_t group = IC;
  410. size_t OC = IC;
  411. SmallVector<TensorShape> shapes{
  412. {N, IC, H, W, 8},
  413. {group, 1, 1, FS, FS, 8},
  414. {1, OC, 1, 1, 8},
  415. {},
  416. {N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1, 8}};
  417. TensorShape dst{N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1, 8};
  418. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  419. dst.total_nr_elems()) *
  420. 1e-6;
  421. std::vector<std::pair<SmallVector<TensorShape>, float>> shape_arg = {
  422. std::make_pair(shapes, computations)};
  423. benchmark_impl(
  424. param, shape_arg, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  425. data_type);
  426. };
  427. bench_case(1, 64, 100, 100, 5, 2, 1);
  428. bench_case(1, 64, 56, 56, 5, 2, 1);
  429. bench_case(1, 64, 28, 28, 5, 2, 1);
  430. bench_case(1, 64, 100, 100, 5, 2, 2);
  431. bench_case(1, 64, 56, 56, 5, 2, 2);
  432. bench_case(1, 64, 28, 28, 5, 2, 2);
  433. bench_case(1, 64, 100, 100, 3, 1, 1);
  434. bench_case(1, 64, 56, 56, 3, 1, 1);
  435. bench_case(1, 64, 28, 28, 3, 1, 1);
  436. bench_case(1, 64, 100, 100, 3, 1, 2);
  437. bench_case(1, 64, 56, 56, 3, 1, 2);
  438. bench_case(1, 64, 28, 28, 3, 1, 2);
  439. bench_case(1, 64, 100, 100, 2, 0, 1);
  440. bench_case(1, 64, 56, 56, 2, 0, 1);
  441. bench_case(1, 64, 28, 28, 2, 0, 1);
  442. bench_case(1, 64, 100, 100, 2, 0, 2);
  443. bench_case(1, 64, 56, 56, 2, 0, 2);
  444. bench_case(1, 64, 28, 28, 2, 0, 2);
  445. }
  446. TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_FP16_NCHW88) {
  447. constexpr size_t RUNS = 40;
  448. std::vector<DType> data_type = {
  449. dtype::Float16(), dtype::Float16(), dtype::Float16(), dtype::Float16()};
  450. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  451. size_t group, size_t P, size_t S) {
  452. param::ConvBias param;
  453. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  454. param.pad_h = P;
  455. param.pad_w = P;
  456. param.stride_h = S;
  457. param.stride_w = S;
  458. param.sparse = param::ConvBias::Sparse::DENSE;
  459. param.format = param::ConvBias::Format::NCHW88;
  460. auto OH = (H + 2 * P - FS) / static_cast<size_t>(S) + 1;
  461. auto OW = (W + 2 * P - FS) / static_cast<size_t>(S) + 1;
  462. TensorShape src = {N, IC / 8, H, W, 8};
  463. TensorShape filter = {OC / 8, IC / 8, FS, FS, 8, 8};
  464. if (group > 1) {
  465. filter = {group, OC / group / 8, IC / group / 8, FS, FS, 8, 8};
  466. param.sparse = param::ConvBias::Sparse::GROUP;
  467. }
  468. TensorShape bias = {1, OC / 8, 1, 1, 8};
  469. TensorShape dst = {N, OC / 8, OH, OW, 8};
  470. SmallVector<TensorShape> shapes{src, filter, bias, {}, dst};
  471. float computations = (((IC / group) * FS * FS + 1) * dst.total_nr_elems() * 2 +
  472. dst.total_nr_elems()) *
  473. 1e-6;
  474. std::vector<std::pair<SmallVector<TensorShape>, float>> shape_arg = {
  475. std::make_pair(shapes, computations)};
  476. benchmark_impl(
  477. param, shape_arg, ".+", RUNS, {4, {4, 5, 6, 7}}, {1, {7}}, data_type);
  478. };
  479. bench_case(1, 64, 64, 28, 28, 3, 1, 1, 1);
  480. bench_case(1, 64, 64, 28, 28, 5, 1, 2, 1);
  481. bench_case(1, 64, 64, 28, 28, 7, 1, 3, 1);
  482. bench_case(1, 64, 64, 28, 28, 3, 1, 1, 2);
  483. bench_case(1, 64, 64, 28, 28, 5, 1, 2, 2);
  484. bench_case(1, 64, 64, 28, 28, 7, 1, 3, 2);
  485. bench_case(1, 64, 64, 28, 28, 3, 2, 1, 1);
  486. bench_case(1, 64, 64, 28, 28, 3, 4, 1, 1);
  487. bench_case(1, 64, 64, 28, 28, 3, 8, 1, 1);
  488. bench_case(1, 16, 16, 28, 28, 3, 1, 1, 1);
  489. bench_case(1, 32, 32, 28, 28, 3, 1, 1, 1);
  490. bench_case(1, 128, 128, 28, 28, 3, 1, 1, 1);
  491. bench_case(1, 256, 256, 28, 28, 3, 1, 1, 1);
  492. bench_case(1, 64, 64, 7, 7, 3, 1, 1, 1);
  493. bench_case(1, 64, 64, 14, 14, 3, 1, 1, 1);
  494. bench_case(1, 64, 64, 56, 56, 3, 1, 1, 1);
  495. bench_case(1, 64, 64, 112, 112, 3, 1, 1, 1);
  496. }
  497. #endif
  498. TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_DIRECT_INT8x8x16) {
  499. constexpr size_t RUNS = 50;
  500. param::ConvBias param;
  501. param.nonlineMode = param::ConvBias::NonlineMode::IDENTITY;
  502. param.pad_h = 1;
  503. param.pad_w = 1;
  504. param.stride_h = 1;
  505. param.stride_w = 1;
  506. param.sparse = param::ConvBias::Sparse::GROUP;
  507. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  508. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  509. size_t group) {
  510. SmallVector<TensorShape> shapes{
  511. {N, IC, H, W},
  512. {group, OC / group, IC / group, FS, FS},
  513. {},
  514. {},
  515. {N, OC, H, W}};
  516. TensorShape dst{N, OC, H, W};
  517. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  518. dst.total_nr_elems()) *
  519. 1e-6;
  520. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  521. };
  522. bench_case(1, 32, 32, 200, 200, 3, 4);
  523. bench_case(1, 32, 32, 200, 200, 3, 32);
  524. bench_case(1, 32, 32, 128, 128, 3, 4);
  525. bench_case(1, 32, 32, 128, 128, 3, 32);
  526. bench_case(1, 32, 32, 100, 100, 3, 4);
  527. bench_case(1, 32, 32, 100, 100, 3, 32);
  528. bench_case(1, 32, 32, 80, 80, 3, 4);
  529. bench_case(1, 32, 32, 80, 80, 3, 32);
  530. std::string algo_name = "I8816DIRECT";
  531. printf("Benchmark I8816DIRECT_LARGE_GROUP algo\n");
  532. std::vector<DType> data_type = {
  533. dtype::Int8(), dtype::Int8(), dtype::Int16(), dtype::Int16()};
  534. benchmark_impl(
  535. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  536. data_type);
  537. benchmark_impl(
  538. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  539. data_type);
  540. benchmark_impl(
  541. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  542. data_type);
  543. shapes_and_computation.clear();
  544. algo_name = "I8816DIRECT";
  545. printf("Benchmark I8816DIRECT_SMALL_GROUP algo\n");
  546. bench_case(1, 32, 32, 200, 200, 3, 1);
  547. bench_case(1, 32, 32, 128, 128, 3, 1);
  548. bench_case(1, 32, 32, 100, 100, 3, 1);
  549. bench_case(1, 32, 32, 80, 80, 3, 1);
  550. benchmark_impl(
  551. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  552. data_type);
  553. benchmark_impl(
  554. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  555. data_type);
  556. benchmark_impl(
  557. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  558. data_type);
  559. }
  560. TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_DIRECT_INT8x8x16_STR2) {
  561. constexpr size_t RUNS = 50;
  562. param::ConvBias param;
  563. param.nonlineMode = param::ConvBias::NonlineMode::IDENTITY;
  564. param.pad_h = 1;
  565. param.pad_w = 1;
  566. param.stride_h = 2;
  567. param.stride_w = 2;
  568. param.sparse = param::ConvBias::Sparse::GROUP;
  569. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  570. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  571. size_t group, size_t P, size_t S) {
  572. SmallVector<TensorShape> shapes{
  573. {N, IC, H, W},
  574. {group, OC / group, IC / group, FS, FS},
  575. {},
  576. {},
  577. {N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1}};
  578. TensorShape dst{N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1};
  579. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  580. dst.total_nr_elems()) *
  581. 1e-6;
  582. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  583. };
  584. bench_case(1, 32, 32, 200, 200, 3, 4, 1, 2);
  585. bench_case(1, 32, 32, 200, 200, 3, 32, 1, 2);
  586. bench_case(1, 32, 32, 128, 128, 3, 4, 1, 2);
  587. bench_case(1, 32, 32, 128, 128, 3, 32, 1, 2);
  588. bench_case(1, 32, 32, 100, 100, 3, 4, 1, 2);
  589. bench_case(1, 32, 32, 100, 100, 3, 32, 1, 2);
  590. bench_case(1, 32, 32, 80, 80, 3, 4, 1, 2);
  591. bench_case(1, 32, 32, 80, 80, 3, 32, 1, 2);
  592. std::string algo_name = "I8816STRD2";
  593. printf("Benchmark I8816STRD2_LARGE_GROUP algo\n");
  594. std::vector<DType> data_type = {
  595. dtype::Int8(), dtype::Int8(), dtype::Int16(), dtype::Int16()};
  596. benchmark_impl(
  597. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  598. data_type);
  599. benchmark_impl(
  600. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  601. data_type);
  602. benchmark_impl(
  603. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  604. data_type);
  605. shapes_and_computation.clear();
  606. algo_name = "I8816STRD2";
  607. printf("Benchmark I8816STRD2_SMALL_GROUP algo\n");
  608. bench_case(1, 32, 32, 200, 200, 3, 1, 1, 2);
  609. bench_case(1, 32, 32, 128, 128, 3, 1, 1, 2);
  610. bench_case(1, 32, 32, 100, 100, 3, 1, 1, 2);
  611. bench_case(1, 32, 32, 80, 80, 3, 1, 1, 2);
  612. benchmark_impl(
  613. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  614. data_type);
  615. benchmark_impl(
  616. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  617. data_type);
  618. benchmark_impl(
  619. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  620. data_type);
  621. }
  622. TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_INT8_INT8_INT8_STRIDE1) {
  623. constexpr size_t RUNS = 50;
  624. param::ConvBias param;
  625. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  626. param.pad_h = 1;
  627. param.pad_w = 1;
  628. param.stride_h = 1;
  629. param.stride_w = 1;
  630. param.sparse = param::ConvBias::Sparse::GROUP;
  631. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  632. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  633. size_t group, size_t P, size_t S) {
  634. SmallVector<TensorShape> shapes{
  635. {N, IC, H, W},
  636. {group, OC / group, IC / group, FS, FS},
  637. {1, OC, 1, 1},
  638. {},
  639. {N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1}};
  640. TensorShape dst{N, OC, H, W};
  641. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  642. dst.total_nr_elems()) *
  643. 1e-6;
  644. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  645. };
  646. bench_case(1, 32, 32, 200, 200, 3, 4, 1, 1);
  647. bench_case(1, 32, 32, 200, 200, 3, 32, 1, 1);
  648. bench_case(1, 32, 32, 128, 128, 3, 4, 1, 1);
  649. bench_case(1, 32, 32, 128, 128, 3, 32, 1, 1);
  650. bench_case(1, 32, 32, 100, 100, 3, 4, 1, 1);
  651. bench_case(1, 32, 32, 100, 100, 3, 32, 1, 1);
  652. bench_case(1, 32, 32, 80, 80, 3, 4, 1, 1);
  653. bench_case(1, 32, 32, 80, 80, 3, 32, 1, 1);
  654. std::string algo_name = "S8STRD1";
  655. printf("Benchmark S8STRD1_LARGE_GROUP algo\n");
  656. std::vector<DType> data_type = {
  657. dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f),
  658. dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f)};
  659. benchmark_impl(
  660. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  661. data_type);
  662. benchmark_impl(
  663. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  664. data_type);
  665. benchmark_impl(
  666. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  667. data_type);
  668. shapes_and_computation.clear();
  669. algo_name = "S8STRD1";
  670. printf("Benchmark S8STRD1_SMALL_GROUP algo\n");
  671. bench_case(1, 32, 32, 200, 200, 3, 1, 1, 1);
  672. bench_case(1, 32, 32, 128, 128, 3, 1, 1, 1);
  673. bench_case(1, 32, 32, 100, 100, 3, 1, 1, 1);
  674. bench_case(1, 32, 32, 80, 80, 3, 1, 1, 1);
  675. benchmark_impl(
  676. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  677. data_type);
  678. benchmark_impl(
  679. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  680. data_type);
  681. benchmark_impl(
  682. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  683. data_type);
  684. }
  685. TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_INT8_NCHW44) {
  686. constexpr size_t RUNS = 40;
  687. std::vector<DType> data_type = {
  688. dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f),
  689. dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f)};
  690. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  691. size_t group, size_t P, size_t S, bool is_nchw = false) {
  692. param::ConvBias param;
  693. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  694. param.pad_h = P;
  695. param.pad_w = P;
  696. param.stride_h = S;
  697. param.stride_w = S;
  698. param.sparse = param::ConvBias::Sparse::DENSE;
  699. param.format = param::ConvBias::Format::NCHW44;
  700. auto OH = (H + 2 * P - FS) / static_cast<size_t>(S) + 1;
  701. auto OW = (W + 2 * P - FS) / static_cast<size_t>(S) + 1;
  702. TensorShape src = {N, IC / 4, H, W, 4};
  703. TensorShape filter = {OC / 4, IC / 4, FS, FS, 4, 4};
  704. if (group > 1) {
  705. filter = {group, OC / group / 4, IC / group / 4, FS, FS, 4, 4};
  706. param.sparse = param::ConvBias::Sparse::GROUP;
  707. }
  708. if (is_nchw) {
  709. src = {N, IC, H, W};
  710. filter = {OC / 4, FS, FS, IC, 4};
  711. }
  712. TensorShape bias = {1, OC / 4, 1, 1, 4};
  713. TensorShape dst = {N, OC / 4, OH, OW, 4};
  714. SmallVector<TensorShape> shapes{src, filter, bias, {}, dst};
  715. float computations = (((IC / group) * FS * FS + 1) * dst.total_nr_elems() * 2 +
  716. dst.total_nr_elems()) *
  717. 1e-6;
  718. std::vector<std::pair<SmallVector<TensorShape>, float>> shape_arg = {
  719. std::make_pair(shapes, computations)};
  720. benchmark_impl(
  721. param, shape_arg, ".+", RUNS, {4, {4, 5, 6, 7}}, {1, {7}}, data_type);
  722. };
  723. bench_case(1, 2, 64, 160, 160, 1, 1, 0, 1, true);
  724. bench_case(1, 3, 64, 224, 224, 7, 1, 3, 2, true);
  725. bench_case(1, 64, 64, 56, 56, 3, 1, 1, 1);
  726. bench_case(1, 128, 128, 28, 28, 3, 1, 1, 1);
  727. bench_case(1, 256, 256, 14, 14, 3, 1, 1, 1);
  728. bench_case(1, 512, 512, 7, 7, 3, 1, 1, 1);
  729. bench_case(1, 64, 64, 56, 56, 3, 4, 1, 1);
  730. bench_case(1, 128, 128, 28, 28, 3, 4, 1, 1);
  731. bench_case(1, 256, 256, 14, 14, 3, 4, 1, 1);
  732. bench_case(1, 512, 512, 7, 7, 3, 4, 1, 1);
  733. bench_case(1, 4, 64, 224, 224, 7, 1, 1, 2);
  734. bench_case(1, 256, 128, 56, 56, 3, 1, 1, 2);
  735. bench_case(1, 512, 256, 28, 28, 3, 1, 1, 2);
  736. bench_case(1, 4, 32, 224, 224, 3, 1, 1, 2);
  737. bench_case(1, 256, 128, 56, 56, 3, 4, 1, 2);
  738. bench_case(1, 512, 256, 28, 28, 3, 4, 1, 2);
  739. }
  740. #if MGB_ENABLE_DOT
  741. TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_INT8_NCHW44_DOT) {
  742. constexpr size_t RUNS = 40;
  743. std::vector<DType> data_type = {
  744. dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f),
  745. dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f)};
  746. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  747. size_t group, size_t P, size_t S, bool is_nchw = false) {
  748. param::ConvBias param;
  749. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  750. param.pad_h = P;
  751. param.pad_w = P;
  752. param.stride_h = S;
  753. param.stride_w = S;
  754. param.sparse = param::ConvBias::Sparse::DENSE;
  755. param.format = param::ConvBias::Format::NCHW44_DOT;
  756. auto OH = (H + 2 * P - FS) / static_cast<size_t>(S) + 1;
  757. auto OW = (W + 2 * P - FS) / static_cast<size_t>(S) + 1;
  758. TensorShape src = {N, IC / 4, H, W, 4};
  759. TensorShape filter = {OC / 4, IC / 4, FS, FS, 4, 4};
  760. if (group > 1) {
  761. filter = {group, OC / group / 4, IC / group / 4, FS, FS, 4, 4};
  762. param.sparse = param::ConvBias::Sparse::GROUP;
  763. }
  764. if (is_nchw) {
  765. src = {N, IC, H, W};
  766. filter = {OC / 4, FS, FS, IC, 4};
  767. }
  768. TensorShape bias = {1, OC / 4, 1, 1, 4};
  769. TensorShape dst = {N, OC / 4, OH, OW, 4};
  770. SmallVector<TensorShape> shapes{src, filter, bias, {}, dst};
  771. float computations = (((IC / group) * FS * FS + 1) * dst.total_nr_elems() * 2 +
  772. dst.total_nr_elems()) *
  773. 1e-6;
  774. std::vector<std::pair<SmallVector<TensorShape>, float>> shape_arg = {
  775. std::make_pair(shapes, computations)};
  776. benchmark_impl(
  777. param, shape_arg, ".+", RUNS, {4, {4, 5, 6, 7}}, {1, {7}}, data_type);
  778. };
  779. bench_case(1, 64, 64, 56, 56, 3, 1, 1, 1);
  780. bench_case(1, 128, 128, 28, 28, 3, 1, 1, 1);
  781. bench_case(1, 256, 256, 14, 14, 3, 1, 1, 1);
  782. bench_case(1, 512, 512, 7, 7, 3, 1, 1, 1);
  783. bench_case(1, 64, 64, 56, 56, 3, 4, 1, 1);
  784. bench_case(1, 128, 128, 28, 28, 3, 4, 1, 1);
  785. bench_case(1, 256, 256, 14, 14, 3, 4, 1, 1);
  786. bench_case(1, 512, 512, 7, 7, 3, 4, 1, 1);
  787. }
  788. TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_INT8_NCHW44_DOT_S2) {
  789. constexpr size_t RUNS = 40;
  790. std::vector<DType> data_type = {
  791. dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f),
  792. dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f)};
  793. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  794. size_t group, size_t P, size_t S, bool is_nchw = false) {
  795. param::ConvBias param;
  796. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  797. param.pad_h = P;
  798. param.pad_w = P;
  799. param.stride_h = S;
  800. param.stride_w = S;
  801. param.sparse = param::ConvBias::Sparse::DENSE;
  802. param.format = param::ConvBias::Format::NCHW44_DOT;
  803. auto OH = (H + 2 * P - FS) / static_cast<size_t>(S) + 1;
  804. auto OW = (W + 2 * P - FS) / static_cast<size_t>(S) + 1;
  805. TensorShape src = {N, IC / 4, H, W, 4};
  806. TensorShape filter = {OC / 4, IC / 4, FS, FS, 4, 4};
  807. if (group > 1) {
  808. filter = {group, OC / group / 4, IC / group / 4, FS, FS, 4, 4};
  809. param.sparse = param::ConvBias::Sparse::GROUP;
  810. }
  811. if (is_nchw) {
  812. src = {N, IC, H, W};
  813. filter = {OC / 4, FS, FS, IC, 4};
  814. }
  815. TensorShape bias = {1, OC / 4, 1, 1, 4};
  816. TensorShape dst = {N, OC / 4, OH, OW, 4};
  817. SmallVector<TensorShape> shapes{src, filter, bias, {}, dst};
  818. float computations = (((IC / group) * FS * FS + 1) * dst.total_nr_elems() * 2 +
  819. dst.total_nr_elems()) *
  820. 1e-6;
  821. std::vector<std::pair<SmallVector<TensorShape>, float>> shape_arg = {
  822. std::make_pair(shapes, computations)};
  823. benchmark_impl(
  824. param, shape_arg, ".+", RUNS, {4, {4, 5, 6, 7}}, {1, {7}}, data_type);
  825. };
  826. bench_case(1, 64, 64, 56, 56, 3, 1, 1, 2);
  827. bench_case(1, 64, 64, 128, 128, 3, 1, 1, 2);
  828. bench_case(1, 64, 64, 256, 256, 3, 1, 1, 2);
  829. bench_case(1, 64, 64, 156, 156, 3, 1, 1, 2);
  830. bench_case(1, 128, 128, 28, 28, 3, 1, 1, 2);
  831. bench_case(1, 256, 256, 14, 14, 3, 1, 1, 2);
  832. bench_case(1, 512, 512, 7, 7, 3, 1, 1, 2);
  833. bench_case(1, 64, 64, 56, 56, 3, 4, 1, 2);
  834. bench_case(1, 128, 128, 28, 28, 3, 4, 1, 2);
  835. bench_case(1, 256, 256, 14, 14, 3, 4, 1, 2);
  836. bench_case(1, 512, 512, 7, 7, 3, 4, 1, 2);
  837. }
  838. #endif
  839. TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_FLOAT_NCHW44) {
  840. constexpr size_t RUNS = 40;
  841. std::vector<DType> data_type = {
  842. dtype::Float32(), dtype::Float32(), dtype::Float32(), dtype::Float32()};
  843. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  844. size_t group, size_t P, size_t S, bool is_nchw = false) {
  845. param::ConvBias param;
  846. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  847. param.pad_h = P;
  848. param.pad_w = P;
  849. param.stride_h = S;
  850. param.stride_w = S;
  851. param.sparse = param::ConvBias::Sparse::DENSE;
  852. param.format = param::ConvBias::Format::NCHW44;
  853. auto OH = (H + 2 * P - FS) / static_cast<size_t>(S) + 1;
  854. auto OW = (W + 2 * P - FS) / static_cast<size_t>(S) + 1;
  855. TensorShape src = {N, IC / 4, H, W, 4};
  856. TensorShape filter = {OC / 4, IC / 4, FS, FS, 4, 4};
  857. if (group > 1) {
  858. filter = {group, OC / group / 4, IC / group / 4, FS, FS, 4, 4};
  859. param.sparse = param::ConvBias::Sparse::GROUP;
  860. }
  861. if (is_nchw) {
  862. src = {N, IC, H, W};
  863. filter = {OC / 4, FS, FS, IC, 4};
  864. }
  865. TensorShape bias = {1, OC / 4, 1, 1, 4};
  866. TensorShape dst = {N, OC / 4, OH, OW, 4};
  867. SmallVector<TensorShape> shapes{src, filter, bias, {}, dst};
  868. float computations = (((IC / group) * FS * FS + 1) * dst.total_nr_elems() * 2 +
  869. dst.total_nr_elems()) *
  870. 1e-6;
  871. std::vector<std::pair<SmallVector<TensorShape>, float>> shape_arg = {
  872. std::make_pair(shapes, computations)};
  873. benchmark_impl(
  874. param, shape_arg, ".+", RUNS, {4, {4, 5, 6, 7}}, {1, {7}}, data_type);
  875. };
  876. bench_case(1, 64, 64, 56, 56, 3, 1, 1, 2);
  877. bench_case(1, 128, 128, 28, 28, 3, 1, 1, 2);
  878. bench_case(1, 256, 256, 14, 14, 3, 1, 1, 2);
  879. bench_case(1, 512, 512, 7, 7, 3, 1, 1, 2);
  880. bench_case(1, 64, 64, 56, 56, 3, 4, 1, 2);
  881. bench_case(1, 128, 128, 28, 28, 3, 4, 1, 2);
  882. bench_case(1, 256, 256, 14, 14, 3, 4, 1, 2);
  883. bench_case(1, 512, 512, 7, 7, 3, 4, 1, 2);
  884. bench_case(1, 64, 64, 56 * 2, 56 * 2, 3, 4, 1, 2);
  885. bench_case(1, 128, 128, 28 * 2, 28 * 2, 3, 4, 1, 2);
  886. bench_case(1, 256, 256, 14 * 2, 14 * 2, 3, 4, 1, 2);
  887. bench_case(1, 512, 512, 7 * 2, 7 * 2, 3, 4, 1, 2);
  888. }
  889. TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_INT8_INT8_INT8_STRIDE2) {
  890. constexpr size_t RUNS = 50;
  891. param::ConvBias param;
  892. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  893. param.pad_h = 1;
  894. param.pad_w = 1;
  895. param.stride_h = 2;
  896. param.stride_w = 2;
  897. param.sparse = param::ConvBias::Sparse::GROUP;
  898. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  899. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  900. size_t group, size_t P, size_t S) {
  901. SmallVector<TensorShape> shapes{
  902. {N, IC, H, W},
  903. {group, OC / group, IC / group, FS, FS},
  904. {1, OC, 1, 1},
  905. {},
  906. {N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1}};
  907. TensorShape dst{N, OC, H, W};
  908. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  909. dst.total_nr_elems()) *
  910. 1e-6;
  911. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  912. };
  913. bench_case(1, 32, 32, 200, 200, 3, 4, 1, 2);
  914. bench_case(1, 32, 32, 200, 200, 3, 32, 1, 2);
  915. bench_case(1, 32, 32, 128, 128, 3, 4, 1, 2);
  916. bench_case(1, 32, 32, 128, 128, 3, 32, 1, 2);
  917. bench_case(1, 32, 32, 100, 100, 3, 4, 1, 2);
  918. bench_case(1, 32, 32, 100, 100, 3, 32, 1, 2);
  919. bench_case(1, 32, 32, 80, 80, 3, 4, 1, 2);
  920. bench_case(1, 32, 32, 80, 80, 3, 32, 1, 2);
  921. std::string algo_name = "S8STRD2";
  922. printf("Benchmark S8STRD2_LARGE_GROUP algo\n");
  923. std::vector<DType> data_type = {
  924. dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f),
  925. dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f)};
  926. benchmark_impl(
  927. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  928. data_type);
  929. benchmark_impl(
  930. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  931. data_type);
  932. benchmark_impl(
  933. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  934. data_type);
  935. shapes_and_computation.clear();
  936. algo_name = "S8STRD2";
  937. printf("Benchmark S8STRD2_SMALL_GROUP algo\n");
  938. bench_case(1, 32, 32, 200, 200, 3, 1, 1, 2);
  939. bench_case(1, 32, 32, 128, 128, 3, 1, 1, 2);
  940. bench_case(1, 32, 32, 100, 100, 3, 1, 1, 2);
  941. bench_case(1, 32, 32, 80, 80, 3, 1, 1, 2);
  942. benchmark_impl(
  943. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  944. data_type);
  945. benchmark_impl(
  946. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  947. data_type);
  948. benchmark_impl(
  949. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  950. data_type);
  951. }
  952. #if MGB_ENABLE_DOT
  953. TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS,
  954. BENCHMARK_CONVBIAS_INT8_INT8_INT8_STRIDE1_WITHDOTPROD) {
  955. constexpr size_t RUNS = 50;
  956. param::ConvBias param;
  957. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  958. param.pad_h = 1;
  959. param.pad_w = 1;
  960. param.stride_h = 1;
  961. param.stride_w = 1;
  962. param.sparse = param::ConvBias::Sparse::GROUP;
  963. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  964. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  965. size_t group, size_t P, size_t S) {
  966. SmallVector<TensorShape> shapes{
  967. {N, IC, H, W},
  968. {group, OC / group, IC / group, FS, FS},
  969. {1, OC, 1, 1},
  970. {},
  971. {N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1}};
  972. TensorShape dst{N, OC, H, W};
  973. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  974. dst.total_nr_elems()) *
  975. 1e-6;
  976. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  977. };
  978. bench_case(1, 32, 32, 200, 200, 3, 4, 1, 1);
  979. bench_case(1, 32, 32, 200, 200, 3, 32, 1, 1);
  980. bench_case(1, 32, 32, 128, 128, 3, 4, 1, 1);
  981. bench_case(1, 32, 32, 128, 128, 3, 32, 1, 1);
  982. bench_case(1, 32, 32, 100, 100, 3, 4, 1, 1);
  983. bench_case(1, 32, 32, 100, 100, 3, 32, 1, 1);
  984. bench_case(1, 32, 32, 80, 80, 3, 4, 1, 1);
  985. bench_case(1, 32, 32, 80, 80, 3, 32, 1, 1);
  986. std::string algo_name = "ARMDOTS8STRD1";
  987. printf("Benchmark ARMDOTS8STRD1_LARGE_GROUP algo\n");
  988. std::vector<DType> data_type = {
  989. dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f),
  990. dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f)};
  991. benchmark_impl(
  992. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  993. data_type);
  994. benchmark_impl(
  995. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  996. data_type);
  997. benchmark_impl(
  998. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  999. data_type);
  1000. shapes_and_computation.clear();
  1001. algo_name = "ARMDOTS8STRD1";
  1002. printf("Benchmark ARMDOTS8STRD1_SMALL_GROUP algo\n");
  1003. bench_case(1, 32, 32, 200, 200, 3, 1, 1, 1);
  1004. bench_case(1, 32, 32, 128, 128, 3, 1, 1, 1);
  1005. bench_case(1, 32, 32, 100, 100, 3, 1, 1, 1);
  1006. bench_case(1, 32, 32, 80, 80, 3, 1, 1, 1);
  1007. benchmark_impl(
  1008. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  1009. data_type);
  1010. benchmark_impl(
  1011. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  1012. data_type);
  1013. benchmark_impl(
  1014. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  1015. data_type);
  1016. }
  1017. TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS,
  1018. BENCHMARK_CONVBIAS_INT8_INT8_INT8_STRIDE2_WITHDOTPROD) {
  1019. constexpr size_t RUNS = 50;
  1020. param::ConvBias param;
  1021. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  1022. param.pad_h = 1;
  1023. param.pad_w = 1;
  1024. param.stride_h = 2;
  1025. param.stride_w = 2;
  1026. param.sparse = param::ConvBias::Sparse::GROUP;
  1027. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  1028. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  1029. size_t group, size_t P, size_t S) {
  1030. SmallVector<TensorShape> shapes{
  1031. {N, IC, H, W},
  1032. {group, OC / group, IC / group, FS, FS},
  1033. {1, OC, 1, 1},
  1034. {},
  1035. {N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1}};
  1036. TensorShape dst{N, OC, H, W};
  1037. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  1038. dst.total_nr_elems()) *
  1039. 1e-6;
  1040. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  1041. };
  1042. bench_case(1, 32, 32, 200, 200, 3, 4, 1, 2);
  1043. bench_case(1, 32, 32, 200, 200, 3, 32, 1, 2);
  1044. bench_case(1, 32, 32, 128, 128, 3, 4, 1, 2);
  1045. bench_case(1, 32, 32, 128, 128, 3, 32, 1, 2);
  1046. bench_case(1, 32, 32, 100, 100, 3, 4, 1, 2);
  1047. bench_case(1, 32, 32, 100, 100, 3, 32, 1, 2);
  1048. bench_case(1, 32, 32, 80, 80, 3, 4, 1, 2);
  1049. bench_case(1, 32, 32, 80, 80, 3, 32, 1, 2);
  1050. std::string algo_name = "ARMDOTS8STRD2";
  1051. printf("Benchmark ARMDOTS8STRD2_LARGE_GROUP algo\n");
  1052. std::vector<DType> data_type = {
  1053. dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f),
  1054. dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f)};
  1055. benchmark_impl(
  1056. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  1057. data_type);
  1058. benchmark_impl(
  1059. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  1060. data_type);
  1061. benchmark_impl(
  1062. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  1063. data_type);
  1064. shapes_and_computation.clear();
  1065. algo_name = "ARMDOTS8STRD2";
  1066. printf("Benchmark ARMDOTS8STRD2_SMALL_GROUP algo\n");
  1067. bench_case(1, 32, 32, 200, 200, 3, 1, 1, 2);
  1068. bench_case(1, 32, 32, 128, 128, 3, 1, 1, 2);
  1069. bench_case(1, 32, 32, 100, 100, 3, 1, 1, 2);
  1070. bench_case(1, 32, 32, 80, 80, 3, 1, 1, 2);
  1071. benchmark_impl(
  1072. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  1073. data_type);
  1074. benchmark_impl(
  1075. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  1076. data_type);
  1077. benchmark_impl(
  1078. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  1079. data_type);
  1080. }
  1081. #endif
  1082. TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS,
  1083. BENCHMARK_CONVBIAS_QUINT8_QUINT8_QUINT8_STRIDE1) {
  1084. constexpr size_t RUNS = 50;
  1085. param::ConvBias param;
  1086. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  1087. param.pad_h = 1;
  1088. param.pad_w = 1;
  1089. param.stride_h = 1;
  1090. param.stride_w = 1;
  1091. param.sparse = param::ConvBias::Sparse::GROUP;
  1092. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  1093. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  1094. size_t group, size_t P, size_t S) {
  1095. SmallVector<TensorShape> shapes{
  1096. {N, IC, H, W},
  1097. {group, OC / group, IC / group, FS, FS},
  1098. {1, OC, 1, 1},
  1099. {},
  1100. {N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1}};
  1101. TensorShape dst{N, OC, H, W};
  1102. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  1103. dst.total_nr_elems()) *
  1104. 1e-6;
  1105. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  1106. };
  1107. bench_case(1, 32, 32, 200, 200, 3, 4, 1, 1);
  1108. bench_case(1, 32, 32, 200, 200, 3, 32, 1, 1);
  1109. bench_case(1, 32, 32, 128, 128, 3, 4, 1, 1);
  1110. bench_case(1, 32, 32, 128, 128, 3, 32, 1, 1);
  1111. bench_case(1, 32, 32, 100, 100, 3, 4, 1, 1);
  1112. bench_case(1, 32, 32, 100, 100, 3, 32, 1, 1);
  1113. bench_case(1, 32, 32, 80, 80, 3, 4, 1, 1);
  1114. bench_case(1, 32, 32, 80, 80, 3, 32, 1, 1);
  1115. std::string algo_name = "QU8STRD1";
  1116. printf("Benchmark QU8STRD1_LARGE_GROUP algo\n");
  1117. std::vector<DType> data_type = {
  1118. dtype::Quantized8Asymm(0.2f, 100), dtype::Quantized8Asymm(0.2f, 120),
  1119. dtype::QuantizedS32(0.04f), dtype::Quantized8Asymm(1.4f, 110)};
  1120. benchmark_impl(
  1121. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  1122. data_type);
  1123. benchmark_impl(
  1124. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  1125. data_type);
  1126. benchmark_impl(
  1127. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  1128. data_type);
  1129. shapes_and_computation.clear();
  1130. algo_name = "QU8STRD1";
  1131. printf("Benchmark QU8STRD1_SMALL_GROUP algo\n");
  1132. bench_case(1, 32, 32, 200, 200, 3, 1, 1, 1);
  1133. bench_case(1, 32, 32, 128, 128, 3, 1, 1, 1);
  1134. bench_case(1, 32, 32, 100, 100, 3, 1, 1, 1);
  1135. bench_case(1, 32, 32, 80, 80, 3, 1, 1, 1);
  1136. benchmark_impl(
  1137. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  1138. data_type);
  1139. benchmark_impl(
  1140. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  1141. data_type);
  1142. benchmark_impl(
  1143. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  1144. data_type);
  1145. }
  1146. TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS,
  1147. BENCHMARK_CONVBIAS_QUINT8_QUINT8_QUINT8_STRIDE2) {
  1148. constexpr size_t RUNS = 50;
  1149. param::ConvBias param;
  1150. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  1151. param.pad_h = 1;
  1152. param.pad_w = 1;
  1153. param.stride_h = 2;
  1154. param.stride_w = 2;
  1155. param.sparse = param::ConvBias::Sparse::GROUP;
  1156. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  1157. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  1158. size_t group, size_t P, size_t S) {
  1159. SmallVector<TensorShape> shapes{
  1160. {N, IC, H, W},
  1161. {group, OC / group, IC / group, FS, FS},
  1162. {1, OC, 1, 1},
  1163. {},
  1164. {N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1}};
1165. TensorShape dst{N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1};
  1166. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  1167. dst.total_nr_elems()) *
  1168. 1e-6;
  1169. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  1170. };
  1171. bench_case(1, 32, 32, 200, 200, 3, 4, 1, 2);
  1172. bench_case(1, 32, 32, 200, 200, 3, 32, 1, 2);
  1173. bench_case(1, 32, 32, 128, 128, 3, 4, 1, 2);
  1174. bench_case(1, 32, 32, 128, 128, 3, 32, 1, 2);
  1175. bench_case(1, 32, 32, 100, 100, 3, 4, 1, 2);
  1176. bench_case(1, 32, 32, 100, 100, 3, 32, 1, 2);
  1177. bench_case(1, 32, 32, 80, 80, 3, 4, 1, 2);
  1178. bench_case(1, 32, 32, 80, 80, 3, 32, 1, 2);
  1179. std::string algo_name = "QU8STRD2";
  1180. printf("Benchmark QU8STRD2_LARGE_GROUP algo\n");
  1181. std::vector<DType> data_type = {
  1182. dtype::Quantized8Asymm(0.2f, 100), dtype::Quantized8Asymm(0.2f, 120),
  1183. dtype::QuantizedS32(0.04f), dtype::Quantized8Asymm(1.4f, 110)};
  1184. benchmark_impl(
  1185. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  1186. data_type);
  1187. benchmark_impl(
  1188. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  1189. data_type);
  1190. benchmark_impl(
  1191. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  1192. data_type);
  1193. shapes_and_computation.clear();
  1194. algo_name = "QU8STRD2";
  1195. printf("Benchmark QU8STRD2_SMALL_GROUP algo\n");
  1196. bench_case(1, 32, 32, 200, 200, 3, 1, 1, 2);
  1197. bench_case(1, 32, 32, 128, 128, 3, 1, 1, 2);
  1198. bench_case(1, 32, 32, 100, 100, 3, 1, 1, 2);
  1199. bench_case(1, 32, 32, 80, 80, 3, 1, 1, 2);
  1200. benchmark_impl(
  1201. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  1202. data_type);
  1203. benchmark_impl(
  1204. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  1205. data_type);
  1206. benchmark_impl(
  1207. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  1208. data_type);
  1209. }
  1210. #if MGB_ENABLE_DOT
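// The quint8 benchmarks below use the ARMDOT* kernels, which rely on the Armv8.2
// dot-product instructions, so they are only built when MGB_ENABLE_DOT is defined.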
  1211. TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS,
  1212. BENCHMARK_CONVBIAS_QUINT8_QUINT8_QUINT8_STRIDE1_WITHDOTPROD) {
  1213. constexpr size_t RUNS = 50;
  1214. param::ConvBias param;
  1215. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  1216. param.pad_h = 1;
  1217. param.pad_w = 1;
  1218. param.stride_h = 1;
  1219. param.stride_w = 1;
  1220. param.sparse = param::ConvBias::Sparse::GROUP;
  1221. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  1222. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  1223. size_t group, size_t P, size_t S) {
  1224. SmallVector<TensorShape> shapes{
  1225. {N, IC, H, W},
  1226. {group, OC / group, IC / group, FS, FS},
  1227. {1, OC, 1, 1},
  1228. {},
  1229. {N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1}};
  1230. TensorShape dst{N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1};
  1231. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  1232. dst.total_nr_elems()) *
  1233. 1e-6;
  1234. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  1235. };
  1236. bench_case(1, 32, 32, 200, 200, 3, 4, 1, 1);
  1237. bench_case(1, 32, 32, 200, 200, 3, 32, 1, 1);
  1238. bench_case(1, 32, 32, 128, 128, 3, 4, 1, 1);
  1239. bench_case(1, 32, 32, 128, 128, 3, 32, 1, 1);
  1240. bench_case(1, 32, 32, 100, 100, 3, 4, 1, 1);
  1241. bench_case(1, 32, 32, 100, 100, 3, 32, 1, 1);
  1242. bench_case(1, 32, 32, 80, 80, 3, 4, 1, 1);
  1243. bench_case(1, 32, 32, 80, 80, 3, 32, 1, 1);
  1244. std::string algo_name = "ARMDOTU8STRD1";
  1245. printf("Benchmark ARMDOTU8STRD1_LARGE_GROUP algo\n");
  1246. std::vector<DType> data_type = {
  1247. dtype::Quantized8Asymm(0.2f, 100), dtype::Quantized8Asymm(0.2f, 120),
  1248. dtype::QuantizedS32(0.04f), dtype::Quantized8Asymm(1.4f, 110)};
  1249. benchmark_impl(
  1250. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  1251. data_type);
  1252. benchmark_impl(
  1253. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  1254. data_type);
  1255. benchmark_impl(
  1256. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  1257. data_type);
  1258. shapes_and_computation.clear();
  1259. algo_name = "ARMDOTU8STRD1";
  1260. printf("Benchmark ARMDOTS8STRD1_SMALL_GROUP algo\n");
  1261. bench_case(1, 32, 32, 200, 200, 3, 1, 1, 1);
  1262. bench_case(1, 32, 32, 128, 128, 3, 1, 1, 1);
  1263. bench_case(1, 32, 32, 100, 100, 3, 1, 1, 1);
  1264. bench_case(1, 32, 32, 80, 80, 3, 1, 1, 1);
  1265. benchmark_impl(
  1266. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  1267. data_type);
  1268. benchmark_impl(
  1269. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  1270. data_type);
  1271. benchmark_impl(
  1272. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  1273. data_type);
  1274. }
  1275. TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS,
  1276. BENCHMARK_CONVBIAS_QUINT8_QUINT8_QUINT8_STRIDE2_WITHDOTPROD) {
  1277. constexpr size_t RUNS = 50;
  1278. param::ConvBias param;
  1279. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  1280. param.pad_h = 1;
  1281. param.pad_w = 1;
  1282. param.stride_h = 2;
  1283. param.stride_w = 2;
  1284. param.sparse = param::ConvBias::Sparse::GROUP;
  1285. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  1286. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  1287. size_t group, size_t P, size_t S) {
  1288. SmallVector<TensorShape> shapes{
  1289. {N, IC, H, W},
  1290. {group, OC / group, IC / group, FS, FS},
  1291. {1, OC, 1, 1},
  1292. {},
  1293. {N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1}};
  1294. TensorShape dst{N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1};
  1295. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  1296. dst.total_nr_elems()) *
  1297. 1e-6;
  1298. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  1299. };
  1300. bench_case(1, 32, 32, 200, 200, 5, 4, 1, 2);
  1301. bench_case(1, 32, 32, 200, 200, 5, 32, 1, 2);
  1302. bench_case(1, 32, 32, 128, 128, 5, 4, 1, 2);
  1303. bench_case(1, 32, 32, 128, 128, 5, 32, 1, 2);
  1304. bench_case(1, 32, 32, 100, 100, 5, 4, 1, 2);
  1305. bench_case(1, 32, 32, 100, 100, 5, 32, 1, 2);
  1306. bench_case(1, 32, 32, 80, 80, 5, 4, 1, 2);
  1307. bench_case(1, 32, 32, 80, 80, 5, 32, 1, 2);
  1308. std::string algo_name = "ARMDOTU8STRD2";
  1309. printf("Benchmark ARMDOTU8STRD2_LARGE_GROUP algo\n");
  1310. std::vector<DType> data_type = {
  1311. dtype::Quantized8Asymm(0.2f, 100), dtype::Quantized8Asymm(0.2f, 120),
  1312. dtype::QuantizedS32(0.04f), dtype::Quantized8Asymm(1.4f, 110)};
  1313. benchmark_impl(
  1314. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  1315. data_type);
  1316. benchmark_impl(
  1317. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  1318. data_type);
  1319. benchmark_impl(
  1320. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  1321. data_type);
  1322. shapes_and_computation.clear();
  1323. algo_name = "ARMDOTU8STRD2";
  1324. printf("Benchmark ARMDOTU8STRD2_SMALL_GROUP algo\n");
  1325. bench_case(1, 32, 32, 200, 200, 5, 1, 1, 2);
  1326. bench_case(1, 32, 32, 128, 128, 5, 1, 1, 2);
  1327. bench_case(1, 32, 32, 100, 100, 5, 1, 1, 2);
  1328. bench_case(1, 32, 32, 80, 80, 5, 1, 1, 2);
  1329. benchmark_impl(
  1330. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  1331. data_type);
  1332. benchmark_impl(
  1333. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  1334. data_type);
  1335. benchmark_impl(
  1336. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  1337. data_type);
  1338. }
  1339. #endif
  1340. TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_WINOGRAD_F32) {
  1341. constexpr size_t RUNS = 50;
  1342. param::ConvBias param;
  1343. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  1344. param.pad_h = 1;
  1345. param.pad_w = 1;
  1346. param.stride_h = 1;
  1347. param.stride_w = 1;
  1348. param.sparse = param::ConvBias::Sparse::GROUP;
  1349. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  1350. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  1351. size_t group) {
  1352. SmallVector<TensorShape> shapes{
  1353. {N, IC, H, W},
  1354. {group, OC / group, IC / group, FS, FS},
  1355. {1, OC, 1, 1},
  1356. {},
  1357. {N, OC, H, W}};
  1358. TensorShape dst{N, OC, H, W};
  1359. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  1360. dst.total_nr_elems()) *
  1361. 1e-6;
  1362. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  1363. };
  1364. bench_case(1, 32, 32, 200, 200, 3, 4);
  1365. bench_case(1, 32, 32, 200, 200, 3, 1);
  1366. bench_case(1, 32, 32, 128, 128, 3, 4);
  1367. bench_case(1, 32, 32, 128, 128, 3, 1);
  1368. bench_case(1, 32, 32, 100, 100, 3, 4);
  1369. bench_case(1, 32, 32, 100, 100, 3, 1);
  1370. bench_case(1, 32, 32, 80, 80, 3, 4);
  1371. bench_case(1, 512, 512, 14, 14, 3, 1);
  1372. bench_case(1, 512, 256, 14, 14, 3, 1);
  1373. bench_case(1, 512, 128, 14, 14, 3, 1);
  1374. bench_case(1, 512, 64, 14, 14, 3, 1);
  1375. bench_case(1, 512, 512, 7, 7, 3, 1);
  1376. bench_case(1, 512, 256, 7, 7, 3, 1);
  1377. bench_case(1, 512, 128, 7, 7, 3, 1);
  1378. bench_case(1, 512, 64, 7, 7, 3, 1);
  1379. std::string algo_name;
  1380. #if MEGDNN_AARCH64
  1381. algo_name = "WINOGRAD:AARCH64_F32_MK4_4x16:4:2";
  1382. #else
  1383. algo_name = "WINOGRAD:ARMV7_F32_MK4_4x8:4:2";
  1384. #endif
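// The Winograd algo string appears to follow "WINOGRAD:<matmul algo>:<pack>:<output block>";
// the ":4:2" suffix here presumably selects the MK4-packed F(2,3) variant
// (assumed meaning of the suffix fields).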
  1385. std::vector<DType> data_type = {
  1386. dtype::Float32(), dtype::Float32(), dtype::Float32(), dtype::Float32()};
  1387. printf("Benchmark WINOGRAD_F32_MK4 algo\n");
  1388. benchmark_impl(
  1389. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  1390. data_type);
  1391. benchmark_impl(
  1392. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  1393. data_type);
  1394. benchmark_impl(
  1395. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  1396. data_type);
  1397. }
  1398. TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_WINOGRAD_INT8) {
  1399. constexpr size_t RUNS = 50;
  1400. param::ConvBias param;
  1401. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  1402. param.pad_h = 1;
  1403. param.pad_w = 1;
  1404. param.stride_h = 1;
  1405. param.stride_w = 1;
  1406. param.sparse = param::ConvBias::Sparse::GROUP;
  1407. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  1408. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  1409. size_t group) {
  1410. SmallVector<TensorShape> shapes{
  1411. {N, IC, H, W},
  1412. {group, OC / group, IC / group, FS, FS},
  1413. {1, OC, 1, 1},
  1414. {},
  1415. {N, OC, H, W}};
  1416. TensorShape dst{N, OC, H, W};
  1417. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  1418. dst.total_nr_elems()) *
  1419. 1e-6;
  1420. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  1421. };
  1422. bench_case(1, 32, 32, 200, 200, 3, 4);
  1423. bench_case(1, 32, 32, 200, 200, 3, 1);
  1424. bench_case(1, 32, 32, 128, 128, 3, 4);
  1425. bench_case(1, 32, 32, 128, 128, 3, 1);
  1426. bench_case(1, 32, 32, 100, 100, 3, 4);
  1427. bench_case(1, 32, 32, 100, 100, 3, 1);
  1428. bench_case(1, 32, 32, 80, 80, 3, 4);
  1429. bench_case(1, 512, 512, 14, 14, 3, 1);
  1430. bench_case(1, 512, 256, 14, 14, 3, 1);
  1431. bench_case(1, 512, 128, 14, 14, 3, 1);
  1432. bench_case(1, 512, 64, 14, 14, 3, 1);
  1433. bench_case(1, 512, 512, 7, 7, 3, 1);
  1434. bench_case(1, 512, 256, 7, 7, 3, 1);
  1435. bench_case(1, 512, 128, 7, 7, 3, 1);
  1436. bench_case(1, 512, 64, 7, 7, 3, 1);
  1437. std::string algo_name;
  1438. #if MEGDNN_AARCH64
  1439. algo_name = "WINOGRAD:AARCH64_INT16X16X32_MK8_8X8:8:2:32";
  1440. #else
  1441. algo_name = "WINOGRAD:ARMV7_INT16X16X32_MK8_4X8:8:2:32";
  1442. #endif
  1443. std::vector<DType> data_type = {
  1444. dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f),
  1445. dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f)};
  1446. printf("Benchmark WINOGRAD_IN8_MK8 algo\n");
  1447. benchmark_impl(
  1448. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  1449. data_type);
  1450. benchmark_impl(
  1451. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  1452. data_type);
  1453. benchmark_impl(
  1454. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  1455. data_type);
  1456. }
  1457. TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS,
  1458. BENCHMARK_CONVBIAS_WINOGRAD_NCHW44_INT8_MK8) {
  1459. constexpr size_t RUNS = 50;
  1460. param::ConvBias param;
  1461. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  1462. param.pad_h = 1;
  1463. param.pad_w = 1;
  1464. param.stride_h = 1;
  1465. param.stride_w = 1;
  1466. param.sparse = param::ConvBias::Sparse::DENSE;
  1467. param.format = param::ConvBias::Format::NCHW44;
  1468. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  1469. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  1470. size_t group) {
  1471. SmallVector<TensorShape> shapes{
  1472. {N, IC / 4, H, W, 4},
  1473. {OC / 4, IC / 4, FS, FS, 4, 4},
  1474. {1, OC / 4, 1, 1, 4},
  1475. {},
  1476. {N, OC / 4, H, W, 4}};
  1477. TensorShape dst{N, OC, H, W};
  1478. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  1479. dst.total_nr_elems()) *
  1480. 1e-6;
  1481. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  1482. };
  1483. bench_case(1, 32, 32, 200, 200, 3, 1);
  1484. bench_case(1, 32, 32, 128, 128, 3, 1);
  1485. bench_case(1, 32, 32, 100, 100, 3, 1);
  1486. bench_case(1, 512, 512, 14, 14, 3, 1);
  1487. bench_case(1, 512, 256, 14, 14, 3, 1);
  1488. bench_case(1, 512, 128, 14, 14, 3, 1);
  1489. bench_case(1, 512, 64, 14, 14, 3, 1);
  1490. bench_case(1, 512, 512, 7, 7, 3, 1);
  1491. bench_case(1, 512, 256, 7, 7, 3, 1);
  1492. bench_case(1, 512, 128, 7, 7, 3, 1);
  1493. bench_case(1, 512, 64, 7, 7, 3, 1);
  1494. std::string algo_name;
  1495. #if MEGDNN_AARCH64
  1496. algo_name = "WINOGRAD_NCHW44:AARCH64_INT16X16X32_MK8_8X8:8:2:32";
  1497. #else
  1498. algo_name = "WINOGRAD_NCHW44:ARMV7_INT16X16X32_MK8_4X8:8:2:32";
  1499. #endif
  1500. std::vector<DType> data_type = {
  1501. dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f),
  1502. dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f)};
  1503. printf("Benchmark WINOGRAD_INT8_MK8 algo\n");
  1504. benchmark_impl(
  1505. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  1506. data_type);
  1507. benchmark_impl(
  1508. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  1509. data_type);
  1510. benchmark_impl(
  1511. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  1512. data_type);
  1513. }
  1514. TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS,
  1515. BENCHMARK_CONVBIAS_WINOGRAD_NCHW44_INT8_COMP_F32) {
  1516. constexpr size_t RUNS = 50;
  1517. param::ConvBias param;
  1518. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  1519. param.pad_h = 1;
  1520. param.pad_w = 1;
  1521. param.stride_h = 1;
  1522. param.stride_w = 1;
  1523. param.sparse = param::ConvBias::Sparse::DENSE; // GROUP;
  1524. param.format = param::ConvBias::Format::NCHW44;
  1525. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  1526. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  1527. size_t group) {
  1528. SmallVector<TensorShape> shapes{
  1529. {N, IC / 4, H, W, 4},
  1530. {OC / 4, IC / 4, FS, FS, 4, 4},
  1531. {1, OC / 4, 1, 1, 4},
  1532. {},
  1533. {N, OC / 4, H, W, 4}};
  1534. TensorShape dst{N, OC, H, W};
  1535. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  1536. dst.total_nr_elems()) *
  1537. 1e-6;
  1538. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  1539. };
  1540. bench_case(1, 32, 32, 200, 200, 3, 1);
  1541. bench_case(1, 32, 32, 128, 128, 3, 1);
  1542. bench_case(1, 32, 32, 100, 100, 3, 1);
  1543. bench_case(1, 512, 512, 14, 14, 3, 1);
  1544. bench_case(1, 512, 256, 14, 14, 3, 1);
  1545. bench_case(1, 512, 128, 14, 14, 3, 1);
  1546. bench_case(1, 512, 64, 14, 14, 3, 1);
  1547. bench_case(1, 512, 512, 7, 7, 3, 1);
  1548. bench_case(1, 512, 256, 7, 7, 3, 1);
  1549. bench_case(1, 512, 128, 7, 7, 3, 1);
  1550. bench_case(1, 512, 64, 7, 7, 3, 1);
  1551. std::string algo_name;
  1552. #if MEGDNN_AARCH64
  1553. algo_name = "WINOGRAD_NCHW44:AARCH64_F32_MK4_4x16:4:2:32";
  1554. #else
  1555. algo_name = "WINOGRAD_NCHW44:ARMV7_F32_MK4_4x8:4:2:32";
  1556. #endif
  1557. std::vector<DType> data_type = {
  1558. dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f),
  1559. dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f)};
  1560. printf("Benchmark WINOGRAD_INT8_NCHW44_MK4_COMP_F32 algo\n");
  1561. benchmark_impl(
  1562. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  1563. data_type);
  1564. benchmark_impl(
  1565. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  1566. data_type);
  1567. benchmark_impl(
  1568. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  1569. data_type);
  1570. }
  1571. TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_IM2COL_FP32) {
  1572. constexpr size_t RUNS = 50;
  1573. param::ConvBias param;
  1574. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  1575. param.pad_h = 1;
  1576. param.pad_w = 1;
  1577. param.stride_h = 1;
  1578. param.stride_w = 1;
  1579. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  1580. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  1581. size_t group) {
  1582. SmallVector<TensorShape> shapes{
  1583. {N, IC, H, W},
  1584. {OC, IC / group, FS, FS},
  1585. {1, OC, 1, 1},
  1586. {},
  1587. {N, OC, H, W}};
  1588. TensorShape dst{N, OC, H, W};
  1589. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  1590. dst.total_nr_elems()) *
  1591. 1e-6;
  1592. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  1593. };
  1594. std::vector<DType> data_type = {
  1595. dtype::Float32(), dtype::Float32(), dtype::Float32(), dtype::Float32()};
  1596. bench_case(1, 32, 32, 300, 300, 3, 1);
  1597. bench_case(1, 32, 32, 400, 400, 3, 1);
  1598. bench_case(1, 32, 32, 100, 100, 3, 1);
  1599. bench_case(1, 32, 32, 80, 80, 3, 1);
  1600. bench_case(1, 32, 64, 200, 200, 3, 1);
  1601. bench_case(1, 32, 64, 128, 128, 3, 1);
  1602. bench_case(1, 32, 64, 100, 100, 3, 1);
  1603. bench_case(1, 32, 64, 80, 80, 3, 1);
  1604. bench_case(1, 32, 128, 200, 200, 3, 1);
  1605. bench_case(1, 32, 128, 128, 128, 3, 1);
  1606. bench_case(1, 32, 128, 100, 100, 3, 1);
  1607. bench_case(1, 32, 128, 80, 80, 3, 1);
  1608. bench_case(1, 64, 32, 7, 7, 3, 1);
  1609. bench_case(1, 64, 64, 7, 7, 3, 1);
  1610. bench_case(1, 64, 128, 7, 7, 3, 1);
  1611. bench_case(1, 64, 256, 7, 7, 3, 1);
  1612. bench_case(1, 64, 512, 7, 7, 3, 1);
  1613. bench_case(1, 64, 1024, 7, 7, 3, 1);
  1614. bench_case(1, 64, 32, 14, 14, 3, 1);
  1615. bench_case(1, 64, 64, 14, 14, 3, 1);
  1616. bench_case(1, 64, 128, 14, 14, 3, 1);
  1617. bench_case(1, 64, 256, 14, 14, 3, 1);
  1618. bench_case(1, 64, 512, 14, 14, 3, 1);
  1619. bench_case(1, 64, 1024, 14, 14, 3, 1);
  1620. bench_case(1, 128, 128, 14, 14, 3, 1);
  1621. bench_case(1, 128, 256, 14, 14, 3, 1);
  1622. bench_case(1, 512, 512, 14, 14, 3, 1);
  1623. bench_case(1, 256, 512, 14, 14, 3, 1);
  1624. bench_case(1, 512, 1024, 14, 14, 3, 1);
  1625. bench_case(1, 1024, 1024, 14, 14, 3, 1);
  1626. std::string algo_name = "IM2COLMATMUL:AARCH64_F32K8X12X1:96";
  1627. printf("Benchmark IM2COLMATMUL:AARCH64_F32K8X12X1algo:96\n");
  1628. benchmark_impl(
  1629. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  1630. data_type);
  1631. benchmark_impl(
  1632. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  1633. data_type);
  1634. benchmark_impl(
  1635. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  1636. data_type);
  1637. algo_name = "IM2COLMATMUL:AARCH64_F32K8X12X1:192";
1638. printf("Benchmark IM2COLMATMUL:AARCH64_F32K8X12X1:192 algo\n");
  1639. benchmark_impl(
  1640. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  1641. data_type);
  1642. benchmark_impl(
  1643. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  1644. data_type);
  1645. benchmark_impl(
  1646. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  1647. data_type);
  1648. algo_name = "IM2COLMATMUL:AARCH64_F32K8X12X1:384";
1649. printf("Benchmark IM2COLMATMUL:AARCH64_F32K8X12X1:384 algo\n");
  1650. benchmark_impl(
  1651. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  1652. data_type);
  1653. benchmark_impl(
  1654. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  1655. data_type);
  1656. benchmark_impl(
  1657. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  1658. data_type);
  1659. shapes_and_computation.clear();
  1660. }
  1661. TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS,
  1662. BENCHMARK_CHANNEL_WISE_INT8_INT8_INT8_STRIDE1) {
  1663. constexpr size_t RUNS = 50;
  1664. param::ConvBias param;
  1665. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  1666. param.pad_h = 1;
  1667. param.pad_w = 1;
  1668. param.stride_h = 1;
  1669. param.stride_w = 1;
  1670. param.sparse = param::ConvBias::Sparse::GROUP;
  1671. param.format = param::ConvBias::Format::NCHW44;
  1672. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  1673. auto bench_case = [&](size_t N, size_t IC, size_t H, size_t W, size_t FS,
  1674. size_t P) {
  1675. size_t group = IC;
  1676. size_t OC = IC;
  1677. size_t S = 1;
  1678. SmallVector<TensorShape> shapes{
  1679. {N, IC, H, W, 4},
  1680. {group, 1, 1, FS, FS, 4},
  1681. {1, OC, 1, 1, 4},
  1682. {},
  1683. {N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1, 4}};
  1684. TensorShape dst{N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1, 4};
  1685. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  1686. dst.total_nr_elems()) *
  1687. 1e-6;
  1688. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  1689. };
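// group == IC == OC makes this a channel-wise (depthwise) convolution; with the
// NCHW44 format, IC here counts blocks of 4 channels, matching the trailing 4 in
// every tensor shape above.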
  1690. bench_case(1, 128, 200, 200, 3, 1);
  1691. bench_case(1, 128, 128, 128, 3, 1);
  1692. bench_case(1, 128, 100, 100, 3, 1);
  1693. bench_case(1, 128, 80, 80, 3, 1);
  1694. bench_case(1, 128, 56, 56, 3, 1);
  1695. bench_case(1, 128, 28, 28, 3, 1);
  1696. bench_case(1, 128, 14, 14, 3, 1);
  1697. bench_case(1, 64, 200, 200, 3, 1);
  1698. bench_case(1, 64, 128, 128, 3, 1);
  1699. bench_case(1, 64, 100, 100, 3, 1);
  1700. bench_case(1, 64, 80, 80, 3, 1);
  1701. bench_case(1, 64, 56, 56, 3, 1);
  1702. bench_case(1, 64, 28, 28, 3, 1);
  1703. bench_case(1, 64, 14, 14, 3, 1);
  1704. bench_case(1, 32, 200, 200, 3, 1);
  1705. bench_case(1, 32, 128, 128, 3, 1);
  1706. bench_case(1, 32, 100, 100, 3, 1);
  1707. bench_case(1, 32, 80, 80, 3, 1);
  1708. bench_case(1, 32, 56, 56, 3, 1);
  1709. bench_case(1, 32, 28, 28, 3, 1);
  1710. bench_case(1, 32, 14, 14, 3, 1);
  1711. std::string algo_name = "S8_CHAN_WISE_STRD1_NCHW44";
  1712. printf("Benchmarker S8_CHAN_WISE_STRD1_NCHW44 algo\n");
  1713. std::vector<DType> data_type = {
  1714. dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f),
  1715. dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f)};
  1716. benchmark_impl(
  1717. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  1718. data_type);
  1719. benchmark_impl(
  1720. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  1721. data_type);
  1722. benchmark_impl(
  1723. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  1724. data_type);
  1725. }
  1726. TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS,
  1727. BENCHMARK_CHANNEL_WISE_INT8_INT8_INT16_STRIDE1) {
  1728. constexpr size_t RUNS = 50;
  1729. param::ConvBias param;
  1730. param.nonlineMode = param::ConvBias::NonlineMode::IDENTITY;
  1731. param.pad_h = 1;
  1732. param.pad_w = 1;
  1733. param.stride_h = 1;
  1734. param.stride_w = 1;
  1735. param.sparse = param::ConvBias::Sparse::GROUP;
  1736. param.format = param::ConvBias::Format::NCHW44;
  1737. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  1738. auto bench_case = [&](size_t N, size_t IC, size_t H, size_t W, size_t FS,
  1739. size_t P) {
  1740. size_t group = IC;
  1741. size_t OC = IC;
  1742. size_t S = 1;
  1743. SmallVector<TensorShape> shapes{
  1744. {N, IC, H, W, 4},
  1745. {group, 1, 1, FS, FS, 4},
  1746. {1, OC, 1, 1, 4},
  1747. {},
  1748. {N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1, 4}};
  1749. TensorShape dst{N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1, 4};
  1750. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  1751. dst.total_nr_elems()) *
  1752. 1e-6;
  1753. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  1754. };
  1755. bench_case(1, 128, 200, 200, 3, 1);
  1756. bench_case(1, 128, 128, 128, 3, 1);
  1757. bench_case(1, 128, 100, 100, 3, 1);
  1758. bench_case(1, 128, 80, 80, 3, 1);
  1759. bench_case(1, 128, 56, 56, 3, 1);
  1760. bench_case(1, 128, 28, 28, 3, 1);
  1761. bench_case(1, 128, 14, 14, 3, 1);
  1762. bench_case(1, 64, 200, 200, 3, 1);
  1763. bench_case(1, 64, 128, 128, 3, 1);
  1764. bench_case(1, 64, 100, 100, 3, 1);
  1765. bench_case(1, 64, 80, 80, 3, 1);
  1766. bench_case(1, 64, 56, 56, 3, 1);
  1767. bench_case(1, 64, 28, 28, 3, 1);
  1768. bench_case(1, 64, 14, 14, 3, 1);
  1769. bench_case(1, 32, 200, 200, 3, 1);
  1770. bench_case(1, 32, 128, 128, 3, 1);
  1771. bench_case(1, 32, 100, 100, 3, 1);
  1772. bench_case(1, 32, 80, 80, 3, 1);
  1773. bench_case(1, 32, 56, 56, 3, 1);
  1774. bench_case(1, 32, 28, 28, 3, 1);
  1775. bench_case(1, 32, 14, 14, 3, 1);
  1776. std::string algo_name = "S8x8x16_CHAN_WISE_STRD1_STRD2_NCHW44";
  1777. printf("Benchmarker S8x8x16_CHAN_WISE_STRD1_STRD2_NCHW44 algo\n");
  1778. std::vector<DType> data_type = {
  1779. dtype::Int8(), dtype::Int8(), dtype::Int16(), dtype::Int16()};
  1780. benchmark_impl(
  1781. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  1782. data_type);
  1783. benchmark_impl(
  1784. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  1785. data_type);
  1786. benchmark_impl(
  1787. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  1788. data_type);
  1789. }
  1790. TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_IM2COL_NCHW44_INT8x8x32_STRIDE1) {
  1791. constexpr size_t RUNS = 50;
  1792. param::ConvBias param;
  1793. param.nonlineMode = param::ConvBias::NonlineMode::IDENTITY;
  1794. param.pad_h = 1;
  1795. param.pad_w = 1;
  1796. param.stride_h = 1;
  1797. param.stride_w = 1;
  1798. param.sparse = param::ConvBias::Sparse::DENSE;
  1799. param.format = param::ConvBias::Format::NCHW44;
  1800. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  1801. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  1802. size_t group = 1) {
  1803. SmallVector<TensorShape> shapes{
  1804. {N, IC, H, W, 4},
  1805. {OC, IC / group, FS, FS, 4, 4},
  1806. {/*1, OC, 1, 1*/},
  1807. {},
  1808. {N, OC, H, W, 4}};
  1809. TensorShape dst{N, OC, H, W, 4};
  1810. float computations = ((4 * IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  1811. dst.total_nr_elems()) *
  1812. 1e-6;
  1813. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  1814. };
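// The empty third shape and the empty fourth DType below mean the bias tensor is
// omitted for this int8x8x32 benchmark.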
  1815. bench_case(1, 32, 32, 300, 300, 3, 1);
  1816. bench_case(1, 32, 32, 400, 400, 3, 1);
  1817. bench_case(1, 32, 32, 100, 100, 3, 1);
  1818. bench_case(1, 32, 32, 80, 80, 3, 1);
  1819. bench_case(1, 32, 64, 200, 200, 3, 1);
  1820. bench_case(1, 32, 64, 128, 128, 3, 1);
  1821. bench_case(1, 32, 64, 100, 100, 3, 1);
  1822. bench_case(1, 32, 64, 80, 80, 3, 1);
  1823. bench_case(1, 32, 128, 200, 200, 3, 1);
  1824. bench_case(1, 32, 128, 128, 128, 3, 1);
  1825. bench_case(1, 32, 128, 100, 100, 3, 1);
  1826. bench_case(1, 32, 128, 80, 80, 3, 1);
  1827. #if 1
  1828. bench_case(1, 64, 32, 7, 7, 3, 1);
  1829. bench_case(1, 64, 64, 7, 7, 3, 1);
  1830. bench_case(1, 64, 128, 7, 7, 3, 1);
  1831. bench_case(1, 64, 256, 7, 7, 3, 1);
  1832. bench_case(1, 64, 512, 7, 7, 3, 1);
  1833. bench_case(1, 64, 1024, 7, 7, 3, 1);
  1834. bench_case(1, 64, 32, 14, 14, 3, 1);
  1835. bench_case(1, 64, 64, 14, 14, 3, 1);
  1836. bench_case(1, 64, 128, 14, 14, 3, 1);
  1837. bench_case(1, 64, 256, 14, 14, 3, 1);
  1838. bench_case(1, 64, 512, 14, 14, 3, 1);
  1839. bench_case(1, 64, 1024, 14, 14, 3, 1);
  1840. bench_case(1, 128, 128, 14, 14, 3, 1);
  1841. bench_case(1, 128, 256, 14, 14, 3, 1);
  1842. bench_case(1, 512, 512, 14, 14, 3, 1);
  1843. bench_case(1, 256, 512, 14, 14, 3, 1);
  1844. bench_case(1, 512, 1024, 14, 14, 3, 1);
  1845. bench_case(1, 1024, 1024, 14, 14, 3, 1);
  1846. #endif
  1847. std::string algo_name = "IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16:96";
  1848. printf("Benchmarker IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16:96 algo\n");
  1849. std::vector<DType> data_type = {
  1850. dtype::QuantizedS8(2.5f),
  1851. dtype::QuantizedS8(2.5f),
  1852. dtype::QuantizedS32(6.25f),
  1853. {}};
  1854. benchmark_impl(
  1855. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  1856. data_type);
  1857. benchmark_impl(
  1858. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  1859. data_type);
  1860. benchmark_impl(
  1861. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  1862. data_type);
  1863. algo_name = "IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16:192";
  1864. printf("Benchmarker IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16:192 "
  1865. "algo\n");
  1866. benchmark_impl(
  1867. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  1868. data_type);
  1869. benchmark_impl(
  1870. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  1871. data_type);
  1872. benchmark_impl(
  1873. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  1874. data_type);
  1875. algo_name = "IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16:384";
  1876. printf("Benchmarker IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16:384 "
  1877. "algo\n");
  1878. benchmark_impl(
  1879. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  1880. data_type);
  1881. benchmark_impl(
  1882. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  1883. data_type);
  1884. benchmark_impl(
  1885. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  1886. data_type);
  1887. }
  1888. #endif
  1889. /*================== BENCHMARK MULTITHREAD CONV1X1 =====================*/
  1890. #if MEGDNN_WITH_BENCHMARK
  1891. namespace {
  1892. std::vector<std::pair<SmallVector<TensorShape>, float>>
  1893. get_conv1x1_multithread_benchmark_args() {
  1894. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  1895. auto bench_case = [&](size_t IC, size_t OC, size_t H, size_t W) {
  1896. SmallVector<TensorShape> shapes{
  1897. {1, IC, H, W}, {OC, IC, 1, 1}, {1, OC, 1, 1}, {}, {1, OC, H, W}};
  1898. TensorShape dst{1, OC, H, W};
  1899. float computations =
  1900. (IC * dst.total_nr_elems() * 2 + dst.total_nr_elems()) * 1e-6;
  1901. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  1902. };
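// For 1x1 convolution the filter-window factor disappears, so the MFLOPs estimate
// reduces to IC * output elements * 2 plus the bias add.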
  1903. bench_case(32, 32, 300, 300);
  1904. bench_case(32, 32, 400, 400);
  1905. bench_case(32, 32, 100, 100);
  1906. bench_case(32, 32, 80, 80);
  1907. bench_case(32, 64, 200, 200);
  1908. bench_case(32, 64, 128, 128);
  1909. bench_case(32, 64, 100, 100);
  1910. bench_case(32, 64, 80, 80);
  1911. bench_case(32, 128, 200, 200);
  1912. bench_case(32, 128, 128, 128);
  1913. bench_case(32, 128, 100, 100);
  1914. bench_case(32, 128, 80, 80);
  1915. bench_case(64, 32, 7, 7);
  1916. bench_case(64, 64, 7, 7);
  1917. bench_case(64, 128, 7, 7);
  1918. bench_case(64, 256, 7, 7);
  1919. bench_case(64, 512, 7, 7);
  1920. bench_case(64, 1024, 7, 7);
  1921. bench_case(64, 32, 14, 14);
  1922. bench_case(64, 64, 14, 14);
  1923. bench_case(64, 128, 14, 14);
  1924. bench_case(64, 256, 14, 14);
  1925. bench_case(64, 512, 14, 14);
  1926. bench_case(64, 1024, 14, 14);
  1927. bench_case(128, 128, 14, 14);
  1928. bench_case(128, 256, 14, 14);
  1929. bench_case(512, 512, 14, 14);
  1930. bench_case(256, 512, 14, 14);
  1931. bench_case(512, 1024, 14, 14);
  1932. bench_case(1024, 1024, 14, 14);
  1933. return shapes_and_computation;
  1934. }
  1935. void conv1x1_multithread_benchmark(
  1936. const char* algo_name, DType stype, DType ftype, DType btype, DType dtype) {
  1937. constexpr size_t RUNS = 50;
  1938. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation =
  1939. get_conv1x1_multithread_benchmark_args();
  1940. std::vector<DType> data_type = {stype, ftype, btype, dtype};
  1941. param::ConvBias param;
  1942. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  1943. param.pad_h = 0;
  1944. param.pad_w = 0;
  1945. param.stride_h = 1;
  1946. param.stride_w = 1;
  1947. benchmark_impl(
  1948. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  1949. data_type);
  1950. benchmark_impl(
  1951. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  1952. data_type);
  1953. benchmark_impl(
  1954. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  1955. data_type);
  1956. shapes_and_computation.clear();
  1957. }
  1958. } // namespace
  1959. TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_CONV1X1_S1_FP32) {
  1960. #if MEGDNN_AARCH64
  1961. conv1x1_multithread_benchmark(
  1962. "CONV1x1:AARCH64_F32K8X12X1:8", dtype::Float32(), dtype::Float32(),
  1963. dtype::Float32(), dtype::Float32());
  1964. #else
  1965. conv1x1_multithread_benchmark(
  1966. "CONV1x1:ARMV7_F32:8", dtype::Float32(), dtype::Float32(), dtype::Float32(),
  1967. dtype::Float32());
  1968. #endif
  1969. }
  1970. TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS,
  1971. BENCHMARK_CONVBIAS_CONV1X1_S1_QUANTIZEDASYM) {
  1972. dtype::Quantized8Asymm stype(0.2f, 100);
  1973. dtype::Quantized8Asymm ftype(0.2f, 120);
  1974. dtype::QuantizedS32 btype(0.04f);
  1975. dtype::Quantized8Asymm dtype(1.4f, 110);
  1976. #if MEGDNN_AARCH64
  1977. #if MGB_ENABLE_DOT
  1978. conv1x1_multithread_benchmark(
  1979. "CONV1x1:AARCH64_QUINT8_K8X8X4_DOTPROD:8", stype, ftype, btype, dtype);
  1980. #else
  1981. conv1x1_multithread_benchmark(
  1982. "CONV1x1:AARCH64_QUINT8_K8X8X8:8", stype, ftype, btype, dtype);
  1983. #endif
  1984. #else
  1985. conv1x1_multithread_benchmark(
  1986. "CONV1x1:ARMV7_QUINT8_K4X8X8:8", stype, ftype, btype, dtype);
  1987. #endif
  1988. }
  1989. #endif
  1990. // vim: syntax=cpp.doxygen
