
conv_bias_multi_thread_benchmark.cpp

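// Multi-thread vs. single-thread benchmarks for ARM ConvBias algorithms
// (FP16 direct and stride-1, NCHW88/NCHW44 layouts, int8x8x16, quantized
// int8/quint8 with and without dot-product, and Winograd F32). Each TEST_F
// below builds a list of shapes and calls benchmark_impl() to compare the
// two thread configurations.
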
  1. #include "test/arm_common/fixture.h"
  2. #include "test/common/benchmarker.h"
  3. #include "test/common/conv_bias.h"
  4. using namespace megdnn;
  5. using namespace test;
  6. using namespace conv_bias;
  7. #if MEGDNN_WITH_BENCHMARK
  8. namespace {
  9. void benchmark_impl(
  10. const param::ConvBias param,
  11. std::vector<std::pair<SmallVector<TensorShape>, float>>& shapes_and_computation,
  12. const std::string algo_name, size_t RUNS,
  13. TaskExecutorConfig&& multi_thread_config,
  14. TaskExecutorConfig&& single_thread_config, std::vector<DType>& data_type) {
  15. std::vector<float> multi_thread_times, single_thread_times;
  16. {
  17. auto multi_thread_hanle = create_cpu_handle(0, true, &multi_thread_config);
  18. auto benchmarker = Benchmarker<ConvBias>(multi_thread_hanle.get());
  19. benchmarker.set_times(RUNS)
  20. .set_display(false)
  21. .set_param(param)
  22. .set_dtype(0, data_type[0])
  23. .set_dtype(1, data_type[1])
  24. .set_dtype(2, data_type[2])
  25. .set_dtype(4, data_type[3])
  26. .set_before_exec_callback(
  27. conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name.c_str()));
  28. for (auto shape : shapes_and_computation) {
  29. multi_thread_times.push_back(benchmarker.exec(shape.first) / RUNS);
  30. }
  31. }
  32. {
  33. auto single_thread_handle = create_cpu_handle(0, true, &single_thread_config);
  34. auto benchmarker = Benchmarker<ConvBias>(single_thread_handle.get());
  35. benchmarker.set_times(RUNS)
  36. .set_display(false)
  37. .set_param(param)
  38. .set_dtype(0, data_type[0])
  39. .set_dtype(1, data_type[1])
  40. .set_dtype(2, data_type[2])
  41. .set_dtype(4, data_type[3])
  42. .set_before_exec_callback(
  43. conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name.c_str()));
  44. for (auto shape : shapes_and_computation) {
  45. single_thread_times.push_back(benchmarker.exec(shape.first) / RUNS);
  46. }
  47. }
  48. printf("Benchmark : Multi threads %zu, ", multi_thread_config.nr_thread);
  49. printf("core_ids:");
  50. for (size_t i = 0; i < multi_thread_config.affinity_core_set.size(); i++) {
  51. printf("%zu ", multi_thread_config.affinity_core_set[i]);
  52. }
  53. printf(", Single thread core_id %zu\n", single_thread_config.affinity_core_set[0]);
  54. for (size_t i = 0; i < shapes_and_computation.size(); i++) {
  55. auto shapes = shapes_and_computation[i];
  56. printf("Bench case: ");
  57. for (auto&& shape : shapes.first) {
  58. printf("%s ", shape.to_string().c_str());
  59. }
  60. float computations = shapes.second;
  61. printf("%zu threads gflops: %f,\n single thread gflops: "
  62. "%f. spead up = %f, speedup/cores=%f\n",
  63. multi_thread_config.nr_thread, computations / multi_thread_times[i],
  64. computations / single_thread_times[i],
  65. single_thread_times[i] / multi_thread_times[i],
  66. single_thread_times[i] / multi_thread_times[i] /
  67. multi_thread_config.nr_thread);
  68. }
  69. }
  70. } // namespace
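
// The FP16 benchmarks below are only compiled when the target supports FP16
// vector arithmetic (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC).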
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_DIRECTF16) {
    constexpr size_t RUNS = 50;
    param::ConvBias param;
    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
    param.pad_h = 1;
    param.pad_w = 1;
    param.stride_h = 1;
    param.stride_w = 1;
    param.sparse = param::ConvBias::Sparse::GROUP;
    std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
                          size_t group) {
        SmallVector<TensorShape> shapes{
                {N, IC, H, W},
                {group, OC / group, IC / group, FS, FS},
                {1, OC, 1, 1},
                {},
                {N, OC, H, W}};
        TensorShape dst{N, OC, H, W};
        float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
                              dst.total_nr_elems()) *
                             1e-6;
        shapes_and_computation.push_back(std::make_pair(shapes, computations));
    };
    bench_case(1, 32, 32, 200, 200, 3, 4);
    bench_case(1, 32, 32, 200, 200, 3, 32);
    bench_case(1, 32, 32, 128, 128, 3, 4);
    bench_case(1, 32, 32, 128, 128, 3, 32);
    bench_case(1, 32, 32, 100, 100, 3, 4);
    bench_case(1, 32, 32, 100, 100, 3, 32);
    bench_case(1, 32, 32, 80, 80, 3, 4);
    bench_case(1, 32, 32, 80, 80, 3, 32);
    std::string algo_name = "F16DIRECT";
    printf("Benchmark F16DIRECT_LARGE_GROUP algo\n");
    std::vector<DType> data_type = {
            dtype::Float16(), dtype::Float16(), dtype::Float16(), dtype::Float16()};
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
    shapes_and_computation.clear();
    algo_name = "F16DIRECT";
    printf("Benchmark F16DIRECT_SMALL_GROUP algo\n");
    bench_case(1, 32, 32, 200, 200, 3, 1);
    bench_case(1, 32, 32, 128, 128, 3, 1);
    bench_case(1, 32, 32, 100, 100, 3, 1);
    bench_case(1, 32, 32, 80, 80, 3, 1);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
}
TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_DIRECTF16_STR1) {
    constexpr size_t RUNS = 50;
    param::ConvBias param;
    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
    param.pad_h = 1;
    param.pad_w = 1;
    param.stride_h = 1;
    param.stride_w = 1;
    param.sparse = param::ConvBias::Sparse::GROUP;
    std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
                          size_t group) {
        SmallVector<TensorShape> shapes{
                {N, IC, H, W},
                {group, OC / group, IC / group, FS, FS},
                {1, OC, 1, 1},
                {},
                {N, OC, H, W}};
        TensorShape dst{N, OC, H, W};
        float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
                              dst.total_nr_elems()) *
                             1e-6;
        shapes_and_computation.push_back(std::make_pair(shapes, computations));
    };
    bench_case(1, 32, 32, 200, 200, 3, 4);
    bench_case(1, 32, 32, 200, 200, 3, 32);
    bench_case(1, 32, 32, 128, 128, 3, 4);
    bench_case(1, 32, 32, 128, 128, 3, 32);
    bench_case(1, 32, 32, 100, 100, 3, 4);
    bench_case(1, 32, 32, 100, 100, 3, 32);
    bench_case(1, 32, 32, 80, 80, 3, 4);
    bench_case(1, 32, 32, 80, 80, 3, 32);
    std::string algo_name = "F16STRD1";
    printf("Benchmark F16STRD1_LARGE_GROUP algo\n");
    std::vector<DType> data_type = {
            dtype::Float16(), dtype::Float16(), dtype::Float16(), dtype::Float16()};
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
    shapes_and_computation.clear();
    algo_name = "F16STRD1";
    printf("Benchmark F16STRD1_SMALL_GROUP algo\n");
    bench_case(1, 32, 32, 200, 200, 3, 1);
    bench_case(1, 32, 32, 128, 128, 3, 1);
    bench_case(1, 32, 32, 100, 100, 3, 1);
    bench_case(1, 32, 32, 80, 80, 3, 1);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
}
TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CHANNEL_WISE_FP16_NCHW88) {
    constexpr size_t RUNS = 50;
    std::string algo_name = "F16_CHANNEL_WISE_NCHW88";
    printf("Benchmark F16_CHANNEL_WISE_NCHW88 algo\n");
    std::vector<DType> data_type = {
            dtype::Float16(), dtype::Float16(), dtype::Float16(), dtype::Float16()};
    auto bench_case = [&](size_t N, size_t IC, size_t H, size_t W, size_t FS, size_t P,
                          size_t S) {
        param::ConvBias param;
        param.nonlineMode = param::ConvBias::NonlineMode::RELU;
        param.pad_h = P;
        param.pad_w = P;
        param.stride_h = S;
        param.stride_w = S;
        param.sparse = param::ConvBias::Sparse::GROUP;
        param.format = param::ConvBias::Format::NCHW88;
        size_t group = IC;
        size_t OC = IC;
        SmallVector<TensorShape> shapes{
                {N, IC, H, W, 8},
                {group, 1, 1, FS, FS, 8},
                {1, OC, 1, 1, 8},
                {},
                {N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1, 8}};
        TensorShape dst{N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1, 8};
        float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
                              dst.total_nr_elems()) *
                             1e-6;
        std::vector<std::pair<SmallVector<TensorShape>, float>> shape_arg = {
                std::make_pair(shapes, computations)};
        benchmark_impl(
                param, shape_arg, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
                data_type);
    };
    bench_case(1, 64, 100, 100, 5, 2, 1);
    bench_case(1, 64, 56, 56, 5, 2, 1);
    bench_case(1, 64, 28, 28, 5, 2, 1);
    bench_case(1, 64, 100, 100, 5, 2, 2);
    bench_case(1, 64, 56, 56, 5, 2, 2);
    bench_case(1, 64, 28, 28, 5, 2, 2);
    bench_case(1, 64, 100, 100, 3, 1, 1);
    bench_case(1, 64, 56, 56, 3, 1, 1);
    bench_case(1, 64, 28, 28, 3, 1, 1);
    bench_case(1, 64, 100, 100, 3, 1, 2);
    bench_case(1, 64, 56, 56, 3, 1, 2);
    bench_case(1, 64, 28, 28, 3, 1, 2);
    bench_case(1, 64, 100, 100, 2, 0, 1);
    bench_case(1, 64, 56, 56, 2, 0, 1);
    bench_case(1, 64, 28, 28, 2, 0, 1);
    bench_case(1, 64, 100, 100, 2, 0, 2);
    bench_case(1, 64, 56, 56, 2, 0, 2);
    bench_case(1, 64, 28, 28, 2, 0, 2);
}
TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_FP16_NCHW88) {
    constexpr size_t RUNS = 40;
    std::vector<DType> data_type = {
            dtype::Float16(), dtype::Float16(), dtype::Float16(), dtype::Float16()};
    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
                          size_t group, size_t P, size_t S) {
        param::ConvBias param;
        param.nonlineMode = param::ConvBias::NonlineMode::RELU;
        param.pad_h = P;
        param.pad_w = P;
        param.stride_h = S;
        param.stride_w = S;
        param.sparse = param::ConvBias::Sparse::DENSE;
        param.format = param::ConvBias::Format::NCHW88;
        auto OH = (H + 2 * P - FS) / static_cast<size_t>(S) + 1;
        auto OW = (W + 2 * P - FS) / static_cast<size_t>(S) + 1;
        TensorShape src = {N, IC / 8, H, W, 8};
        TensorShape filter = {OC / 8, IC / 8, FS, FS, 8, 8};
        if (group > 1) {
            filter = {group, OC / group / 8, IC / group / 8, FS, FS, 8, 8};
            param.sparse = param::ConvBias::Sparse::GROUP;
        }
        TensorShape bias = {1, OC / 8, 1, 1, 8};
        TensorShape dst = {N, OC / 8, OH, OW, 8};
        SmallVector<TensorShape> shapes{src, filter, bias, {}, dst};
        float computations = (((IC / group) * FS * FS + 1) * dst.total_nr_elems() * 2 +
                              dst.total_nr_elems()) *
                             1e-6;
        std::vector<std::pair<SmallVector<TensorShape>, float>> shape_arg = {
                std::make_pair(shapes, computations)};
        benchmark_impl(
                param, shape_arg, ".+", RUNS, {4, {4, 5, 6, 7}}, {1, {7}}, data_type);
    };
    bench_case(1, 64, 64, 28, 28, 3, 1, 1, 1);
    bench_case(1, 64, 64, 28, 28, 5, 1, 2, 1);
    bench_case(1, 64, 64, 28, 28, 7, 1, 3, 1);
    bench_case(1, 64, 64, 28, 28, 3, 1, 1, 2);
    bench_case(1, 64, 64, 28, 28, 5, 1, 2, 2);
    bench_case(1, 64, 64, 28, 28, 7, 1, 3, 2);
    bench_case(1, 64, 64, 28, 28, 3, 2, 1, 1);
    bench_case(1, 64, 64, 28, 28, 3, 4, 1, 1);
    bench_case(1, 64, 64, 28, 28, 3, 8, 1, 1);
    bench_case(1, 16, 16, 28, 28, 3, 1, 1, 1);
    bench_case(1, 32, 32, 28, 28, 3, 1, 1, 1);
    bench_case(1, 128, 128, 28, 28, 3, 1, 1, 1);
    bench_case(1, 256, 256, 28, 28, 3, 1, 1, 1);
    bench_case(1, 64, 64, 7, 7, 3, 1, 1, 1);
    bench_case(1, 64, 64, 14, 14, 3, 1, 1, 1);
    bench_case(1, 64, 64, 56, 56, 3, 1, 1, 1);
    bench_case(1, 64, 64, 112, 112, 3, 1, 1, 1);
}
#endif
TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_DIRECT_INT8x8x16) {
    constexpr size_t RUNS = 50;
    param::ConvBias param;
    param.nonlineMode = param::ConvBias::NonlineMode::IDENTITY;
    param.pad_h = 1;
    param.pad_w = 1;
    param.stride_h = 1;
    param.stride_w = 1;
    param.sparse = param::ConvBias::Sparse::GROUP;
    std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
                          size_t group) {
        SmallVector<TensorShape> shapes{
                {N, IC, H, W},
                {group, OC / group, IC / group, FS, FS},
                {},
                {},
                {N, OC, H, W}};
        TensorShape dst{N, OC, H, W};
        float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
                              dst.total_nr_elems()) *
                             1e-6;
        shapes_and_computation.push_back(std::make_pair(shapes, computations));
    };
    bench_case(1, 32, 32, 200, 200, 3, 4);
    bench_case(1, 32, 32, 200, 200, 3, 32);
    bench_case(1, 32, 32, 128, 128, 3, 4);
    bench_case(1, 32, 32, 128, 128, 3, 32);
    bench_case(1, 32, 32, 100, 100, 3, 4);
    bench_case(1, 32, 32, 100, 100, 3, 32);
    bench_case(1, 32, 32, 80, 80, 3, 4);
    bench_case(1, 32, 32, 80, 80, 3, 32);
    std::string algo_name = "I8816DIRECT";
    printf("Benchmark I8816DIRECT_LARGE_GROUP algo\n");
    std::vector<DType> data_type = {
            dtype::Int8(), dtype::Int8(), dtype::Int16(), dtype::Int16()};
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
    shapes_and_computation.clear();
    algo_name = "I8816DIRECT";
    printf("Benchmark I8816DIRECT_SMALL_GROUP algo\n");
    bench_case(1, 32, 32, 200, 200, 3, 1);
    bench_case(1, 32, 32, 128, 128, 3, 1);
    bench_case(1, 32, 32, 100, 100, 3, 1);
    bench_case(1, 32, 32, 80, 80, 3, 1);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
}
TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_DIRECT_INT8x8x16_STR2) {
    constexpr size_t RUNS = 50;
    param::ConvBias param;
    param.nonlineMode = param::ConvBias::NonlineMode::IDENTITY;
    param.pad_h = 1;
    param.pad_w = 1;
    param.stride_h = 2;
    param.stride_w = 2;
    param.sparse = param::ConvBias::Sparse::GROUP;
    std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
                          size_t group, size_t P, size_t S) {
        SmallVector<TensorShape> shapes{
                {N, IC, H, W},
                {group, OC / group, IC / group, FS, FS},
                {},
                {},
                {N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1}};
        TensorShape dst{N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1};
        float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
                              dst.total_nr_elems()) *
                             1e-6;
        shapes_and_computation.push_back(std::make_pair(shapes, computations));
    };
    bench_case(1, 32, 32, 200, 200, 3, 4, 1, 2);
    bench_case(1, 32, 32, 200, 200, 3, 32, 1, 2);
    bench_case(1, 32, 32, 128, 128, 3, 4, 1, 2);
    bench_case(1, 32, 32, 128, 128, 3, 32, 1, 2);
    bench_case(1, 32, 32, 100, 100, 3, 4, 1, 2);
    bench_case(1, 32, 32, 100, 100, 3, 32, 1, 2);
    bench_case(1, 32, 32, 80, 80, 3, 4, 1, 2);
    bench_case(1, 32, 32, 80, 80, 3, 32, 1, 2);
    std::string algo_name = "I8816STRD2";
    printf("Benchmark I8816STRD2_LARGE_GROUP algo\n");
    std::vector<DType> data_type = {
            dtype::Int8(), dtype::Int8(), dtype::Int16(), dtype::Int16()};
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
    shapes_and_computation.clear();
    algo_name = "I8816STRD2";
    printf("Benchmark I8816STRD2_SMALL_GROUP algo\n");
    bench_case(1, 32, 32, 200, 200, 3, 1, 1, 2);
    bench_case(1, 32, 32, 128, 128, 3, 1, 1, 2);
    bench_case(1, 32, 32, 100, 100, 3, 1, 1, 2);
    bench_case(1, 32, 32, 80, 80, 3, 1, 1, 2);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
}
TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_INT8_INT8_INT8_STRIDE1) {
    constexpr size_t RUNS = 50;
    param::ConvBias param;
    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
    param.pad_h = 1;
    param.pad_w = 1;
    param.stride_h = 1;
    param.stride_w = 1;
    param.sparse = param::ConvBias::Sparse::GROUP;
    std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
                          size_t group, size_t P, size_t S) {
        SmallVector<TensorShape> shapes{
                {N, IC, H, W},
                {group, OC / group, IC / group, FS, FS},
                {1, OC, 1, 1},
                {},
                {N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1}};
        TensorShape dst{N, OC, H, W};
        float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
                              dst.total_nr_elems()) *
                             1e-6;
        shapes_and_computation.push_back(std::make_pair(shapes, computations));
    };
    bench_case(1, 32, 32, 200, 200, 3, 4, 1, 1);
    bench_case(1, 32, 32, 200, 200, 3, 32, 1, 1);
    bench_case(1, 32, 32, 128, 128, 3, 4, 1, 1);
    bench_case(1, 32, 32, 128, 128, 3, 32, 1, 1);
    bench_case(1, 32, 32, 100, 100, 3, 4, 1, 1);
    bench_case(1, 32, 32, 100, 100, 3, 32, 1, 1);
    bench_case(1, 32, 32, 80, 80, 3, 4, 1, 1);
    bench_case(1, 32, 32, 80, 80, 3, 32, 1, 1);
    std::string algo_name = "S8STRD1";
    printf("Benchmark S8STRD1_LARGE_GROUP algo\n");
    std::vector<DType> data_type = {
            dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f),
            dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f)};
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
    shapes_and_computation.clear();
    algo_name = "S8STRD1";
    printf("Benchmark S8STRD1_SMALL_GROUP algo\n");
    bench_case(1, 32, 32, 200, 200, 3, 1, 1, 1);
    bench_case(1, 32, 32, 128, 128, 3, 1, 1, 1);
    bench_case(1, 32, 32, 100, 100, 3, 1, 1, 1);
    bench_case(1, 32, 32, 80, 80, 3, 1, 1, 1);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
}
TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_INT8_NCHW44) {
    constexpr size_t RUNS = 40;
    std::vector<DType> data_type = {
            dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f),
            dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f)};
    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
                          size_t group, size_t P, size_t S, bool is_nchw = false) {
        param::ConvBias param;
        param.nonlineMode = param::ConvBias::NonlineMode::RELU;
        param.pad_h = P;
        param.pad_w = P;
        param.stride_h = S;
        param.stride_w = S;
        param.sparse = param::ConvBias::Sparse::DENSE;
        param.format = param::ConvBias::Format::NCHW44;
        auto OH = (H + 2 * P - FS) / static_cast<size_t>(S) + 1;
        auto OW = (W + 2 * P - FS) / static_cast<size_t>(S) + 1;
        TensorShape src = {N, IC / 4, H, W, 4};
        TensorShape filter = {OC / 4, IC / 4, FS, FS, 4, 4};
        if (group > 1) {
            filter = {group, OC / group / 4, IC / group / 4, FS, FS, 4, 4};
            param.sparse = param::ConvBias::Sparse::GROUP;
        }
        if (is_nchw) {
            src = {N, IC, H, W};
            filter = {OC / 4, FS, FS, IC, 4};
        }
        TensorShape bias = {1, OC / 4, 1, 1, 4};
        TensorShape dst = {N, OC / 4, OH, OW, 4};
        SmallVector<TensorShape> shapes{src, filter, bias, {}, dst};
        float computations = (((IC / group) * FS * FS + 1) * dst.total_nr_elems() * 2 +
                              dst.total_nr_elems()) *
                             1e-6;
        std::vector<std::pair<SmallVector<TensorShape>, float>> shape_arg = {
                std::make_pair(shapes, computations)};
        benchmark_impl(
                param, shape_arg, ".+", RUNS, {4, {4, 5, 6, 7}}, {1, {7}}, data_type);
    };
    bench_case(1, 2, 64, 160, 160, 1, 1, 0, 1, true);
    bench_case(1, 3, 64, 224, 224, 7, 1, 3, 2, true);
    bench_case(1, 64, 64, 56, 56, 3, 1, 1, 1);
    bench_case(1, 128, 128, 28, 28, 3, 1, 1, 1);
    bench_case(1, 256, 256, 14, 14, 3, 1, 1, 1);
    bench_case(1, 512, 512, 7, 7, 3, 1, 1, 1);
    bench_case(1, 64, 64, 56, 56, 3, 4, 1, 1);
    bench_case(1, 128, 128, 28, 28, 3, 4, 1, 1);
    bench_case(1, 256, 256, 14, 14, 3, 4, 1, 1);
    bench_case(1, 512, 512, 7, 7, 3, 4, 1, 1);
    bench_case(1, 4, 64, 224, 224, 7, 1, 1, 2);
    bench_case(1, 256, 128, 56, 56, 3, 1, 1, 2);
    bench_case(1, 512, 256, 28, 28, 3, 1, 1, 2);
    bench_case(1, 4, 32, 224, 224, 3, 1, 1, 2);
    bench_case(1, 256, 128, 56, 56, 3, 4, 1, 2);
    bench_case(1, 512, 256, 28, 28, 3, 4, 1, 2);
}
#if MGB_ENABLE_DOT
TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_INT8_NCHW44_DOT) {
    constexpr size_t RUNS = 40;
    std::vector<DType> data_type = {
            dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f),
            dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f)};
    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
                          size_t group, size_t P, size_t S, bool is_nchw = false) {
        param::ConvBias param;
        param.nonlineMode = param::ConvBias::NonlineMode::RELU;
        param.pad_h = P;
        param.pad_w = P;
        param.stride_h = S;
        param.stride_w = S;
        param.sparse = param::ConvBias::Sparse::DENSE;
        param.format = param::ConvBias::Format::NCHW44_DOT;
        auto OH = (H + 2 * P - FS) / static_cast<size_t>(S) + 1;
        auto OW = (W + 2 * P - FS) / static_cast<size_t>(S) + 1;
        TensorShape src = {N, IC / 4, H, W, 4};
        TensorShape filter = {OC / 4, IC / 4, FS, FS, 4, 4};
        if (group > 1) {
            filter = {group, OC / group / 4, IC / group / 4, FS, FS, 4, 4};
            param.sparse = param::ConvBias::Sparse::GROUP;
        }
        if (is_nchw) {
            src = {N, IC, H, W};
            filter = {OC / 4, FS, FS, IC, 4};
        }
        TensorShape bias = {1, OC / 4, 1, 1, 4};
        TensorShape dst = {N, OC / 4, OH, OW, 4};
        SmallVector<TensorShape> shapes{src, filter, bias, {}, dst};
        float computations = (((IC / group) * FS * FS + 1) * dst.total_nr_elems() * 2 +
                              dst.total_nr_elems()) *
                             1e-6;
        std::vector<std::pair<SmallVector<TensorShape>, float>> shape_arg = {
                std::make_pair(shapes, computations)};
        benchmark_impl(
                param, shape_arg, ".+", RUNS, {4, {4, 5, 6, 7}}, {1, {7}}, data_type);
    };
    bench_case(1, 64, 64, 56, 56, 3, 1, 1, 1);
    bench_case(1, 128, 128, 28, 28, 3, 1, 1, 1);
    bench_case(1, 256, 256, 14, 14, 3, 1, 1, 1);
    bench_case(1, 512, 512, 7, 7, 3, 1, 1, 1);
    bench_case(1, 64, 64, 56, 56, 3, 4, 1, 1);
    bench_case(1, 128, 128, 28, 28, 3, 4, 1, 1);
    bench_case(1, 256, 256, 14, 14, 3, 4, 1, 1);
    bench_case(1, 512, 512, 7, 7, 3, 4, 1, 1);
}
TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_INT8_NCHW44_DOT_S2) {
    constexpr size_t RUNS = 40;
    std::vector<DType> data_type = {
            dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f),
            dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f)};
    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
                          size_t group, size_t P, size_t S, bool is_nchw = false) {
        param::ConvBias param;
        param.nonlineMode = param::ConvBias::NonlineMode::RELU;
        param.pad_h = P;
        param.pad_w = P;
        param.stride_h = S;
        param.stride_w = S;
        param.sparse = param::ConvBias::Sparse::DENSE;
        param.format = param::ConvBias::Format::NCHW44_DOT;
        auto OH = (H + 2 * P - FS) / static_cast<size_t>(S) + 1;
        auto OW = (W + 2 * P - FS) / static_cast<size_t>(S) + 1;
        TensorShape src = {N, IC / 4, H, W, 4};
        TensorShape filter = {OC / 4, IC / 4, FS, FS, 4, 4};
        if (group > 1) {
            filter = {group, OC / group / 4, IC / group / 4, FS, FS, 4, 4};
            param.sparse = param::ConvBias::Sparse::GROUP;
        }
        if (is_nchw) {
            src = {N, IC, H, W};
            filter = {OC / 4, FS, FS, IC, 4};
        }
        TensorShape bias = {1, OC / 4, 1, 1, 4};
        TensorShape dst = {N, OC / 4, OH, OW, 4};
        SmallVector<TensorShape> shapes{src, filter, bias, {}, dst};
        float computations = (((IC / group) * FS * FS + 1) * dst.total_nr_elems() * 2 +
                              dst.total_nr_elems()) *
                             1e-6;
        std::vector<std::pair<SmallVector<TensorShape>, float>> shape_arg = {
                std::make_pair(shapes, computations)};
        benchmark_impl(
                param, shape_arg, ".+", RUNS, {4, {4, 5, 6, 7}}, {1, {7}}, data_type);
    };
    bench_case(1, 64, 64, 56, 56, 3, 1, 1, 2);
    bench_case(1, 64, 64, 128, 128, 3, 1, 1, 2);
    bench_case(1, 64, 64, 256, 256, 3, 1, 1, 2);
    bench_case(1, 64, 64, 156, 156, 3, 1, 1, 2);
    bench_case(1, 128, 128, 28, 28, 3, 1, 1, 2);
    bench_case(1, 256, 256, 14, 14, 3, 1, 1, 2);
    bench_case(1, 512, 512, 7, 7, 3, 1, 1, 2);
    bench_case(1, 64, 64, 56, 56, 3, 4, 1, 2);
    bench_case(1, 128, 128, 28, 28, 3, 4, 1, 2);
    bench_case(1, 256, 256, 14, 14, 3, 4, 1, 2);
    bench_case(1, 512, 512, 7, 7, 3, 4, 1, 2);
}
#endif
TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_FLOAT_NCHW44) {
    constexpr size_t RUNS = 40;
    std::vector<DType> data_type = {
            dtype::Float32(), dtype::Float32(), dtype::Float32(), dtype::Float32()};
    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
                          size_t group, size_t P, size_t S, bool is_nchw = false) {
        param::ConvBias param;
        param.nonlineMode = param::ConvBias::NonlineMode::RELU;
        param.pad_h = P;
        param.pad_w = P;
        param.stride_h = S;
        param.stride_w = S;
        param.sparse = param::ConvBias::Sparse::DENSE;
        param.format = param::ConvBias::Format::NCHW44;
        auto OH = (H + 2 * P - FS) / static_cast<size_t>(S) + 1;
        auto OW = (W + 2 * P - FS) / static_cast<size_t>(S) + 1;
        TensorShape src = {N, IC / 4, H, W, 4};
        TensorShape filter = {OC / 4, IC / 4, FS, FS, 4, 4};
        if (group > 1) {
            filter = {group, OC / group / 4, IC / group / 4, FS, FS, 4, 4};
            param.sparse = param::ConvBias::Sparse::GROUP;
        }
        if (is_nchw) {
            src = {N, IC, H, W};
            filter = {OC / 4, FS, FS, IC, 4};
        }
        TensorShape bias = {1, OC / 4, 1, 1, 4};
        TensorShape dst = {N, OC / 4, OH, OW, 4};
        SmallVector<TensorShape> shapes{src, filter, bias, {}, dst};
        float computations = (((IC / group) * FS * FS + 1) * dst.total_nr_elems() * 2 +
                              dst.total_nr_elems()) *
                             1e-6;
        std::vector<std::pair<SmallVector<TensorShape>, float>> shape_arg = {
                std::make_pair(shapes, computations)};
        benchmark_impl(
                param, shape_arg, ".+", RUNS, {4, {4, 5, 6, 7}}, {1, {7}}, data_type);
    };
    bench_case(1, 64, 64, 56, 56, 3, 1, 1, 2);
    bench_case(1, 128, 128, 28, 28, 3, 1, 1, 2);
    bench_case(1, 256, 256, 14, 14, 3, 1, 1, 2);
    bench_case(1, 512, 512, 7, 7, 3, 1, 1, 2);
    bench_case(1, 64, 64, 56, 56, 3, 4, 1, 2);
    bench_case(1, 128, 128, 28, 28, 3, 4, 1, 2);
    bench_case(1, 256, 256, 14, 14, 3, 4, 1, 2);
    bench_case(1, 512, 512, 7, 7, 3, 4, 1, 2);
    bench_case(1, 64, 64, 56 * 2, 56 * 2, 3, 4, 1, 2);
    bench_case(1, 128, 128, 28 * 2, 28 * 2, 3, 4, 1, 2);
    bench_case(1, 256, 256, 14 * 2, 14 * 2, 3, 4, 1, 2);
    bench_case(1, 512, 512, 7 * 2, 7 * 2, 3, 4, 1, 2);
}
TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_INT8_INT8_INT8_STRIDE2) {
    constexpr size_t RUNS = 50;
    param::ConvBias param;
    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
    param.pad_h = 1;
    param.pad_w = 1;
    param.stride_h = 2;
    param.stride_w = 2;
    param.sparse = param::ConvBias::Sparse::GROUP;
    std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
                          size_t group, size_t P, size_t S) {
        SmallVector<TensorShape> shapes{
                {N, IC, H, W},
                {group, OC / group, IC / group, FS, FS},
                {1, OC, 1, 1},
                {},
                {N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1}};
        // use the actual stride-2 output size so `computations` matches the
        // dst shape above
        TensorShape dst{N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1};
        float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
                              dst.total_nr_elems()) *
                             1e-6;
        shapes_and_computation.push_back(std::make_pair(shapes, computations));
    };
    bench_case(1, 32, 32, 200, 200, 3, 4, 1, 2);
    bench_case(1, 32, 32, 200, 200, 3, 32, 1, 2);
    bench_case(1, 32, 32, 128, 128, 3, 4, 1, 2);
    bench_case(1, 32, 32, 128, 128, 3, 32, 1, 2);
    bench_case(1, 32, 32, 100, 100, 3, 4, 1, 2);
    bench_case(1, 32, 32, 100, 100, 3, 32, 1, 2);
    bench_case(1, 32, 32, 80, 80, 3, 4, 1, 2);
    bench_case(1, 32, 32, 80, 80, 3, 32, 1, 2);
    std::string algo_name = "S8STRD2";
    printf("Benchmark S8STRD2_LARGE_GROUP algo\n");
    std::vector<DType> data_type = {
            dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f),
            dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f)};
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
    shapes_and_computation.clear();
    algo_name = "S8STRD2";
    printf("Benchmark S8STRD2_SMALL_GROUP algo\n");
    bench_case(1, 32, 32, 200, 200, 3, 1, 1, 2);
    bench_case(1, 32, 32, 128, 128, 3, 1, 1, 2);
    bench_case(1, 32, 32, 100, 100, 3, 1, 1, 2);
    bench_case(1, 32, 32, 80, 80, 3, 1, 1, 2);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
}
#if MGB_ENABLE_DOT
TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS,
       BENCHMARK_CONVBIAS_INT8_INT8_INT8_STRIDE1_WITHDOTPROD) {
    constexpr size_t RUNS = 50;
    param::ConvBias param;
    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
    param.pad_h = 1;
    param.pad_w = 1;
    param.stride_h = 1;
    param.stride_w = 1;
    param.sparse = param::ConvBias::Sparse::GROUP;
    std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
                          size_t group, size_t P, size_t S) {
        SmallVector<TensorShape> shapes{
                {N, IC, H, W},
                {group, OC / group, IC / group, FS, FS},
                {1, OC, 1, 1},
                {},
                {N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1}};
        TensorShape dst{N, OC, H, W};
        float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
                              dst.total_nr_elems()) *
                             1e-6;
        shapes_and_computation.push_back(std::make_pair(shapes, computations));
    };
    bench_case(1, 32, 32, 200, 200, 3, 4, 1, 1);
    bench_case(1, 32, 32, 200, 200, 3, 32, 1, 1);
    bench_case(1, 32, 32, 128, 128, 3, 4, 1, 1);
    bench_case(1, 32, 32, 128, 128, 3, 32, 1, 1);
    bench_case(1, 32, 32, 100, 100, 3, 4, 1, 1);
    bench_case(1, 32, 32, 100, 100, 3, 32, 1, 1);
    bench_case(1, 32, 32, 80, 80, 3, 4, 1, 1);
    bench_case(1, 32, 32, 80, 80, 3, 32, 1, 1);
    std::string algo_name = "ARMDOTS8STRD1";
    printf("Benchmark ARMDOTS8STRD1_LARGE_GROUP algo\n");
    std::vector<DType> data_type = {
            dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f),
            dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f)};
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
    shapes_and_computation.clear();
    algo_name = "ARMDOTS8STRD1";
    printf("Benchmark ARMDOTS8STRD1_SMALL_GROUP algo\n");
    bench_case(1, 32, 32, 200, 200, 3, 1, 1, 1);
    bench_case(1, 32, 32, 128, 128, 3, 1, 1, 1);
    bench_case(1, 32, 32, 100, 100, 3, 1, 1, 1);
    bench_case(1, 32, 32, 80, 80, 3, 1, 1, 1);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
}
TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS,
       BENCHMARK_CONVBIAS_INT8_INT8_INT8_STRIDE2_WITHDOTPROD) {
    constexpr size_t RUNS = 50;
    param::ConvBias param;
    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
    param.pad_h = 1;
    param.pad_w = 1;
    param.stride_h = 2;
    param.stride_w = 2;
    param.sparse = param::ConvBias::Sparse::GROUP;
    std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
                          size_t group, size_t P, size_t S) {
        SmallVector<TensorShape> shapes{
                {N, IC, H, W},
                {group, OC / group, IC / group, FS, FS},
                {1, OC, 1, 1},
                {},
                {N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1}};
        // use the actual stride-2 output size so `computations` matches the
        // dst shape above
        TensorShape dst{N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1};
        float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
                              dst.total_nr_elems()) *
                             1e-6;
        shapes_and_computation.push_back(std::make_pair(shapes, computations));
    };
    bench_case(1, 32, 32, 200, 200, 3, 4, 1, 2);
    bench_case(1, 32, 32, 200, 200, 3, 32, 1, 2);
    bench_case(1, 32, 32, 128, 128, 3, 4, 1, 2);
    bench_case(1, 32, 32, 128, 128, 3, 32, 1, 2);
    bench_case(1, 32, 32, 100, 100, 3, 4, 1, 2);
    bench_case(1, 32, 32, 100, 100, 3, 32, 1, 2);
    bench_case(1, 32, 32, 80, 80, 3, 4, 1, 2);
    bench_case(1, 32, 32, 80, 80, 3, 32, 1, 2);
    std::string algo_name = "ARMDOTS8STRD2";
    printf("Benchmark ARMDOTS8STRD2_LARGE_GROUP algo\n");
    std::vector<DType> data_type = {
            dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f),
            dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f)};
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
    shapes_and_computation.clear();
    algo_name = "ARMDOTS8STRD2";
    printf("Benchmark ARMDOTS8STRD2_SMALL_GROUP algo\n");
    bench_case(1, 32, 32, 200, 200, 3, 1, 1, 2);
    bench_case(1, 32, 32, 128, 128, 3, 1, 1, 2);
    bench_case(1, 32, 32, 100, 100, 3, 1, 1, 2);
    bench_case(1, 32, 32, 80, 80, 3, 1, 1, 2);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
}
#endif
TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS,
       BENCHMARK_CONVBIAS_QUINT8_QUINT8_QUINT8_STRIDE1) {
    constexpr size_t RUNS = 50;
    param::ConvBias param;
    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
    param.pad_h = 1;
    param.pad_w = 1;
    param.stride_h = 1;
    param.stride_w = 1;
    param.sparse = param::ConvBias::Sparse::GROUP;
    std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
                          size_t group, size_t P, size_t S) {
        SmallVector<TensorShape> shapes{
                {N, IC, H, W},
                {group, OC / group, IC / group, FS, FS},
                {1, OC, 1, 1},
                {},
                {N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1}};
        TensorShape dst{N, OC, H, W};
        float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
                              dst.total_nr_elems()) *
                             1e-6;
        shapes_and_computation.push_back(std::make_pair(shapes, computations));
    };
    bench_case(1, 32, 32, 200, 200, 3, 4, 1, 1);
    bench_case(1, 32, 32, 200, 200, 3, 32, 1, 1);
    bench_case(1, 32, 32, 128, 128, 3, 4, 1, 1);
    bench_case(1, 32, 32, 128, 128, 3, 32, 1, 1);
    bench_case(1, 32, 32, 100, 100, 3, 4, 1, 1);
    bench_case(1, 32, 32, 100, 100, 3, 32, 1, 1);
    bench_case(1, 32, 32, 80, 80, 3, 4, 1, 1);
    bench_case(1, 32, 32, 80, 80, 3, 32, 1, 1);
    std::string algo_name = "QU8STRD1";
    printf("Benchmark QU8STRD1_LARGE_GROUP algo\n");
    std::vector<DType> data_type = {
            dtype::Quantized8Asymm(0.2f, 100), dtype::Quantized8Asymm(0.2f, 120),
            dtype::QuantizedS32(0.04f), dtype::Quantized8Asymm(1.4f, 110)};
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
    shapes_and_computation.clear();
    algo_name = "QU8STRD1";
    printf("Benchmark QU8STRD1_SMALL_GROUP algo\n");
    bench_case(1, 32, 32, 200, 200, 3, 1, 1, 1);
    bench_case(1, 32, 32, 128, 128, 3, 1, 1, 1);
    bench_case(1, 32, 32, 100, 100, 3, 1, 1, 1);
    bench_case(1, 32, 32, 80, 80, 3, 1, 1, 1);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
}
TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS,
       BENCHMARK_CONVBIAS_QUINT8_QUINT8_QUINT8_STRIDE2) {
    constexpr size_t RUNS = 50;
    param::ConvBias param;
    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
    param.pad_h = 1;
    param.pad_w = 1;
    param.stride_h = 2;
    param.stride_w = 2;
    param.sparse = param::ConvBias::Sparse::GROUP;
    std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
                          size_t group, size_t P, size_t S) {
        SmallVector<TensorShape> shapes{
                {N, IC, H, W},
                {group, OC / group, IC / group, FS, FS},
                {1, OC, 1, 1},
                {},
                {N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1}};
        // use the actual stride-2 output size so `computations` matches the
        // dst shape above
        TensorShape dst{N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1};
        float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
                              dst.total_nr_elems()) *
                             1e-6;
        shapes_and_computation.push_back(std::make_pair(shapes, computations));
    };
    bench_case(1, 32, 32, 200, 200, 3, 4, 1, 2);
    bench_case(1, 32, 32, 200, 200, 3, 32, 1, 2);
    bench_case(1, 32, 32, 128, 128, 3, 4, 1, 2);
    bench_case(1, 32, 32, 128, 128, 3, 32, 1, 2);
    bench_case(1, 32, 32, 100, 100, 3, 4, 1, 2);
    bench_case(1, 32, 32, 100, 100, 3, 32, 1, 2);
    bench_case(1, 32, 32, 80, 80, 3, 4, 1, 2);
    bench_case(1, 32, 32, 80, 80, 3, 32, 1, 2);
    std::string algo_name = "QU8STRD2";
    printf("Benchmark QU8STRD2_LARGE_GROUP algo\n");
    std::vector<DType> data_type = {
            dtype::Quantized8Asymm(0.2f, 100), dtype::Quantized8Asymm(0.2f, 120),
            dtype::QuantizedS32(0.04f), dtype::Quantized8Asymm(1.4f, 110)};
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
    shapes_and_computation.clear();
    algo_name = "QU8STRD2";
    printf("Benchmark QU8STRD2_SMALL_GROUP algo\n");
    bench_case(1, 32, 32, 200, 200, 3, 1, 1, 2);
    bench_case(1, 32, 32, 128, 128, 3, 1, 1, 2);
    bench_case(1, 32, 32, 100, 100, 3, 1, 1, 2);
    bench_case(1, 32, 32, 80, 80, 3, 1, 1, 2);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
}
#if MGB_ENABLE_DOT
TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS,
       BENCHMARK_CONVBIAS_QUINT8_QUINT8_QUINT8_STRIDE1_WITHDOTPROD) {
    constexpr size_t RUNS = 50;
    param::ConvBias param;
    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
    param.pad_h = 1;
    param.pad_w = 1;
    param.stride_h = 1;
    param.stride_w = 1;
    param.sparse = param::ConvBias::Sparse::GROUP;
    std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
                          size_t group, size_t P, size_t S) {
        SmallVector<TensorShape> shapes{
                {N, IC, H, W},
                {group, OC / group, IC / group, FS, FS},
                {1, OC, 1, 1},
                {},
                {N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1}};
        TensorShape dst{N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1};
        float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
                              dst.total_nr_elems()) *
                             1e-6;
        shapes_and_computation.push_back(std::make_pair(shapes, computations));
    };
    bench_case(1, 32, 32, 200, 200, 3, 4, 1, 1);
    bench_case(1, 32, 32, 200, 200, 3, 32, 1, 1);
    bench_case(1, 32, 32, 128, 128, 3, 4, 1, 1);
    bench_case(1, 32, 32, 128, 128, 3, 32, 1, 1);
    bench_case(1, 32, 32, 100, 100, 3, 4, 1, 1);
    bench_case(1, 32, 32, 100, 100, 3, 32, 1, 1);
    bench_case(1, 32, 32, 80, 80, 3, 4, 1, 1);
    bench_case(1, 32, 32, 80, 80, 3, 32, 1, 1);
    std::string algo_name = "ARMDOTU8STRD1";
    printf("Benchmark ARMDOTU8STRD1_LARGE_GROUP algo\n");
    std::vector<DType> data_type = {
            dtype::Quantized8Asymm(0.2f, 100), dtype::Quantized8Asymm(0.2f, 120),
            dtype::QuantizedS32(0.04f), dtype::Quantized8Asymm(1.4f, 110)};
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
    shapes_and_computation.clear();
    algo_name = "ARMDOTU8STRD1";
    printf("Benchmark ARMDOTU8STRD1_SMALL_GROUP algo\n");
    bench_case(1, 32, 32, 200, 200, 3, 1, 1, 1);
    bench_case(1, 32, 32, 128, 128, 3, 1, 1, 1);
    bench_case(1, 32, 32, 100, 100, 3, 1, 1, 1);
    bench_case(1, 32, 32, 80, 80, 3, 1, 1, 1);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
}
  1078. TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS,
  1079. BENCHMARK_CONVBIAS_QUINT8_QUINT8_QUINT8_STRIDE2_WITHDOTPROD) {
  1080. constexpr size_t RUNS = 50;
  1081. param::ConvBias param;
  1082. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  1083. param.pad_h = 1;
  1084. param.pad_w = 1;
  1085. param.stride_h = 2;
  1086. param.stride_w = 2;
  1087. param.sparse = param::ConvBias::Sparse::GROUP;
  1088. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  1089. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  1090. size_t group, size_t P, size_t S) {
  1091. SmallVector<TensorShape> shapes{
  1092. {N, IC, H, W},
  1093. {group, OC / group, IC / group, FS, FS},
  1094. {1, OC, 1, 1},
  1095. {},
  1096. {N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1}};
  1097. TensorShape dst{N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1};
  1098. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  1099. dst.total_nr_elems()) *
  1100. 1e-6;
  1101. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  1102. };
    bench_case(1, 32, 32, 200, 200, 5, 4, 1, 2);
    bench_case(1, 32, 32, 200, 200, 5, 32, 1, 2);
    bench_case(1, 32, 32, 128, 128, 5, 4, 1, 2);
    bench_case(1, 32, 32, 128, 128, 5, 32, 1, 2);
    bench_case(1, 32, 32, 100, 100, 5, 4, 1, 2);
    bench_case(1, 32, 32, 100, 100, 5, 32, 1, 2);
    bench_case(1, 32, 32, 80, 80, 5, 4, 1, 2);
    bench_case(1, 32, 32, 80, 80, 5, 32, 1, 2);
    std::string algo_name = "ARMDOTU8STRD2";
    printf("Benchmark ARMDOTU8STRD2_LARGE_GROUP algo\n");
    std::vector<DType> data_type = {
            dtype::Quantized8Asymm(0.2f, 100), dtype::Quantized8Asymm(0.2f, 120),
            dtype::QuantizedS32(0.04f), dtype::Quantized8Asymm(1.4f, 110)};
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
    shapes_and_computation.clear();
    algo_name = "ARMDOTU8STRD2";
    printf("Benchmark ARMDOTU8STRD2_SMALL_GROUP algo\n");
    bench_case(1, 32, 32, 200, 200, 5, 1, 1, 2);
    bench_case(1, 32, 32, 128, 128, 5, 1, 1, 2);
    bench_case(1, 32, 32, 100, 100, 5, 1, 1, 2);
    bench_case(1, 32, 32, 80, 80, 5, 1, 1, 2);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
}
#endif

TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_WINOGRAD_F32) {
    constexpr size_t RUNS = 50;
    param::ConvBias param;
    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
    param.pad_h = 1;
    param.pad_w = 1;
    param.stride_h = 1;
    param.stride_w = 1;
    param.sparse = param::ConvBias::Sparse::GROUP;
    std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
                          size_t group) {
        SmallVector<TensorShape> shapes{
                {N, IC, H, W},
                {group, OC / group, IC / group, FS, FS},
                {1, OC, 1, 1},
                {},
                {N, OC, H, W}};
        TensorShape dst{N, OC, H, W};
        float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
                              dst.total_nr_elems()) *
                             1e-6;
        shapes_and_computation.push_back(std::make_pair(shapes, computations));
    };
    bench_case(1, 32, 32, 200, 200, 3, 4);
    bench_case(1, 32, 32, 200, 200, 3, 1);
    bench_case(1, 32, 32, 128, 128, 3, 4);
    bench_case(1, 32, 32, 128, 128, 3, 1);
    bench_case(1, 32, 32, 100, 100, 3, 4);
    bench_case(1, 32, 32, 100, 100, 3, 1);
    bench_case(1, 32, 32, 80, 80, 3, 4);
    bench_case(1, 512, 512, 14, 14, 3, 1);
    bench_case(1, 512, 256, 14, 14, 3, 1);
    bench_case(1, 512, 128, 14, 14, 3, 1);
    bench_case(1, 512, 64, 14, 14, 3, 1);
    bench_case(1, 512, 512, 7, 7, 3, 1);
    bench_case(1, 512, 256, 7, 7, 3, 1);
    bench_case(1, 512, 128, 7, 7, 3, 1);
    bench_case(1, 512, 64, 7, 7, 3, 1);
    std::string algo_name;
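    // The Winograd implementation is selected by name; the matmul back end encoded
    // in the string differs between AArch64 and ARMv7 builds.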
#if MEGDNN_AARCH64
    algo_name = "WINOGRAD:AARCH64_F32_MK4_4x16:4:2";
#else
    algo_name = "WINOGRAD:ARMV7_F32_MK4_4x8:4:2";
#endif
    std::vector<DType> data_type = {
            dtype::Float32(), dtype::Float32(), dtype::Float32(), dtype::Float32()};
    printf("Benchmark WINOGRAD_F32_MK4 algo\n");
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
}

TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_WINOGRAD_INT8) {
    constexpr size_t RUNS = 50;
    param::ConvBias param;
    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
    param.pad_h = 1;
    param.pad_w = 1;
    param.stride_h = 1;
    param.stride_w = 1;
    param.sparse = param::ConvBias::Sparse::GROUP;
    std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
                          size_t group) {
        SmallVector<TensorShape> shapes{
                {N, IC, H, W},
                {group, OC / group, IC / group, FS, FS},
                {1, OC, 1, 1},
                {},
                {N, OC, H, W}};
        TensorShape dst{N, OC, H, W};
        float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
                              dst.total_nr_elems()) *
                             1e-6;
        shapes_and_computation.push_back(std::make_pair(shapes, computations));
    };
    bench_case(1, 32, 32, 200, 200, 3, 4);
    bench_case(1, 32, 32, 200, 200, 3, 1);
    bench_case(1, 32, 32, 128, 128, 3, 4);
    bench_case(1, 32, 32, 128, 128, 3, 1);
    bench_case(1, 32, 32, 100, 100, 3, 4);
    bench_case(1, 32, 32, 100, 100, 3, 1);
    bench_case(1, 32, 32, 80, 80, 3, 4);
    bench_case(1, 512, 512, 14, 14, 3, 1);
    bench_case(1, 512, 256, 14, 14, 3, 1);
    bench_case(1, 512, 128, 14, 14, 3, 1);
    bench_case(1, 512, 64, 14, 14, 3, 1);
    bench_case(1, 512, 512, 7, 7, 3, 1);
    bench_case(1, 512, 256, 7, 7, 3, 1);
    bench_case(1, 512, 128, 7, 7, 3, 1);
    bench_case(1, 512, 64, 7, 7, 3, 1);
    std::string algo_name;
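    // Quantized int8 Winograd: as the back-end name suggests, the transformed
    // tiles are multiplied with an int16 x int16 -> int32 matmul kernel.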
#if MEGDNN_AARCH64
    algo_name = "WINOGRAD:AARCH64_INT16X16X32_MK8_8X8:8:2:32";
#else
    algo_name = "WINOGRAD:ARMV7_INT16X16X32_MK8_4X8:8:2:32";
#endif
    std::vector<DType> data_type = {
            dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f),
            dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f)};
    printf("Benchmark WINOGRAD_INT8_MK8 algo\n");
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
}

TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS,
       BENCHMARK_CONVBIAS_WINOGRAD_NCHW44_INT8_MK8) {
    constexpr size_t RUNS = 50;
    param::ConvBias param;
    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
    param.pad_h = 1;
    param.pad_w = 1;
    param.stride_h = 1;
    param.stride_w = 1;
    param.sparse = param::ConvBias::Sparse::DENSE;
    param.format = param::ConvBias::Format::NCHW44;
    std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
                          size_t group) {
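        // NCHW44 packs channels in groups of 4: src/dst are {N, C/4, H, W, 4} and
        // the dense filter is {OC/4, IC/4, FH, FW, 4, 4}.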
        SmallVector<TensorShape> shapes{
                {N, IC / 4, H, W, 4},
                {OC / 4, IC / 4, FS, FS, 4, 4},
                {1, OC / 4, 1, 1, 4},
                {},
                {N, OC / 4, H, W, 4}};
        TensorShape dst{N, OC, H, W};
        float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
                              dst.total_nr_elems()) *
                             1e-6;
        shapes_and_computation.push_back(std::make_pair(shapes, computations));
    };
    bench_case(1, 32, 32, 200, 200, 3, 1);
    bench_case(1, 32, 32, 128, 128, 3, 1);
    bench_case(1, 32, 32, 100, 100, 3, 1);
    bench_case(1, 512, 512, 14, 14, 3, 1);
    bench_case(1, 512, 256, 14, 14, 3, 1);
    bench_case(1, 512, 128, 14, 14, 3, 1);
    bench_case(1, 512, 64, 14, 14, 3, 1);
    bench_case(1, 512, 512, 7, 7, 3, 1);
    bench_case(1, 512, 256, 7, 7, 3, 1);
    bench_case(1, 512, 128, 7, 7, 3, 1);
    bench_case(1, 512, 64, 7, 7, 3, 1);
    std::string algo_name;
#if MEGDNN_AARCH64
    algo_name = "WINOGRAD_NCHW44:AARCH64_INT16X16X32_MK8_8X8:8:2:32";
#else
    algo_name = "WINOGRAD_NCHW44:ARMV7_INT16X16X32_MK8_4X8:8:2:32";
#endif
    std::vector<DType> data_type = {
            dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f),
            dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f)};
    printf("Benchmark WINOGRAD_NCHW44_INT8_MK8 algo\n");
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
}

TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS,
       BENCHMARK_CONVBIAS_WINOGRAD_NCHW44_INT8_COMP_F32) {
    constexpr size_t RUNS = 50;
    param::ConvBias param;
    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
    param.pad_h = 1;
    param.pad_w = 1;
    param.stride_h = 1;
    param.stride_w = 1;
    param.sparse = param::ConvBias::Sparse::DENSE;  // GROUP;
    param.format = param::ConvBias::Format::NCHW44;
    std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
                          size_t group) {
        SmallVector<TensorShape> shapes{
                {N, IC / 4, H, W, 4},
                {OC / 4, IC / 4, FS, FS, 4, 4},
                {1, OC / 4, 1, 1, 4},
                {},
                {N, OC / 4, H, W, 4}};
        TensorShape dst{N, OC, H, W};
        float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
                              dst.total_nr_elems()) *
                             1e-6;
        shapes_and_computation.push_back(std::make_pair(shapes, computations));
    };
    bench_case(1, 32, 32, 200, 200, 3, 1);
    bench_case(1, 32, 32, 128, 128, 3, 1);
    bench_case(1, 32, 32, 100, 100, 3, 1);
    bench_case(1, 512, 512, 14, 14, 3, 1);
    bench_case(1, 512, 256, 14, 14, 3, 1);
    bench_case(1, 512, 128, 14, 14, 3, 1);
    bench_case(1, 512, 64, 14, 14, 3, 1);
    bench_case(1, 512, 512, 7, 7, 3, 1);
    bench_case(1, 512, 256, 7, 7, 3, 1);
    bench_case(1, 512, 128, 7, 7, 3, 1);
    bench_case(1, 512, 64, 7, 7, 3, 1);
    std::string algo_name;
#if MEGDNN_AARCH64
    algo_name = "WINOGRAD_NCHW44:AARCH64_F32_MK4_4x16:4:2:32";
#else
    algo_name = "WINOGRAD_NCHW44:ARMV7_F32_MK4_4x8:4:2:32";
#endif
    std::vector<DType> data_type = {
            dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f),
            dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f)};
    printf("Benchmark WINOGRAD_INT8_NCHW44_MK4_COMP_F32 algo\n");
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
}

TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_IM2COL_FP32) {
    constexpr size_t RUNS = 50;
    param::ConvBias param;
    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
    param.pad_h = 1;
    param.pad_w = 1;
    param.stride_h = 1;
    param.stride_w = 1;
    std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
                          size_t group) {
        SmallVector<TensorShape> shapes{
                {N, IC, H, W},
                {OC, IC / group, FS, FS},
                {1, OC, 1, 1},
                {},
                {N, OC, H, W}};
        TensorShape dst{N, OC, H, W};
        float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
                              dst.total_nr_elems()) *
                             1e-6;
        shapes_and_computation.push_back(std::make_pair(shapes, computations));
    };
    std::vector<DType> data_type = {
            dtype::Float32(), dtype::Float32(), dtype::Float32(), dtype::Float32()};
    bench_case(1, 32, 32, 300, 300, 3, 1);
    bench_case(1, 32, 32, 400, 400, 3, 1);
    bench_case(1, 32, 32, 100, 100, 3, 1);
    bench_case(1, 32, 32, 80, 80, 3, 1);
    bench_case(1, 32, 64, 200, 200, 3, 1);
    bench_case(1, 32, 64, 128, 128, 3, 1);
    bench_case(1, 32, 64, 100, 100, 3, 1);
    bench_case(1, 32, 64, 80, 80, 3, 1);
    bench_case(1, 32, 128, 200, 200, 3, 1);
    bench_case(1, 32, 128, 128, 128, 3, 1);
    bench_case(1, 32, 128, 100, 100, 3, 1);
    bench_case(1, 32, 128, 80, 80, 3, 1);
    bench_case(1, 64, 32, 7, 7, 3, 1);
    bench_case(1, 64, 64, 7, 7, 3, 1);
    bench_case(1, 64, 128, 7, 7, 3, 1);
    bench_case(1, 64, 256, 7, 7, 3, 1);
    bench_case(1, 64, 512, 7, 7, 3, 1);
    bench_case(1, 64, 1024, 7, 7, 3, 1);
    bench_case(1, 64, 32, 14, 14, 3, 1);
    bench_case(1, 64, 64, 14, 14, 3, 1);
    bench_case(1, 64, 128, 14, 14, 3, 1);
    bench_case(1, 64, 256, 14, 14, 3, 1);
    bench_case(1, 64, 512, 14, 14, 3, 1);
    bench_case(1, 64, 1024, 14, 14, 3, 1);
    bench_case(1, 128, 128, 14, 14, 3, 1);
    bench_case(1, 128, 256, 14, 14, 3, 1);
    bench_case(1, 512, 512, 14, 14, 3, 1);
    bench_case(1, 256, 512, 14, 14, 3, 1);
    bench_case(1, 512, 1024, 14, 14, 3, 1);
    bench_case(1, 1024, 1024, 14, 14, 3, 1);
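    // The same AArch64 F32 matmul kernel is timed with three im2col blocking
    // choices; the trailing :96/:192/:384 in the algorithm name is (presumably)
    // the tile size used when packing the im2col buffer.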
    std::string algo_name = "IM2COLMATMUL:AARCH64_F32K8X12X1:96";
    printf("Benchmark IM2COLMATMUL:AARCH64_F32K8X12X1:96 algo\n");
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
    algo_name = "IM2COLMATMUL:AARCH64_F32K8X12X1:192";
    printf("Benchmark IM2COLMATMUL:AARCH64_F32K8X12X1:192 algo\n");
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
    algo_name = "IM2COLMATMUL:AARCH64_F32K8X12X1:384";
    printf("Benchmark IM2COLMATMUL:AARCH64_F32K8X12X1:384 algo\n");
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
    shapes_and_computation.clear();
}

TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS,
       BENCHMARK_CHANNEL_WISE_INT8_INT8_INT8_STRIDE1) {
    constexpr size_t RUNS = 50;
    param::ConvBias param;
    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
    param.pad_h = 1;
    param.pad_w = 1;
    param.stride_h = 1;
    param.stride_w = 1;
    param.sparse = param::ConvBias::Sparse::GROUP;
    param.format = param::ConvBias::Format::NCHW44;
    std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
    auto bench_case = [&](size_t N, size_t IC, size_t H, size_t W, size_t FS, size_t P) {
        size_t group = IC;
        size_t OC = IC;
        size_t S = 1;
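        // Channel-wise (depthwise) case in NCHW44: group == IC (the number of
        // 4-channel packs) and OC == IC, so each group convolves its own pack.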
        SmallVector<TensorShape> shapes{
                {N, IC, H, W, 4},
                {group, 1, 1, FS, FS, 4},
                {1, OC, 1, 1, 4},
                {},
                {N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1, 4}};
        TensorShape dst{N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1, 4};
        float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
                              dst.total_nr_elems()) *
                             1e-6;
        shapes_and_computation.push_back(std::make_pair(shapes, computations));
    };
    bench_case(1, 128, 200, 200, 3, 1);
    bench_case(1, 128, 128, 128, 3, 1);
    bench_case(1, 128, 100, 100, 3, 1);
    bench_case(1, 128, 80, 80, 3, 1);
    bench_case(1, 128, 56, 56, 3, 1);
    bench_case(1, 128, 28, 28, 3, 1);
    bench_case(1, 128, 14, 14, 3, 1);
    bench_case(1, 64, 200, 200, 3, 1);
    bench_case(1, 64, 128, 128, 3, 1);
    bench_case(1, 64, 100, 100, 3, 1);
    bench_case(1, 64, 80, 80, 3, 1);
    bench_case(1, 64, 56, 56, 3, 1);
    bench_case(1, 64, 28, 28, 3, 1);
    bench_case(1, 64, 14, 14, 3, 1);
    bench_case(1, 32, 200, 200, 3, 1);
    bench_case(1, 32, 128, 128, 3, 1);
    bench_case(1, 32, 100, 100, 3, 1);
    bench_case(1, 32, 80, 80, 3, 1);
    bench_case(1, 32, 56, 56, 3, 1);
    bench_case(1, 32, 28, 28, 3, 1);
    bench_case(1, 32, 14, 14, 3, 1);
    std::string algo_name = "S8_CHAN_WISE_STRD1_NCHW44";
    printf("Benchmark S8_CHAN_WISE_STRD1_NCHW44 algo\n");
    std::vector<DType> data_type = {
            dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f),
            dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f)};
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
}

TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS,
       BENCHMARK_CHANNEL_WISE_INT8_INT8_INT16_STRIDE1) {
    constexpr size_t RUNS = 50;
    param::ConvBias param;
    param.nonlineMode = param::ConvBias::NonlineMode::IDENTITY;
    param.pad_h = 1;
    param.pad_w = 1;
    param.stride_h = 1;
    param.stride_w = 1;
    param.sparse = param::ConvBias::Sparse::GROUP;
    param.format = param::ConvBias::Format::NCHW44;
    std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
    auto bench_case = [&](size_t N, size_t IC, size_t H, size_t W, size_t FS, size_t P) {
        size_t group = IC;
        size_t OC = IC;
        size_t S = 1;
        SmallVector<TensorShape> shapes{
                {N, IC, H, W, 4},
                {group, 1, 1, FS, FS, 4},
                {1, OC, 1, 1, 4},
                {},
                {N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1, 4}};
        TensorShape dst{N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1, 4};
        float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
                              dst.total_nr_elems()) *
                             1e-6;
        shapes_and_computation.push_back(std::make_pair(shapes, computations));
    };
    bench_case(1, 128, 200, 200, 3, 1);
    bench_case(1, 128, 128, 128, 3, 1);
    bench_case(1, 128, 100, 100, 3, 1);
    bench_case(1, 128, 80, 80, 3, 1);
    bench_case(1, 128, 56, 56, 3, 1);
    bench_case(1, 128, 28, 28, 3, 1);
    bench_case(1, 128, 14, 14, 3, 1);
    bench_case(1, 64, 200, 200, 3, 1);
    bench_case(1, 64, 128, 128, 3, 1);
    bench_case(1, 64, 100, 100, 3, 1);
    bench_case(1, 64, 80, 80, 3, 1);
    bench_case(1, 64, 56, 56, 3, 1);
    bench_case(1, 64, 28, 28, 3, 1);
    bench_case(1, 64, 14, 14, 3, 1);
    bench_case(1, 32, 200, 200, 3, 1);
    bench_case(1, 32, 128, 128, 3, 1);
    bench_case(1, 32, 100, 100, 3, 1);
    bench_case(1, 32, 80, 80, 3, 1);
    bench_case(1, 32, 56, 56, 3, 1);
    bench_case(1, 32, 28, 28, 3, 1);
    bench_case(1, 32, 14, 14, 3, 1);
    std::string algo_name = "S8x8x16_CHAN_WISE_STRD1_STRD2_NCHW44";
    printf("Benchmark S8x8x16_CHAN_WISE_STRD1_STRD2_NCHW44 algo\n");
    std::vector<DType> data_type = {
            dtype::Int8(), dtype::Int8(), dtype::Int16(), dtype::Int16()};
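    // Plain int8 src/filter accumulating into int16 bias/output, so this variant
    // has no re-quantization step.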
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
}

TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_IM2COL_NCHW44_INT8x8x32_STRIDE1) {
    constexpr size_t RUNS = 50;
    param::ConvBias param;
    param.nonlineMode = param::ConvBias::NonlineMode::IDENTITY;
    param.pad_h = 1;
    param.pad_w = 1;
    param.stride_h = 1;
    param.stride_w = 1;
    param.sparse = param::ConvBias::Sparse::DENSE;
    param.format = param::ConvBias::Format::NCHW44;
    std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
                          size_t group = 1) {
        SmallVector<TensorShape> shapes{
                {N, IC, H, W, 4},
                {OC, IC / group, FS, FS, 4, 4},
                {/*1, OC, 1, 1*/},
                {},
                {N, OC, H, W, 4}};
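        // The bias shape is deliberately left empty (commented out above), so these
        // cases time the pure int8x8x32 convolution without a bias add.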
        TensorShape dst{N, OC, H, W, 4};
        float computations = ((4 * IC / group) * FS * FS * dst.total_nr_elems() * 2 +
                              dst.total_nr_elems()) *
                             1e-6;
        shapes_and_computation.push_back(std::make_pair(shapes, computations));
    };
    bench_case(1, 32, 32, 300, 300, 3, 1);
    bench_case(1, 32, 32, 400, 400, 3, 1);
    bench_case(1, 32, 32, 100, 100, 3, 1);
    bench_case(1, 32, 32, 80, 80, 3, 1);
    bench_case(1, 32, 64, 200, 200, 3, 1);
    bench_case(1, 32, 64, 128, 128, 3, 1);
    bench_case(1, 32, 64, 100, 100, 3, 1);
    bench_case(1, 32, 64, 80, 80, 3, 1);
    bench_case(1, 32, 128, 200, 200, 3, 1);
    bench_case(1, 32, 128, 128, 128, 3, 1);
    bench_case(1, 32, 128, 100, 100, 3, 1);
    bench_case(1, 32, 128, 80, 80, 3, 1);
#if 1
    bench_case(1, 64, 32, 7, 7, 3, 1);
    bench_case(1, 64, 64, 7, 7, 3, 1);
    bench_case(1, 64, 128, 7, 7, 3, 1);
    bench_case(1, 64, 256, 7, 7, 3, 1);
    bench_case(1, 64, 512, 7, 7, 3, 1);
    bench_case(1, 64, 1024, 7, 7, 3, 1);
    bench_case(1, 64, 32, 14, 14, 3, 1);
    bench_case(1, 64, 64, 14, 14, 3, 1);
    bench_case(1, 64, 128, 14, 14, 3, 1);
    bench_case(1, 64, 256, 14, 14, 3, 1);
    bench_case(1, 64, 512, 14, 14, 3, 1);
    bench_case(1, 64, 1024, 14, 14, 3, 1);
    bench_case(1, 128, 128, 14, 14, 3, 1);
    bench_case(1, 128, 256, 14, 14, 3, 1);
    bench_case(1, 512, 512, 14, 14, 3, 1);
    bench_case(1, 256, 512, 14, 14, 3, 1);
    bench_case(1, 512, 1024, 14, 14, 3, 1);
    bench_case(1, 1024, 1024, 14, 14, 3, 1);
#endif
    std::string algo_name = "IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16:96";
    printf("Benchmark IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16:96 algo\n");
    std::vector<DType> data_type = {
            dtype::QuantizedS8(2.5f),
            dtype::QuantizedS8(2.5f),
            dtype::QuantizedS32(6.25f),
            {}};
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
    algo_name = "IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16:192";
    printf("Benchmark IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16:192 algo\n");
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
    algo_name = "IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16:384";
    printf("Benchmark IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16:384 algo\n");
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
}
#endif

/*================== BENCHMARK MULTITHREAD CONV1X1 =====================*/
#if MEGDNN_WITH_BENCHMARK
namespace {
std::vector<std::pair<SmallVector<TensorShape>, float>>
get_conv1x1_multithread_benchmark_args() {
    std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
    auto bench_case = [&](size_t IC, size_t OC, size_t H, size_t W) {
        SmallVector<TensorShape> shapes{
                {1, IC, H, W}, {OC, IC, 1, 1}, {1, OC, 1, 1}, {}, {1, OC, H, W}};
        TensorShape dst{1, OC, H, W};
        float computations =
                (IC * dst.total_nr_elems() * 2 + dst.total_nr_elems()) * 1e-6;
        shapes_and_computation.push_back(std::make_pair(shapes, computations));
    };
    bench_case(32, 32, 300, 300);
    bench_case(32, 32, 400, 400);
    bench_case(32, 32, 100, 100);
    bench_case(32, 32, 80, 80);
    bench_case(32, 64, 200, 200);
    bench_case(32, 64, 128, 128);
    bench_case(32, 64, 100, 100);
    bench_case(32, 64, 80, 80);
    bench_case(32, 128, 200, 200);
    bench_case(32, 128, 128, 128);
    bench_case(32, 128, 100, 100);
    bench_case(32, 128, 80, 80);
    bench_case(64, 32, 7, 7);
    bench_case(64, 64, 7, 7);
    bench_case(64, 128, 7, 7);
    bench_case(64, 256, 7, 7);
    bench_case(64, 512, 7, 7);
    bench_case(64, 1024, 7, 7);
    bench_case(64, 32, 14, 14);
    bench_case(64, 64, 14, 14);
    bench_case(64, 128, 14, 14);
    bench_case(64, 256, 14, 14);
    bench_case(64, 512, 14, 14);
    bench_case(64, 1024, 14, 14);
    bench_case(128, 128, 14, 14);
    bench_case(128, 256, 14, 14);
    bench_case(512, 512, 14, 14);
    bench_case(256, 512, 14, 14);
    bench_case(512, 1024, 14, 14);
    bench_case(1024, 1024, 14, 14);
    return shapes_and_computation;
}
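
// Shared driver: benchmarks the named 1x1 convolution algorithm over the shape set
// above with the given src/filter/bias/dst data types, using the same
// multi-thread vs. single-thread configurations as the other tests.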
void conv1x1_multithread_benchmark(
        const char* algo_name, DType stype, DType ftype, DType btype, DType dtype) {
    constexpr size_t RUNS = 50;
    std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation =
            get_conv1x1_multithread_benchmark_args();
    std::vector<DType> data_type = {stype, ftype, btype, dtype};
    param::ConvBias param;
    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
    param.pad_h = 0;
    param.pad_w = 0;
    param.stride_h = 1;
    param.stride_w = 1;
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
    shapes_and_computation.clear();
}
}  // namespace

TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_CONV1X1_S1_FP32) {
#if MEGDNN_AARCH64
    conv1x1_multithread_benchmark(
            "CONV1x1:AARCH64_F32K8X12X1:8", dtype::Float32(), dtype::Float32(),
            dtype::Float32(), dtype::Float32());
#else
    conv1x1_multithread_benchmark(
            "CONV1x1:ARMV7_F32:8", dtype::Float32(), dtype::Float32(), dtype::Float32(),
            dtype::Float32());
#endif
}

TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS,
       BENCHMARK_CONVBIAS_CONV1X1_S1_QUANTIZEDASYM) {
    dtype::Quantized8Asymm stype(0.2f, 100);
    dtype::Quantized8Asymm ftype(0.2f, 120);
    dtype::QuantizedS32 btype(0.04f);
    dtype::Quantized8Asymm dtype(1.4f, 110);
#if MEGDNN_AARCH64
#if MGB_ENABLE_DOT
    conv1x1_multithread_benchmark(
            "CONV1x1:AARCH64_QUINT8_K8X8X4_DOTPROD:8", stype, ftype, btype, dtype);
#else
    conv1x1_multithread_benchmark(
            "CONV1x1:AARCH64_QUINT8_K8X8X8:8", stype, ftype, btype, dtype);
#endif
#else
    conv1x1_multithread_benchmark(
            "CONV1x1:ARMV7_QUINT8_K4X8X8:8", stype, ftype, btype, dtype);
#endif
}
#endif

// vim: syntax=cpp.doxygen