
conv_bias_multi_thread_benchmark.cpp

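// Multi-threaded ConvBias benchmarks for the ARM common backend. Each test
// below builds a set of convolution shapes, then compares multi-thread against
// single-thread execution of a specific algorithm via benchmark_impl().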
#include "test/arm_common/fixture.h"
#include "test/common/benchmarker.h"
#include "test/common/conv_bias.h"

using namespace megdnn;
using namespace test;
using namespace conv_bias;

#if MEGDNN_WITH_BENCHMARK
namespace {
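// Runs every case in `shapes_and_computation` twice -- once on a multi-thread
// handle and once on a single-thread handle -- and prints per-case GFLOPS
// together with the multi-thread speedup and speedup per core.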
void benchmark_impl(
        const param::ConvBias param,
        std::vector<std::pair<SmallVector<TensorShape>, float>>& shapes_and_computation,
        const std::string algo_name, size_t RUNS,
        TaskExecutorConfig&& multi_thread_config,
        TaskExecutorConfig&& single_thread_config, std::vector<DType>& data_type) {
    std::vector<float> multi_thread_times, single_thread_times;
    {
        auto multi_thread_handle = create_cpu_handle(0, true, &multi_thread_config);
        auto benchmarker = Benchmarker<ConvBias>(multi_thread_handle.get());
        benchmarker.set_times(RUNS)
                .set_display(false)
                .set_param(param)
                .set_dtype(0, data_type[0])
                .set_dtype(1, data_type[1])
                .set_dtype(2, data_type[2])
                .set_dtype(4, data_type[3])
                .set_before_exec_callback(
                        conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name.c_str()));
        for (auto shape : shapes_and_computation) {
            multi_thread_times.push_back(benchmarker.exec(shape.first) / RUNS);
        }
    }
    {
        auto single_thread_handle = create_cpu_handle(0, true, &single_thread_config);
        auto benchmarker = Benchmarker<ConvBias>(single_thread_handle.get());
        benchmarker.set_times(RUNS)
                .set_display(false)
                .set_param(param)
                .set_dtype(0, data_type[0])
                .set_dtype(1, data_type[1])
                .set_dtype(2, data_type[2])
                .set_dtype(4, data_type[3])
                .set_before_exec_callback(
                        conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name.c_str()));
        for (auto shape : shapes_and_computation) {
            single_thread_times.push_back(benchmarker.exec(shape.first) / RUNS);
        }
    }
    printf("Benchmark : Multi threads %zu, ", multi_thread_config.nr_thread);
    printf("core_ids:");
    for (size_t i = 0; i < multi_thread_config.affinity_core_set.size(); i++) {
        printf("%zu ", multi_thread_config.affinity_core_set[i]);
    }
    printf(", Single thread core_id %zu\n", single_thread_config.affinity_core_set[0]);
    for (size_t i = 0; i < shapes_and_computation.size(); i++) {
        auto shapes = shapes_and_computation[i];
        printf("Bench case: ");
        for (auto&& shape : shapes.first) {
            printf("%s ", shape.to_string().c_str());
        }
        float computations = shapes.second;
        printf("%zu threads gflops: %f,\n single thread gflops: "
               "%f. speed up = %f, speedup/cores=%f\n",
               multi_thread_config.nr_thread, computations / multi_thread_times[i],
               computations / single_thread_times[i],
               single_thread_times[i] / multi_thread_times[i],
               single_thread_times[i] / multi_thread_times[i] /
                       multi_thread_config.nr_thread);
    }
}
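// Benchmarks each case in `args` with `algo_name` against the corresponding
// case in `args_contrast` with `algo_name_contrast` on a single-thread handle,
// and prints the GFLOPS of both algorithms plus their speed ratio.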
void benchmark_with_contrast(
        const std::vector<conv_bias::TestArg>& args, const std::string algo_name,
        std::vector<DType>& data_type,
        const std::vector<conv_bias::TestArg>& args_contrast,
        const std::string algo_name_contrast, std::vector<DType>& data_type_contrast,
        size_t RUNS, TaskExecutorConfig&& single_thread_config) {
    auto single_thread_handle = create_cpu_handle(0, true, &single_thread_config);
    auto benchmarker = Benchmarker<ConvBias>(single_thread_handle.get());
    auto benchmarker_contrast = Benchmarker<ConvBias>(single_thread_handle.get());
    benchmarker.set_times(RUNS)
            .set_display(false)
            .set_dtype(0, data_type[0])
            .set_dtype(1, data_type[1])
            .set_dtype(2, data_type[2])
            .set_dtype(4, data_type[3])
            .set_before_exec_callback(
                    conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name.c_str()));
    benchmarker_contrast.set_times(RUNS)
            .set_display(false)
            .set_dtype(0, data_type_contrast[0])
            .set_dtype(1, data_type_contrast[1])
            .set_dtype(2, data_type_contrast[2])
            .set_dtype(4, data_type_contrast[3])
            .set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBias>(
                    algo_name_contrast.c_str()));
    size_t arg_size = args.size(), arg_contrast_size = args_contrast.size();
    megdnn_assert(arg_size == arg_contrast_size);
    rep(i, arg_size) {
        TensorLayout dst_layout, dst_layout_contrast;
        auto opr = single_thread_handle.get()->create_operator<ConvBias>();
        auto&& arg = args[i];
        opr->param() = arg.param;
        opr->deduce_layout(
                {arg.src, data_type[0]}, {arg.filter, data_type[1]},
                {arg.bias, data_type[2]}, {}, dst_layout);
        float computation = (dst_layout.total_nr_elems() * arg.filter[1] *
                             arg.filter[2] * arg.filter[3] * arg.filter[4] * 2.0) /
                            (1024 * 1024 * 1024) * 1e3;
        benchmarker.set_param(arg.param);
        auto used = benchmarker.exec({arg.src, arg.filter, arg.bias, {}, {}}) / RUNS;
        auto&& arg_contrast = args_contrast[i];
        opr->param() = arg_contrast.param;
        opr->deduce_layout(
                {arg_contrast.src, data_type_contrast[0]},
                {arg_contrast.filter, data_type_contrast[1]},
                {arg_contrast.bias, data_type_contrast[2]}, {}, dst_layout_contrast);
        float computation_contrast =
                (dst_layout_contrast.total_nr_elems() * arg_contrast.filter[1] *
                 arg_contrast.filter[2] * arg_contrast.filter[3] *
                 arg_contrast.filter[4] * 2.0) /
                (1024 * 1024 * 1024) * 1e3;
        benchmarker_contrast.set_param(arg_contrast.param);
        auto used_contrast = benchmarker_contrast.exec(
                                     {arg_contrast.src,
                                      arg_contrast.filter,
                                      arg_contrast.bias,
                                      {},
                                      {}}) /
                             RUNS;
        printf("Bench case: \n");
        printf("padding: %u, stride: %u, nonline mode: %u\n", arg.param.pad_h,
               arg.param.stride_h, static_cast<unsigned>(arg.param.nonlineMode));
        printf("%s %s %s\n", arg.src.to_string().c_str(),
               arg.filter.to_string().c_str(), arg.bias.to_string().c_str());
        printf("%s %s %s\n", arg_contrast.src.to_string().c_str(),
               arg_contrast.filter.to_string().c_str(),
               arg_contrast.bias.to_string().c_str());
        printf("%s: %f gflops;\n%s: %f gflops\n"
               "speed up = %f\n",
               algo_name.c_str(), computation / used, algo_name_contrast.c_str(),
               computation_contrast / used_contrast, used_contrast / used);
    }
}
}  // namespace
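// Direct FP16 convolution benchmarks, only built when the target supports
// Armv8.2 FP16 vector arithmetic.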
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_DIRECTF16) {
    constexpr size_t RUNS = 50;
    param::ConvBias param;
    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
    param.pad_h = 1;
    param.pad_w = 1;
    param.stride_h = 1;
    param.stride_w = 1;
    param.sparse = param::ConvBias::Sparse::GROUP;
    std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
                          size_t group) {
        SmallVector<TensorShape> shapes{
                {N, IC, H, W},
                {group, OC / group, IC / group, FS, FS},
                {1, OC, 1, 1},
                {},
                {N, OC, H, W}};
        TensorShape dst{N, OC, H, W};
        float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
                              dst.total_nr_elems()) *
                             1e-6;
        shapes_and_computation.push_back(std::make_pair(shapes, computations));
    };
    bench_case(1, 32, 32, 200, 200, 3, 4);
    bench_case(1, 32, 32, 200, 200, 3, 32);
    bench_case(1, 32, 32, 128, 128, 3, 4);
    bench_case(1, 32, 32, 128, 128, 3, 32);
    bench_case(1, 32, 32, 100, 100, 3, 4);
    bench_case(1, 32, 32, 100, 100, 3, 32);
    bench_case(1, 32, 32, 80, 80, 3, 4);
    bench_case(1, 32, 32, 80, 80, 3, 32);
    std::string algo_name = "F16DIRECT";
    printf("Benchmark F16DIRECT_LARGE_GROUP algo\n");
    std::vector<DType> data_type = {
            dtype::Float16(), dtype::Float16(), dtype::Float16(), dtype::Float16()};
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
    shapes_and_computation.clear();
    algo_name = "F16DIRECT";
    printf("Benchmark F16DIRECT_SMALL_GROUP algo\n");
    bench_case(1, 32, 32, 200, 200, 3, 1);
    bench_case(1, 32, 32, 128, 128, 3, 1);
    bench_case(1, 32, 32, 100, 100, 3, 1);
    bench_case(1, 32, 32, 80, 80, 3, 1);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
}
TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_DIRECTF16_STR1) {
    constexpr size_t RUNS = 50;
    param::ConvBias param;
    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
    param.pad_h = 1;
    param.pad_w = 1;
    param.stride_h = 1;
    param.stride_w = 1;
    param.sparse = param::ConvBias::Sparse::GROUP;
    std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
                          size_t group) {
        SmallVector<TensorShape> shapes{
                {N, IC, H, W},
                {group, OC / group, IC / group, FS, FS},
                {1, OC, 1, 1},
                {},
                {N, OC, H, W}};
        TensorShape dst{N, OC, H, W};
        float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
                              dst.total_nr_elems()) *
                             1e-6;
        shapes_and_computation.push_back(std::make_pair(shapes, computations));
    };
    bench_case(1, 32, 32, 200, 200, 3, 4);
    bench_case(1, 32, 32, 200, 200, 3, 32);
    bench_case(1, 32, 32, 128, 128, 3, 4);
    bench_case(1, 32, 32, 128, 128, 3, 32);
    bench_case(1, 32, 32, 100, 100, 3, 4);
    bench_case(1, 32, 32, 100, 100, 3, 32);
    bench_case(1, 32, 32, 80, 80, 3, 4);
    bench_case(1, 32, 32, 80, 80, 3, 32);
    std::string algo_name = "F16STRD1";
    printf("Benchmark F16STRD1_LARGE_GROUP algo\n");
    std::vector<DType> data_type = {
            dtype::Float16(), dtype::Float16(), dtype::Float16(), dtype::Float16()};
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
    shapes_and_computation.clear();
    algo_name = "F16STRD1";
    printf("Benchmark F16STRD1_SMALL_GROUP algo\n");
    bench_case(1, 32, 32, 200, 200, 3, 1);
    bench_case(1, 32, 32, 128, 128, 3, 1);
    bench_case(1, 32, 32, 100, 100, 3, 1);
    bench_case(1, 32, 32, 80, 80, 3, 1);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
}
TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CHANNEL_WISE_FP16_NCHW88) {
    constexpr size_t RUNS = 50;
    std::string algo_name = "F16_CHANNEL_WISE_NCHW88";
    printf("Benchmark F16_CHANNEL_WISE_NCHW88 algo\n");
    std::vector<DType> data_type = {
            dtype::Float16(), dtype::Float16(), dtype::Float16(), dtype::Float16()};
    auto bench_case = [&](size_t N, size_t IC, size_t H, size_t W, size_t FS, size_t P,
                          size_t S) {
        param::ConvBias param;
        param.nonlineMode = param::ConvBias::NonlineMode::RELU;
        param.pad_h = P;
        param.pad_w = P;
        param.stride_h = S;
        param.stride_w = S;
        param.sparse = param::ConvBias::Sparse::GROUP;
        param.format = param::ConvBias::Format::NCHW88;
        size_t group = IC;
        size_t OC = IC;
        SmallVector<TensorShape> shapes{
                {N, IC, H, W, 8},
                {group, 1, 1, FS, FS, 8},
                {1, OC, 1, 1, 8},
                {},
                {N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1, 8}};
        TensorShape dst{N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1, 8};
        float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
                              dst.total_nr_elems()) *
                             1e-6;
        std::vector<std::pair<SmallVector<TensorShape>, float>> shape_arg = {
                std::make_pair(shapes, computations)};
        benchmark_impl(
                param, shape_arg, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
                data_type);
    };
    bench_case(1, 64, 100, 100, 5, 2, 1);
    bench_case(1, 64, 56, 56, 5, 2, 1);
    bench_case(1, 64, 28, 28, 5, 2, 1);
    bench_case(1, 64, 100, 100, 5, 2, 2);
    bench_case(1, 64, 56, 56, 5, 2, 2);
    bench_case(1, 64, 28, 28, 5, 2, 2);
    bench_case(1, 64, 100, 100, 3, 1, 1);
    bench_case(1, 64, 56, 56, 3, 1, 1);
    bench_case(1, 64, 28, 28, 3, 1, 1);
    bench_case(1, 64, 100, 100, 3, 1, 2);
    bench_case(1, 64, 56, 56, 3, 1, 2);
    bench_case(1, 64, 28, 28, 3, 1, 2);
    bench_case(1, 64, 100, 100, 2, 0, 1);
    bench_case(1, 64, 56, 56, 2, 0, 1);
    bench_case(1, 64, 28, 28, 2, 0, 1);
    bench_case(1, 64, 100, 100, 2, 0, 2);
    bench_case(1, 64, 56, 56, 2, 0, 2);
    bench_case(1, 64, 28, 28, 2, 0, 2);
}
TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_FP16_NCHW88) {
    constexpr size_t RUNS = 40;
    std::vector<DType> data_type = {
            dtype::Float16(), dtype::Float16(), dtype::Float16(), dtype::Float16()};
    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
                          size_t group, size_t P, size_t S) {
        param::ConvBias param;
        param.nonlineMode = param::ConvBias::NonlineMode::RELU;
        param.pad_h = P;
        param.pad_w = P;
        param.stride_h = S;
        param.stride_w = S;
        param.sparse = param::ConvBias::Sparse::DENSE;
        param.format = param::ConvBias::Format::NCHW88;
        auto OH = (H + 2 * P - FS) / static_cast<size_t>(S) + 1;
        auto OW = (W + 2 * P - FS) / static_cast<size_t>(S) + 1;
        TensorShape src = {N, IC / 8, H, W, 8};
        TensorShape filter = {OC / 8, IC / 8, FS, FS, 8, 8};
        if (group > 1) {
            filter = {group, OC / group / 8, IC / group / 8, FS, FS, 8, 8};
            param.sparse = param::ConvBias::Sparse::GROUP;
        }
        TensorShape bias = {1, OC / 8, 1, 1, 8};
        TensorShape dst = {N, OC / 8, OH, OW, 8};
        SmallVector<TensorShape> shapes{src, filter, bias, {}, dst};
        float computations = (((IC / group) * FS * FS + 1) * dst.total_nr_elems() * 2 +
                              dst.total_nr_elems()) *
                             1e-6;
        std::vector<std::pair<SmallVector<TensorShape>, float>> shape_arg = {
                std::make_pair(shapes, computations)};
        benchmark_impl(
                param, shape_arg, ".+", RUNS, {4, {4, 5, 6, 7}}, {1, {7}}, data_type);
    };
    bench_case(1, 64, 64, 28, 28, 3, 1, 1, 1);
    bench_case(1, 64, 64, 28, 28, 5, 1, 2, 1);
    bench_case(1, 64, 64, 28, 28, 7, 1, 3, 1);
    bench_case(1, 64, 64, 28, 28, 3, 1, 1, 2);
    bench_case(1, 64, 64, 28, 28, 5, 1, 2, 2);
    bench_case(1, 64, 64, 28, 28, 7, 1, 3, 2);
    bench_case(1, 64, 64, 28, 28, 3, 2, 1, 1);
    bench_case(1, 64, 64, 28, 28, 3, 4, 1, 1);
    bench_case(1, 64, 64, 28, 28, 3, 8, 1, 1);
    bench_case(1, 16, 16, 28, 28, 3, 1, 1, 1);
    bench_case(1, 32, 32, 28, 28, 3, 1, 1, 1);
    bench_case(1, 128, 128, 28, 28, 3, 1, 1, 1);
    bench_case(1, 256, 256, 28, 28, 3, 1, 1, 1);
    bench_case(1, 64, 64, 7, 7, 3, 1, 1, 1);
    bench_case(1, 64, 64, 14, 14, 3, 1, 1, 1);
    bench_case(1, 64, 64, 56, 56, 3, 1, 1, 1);
    bench_case(1, 64, 64, 112, 112, 3, 1, 1, 1);
}
#endif
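// Direct int8 x int8 -> int16 convolution benchmarks.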
TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_DIRECT_INT8x8x16) {
    constexpr size_t RUNS = 50;
    param::ConvBias param;
    param.nonlineMode = param::ConvBias::NonlineMode::IDENTITY;
    param.pad_h = 1;
    param.pad_w = 1;
    param.stride_h = 1;
    param.stride_w = 1;
    param.sparse = param::ConvBias::Sparse::GROUP;
    std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
                          size_t group) {
        SmallVector<TensorShape> shapes{
                {N, IC, H, W},
                {group, OC / group, IC / group, FS, FS},
                {},
                {},
                {N, OC, H, W}};
        TensorShape dst{N, OC, H, W};
        float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
                              dst.total_nr_elems()) *
                             1e-6;
        shapes_and_computation.push_back(std::make_pair(shapes, computations));
    };
    bench_case(1, 32, 32, 200, 200, 3, 4);
    bench_case(1, 32, 32, 200, 200, 3, 32);
    bench_case(1, 32, 32, 128, 128, 3, 4);
    bench_case(1, 32, 32, 128, 128, 3, 32);
    bench_case(1, 32, 32, 100, 100, 3, 4);
    bench_case(1, 32, 32, 100, 100, 3, 32);
    bench_case(1, 32, 32, 80, 80, 3, 4);
    bench_case(1, 32, 32, 80, 80, 3, 32);
    std::string algo_name = "I8816DIRECT";
    printf("Benchmark I8816DIRECT_LARGE_GROUP algo\n");
    std::vector<DType> data_type = {
            dtype::Int8(), dtype::Int8(), dtype::Int16(), dtype::Int16()};
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
    shapes_and_computation.clear();
    algo_name = "I8816DIRECT";
    printf("Benchmark I8816DIRECT_SMALL_GROUP algo\n");
    bench_case(1, 32, 32, 200, 200, 3, 1);
    bench_case(1, 32, 32, 128, 128, 3, 1);
    bench_case(1, 32, 32, 100, 100, 3, 1);
    bench_case(1, 32, 32, 80, 80, 3, 1);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
}
TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_DIRECT_INT8x8x16_STR2) {
    constexpr size_t RUNS = 50;
    param::ConvBias param;
    param.nonlineMode = param::ConvBias::NonlineMode::IDENTITY;
    param.pad_h = 1;
    param.pad_w = 1;
    param.stride_h = 2;
    param.stride_w = 2;
    param.sparse = param::ConvBias::Sparse::GROUP;
    std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
                          size_t group, size_t P, size_t S) {
        SmallVector<TensorShape> shapes{
                {N, IC, H, W},
                {group, OC / group, IC / group, FS, FS},
                {},
                {},
                {N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1}};
        TensorShape dst{N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1};
        float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
                              dst.total_nr_elems()) *
                             1e-6;
        shapes_and_computation.push_back(std::make_pair(shapes, computations));
    };
    bench_case(1, 32, 32, 200, 200, 3, 4, 1, 2);
    bench_case(1, 32, 32, 200, 200, 3, 32, 1, 2);
    bench_case(1, 32, 32, 128, 128, 3, 4, 1, 2);
    bench_case(1, 32, 32, 128, 128, 3, 32, 1, 2);
    bench_case(1, 32, 32, 100, 100, 3, 4, 1, 2);
    bench_case(1, 32, 32, 100, 100, 3, 32, 1, 2);
    bench_case(1, 32, 32, 80, 80, 3, 4, 1, 2);
    bench_case(1, 32, 32, 80, 80, 3, 32, 1, 2);
    std::string algo_name = "I8816STRD2";
    printf("Benchmark I8816STRD2_LARGE_GROUP algo\n");
    std::vector<DType> data_type = {
            dtype::Int8(), dtype::Int8(), dtype::Int16(), dtype::Int16()};
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
    shapes_and_computation.clear();
    algo_name = "I8816STRD2";
    printf("Benchmark I8816STRD2_SMALL_GROUP algo\n");
    bench_case(1, 32, 32, 200, 200, 3, 1, 1, 2);
    bench_case(1, 32, 32, 128, 128, 3, 1, 1, 2);
    bench_case(1, 32, 32, 100, 100, 3, 1, 1, 2);
    bench_case(1, 32, 32, 80, 80, 3, 1, 1, 2);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
}
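// Quantized int8 (QuantizedS8) direct convolution benchmarks, NCHW layout.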
TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_INT8_INT8_INT8_STRIDE1) {
    constexpr size_t RUNS = 50;
    param::ConvBias param;
    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
    param.pad_h = 1;
    param.pad_w = 1;
    param.stride_h = 1;
    param.stride_w = 1;
    param.sparse = param::ConvBias::Sparse::GROUP;
    std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
                          size_t group, size_t P, size_t S) {
        SmallVector<TensorShape> shapes{
                {N, IC, H, W},
                {group, OC / group, IC / group, FS, FS},
                {1, OC, 1, 1},
                {},
                {N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1}};
        TensorShape dst{N, OC, H, W};
        float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
                              dst.total_nr_elems()) *
                             1e-6;
        shapes_and_computation.push_back(std::make_pair(shapes, computations));
    };
    bench_case(1, 32, 32, 200, 200, 3, 4, 1, 1);
    bench_case(1, 32, 32, 200, 200, 3, 32, 1, 1);
    bench_case(1, 32, 32, 128, 128, 3, 4, 1, 1);
    bench_case(1, 32, 32, 128, 128, 3, 32, 1, 1);
    bench_case(1, 32, 32, 100, 100, 3, 4, 1, 1);
    bench_case(1, 32, 32, 100, 100, 3, 32, 1, 1);
    bench_case(1, 32, 32, 80, 80, 3, 4, 1, 1);
    bench_case(1, 32, 32, 80, 80, 3, 32, 1, 1);
    std::string algo_name = "S8STRD1";
    printf("Benchmark S8STRD1_LARGE_GROUP algo\n");
    std::vector<DType> data_type = {
            dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f),
            dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f)};
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
    shapes_and_computation.clear();
    algo_name = "S8STRD1";
    printf("Benchmark S8STRD1_SMALL_GROUP algo\n");
    bench_case(1, 32, 32, 200, 200, 3, 1, 1, 1);
    bench_case(1, 32, 32, 128, 128, 3, 1, 1, 1);
    bench_case(1, 32, 32, 100, 100, 3, 1, 1, 1);
    bench_case(1, 32, 32, 80, 80, 3, 1, 1, 1);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
}
TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_INT8_NCHW44) {
    constexpr size_t RUNS = 40;
    std::vector<DType> data_type = {
            dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f),
            dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f)};
    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
                          size_t group, size_t P, size_t S, bool is_nchw = false) {
        param::ConvBias param;
        param.nonlineMode = param::ConvBias::NonlineMode::RELU;
        param.pad_h = P;
        param.pad_w = P;
        param.stride_h = S;
        param.stride_w = S;
        param.sparse = param::ConvBias::Sparse::DENSE;
        param.format = param::ConvBias::Format::NCHW44;
        auto OH = (H + 2 * P - FS) / static_cast<size_t>(S) + 1;
        auto OW = (W + 2 * P - FS) / static_cast<size_t>(S) + 1;
        TensorShape src = {N, IC / 4, H, W, 4};
        TensorShape filter = {OC / 4, IC / 4, FS, FS, 4, 4};
        if (group > 1) {
            filter = {group, OC / group / 4, IC / group / 4, FS, FS, 4, 4};
            param.sparse = param::ConvBias::Sparse::GROUP;
        }
        if (is_nchw) {
            src = {N, IC, H, W};
            filter = {OC / 4, FS, FS, IC, 4};
        }
        TensorShape bias = {1, OC / 4, 1, 1, 4};
        TensorShape dst = {N, OC / 4, OH, OW, 4};
        SmallVector<TensorShape> shapes{src, filter, bias, {}, dst};
        float computations = (((IC / group) * FS * FS + 1) * dst.total_nr_elems() * 2 +
                              dst.total_nr_elems()) *
                             1e-6;
        std::vector<std::pair<SmallVector<TensorShape>, float>> shape_arg = {
                std::make_pair(shapes, computations)};
        benchmark_impl(
                param, shape_arg, ".+", RUNS, {4, {4, 5, 6, 7}}, {1, {7}}, data_type);
    };
    bench_case(1, 2, 64, 160, 160, 1, 1, 0, 1, true);
    bench_case(1, 3, 64, 224, 224, 7, 1, 3, 2, true);
    bench_case(1, 64, 64, 56, 56, 3, 1, 1, 1);
    bench_case(1, 128, 128, 28, 28, 3, 1, 1, 1);
    bench_case(1, 256, 256, 14, 14, 3, 1, 1, 1);
    bench_case(1, 512, 512, 7, 7, 3, 1, 1, 1);
    bench_case(1, 64, 64, 56, 56, 3, 4, 1, 1);
    bench_case(1, 128, 128, 28, 28, 3, 4, 1, 1);
    bench_case(1, 256, 256, 14, 14, 3, 4, 1, 1);
    bench_case(1, 512, 512, 7, 7, 3, 4, 1, 1);
    bench_case(1, 4, 64, 224, 224, 7, 1, 1, 2);
    bench_case(1, 256, 128, 56, 56, 3, 1, 1, 2);
    bench_case(1, 512, 256, 28, 28, 3, 1, 1, 2);
    bench_case(1, 4, 32, 224, 224, 3, 1, 1, 2);
    bench_case(1, 256, 128, 56, 56, 3, 4, 1, 2);
    bench_case(1, 512, 256, 28, 28, 3, 4, 1, 2);
}
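// Int8 NCHW44 benchmarks using the Armv8.2 dot-product (SDOT) kernels.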
#if MGB_ENABLE_DOT
TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_INT8_NCHW44_DOT) {
    constexpr size_t RUNS = 40;
    std::vector<DType> data_type = {
            dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f),
            dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f)};
    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
                          size_t group, size_t P, size_t S, bool is_nchw = false) {
        param::ConvBias param;
        param.nonlineMode = param::ConvBias::NonlineMode::RELU;
        param.pad_h = P;
        param.pad_w = P;
        param.stride_h = S;
        param.stride_w = S;
        param.sparse = param::ConvBias::Sparse::DENSE;
        param.format = param::ConvBias::Format::NCHW44_DOT;
        auto OH = (H + 2 * P - FS) / static_cast<size_t>(S) + 1;
        auto OW = (W + 2 * P - FS) / static_cast<size_t>(S) + 1;
        TensorShape src = {N, IC / 4, H, W, 4};
        TensorShape filter = {OC / 4, IC / 4, FS, FS, 4, 4};
        if (group > 1) {
            filter = {group, OC / group / 4, IC / group / 4, FS, FS, 4, 4};
            param.sparse = param::ConvBias::Sparse::GROUP;
        }
        if (is_nchw) {
            src = {N, IC, H, W};
            filter = {OC / 4, FS, FS, IC, 4};
        }
        TensorShape bias = {1, OC / 4, 1, 1, 4};
        TensorShape dst = {N, OC / 4, OH, OW, 4};
        SmallVector<TensorShape> shapes{src, filter, bias, {}, dst};
        float computations = (((IC / group) * FS * FS + 1) * dst.total_nr_elems() * 2 +
                              dst.total_nr_elems()) *
                             1e-6;
        std::vector<std::pair<SmallVector<TensorShape>, float>> shape_arg = {
                std::make_pair(shapes, computations)};
        benchmark_impl(
                param, shape_arg, ".+", RUNS, {4, {4, 5, 6, 7}}, {1, {7}}, data_type);
    };
    bench_case(1, 64, 64, 56, 56, 3, 1, 1, 1);
    bench_case(1, 128, 128, 28, 28, 3, 1, 1, 1);
    bench_case(1, 256, 256, 14, 14, 3, 1, 1, 1);
    bench_case(1, 512, 512, 7, 7, 3, 1, 1, 1);
    bench_case(1, 64, 64, 56, 56, 3, 4, 1, 1);
    bench_case(1, 128, 128, 28, 28, 3, 4, 1, 1);
    bench_case(1, 256, 256, 14, 14, 3, 4, 1, 1);
    bench_case(1, 512, 512, 7, 7, 3, 4, 1, 1);
}
TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_INT8_NCHW44_DOT_S2) {
    constexpr size_t RUNS = 40;
    std::vector<DType> data_type = {
            dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f),
            dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f)};
    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
                          size_t group, size_t P, size_t S, bool is_nchw = false) {
        param::ConvBias param;
        param.nonlineMode = param::ConvBias::NonlineMode::RELU;
        param.pad_h = P;
        param.pad_w = P;
        param.stride_h = S;
        param.stride_w = S;
        param.sparse = param::ConvBias::Sparse::DENSE;
        param.format = param::ConvBias::Format::NCHW44_DOT;
        auto OH = (H + 2 * P - FS) / static_cast<size_t>(S) + 1;
        auto OW = (W + 2 * P - FS) / static_cast<size_t>(S) + 1;
        TensorShape src = {N, IC / 4, H, W, 4};
        TensorShape filter = {OC / 4, IC / 4, FS, FS, 4, 4};
        if (group > 1) {
            filter = {group, OC / group / 4, IC / group / 4, FS, FS, 4, 4};
            param.sparse = param::ConvBias::Sparse::GROUP;
        }
        if (is_nchw) {
            src = {N, IC, H, W};
            filter = {OC / 4, FS, FS, IC, 4};
        }
        TensorShape bias = {1, OC / 4, 1, 1, 4};
        TensorShape dst = {N, OC / 4, OH, OW, 4};
        SmallVector<TensorShape> shapes{src, filter, bias, {}, dst};
        float computations = (((IC / group) * FS * FS + 1) * dst.total_nr_elems() * 2 +
                              dst.total_nr_elems()) *
                             1e-6;
        std::vector<std::pair<SmallVector<TensorShape>, float>> shape_arg = {
                std::make_pair(shapes, computations)};
        benchmark_impl(
                param, shape_arg, ".+", RUNS, {4, {4, 5, 6, 7}}, {1, {7}}, data_type);
    };
    bench_case(1, 64, 64, 56, 56, 3, 1, 1, 2);
    bench_case(1, 64, 64, 128, 128, 3, 1, 1, 2);
    bench_case(1, 64, 64, 256, 256, 3, 1, 1, 2);
    bench_case(1, 64, 64, 156, 156, 3, 1, 1, 2);
    bench_case(1, 128, 128, 28, 28, 3, 1, 1, 2);
    bench_case(1, 256, 256, 14, 14, 3, 1, 1, 2);
    bench_case(1, 512, 512, 7, 7, 3, 1, 1, 2);
    bench_case(1, 64, 64, 56, 56, 3, 4, 1, 2);
    bench_case(1, 128, 128, 28, 28, 3, 4, 1, 2);
    bench_case(1, 256, 256, 14, 14, 3, 4, 1, 2);
    bench_case(1, 512, 512, 7, 7, 3, 4, 1, 2);
}
#endif
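// Float32 NCHW44 convolution benchmarks.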
TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_FLOAT_NCHW44) {
    constexpr size_t RUNS = 40;
    std::vector<DType> data_type = {
            dtype::Float32(), dtype::Float32(), dtype::Float32(), dtype::Float32()};
    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
                          size_t group, size_t P, size_t S, bool is_nchw = false) {
        param::ConvBias param;
        param.nonlineMode = param::ConvBias::NonlineMode::RELU;
        param.pad_h = P;
        param.pad_w = P;
        param.stride_h = S;
        param.stride_w = S;
        param.sparse = param::ConvBias::Sparse::DENSE;
        param.format = param::ConvBias::Format::NCHW44;
        auto OH = (H + 2 * P - FS) / static_cast<size_t>(S) + 1;
        auto OW = (W + 2 * P - FS) / static_cast<size_t>(S) + 1;
        TensorShape src = {N, IC / 4, H, W, 4};
        TensorShape filter = {OC / 4, IC / 4, FS, FS, 4, 4};
        if (group > 1) {
            filter = {group, OC / group / 4, IC / group / 4, FS, FS, 4, 4};
            param.sparse = param::ConvBias::Sparse::GROUP;
        }
        if (is_nchw) {
            src = {N, IC, H, W};
            filter = {OC / 4, FS, FS, IC, 4};
        }
        TensorShape bias = {1, OC / 4, 1, 1, 4};
        TensorShape dst = {N, OC / 4, OH, OW, 4};
        SmallVector<TensorShape> shapes{src, filter, bias, {}, dst};
        float computations = (((IC / group) * FS * FS + 1) * dst.total_nr_elems() * 2 +
                              dst.total_nr_elems()) *
                             1e-6;
        std::vector<std::pair<SmallVector<TensorShape>, float>> shape_arg = {
                std::make_pair(shapes, computations)};
        benchmark_impl(
                param, shape_arg, ".+", RUNS, {4, {4, 5, 6, 7}}, {1, {7}}, data_type);
    };
    bench_case(1, 64, 64, 56, 56, 3, 1, 1, 2);
    bench_case(1, 128, 128, 28, 28, 3, 1, 1, 2);
    bench_case(1, 256, 256, 14, 14, 3, 1, 1, 2);
    bench_case(1, 512, 512, 7, 7, 3, 1, 1, 2);
    bench_case(1, 64, 64, 56, 56, 3, 4, 1, 2);
    bench_case(1, 128, 128, 28, 28, 3, 4, 1, 2);
    bench_case(1, 256, 256, 14, 14, 3, 4, 1, 2);
    bench_case(1, 512, 512, 7, 7, 3, 4, 1, 2);
    bench_case(1, 64, 64, 56 * 2, 56 * 2, 3, 4, 1, 2);
    bench_case(1, 128, 128, 28 * 2, 28 * 2, 3, 4, 1, 2);
    bench_case(1, 256, 256, 14 * 2, 14 * 2, 3, 4, 1, 2);
    bench_case(1, 512, 512, 7 * 2, 7 * 2, 3, 4, 1, 2);
}
TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_INT8_INT8_INT8_STRIDE2) {
    constexpr size_t RUNS = 50;
    param::ConvBias param;
    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
    param.pad_h = 1;
    param.pad_w = 1;
    param.stride_h = 2;
    param.stride_w = 2;
    param.sparse = param::ConvBias::Sparse::GROUP;
    std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
                          size_t group, size_t P, size_t S) {
        SmallVector<TensorShape> shapes{
                {N, IC, H, W},
                {group, OC / group, IC / group, FS, FS},
                {1, OC, 1, 1},
                {},
                {N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1}};
        TensorShape dst{N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1};
        float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
                              dst.total_nr_elems()) *
                             1e-6;
        shapes_and_computation.push_back(std::make_pair(shapes, computations));
    };
    bench_case(1, 32, 32, 200, 200, 3, 4, 1, 2);
    bench_case(1, 32, 32, 200, 200, 3, 32, 1, 2);
    bench_case(1, 32, 32, 128, 128, 3, 4, 1, 2);
    bench_case(1, 32, 32, 128, 128, 3, 32, 1, 2);
    bench_case(1, 32, 32, 100, 100, 3, 4, 1, 2);
    bench_case(1, 32, 32, 100, 100, 3, 32, 1, 2);
    bench_case(1, 32, 32, 80, 80, 3, 4, 1, 2);
    bench_case(1, 32, 32, 80, 80, 3, 32, 1, 2);
    std::string algo_name = "S8STRD2";
    printf("Benchmark S8STRD2_LARGE_GROUP algo\n");
    std::vector<DType> data_type = {
            dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f),
            dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f)};
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
    shapes_and_computation.clear();
    algo_name = "S8STRD2";
    printf("Benchmark S8STRD2_SMALL_GROUP algo\n");
    bench_case(1, 32, 32, 200, 200, 3, 1, 1, 2);
    bench_case(1, 32, 32, 128, 128, 3, 1, 1, 2);
    bench_case(1, 32, 32, 100, 100, 3, 1, 1, 2);
    bench_case(1, 32, 32, 80, 80, 3, 1, 1, 2);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
}
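// Quantized int8 direct convolution benchmarks using the dot-product kernels.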
#if MGB_ENABLE_DOT
TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS,
       BENCHMARK_CONVBIAS_INT8_INT8_INT8_STRIDE1_WITHDOTPROD) {
    constexpr size_t RUNS = 50;
    param::ConvBias param;
    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
    param.pad_h = 1;
    param.pad_w = 1;
    param.stride_h = 1;
    param.stride_w = 1;
    param.sparse = param::ConvBias::Sparse::GROUP;
    std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
                          size_t group, size_t P, size_t S) {
        SmallVector<TensorShape> shapes{
                {N, IC, H, W},
                {group, OC / group, IC / group, FS, FS},
                {1, OC, 1, 1},
                {},
                {N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1}};
        TensorShape dst{N, OC, H, W};
        float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
                              dst.total_nr_elems()) *
                             1e-6;
        shapes_and_computation.push_back(std::make_pair(shapes, computations));
    };
    bench_case(1, 32, 32, 200, 200, 3, 4, 1, 1);
    bench_case(1, 32, 32, 200, 200, 3, 32, 1, 1);
    bench_case(1, 32, 32, 128, 128, 3, 4, 1, 1);
    bench_case(1, 32, 32, 128, 128, 3, 32, 1, 1);
    bench_case(1, 32, 32, 100, 100, 3, 4, 1, 1);
    bench_case(1, 32, 32, 100, 100, 3, 32, 1, 1);
    bench_case(1, 32, 32, 80, 80, 3, 4, 1, 1);
    bench_case(1, 32, 32, 80, 80, 3, 32, 1, 1);
    std::string algo_name = "ARMDOTS8STRD1";
    printf("Benchmark ARMDOTS8STRD1_LARGE_GROUP algo\n");
    std::vector<DType> data_type = {
            dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f),
            dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f)};
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
    shapes_and_computation.clear();
    algo_name = "ARMDOTS8STRD1";
    printf("Benchmark ARMDOTS8STRD1_SMALL_GROUP algo\n");
    bench_case(1, 32, 32, 200, 200, 3, 1, 1, 1);
    bench_case(1, 32, 32, 128, 128, 3, 1, 1, 1);
    bench_case(1, 32, 32, 100, 100, 3, 1, 1, 1);
    bench_case(1, 32, 32, 80, 80, 3, 1, 1, 1);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
}
TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS,
       BENCHMARK_CONVBIAS_INT8_INT8_INT8_STRIDE2_WITHDOTPROD) {
    constexpr size_t RUNS = 50;
    param::ConvBias param;
    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
    param.pad_h = 1;
    param.pad_w = 1;
    param.stride_h = 2;
    param.stride_w = 2;
    param.sparse = param::ConvBias::Sparse::GROUP;
    std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
                          size_t group, size_t P, size_t S) {
        SmallVector<TensorShape> shapes{
                {N, IC, H, W},
                {group, OC / group, IC / group, FS, FS},
                {1, OC, 1, 1},
                {},
                {N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1}};
        TensorShape dst{N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1};
        float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
                              dst.total_nr_elems()) *
                             1e-6;
        shapes_and_computation.push_back(std::make_pair(shapes, computations));
    };
    bench_case(1, 32, 32, 200, 200, 3, 4, 1, 2);
    bench_case(1, 32, 32, 200, 200, 3, 32, 1, 2);
    bench_case(1, 32, 32, 128, 128, 3, 4, 1, 2);
    bench_case(1, 32, 32, 128, 128, 3, 32, 1, 2);
    bench_case(1, 32, 32, 100, 100, 3, 4, 1, 2);
    bench_case(1, 32, 32, 100, 100, 3, 32, 1, 2);
    bench_case(1, 32, 32, 80, 80, 3, 4, 1, 2);
    bench_case(1, 32, 32, 80, 80, 3, 32, 1, 2);
    std::string algo_name = "ARMDOTS8STRD2";
    printf("Benchmark ARMDOTS8STRD2_LARGE_GROUP algo\n");
    std::vector<DType> data_type = {
            dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f),
            dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f)};
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
    shapes_and_computation.clear();
    algo_name = "ARMDOTS8STRD2";
    printf("Benchmark ARMDOTS8STRD2_SMALL_GROUP algo\n");
    bench_case(1, 32, 32, 200, 200, 3, 1, 1, 2);
    bench_case(1, 32, 32, 128, 128, 3, 1, 1, 2);
    bench_case(1, 32, 32, 100, 100, 3, 1, 1, 2);
    bench_case(1, 32, 32, 80, 80, 3, 1, 1, 2);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
}
#endif
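// Quantized asymmetric uint8 (Quantized8Asymm) direct convolution benchmarks.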
TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS,
       BENCHMARK_CONVBIAS_QUINT8_QUINT8_QUINT8_STRIDE1) {
    constexpr size_t RUNS = 50;
    param::ConvBias param;
    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
    param.pad_h = 1;
    param.pad_w = 1;
    param.stride_h = 1;
    param.stride_w = 1;
    param.sparse = param::ConvBias::Sparse::GROUP;
    std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
                          size_t group, size_t P, size_t S) {
        SmallVector<TensorShape> shapes{
                {N, IC, H, W},
                {group, OC / group, IC / group, FS, FS},
                {1, OC, 1, 1},
                {},
                {N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1}};
        TensorShape dst{N, OC, H, W};
        float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
                              dst.total_nr_elems()) *
                             1e-6;
        shapes_and_computation.push_back(std::make_pair(shapes, computations));
    };
    bench_case(1, 32, 32, 200, 200, 3, 4, 1, 1);
    bench_case(1, 32, 32, 200, 200, 3, 32, 1, 1);
    bench_case(1, 32, 32, 128, 128, 3, 4, 1, 1);
    bench_case(1, 32, 32, 128, 128, 3, 32, 1, 1);
    bench_case(1, 32, 32, 100, 100, 3, 4, 1, 1);
    bench_case(1, 32, 32, 100, 100, 3, 32, 1, 1);
    bench_case(1, 32, 32, 80, 80, 3, 4, 1, 1);
    bench_case(1, 32, 32, 80, 80, 3, 32, 1, 1);
    std::string algo_name = "QU8STRD1";
    printf("Benchmark QU8STRD1_LARGE_GROUP algo\n");
    std::vector<DType> data_type = {
            dtype::Quantized8Asymm(0.2f, 100), dtype::Quantized8Asymm(0.2f, 120),
            dtype::QuantizedS32(0.04f), dtype::Quantized8Asymm(1.4f, 110)};
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
    shapes_and_computation.clear();
    algo_name = "QU8STRD1";
    printf("Benchmark QU8STRD1_SMALL_GROUP algo\n");
    bench_case(1, 32, 32, 200, 200, 3, 1, 1, 1);
    bench_case(1, 32, 32, 128, 128, 3, 1, 1, 1);
    bench_case(1, 32, 32, 100, 100, 3, 1, 1, 1);
    bench_case(1, 32, 32, 80, 80, 3, 1, 1, 1);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
}
TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS,
       BENCHMARK_CONVBIAS_QUINT8_QUINT8_QUINT8_STRIDE2) {
    constexpr size_t RUNS = 50;
    param::ConvBias param;
    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
    param.pad_h = 1;
    param.pad_w = 1;
    param.stride_h = 2;
    param.stride_w = 2;
    param.sparse = param::ConvBias::Sparse::GROUP;
    std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
                          size_t group, size_t P, size_t S) {
        SmallVector<TensorShape> shapes{
                {N, IC, H, W},
                {group, OC / group, IC / group, FS, FS},
                {1, OC, 1, 1},
                {},
                {N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1}};
        TensorShape dst{N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1};
        float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
                              dst.total_nr_elems()) *
                             1e-6;
        shapes_and_computation.push_back(std::make_pair(shapes, computations));
    };
    bench_case(1, 32, 32, 200, 200, 3, 4, 1, 2);
    bench_case(1, 32, 32, 200, 200, 3, 32, 1, 2);
    bench_case(1, 32, 32, 128, 128, 3, 4, 1, 2);
    bench_case(1, 32, 32, 128, 128, 3, 32, 1, 2);
    bench_case(1, 32, 32, 100, 100, 3, 4, 1, 2);
    bench_case(1, 32, 32, 100, 100, 3, 32, 1, 2);
    bench_case(1, 32, 32, 80, 80, 3, 4, 1, 2);
    bench_case(1, 32, 32, 80, 80, 3, 32, 1, 2);
    std::string algo_name = "QU8STRD2";
    printf("Benchmark QU8STRD2_LARGE_GROUP algo\n");
    std::vector<DType> data_type = {
            dtype::Quantized8Asymm(0.2f, 100), dtype::Quantized8Asymm(0.2f, 120),
            dtype::QuantizedS32(0.04f), dtype::Quantized8Asymm(1.4f, 110)};
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
    shapes_and_computation.clear();
    algo_name = "QU8STRD2";
    printf("Benchmark QU8STRD2_SMALL_GROUP algo\n");
    bench_case(1, 32, 32, 200, 200, 3, 1, 1, 2);
    bench_case(1, 32, 32, 128, 128, 3, 1, 1, 2);
    bench_case(1, 32, 32, 100, 100, 3, 1, 1, 2);
    bench_case(1, 32, 32, 80, 80, 3, 1, 1, 2);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
}
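// Quantized uint8 direct convolution benchmarks using the dot-product kernels.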
  1086. #if MGB_ENABLE_DOT
  1087. TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS,
  1088. BENCHMARK_CONVBIAS_QUINT8_QUINT8_QUINT8_STRIDE1_WITHDOTPROD) {
  1089. constexpr size_t RUNS = 50;
  1090. param::ConvBias param;
  1091. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  1092. param.pad_h = 1;
  1093. param.pad_w = 1;
  1094. param.stride_h = 1;
  1095. param.stride_w = 1;
  1096. param.sparse = param::ConvBias::Sparse::GROUP;
  1097. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  1098. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  1099. size_t group, size_t P, size_t S) {
  1100. SmallVector<TensorShape> shapes{
  1101. {N, IC, H, W},
  1102. {group, OC / group, IC / group, FS, FS},
  1103. {1, OC, 1, 1},
  1104. {},
  1105. {N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1}};
  1106. TensorShape dst{N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1};
  1107. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  1108. dst.total_nr_elems()) *
  1109. 1e-6;
  1110. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  1111. };
  1112. bench_case(1, 32, 32, 200, 200, 3, 4, 1, 1);
  1113. bench_case(1, 32, 32, 200, 200, 3, 32, 1, 1);
  1114. bench_case(1, 32, 32, 128, 128, 3, 4, 1, 1);
  1115. bench_case(1, 32, 32, 128, 128, 3, 32, 1, 1);
  1116. bench_case(1, 32, 32, 100, 100, 3, 4, 1, 1);
  1117. bench_case(1, 32, 32, 100, 100, 3, 32, 1, 1);
  1118. bench_case(1, 32, 32, 80, 80, 3, 4, 1, 1);
  1119. bench_case(1, 32, 32, 80, 80, 3, 32, 1, 1);
  1120. std::string algo_name = "ARMDOTU8STRD1";
  1121. printf("Benchmark ARMDOTU8STRD1_LARGE_GROUP algo\n");
  1122. std::vector<DType> data_type = {
  1123. dtype::Quantized8Asymm(0.2f, 100), dtype::Quantized8Asymm(0.2f, 120),
  1124. dtype::QuantizedS32(0.04f), dtype::Quantized8Asymm(1.4f, 110)};
  1125. benchmark_impl(
  1126. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  1127. data_type);
  1128. benchmark_impl(
  1129. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  1130. data_type);
  1131. benchmark_impl(
  1132. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  1133. data_type);
  1134. shapes_and_computation.clear();
  1135. algo_name = "ARMDOTU8STRD1";
  1136. printf("Benchmark ARMDOTS8STRD1_SMALL_GROUP algo\n");
  1137. bench_case(1, 32, 32, 200, 200, 3, 1, 1, 1);
  1138. bench_case(1, 32, 32, 128, 128, 3, 1, 1, 1);
  1139. bench_case(1, 32, 32, 100, 100, 3, 1, 1, 1);
  1140. bench_case(1, 32, 32, 80, 80, 3, 1, 1, 1);
  1141. benchmark_impl(
  1142. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  1143. data_type);
  1144. benchmark_impl(
  1145. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  1146. data_type);
  1147. benchmark_impl(
  1148. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  1149. data_type);
  1150. }
  1151. TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS,
  1152. BENCHMARK_CONVBIAS_QUINT8_QUINT8_QUINT8_STRIDE2_WITHDOTPROD) {
  1153. constexpr size_t RUNS = 50;
  1154. param::ConvBias param;
  1155. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  1156. param.pad_h = 1;
  1157. param.pad_w = 1;
  1158. param.stride_h = 2;
  1159. param.stride_w = 2;
  1160. param.sparse = param::ConvBias::Sparse::GROUP;
  1161. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  1162. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  1163. size_t group, size_t P, size_t S) {
  1164. SmallVector<TensorShape> shapes{
  1165. {N, IC, H, W},
  1166. {group, OC / group, IC / group, FS, FS},
  1167. {1, OC, 1, 1},
  1168. {},
  1169. {N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1}};
  1170. TensorShape dst{N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1};
  1171. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  1172. dst.total_nr_elems()) *
  1173. 1e-6;
  1174. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  1175. };
  1176. bench_case(1, 32, 32, 200, 200, 5, 4, 1, 2);
  1177. bench_case(1, 32, 32, 200, 200, 5, 32, 1, 2);
  1178. bench_case(1, 32, 32, 128, 128, 5, 4, 1, 2);
  1179. bench_case(1, 32, 32, 128, 128, 5, 32, 1, 2);
  1180. bench_case(1, 32, 32, 100, 100, 5, 4, 1, 2);
  1181. bench_case(1, 32, 32, 100, 100, 5, 32, 1, 2);
  1182. bench_case(1, 32, 32, 80, 80, 5, 4, 1, 2);
  1183. bench_case(1, 32, 32, 80, 80, 5, 32, 1, 2);
  1184. std::string algo_name = "ARMDOTU8STRD2";
  1185. printf("Benchmark ARMDOTU8STRD2_LARGE_GROUP algo\n");
  1186. std::vector<DType> data_type = {
  1187. dtype::Quantized8Asymm(0.2f, 100), dtype::Quantized8Asymm(0.2f, 120),
  1188. dtype::QuantizedS32(0.04f), dtype::Quantized8Asymm(1.4f, 110)};
  1189. benchmark_impl(
  1190. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  1191. data_type);
  1192. benchmark_impl(
  1193. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  1194. data_type);
  1195. benchmark_impl(
  1196. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  1197. data_type);
  1198. shapes_and_computation.clear();
  1199. algo_name = "ARMDOTU8STRD2";
  1200. printf("Benchmark ARMDOTU8STRD2_SMALL_GROUP algo\n");
  1201. bench_case(1, 32, 32, 200, 200, 5, 1, 1, 2);
  1202. bench_case(1, 32, 32, 128, 128, 5, 1, 1, 2);
  1203. bench_case(1, 32, 32, 100, 100, 5, 1, 1, 2);
  1204. bench_case(1, 32, 32, 80, 80, 5, 1, 1, 2);
  1205. benchmark_impl(
  1206. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  1207. data_type);
  1208. benchmark_impl(
  1209. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  1210. data_type);
  1211. benchmark_impl(
  1212. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  1213. data_type);
  1214. }
  1215. #endif
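// The Winograd tests pick the algorithm string per target ISA: the name
// embeds the underlying MatMul kernel (AArch64 vs. ARMv7 variants), and the
// trailing colon-separated numbers appear to be Winograd tiling parameters,
// so they are kept exactly as the algorithm strings expect.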
  1216. TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_WINOGRAD_F32) {
  1217. constexpr size_t RUNS = 50;
  1218. param::ConvBias param;
  1219. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  1220. param.pad_h = 1;
  1221. param.pad_w = 1;
  1222. param.stride_h = 1;
  1223. param.stride_w = 1;
  1224. param.sparse = param::ConvBias::Sparse::GROUP;
  1225. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  1226. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  1227. size_t group) {
  1228. SmallVector<TensorShape> shapes{
  1229. {N, IC, H, W},
  1230. {group, OC / group, IC / group, FS, FS},
  1231. {1, OC, 1, 1},
  1232. {},
  1233. {N, OC, H, W}};
  1234. TensorShape dst{N, OC, H, W};
  1235. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  1236. dst.total_nr_elems()) *
  1237. 1e-6;
  1238. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  1239. };
  1240. bench_case(1, 32, 32, 200, 200, 3, 4);
  1241. bench_case(1, 32, 32, 200, 200, 3, 1);
  1242. bench_case(1, 32, 32, 128, 128, 3, 4);
  1243. bench_case(1, 32, 32, 128, 128, 3, 1);
  1244. bench_case(1, 32, 32, 100, 100, 3, 4);
  1245. bench_case(1, 32, 32, 100, 100, 3, 1);
  1246. bench_case(1, 32, 32, 80, 80, 3, 4);
  1247. bench_case(1, 512, 512, 14, 14, 3, 1);
  1248. bench_case(1, 512, 256, 14, 14, 3, 1);
  1249. bench_case(1, 512, 128, 14, 14, 3, 1);
  1250. bench_case(1, 512, 64, 14, 14, 3, 1);
  1251. bench_case(1, 512, 512, 7, 7, 3, 1);
  1252. bench_case(1, 512, 256, 7, 7, 3, 1);
  1253. bench_case(1, 512, 128, 7, 7, 3, 1);
  1254. bench_case(1, 512, 64, 7, 7, 3, 1);
  1255. std::string algo_name;
  1256. #if MEGDNN_AARCH64
  1257. algo_name = "WINOGRAD:AARCH64_F32_MK4_4x16:4:2";
  1258. #else
  1259. algo_name = "WINOGRAD:ARMV7_F32_MK4_4x8:4:2";
  1260. #endif
  1261. std::vector<DType> data_type = {
  1262. dtype::Float32(), dtype::Float32(), dtype::Float32(), dtype::Float32()};
  1263. printf("Benchmark WINOGRAD_F32_MK4 algo\n");
  1264. benchmark_impl(
  1265. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  1266. data_type);
  1267. benchmark_impl(
  1268. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  1269. data_type);
  1270. benchmark_impl(
  1271. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  1272. data_type);
  1273. }
  1274. TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_WINOGRAD_INT8) {
  1275. constexpr size_t RUNS = 50;
  1276. param::ConvBias param;
  1277. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  1278. param.pad_h = 1;
  1279. param.pad_w = 1;
  1280. param.stride_h = 1;
  1281. param.stride_w = 1;
  1282. param.sparse = param::ConvBias::Sparse::GROUP;
  1283. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  1284. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  1285. size_t group) {
  1286. SmallVector<TensorShape> shapes{
  1287. {N, IC, H, W},
  1288. {group, OC / group, IC / group, FS, FS},
  1289. {1, OC, 1, 1},
  1290. {},
  1291. {N, OC, H, W}};
  1292. TensorShape dst{N, OC, H, W};
  1293. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  1294. dst.total_nr_elems()) *
  1295. 1e-6;
  1296. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  1297. };
  1298. bench_case(1, 32, 32, 200, 200, 3, 4);
  1299. bench_case(1, 32, 32, 200, 200, 3, 1);
  1300. bench_case(1, 32, 32, 128, 128, 3, 4);
  1301. bench_case(1, 32, 32, 128, 128, 3, 1);
  1302. bench_case(1, 32, 32, 100, 100, 3, 4);
  1303. bench_case(1, 32, 32, 100, 100, 3, 1);
  1304. bench_case(1, 32, 32, 80, 80, 3, 4);
  1305. bench_case(1, 512, 512, 14, 14, 3, 1);
  1306. bench_case(1, 512, 256, 14, 14, 3, 1);
  1307. bench_case(1, 512, 128, 14, 14, 3, 1);
  1308. bench_case(1, 512, 64, 14, 14, 3, 1);
  1309. bench_case(1, 512, 512, 7, 7, 3, 1);
  1310. bench_case(1, 512, 256, 7, 7, 3, 1);
  1311. bench_case(1, 512, 128, 7, 7, 3, 1);
  1312. bench_case(1, 512, 64, 7, 7, 3, 1);
  1313. std::string algo_name;
  1314. #if MEGDNN_AARCH64
  1315. algo_name = "WINOGRAD:AARCH64_INT16X16X32_MK8_8X8:8:2:32";
  1316. #else
  1317. algo_name = "WINOGRAD:ARMV7_INT16X16X32_MK8_4X8:8:2:32";
  1318. #endif
  1319. std::vector<DType> data_type = {
  1320. dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f),
  1321. dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f)};
  1322. printf("Benchmark WINOGRAD_IN8_MK8 algo\n");
  1323. benchmark_impl(
  1324. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  1325. data_type);
  1326. benchmark_impl(
  1327. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  1328. data_type);
  1329. benchmark_impl(
  1330. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  1331. data_type);
  1332. }
  1333. TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS,
  1334. BENCHMARK_CONVBIAS_WINOGRAD_NCHW44_INT8_MK8) {
  1335. constexpr size_t RUNS = 50;
  1336. param::ConvBias param;
  1337. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  1338. param.pad_h = 1;
  1339. param.pad_w = 1;
  1340. param.stride_h = 1;
  1341. param.stride_w = 1;
  1342. param.sparse = param::ConvBias::Sparse::DENSE;
  1343. param.format = param::ConvBias::Format::NCHW44;
  1344. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  1345. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  1346. size_t group) {
  1347. SmallVector<TensorShape> shapes{
  1348. {N, IC / 4, H, W, 4},
  1349. {OC / 4, IC / 4, FS, FS, 4, 4},
  1350. {1, OC / 4, 1, 1, 4},
  1351. {},
  1352. {N, OC / 4, H, W, 4}};
  1353. TensorShape dst{N, OC, H, W};
  1354. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  1355. dst.total_nr_elems()) *
  1356. 1e-6;
  1357. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  1358. };
  1359. bench_case(1, 32, 32, 200, 200, 3, 1);
  1360. bench_case(1, 32, 32, 128, 128, 3, 1);
  1361. bench_case(1, 32, 32, 100, 100, 3, 1);
  1362. bench_case(1, 512, 512, 14, 14, 3, 1);
  1363. bench_case(1, 512, 256, 14, 14, 3, 1);
  1364. bench_case(1, 512, 128, 14, 14, 3, 1);
  1365. bench_case(1, 512, 64, 14, 14, 3, 1);
  1366. bench_case(1, 512, 512, 7, 7, 3, 1);
  1367. bench_case(1, 512, 256, 7, 7, 3, 1);
  1368. bench_case(1, 512, 128, 7, 7, 3, 1);
  1369. bench_case(1, 512, 64, 7, 7, 3, 1);
  1370. std::string algo_name;
  1371. #if MEGDNN_AARCH64
  1372. algo_name = "WINOGRAD_NCHW44:AARCH64_INT16X16X32_MK8_8X8:8:2:32";
  1373. #else
  1374. algo_name = "WINOGRAD_NCHW44:ARMV7_INT16X16X32_MK8_4X8:8:2:32";
  1375. #endif
  1376. std::vector<DType> data_type = {
  1377. dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f),
  1378. dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f)};
  1379. printf("Benchmark WINOGRAD_INT8_MK8 algo\n");
  1380. benchmark_impl(
  1381. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  1382. data_type);
  1383. benchmark_impl(
  1384. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  1385. data_type);
  1386. benchmark_impl(
  1387. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  1388. data_type);
  1389. }
  1390. TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS,
  1391. BENCHMARK_CONVBIAS_WINOGRAD_NCHW44_INT8_COMP_F32) {
  1392. constexpr size_t RUNS = 50;
  1393. param::ConvBias param;
  1394. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  1395. param.pad_h = 1;
  1396. param.pad_w = 1;
  1397. param.stride_h = 1;
  1398. param.stride_w = 1;
1399. param.sparse = param::ConvBias::Sparse::DENSE;
  1400. param.format = param::ConvBias::Format::NCHW44;
  1401. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  1402. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  1403. size_t group) {
  1404. SmallVector<TensorShape> shapes{
  1405. {N, IC / 4, H, W, 4},
  1406. {OC / 4, IC / 4, FS, FS, 4, 4},
  1407. {1, OC / 4, 1, 1, 4},
  1408. {},
  1409. {N, OC / 4, H, W, 4}};
  1410. TensorShape dst{N, OC, H, W};
  1411. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  1412. dst.total_nr_elems()) *
  1413. 1e-6;
  1414. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  1415. };
  1416. bench_case(1, 32, 32, 200, 200, 3, 1);
  1417. bench_case(1, 32, 32, 128, 128, 3, 1);
  1418. bench_case(1, 32, 32, 100, 100, 3, 1);
  1419. bench_case(1, 512, 512, 14, 14, 3, 1);
  1420. bench_case(1, 512, 256, 14, 14, 3, 1);
  1421. bench_case(1, 512, 128, 14, 14, 3, 1);
  1422. bench_case(1, 512, 64, 14, 14, 3, 1);
  1423. bench_case(1, 512, 512, 7, 7, 3, 1);
  1424. bench_case(1, 512, 256, 7, 7, 3, 1);
  1425. bench_case(1, 512, 128, 7, 7, 3, 1);
  1426. bench_case(1, 512, 64, 7, 7, 3, 1);
  1427. std::string algo_name;
  1428. #if MEGDNN_AARCH64
  1429. algo_name = "WINOGRAD_NCHW44:AARCH64_F32_MK4_4x16:4:2:32";
  1430. #else
  1431. algo_name = "WINOGRAD_NCHW44:ARMV7_F32_MK4_4x8:4:2:32";
  1432. #endif
  1433. std::vector<DType> data_type = {
  1434. dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f),
  1435. dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f)};
  1436. printf("Benchmark WINOGRAD_INT8_NCHW44_MK4_COMP_F32 algo\n");
  1437. benchmark_impl(
  1438. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  1439. data_type);
  1440. benchmark_impl(
  1441. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  1442. data_type);
  1443. benchmark_impl(
  1444. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  1445. data_type);
  1446. }
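// The IM2COL tests below sweep the trailing number in the algorithm string
// (":96", ":192", ":384"); it appears to control the im2col tile size, so
// the same shape set is benchmarked once per setting.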
  1447. TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_IM2COL_FP32) {
  1448. constexpr size_t RUNS = 50;
  1449. param::ConvBias param;
  1450. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  1451. param.pad_h = 1;
  1452. param.pad_w = 1;
  1453. param.stride_h = 1;
  1454. param.stride_w = 1;
  1455. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  1456. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  1457. size_t group) {
  1458. SmallVector<TensorShape> shapes{
  1459. {N, IC, H, W},
  1460. {OC, IC / group, FS, FS},
  1461. {1, OC, 1, 1},
  1462. {},
  1463. {N, OC, H, W}};
  1464. TensorShape dst{N, OC, H, W};
  1465. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  1466. dst.total_nr_elems()) *
  1467. 1e-6;
  1468. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  1469. };
  1470. std::vector<DType> data_type = {
  1471. dtype::Float32(), dtype::Float32(), dtype::Float32(), dtype::Float32()};
  1472. bench_case(1, 32, 32, 300, 300, 3, 1);
  1473. bench_case(1, 32, 32, 400, 400, 3, 1);
  1474. bench_case(1, 32, 32, 100, 100, 3, 1);
  1475. bench_case(1, 32, 32, 80, 80, 3, 1);
  1476. bench_case(1, 32, 64, 200, 200, 3, 1);
  1477. bench_case(1, 32, 64, 128, 128, 3, 1);
  1478. bench_case(1, 32, 64, 100, 100, 3, 1);
  1479. bench_case(1, 32, 64, 80, 80, 3, 1);
  1480. bench_case(1, 32, 128, 200, 200, 3, 1);
  1481. bench_case(1, 32, 128, 128, 128, 3, 1);
  1482. bench_case(1, 32, 128, 100, 100, 3, 1);
  1483. bench_case(1, 32, 128, 80, 80, 3, 1);
  1484. bench_case(1, 64, 32, 7, 7, 3, 1);
  1485. bench_case(1, 64, 64, 7, 7, 3, 1);
  1486. bench_case(1, 64, 128, 7, 7, 3, 1);
  1487. bench_case(1, 64, 256, 7, 7, 3, 1);
  1488. bench_case(1, 64, 512, 7, 7, 3, 1);
  1489. bench_case(1, 64, 1024, 7, 7, 3, 1);
  1490. bench_case(1, 64, 32, 14, 14, 3, 1);
  1491. bench_case(1, 64, 64, 14, 14, 3, 1);
  1492. bench_case(1, 64, 128, 14, 14, 3, 1);
  1493. bench_case(1, 64, 256, 14, 14, 3, 1);
  1494. bench_case(1, 64, 512, 14, 14, 3, 1);
  1495. bench_case(1, 64, 1024, 14, 14, 3, 1);
  1496. bench_case(1, 128, 128, 14, 14, 3, 1);
  1497. bench_case(1, 128, 256, 14, 14, 3, 1);
  1498. bench_case(1, 512, 512, 14, 14, 3, 1);
  1499. bench_case(1, 256, 512, 14, 14, 3, 1);
  1500. bench_case(1, 512, 1024, 14, 14, 3, 1);
  1501. bench_case(1, 1024, 1024, 14, 14, 3, 1);
  1502. std::string algo_name = "IM2COLMATMUL:AARCH64_F32K8X12X1:96";
  1503. printf("Benchmark IM2COLMATMUL:AARCH64_F32K8X12X1algo:96\n");
  1504. benchmark_impl(
  1505. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  1506. data_type);
  1507. benchmark_impl(
  1508. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  1509. data_type);
  1510. benchmark_impl(
  1511. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  1512. data_type);
  1513. algo_name = "IM2COLMATMUL:AARCH64_F32K8X12X1:192";
  1514. printf("Benchmark IM2COLMATMUL:AARCH64_F32K8X12X1algo:192\n");
  1515. benchmark_impl(
  1516. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  1517. data_type);
  1518. benchmark_impl(
  1519. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  1520. data_type);
  1521. benchmark_impl(
  1522. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  1523. data_type);
  1524. algo_name = "IM2COLMATMUL:AARCH64_F32K8X12X1:384";
  1525. printf("Benchmark IM2COLMATMUL:AARCH64_F32K8X12X1algo:384\n");
  1526. benchmark_impl(
  1527. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  1528. data_type);
  1529. benchmark_impl(
  1530. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  1531. data_type);
  1532. benchmark_impl(
  1533. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  1534. data_type);
  1535. shapes_and_computation.clear();
  1536. }
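// Contrast benchmark: the same shapes are built as FP16 NCHW88 args (MK8
// matmul) and FP32 NCHW44 args (MK4 matmul) and compared via
// benchmark_with_contrast.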
  1537. TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_IM2COL_NCHW44_VS_NCHW88) {
  1538. constexpr size_t RUNS = 50;
  1539. using NLMode = param::ConvBias::NonlineMode;
  1540. std::vector<conv_bias::TestArg> args_nchw88, args_nchw44;
  1541. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  1542. size_t group) {
  1543. param::ConvBias param_nchw88, param_nchw44;
  1544. param_nchw88.format = param::ConvBias::Format::NCHW88;
  1545. param_nchw44.format = param::ConvBias::Format::NCHW44;
  1546. for (size_t pad : {1, 2, 4}) {
  1547. for (size_t stride : {1, 2, 3}) {
  1548. for (auto nlmode :
  1549. {NLMode::RELU, NLMode::IDENTITY, NLMode::SIGMOID,
  1550. NLMode::H_SWISH}) {
  1551. param_nchw88.nonlineMode = nlmode;
  1552. param_nchw88.pad_h = pad;
  1553. param_nchw88.pad_w = pad;
  1554. param_nchw88.stride_h = stride;
  1555. param_nchw88.stride_w = stride;
  1556. param_nchw44.nonlineMode = nlmode;
  1557. param_nchw44.pad_h = pad;
  1558. param_nchw44.pad_w = pad;
  1559. param_nchw44.stride_h = stride;
  1560. param_nchw44.stride_w = stride;
  1561. args_nchw88.emplace_back(
  1562. param_nchw88, TensorShape{N, IC / 8, H, W, 8},
  1563. TensorShape{OC / 8, IC / group / 8, FS, FS, 8, 8},
  1564. TensorShape{1, OC / 8, 1, 1, 8});
  1565. args_nchw44.emplace_back(
  1566. param_nchw44, TensorShape{N, IC / 4, H, W, 4},
  1567. TensorShape{OC / 4, IC / group / 4, FS, FS, 4, 4},
  1568. TensorShape{1, OC / 4, 1, 1, 4});
  1569. }
  1570. }
  1571. }
  1572. };
  1573. std::vector<DType> data_type_fp16 = {
  1574. dtype::Float16(), dtype::Float16(), dtype::Float16(), dtype::Float16()};
  1575. std::vector<DType> data_type_fp32 = {
  1576. dtype::Float32(), dtype::Float32(), dtype::Float32(), dtype::Float32()};
  1577. bench_case(1, 32, 32, 300, 300, 3, 1);
  1578. bench_case(1, 32, 32, 400, 400, 3, 1);
  1579. bench_case(1, 32, 32, 100, 100, 3, 1);
  1580. bench_case(1, 32, 32, 80, 80, 3, 1);
  1581. bench_case(1, 32, 64, 200, 200, 3, 1);
  1582. bench_case(1, 32, 64, 128, 128, 3, 1);
  1583. bench_case(1, 32, 64, 100, 100, 3, 1);
  1584. bench_case(1, 32, 64, 80, 80, 3, 1);
  1585. bench_case(1, 32, 128, 200, 200, 3, 1);
  1586. bench_case(1, 32, 128, 128, 128, 3, 1);
  1587. bench_case(1, 32, 128, 100, 100, 3, 1);
  1588. bench_case(1, 32, 128, 80, 80, 3, 1);
  1589. bench_case(1, 64, 32, 7, 7, 3, 1);
  1590. bench_case(1, 64, 64, 7, 7, 3, 1);
  1591. bench_case(1, 64, 128, 7, 7, 3, 1);
  1592. bench_case(1, 64, 256, 7, 7, 3, 1);
  1593. bench_case(1, 64, 512, 7, 7, 3, 1);
  1594. bench_case(1, 64, 1024, 7, 7, 3, 1);
  1595. bench_case(1, 64, 32, 14, 14, 3, 1);
  1596. bench_case(1, 64, 64, 14, 14, 3, 1);
  1597. bench_case(1, 64, 128, 14, 14, 3, 1);
  1598. bench_case(1, 64, 256, 14, 14, 3, 1);
  1599. bench_case(1, 64, 512, 14, 14, 3, 1);
  1600. bench_case(1, 64, 1024, 14, 14, 3, 1);
  1601. bench_case(1, 128, 128, 14, 14, 3, 1);
  1602. bench_case(1, 128, 256, 14, 14, 3, 1);
  1603. bench_case(1, 512, 512, 14, 14, 3, 1);
  1604. bench_case(1, 256, 512, 14, 14, 3, 1);
  1605. bench_case(1, 512, 1024, 14, 14, 3, 1);
  1606. bench_case(1, 1024, 1024, 14, 14, 3, 1);
  1607. std::string algo_name_nchw88 = "IM2COLMATMUL:AARCH64_F16_MK8_16X12X1:96";
  1608. std::string algo_name_nchw44 = "IM2COLMATMUL:AARCH64_F32_MK4_K8X12X1:96";
  1609. benchmark_with_contrast(
  1610. args_nchw88, algo_name_nchw88, data_type_fp16, args_nchw44,
  1611. algo_name_nchw44, data_type_fp32, RUNS, {1, {4}});
  1612. }
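// Channel-wise (depthwise) cases: group == IC == OC, so each NCHW44 channel
// block is convolved with its own single-input-channel filter.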
  1613. TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS,
  1614. BENCHMARK_CHANNEL_WISE_INT8_INT8_INT8_STRIDE1) {
  1615. constexpr size_t RUNS = 50;
  1616. param::ConvBias param;
  1617. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  1618. param.pad_h = 1;
  1619. param.pad_w = 1;
  1620. param.stride_h = 1;
  1621. param.stride_w = 1;
  1622. param.sparse = param::ConvBias::Sparse::GROUP;
  1623. param.format = param::ConvBias::Format::NCHW44;
  1624. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  1625. auto bench_case = [&](size_t N, size_t IC, size_t H, size_t W, size_t FS,
  1626. size_t P) {
  1627. size_t group = IC;
  1628. size_t OC = IC;
  1629. size_t S = 1;
  1630. SmallVector<TensorShape> shapes{
  1631. {N, IC, H, W, 4},
  1632. {group, 1, 1, FS, FS, 4},
  1633. {1, OC, 1, 1, 4},
  1634. {},
  1635. {N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1, 4}};
  1636. TensorShape dst{N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1, 4};
  1637. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  1638. dst.total_nr_elems()) *
  1639. 1e-6;
  1640. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  1641. };
  1642. bench_case(1, 128, 200, 200, 3, 1);
  1643. bench_case(1, 128, 128, 128, 3, 1);
  1644. bench_case(1, 128, 100, 100, 3, 1);
  1645. bench_case(1, 128, 80, 80, 3, 1);
  1646. bench_case(1, 128, 56, 56, 3, 1);
  1647. bench_case(1, 128, 28, 28, 3, 1);
  1648. bench_case(1, 128, 14, 14, 3, 1);
  1649. bench_case(1, 64, 200, 200, 3, 1);
  1650. bench_case(1, 64, 128, 128, 3, 1);
  1651. bench_case(1, 64, 100, 100, 3, 1);
  1652. bench_case(1, 64, 80, 80, 3, 1);
  1653. bench_case(1, 64, 56, 56, 3, 1);
  1654. bench_case(1, 64, 28, 28, 3, 1);
  1655. bench_case(1, 64, 14, 14, 3, 1);
  1656. bench_case(1, 32, 200, 200, 3, 1);
  1657. bench_case(1, 32, 128, 128, 3, 1);
  1658. bench_case(1, 32, 100, 100, 3, 1);
  1659. bench_case(1, 32, 80, 80, 3, 1);
  1660. bench_case(1, 32, 56, 56, 3, 1);
  1661. bench_case(1, 32, 28, 28, 3, 1);
  1662. bench_case(1, 32, 14, 14, 3, 1);
  1663. std::string algo_name = "S8_CHAN_WISE_STRD1_NCHW44";
  1664. printf("Benchmarker S8_CHAN_WISE_STRD1_NCHW44 algo\n");
  1665. std::vector<DType> data_type = {
  1666. dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f),
  1667. dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f)};
  1668. benchmark_impl(
  1669. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  1670. data_type);
  1671. benchmark_impl(
  1672. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  1673. data_type);
  1674. benchmark_impl(
  1675. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  1676. data_type);
  1677. }
  1678. TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS,
  1679. BENCHMARK_CHANNEL_WISE_INT8_INT8_INT16_STRIDE1) {
  1680. constexpr size_t RUNS = 50;
  1681. param::ConvBias param;
  1682. param.nonlineMode = param::ConvBias::NonlineMode::IDENTITY;
  1683. param.pad_h = 1;
  1684. param.pad_w = 1;
  1685. param.stride_h = 1;
  1686. param.stride_w = 1;
  1687. param.sparse = param::ConvBias::Sparse::GROUP;
  1688. param.format = param::ConvBias::Format::NCHW44;
  1689. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  1690. auto bench_case = [&](size_t N, size_t IC, size_t H, size_t W, size_t FS,
  1691. size_t P) {
  1692. size_t group = IC;
  1693. size_t OC = IC;
  1694. size_t S = 1;
  1695. SmallVector<TensorShape> shapes{
  1696. {N, IC, H, W, 4},
  1697. {group, 1, 1, FS, FS, 4},
  1698. {1, OC, 1, 1, 4},
  1699. {},
  1700. {N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1, 4}};
  1701. TensorShape dst{N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1, 4};
  1702. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  1703. dst.total_nr_elems()) *
  1704. 1e-6;
  1705. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  1706. };
  1707. bench_case(1, 128, 200, 200, 3, 1);
  1708. bench_case(1, 128, 128, 128, 3, 1);
  1709. bench_case(1, 128, 100, 100, 3, 1);
  1710. bench_case(1, 128, 80, 80, 3, 1);
  1711. bench_case(1, 128, 56, 56, 3, 1);
  1712. bench_case(1, 128, 28, 28, 3, 1);
  1713. bench_case(1, 128, 14, 14, 3, 1);
  1714. bench_case(1, 64, 200, 200, 3, 1);
  1715. bench_case(1, 64, 128, 128, 3, 1);
  1716. bench_case(1, 64, 100, 100, 3, 1);
  1717. bench_case(1, 64, 80, 80, 3, 1);
  1718. bench_case(1, 64, 56, 56, 3, 1);
  1719. bench_case(1, 64, 28, 28, 3, 1);
  1720. bench_case(1, 64, 14, 14, 3, 1);
  1721. bench_case(1, 32, 200, 200, 3, 1);
  1722. bench_case(1, 32, 128, 128, 3, 1);
  1723. bench_case(1, 32, 100, 100, 3, 1);
  1724. bench_case(1, 32, 80, 80, 3, 1);
  1725. bench_case(1, 32, 56, 56, 3, 1);
  1726. bench_case(1, 32, 28, 28, 3, 1);
  1727. bench_case(1, 32, 14, 14, 3, 1);
  1728. std::string algo_name = "S8x8x16_CHAN_WISE_STRD1_STRD2_NCHW44";
  1729. printf("Benchmarker S8x8x16_CHAN_WISE_STRD1_STRD2_NCHW44 algo\n");
  1730. std::vector<DType> data_type = {
  1731. dtype::Int8(), dtype::Int8(), dtype::Int16(), dtype::Int16()};
  1732. benchmark_impl(
  1733. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  1734. data_type);
  1735. benchmark_impl(
  1736. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  1737. data_type);
  1738. benchmark_impl(
  1739. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  1740. data_type);
  1741. }
  1742. TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_IM2COL_NCHW44_INT8x8x32_STRIDE1) {
  1743. constexpr size_t RUNS = 50;
  1744. param::ConvBias param;
  1745. param.nonlineMode = param::ConvBias::NonlineMode::IDENTITY;
  1746. param.pad_h = 1;
  1747. param.pad_w = 1;
  1748. param.stride_h = 1;
  1749. param.stride_w = 1;
  1750. param.sparse = param::ConvBias::Sparse::DENSE;
  1751. param.format = param::ConvBias::Format::NCHW44;
  1752. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  1753. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  1754. size_t group = 1) {
  1755. SmallVector<TensorShape> shapes{
  1756. {N, IC, H, W, 4},
  1757. {OC, IC / group, FS, FS, 4, 4},
  1758. {/*1, OC, 1, 1*/},
  1759. {},
  1760. {N, OC, H, W, 4}};
  1761. TensorShape dst{N, OC, H, W, 4};
  1762. float computations = ((4 * IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  1763. dst.total_nr_elems()) *
  1764. 1e-6;
  1765. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  1766. };
  1767. bench_case(1, 32, 32, 300, 300, 3, 1);
  1768. bench_case(1, 32, 32, 400, 400, 3, 1);
  1769. bench_case(1, 32, 32, 100, 100, 3, 1);
  1770. bench_case(1, 32, 32, 80, 80, 3, 1);
  1771. bench_case(1, 32, 64, 200, 200, 3, 1);
  1772. bench_case(1, 32, 64, 128, 128, 3, 1);
  1773. bench_case(1, 32, 64, 100, 100, 3, 1);
  1774. bench_case(1, 32, 64, 80, 80, 3, 1);
  1775. bench_case(1, 32, 128, 200, 200, 3, 1);
  1776. bench_case(1, 32, 128, 128, 128, 3, 1);
  1777. bench_case(1, 32, 128, 100, 100, 3, 1);
  1778. bench_case(1, 32, 128, 80, 80, 3, 1);
  1779. #if 1
  1780. bench_case(1, 64, 32, 7, 7, 3, 1);
  1781. bench_case(1, 64, 64, 7, 7, 3, 1);
  1782. bench_case(1, 64, 128, 7, 7, 3, 1);
  1783. bench_case(1, 64, 256, 7, 7, 3, 1);
  1784. bench_case(1, 64, 512, 7, 7, 3, 1);
  1785. bench_case(1, 64, 1024, 7, 7, 3, 1);
  1786. bench_case(1, 64, 32, 14, 14, 3, 1);
  1787. bench_case(1, 64, 64, 14, 14, 3, 1);
  1788. bench_case(1, 64, 128, 14, 14, 3, 1);
  1789. bench_case(1, 64, 256, 14, 14, 3, 1);
  1790. bench_case(1, 64, 512, 14, 14, 3, 1);
  1791. bench_case(1, 64, 1024, 14, 14, 3, 1);
  1792. bench_case(1, 128, 128, 14, 14, 3, 1);
  1793. bench_case(1, 128, 256, 14, 14, 3, 1);
  1794. bench_case(1, 512, 512, 14, 14, 3, 1);
  1795. bench_case(1, 256, 512, 14, 14, 3, 1);
  1796. bench_case(1, 512, 1024, 14, 14, 3, 1);
  1797. bench_case(1, 1024, 1024, 14, 14, 3, 1);
  1798. #endif
  1799. std::string algo_name = "IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16:96";
  1800. printf("Benchmarker IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16:96 algo\n");
  1801. std::vector<DType> data_type = {
  1802. dtype::QuantizedS8(2.5f),
  1803. dtype::QuantizedS8(2.5f),
  1804. dtype::QuantizedS32(6.25f),
  1805. {}};
  1806. benchmark_impl(
  1807. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  1808. data_type);
  1809. benchmark_impl(
  1810. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  1811. data_type);
  1812. benchmark_impl(
  1813. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  1814. data_type);
  1815. algo_name = "IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16:192";
  1816. printf("Benchmarker IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16:192 "
  1817. "algo\n");
  1818. benchmark_impl(
  1819. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  1820. data_type);
  1821. benchmark_impl(
  1822. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  1823. data_type);
  1824. benchmark_impl(
  1825. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  1826. data_type);
  1827. algo_name = "IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16:384";
  1828. printf("Benchmarker IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16:384 "
  1829. "algo\n");
  1830. benchmark_impl(
  1831. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  1832. data_type);
  1833. benchmark_impl(
  1834. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  1835. data_type);
  1836. benchmark_impl(
  1837. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  1838. data_type);
  1839. }
  1840. #endif
  1841. /*================== BENCHMARK MULTITHREAD CONV1X1 =====================*/
  1842. #if MEGDNN_WITH_BENCHMARK
  1843. namespace {
  1844. std::vector<std::pair<SmallVector<TensorShape>, float>>
  1845. get_conv1x1_multithread_benchmark_args() {
  1846. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  1847. auto bench_case = [&](size_t IC, size_t OC, size_t H, size_t W) {
  1848. SmallVector<TensorShape> shapes{
  1849. {1, IC, H, W}, {OC, IC, 1, 1}, {1, OC, 1, 1}, {}, {1, OC, H, W}};
  1850. TensorShape dst{1, OC, H, W};
  1851. float computations =
  1852. (IC * dst.total_nr_elems() * 2 + dst.total_nr_elems()) * 1e-6;
  1853. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  1854. };
  1855. bench_case(32, 32, 300, 300);
  1856. bench_case(32, 32, 400, 400);
  1857. bench_case(32, 32, 100, 100);
  1858. bench_case(32, 32, 80, 80);
  1859. bench_case(32, 64, 200, 200);
  1860. bench_case(32, 64, 128, 128);
  1861. bench_case(32, 64, 100, 100);
  1862. bench_case(32, 64, 80, 80);
  1863. bench_case(32, 128, 200, 200);
  1864. bench_case(32, 128, 128, 128);
  1865. bench_case(32, 128, 100, 100);
  1866. bench_case(32, 128, 80, 80);
  1867. bench_case(64, 32, 7, 7);
  1868. bench_case(64, 64, 7, 7);
  1869. bench_case(64, 128, 7, 7);
  1870. bench_case(64, 256, 7, 7);
  1871. bench_case(64, 512, 7, 7);
  1872. bench_case(64, 1024, 7, 7);
  1873. bench_case(64, 32, 14, 14);
  1874. bench_case(64, 64, 14, 14);
  1875. bench_case(64, 128, 14, 14);
  1876. bench_case(64, 256, 14, 14);
  1877. bench_case(64, 512, 14, 14);
  1878. bench_case(64, 1024, 14, 14);
  1879. bench_case(128, 128, 14, 14);
  1880. bench_case(128, 256, 14, 14);
  1881. bench_case(512, 512, 14, 14);
  1882. bench_case(256, 512, 14, 14);
  1883. bench_case(512, 1024, 14, 14);
  1884. bench_case(1024, 1024, 14, 14);
  1885. return shapes_and_computation;
  1886. }
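// Shared driver for the CONV1x1 tests: runs the common 1x1 shape list with
// the given algorithm name and src/filter/bias/dst dtypes under the same
// multi-thread vs. single-thread configurations used above.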
  1887. void conv1x1_multithread_benchmark(
  1888. const char* algo_name, DType stype, DType ftype, DType btype, DType dtype) {
  1889. constexpr size_t RUNS = 50;
  1890. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation =
  1891. get_conv1x1_multithread_benchmark_args();
  1892. std::vector<DType> data_type = {stype, ftype, btype, dtype};
  1893. param::ConvBias param;
  1894. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  1895. param.pad_h = 0;
  1896. param.pad_w = 0;
  1897. param.stride_h = 1;
  1898. param.stride_w = 1;
  1899. benchmark_impl(
  1900. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  1901. data_type);
  1902. benchmark_impl(
  1903. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  1904. data_type);
  1905. benchmark_impl(
  1906. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  1907. data_type);
  1908. shapes_and_computation.clear();
  1909. }
  1910. } // namespace
  1911. TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_CONV1X1_S1_FP32) {
  1912. #if MEGDNN_AARCH64
  1913. conv1x1_multithread_benchmark(
  1914. "CONV1x1:AARCH64_F32K8X12X1:8", dtype::Float32(), dtype::Float32(),
  1915. dtype::Float32(), dtype::Float32());
  1916. #else
  1917. conv1x1_multithread_benchmark(
  1918. "CONV1x1:ARMV7_F32:8", dtype::Float32(), dtype::Float32(), dtype::Float32(),
  1919. dtype::Float32());
  1920. #endif
  1921. }
  1922. TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS,
  1923. BENCHMARK_CONVBIAS_CONV1X1_S1_QUANTIZEDASYM) {
  1924. dtype::Quantized8Asymm stype(0.2f, 100);
  1925. dtype::Quantized8Asymm ftype(0.2f, 120);
  1926. dtype::QuantizedS32 btype(0.04f);
  1927. dtype::Quantized8Asymm dtype(1.4f, 110);
  1928. #if MEGDNN_AARCH64
  1929. #if MGB_ENABLE_DOT
  1930. conv1x1_multithread_benchmark(
  1931. "CONV1x1:AARCH64_QUINT8_K8X8X4_DOTPROD:8", stype, ftype, btype, dtype);
  1932. #else
  1933. conv1x1_multithread_benchmark(
  1934. "CONV1x1:AARCH64_QUINT8_K8X8X8:8", stype, ftype, btype, dtype);
  1935. #endif
  1936. #else
  1937. conv1x1_multithread_benchmark(
  1938. "CONV1x1:ARMV7_QUINT8_K4X8X8:8", stype, ftype, btype, dtype);
  1939. #endif
  1940. }
  1941. #endif
  1942. // vim: syntax=cpp.doxygen