You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

conv_bias.cpp 107 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659
  1. /**
  2. * \file dnn/test/arm_common/conv_bias.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
  10. * implied.
  11. */
  12. #include "megdnn/dtype.h"
  13. #include "test/arm_common/fixture.h"
  14. #include "megdnn/opr_param_defs.h"
  15. #include "megdnn/oprs.h"
  16. #include "src/fallback/conv_bias/common.h"
  17. #include "test/common/benchmarker.h"
  18. #include "test/common/checker.h"
  19. #include "test/common/conv_bias.h"
  20. #include "test/common/rng.h"
  21. #include "test/common/task_record_check.h"
  22. #include "test/common/tensor.h"
  23. #include "test/common/workspace_wrapper.h"
  24. using namespace megdnn;
  25. using namespace test;
  26. using namespace conv_bias;
//! TODO this algo currently does not support multithread
  28. TEST_F(ARM_COMMON, CONVBIAS_INT8_INT8_INT16_STRIDE2F2) {
  29. checker_conv_bias_int8x8x16(
  30. get_conv_bias_args({2}, 2, true, true, true), handle(), "I8816STRD2F2");
  31. }
  32. TEST_F(ARM_COMMON, CONV_BIAS_MATMUL) {
  33. using namespace conv_bias;
  34. std::vector<TestArg> args = get_quantized_args();
  35. Checker<ConvBiasForward> checker(handle());
  36. checker.set_before_exec_callback(
  37. conv_bias::ConvBiasAlgoChecker<ConvBias>("S8MATMUL"));
  38. #if MEGDNN_ARMV7
  39. checker.set_epsilon(1);
  40. #endif
  41. UniformIntRNG rng{-50, 50};
  42. for (auto&& arg : args) {
  43. if (arg.bias.ndim == 4 && arg.bias[2] != 1 && arg.bias[3] != 1)
  44. continue;
  45. checker.set_dtype(0, dtype::QuantizedS8(0.41113496f))
  46. .set_dtype(1, dtype::QuantizedS8(0.01887994f))
  47. .set_dtype(2, dtype::QuantizedS32(0.41113496f * 0.01887994f))
  48. .set_dtype(4, dtype::QuantizedS8(0.49550694f))
  49. .set_rng(0, &rng)
  50. .set_rng(1, &rng)
  51. .set_rng(2, &rng)
  52. .set_param(arg.param)
  53. .execs({arg.src, arg.filter, arg.bias, {}, {}});
  54. }
  55. }
  56. TEST_F(ARM_COMMON, CONV_BIAS_RECORD) {
  57. using namespace conv_bias;
  58. std::vector<TestArg> args = get_quantized_args();
  59. TaskRecordChecker<ConvBiasForward> checker(0);
  60. #if MEGDNN_ARMV7
  61. checker.set_epsilon(1);
  62. #endif
  63. UniformIntRNG rng{-50, 50};
  64. for (auto&& arg : args) {
  65. if (arg.bias.ndim == 4 && arg.bias[2] != 1 && arg.bias[3] != 1)
  66. continue;
  67. checker.set_dtype(0, dtype::QuantizedS8(0.41113496f))
  68. .set_dtype(1, dtype::QuantizedS8(0.01887994f))
  69. .set_dtype(2, dtype::QuantizedS32(0.41113496f * 0.01887994f))
  70. .set_dtype(4, dtype::QuantizedS8(0.49550694f))
  71. .set_rng(0, &rng)
  72. .set_rng(1, &rng)
  73. .set_rng(2, &rng)
  74. .set_param(arg.param)
  75. .execs({arg.src, arg.filter, arg.bias, {}, {}});
  76. }
  77. }
  78. TEST_F(ARM_COMMON, CONV_BIAS_WINOGRAD_F63_4) {
  79. using namespace conv_bias;
  80. std::vector<TestArg> args = get_winograd_mk_packed_args();
  81. Checker<ConvBiasForward> checker(handle());
  82. check_winograd("4:6:16", checker, args, param::MatrixMul::Format::MK4);
  83. }
  84. TEST_F(ARM_COMMON, CONV_BIAS_WINOGRAD_F63_4_WEIGHT_PREPROCESS) {
  85. using namespace conv_bias;
  86. std::vector<TestArg> args = get_winograd_mk_packed_args();
  87. Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
  88. handle());
  89. check_winograd("4:6:16", checker, args, param::MatrixMul::Format::MK4);
  90. }
//! Shared body for the CONV_BIAS_MATMUL_QU8_* tests below: runs the
//! "QU8MATMUL" conv-bias algorithm over every quantized test case generated
//! for nonlinearity mode MODE, using asymmetric uint8 input/output dtypes,
//! and skipping 4-d bias layouts that are not 1x1 per-channel broadcasts.
//! (Comments cannot live inside the macro because of line continuations.)
#define CONV_BIAS_MATMUL_QU8_MODE(MODE)                                               \
    using namespace conv_bias;                                                        \
    std::vector<TestArg> args = get_quantized_args_with_nlmode(MODE);                 \
    Checker<ConvBiasForward> checker(handle());                                       \
    checker.set_before_exec_callback(                                                 \
            conv_bias::ConvBiasAlgoChecker<ConvBias>("QU8MATMUL"));                   \
    UniformIntRNG rng{0, 127};                                                        \
    for (auto&& arg : args) {                                                         \
        if (arg.bias.ndim == 4 && arg.bias[2] != 1 && arg.bias[3] != 1)               \
            continue;                                                                 \
        checker.set_dtype(0, dtype::Quantized8Asymm(2.5f, static_cast<uint8_t>(127))) \
                .set_dtype(1, dtype::Quantized8Asymm(2.7f, static_cast<uint8_t>(126))) \
                .set_dtype(2, dtype::QuantizedS32(6.75f))                             \
                .set_dtype(                                                           \
                        4, dtype::Quantized8Asymm(60.25f, static_cast<uint8_t>(125))) \
                .set_rng(0, &rng)                                                     \
                .set_rng(1, &rng)                                                     \
                .set_rng(2, &rng)                                                     \
                .set_param(arg.param)                                                 \
                .execs({arg.src, arg.filter, arg.bias, {}, {}});                      \
    }
//! Expands a bare NonlineMode enumerator name to its qualified value.
#define MODE_STR(mode) param::ConvBias::NonlineMode::mode
//! Instantiates one ARM_COMMON test per nonlinearity mode.
#define CB_TEST(MODE)                                 \
    TEST_F(ARM_COMMON, CONV_BIAS_MATMUL_QU8_##MODE) { \
        CONV_BIAS_MATMUL_QU8_MODE(MODE_STR(MODE));    \
    }
CB_TEST(IDENTITY);
CB_TEST(RELU);
CB_TEST(H_SWISH);
#undef MODE_STR
#undef CB_TEST
#undef CONV_BIAS_MATMUL_QU8_MODE
  123. #if MEGDNN_WITH_BENCHMARK
  124. static void benchmark_convbias(
  125. Handle* handle, std::string int_name, std::string float_name,
  126. bool is_fp32 = false, bool is_8x8x16 = false) {
  127. constexpr size_t RUNS = 30;
  128. Benchmarker<ConvBias> benchmarker_int(handle);
  129. benchmarker_int.set_times(RUNS)
  130. .set_dtype(0, dtype::QuantizedS8(2.5))
  131. .set_dtype(1, dtype::QuantizedS8(2.5))
  132. .set_dtype(2, dtype::QuantizedS32(6.25))
  133. .set_dtype(4, dtype::QuantizedS8(60.25))
  134. .set_display(false);
  135. benchmarker_int.set_before_exec_callback(
  136. conv_bias::ConvBiasAlgoChecker<ConvBias>(int_name.c_str()));
  137. Benchmarker<ConvBias> benchmarker_float(handle);
  138. benchmarker_float.set_display(false).set_times(RUNS);
  139. benchmarker_float.set_before_exec_callback(
  140. conv_bias::ConvBiasAlgoChecker<ConvBias>(float_name.c_str()));
  141. Benchmarker<ConvBias> benchmarker_nchw44(handle);
  142. if (is_fp32) {
  143. benchmarker_nchw44.set_times(RUNS)
  144. .set_dtype(0, dtype::Float32())
  145. .set_dtype(1, dtype::Float32())
  146. .set_dtype(2, dtype::Float32())
  147. .set_dtype(4, dtype::Float32())
  148. .set_display(false);
  149. } else if (is_8x8x16) {
  150. benchmarker_nchw44.set_times(RUNS)
  151. .set_dtype(0, dtype::Int8())
  152. .set_dtype(1, dtype::Int8())
  153. .set_dtype(2, dtype::Int16())
  154. .set_dtype(4, dtype::Int16())
  155. .set_display(false);
  156. benchmarker_int.set_times(RUNS)
  157. .set_dtype(0, dtype::Int8())
  158. .set_dtype(1, dtype::Int8())
  159. .set_dtype(2, dtype::Int16())
  160. .set_dtype(4, dtype::Int16())
  161. .set_display(false);
  162. } else {
  163. benchmarker_nchw44.set_times(RUNS)
  164. .set_dtype(0, dtype::QuantizedS8(2.5))
  165. .set_dtype(1, dtype::QuantizedS8(2.5))
  166. .set_dtype(2, dtype::QuantizedS32(6.25))
  167. .set_dtype(4, dtype::QuantizedS8(60.25))
  168. .set_display(false);
  169. }
  170. auto nchw44_algo_regx = ".*(DIRECT|NCHW_NCHW44).*";
  171. #if MGB_ENBALE_DOT
  172. if (!is_fp32) {
  173. nchw44_algo_regx = ".*DOT.*";
  174. }
  175. #endif
  176. benchmarker_nchw44.set_before_exec_callback(
  177. conv_bias::ConvBiasAlgoChecker<ConvBias>(nchw44_algo_regx));
  178. auto run = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  179. size_t stride, bool input_nchw = false) {
  180. param::ConvBias param;
  181. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  182. if (is_8x8x16) {
  183. param.nonlineMode = param::ConvBias::NonlineMode::IDENTITY;
  184. }
  185. param.stride_h = stride;
  186. param.stride_w = stride;
  187. param.pad_h = FS / 2;
  188. param.pad_w = FS / 2;
  189. auto OH = (H + 2 * param.pad_h - FS) / static_cast<size_t>(param.stride_h) + 1;
  190. auto OW = (W + 2 * param.pad_w - FS) / static_cast<size_t>(param.stride_w) + 1;
  191. TensorShape src({N, IC, H, W}), filter({OC, IC, FS, FS}), bias({1, OC, 1, 1}),
  192. dst({N, OC, OH, OW});
  193. if (is_8x8x16) {
  194. bias = {};
  195. }
  196. param.format = param::ConvBias::Format::NCHW;
  197. auto int_used =
  198. benchmarker_int.set_param(param).exec({src, filter, bias, {}, dst}) /
  199. RUNS;
  200. auto float_used =
  201. benchmarker_float.set_param(param).exec({src, filter, bias, {}, dst}) /
  202. RUNS;
  203. param.format = param::ConvBias::Format::NCHW44;
  204. src = {N, IC / 4, H, W, 4};
  205. filter = {OC / 4, IC / 4, FS, FS, 4, 4};
  206. if (input_nchw) {
  207. src = {N, IC, H, W};
  208. filter = {OC / 4, FS, FS, IC, 4};
  209. }
  210. bias = {1, OC / 4, 1, 1, 4};
  211. if (is_8x8x16) {
  212. bias = {};
  213. }
  214. dst = {N, OC / 4, OH, OW, 4};
  215. auto int_nchw44_used =
  216. benchmarker_nchw44.set_param(param).exec({src, filter, bias, {}, dst}) /
  217. RUNS;
  218. float computations = IC * (FS * FS) * dst.total_nr_elems() * 2 * 1e-6;
  219. printf("run: %s %s %s->%s \n", src.to_string().c_str(),
  220. filter.to_string().c_str(), bias.to_string().c_str(),
  221. dst.to_string().c_str());
  222. printf("float: %f ms %f Gflops, ", float_used, computations / float_used);
  223. printf("int_nchw: %f ms %f Gflops, ", int_used, computations / int_used);
  224. auto speed_up = int_used / int_nchw44_used;
  225. if (is_fp32) {
  226. speed_up = float_used / int_nchw44_used;
  227. printf("fp32_nchw44: %f ms %f Gflops %f speedup, ", int_nchw44_used,
  228. computations / int_nchw44_used, speed_up);
  229. } else {
  230. printf("int_nchw44: %f ms %f Gflops %f speedup, ", int_nchw44_used,
  231. computations / int_nchw44_used, speed_up);
  232. }
  233. printf("\n");
  234. };
  235. if (is_fp32) {
  236. run(1, 1, 4, 112, 112, 2, 2, true);
  237. run(1, 3, 24, 224, 224, 3, 2, true);
  238. run(1, 3, 32, 224, 224, 3, 2, true);
  239. run(1, 3, 64, 224, 224, 7, 2, true);
  240. run(1, 1, 4, 112, 112, 2, 1, true);
  241. run(1, 3, 32, 224, 224, 3, 1, true);
  242. run(1, 3, 64, 224, 224, 3, 1, true);
  243. run(1, 3, 64, 224, 224, 7, 1, true);
  244. run(1, 64, 128, 56, 56, 3, 2, false);
  245. run(1, 128, 256, 28, 28, 3, 2, false);
  246. run(1, 256, 512, 14, 14, 3, 2, false);
  247. run(1, 128, 128, 28, 28, 3, 1, false);
  248. run(1, 256, 256, 14, 14, 3, 1, false);
  249. run(1, 512, 512, 7, 7, 3, 1, false);
  250. } else {
  251. run(1, 1, 4, 112, 112, 2, 2, true);
  252. run(1, 3, 8, 224, 224, 3, 2, true);
  253. run(1, 3, 32, 224, 224, 3, 2, true);
  254. run(1, 3, 32, 224, 224, 5, 2, true);
  255. run(1, 3, 64, 224, 224, 7, 2, true);
  256. run(1, 1, 4, 112, 112, 2, 1, true);
  257. run(1, 3, 32, 224, 224, 3, 1, true);
  258. run(1, 3, 32, 224, 224, 5, 1, true);
  259. run(1, 3, 64, 224, 224, 7, 1, true);
  260. run(1, 64, 128, 56, 56, 3, 2, false);
  261. run(1, 128, 256, 28, 28, 3, 2, false);
  262. run(1, 256, 512, 14, 14, 3, 2, false);
  263. run(1, 128, 128, 28, 28, 3, 1, false);
  264. run(1, 256, 256, 14, 14, 3, 1, false);
  265. run(1, 512, 512, 7, 7, 3, 1, false);
  266. for (size_t stride : {1}) {
  267. printf("stride %zu\n", stride);
  268. for (size_t filter_size : {2, 3, 5, 7}) {
  269. for (size_t img_size : {32}) {
  270. for (size_t channel : {8, 16, 32, 64, 128, 256}) {
  271. run(1, channel, channel, img_size, img_size, filter_size,
  272. stride, false);
  273. }
  274. }
  275. }
  276. }
  277. }
  278. }
TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_NCHW44) {
    // Single-thread NCHW vs NCHW44 benchmarks using per-arch im2col matmul
    // baselines: fp32 flow, quantized int8 flow and int8x8x16 flow.
#if MEGDNN_AARCH64
    benchmark_convbias(
            handle(), "IM2COLMATMUL:AARCH64_INT8X8X32_K4X4X16:384",
            "IM2COLMATMUL:AARCH64_F32K8X12X1:192", true);
    benchmark_convbias(
            handle(), "IM2COLMATMUL:AARCH64_INT8X8X32_K4X4X16:384",
            "IM2COLMATMUL:AARCH64_F32K8X12X1:192", false);
    benchmark_convbias(
            handle(), "IM2COLMATMUL:AARCH64_INT8X8X16_K4X4X16:192",
            "IM2COLMATMUL:AARCH64_F32K8X12X1:192", false, true);
#else
    benchmark_convbias(
            handle(), "IM2COLMATMUL:ARMV7_INT8X8X32_K4X8X8:384",
            "IM2COLMATMUL:ARMV7_F32:192", true);
    benchmark_convbias(
            handle(), "IM2COLMATMUL:ARMV7_INT8X8X32_K4X8X8:384",
            "IM2COLMATMUL:ARMV7_F32:192", false);
    benchmark_convbias(
            handle(), "IM2COLMATMUL:ARMV7_INT8X8X16_K4X8X8:384",
            "IM2COLMATMUL:ARMV7_F32:192", false, true);
#endif
}
  302. TEST_F(ARM_COMMON_MULTI_THREADS, BENCHMARK_CONVBIAS_NCHW44) {
  303. #if MEGDNN_AARCH64
  304. benchmark_convbias(
  305. handle(), "IM2COLMATMUL:AARCH64_INT8X8X32_K4X4X16:384",
  306. "IM2COLMATMUL:AARCH64_F32K8X12X1:192", true);
  307. benchmark_convbias(
  308. handle(), "IM2COLMATMUL:AARCH64_INT8X8X32_K4X4X16:384",
  309. "IM2COLMATMUL:AARCH64_F32K8X12X1:192", false);
  310. #else
  311. benchmark_convbias(
  312. handle(), "IM2COLMATMUL:ARMV7_INT8X8X32_K4X8X8:384",
  313. "IM2COLMATMUL:ARMV7_F32:192", true);
  314. benchmark_convbias(
  315. handle(), "IM2COLMATMUL:AARCH64_INT8X8X32_K4X4X16:384",
  316. "IM2COLMATMUL:ARMV7_F32:192", false);
  317. #endif
  318. }
  319. #endif
  320. TEST_F(ARM_COMMON, CONV_BIAS_MATMUL_QS8) {
  321. using namespace conv_bias;
  322. std::vector<TestArg> args = get_quantized_args();
  323. Checker<ConvBiasForward> checker(handle());
  324. checker.set_before_exec_callback(
  325. conv_bias::ConvBiasAlgoChecker<ConvBias>("S8MATMUL"));
  326. #if MEGDNN_ARMV7
  327. checker.set_epsilon(1);
  328. #endif
  329. UniformIntRNG rng{0, 255};
  330. for (auto&& arg : args) {
  331. if (arg.bias.ndim == 4 && arg.bias[2] != 1 && arg.bias[3] != 1)
  332. continue;
  333. checker.set_dtype(0, dtype::QuantizedS8(2.5f))
  334. .set_dtype(1, dtype::QuantizedS8(2.7f))
  335. .set_dtype(2, dtype::QuantizedS32(6.75f))
  336. .set_dtype(4, dtype::QuantizedS8(60.25f))
  337. .set_rng(0, &rng)
  338. .set_rng(1, &rng)
  339. .set_rng(2, &rng)
  340. .set_param(arg.param)
  341. .execs({arg.src, arg.filter, arg.bias, {}, {}});
  342. }
  343. }
  344. #if MEGDNN_ARMV7
  345. TEST_F(ARM_COMMON, CONV_BIAS_RESCALE_OP) {
  346. using namespace conv_bias;
  347. Checker<ConvBias> checker(handle());
  348. checker.set_before_exec_callback(
  349. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("S8MATMUL"));
  350. checker.set_epsilon(1).set_max_avg_error(1e-2).set_max_avg_biased_error(1e-3);
  351. UniformIntRNG rng{-128, 127};
  352. checker.set_dtype(0, dtype::QuantizedS8(0.41113496f))
  353. .set_dtype(1, dtype::QuantizedS8(0.01887994f))
  354. .set_dtype(2, dtype::QuantizedS32(0.41113496f * 0.01887994f))
  355. .set_dtype(4, dtype::QuantizedS8(0.49550694f))
  356. .set_rng(0, &rng)
  357. .set_rng(1, &rng)
  358. .set_rng(2, &rng);
  359. param::ConvBias param;
  360. param.stride_h = 1;
  361. param.stride_w = 1;
  362. param.pad_h = 0;
  363. param.pad_w = 0;
  364. param.nonlineMode = NonlineMode::IDENTITY;
  365. //! Unary op
  366. checker.set_param(param).exec(
  367. {TensorShape{2, 1, 128, 128},
  368. TensorShape{16, 1, 2, 2},
  369. TensorShape{},
  370. TensorShape{},
  371. {}});
  372. //! Binary op
  373. checker.set_param(param).exec(
  374. {TensorShape{2, 1, 128, 128},
  375. TensorShape{16, 1, 2, 2},
  376. TensorShape{1, 16, 1, 1},
  377. TensorShape{},
  378. {}});
  379. }
  380. #endif
  381. #if MEGDNN_WITH_BENCHMARK
/**
 * \brief Benchmark algo_name against im2col_name on the Winograd benchmark
 * shape set and print per-shape timings, GFlops and the im2col speedup.
 *
 * \param algo_name   algorithm regex for the baseline run
 * \param im2col_name algorithm regex for the im2col run
 * \param handle      computing handle the benchmarkers run on
 * \param kernel      square filter size of the generated shapes
 * \param pack_size   channel pack size forwarded to the shape generator
 */
void benchmark_im2col(
        const char* algo_name, const char* im2col_name, Handle* handle, size_t kernel,
        size_t pack_size = 1) {
    auto&& args = get_winograd_benchmark_args(kernel, pack_size);
    using namespace conv_bias;
    constexpr size_t RUN = 10;
    Benchmarker<ConvBias> benchmark(handle);
    benchmark.set_display(false);
    benchmark.set_times(RUN);
    Benchmarker<ConvBias> benchmark_im2col(handle);
    benchmark_im2col.set_display(false);
    benchmark_im2col.set_times(RUN);
    for (auto&& arg : args) {
        // Deduce the output layout so the flop count can be derived.
        TensorLayout dst_layout;
        auto opr = handle->create_operator<ConvBias>();
        opr->param() = arg.param;
        opr->deduce_layout(
                {arg.src, dtype::Float32()}, {arg.filter, dtype::Float32()},
                {arg.bias, dtype::Float32()}, {}, dst_layout);
        //! dst.nr_elems * IC * FH * FW * 2
        float computations = dst_layout.total_nr_elems() * arg.filter[1] *
                             arg.filter[2] * arg.filter[3] * 2.0 /
                             (1024 * 1024 * 1024) * 1e3;
        benchmark.set_param(arg.param);
        auto used = algo_benchmark<ConvBias>(
                            benchmark, {arg.src, arg.filter, {}, {}, {}}, algo_name) /
                    RUN;
        benchmark_im2col.set_param(arg.param);
        auto used_im2col = algo_benchmark<ConvBias>(
                                   benchmark_im2col, {arg.src, arg.filter, {}, {}, {}},
                                   im2col_name) /
                           RUN;
        printf("%s %s: normal: %f ms %f Gflops im2col: %f ms %f GFlops "
               "speedup: "
               "%f\n",
               arg.src.to_string().c_str(), arg.filter.to_string().c_str(), used,
               computations / used, used_im2col, computations / used_im2col,
               used / used_im2col);
    }
}
  422. void benchmark_im2col_single_algo(
  423. const char* im2col_name, Handle* handle, size_t kernel, size_t pack_size = 1) {
  424. std::vector<conv_bias::TestArg> args;
  425. auto pack = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, size_t p) {
  426. if (ic % pack_size != 0 || oc % pack_size != 0)
  427. return;
  428. if (w + 2 * p < kernel || h + 2 * p < kernel)
  429. return;
  430. param::ConvBias param;
  431. param.stride_h = 1;
  432. param.stride_w = 1;
  433. param.pad_h = p;
  434. param.pad_w = p;
  435. args.push_back(conv_bias::TestArg{
  436. param,
  437. TensorShape{1, ic, h, w},
  438. TensorShape{oc, ic, kernel, kernel},
  439. {1, oc, 1, 1}});
  440. };
  441. pack(1, 64, 100, 100, kernel, 1);
  442. pack(8, 64, 100, 100, kernel, 1);
  443. pack(16, 64, 100, 100, kernel, 1);
  444. pack(32, 64, 100, 100, kernel, 1);
  445. pack(64, 64, 100, 100, kernel, 1);
  446. pack(128, 64, 100, 100, kernel, 1);
  447. pack(256, 64, 100, 100, kernel, 1);
  448. pack(512, 64, 100, 100, kernel, 1);
  449. pack(1024, 64, 100, 100, kernel, 1);
  450. pack(1, 64, 10, 10, kernel, 1);
  451. pack(8, 64, 10, 10, kernel, 1);
  452. pack(16, 64, 10, 10, kernel, 1);
  453. pack(32, 64, 10, 10, kernel, 1);
  454. pack(64, 64, 10, 10, kernel, 1);
  455. pack(128, 64, 10, 10, kernel, 1);
  456. pack(256, 64, 10, 10, kernel, 1);
  457. pack(512, 64, 10, 10, kernel, 1);
  458. pack(1024, 64, 10, 10, kernel, 1);
  459. pack(1, 16, 10, 10, kernel, 1);
  460. pack(8, 16, 10, 10, kernel, 1);
  461. pack(16, 16, 10, 10, kernel, 1);
  462. pack(32, 16, 10, 10, kernel, 1);
  463. pack(64, 16, 10, 10, kernel, 1);
  464. pack(128, 16, 10, 10, kernel, 1);
  465. pack(256, 16, 10, 10, kernel, 1);
  466. pack(512, 16, 10, 10, kernel, 1);
  467. pack(1024, 16, 10, 10, kernel, 1);
  468. using namespace conv_bias;
  469. constexpr size_t RUN = 20;
  470. Benchmarker<ConvBias> benchmark_im2col(handle);
  471. benchmark_im2col.set_display(false);
  472. benchmark_im2col.set_times(RUN);
  473. for (auto&& arg : args) {
  474. TensorLayout dst_layout;
  475. auto opr = handle->create_operator<ConvBias>();
  476. opr->param() = arg.param;
  477. opr->deduce_layout(
  478. {arg.src, dtype::Float32()}, {arg.filter, dtype::Float32()},
  479. {arg.bias, dtype::Float32()}, {}, dst_layout);
  480. //! dst.nr_elems * IC * FH * FW * 2
  481. float computations = dst_layout.total_nr_elems() * arg.filter[1] *
  482. arg.filter[2] * arg.filter[3] * 2.0 /
  483. (1024 * 1024 * 1024) * 1e3;
  484. benchmark_im2col.set_param(arg.param);
  485. auto used_im2col = algo_benchmark<ConvBias>(
  486. benchmark_im2col, {arg.src, arg.filter, {}, {}, {}},
  487. im2col_name) /
  488. RUN;
  489. printf("%s %s: im2col: %f ms %f GFlops \n", arg.src.to_string().c_str(),
  490. arg.filter.to_string().c_str(), used_im2col, computations / used_im2col);
  491. }
  492. }
/**
 * \brief Benchmark an int8x8x16 NCHW44 algorithm against the int8x8x32
 * "S8_NCHW44_DIRECT" algorithm on the same shapes and print the speedup.
 *
 * \param im2col_name algorithm regex for the int8x8x16 run
 * \param handle      computing handle the benchmarkers run on
 * \param kernel      square filter size used for every shape
 * \param stride      convolution stride, must be 1 or 2
 * \param pack_size   channel divisibility requirement; shapes that do not
 *                    satisfy it are skipped
 */
void benchmark_nchw44_8x8x16_vs_8x8x32(
        const char* im2col_name, Handle* handle, size_t kernel, size_t stride,
        size_t pack_size = 1) {
    megdnn_assert(stride == 1 || stride == 2, "only support stride 1 or 2");
    std::vector<conv_bias::TestArg> args;
    // Queue one dense NCHW44 case; shapes incompatible with the pack size
    // or filter size are silently dropped.
    auto pack = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, size_t p) {
        if (ic % pack_size != 0 || oc % pack_size != 0)
            return;
        if (w + 2 * p < kernel || h + 2 * p < kernel)
            return;
        param::ConvBias param;
        param.format = param::ConvBias::Format::NCHW44;
        param.stride_h = stride;
        param.stride_w = stride;
        param.pad_h = p;
        param.pad_w = p;
        param.sparse = param::ConvBias::Sparse::DENSE;
        args.push_back(conv_bias::TestArg{
                param,
                TensorShape{1, ic / 4, h, w, 4},
                TensorShape{oc / 4, ic / 4, kernel, kernel, 4, 4},
                {1, oc / 4, 1, 1, 4}});
    };
    // Shape sweep: small 56x56 maps, 100x100, 200x200 and 10x10 maps with
    // varying OC/IC and padding 0 or 1.
    pack(1, 64, 56, 56, kernel, 0);
    pack(8, 64, 56, 56, kernel, 0);
    pack(16, 64, 56, 56, kernel, 1);
    pack(32, 64, 56, 56, kernel, 1);
    pack(1, 64, 100, 100, kernel, 1);
    pack(8, 64, 100, 100, kernel, 1);
    pack(1, 64, 100, 100, kernel, 0);
    pack(8, 64, 100, 100, kernel, 0);
    pack(16, 64, 100, 100, kernel, 1);
    pack(32, 64, 100, 100, kernel, 1);
    pack(64, 64, 100, 100, kernel, 1);
    pack(128, 64, 100, 100, kernel, 1);
    pack(256, 64, 100, 100, kernel, 1);
    pack(512, 64, 100, 100, kernel, 1);
    pack(1024, 64, 100, 100, kernel, 1);
    pack(1, 32, 200, 200, kernel, 1);
    pack(8, 64, 200, 200, kernel, 1);
    pack(1, 32, 200, 200, kernel, 0);
    pack(8, 64, 200, 200, kernel, 0);
    pack(16, 96, 200, 200, kernel, 1);
    pack(32, 32, 200, 200, kernel, 1);
    pack(64, 64, 200, 200, kernel, 1);
    pack(128, 96, 200, 200, kernel, 1);
    pack(1, 64, 10, 10, kernel, 1);
    pack(8, 64, 10, 10, kernel, 1);
    pack(16, 64, 10, 10, kernel, 1);
    pack(32, 64, 10, 10, kernel, 1);
    pack(64, 64, 10, 10, kernel, 1);
    pack(128, 64, 10, 10, kernel, 1);
    pack(256, 64, 10, 10, kernel, 1);
    pack(512, 64, 10, 10, kernel, 1);
    pack(1024, 64, 10, 10, kernel, 1);
    using namespace conv_bias;
    constexpr size_t RUN = 20;
    Benchmarker<ConvBias> benchmark_im2col(handle);
    benchmark_im2col.set_display(false);
    benchmark_im2col.set_times(RUN);
    Benchmarker<ConvBias> benchmark_8832(handle);
    benchmark_8832.set_display(false);
    benchmark_8832.set_times(RUN);
    for (auto&& arg : args) {
        // Deduce the output layout so the flop count can be derived.
        TensorLayout dst_layout;
        auto opr = handle->create_operator<ConvBias>();
        opr->param() = arg.param;
        opr->deduce_layout(
                {arg.src, dtype::Float32()}, {arg.filter, dtype::Float32()},
                {arg.bias, dtype::Float32()}, {}, dst_layout);
        //! dst.nr_elems * IC * FH * FW * 2
        float computations = dst_layout.total_nr_elems() * arg.filter[1] *
                             arg.filter[2] * arg.filter[3] * 2.0 * 4 /
                             (1024 * 1024 * 1024) * 1e3;
        // int8x8x16 run through the provided algorithm.
        benchmark_im2col.set_param(arg.param);
        benchmark_im2col.set_dtype(0, dtype::Int8());
        benchmark_im2col.set_dtype(1, dtype::Int8());
        benchmark_im2col.set_dtype(2, dtype::Int16());
        benchmark_im2col.set_dtype(4, dtype::Int16());
        auto used_8816 = algo_benchmark<ConvBias>(
                                 benchmark_im2col, {arg.src, arg.filter, {}, {}, {}},
                                 im2col_name) /
                         RUN;
        // int8x8x32 baseline through the direct NCHW44 algorithm.
        benchmark_8832.set_param(arg.param);
        benchmark_8832.set_dtype(0, dtype::QuantizedS8(2.5));
        benchmark_8832.set_dtype(1, dtype::QuantizedS8(2.5));
        benchmark_8832.set_dtype(2, dtype::QuantizedS32(6.25));
        benchmark_8832.set_dtype(4, {});
        auto used_8832 = algo_benchmark<ConvBias>(
                                 benchmark_8832, {arg.src, arg.filter, {}, {}, {}},
                                 "S8_NCHW44_DIRECT") /
                         RUN;
        printf("%s %s: 8816: %f ms %f GFlops ", arg.src.to_string().c_str(),
               arg.filter.to_string().c_str(), used_8816, computations / used_8816);
        printf("%s %s: 8832: %f ms %f GFlops ", arg.src.to_string().c_str(),
               arg.filter.to_string().c_str(), used_8832, computations / used_8832);
        printf("speedup %f \n", used_8832 / used_8816);
    }
}
//! Benchmark a conv_bias algorithm on plain NCHW layout against an im2col
//! algorithm on the equivalent NCHW44 layout and print per-case timing,
//! GFlops and speedup.
//! \param algo_name   algorithm run on the original NCHW args.
//! \param im2col_name algorithm run after repacking the args into NCHW44.
//! \param handle      backend handle the benchmarks execute on.
//! \param kernel      square filter size used to build the benchmark args.
//! \param src_type    dtype installed for src and filter (slots 0 and 1).
//! \param dst_type    dtype installed for bias and dst (slots 2 and 4).
void BENCHMARK_IM2COL_NCHW44_VS_NCHW(
        const char* algo_name, const char* im2col_name, Handle* handle, size_t kernel,
        DType src_type, DType dst_type) {
    //! pack_size 4 so every channel count is divisible by 4 for the repack.
    auto&& args = get_winograd_benchmark_args(kernel, 4);
    using namespace conv_bias;
    constexpr size_t RUN = 10;
    //! benchmark: runs algo_name on the original NCHW tensors.
    Benchmarker<ConvBias> benchmark(handle);
    benchmark.set_display(false);
    benchmark.set_times(RUN);
    benchmark.set_dtype(0, src_type);
    benchmark.set_dtype(1, src_type);
    benchmark.set_dtype(2, dst_type);
    benchmark.set_dtype(4, dst_type);
    //! benchmark_im2col: runs im2col_name on the repacked NCHW44 tensors.
    Benchmarker<ConvBias> benchmark_im2col(handle);
    benchmark_im2col.set_display(false);
    benchmark_im2col.set_times(RUN);
    benchmark_im2col.set_dtype(0, src_type);
    benchmark_im2col.set_dtype(1, src_type);
    benchmark_im2col.set_dtype(2, dst_type);
    benchmark_im2col.set_dtype(4, dst_type);
    for (auto&& arg : args) {
        TensorLayout dst_layout;
        auto opr = handle->create_operator<ConvBias>();
        opr->param() = arg.param;
        opr->deduce_layout(
                {arg.src, dtype::Float32()}, {arg.filter, dtype::Float32()},
                {arg.bias, dtype::Float32()}, {}, dst_layout);
        //! dst.nr_elems * IC * FH * FW * 2
        float computations = dst_layout.total_nr_elems() * arg.filter[1] *
                             arg.filter[2] * arg.filter[3] * 2.0 /
                             (1024 * 1024 * 1024) * 1e3;
        std::vector<conv_bias::TestArg> nchw44param;
        benchmark.set_param(arg.param);
        auto used = algo_benchmark<ConvBias>(
                            benchmark, {arg.src, arg.filter, {}, {}, {}}, algo_name) /
                    RUN;
        //! repack the same logical case into NCHW44: src {N,C/4,H,W,4},
        //! filter {OC/4,IC/4,FH,FW,4,4}; bias dropped, nonline forced to
        //! IDENTITY so both runs do the same amount of work.
        arg.param.nonlineMode = param::ConvBias::NonlineMode::IDENTITY;
        arg.param.format = param::ConvBias::Format::NCHW44;
        benchmark_im2col.set_param(arg.param);
        nchw44param.push_back(conv_bias::TestArg{
                arg.param,
                TensorShape{
                        arg.src.shape[0], arg.src.shape[1] / 4, arg.src[2],
                        arg.src.shape[3], 4},
                TensorShape{
                        arg.filter.shape[0] / 4, arg.filter.shape[1] / 4, kernel,
                        kernel, 4, 4},
                TensorShape{}});
        auto used_im2col =
                algo_benchmark<ConvBias>(
                        benchmark_im2col,
                        {nchw44param[0].src, nchw44param[0].filter, {}, {}, {}},
                        im2col_name) /
                RUN;
        printf("nchw44 shape src %s filter %s\n",
               nchw44param[0].src.to_string().c_str(),
               nchw44param[0].filter.to_string().c_str());
        printf("%s %s: normal: %f ms %f Gflops im2col: %f ms %f GFlops "
               "speedup: "
               "%f\n",
               arg.src.to_string().c_str(), arg.filter.to_string().c_str(), used,
               computations / used, used_im2col, computations / used_im2col,
               used / used_im2col);
    }
}
  657. std::vector<conv_bias::TestArg> get_nchw44_channel_wise_benchmark_args(
  658. std::vector<size_t> kernel, size_t stride, bool no_bias, bool no_nonlinemode,
  659. bool no_full_bias) {
  660. using namespace conv_bias;
  661. using Param = param::ConvBias;
  662. using NLMode = param::ConvBias::NonlineMode;
  663. std::vector<TestArg> args;
  664. auto pack = [&](size_t n, size_t group, size_t w, size_t h, size_t kernel,
  665. size_t stride, NLMode nlmode, bool pad) {
  666. Param param;
  667. param.stride_h = stride;
  668. param.stride_w = stride;
  669. if (pad) {
  670. param.pad_h = kernel / 2;
  671. param.pad_w = kernel / 2;
  672. } else {
  673. param.pad_h = 0;
  674. param.pad_w = 0;
  675. }
  676. param.nonlineMode = nlmode;
  677. param.format = param::ConvBias::Format::NCHW44;
  678. param.sparse = param::ConvBias::Sparse::GROUP;
  679. args.emplace_back(
  680. param, TensorShape{n, group, h, w, 4},
  681. TensorShape{group, 1, 1, kernel, kernel, 4}, TensorShape{});
  682. if (!no_bias) {
  683. args.emplace_back(
  684. param, TensorShape{n, group, h, w, 4},
  685. TensorShape{group, 1, 1, kernel, kernel, 4},
  686. TensorShape{1, group, 1, 1, 4});
  687. }
  688. if (!no_full_bias) {
  689. args.emplace_back(
  690. param, TensorShape{n, group, h, w, 4},
  691. TensorShape{group, 1, 1, kernel, kernel, 4},
  692. TensorShape{
  693. n, group, (h + 2 * param.pad_w - kernel) / stride + 1,
  694. (w + 2 * param.pad_w - kernel) / stride + 1, 4});
  695. }
  696. };
  697. std::vector<NLMode> nonlinemode = {NLMode::IDENTITY};
  698. if (!no_nonlinemode) {
  699. nonlinemode.emplace_back(NLMode::RELU);
  700. nonlinemode.emplace_back(NLMode::H_SWISH);
  701. }
  702. for (size_t n : {1}) {
  703. for (auto nlmode : nonlinemode) {
  704. for (bool pad : {true}) {
  705. for (size_t group : {1, 2, 4, 128}) {
  706. for (size_t size : {40, 89, 100, 200}) {
  707. for (size_t kern : kernel) {
  708. pack(n, group, size, size, kern, stride, nlmode, pad);
  709. }
  710. }
  711. }
  712. }
  713. for (bool pad : {false}) {
  714. for (size_t group : {1, 2, 4, 8, 16, 32, 64, 128}) {
  715. for (size_t size : {40, 89, 100}) {
  716. for (size_t kern : kernel) {
  717. pack(n, group, size, size, kern, stride, nlmode, pad);
  718. }
  719. }
  720. }
  721. }
  722. }
  723. }
  724. return args;
  725. }
//! Benchmark two channel-wise NCHW44 conv_bias algorithms on the same cases:
//! algo_name0 with int8x8x32 dtypes vs algo_name1 with int8x8x16 dtypes.
//! NOTE(review): the `kernel` parameter is unused — the case list is
//! hard-wired to kernels {2, 3, 5}; confirm whether it was meant to select a
//! single kernel size.
void BENCHMARK_GROUPCONV_NCHW44_int8x8x16VS_int8x8x32(
        const char* algo_name0, const char* algo_name1, Handle* handle, size_t kernel,
        size_t stride = 1, size_t pack_size = 1) {
    auto args = get_nchw44_channel_wise_benchmark_args(
            {2, 3, 5}, stride, false, true, true);
    using namespace conv_bias;
    constexpr size_t RUN = 10;
    //! benchmark: int8 src/filter, int32 bias/output.
    Benchmarker<ConvBias> benchmark(handle);
    benchmark.set_display(false);
    benchmark.set_times(RUN);
    benchmark.set_dtype(0, dtype::Int8());
    benchmark.set_dtype(1, dtype::Int8());
    benchmark.set_dtype(2, dtype::Int32());
    benchmark.set_dtype(4, dtype::Int32());
    //! benchmark_algo1: int8 src/filter, int16 bias/output.
    Benchmarker<ConvBias> benchmark_algo1(handle);
    benchmark_algo1.set_display(false);
    benchmark_algo1.set_times(RUN);
    benchmark_algo1.set_dtype(0, dtype::Int8());
    benchmark_algo1.set_dtype(1, dtype::Int8());
    benchmark_algo1.set_dtype(2, dtype::Int16());
    benchmark_algo1.set_dtype(4, dtype::Int16());
    for (auto&& arg : args) {
        TensorLayout dst_layout;
        auto opr = handle->create_operator<ConvBias>();
        opr->param() = arg.param;
        opr->deduce_layout(
                {arg.src, dtype::Float32()}, {arg.filter, dtype::Float32()},
                {arg.bias, dtype::Float32()}, {}, dst_layout);
        //! dst.nr_elems * IC * FH * FW * 2
        //! NOTE(review): for the channel-wise filter {g,1,1,kh,kw,4},
        //! filter[1]*filter[2]*filter[3] evaluates to 1*1*kh — this estimate
        //! looks like it misses a factor of kw; verify before trusting the
        //! printed GFlops numbers.
        float computations = dst_layout.total_nr_elems() * arg.filter[1] *
                             arg.filter[2] * arg.filter[3] * 2.0 * pack_size /
                             (1024 * 1024 * 1024) * 1e3;
        benchmark.set_param(arg.param);
        auto used = algo_benchmark<ConvBias>(
                            benchmark, {arg.src, arg.filter, {}, {}, {}}, algo_name0) /
                    RUN;
        arg.param.nonlineMode = param::ConvBias::NonlineMode::IDENTITY;
        arg.param.format = param::ConvBias::Format::NCHW44;
        benchmark_algo1.set_param(arg.param);
        auto used_algo1 = algo_benchmark<ConvBias>(
                                  benchmark_algo1, {arg.src, arg.filter, {}, {}, {}},
                                  algo_name1) /
                          RUN;
        printf("%s %s: normal: %f ms %f Gflops 8x8x16: %f ms %f GFlops "
               "speedup: "
               "%f\n",
               arg.src.to_string().c_str(), arg.filter.to_string().c_str(), used,
               computations / used, used_algo1, computations / used_algo1,
               used / used_algo1);
    }
}
  777. #if MEGDNN_AARCH64
TEST_F(ARM_COMMON, BENCHMARK_NCHW_VS_NCHW44_INT8x8x32) {
    //! int8x8x32 3x3: NCHW K4X4X16 im2col vs NCHW44 MK4_4X4X16 im2col.
    printf("=========================compare "
           "IM2COLMATMUL:AARCH64_INT8X8X32_K4X4X16, "
           "IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16 \n");
    BENCHMARK_IM2COL_NCHW44_VS_NCHW(
            "IM2COLMATMUL:AARCH64_INT8X8X32_K4X4X16",
            "IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16", handle(), 3, dtype::Int8(),
            dtype::Int32());
}
  787. #endif
TEST_F(ARM_COMMON, BENCHMARK_NCHW_VS_NCHW44_INT8x8x16) {
    //! int8x8x16 3x3: default im2col matmul vs the MK4 variant; kernel names
    //! differ per architecture.
#if MEGDNN_ARMV7
    const char* default_algo = "IM2COLMATMUL:ARMV7_INT8X8X16_K4X8X8";
    const char* mk4_algo = "IM2COLMATMUL:ARMV7_INT8X8X16_MK4_K8X8X4";
    printf("compare %s vs %s \n", default_algo, mk4_algo);
    BENCHMARK_IM2COL_NCHW44_VS_NCHW(
            default_algo, mk4_algo, handle(), 3, dtype::Int8(), dtype::Int16());
#else
    const char* default_algo = "IM2COLMATMUL:AARCH64_INT8X8X16_K4X4X16";
    const char* mk4_algo = "IM2COLMATMUL:AARCH64_INT8X8X16_MK4_4X4X8";
    printf("compare %s vs %s \n", default_algo, mk4_algo);
    BENCHMARK_IM2COL_NCHW44_VS_NCHW(
            default_algo, mk4_algo, handle(), 3, dtype::Int8(), dtype::Int16());
#endif
}
TEST_F(ARM_COMMON, BENCHMARK_GROUP_CONV_NCHW44_INT8x8x32_VS_INT8x8x16_STRIDE1) {
    //! stride-1 channel-wise NCHW44: int8x8x32 direct vs int8x8x16 direct.
    BENCHMARK_GROUPCONV_NCHW44_int8x8x16VS_int8x8x32(
            "S8_CHAN_WISE_STRD1_NCHW44", "S8x8x16_CHAN_WISE_STRD1_STRD2_NCHW44",
            handle(), 3, 1, 4);
}
TEST_F(ARM_COMMON, BENCHMARK_GROUP_CONV_NCHW44_INT8x8x32_VS_INT8x8x16_STRIDE2) {
    //! stride-2 channel-wise NCHW44: int8x8x32 direct vs int8x8x16 direct.
    BENCHMARK_GROUPCONV_NCHW44_int8x8x16VS_int8x8x32(
            "S8_CHAN_WISE_STRD2_NCHW44", "S8x8x16_CHAN_WISE_STRD1_STRD2_NCHW44",
            handle(), 3, 2, 4);
}
  813. TEST_F(ARM_COMMON, BENCHMARK_GROUP_CONVBIAS_QUANTIZED) {
  814. constexpr size_t RUNS = 50;
  815. param::ConvBias param;
  816. param.sparse = param::ConvBias::Sparse::GROUP;
  817. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  818. Benchmarker<ConvBias> benchmarker_int(handle());
  819. benchmarker_int.set_times(RUNS)
  820. .set_dtype(0, dtype::QuantizedS8(2.5f))
  821. .set_dtype(1, dtype::QuantizedS8(2.5f))
  822. .set_dtype(2, dtype::QuantizedS32(6.25f))
  823. .set_dtype(4, dtype::QuantizedS8(40.25f))
  824. .set_display(false);
  825. Benchmarker<ConvBias> benchmarker_float(handle());
  826. benchmarker_float.set_display(false).set_times(RUNS);
  827. auto run = [&](size_t N, size_t GROUP, size_t IC, size_t OC, size_t H, size_t W,
  828. size_t FS, size_t STRD) {
  829. megdnn_assert(IC % GROUP == 0 && OC % GROUP == 0);
  830. TensorShape src({N, IC, H, W}), filter({GROUP, OC / GROUP, IC / GROUP, FS, FS}),
  831. bias({1, OC, 1, 1}), dst({N, OC, H / STRD, W / STRD});
  832. param.pad_h = FS / 2;
  833. param.pad_w = FS / 2;
  834. param.stride_h = STRD;
  835. param.stride_w = STRD;
  836. auto int_used =
  837. benchmarker_int.set_param(param).exec({src, filter, bias, {}, dst}) /
  838. RUNS;
  839. auto float_used =
  840. benchmarker_float.set_param(param).exec({src, filter, bias, {}, dst}) /
  841. RUNS;
  842. float computations = (IC / GROUP * FS * FS * dst.total_nr_elems() * 2 +
  843. dst.total_nr_elems()) *
  844. 1e-6;
  845. printf("run: %s %s %s->%s \nfloat: %f ms %f Gflops int: %f ms "
  846. "%f Gflops speedup: %f\n",
  847. src.to_string().c_str(), filter.to_string().c_str(),
  848. bias.to_string().c_str(), dst.to_string().c_str(), float_used,
  849. computations / float_used, int_used, computations / int_used,
  850. float_used / int_used);
  851. };
  852. run(1, 1, 28, 28, 28, 28, 3, 1);
  853. run(1, 68, 68, 68, 14, 14, 3, 2);
  854. run(1, 96, 96, 96, 14, 14, 3, 2);
  855. run(1, 100, 100, 100, 7, 7, 3, 1);
  856. }
  857. #endif
  858. #if MEGDNN_WITH_BENCHMARK
TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_MATMUL) {
    //! Compare the heuristic-chosen quantized conv_bias against the run that
    //! is pinned to the "S8MATMUL" algorithm, printing timing and speedup.
    constexpr size_t RUNS = 10;
    param::ConvBias param;
    param.stride_h = 1;
    param.stride_w = 1;
    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
    Benchmarker<ConvBias> benchmarker(handle()), benchmarker_fused(handle());
    benchmarker.set_times(RUNS)
            .set_dtype(0, dtype::QuantizedS8(2.5f))
            .set_dtype(1, dtype::QuantizedS8(2.5f))
            .set_dtype(2, dtype::QuantizedS32(6.25f))
            .set_dtype(4, dtype::QuantizedS8(40.25f))
            .set_display(false);
    benchmarker_fused.set_times(RUNS)
            .set_dtype(0, dtype::QuantizedS8(2.5f))
            .set_dtype(1, dtype::QuantizedS8(2.5f))
            .set_dtype(2, dtype::QuantizedS32(6.25f))
            .set_dtype(4, dtype::QuantizedS8(40.25f))
            .set_display(false);
    //! pin the fused run to the S8MATMUL algorithm.
    benchmarker_fused.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBias>("S8MATMUL"));
    //! one dense SAME-padded case; dst keeps the src spatial size.
    auto run = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS) {
        TensorShape src({N, IC, H, W}), filter({OC, IC, FS, FS}), bias({1, OC, 1, 1}),
                dst({N, OC, H, W});
        param.pad_h = FS / 2;
        param.pad_w = FS / 2;
        auto default_used =
                benchmarker.set_param(param).exec({src, filter, bias, {}, dst}) / RUNS;
        auto fused_used =
                benchmarker_fused.set_param(param).exec({src, filter, bias, {}, dst}) /
                RUNS;
        //! MACs*2 (+1 per-tap term for bias/nonline), scaled so
        //! ops*1e-6 / ms == Gflops.
        float computations = IC * (FS * FS + 1) * dst.total_nr_elems() * 2 * 1e-6;
        printf("run: %s %s %s->%s \ndefault: %f ms %f Gflops fused: %f ms "
               "%f Gflops speedup: %f\n",
               src.to_string().c_str(), filter.to_string().c_str(),
               bias.to_string().c_str(), dst.to_string().c_str(), default_used,
               computations / default_used, fused_used, computations / fused_used,
               default_used / fused_used);
    };
    run(1, 128, 128, 32, 32, 3);
    for (size_t IC : {36, 48}) {
        for (size_t OC : {36, 48, 64}) {
            for (size_t size : {56, 128, 256}) {
                for (size_t FS : {1, 3, 5}) {
                    run(1, IC, OC, size, size, FS);
                }
            }
        }
    }
}
  909. #endif
  910. #if MEGDNN_WITH_BENCHMARK
TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_8X8X16_DIRECT_STRIDE1) {
    //! direct NCHW44 int8x8x16 vs int8x8x32, stride 1, kernels 2/3/5/7.
    benchmark_nchw44_8x8x16_vs_8x8x32("S8x8x16_NCHW44_DIRECT", handle(), 2, 1, 4);
    benchmark_nchw44_8x8x16_vs_8x8x32("S8x8x16_NCHW44_DIRECT", handle(), 3, 1, 4);
    benchmark_nchw44_8x8x16_vs_8x8x32("S8x8x16_NCHW44_DIRECT", handle(), 5, 1, 4);
    benchmark_nchw44_8x8x16_vs_8x8x32("S8x8x16_NCHW44_DIRECT", handle(), 7, 1, 4);
}
TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_8X8X16_DIRECT_STRIDE2) {
    //! direct NCHW44 int8x8x16 vs int8x8x32, stride 2, kernels 2/3/5/7.
    benchmark_nchw44_8x8x16_vs_8x8x32("S8x8x16_NCHW44_DIRECT", handle(), 2, 2, 4);
    benchmark_nchw44_8x8x16_vs_8x8x32("S8x8x16_NCHW44_DIRECT", handle(), 3, 2, 4);
    benchmark_nchw44_8x8x16_vs_8x8x32("S8x8x16_NCHW44_DIRECT", handle(), 5, 2, 4);
    benchmark_nchw44_8x8x16_vs_8x8x32("S8x8x16_NCHW44_DIRECT", handle(), 7, 2, 4);
}
TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F23) {
    //! fp32 winograd, output tile 2, 3x3 kernel (algo suffix ":1:2").
#if MEGDNN_AARCH64
    benchmark_winograd("WINOGRAD:AARCH64_F32:1:2", handle(), 3);
#else
    benchmark_winograd("WINOGRAD:ARMV7_F32_:1:2", handle(), 3);
#endif
}
TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F23_4x4) {
    //! fp32 winograd, output tile 2, 3x3 kernel, MK4 pack (pack_size 4).
#if MEGDNN_AARCH64
    benchmark_winograd("WINOGRAD:AARCH64_F32_MK4_4x16:4:2", handle(), 3, 4);
#else
    benchmark_winograd("WINOGRAD:ARMV7_F32_MK4_4x8:4:2", handle(), 3, 4);
#endif
}
TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F63) {
    //! fp32 winograd, output tile 6, 3x3 kernel (algo suffix ":1:6").
#if MEGDNN_AARCH64
    benchmark_winograd("WINOGRAD:AARCH64_F32K8X12X1:1:6", handle(), 3);
#else
    benchmark_winograd("WINOGRAD:ARMV7_F32:1:6", handle(), 3);
#endif
}
TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F63_4x4) {
    //! fp32 winograd, output tile 6, 3x3 kernel, MK4 pack (pack_size 4).
#if MEGDNN_AARCH64
    benchmark_winograd("WINOGRAD:AARCH64_F32_MK4_4x16:4:6", handle(), 3, 4);
#else
    benchmark_winograd("WINOGRAD:ARMV7_F32_MK4_4x8:4:6", handle(), 3, 4);
#endif
}
TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F54) {
    //! fp32 winograd, output tile 5, 4x4 kernel (algo suffix ":1:5").
#if MEGDNN_AARCH64
    benchmark_winograd("WINOGRAD:AARCH64_F32K8X12X1:1:5", handle(), 4);
#else
    benchmark_winograd("WINOGRAD:ARMV7_F32:1:5", handle(), 4);
#endif
}
TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F45) {
    //! fp32 winograd, output tile 4, 5x5 kernel (algo suffix ":1:4").
#if MEGDNN_AARCH64
    benchmark_winograd("WINOGRAD:AARCH64_F32K8X12X1:1:4", handle(), 5);
#else
    benchmark_winograd("WINOGRAD:ARMV7_F32:1:4", handle(), 5);
#endif
}
  965. #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F16_F23) {
    //! fp32 vs fp16 winograd, F(2,3).
    //! NOTE(review): the AARCH64 fp16 algo string ends in ":1:6" (output
    //! tile 6) even though this is the F23 benchmark — possibly meant to be
    //! ":1:2"; confirm before relying on the comparison.
#if MEGDNN_AARCH64
    benchmark_winograd_fp16(
            "WINOGRAD:AARCH64_F32_MK4_4x16:4:2", "WINOGRAD:AARCH64_F16_K8X24X1:1:6",
            handle(), 3, 4);
#else
    benchmark_winograd_fp16(
            "WINOGRAD:ARMV7_F32:1:2", "WINOGRAD:AARCH32_F16_K4X16X1:1:2", handle(), 3);
#endif
}
TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F16_F45) {
    //! fp32 vs fp16 winograd, F(4,5): output tile 4, 5x5 kernel.
#if MEGDNN_AARCH64
    benchmark_winograd_fp16(
            "WINOGRAD:AARCH64_F32K8X12X1:1:4", "WINOGRAD:AARCH64_F16_K8X24X1:1:4",
            handle(), 5);
#else
    benchmark_winograd_fp16(
            "WINOGRAD:ARMV7_F32:1:4", "WINOGRAD:AARCH32_F16_K4X16X1:1:4", handle(), 5);
#endif
}
TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F16_F63) {
    //! fp32 vs fp16 winograd, F(6,3): output tile 6, 3x3 kernel.
#if MEGDNN_AARCH64
    benchmark_winograd_fp16(
            "WINOGRAD:AARCH64_F32K8X12X1:1:6", "WINOGRAD:AARCH64_F16_K8X24X1:1:6",
            handle(), 3);
#else
    benchmark_winograd_fp16(
            "WINOGRAD:ARMV7_F32:1:6", "WINOGRAD:AARCH32_F16_K4X16X1:1:6", handle(), 3);
#endif
}
TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F16_F23_8x8) {
    //! fp32 MK4 vs fp16 MK8 winograd, F(2,3), pack_size 8.
#if MEGDNN_AARCH64
    benchmark_winograd_fp16(
            "WINOGRAD:AARCH64_F32_MK4_4x16:4:2", "WINOGRAD:AARCH64_F16_MK8_8X8:8:2",
            handle(), 3, 8);
#else
    benchmark_winograd_fp16(
            "WINOGRAD:ARMV7_F32_MK4_4x8:4:2", "WINOGRAD:AARCH32_F16_MK8_4X8:8:2",
            handle(), 3, 8);
#endif
}
  1007. #endif
//! Benchmark winograd conv_bias on plain NCHW vs NCHW44 layouts over a set
//! of 3x3 stride-1 pad-1 shapes. algo_name0/algo_name1 are matmul-algo
//! suffixes; the full algorithm names become "WINOGRAD:<algo0>" and
//! "WINOGRAD_NCHW44:<algo1>".
void benchmark_winograd_nchw_vs_nchw44(
        const char* algo_name0, const char* algo_name1, Handle* handle) {
    using namespace conv_bias;
    using NLMode = param::ConvBias::NonlineMode;
    std::vector<conv_bias::TestArg> args_nchw44;
    std::vector<conv_bias::TestArg> args_nchw;
    //! Append one matching pair of cases (same logical shape) to both lists.
    auto pack = [&](size_t n, size_t oc, size_t ic, size_t h, size_t w, size_t group,
                    NLMode nlmode) {
        param::ConvBias param;
        param.format = param::ConvBias::Format::NCHW44;
        param.stride_h = 1;
        param.stride_w = 1;
        param.pad_h = 1;
        param.pad_w = 1;
        param.nonlineMode = nlmode;
        if (group == 1) {
            param.sparse = param::ConvBias::Sparse::DENSE;
            args_nchw44.emplace_back(
                    param, TensorShape{n, ic / 4, h, w, 4},
                    TensorShape{oc / 4, ic / 4, 3, 3, 4, 4}, TensorShape{});
            param.format = param::ConvBias::Format::NCHW;
            args_nchw.emplace_back(
                    param, TensorShape{n, ic, h, w}, TensorShape{oc, ic, 3, 3},
                    TensorShape{});
        } else {
            auto oc_per_group = oc / group;
            auto ic_per_group = ic / group;
            param.sparse = param::ConvBias::Sparse::GROUP;
            //! NOTE(review): the NCHW44 src here carries only ic_per_group/4
            //! channel blocks, while a group-conv src normally carries all ic
            //! channels. Only group==1 is exercised below, so this branch is
            //! currently dead — verify the shape before enabling group > 1.
            args_nchw44.emplace_back(
                    param, TensorShape{n, ic_per_group / 4, h, w, 4},
                    TensorShape{group, oc_per_group / 4, ic_per_group / 4, 3, 3, 4, 4},
                    TensorShape{});
            param.format = param::ConvBias::Format::NCHW;
            args_nchw.emplace_back(
                    param, TensorShape{n, ic, h, w},
                    TensorShape{group, oc_per_group, ic_per_group, 3, 3},
                    TensorShape{});
        }
    };
    std::vector<NLMode> nonlinemode = {NLMode::IDENTITY};
    for (auto nlmode : nonlinemode)
        for (size_t n : {1})
            for (size_t group = 1; group <= 1; ++group) {
                pack(n, 512, 512, 15, 15, group, nlmode);
                pack(n, 512, 256, 15, 15, group, nlmode);
                pack(n, 256, 256, 29, 29, group, nlmode);
                pack(n, 256, 128, 29, 29, group, nlmode);
                pack(n, 128, 128, 57, 57, group, nlmode);
                pack(n, 128, 64, 57, 57, group, nlmode);
                pack(n, 24, 24, 224, 224, group, nlmode);
                pack(n, 64, 24, 123, 123, group, nlmode);
                pack(n, 64, 64, 56, 56, group, nlmode);
                pack(n, 128, 128, 28, 28, group, nlmode);
                pack(n, 256, 256, 14, 14, group, nlmode);
                pack(n, 512, 512, 7, 7, group, nlmode);
            }
    using namespace conv_bias;
    constexpr size_t RUN = 10;
    Benchmarker<ConvBias> benchmark_winograd_nchw(handle);
    benchmark_winograd_nchw.set_display(false);
    benchmark_winograd_nchw.set_times(RUN);
    Benchmarker<ConvBias> benchmark_winograd_nchw44(handle);
    benchmark_winograd_nchw44.set_display(false);
    benchmark_winograd_nchw44.set_times(RUN);
    std::string winograd_nchw_algo_name = ssprintf("WINOGRAD:%s", algo_name0);
    std::string winograd_nchw44_algo_name = ssprintf("WINOGRAD_NCHW44:%s", algo_name1);
    //! args_nchw and args_nchw44 are index-aligned by construction in pack().
    for (size_t i = 0; i < args_nchw.size(); ++i) {
        auto arg_nchw = args_nchw[i];
        auto arg_nchw44 = args_nchw44[i];
        TensorLayout dst_layout;
        auto opr = handle->create_operator<ConvBias>();
        opr->param() = arg_nchw.param;
        opr->deduce_layout(
                {arg_nchw.src, dtype::Float32()}, {arg_nchw.filter, dtype::Float32()},
                {arg_nchw.bias, dtype::Float32()}, {}, dst_layout);
        //! dst.nr_elems * IC * FH * FW * 2
        float computations = dst_layout.total_nr_elems() * arg_nchw.filter[1] *
                             arg_nchw.filter[2] * arg_nchw.filter[3] * 2.0 /
                             (1024 * 1024 * 1024) * 1e3;
        benchmark_winograd_nchw.set_param(arg_nchw.param);
        auto nchw_used = algo_benchmark<ConvBias>(
                                 benchmark_winograd_nchw,
                                 {arg_nchw.src, arg_nchw.filter, {}, {}, {}},
                                 winograd_nchw_algo_name.c_str()) /
                         RUN;
        benchmark_winograd_nchw44.set_param(arg_nchw44.param);
        auto nchw44_used = algo_benchmark<ConvBias>(
                                   benchmark_winograd_nchw44,
                                   {arg_nchw44.src, arg_nchw44.filter, {}, {}, {}},
                                   winograd_nchw44_algo_name.c_str()) /
                           RUN;
        printf("%s %s: nchw: %f ms %f Gflops nchw44: %f ms %f GFlops "
               "speedup: "
               "%f\n",
               arg_nchw.src.to_string().c_str(), arg_nchw.filter.to_string().c_str(),
               nchw_used, computations / nchw_used, nchw44_used,
               computations / nchw44_used, nchw_used / nchw44_used);
    }
}
TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F23_MK4_NCHW_VS_NCHW44) {
    //! winograd F(2,3) MK4: NCHW vs NCHW44 with the same matmul kernel.
#if MEGDNN_AARCH64
    benchmark_winograd_nchw_vs_nchw44(
            "AARCH64_F32_MK4_4x16:4:2", "AARCH64_F32_MK4_4x16:4:2", handle());
#else
    benchmark_winograd_nchw_vs_nchw44(
            "ARMV7_F32_MK4_4x8:4:2", "ARMV7_F32_MK4_4x8:4:2", handle());
#endif
}
TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F63_MK4_NCHW_VS_NCHW44) {
    //! winograd F(6,3) MK4: NCHW vs NCHW44 with the same matmul kernel.
#if MEGDNN_AARCH64
    benchmark_winograd_nchw_vs_nchw44(
            "AARCH64_F32_MK4_4x16:4:6", "AARCH64_F32_MK4_4x16:4:6", handle());
#else
    benchmark_winograd_nchw_vs_nchw44(
            "ARMV7_F32_MK4_4x8:4:6", "ARMV7_F32_MK4_4x8:4:6", handle());
#endif
}
TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F73_MK4_NCHW_VS_NCHW44) {
    //! winograd F(7,3) on the NCHW44 side.
    //! NOTE(review): the NCHW baseline uses output tile 6 (":4:6") while the
    //! NCHW44 side uses tile 7 (":4:7"), so the two runs use different tile
    //! sizes — confirm this asymmetry is intentional.
#if MEGDNN_AARCH64
    benchmark_winograd_nchw_vs_nchw44(
            "AARCH64_F32_MK4_4x16:4:6", "ARM_COMMON_F32_GEMV_MK4:4:7", handle());
#else
    benchmark_winograd_nchw_vs_nchw44(
            "ARMV7_F32_MK4_4x8:4:6", "ARMV7_F32_MK4_4x8:4:7", handle());
#endif
}
TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F23_8x8) {
    //! Compare fp32 winograd F(2,3) against the quantized int16x16x32 MK8
    //! winograd on the same shapes; print timing, GFlops and speedup.
    auto benchmark_winograd_quantized = [](const char* algo_name_fp32,
                                           const char* algo_name_quantized,
                                           Handle* handle, size_t kernel) {
        auto&& args = get_winograd_benchmark_args(kernel);
        using namespace conv_bias;
        constexpr size_t RUN = 10;
        //! benchmark: fp32 run with default dtypes.
        Benchmarker<ConvBias> benchmark(handle);
        benchmark.set_display(false);
        benchmark.set_times(RUN);
        //! benchmark_winograd: quantized-int8 run.
        Benchmarker<ConvBias> benchmark_winograd(handle);
        benchmark_winograd.set_display(false).set_times(RUN);
        benchmark_winograd.set_dtype(0, dtype::QuantizedS8(2.5f))
                .set_dtype(1, dtype::QuantizedS8(2.5f))
                .set_dtype(2, dtype::QuantizedS32(6.25f))
                .set_dtype(4, dtype::QuantizedS8(60.25f));
        for (auto&& arg : args) {
            TensorLayout dst_layout;
            auto opr = handle->create_operator<ConvBias>();
            opr->param() = arg.param;
            opr->deduce_layout(
                    {arg.src, dtype::Float32()}, {arg.filter, dtype::Float32()},
                    {arg.bias, dtype::Float32()}, {}, dst_layout);
            //! dst.nr_elems * IC * FH * FW * 2
            float computations = dst_layout.total_nr_elems() * arg.filter[1] *
                                 arg.filter[2] * arg.filter[3] * 2.0 /
                                 (1024 * 1024 * 1024) * 1e3;
            benchmark.set_param(arg.param);
            auto used = algo_benchmark<ConvBias>(
                                benchmark, {arg.src, arg.filter, {}, {}, {}},
                                algo_name_fp32) /
                        RUN;
            benchmark_winograd.set_param(arg.param);
            auto used_winograd =
                    algo_benchmark<ConvBias>(
                            benchmark_winograd, {arg.src, arg.filter, {}, {}, {}},
                            algo_name_quantized) /
                    RUN;
            printf("%s %s: normal: %f ms %f Gflops winograd: %f ms %f GFlops "
                   "speedup: "
                   "%f\n",
                   arg.src.to_string().c_str(), arg.filter.to_string().c_str(), used,
                   computations / used, used_winograd, computations / used_winograd,
                   used / used_winograd);
        }
    };
#if MEGDNN_AARCH64
    benchmark_winograd_quantized(
            "WINOGRAD:AARCH64_F32_MK4_4x16:4:2",
            "WINOGRAD:AARCH64_INT16X16X32_MK8_8X8:8:2", handle(), 3);
#else
    benchmark_winograd_quantized(
            "WINOGRAD:ARMV7_F32_MK4_4x8:4:2", "WINOGRAD:ARMV7_INT16X16X32_MK8_4X8:8:2",
            handle(), 3);
#endif
}
TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_INT8_STRIDE1) {
    // have to remove preferred restrict in usable func before run the benchmark
    //! Compare the direct stride-1 int8 algorithm ("S8STRD1", pinned via the
    //! before-exec checker) against whatever algorithm the heuristic picks.
    using namespace conv_bias;
    std::vector<TestArg> args;
    //! Build one channel-bias case; skip degenerate shapes where the padded
    //! input is smaller than the kernel.
    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, size_t p,
                   NonlineMode nonline_mode) {
        if (w + 2 * p < kernel || h + 2 * p < kernel)
            return;
        param::ConvBias param;
        param.stride_h = 1;
        param.stride_w = 1;
        param.pad_h = p;
        param.pad_w = p;
        param.nonlineMode = nonline_mode;
        //! channel bias
        args.emplace_back(
                param, TensorShape{2, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
                TensorShape{1, oc, 1, 1});
    };
    for (size_t kernel : {2, 3, 5, 7})
        for (size_t ic : {1, 8, 16, 32})
            for (size_t oc : {1, 8, 16, 32})
                for (size_t p : {1})
                    for (NonlineMode nonline_mode : {NonlineMode::RELU}) {
                        run(oc, ic, 56, 56, kernel, p, nonline_mode);
                        run(oc, ic, 128, 128, kernel, p, nonline_mode);
                        run(oc, ic, 256, 256, kernel, p, nonline_mode);
                    }
    constexpr size_t RUN = 50;
    //! benchmark0: pinned to the S8STRD1 algorithm.
    Benchmarker<ConvBias> benchmark0(handle());
    benchmark0.set_dtype(0, dtype::QuantizedS8(2.5f))
            .set_dtype(1, dtype::QuantizedS8(2.5f))
            .set_dtype(2, dtype::QuantizedS32(6.25f))
            .set_dtype(4, dtype::QuantizedS8(60.25f));
    benchmark0.set_display(false);
    benchmark0.set_times(RUN);
    benchmark0.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("S8STRD1"));
    //! benchmark1: no algorithm restriction (heuristic choice).
    Benchmarker<ConvBias> benchmark1(handle());
    benchmark1.set_dtype(0, dtype::QuantizedS8(2.5f))
            .set_dtype(1, dtype::QuantizedS8(2.5f))
            .set_dtype(2, dtype::QuantizedS32(6.25f))
            .set_dtype(4, dtype::QuantizedS8(60.25f));
    benchmark1.set_display(false);
    benchmark1.set_times(RUN);
    for (auto&& arg : args) {
        TensorLayout dst_layout;
        auto opr = handle()->create_operator<ConvBias>();
        opr->param() = arg.param;
        opr->deduce_layout(
                {arg.src, dtype::Int8()}, {arg.filter, dtype::Int8()},
                {arg.bias, dtype::Int32()}, {}, dst_layout);
        //! dst.nr_elems * IC * FH * FW * 2
        float computations = dst_layout.total_nr_elems() * arg.filter[1] *
                             arg.filter[2] * arg.filter[3] * 2.0 /
                             (1024 * 1024 * 1024) * 1e3;
        auto used0 = benchmark0.set_param(arg.param).exec(
                             {arg.src, arg.filter, arg.bias, {}, {}}) /
                     RUN;
        auto used1 = benchmark1.set_param(arg.param).exec(
                             {arg.src, arg.filter, arg.bias, {}, {}}) /
                     RUN;
        printf("%s %s: conv_bias: %f ms %f Gflops conv_elem: %f ms %f GFlops "
               "speedup: %f\n",
               arg.src.to_string().c_str(), arg.filter.to_string().c_str(), used0,
               computations / used0, used1, computations / used1, used1 / used0);
    }
}
TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_INT8_STRIDE2) {
    // have to remove preferred restrict in usable func before run the benchmark
    //! Compare the direct stride-2 int8 algorithm ("S8STRD2", pinned via the
    //! before-exec checker) against whatever algorithm the heuristic picks.
    using namespace conv_bias;
    std::vector<TestArg> args;
    //! Build one channel-bias case; skip degenerate shapes where the padded
    //! input is smaller than the kernel.
    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, size_t p,
                   NonlineMode nonline_mode) {
        if (w + 2 * p < kernel || h + 2 * p < kernel)
            return;
        param::ConvBias param;
        param.stride_h = 2;
        param.stride_w = 2;
        param.pad_h = p;
        param.pad_w = p;
        param.nonlineMode = nonline_mode;
        //! channel bias
        args.emplace_back(
                param, TensorShape{2, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
                TensorShape{1, oc, 1, 1});
    };
    for (size_t kernel : {2, 3, 5, 7})
        for (size_t ic : {1, 8, 16, 32})
            for (size_t oc : {1, 8, 16, 32})
                for (size_t p : {1})
                    for (NonlineMode nonline_mode : {NonlineMode::RELU}) {
                        run(oc, ic, 56, 56, kernel, p, nonline_mode);
                        run(oc, ic, 128, 128, kernel, p, nonline_mode);
                        run(oc, ic, 256, 256, kernel, p, nonline_mode);
                    }
    constexpr size_t RUN = 50;
    //! benchmark0: pinned to the S8STRD2 algorithm.
    Benchmarker<ConvBias> benchmark0(handle());
    benchmark0.set_dtype(0, dtype::QuantizedS8(2.5f))
            .set_dtype(1, dtype::QuantizedS8(2.5f))
            .set_dtype(2, dtype::QuantizedS32(6.25f))
            .set_dtype(4, dtype::QuantizedS8(60.25f));
    benchmark0.set_display(false);
    benchmark0.set_times(RUN);
    benchmark0.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("S8STRD2"));
    //! benchmark1: no algorithm restriction (heuristic choice).
    Benchmarker<ConvBias> benchmark1(handle());
    benchmark1.set_dtype(0, dtype::QuantizedS8(2.5f))
            .set_dtype(1, dtype::QuantizedS8(2.5f))
            .set_dtype(2, dtype::QuantizedS32(6.25f))
            .set_dtype(4, dtype::QuantizedS8(60.25f));
    benchmark1.set_display(false);
    benchmark1.set_times(RUN);
    for (auto&& arg : args) {
        TensorLayout dst_layout;
        auto opr = handle()->create_operator<ConvBias>();
        opr->param() = arg.param;
        opr->deduce_layout(
                {arg.src, dtype::Int8()}, {arg.filter, dtype::Int8()},
                {arg.bias, dtype::Int32()}, {}, dst_layout);
        //! dst.nr_elems * IC * FH * FW * 2
        float computations = dst_layout.total_nr_elems() * arg.filter[1] *
                             arg.filter[2] * arg.filter[3] * 2.0 /
                             (1024 * 1024 * 1024) * 1e3;
        auto used0 = benchmark0.set_param(arg.param).exec(
                             {arg.src, arg.filter, arg.bias, {}, {}}) /
                     RUN;
        auto used1 = benchmark1.set_param(arg.param).exec(
                             {arg.src, arg.filter, arg.bias, {}, {}}) /
                     RUN;
        printf("%s %s: conv_bias: %f ms %f Gflops conv_elem: %f ms %f GFlops "
               "speedup: %f\n",
               arg.src.to_string().c_str(), arg.filter.to_string().c_str(), used0,
               computations / used0, used1, computations / used1, used1 / used0);
    }
}
  1326. TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_QUINT8_STRIDE1) {
  1327. // have to remove preferred restrict in usable func before run the benchmark
  1328. using namespace conv_bias;
  1329. std::vector<TestArg> args;
  1330. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, size_t p,
  1331. NonlineMode nonline_mode) {
  1332. if (w + 2 * p < kernel || h + 2 * p < kernel)
  1333. return;
  1334. param::ConvBias param;
  1335. param.stride_h = 1;
  1336. param.stride_w = 1;
  1337. param.pad_h = p;
  1338. param.pad_w = p;
  1339. param.nonlineMode = nonline_mode;
  1340. //! channel bias
  1341. args.emplace_back(
  1342. param, TensorShape{2, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
  1343. TensorShape{1, oc, 1, 1});
  1344. };
  1345. for (size_t kernel : {2, 3, 5, 7})
  1346. for (size_t ic : {1, 8, 16, 32})
  1347. for (size_t oc : {1, 8, 16, 32})
  1348. for (size_t p : {1})
  1349. for (NonlineMode nonline_mode : {NonlineMode::RELU}) {
  1350. run(oc, ic, 56, 56, kernel, p, nonline_mode);
  1351. run(oc, ic, 128, 128, kernel, p, nonline_mode);
  1352. run(oc, ic, 256, 256, kernel, p, nonline_mode);
  1353. }
  1354. constexpr size_t RUN = 50;
  1355. Benchmarker<ConvBias> benchmark0(handle());
  1356. benchmark0.set_dtype(0, dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(100)))
  1357. .set_dtype(1, dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(120)))
  1358. .set_dtype(2, dtype::QuantizedS32(0.04f))
  1359. .set_dtype(4, dtype::Quantized8Asymm(1.4f, static_cast<uint8_t>(110)));
  1360. benchmark0.set_display(false);
  1361. benchmark0.set_times(RUN);
  1362. benchmark0.set_before_exec_callback(
  1363. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("QU8STRD1"));
  1364. Benchmarker<ConvBias> benchmark1(handle());
  1365. benchmark1.set_dtype(0, dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(100)))
  1366. .set_dtype(1, dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(120)))
  1367. .set_dtype(2, dtype::QuantizedS32(0.04f))
  1368. .set_dtype(4, dtype::Quantized8Asymm(1.4f, static_cast<uint8_t>(110)));
  1369. benchmark1.set_display(false);
  1370. benchmark1.set_times(RUN);
  1371. for (auto&& arg : args) {
  1372. TensorLayout dst_layout;
  1373. auto opr = handle()->create_operator<ConvBias>();
  1374. opr->param() = arg.param;
  1375. opr->deduce_layout(
  1376. {arg.src, dtype::Int8()}, {arg.filter, dtype::Int8()},
  1377. {arg.bias, dtype::Int32()}, {}, dst_layout);
  1378. //! dst.nr_elems * IC * FH * FW * 2
  1379. float computations = dst_layout.total_nr_elems() * arg.filter[1] *
  1380. arg.filter[2] * arg.filter[3] * 2.0 /
  1381. (1024 * 1024 * 1024) * 1e3;
  1382. auto used0 = benchmark0.set_param(arg.param).exec(
  1383. {arg.src, arg.filter, arg.bias, {}, {}}) /
  1384. RUN;
  1385. auto used1 = benchmark1.set_param(arg.param).exec(
  1386. {arg.src, arg.filter, arg.bias, {}, {}}) /
  1387. RUN;
  1388. printf("%s %s: conv_bias: %f ms %f Gflops conv_elem: %f ms %f GFlops "
  1389. "speedup: %f\n",
  1390. arg.src.to_string().c_str(), arg.filter.to_string().c_str(), used0,
  1391. computations / used0, used1, computations / used1, used1 / used0);
  1392. }
  1393. }
  1394. TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_QUINT8_STRIDE2) {
  1395. // have to remove preferred restrict in usable func before run the benchmark
  1396. using namespace conv_bias;
  1397. std::vector<TestArg> args;
  1398. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, size_t p,
  1399. NonlineMode nonline_mode) {
  1400. if (w + 2 * p < kernel || h + 2 * p < kernel)
  1401. return;
  1402. param::ConvBias param;
  1403. param.stride_h = 2;
  1404. param.stride_w = 2;
  1405. param.pad_h = p;
  1406. param.pad_w = p;
  1407. param.nonlineMode = nonline_mode;
  1408. //! channel bias
  1409. args.emplace_back(
  1410. param, TensorShape{2, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
  1411. TensorShape{1, oc, 1, 1});
  1412. };
  1413. for (size_t kernel : {2, 3, 5, 7})
  1414. for (size_t ic : {1, 8, 16, 32})
  1415. for (size_t oc : {1, 8, 16, 32})
  1416. for (size_t p : {1})
  1417. for (NonlineMode nonline_mode : {NonlineMode::RELU}) {
  1418. run(oc, ic, 56, 56, kernel, p, nonline_mode);
  1419. run(oc, ic, 128, 128, kernel, p, nonline_mode);
  1420. run(oc, ic, 256, 256, kernel, p, nonline_mode);
  1421. }
  1422. constexpr size_t RUN = 50;
  1423. Benchmarker<ConvBias> benchmark0(handle());
  1424. benchmark0.set_dtype(0, dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(100)))
  1425. .set_dtype(1, dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(120)))
  1426. .set_dtype(2, dtype::QuantizedS32(0.04f))
  1427. .set_dtype(4, dtype::Quantized8Asymm(1.4f, static_cast<uint8_t>(110)));
  1428. benchmark0.set_display(false);
  1429. benchmark0.set_times(RUN);
  1430. benchmark0.set_before_exec_callback(
  1431. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("QU8STRD2"));
  1432. Benchmarker<ConvBias> benchmark1(handle());
  1433. benchmark1.set_dtype(0, dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(100)))
  1434. .set_dtype(1, dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(120)))
  1435. .set_dtype(2, dtype::QuantizedS32(0.04f))
  1436. .set_dtype(4, dtype::Quantized8Asymm(1.4f, static_cast<uint8_t>(110)));
  1437. benchmark1.set_display(false);
  1438. benchmark1.set_times(RUN);
  1439. for (auto&& arg : args) {
  1440. TensorLayout dst_layout;
  1441. auto opr = handle()->create_operator<ConvBias>();
  1442. opr->param() = arg.param;
  1443. opr->deduce_layout(
  1444. {arg.src, dtype::Int8()}, {arg.filter, dtype::Int8()},
  1445. {arg.bias, dtype::Int32()}, {}, dst_layout);
  1446. //! dst.nr_elems * IC * FH * FW * 2
  1447. float computations = dst_layout.total_nr_elems() * arg.filter[1] *
  1448. arg.filter[2] * arg.filter[3] * 2.0 /
  1449. (1024 * 1024 * 1024) * 1e3;
  1450. auto used0 = benchmark0.set_param(arg.param).exec(
  1451. {arg.src, arg.filter, arg.bias, {}, {}}) /
  1452. RUN;
  1453. auto used1 = benchmark1.set_param(arg.param).exec(
  1454. {arg.src, arg.filter, arg.bias, {}, {}}) /
  1455. RUN;
  1456. printf("%s %s: conv_bias: %f ms %f Gflops conv_elem: %f ms %f GFlops "
  1457. "speedup: %f\n",
  1458. arg.src.to_string().c_str(), arg.filter.to_string().c_str(), used0,
  1459. computations / used0, used1, computations / used1, used1 / used0);
  1460. }
  1461. }
  1462. TEST_F(ARM_COMMON, BENCHMARK_CHANNEL_WISE_F32_STRIDE1_NCHW44) {
  1463. // have to remove preferred restrict in usable func before run the benchmark
  1464. using namespace conv_bias;
  1465. param::ConvBias param;
  1466. param.stride_h = 1;
  1467. param.stride_w = 1;
  1468. param.pad_h = 1;
  1469. param.pad_w = 1;
  1470. param.nonlineMode = NonlineMode::RELU;
  1471. param.sparse = param::ConvBias::Sparse::GROUP;
  1472. constexpr size_t RUN = 50;
  1473. Benchmarker<ConvBias> benchmark0(handle());
  1474. benchmark0.set_display(false);
  1475. benchmark0.set_param(param);
  1476. benchmark0.set_times(RUN);
  1477. benchmark0.set_before_exec_callback(
  1478. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("F32STRD1"));
  1479. auto opr = handle()->create_operator<ConvBias>();
  1480. opr->param() = param;
  1481. param.format = param::ConvBias::Format::NCHW44;
  1482. Benchmarker<ConvBias> benchmark1(handle());
  1483. benchmark1.set_display(false);
  1484. benchmark1.set_param(param);
  1485. benchmark1.set_times(RUN);
  1486. benchmark1.set_before_exec_callback(
  1487. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("F32_CHANNEL_WISE_NCHW44"));
  1488. auto run = [&](size_t group, size_t w, size_t h, size_t kernel) {
  1489. TensorLayout dst_layout;
  1490. opr->deduce_layout(
  1491. {{1, group * 4, h, w}, dtype::Int8()},
  1492. {{group * 4, 1, 1, kernel, kernel}, dtype::Int8()},
  1493. {{1, group * 4, 1, 1}, dtype::Int32()}, {}, dst_layout);
  1494. //! dst.nr_elems * IC * FH * FW * 2
  1495. float computations = dst_layout.total_nr_elems() * kernel * kernel * 2.0 /
  1496. (1024 * 1024 * 1024) * 1e3;
  1497. auto used0 = benchmark0.exec(
  1498. {{1, group * 4, h, w},
  1499. {group * 4, 1, 1, kernel, kernel},
  1500. {1, group * 4, 1, 1},
  1501. {},
  1502. {}}) /
  1503. RUN;
  1504. auto used1 = benchmark1.exec(
  1505. {{1, group, h, w, 4},
  1506. {group, 1, 1, kernel, kernel, 4},
  1507. {1, group, 1, 1, 4},
  1508. {},
  1509. {}}) /
  1510. RUN;
  1511. printf("group/h/w/kernel:%zu,%zu,%zu,%zu: nchw: %f ms %f Gflops "
  1512. "nchw44: "
  1513. "%f ms %f GFlops "
  1514. "speedup: %f\n",
  1515. group, h, w, kernel, used0, computations / used0, used1,
  1516. computations / used1, used0 / used1);
  1517. };
  1518. for (size_t group : {8, 16, 32, 64}) {
  1519. for (size_t kerenl : {2, 3, 5}) {
  1520. run(group, 112, 112, kerenl);
  1521. run(group, 56, 56, kerenl);
  1522. run(group, 48, 48, kerenl);
  1523. run(group, 28, 28, kerenl);
  1524. run(group, 14, 14, kerenl);
  1525. }
  1526. }
  1527. run(8, 112, 112, 3);
  1528. run(32, 56, 56, 3);
  1529. run(64, 28, 28, 3);
  1530. run(128, 14, 14, 3);
  1531. }
  1532. TEST_F(ARM_COMMON, BENCHMARK_CHANNEL_WISE_F32_STRIDE2_NCHW44) {
  1533. // have to remove preferred restrict in usable func before run the benchmark
  1534. using namespace conv_bias;
  1535. param::ConvBias param;
  1536. param.stride_h = 2;
  1537. param.stride_w = 2;
  1538. param.pad_h = 1;
  1539. param.pad_w = 1;
  1540. param.nonlineMode = NonlineMode::RELU;
  1541. param.sparse = param::ConvBias::Sparse::GROUP;
  1542. constexpr size_t RUN = 50;
  1543. Benchmarker<ConvBias> benchmark0(handle());
  1544. benchmark0.set_display(false);
  1545. benchmark0.set_param(param);
  1546. benchmark0.set_times(RUN);
  1547. benchmark0.set_before_exec_callback(
  1548. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("F32STRD2"));
  1549. auto opr = handle()->create_operator<ConvBias>();
  1550. opr->param() = param;
  1551. param.format = param::ConvBias::Format::NCHW44;
  1552. Benchmarker<ConvBias> benchmark1(handle());
  1553. benchmark1.set_display(false);
  1554. benchmark1.set_param(param);
  1555. benchmark1.set_times(RUN);
  1556. benchmark1.set_before_exec_callback(
  1557. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("F32_CHANNEL_WISE_NCHW44"));
  1558. auto run = [&](size_t group, size_t w, size_t h, size_t kernel) {
  1559. TensorLayout dst_layout;
  1560. opr->deduce_layout(
  1561. {{1, group * 4, h, w}, dtype::Int8()},
  1562. {{group * 4, 1, 1, kernel, kernel}, dtype::Int8()},
  1563. {{1, group * 4, 1, 1}, dtype::Int32()}, {}, dst_layout);
  1564. //! dst.nr_elems * IC * FH * FW * 2
  1565. float computations = dst_layout.total_nr_elems() * kernel * kernel * 2.0 /
  1566. (1024 * 1024 * 1024) * 1e3;
  1567. auto used0 = benchmark0.exec(
  1568. {{1, group * 4, h, w},
  1569. {group * 4, 1, 1, kernel, kernel},
  1570. {1, group * 4, 1, 1},
  1571. {},
  1572. {}}) /
  1573. RUN;
  1574. auto used1 = benchmark1.exec(
  1575. {{1, group, h, w, 4},
  1576. {group, 1, 1, kernel, kernel, 4},
  1577. {1, group, 1, 1, 4},
  1578. {},
  1579. {}}) /
  1580. RUN;
  1581. printf("group/h/w/kernel:%zu,%zu,%zu,%zu: nchw: %f ms %f Gflops "
  1582. "nchw44: "
  1583. "%f ms %f GFlops "
  1584. "speedup: %f\n",
  1585. group, h, w, kernel, used0, computations / used0, used1,
  1586. computations / used1, used0 / used1);
  1587. };
  1588. for (size_t group : {8, 16, 32, 64}) {
  1589. for (size_t kerenl : {2, 3, 5}) {
  1590. run(group, 112, 112, kerenl);
  1591. run(group, 56, 56, kerenl);
  1592. run(group, 48, 48, kerenl);
  1593. run(group, 28, 28, kerenl);
  1594. run(group, 14, 14, kerenl);
  1595. }
  1596. }
  1597. run(8, 112, 112, 3);
  1598. run(32, 56, 56, 3);
  1599. run(64, 28, 28, 3);
  1600. run(128, 14, 14, 3);
  1601. }
  1602. TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_QINT8_STRIDE1_NCHW44) {
  1603. // have to remove preferred restrict in usable func before run the benchmark
  1604. using namespace conv_bias;
  1605. param::ConvBias param;
  1606. param.stride_h = 1;
  1607. param.stride_w = 1;
  1608. param.pad_h = 1;
  1609. param.pad_w = 1;
  1610. param.nonlineMode = NonlineMode::RELU;
  1611. param.sparse = param::ConvBias::Sparse::GROUP;
  1612. constexpr size_t RUN = 50;
  1613. Benchmarker<ConvBias> benchmark0(handle());
  1614. benchmark0.set_dtype(0, dtype::QuantizedS8(0.2f))
  1615. .set_dtype(1, dtype::QuantizedS8(0.2f))
  1616. .set_dtype(2, dtype::QuantizedS32(0.04f))
  1617. .set_dtype(4, dtype::QuantizedS8(1.4f));
  1618. benchmark0.set_display(false);
  1619. benchmark0.set_param(param);
  1620. benchmark0.set_times(RUN);
  1621. benchmark0.set_before_exec_callback(
  1622. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("S8STRD1"));
  1623. auto opr = handle()->create_operator<ConvBias>();
  1624. opr->param() = param;
  1625. param.format = param::ConvBias::Format::NCHW44;
  1626. Benchmarker<ConvBias> benchmark1(handle());
  1627. benchmark1.set_dtype(0, dtype::QuantizedS8(0.2f))
  1628. .set_dtype(1, dtype::QuantizedS8(0.2f))
  1629. .set_dtype(2, dtype::QuantizedS32(0.04f))
  1630. .set_dtype(4, dtype::QuantizedS8(1.4f));
  1631. benchmark1.set_display(false);
  1632. benchmark1.set_param(param);
  1633. benchmark1.set_times(RUN);
  1634. benchmark1.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  1635. "S8_CHAN_WISE_STRD1_NCHW44"));
  1636. auto run = [&](size_t group, size_t w, size_t h, size_t kernel) {
  1637. TensorLayout dst_layout;
  1638. opr->deduce_layout(
  1639. {{1, group * 4, h, w}, dtype::Int8()},
  1640. {{group * 4, 1, 1, kernel, kernel}, dtype::Int8()},
  1641. {{1, group * 4, 1, 1}, dtype::Int32()}, {}, dst_layout);
  1642. //! dst.nr_elems * IC * FH * FW * 2
  1643. float computations = dst_layout.total_nr_elems() * kernel * kernel * 2.0 /
  1644. (1024 * 1024 * 1024) * 1e3;
  1645. auto used0 = benchmark0.exec(
  1646. {{1, group * 4, h, w},
  1647. {group * 4, 1, 1, kernel, kernel},
  1648. {1, group * 4, 1, 1},
  1649. {},
  1650. {}}) /
  1651. RUN;
  1652. auto used1 = benchmark1.exec(
  1653. {{1, group, h, w, 4},
  1654. {group, 1, 1, kernel, kernel, 4},
  1655. {1, group, 1, 1, 4},
  1656. {},
  1657. {}}) /
  1658. RUN;
  1659. printf("group/h/w/kernel:%zu,%zu,%zu,%zu: nchw: %f ms %f Gflops "
  1660. "nchw44: "
  1661. "%f ms %f GFlops "
  1662. "speedup: %f\n",
  1663. group, h, w, kernel, used0, computations / used0, used1,
  1664. computations / used1, used0 / used1);
  1665. };
  1666. for (size_t group : {8, 16, 32, 64, 128}) {
  1667. for (size_t kerenl : {2, 3, 5}) {
  1668. run(group, 112, 112, kerenl);
  1669. run(group, 56, 56, kerenl);
  1670. run(group, 48, 48, kerenl);
  1671. run(group, 28, 28, kerenl);
  1672. run(group, 14, 14, kerenl);
  1673. }
  1674. }
  1675. }
  1676. #endif
  1677. #if MGB_ENBALE_DOT
  1678. #if MEGDNN_WITH_BENCHMARK
  1679. TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_INT8_STRIDE1_WITHDOTPROD) {
  1680. // have to remove preferred restrict in usable func before run the benchmark
  1681. using namespace conv_bias;
  1682. std::vector<TestArg> args;
  1683. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, size_t p,
  1684. NonlineMode nonline_mode) {
  1685. if (w + 2 * p < kernel || h + 2 * p < kernel)
  1686. return;
  1687. param::ConvBias param;
  1688. param.stride_h = 1;
  1689. param.stride_w = 1;
  1690. param.pad_h = p;
  1691. param.pad_w = p;
  1692. param.nonlineMode = nonline_mode;
  1693. //! channel bias
  1694. args.emplace_back(
  1695. param, TensorShape{2, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
  1696. TensorShape{1, oc, 1, 1});
  1697. };
  1698. for (size_t kernel : {2, 3, 5, 7})
  1699. for (size_t ic : {1, 8, 16, 32})
  1700. for (size_t oc : {1, 8, 16, 32})
  1701. for (size_t p : {1})
  1702. for (NonlineMode nonline_mode : {NonlineMode::RELU}) {
  1703. run(oc, ic, 56, 56, kernel, p, nonline_mode);
  1704. run(oc, ic, 128, 128, kernel, p, nonline_mode);
  1705. run(oc, ic, 256, 256, kernel, p, nonline_mode);
  1706. }
  1707. constexpr size_t RUN = 50;
  1708. Benchmarker<ConvBias> benchmark0(handle());
  1709. benchmark0.set_dtype(0, dtype::QuantizedS8(2.5f))
  1710. .set_dtype(1, dtype::QuantizedS8(2.5f))
  1711. .set_dtype(2, dtype::QuantizedS32(6.25f))
  1712. .set_dtype(4, dtype::QuantizedS8(60.25f));
  1713. benchmark0.set_display(false);
  1714. benchmark0.set_times(RUN);
  1715. benchmark0.set_before_exec_callback(
  1716. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("ARMDOTS8STRD1"));
  1717. Benchmarker<ConvBias> benchmark1(handle());
  1718. benchmark1.set_dtype(0, dtype::QuantizedS8(2.5f))
  1719. .set_dtype(1, dtype::QuantizedS8(2.5f))
  1720. .set_dtype(2, dtype::QuantizedS32(6.25f))
  1721. .set_dtype(4, dtype::QuantizedS8(60.25f));
  1722. benchmark1.set_display(false);
  1723. benchmark1.set_times(RUN);
  1724. for (auto&& arg : args) {
  1725. TensorLayout dst_layout;
  1726. auto opr = handle()->create_operator<ConvBias>();
  1727. opr->param() = arg.param;
  1728. opr->deduce_layout(
  1729. {arg.src, dtype::Int8()}, {arg.filter, dtype::Int8()},
  1730. {arg.bias, dtype::Int32()}, {}, dst_layout);
  1731. //! dst.nr_elems * IC * FH * FW * 2
  1732. float computations = dst_layout.total_nr_elems() * arg.filter[1] *
  1733. arg.filter[2] * arg.filter[3] * 2.0 /
  1734. (1024 * 1024 * 1024) * 1e3;
  1735. auto used0 = benchmark0.set_param(arg.param).exec(
  1736. {arg.src, arg.filter, arg.bias, {}, {}}) /
  1737. RUN;
  1738. auto used1 = benchmark1.set_param(arg.param).exec(
  1739. {arg.src, arg.filter, arg.bias, {}, {}}) /
  1740. RUN;
  1741. printf("%s %s: conv_bias: %f ms %f Gflops conv_elem: %f ms %f GFlops "
  1742. "speedup: %f\n",
  1743. arg.src.to_string().c_str(), arg.filter.to_string().c_str(), used0,
  1744. computations / used0, used1, computations / used1, used1 / used0);
  1745. }
  1746. }
  1747. TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_INT8_STRIDE2_WITHDOTPROD) {
  1748. // have to remove preferred restrict in usable func before run the benchmark
  1749. using namespace conv_bias;
  1750. std::vector<TestArg> args;
  1751. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, size_t p,
  1752. NonlineMode nonline_mode) {
  1753. if (w + 2 * p < kernel || h + 2 * p < kernel)
  1754. return;
  1755. param::ConvBias param;
  1756. param.stride_h = 2;
  1757. param.stride_w = 2;
  1758. param.pad_h = p;
  1759. param.pad_w = p;
  1760. param.nonlineMode = nonline_mode;
  1761. //! channel bias
  1762. args.emplace_back(
  1763. param, TensorShape{2, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
  1764. TensorShape{1, oc, 1, 1});
  1765. };
  1766. for (size_t kernel : {2, 3, 5, 7})
  1767. for (size_t ic : {1, 8, 16, 32})
  1768. for (size_t oc : {1, 8, 16, 32})
  1769. for (size_t p : {1})
  1770. for (NonlineMode nonline_mode : {NonlineMode::RELU}) {
  1771. run(oc, ic, 56, 56, kernel, p, nonline_mode);
  1772. run(oc, ic, 128, 128, kernel, p, nonline_mode);
  1773. run(oc, ic, 256, 256, kernel, p, nonline_mode);
  1774. }
  1775. constexpr size_t RUN = 50;
  1776. Benchmarker<ConvBias> benchmark0(handle());
  1777. benchmark0.set_dtype(0, dtype::QuantizedS8(2.5f))
  1778. .set_dtype(1, dtype::QuantizedS8(2.5f))
  1779. .set_dtype(2, dtype::QuantizedS32(6.25f))
  1780. .set_dtype(4, dtype::QuantizedS8(60.25f));
  1781. benchmark0.set_display(false);
  1782. benchmark0.set_times(RUN);
  1783. benchmark0.set_before_exec_callback(
  1784. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("ARMDOTS8STRD2"));
  1785. Benchmarker<ConvBias> benchmark1(handle());
  1786. benchmark1.set_dtype(0, dtype::QuantizedS8(2.5f))
  1787. .set_dtype(1, dtype::QuantizedS8(2.5f))
  1788. .set_dtype(2, dtype::QuantizedS32(6.25f))
  1789. .set_dtype(4, dtype::QuantizedS8(60.25f));
  1790. benchmark1.set_display(false);
  1791. benchmark1.set_times(RUN);
  1792. for (auto&& arg : args) {
  1793. TensorLayout dst_layout;
  1794. auto opr = handle()->create_operator<ConvBias>();
  1795. opr->param() = arg.param;
  1796. opr->deduce_layout(
  1797. {arg.src, dtype::Int8()}, {arg.filter, dtype::Int8()},
  1798. {arg.bias, dtype::Int32()}, {}, dst_layout);
  1799. //! dst.nr_elems * IC * FH * FW * 2
  1800. float computations = dst_layout.total_nr_elems() * arg.filter[1] *
  1801. arg.filter[2] * arg.filter[3] * 2.0 /
  1802. (1024 * 1024 * 1024) * 1e3;
  1803. auto used0 = benchmark0.set_param(arg.param).exec(
  1804. {arg.src, arg.filter, arg.bias, {}, {}}) /
  1805. RUN;
  1806. auto used1 = benchmark1.set_param(arg.param).exec(
  1807. {arg.src, arg.filter, arg.bias, {}, {}}) /
  1808. RUN;
  1809. printf("%s %s: conv_bias: %f ms %f Gflops conv_elem: %f ms %f GFlops "
  1810. "speedup: %f\n",
  1811. arg.src.to_string().c_str(), arg.filter.to_string().c_str(), used0,
  1812. computations / used0, used1, computations / used1, used1 / used0);
  1813. }
  1814. }
  1815. TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_QUINT8_STRIDE1_WITHDOTPROD) {
  1816. // have to remove preferred restrict in usable func before run the benchmark
  1817. using namespace conv_bias;
  1818. std::vector<TestArg> args;
  1819. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, size_t p,
  1820. NonlineMode nonline_mode) {
  1821. if (w + 2 * p < kernel || h + 2 * p < kernel)
  1822. return;
  1823. param::ConvBias param;
  1824. param.stride_h = 1;
  1825. param.stride_w = 1;
  1826. param.pad_h = p;
  1827. param.pad_w = p;
  1828. param.nonlineMode = nonline_mode;
  1829. //! channel bias
  1830. args.emplace_back(
  1831. param, TensorShape{2, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
  1832. TensorShape{1, oc, 1, 1});
  1833. };
  1834. // clang-format off
  1835. for (size_t kernel : {2, 3, 5, 7})
  1836. for (size_t ic : {1, 8, 16, 32})
  1837. for (size_t oc : {1, 8, 16, 32})
  1838. for (size_t p : {1})
  1839. for (NonlineMode nonline_mode : {NonlineMode::RELU}) {
  1840. run(oc, ic, 56, 56, kernel, p, nonline_mode);
  1841. run(oc, ic, 128, 128, kernel, p, nonline_mode);
  1842. run(oc, ic, 256, 256, kernel, p, nonline_mode);
  1843. }
  1844. // clang-format on
  1845. constexpr size_t RUN = 50;
  1846. Benchmarker<ConvBias> benchmark0(handle());
  1847. benchmark0.set_dtype(0, dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(100)))
  1848. .set_dtype(1, dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(120)))
  1849. .set_dtype(2, dtype::QuantizedS32(0.04f))
  1850. .set_dtype(4, dtype::Quantized8Asymm(1.4f, static_cast<uint8_t>(110)));
  1851. benchmark0.set_display(false);
  1852. benchmark0.set_times(RUN);
  1853. benchmark0.set_before_exec_callback(
  1854. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("ARMDOTU8STRD1"));
  1855. Benchmarker<ConvBias> benchmark1(handle());
  1856. benchmark1.set_dtype(0, dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(100)))
  1857. .set_dtype(1, dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(120)))
  1858. .set_dtype(2, dtype::QuantizedS32(0.04f))
  1859. .set_dtype(4, dtype::Quantized8Asymm(1.4f, static_cast<uint8_t>(110)));
  1860. benchmark1.set_display(false);
  1861. benchmark1.set_times(RUN);
  1862. for (auto&& arg : args) {
  1863. TensorLayout dst_layout;
  1864. auto opr = handle()->create_operator<ConvBias>();
  1865. opr->param() = arg.param;
  1866. opr->deduce_layout(
  1867. {arg.src, dtype::Int8()}, {arg.filter, dtype::Int8()},
  1868. {arg.bias, dtype::Int32()}, {}, dst_layout);
  1869. //! dst.nr_elems * IC * FH * FW * 2
  1870. float computations = dst_layout.total_nr_elems() * arg.filter[1] *
  1871. arg.filter[2] * arg.filter[3] * 2.0 /
  1872. (1024 * 1024 * 1024) * 1e3;
  1873. auto used0 = benchmark0.set_param(arg.param).exec(
  1874. {arg.src, arg.filter, arg.bias, {}, {}}) /
  1875. RUN;
  1876. auto used1 = benchmark1.set_param(arg.param).exec(
  1877. {arg.src, arg.filter, arg.bias, {}, {}}) /
  1878. RUN;
  1879. printf("%s %s: conv_bias: %f ms %f Gflops conv_elem: %f ms %f GFlops "
  1880. "speedup: %f\n",
  1881. arg.src.to_string().c_str(), arg.filter.to_string().c_str(), used0,
  1882. computations / used0, used1, computations / used1, used1 / used0);
  1883. }
  1884. }
  1885. TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_QUINT8_STRIDE2_WITHDOTPROD) {
  1886. // have to remove preferred restrict in usable func before run the benchmark
  1887. using namespace conv_bias;
  1888. std::vector<TestArg> args;
  1889. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, size_t p,
  1890. NonlineMode nonline_mode) {
  1891. if (w + 2 * p < kernel || h + 2 * p < kernel)
  1892. return;
  1893. param::ConvBias param;
  1894. param.stride_h = 2;
  1895. param.stride_w = 2;
  1896. param.pad_h = p;
  1897. param.pad_w = p;
  1898. param.nonlineMode = nonline_mode;
  1899. //! channel bias
  1900. args.emplace_back(
  1901. param, TensorShape{2, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
  1902. TensorShape{1, oc, 1, 1});
  1903. };
  1904. // clang-format off
  1905. for (size_t kernel : {2, 3, 5, 7})
  1906. for (size_t ic : {1, 8, 16, 32})
  1907. for (size_t oc : {1, 8, 16, 32})
  1908. for (size_t p : {1})
  1909. for (NonlineMode nonline_mode : {NonlineMode::RELU}) {
  1910. run(oc, ic, 56, 56, kernel, p, nonline_mode);
  1911. run(oc, ic, 128, 128, kernel, p, nonline_mode);
  1912. run(oc, ic, 256, 256, kernel, p, nonline_mode);
  1913. }
  1914. // clang-format on
  1915. constexpr size_t RUN = 50;
  1916. Benchmarker<ConvBias> benchmark0(handle());
  1917. benchmark0.set_dtype(0, dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(100)))
  1918. .set_dtype(1, dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(120)))
  1919. .set_dtype(2, dtype::QuantizedS32(0.04f))
  1920. .set_dtype(4, dtype::Quantized8Asymm(1.4f, static_cast<uint8_t>(110)));
  1921. benchmark0.set_display(false);
  1922. benchmark0.set_times(RUN);
  1923. benchmark0.set_before_exec_callback(
  1924. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("ARMDOTU8STRD2"));
  1925. Benchmarker<ConvBias> benchmark1(handle());
  1926. benchmark1.set_dtype(0, dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(100)))
  1927. .set_dtype(1, dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(120)))
  1928. .set_dtype(2, dtype::QuantizedS32(0.04f))
  1929. .set_dtype(4, dtype::Quantized8Asymm(1.4f, static_cast<uint8_t>(110)));
  1930. benchmark1.set_display(false);
  1931. benchmark1.set_times(RUN);
  1932. for (auto&& arg : args) {
  1933. TensorLayout dst_layout;
  1934. auto opr = handle()->create_operator<ConvBias>();
  1935. opr->param() = arg.param;
  1936. opr->deduce_layout(
  1937. {arg.src, dtype::Int8()}, {arg.filter, dtype::Int8()},
  1938. {arg.bias, dtype::Int32()}, {}, dst_layout);
  1939. //! dst.nr_elems * IC * FH * FW * 2
  1940. float computations = dst_layout.total_nr_elems() * arg.filter[1] *
  1941. arg.filter[2] * arg.filter[3] * 2.0 /
  1942. (1024 * 1024 * 1024) * 1e3;
  1943. auto used0 = benchmark0.set_param(arg.param).exec(
  1944. {arg.src, arg.filter, arg.bias, {}, {}}) /
  1945. RUN;
  1946. auto used1 = benchmark1.set_param(arg.param).exec(
  1947. {arg.src, arg.filter, arg.bias, {}, {}}) /
  1948. RUN;
  1949. printf("%s %s: conv_bias: %f ms %f Gflops conv_elem: %f ms %f GFlops "
  1950. "speedup: %f\n",
  1951. arg.src.to_string().c_str(), arg.filter.to_string().c_str(), used0,
  1952. computations / used0, used1, computations / used1, used1 / used0);
  1953. }
  1954. }
  1955. TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_INT8_STRIDE1_WITHDOTPROD_NCHW44_DOT) {
  1956. using namespace conv_bias;
  1957. std::vector<TestArg> args;
  1958. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, size_t p,
  1959. size_t stride, NonlineMode nonline_mode) {
  1960. if (w + 2 * p < kernel || h + 2 * p < kernel)
  1961. return;
  1962. param::ConvBias param;
  1963. param.stride_h = stride;
  1964. param.stride_w = stride;
  1965. param.pad_h = p;
  1966. param.pad_w = p;
  1967. param.nonlineMode = nonline_mode;
  1968. param.format = param::ConvBias::Format::NCHW44_DOT;
  1969. //! channel bias
  1970. args.emplace_back(
  1971. param, TensorShape{1, ic / 4, h, w, 4},
  1972. TensorShape{oc / 4, ic / 4, kernel, kernel, 4, 4},
  1973. TensorShape{1, oc / 4, 1, 1, 4});
  1974. };
  1975. for (size_t stride : {1, 2})
  1976. for (size_t kernel : {2, 3, 5, 7})
  1977. for (size_t oc : {64})
  1978. for (NonlineMode nonline_mode : {NonlineMode::IDENTITY}) {
  1979. run(oc, oc, 56, 56, kernel, kernel / 2, stride, nonline_mode);
  1980. }
  1981. constexpr size_t RUN = 50;
  1982. Benchmarker<ConvBias> benchmark0(handle());
  1983. benchmark0.set_dtype(0, dtype::QuantizedS8(2.5f))
  1984. .set_dtype(1, dtype::QuantizedS8(2.5f))
  1985. .set_dtype(2, dtype::QuantizedS32(6.25f))
  1986. .set_dtype(4, dtype::QuantizedS8(60.25f));
  1987. benchmark0.set_display(false);
  1988. benchmark0.set_times(RUN);
  1989. benchmark0.set_before_exec_callback(
  1990. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("ARMDOTS8DIRECT_NCHW44"));
  1991. Benchmarker<ConvBias> benchmark1(handle());
  1992. benchmark1.set_dtype(0, dtype::QuantizedS8(2.5f))
  1993. .set_dtype(1, dtype::QuantizedS8(2.5f))
  1994. .set_dtype(2, dtype::QuantizedS32(6.25f))
  1995. .set_dtype(4, dtype::QuantizedS8(60.25f));
  1996. benchmark1.set_display(false);
  1997. benchmark1.set_times(RUN);
  1998. for (auto&& arg : args) {
  1999. TensorLayout dst_layout;
  2000. auto opr = handle()->create_operator<ConvBias>();
  2001. opr->param() = arg.param;
  2002. opr->deduce_layout(
  2003. {arg.src, dtype::Int8()}, {arg.filter, dtype::Int8()},
  2004. {arg.bias, dtype::Int32()}, {}, dst_layout);
  2005. //! dst.nr_elems * IC * FH * FW * 2
  2006. float computations = dst_layout.total_nr_elems() * arg.filter[1] *
  2007. arg.filter[2] * arg.filter[3] * 8.0 /
  2008. (1024 * 1024 * 1024) * 1e3;
  2009. auto used0 = benchmark0.set_param(arg.param).exec(
  2010. {arg.src, arg.filter, arg.bias, {}, {}}) /
  2011. RUN;
  2012. auto used1 = benchmark1.set_param(arg.param).exec(
  2013. {arg.src, arg.filter, arg.bias, {}, {}}) /
  2014. RUN;
  2015. printf("%s %s: Direct use: %f ms %f Gflops normal: %f ms %f GFlops "
  2016. "speedup: %f\n",
  2017. arg.src.to_string().c_str(), arg.filter.to_string().c_str(), used0,
  2018. computations / used0, used1, computations / used1, used1 / used0);
  2019. }
  2020. }
  2021. #endif
  2022. #endif
  2023. /*====================== BENCHMARK CONV1X1 ===========================*/
  2024. #if MEGDNN_WITH_BENCHMARK
  2025. namespace {
  2026. std::vector<conv_bias::TestArg> get_conv_bias_1x1_benchmark_args(size_t pack_size = 1) {
  2027. using namespace conv_bias;
  2028. std::vector<TestArg> args;
  2029. param::ConvBias param;
  2030. param.stride_h = 1;
  2031. param.stride_w = 1;
  2032. param.pad_h = 0;
  2033. param.pad_w = 0;
  2034. param.nonlineMode = param::ConvBias::NonlineMode::IDENTITY;
  2035. auto bench_case = [&](size_t OC, size_t IC, size_t H, size_t W) {
  2036. if (pack_size == 1)
  2037. args.emplace_back(
  2038. param, TensorShape{1, IC, H, W}, TensorShape{OC, IC, 1, 1},
  2039. TensorShape{});
  2040. else {
  2041. if (pack_size == 4)
  2042. param.format = param::ConvBias::Format::NCHW44;
  2043. args.emplace_back(
  2044. param, TensorShape{1, IC / pack_size, H, W, pack_size},
  2045. TensorShape{
  2046. OC / pack_size, IC / pack_size, 1, 1, pack_size, pack_size},
  2047. TensorShape{});
  2048. }
  2049. };
  2050. //! MobileNetV1
  2051. bench_case(64, 32, 112, 112);
  2052. bench_case(128, 64, 56, 56);
  2053. bench_case(128, 128, 56, 56);
  2054. bench_case(256, 128, 28, 28);
  2055. bench_case(256, 256, 28, 28);
  2056. bench_case(512, 256, 14, 14);
  2057. bench_case(512, 512, 14, 14);
  2058. bench_case(1024, 512, 7, 7);
  2059. bench_case(1024, 1024, 7, 7);
  2060. //! MobileNetV2
  2061. bench_case(16, 32, 112, 112);
  2062. bench_case(96, 16, 112, 112);
  2063. bench_case(144, 24, 56, 56);
  2064. bench_case(192, 32, 28, 28);
  2065. bench_case(384, 64, 28, 28);
  2066. bench_case(576, 96, 14, 14);
  2067. bench_case(960, 160, 7, 7);
  2068. bench_case(320, 960, 7, 7);
  2069. bench_case(1280, 320, 7, 7);
  2070. //! MobileNetV3-Large
  2071. bench_case(64, 16, 112, 112);
  2072. bench_case(72, 24, 56, 56);
  2073. bench_case(120, 40, 28, 28);
  2074. bench_case(240, 40, 28, 28);
  2075. bench_case(200, 80, 14, 14);
  2076. bench_case(184, 80, 14, 14);
  2077. bench_case(480, 80, 14, 14);
  2078. bench_case(672, 112, 14, 14);
  2079. //! MobileNetV3-Small
  2080. bench_case(72, 16, 56, 56);
  2081. bench_case(88, 24, 28, 28);
  2082. bench_case(96, 24, 28, 28);
  2083. bench_case(240, 40, 14, 14);
  2084. bench_case(120, 40, 14, 14);
  2085. bench_case(144, 48, 14, 14);
  2086. bench_case(288, 48, 14, 14);
  2087. bench_case(576, 96, 7, 7);
  2088. //! resnet50
  2089. bench_case(256, 64, 56, 56);
  2090. bench_case(512, 128, 28, 28);
  2091. bench_case(1024, 256, 14, 14);
  2092. bench_case(2048, 512, 7, 7);
  2093. return args;
  2094. }
  2095. void benchmark_conv1x1(
  2096. const char* matmul_algo_name, Handle* handle, DType stype, DType matmul_dtype,
  2097. DType bias_type, DType conv_dtype, bool is_mk4 = false) {
  2098. using namespace conv_bias;
  2099. int pack_size = is_mk4 ? 4 : 1;
  2100. std::vector<TestArg> conv_bias_1x1_args =
  2101. get_conv_bias_1x1_benchmark_args(pack_size);
  2102. constexpr size_t RUNS = 50;
  2103. param::MatrixMul param;
  2104. param.transposeA = false;
  2105. param.transposeB = false;
  2106. if (is_mk4) {
  2107. param.format = MatrixMul::Param::Format::MK4;
  2108. }
  2109. Benchmarker<MatrixMul> benchmark_matmul(handle);
  2110. benchmark_matmul.set_before_exec_callback(AlgoChecker<MatrixMul>(matmul_algo_name));
  2111. benchmark_matmul.set_times(RUNS)
  2112. .set_dtype(0, stype)
  2113. .set_dtype(1, stype)
  2114. .set_dtype(2, matmul_dtype)
  2115. .set_param(param)
  2116. .set_display(false);
  2117. std::string conv1x1_algo_name = ssprintf("CONV1x1:%s:24", matmul_algo_name);
  2118. Benchmarker<ConvBias> benchmark_conv1x1(handle);
  2119. benchmark_conv1x1.set_before_exec_callback(
  2120. conv_bias::ConvBiasAlgoChecker<ConvBias>(conv1x1_algo_name.c_str()));
  2121. benchmark_conv1x1.set_times(RUNS)
  2122. .set_dtype(0, stype)
  2123. .set_dtype(1, stype)
  2124. .set_dtype(2, bias_type)
  2125. .set_dtype(4, conv_dtype)
  2126. .set_display(false);
  2127. for (auto&& arg : conv_bias_1x1_args) {
  2128. size_t IC = arg.src[1];
  2129. size_t OH = arg.src[2];
  2130. size_t OW = arg.src[3];
  2131. size_t OC = arg.filter[0];
  2132. size_t M = OC * pack_size;
  2133. size_t K = IC * pack_size;
  2134. size_t N = OH * OW;
  2135. float computations = M * N * K * 2.f / (1024 * 1024 * 1024) * 1e3;
  2136. TensorShape A, B;
  2137. A = TensorShape{M, K};
  2138. B = TensorShape{K, N};
  2139. if (is_mk4) {
  2140. A = TensorShape{M / 4, K / 4, 4, 4};
  2141. B = TensorShape{K / 4, N, 4};
  2142. }
  2143. auto conv1x1_used = benchmark_conv1x1.set_param(arg.param).exec(
  2144. {arg.src, arg.filter, arg.bias, {}, {}}) /
  2145. RUNS;
  2146. auto matmul_used = benchmark_matmul.exec({A, B, {}}) / RUNS;
  2147. printf("%s %s:\n matmul: %f ms %f Gflops\nconv1x1: %f ms %f GFlops "
  2148. "speedup: "
  2149. "%f\n",
  2150. arg.src.to_string().c_str(), arg.filter.to_string().c_str(), matmul_used,
  2151. computations / matmul_used, conv1x1_used, computations / conv1x1_used,
  2152. matmul_used / conv1x1_used);
  2153. }
  2154. }
  2155. } // namespace
//! fp32 CONV1x1 vs bare fp32 matmul, using each arch's default kernel
TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_CONV1X1_S1_F32) {
#if MEGDNN_AARCH64
    benchmark_conv1x1(
            "AARCH64_F32K8X12X1", handle(), dtype::Float32{}, dtype::Float32{},
            dtype::Float32{}, dtype::Float32{});
#else
    benchmark_conv1x1(
            "ARMV7_F32", handle(), dtype::Float32{}, dtype::Float32{}, dtype::Float32{},
            dtype::Float32{});
#endif
}
  2167. #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
//! fp16 CONV1x1 vs bare fp16 matmul; only compiled when the toolchain has
//! native fp16 vector arithmetic (see the enclosing #if guard)
TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_CONV1X1_S1_F16) {
#if MEGDNN_AARCH64
    benchmark_conv1x1(
            "AARCH64_F16_K8X24X1", handle(), dtype::Float16{}, dtype::Float16{},
            dtype::Float16{}, dtype::Float16{});
#else
    benchmark_conv1x1(
            "AARCH32_F16_K4X16X1", handle(), dtype::Float16{}, dtype::Float16{},
            dtype::Float16{}, dtype::Float16{});
#endif
}
  2179. #endif
  2180. TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_CONV1X1_S1_QUANTIZEDSYM) {
  2181. dtype::QuantizedS8 stype(2.5f);
  2182. dtype::QuantizedS32 dtype(6.25f);
  2183. #if MEGDNN_AARCH64
  2184. #if MGB_ENBALE_DOT
  2185. benchmark_conv1x1(
  2186. "AARCH64_INT8X8X32_K8X12X4_DOTPROD", handle(), stype, dtype, dtype, dtype);
  2187. #else
  2188. benchmark_conv1x1("AARCH64_INT8X8X32_K8X8X8", handle(), stype, dtype, dtype, dtype);
  2189. benchmark_conv1x1(
  2190. "AARCH64_INT8X8X32_K4X4X16", handle(), stype, dtype, dtype, dtype);
  2191. #endif
  2192. #elif MEGDNN_ARMV7
  2193. benchmark_conv1x1("ARMV7_INT8X8X32_K4X8X8", handle(), stype, dtype, dtype, dtype);
  2194. #endif
  2195. }
  2196. TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_CONV1X1_S1_QUANTIZEDASYM) {
  2197. dtype::Quantized8Asymm stype(1.2f, (uint8_t)125);
  2198. dtype::QuantizedS32 dtype(1.2 * 1.2);
  2199. #if MEGDNN_AARCH64
  2200. #if MGB_ENBALE_DOT
  2201. benchmark_conv1x1(
  2202. "AARCH64_QUINT8_K8X8X4_DOTPROD", handle(), stype, dtype, dtype, dtype);
  2203. #else
  2204. benchmark_conv1x1("AARCH64_QUINT8_K8X8X8", handle(), stype, dtype, dtype, dtype);
  2205. #endif
  2206. #elif MEGDNN_ARMV7
  2207. benchmark_conv1x1("ARMV7_QUINT8_K4X8X8", handle(), stype, dtype, dtype, dtype);
  2208. #endif
  2209. }
//! int8 x int8 -> int16 CONV1x1 vs bare matmul for each arch's kernels
TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_CONV1X1_S1_INT8x8x16) {
#if MEGDNN_AARCH64
    benchmark_conv1x1(
            "AARCH64_INT8X8X16_K8X8X8", handle(), dtype::Int8{}, dtype::Int16{},
            dtype::Int16{}, dtype::Int16{});
    benchmark_conv1x1(
            "AARCH64_INT8X8X16_K4X4X16", handle(), dtype::Int8{}, dtype::Int16{},
            dtype::Int16{}, dtype::Int16{});
#elif MEGDNN_ARMV7
    benchmark_conv1x1(
            "ARMV7_INT8X8X16_K4X8X8", handle(), dtype::Int8{}, dtype::Int16{},
            dtype::Int16{}, dtype::Int16{});
    benchmark_conv1x1(
            "ARMV7_INT8X8X16_K4X2X16", handle(), dtype::Int8{}, dtype::Int16{},
            dtype::Int16{}, dtype::Int16{});
    //! last argument true -> run this kernel in MK4 (packed) format
    benchmark_conv1x1(
            "ARMV7_INT8X8X16_MK4_K8X8X4", handle(), dtype::Int8{}, dtype::Int16{},
            dtype::Int16{}, dtype::Int16{}, true);
#endif
}
  2230. TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_CONV1X1_GEMV_FP32) {
  2231. using namespace conv_bias;
  2232. std::vector<conv_bias::TestArg> args;
  2233. param::ConvBias conv_param;
  2234. conv_param.stride_h = 1;
  2235. conv_param.stride_w = 1;
  2236. conv_param.pad_h = 0;
  2237. conv_param.pad_w = 0;
  2238. conv_param.nonlineMode = param::ConvBias::NonlineMode::IDENTITY;
  2239. auto run = [&](size_t M, size_t K) {
  2240. args.emplace_back(
  2241. conv_param, TensorShape{1, K, 1, 1}, TensorShape{M, K, 1, 1},
  2242. TensorShape{});
  2243. };
  2244. for (size_t M : {4, 64, 1024, 4096})
  2245. for (size_t K : {128, 256, 1024, 4096})
  2246. run(M, K);
  2247. constexpr size_t RUNS = 50;
  2248. param::MatrixMul param;
  2249. param.transposeA = false;
  2250. param.transposeB = false;
  2251. Benchmarker<MatrixMul> benchmark_matmul(handle());
  2252. benchmark_matmul.set_before_exec_callback(
  2253. AlgoChecker<MatrixMul>("ARM_COMMON_F32_GEMV"));
  2254. benchmark_matmul.set_times(RUNS)
  2255. .set_dtype(0, dtype::Float32{})
  2256. .set_dtype(1, dtype::Float32{})
  2257. .set_dtype(2, dtype::Float32{})
  2258. .set_param(param)
  2259. .set_display(false);
  2260. Benchmarker<ConvBias> benchmark_conv1x1(handle());
  2261. benchmark_conv1x1.set_before_exec_callback(
  2262. conv_bias::ConvBiasAlgoChecker<ConvBias>("CONV1x1_GEMV"));
  2263. benchmark_conv1x1.set_times(RUNS)
  2264. .set_dtype(0, dtype::Float32{})
  2265. .set_dtype(1, dtype::Float32{})
  2266. .set_dtype(2, dtype::Float32{})
  2267. .set_dtype(4, dtype::Float32{})
  2268. .set_display(false);
  2269. std::cout << "warm up:\n";
  2270. for (int i = 0; i < 50; i++) {
  2271. benchmark_matmul.exec({{1, 1024}, {1024, 512}, {}});
  2272. benchmark_matmul.set_display(true);
  2273. }
  2274. for (auto&& arg : args) {
  2275. size_t IC = arg.src[1];
  2276. size_t OH = arg.src[2];
  2277. size_t OW = arg.src[3];
  2278. size_t OC = arg.filter[0];
  2279. size_t M = OC;
  2280. size_t K = IC;
  2281. size_t N = OH * OW;
  2282. float computations = M * N * K * 2.f / (1024 * 1024 * 1024) * 1e3;
  2283. TensorShape A, B;
  2284. A = TensorShape{M, K};
  2285. B = TensorShape{K, N};
  2286. auto conv1x1_used = benchmark_conv1x1.set_param(arg.param).exec(
  2287. {arg.src, arg.filter, arg.bias, {}, {}}) /
  2288. RUNS;
  2289. auto matmul_used = benchmark_matmul.exec({A, B, {}}) / RUNS;
  2290. printf("%s %s:\n gemv: %f ms %f Gflops\nconv1x1: %f ms %f GFlops "
  2291. "speedup: "
  2292. "%f\n",
  2293. arg.src.to_string().c_str(), arg.filter.to_string().c_str(), matmul_used,
  2294. computations / matmul_used, conv1x1_used, computations / conv1x1_used,
  2295. matmul_used / conv1x1_used);
  2296. }
  2297. }
//! only non-dotprod algorithms are enabled for now
  2299. TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_1X1_S1_NCHW_VS_NCHW44_INT8x8x32) {
  2300. std::vector<TestArg> conv_bias_1x1_args_nchw44 =
  2301. get_conv_bias_1x1_benchmark_args(4);
  2302. std::vector<TestArg> conv_bias_1x1_args_nchw = get_conv_bias_1x1_benchmark_args(1);
  2303. constexpr size_t RUNS = 50;
  2304. Benchmarker<ConvBias> benchmark_conv1x1_nchw44(handle());
  2305. benchmark_conv1x1_nchw44.set_before_exec_callback(
  2306. conv_bias::ConvBiasAlgoChecker<ConvBias>(
  2307. "CONV1x1:AARCH64_INT8X8X32_MK4_4X4X16:24"));
  2308. benchmark_conv1x1_nchw44.set_times(RUNS)
  2309. .set_dtype(0, dtype::Int8())
  2310. .set_dtype(1, dtype::Int8())
  2311. .set_dtype(2, dtype::Int32())
  2312. .set_dtype(4, dtype::Int32())
  2313. .set_display(false);
  2314. Benchmarker<ConvBias> benchmark_conv1x1_nchw(handle());
  2315. benchmark_conv1x1_nchw.set_before_exec_callback(
  2316. conv_bias::ConvBiasAlgoChecker<ConvBias>(
  2317. "CONV1x1:AARCH64_INT8X8X32_K4X4X16:24"));
  2318. benchmark_conv1x1_nchw.set_times(RUNS)
  2319. .set_dtype(0, dtype::Int8())
  2320. .set_dtype(1, dtype::Int8())
  2321. .set_dtype(2, dtype::Int32())
  2322. .set_dtype(4, dtype::Int32())
  2323. .set_display(false);
  2324. for (size_t i = 0; i < conv_bias_1x1_args_nchw44.size(); ++i) {
  2325. auto&& arg_nchw = conv_bias_1x1_args_nchw[i];
  2326. auto&& arg_nchw44 = conv_bias_1x1_args_nchw44[i];
  2327. size_t IC = arg_nchw.src[1];
  2328. size_t OH = arg_nchw.src[2];
  2329. size_t OW = arg_nchw.src[3];
  2330. size_t OC = arg_nchw.filter[0];
  2331. size_t M = OC;
  2332. size_t K = IC;
  2333. size_t N = OH * OW;
  2334. float computations = M * N * K * 2.f / (1024 * 1024 * 1024) * 1e3;
  2335. auto conv1x1_nchw =
  2336. benchmark_conv1x1_nchw.set_param(arg_nchw.param)
  2337. .exec({arg_nchw.src, arg_nchw.filter, arg_nchw.bias, {}, {}}) /
  2338. RUNS;
  2339. auto conv1x1_nchw44 = benchmark_conv1x1_nchw44.set_param(arg_nchw44.param)
  2340. .exec({arg_nchw44.src,
  2341. arg_nchw44.filter,
  2342. arg_nchw44.bias,
  2343. {},
  2344. {}}) /
  2345. RUNS;
  2346. printf("%s %s:\n conv_1x1_nchw: %f ms %f Gflops\nconv1x1_nchw44: %f ms "
  2347. "%f GFlops "
  2348. "speedup: "
  2349. "%f\n",
  2350. arg_nchw.src.to_string().c_str(), arg_nchw.filter.to_string().c_str(),
  2351. conv1x1_nchw, computations / conv1x1_nchw, conv1x1_nchw44,
  2352. computations / conv1x1_nchw44, conv1x1_nchw / conv1x1_nchw44);
  2353. }
  2354. }
  2355. TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_WINOGRAD_VS_IM2COL_INT8) {
  2356. auto&& args = get_winograd_benchmark_args(3, 8);
  2357. using namespace conv_bias;
  2358. constexpr size_t RUN = 10;
  2359. Benchmarker<ConvBias> benchmark_im2col(handle());
  2360. benchmark_im2col.set_display(false);
  2361. benchmark_im2col.set_times(RUN);
  2362. benchmark_im2col.set_dtype(0, dtype::QuantizedS8(2.5f))
  2363. .set_dtype(1, dtype::QuantizedS8(2.5f))
  2364. .set_dtype(2, dtype::QuantizedS32(6.25f))
  2365. .set_dtype(4, dtype::QuantizedS8(60.25f));
  2366. Benchmarker<ConvBias> benchmark_winograd(handle());
  2367. benchmark_winograd.set_display(false);
  2368. benchmark_winograd.set_times(RUN);
  2369. benchmark_winograd.set_dtype(0, dtype::QuantizedS8(2.5f))
  2370. .set_dtype(1, dtype::QuantizedS8(2.5f))
  2371. .set_dtype(2, dtype::QuantizedS32(6.25f))
  2372. .set_dtype(4, dtype::QuantizedS8(60.25f));
  2373. for (auto&& arg : args) {
  2374. TensorLayout dst_layout;
  2375. auto opr = handle()->create_operator<ConvBias>();
  2376. opr->param() = arg.param;
  2377. opr->deduce_layout(
  2378. {arg.src, dtype::Float32()}, {arg.filter, dtype::Float32()},
  2379. {arg.bias, dtype::Float32()}, {}, dst_layout);
  2380. //! dst.nr_elems * IC * FH * FW * 2
  2381. float computations = dst_layout.total_nr_elems() * arg.filter[1] *
  2382. arg.filter[2] * arg.filter[3] * 2.0 /
  2383. (1024 * 1024 * 1024) * 1e3;
  2384. benchmark_im2col.set_param(arg.param);
  2385. auto im2col_used = algo_benchmark<ConvBias>(
  2386. benchmark_im2col, {arg.src, arg.filter, {}, {}, {}},
  2387. "IM2COLMATMUL:AARCH64_INT8X8X32_K4X4X16") /
  2388. RUN;
  2389. benchmark_winograd.set_param(arg.param);
  2390. auto winograd_used =
  2391. algo_benchmark<ConvBias>(
  2392. benchmark_winograd, {arg.src, arg.filter, {}, {}, {}},
  2393. "WINOGRAD:AARCH64_INT16X16X32_MK8_8X8:8:2") /
  2394. RUN;
  2395. printf("%s %s: im2col: %f ms %f Gflops winograd: %f ms %f GFlops "
  2396. "speedup: "
  2397. "%f\n",
  2398. arg.src.to_string().c_str(), arg.filter.to_string().c_str(), im2col_used,
  2399. computations / im2col_used, winograd_used, computations / winograd_used,
  2400. im2col_used / winograd_used);
  2401. }
  2402. }
  2403. #endif
  2404. // vim: syntax=cpp.doxygen