/**
 * \file dnn/test/arm_common/conv_bias.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied.
 */
#include "megdnn/dtype.h"
#include "test/arm_common/fixture.h"
#include "megdnn/opr_param_defs.h"
#include "megdnn/oprs.h"
#include "src/fallback/conv_bias/common.h"
#include "test/common/benchmarker.h"
#include "test/common/checker.h"
#include "test/common/conv_bias.h"
#include "test/common/rng.h"
#include "test/common/tensor.h"
#include "test/common/workspace_wrapper.h"

using namespace megdnn;
using namespace test;
using namespace conv_bias;

//! TODO: this algo currently does not support multithreading
TEST_F(ARM_COMMON, CONVBIAS_INT8_INT8_INT16_STRIDE2F2) {
    checker_conv_bias_int8x8x16(get_conv_bias_args({2}, 2, true, true, true),
                                handle(), "I8816STRD2F2");
}
TEST_F(ARM_COMMON, CONV_BIAS_MATMUL) {
    using namespace conv_bias;
    std::vector<TestArg> args = get_quantized_args();
    Checker<ConvBiasForward> checker(handle());
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBias>("S8MATMUL"));
#if MEGDNN_ARMV7
    checker.set_epsilon(1);
#endif
    UniformIntRNG rng{-50, 50};
    for (auto&& arg : args) {
        if (arg.bias.ndim == 4 && arg.bias[2] != 1 && arg.bias[3] != 1)
            continue;
        checker.set_dtype(0, dtype::QuantizedS8(0.41113496f))
                .set_dtype(1, dtype::QuantizedS8(0.01887994f))
                .set_dtype(2, dtype::QuantizedS32(0.41113496f * 0.01887994f))
                .set_dtype(4, dtype::QuantizedS8(0.49550694f))
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_rng(2, &rng)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, arg.bias, {}, {}});
    }
}
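//! Note on the dtype choices above: for a quantized convolution, the int32
//! accumulator (dtype index 2, the bias tensor) must carry the scale
//! src_scale * filter_scale, which is why it is set to
//! QuantizedS32(0.41113496f * 0.01887994f). A minimal sketch of the relation,
//! using the test's own scales:
//!
//!   float real_acc = (int_src * src_scale) * (int_filter * filter_scale);
//!   // == (int_src * int_filter) * (src_scale * filter_scale)
//!   // so the int32 product naturally has scale src_scale * filter_scale.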
#define CONV_BIAS_MATMUL_QU8_MODE(MODE)                                 \
    using namespace conv_bias;                                          \
    std::vector<TestArg> args = get_quantized_args_with_nlmode(MODE);   \
    Checker<ConvBiasForward> checker(handle());                         \
    checker.set_before_exec_callback(                                   \
            conv_bias::ConvBiasAlgoChecker<ConvBias>("QU8MATMUL"));     \
    UniformIntRNG rng{0, 127};                                          \
    for (auto&& arg : args) {                                           \
        if (arg.bias.ndim == 4 && arg.bias[2] != 1 && arg.bias[3] != 1) \
            continue;                                                   \
        checker.set_dtype(0, dtype::Quantized8Asymm(                    \
                                     2.5f, static_cast<uint8_t>(127)))  \
                .set_dtype(1, dtype::Quantized8Asymm(                   \
                                      2.7f, static_cast<uint8_t>(126))) \
                .set_dtype(2, dtype::QuantizedS32(6.75f))               \
                .set_dtype(4, dtype::Quantized8Asymm(                   \
                                      60.25f,                           \
                                      static_cast<uint8_t>(125)))       \
                .set_rng(0, &rng)                                       \
                .set_rng(1, &rng)                                       \
                .set_rng(2, &rng)                                       \
                .set_param(arg.param)                                   \
                .execs({arg.src, arg.filter, arg.bias, {}, {}});        \
    }

#define MODE_STR(mode) param::ConvBias::NonlineMode::mode

#define CB_TEST(MODE)                                 \
    TEST_F(ARM_COMMON, CONV_BIAS_MATMUL_QU8_##MODE) { \
        CONV_BIAS_MATMUL_QU8_MODE(MODE_STR(MODE));    \
    }

CB_TEST(IDENTITY);
CB_TEST(RELU);
CB_TEST(H_SWISH);

#undef MODE_STR
#undef CB_TEST
#undef CONV_BIAS_MATMUL_QU8_MODE
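//! For reference, CB_TEST(RELU) above expands (roughly) to:
//!
//!   TEST_F(ARM_COMMON, CONV_BIAS_MATMUL_QU8_RELU) {
//!       CONV_BIAS_MATMUL_QU8_MODE(param::ConvBias::NonlineMode::RELU);
//!   }
//!
//! i.e. one checker test per nonlinearity mode, all driving the QU8MATMUL
//! algorithm with the same asymmetric-quantized dtypes.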
#if MEGDNN_WITH_BENCHMARK
static void benchmark_convbias(Handle* handle, bool is_fp32 = false) {
    constexpr size_t RUNS = 30;
    Benchmarker<ConvBias> benchmarker_int(handle);
    benchmarker_int.set_times(RUNS)
            .set_dtype(0, dtype::QuantizedS8(2.5))
            .set_dtype(1, dtype::QuantizedS8(2.5))
            .set_dtype(2, dtype::QuantizedS32(6.25))
            .set_dtype(4, dtype::QuantizedS8(60.25))
            .set_display(false);
    benchmarker_int.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBias>("IM2COLMATMUL:.+"));
    Benchmarker<ConvBias> benchmarker_float(handle);
    benchmarker_float.set_display(false).set_times(RUNS);
    benchmarker_float.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBias>("IM2COLMATMUL:.+"));
    Benchmarker<ConvBias> benchmarker_nchw44(handle);
    if (is_fp32) {
        benchmarker_nchw44.set_times(RUNS)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_dtype(4, dtype::Float32())
                .set_display(false);
    } else {
        benchmarker_nchw44.set_times(RUNS)
                .set_dtype(0, dtype::QuantizedS8(2.5))
                .set_dtype(1, dtype::QuantizedS8(2.5))
                .set_dtype(2, dtype::QuantizedS32(6.25))
                .set_dtype(4, dtype::QuantizedS8(60.25))
                .set_display(false);
    }
    auto nchw44_algo_regex = ".*(DIRECT|NCHW_NCHW44).*";
#if __ARM_FEATURE_DOTPROD
    if (!is_fp32) {
        nchw44_algo_regex = ".*DOT.*";
    }
#endif
    benchmarker_nchw44.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBias>(nchw44_algo_regex));
    auto run = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W,
                   size_t FS, size_t stride, bool input_nchw = false) {
        param::ConvBias param;
        param.nonlineMode = param::ConvBias::NonlineMode::RELU;
        param.stride_h = stride;
        param.stride_w = stride;
        param.pad_h = FS / 2;
        param.pad_w = FS / 2;
        auto OH = (H + 2 * param.pad_h - FS) /
                          static_cast<size_t>(param.stride_h) +
                  1;
        auto OW = (W + 2 * param.pad_w - FS) /
                          static_cast<size_t>(param.stride_w) +
                  1;
        TensorShape src({N, IC, H, W}), filter({OC, IC, FS, FS}),
                bias({1, OC, 1, 1}), dst({N, OC, OH, OW});
        param.format = param::ConvBias::Format::NCHW;
        auto int_used = benchmarker_int.set_param(param).exec(
                                {src, filter, bias, {}, dst}) /
                        RUNS;
        auto float_used = benchmarker_float.set_param(param).exec(
                                  {src, filter, bias, {}, dst}) /
                          RUNS;
        param.format = param::ConvBias::Format::NCHW44;
        src = {N, IC / 4, H, W, 4};
        filter = {OC / 4, IC / 4, FS, FS, 4, 4};
        if (input_nchw) {
            src = {N, IC, H, W};
            filter = {OC / 4, FS, FS, IC, 4};
        }
        bias = {1, OC / 4, 1, 1, 4};
        dst = {N, OC / 4, OH, OW, 4};
        auto int_nchw44_used = benchmarker_nchw44.set_param(param).exec(
                                       {src, filter, bias, {}, dst}) /
                               RUNS;
        float computations = IC * (FS * FS) * dst.total_nr_elems() * 2 * 1e-6;
        printf("run: %s %s %s->%s \n", src.to_string().c_str(),
               filter.to_string().c_str(), bias.to_string().c_str(),
               dst.to_string().c_str());
        printf("float: %f ms %f Gflops, ", float_used,
               computations / float_used);
        printf("int_nchw: %f ms %f Gflops, ", int_used,
               computations / int_used);
        auto speed_up = int_used / int_nchw44_used;
        if (is_fp32) {
            speed_up = float_used / int_nchw44_used;
            printf("fp32_nchw44: %f ms %f Gflops %f speedup, ",
                   int_nchw44_used, computations / int_nchw44_used, speed_up);
        } else {
            printf("int_nchw44: %f ms %f Gflops %f speedup, ",
                   int_nchw44_used, computations / int_nchw44_used, speed_up);
        }
        printf("\n");
    };
    if (is_fp32) {
        run(1, 1, 4, 112, 112, 2, 2, true);
        run(1, 3, 32, 224, 224, 3, 2, true);
        run(1, 3, 64, 224, 224, 7, 2, true);
        run(1, 64, 128, 56, 56, 3, 2, false);
        run(1, 128, 256, 28, 28, 3, 2, false);
        run(1, 256, 512, 14, 14, 3, 2, false);
        run(1, 128, 128, 28, 28, 3, 1, false);
        run(1, 256, 256, 14, 14, 3, 1, false);
        run(1, 512, 512, 7, 7, 3, 1, false);
    } else {
        run(1, 1, 4, 112, 112, 2, 2, true);
        run(1, 3, 32, 224, 224, 3, 2, true);
        run(1, 3, 32, 224, 224, 5, 2, true);
        run(1, 3, 64, 224, 224, 7, 2, true);
        run(1, 1, 4, 112, 112, 2, 1, true);
        run(1, 3, 32, 224, 224, 3, 1, true);
        run(1, 3, 32, 224, 224, 5, 1, true);
        run(1, 3, 64, 224, 224, 7, 1, true);
        for (size_t stride : {1, 2}) {
            printf("stride %zu\n", stride);
            for (size_t filter_size : {2, 3, 5, 7}) {
                for (size_t img_size : {32}) {
                    for (size_t channel : {8, 16, 32, 64, 128, 256}) {
                        run(1, channel, channel, img_size, img_size,
                            filter_size, stride, false);
                    }
                }
            }
        }
    }
}
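//! Unit check for the `computations` expression above, assuming the usual
//! two-FLOPs-per-multiply-accumulate convention:
//!
//!   FLOPs        = 2 * IC * FS * FS * dst.total_nr_elems();
//!   computations = FLOPs * 1e-6;   // mega-FLOPs
//!   computations / used_ms;        // MFLOPs per ms == GFLOPs per second
//!
//! so dividing by a time in milliseconds directly yields the printed Gflops.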
TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_NCHW44) {
    benchmark_convbias(handle(), true);
    benchmark_convbias(handle(), false);
}
TEST_F(ARM_COMMON_MULTI_THREADS, BENCHMARK_CONVBIAS_NCHW44) {
    benchmark_convbias(handle(), true);
    benchmark_convbias(handle(), false);
}
#endif
TEST_F(ARM_COMMON, CONV_BIAS_MATMUL_QS8) {
    using namespace conv_bias;
    std::vector<TestArg> args = get_quantized_args();
    Checker<ConvBiasForward> checker(handle());
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBias>("S8MATMUL"));
#if MEGDNN_ARMV7
    checker.set_epsilon(1);
#endif
    UniformIntRNG rng{0, 255};
    for (auto&& arg : args) {
        if (arg.bias.ndim == 4 && arg.bias[2] != 1 && arg.bias[3] != 1)
            continue;
        checker.set_dtype(0, dtype::QuantizedS8(2.5f))
                .set_dtype(1, dtype::QuantizedS8(2.7f))
                .set_dtype(2, dtype::QuantizedS32(6.75f))
                .set_dtype(4, dtype::QuantizedS8(60.25f))
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_rng(2, &rng)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, arg.bias, {}, {}});
    }
}
#if MEGDNN_ARMV7
TEST_F(ARM_COMMON, CONV_BIAS_RESCALE_OP) {
    using namespace conv_bias;
    Checker<ConvBias> checker(handle());
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("S8MATMUL"));
    checker.set_epsilon(1).set_max_avg_error(1e-2).set_max_avg_biased_error(
            1e-3);
    UniformIntRNG rng{-128, 127};
    checker.set_dtype(0, dtype::QuantizedS8(0.41113496f))
            .set_dtype(1, dtype::QuantizedS8(0.01887994f))
            .set_dtype(2, dtype::QuantizedS32(0.41113496f * 0.01887994f))
            .set_dtype(4, dtype::QuantizedS8(0.49550694f))
            .set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &rng);
    param::ConvBias param;
    param.stride_h = 1;
    param.stride_w = 1;
    param.pad_h = 0;
    param.pad_w = 0;
    param.nonlineMode = NonlineMode::IDENTITY;
    //! Unary op
    checker.set_param(param).exec({TensorShape{2, 1, 128, 128},
                                   TensorShape{16, 1, 2, 2},
                                   TensorShape{},
                                   TensorShape{},
                                   {}});
    //! Binary op
    checker.set_param(param).exec({TensorShape{2, 1, 128, 128},
                                   TensorShape{16, 1, 2, 2},
                                   TensorShape{1, 16, 1, 1},
                                   TensorShape{},
                                   {}});
}
#endif
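//! In the two exec calls above, the third shape is the bias tensor: an empty
//! TensorShape{} exercises the "unary" epilogue (requantize only), while
//! {1, 16, 1, 1} exercises the "binary" epilogue (per-output-channel bias add
//! before requantization). Both are served by the same S8MATMUL algorithm.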
#if MEGDNN_WITH_BENCHMARK
void benchmark_im2col(const char* algo_name, const char* im2col_name,
                      Handle* handle, size_t kernel, size_t pack_size = 1) {
    auto&& args = get_winograd_benchmark_args(kernel, pack_size);
    using namespace conv_bias;
    constexpr size_t RUN = 10;
    Benchmarker<ConvBias> benchmark(handle);
    benchmark.set_display(false);
    benchmark.set_times(RUN);
    Benchmarker<ConvBias> benchmark_im2col(handle);
    benchmark_im2col.set_display(false);
    benchmark_im2col.set_times(RUN);
    for (auto&& arg : args) {
        TensorLayout dst_layout;
        auto opr = handle->create_operator<ConvBias>();
        opr->param() = arg.param;
        opr->deduce_layout({arg.src, dtype::Float32()},
                           {arg.filter, dtype::Float32()},
                           {arg.bias, dtype::Float32()}, {}, dst_layout);
        //! dst.nr_elems * IC * FH * FW * 2
        float computations = dst_layout.total_nr_elems() * arg.filter[1] *
                             arg.filter[2] * arg.filter[3] * 2.0 /
                             (1024 * 1024 * 1024) * 1e3;
        benchmark.set_param(arg.param);
        auto used = algo_benchmark<ConvBias>(benchmark,
                                             {arg.src, arg.filter, {}, {}, {}},
                                             algo_name) /
                    RUN;
        benchmark_im2col.set_param(arg.param);
        auto used_im2col =
                algo_benchmark<ConvBias>(benchmark_im2col,
                                         {arg.src, arg.filter, {}, {}, {}},
                                         im2col_name) /
                RUN;
        printf("%s %s: normal: %f ms %f Gflops im2col: %f ms %f GFlops "
               "speedup: "
               "%f\n",
               arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
               used, computations / used, used_im2col,
               computations / used_im2col, used / used_im2col);
    }
}
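//! The normalization used throughout these im2col/winograd benchmarks differs
//! from benchmark_convbias above:
//!
//!   computations = FLOPs / (1024 * 1024 * 1024) * 1e3;
//!   computations / used_ms;   // == FLOPs / 2^30 per second
//!
//! i.e. it reports binary (1024^3) gigaflops, so numbers from the two
//! benchmark families differ by a constant factor of about 1.074 and are not
//! directly comparable.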
void benchmark_im2col_single_algo(const char* im2col_name, Handle* handle,
                                  size_t kernel, size_t pack_size = 1) {
    std::vector<conv_bias::TestArg> args;
    auto pack = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
                    size_t p) {
        if (ic % pack_size != 0 || oc % pack_size != 0)
            return;
        if (w + 2 * p < kernel || h + 2 * p < kernel)
            return;
        param::ConvBias param;
        param.stride_h = 1;
        param.stride_w = 1;
        param.pad_h = p;
        param.pad_w = p;
        args.push_back(conv_bias::TestArg{param,
                                          TensorShape{1, ic, h, w},
                                          TensorShape{oc, ic, kernel, kernel},
                                          {1, oc, 1, 1}});
    };
    pack(1, 64, 100, 100, kernel, 1);
    pack(8, 64, 100, 100, kernel, 1);
    pack(16, 64, 100, 100, kernel, 1);
    pack(32, 64, 100, 100, kernel, 1);
    pack(64, 64, 100, 100, kernel, 1);
    pack(128, 64, 100, 100, kernel, 1);
    pack(256, 64, 100, 100, kernel, 1);
    pack(512, 64, 100, 100, kernel, 1);
    pack(1024, 64, 100, 100, kernel, 1);
    pack(1, 64, 10, 10, kernel, 1);
    pack(8, 64, 10, 10, kernel, 1);
    pack(16, 64, 10, 10, kernel, 1);
    pack(32, 64, 10, 10, kernel, 1);
    pack(64, 64, 10, 10, kernel, 1);
    pack(128, 64, 10, 10, kernel, 1);
    pack(256, 64, 10, 10, kernel, 1);
    pack(512, 64, 10, 10, kernel, 1);
    pack(1024, 64, 10, 10, kernel, 1);
    pack(1, 16, 10, 10, kernel, 1);
    pack(8, 16, 10, 10, kernel, 1);
    pack(16, 16, 10, 10, kernel, 1);
    pack(32, 16, 10, 10, kernel, 1);
    pack(64, 16, 10, 10, kernel, 1);
    pack(128, 16, 10, 10, kernel, 1);
    pack(256, 16, 10, 10, kernel, 1);
    pack(512, 16, 10, 10, kernel, 1);
    pack(1024, 16, 10, 10, kernel, 1);
    using namespace conv_bias;
    constexpr size_t RUN = 20;
    Benchmarker<ConvBias> benchmark_im2col(handle);
    benchmark_im2col.set_display(false);
    benchmark_im2col.set_times(RUN);
    for (auto&& arg : args) {
        TensorLayout dst_layout;
        auto opr = handle->create_operator<ConvBias>();
        opr->param() = arg.param;
        opr->deduce_layout({arg.src, dtype::Float32()},
                           {arg.filter, dtype::Float32()},
                           {arg.bias, dtype::Float32()}, {}, dst_layout);
        //! dst.nr_elems * IC * FH * FW * 2
        float computations = dst_layout.total_nr_elems() * arg.filter[1] *
                             arg.filter[2] * arg.filter[3] * 2.0 /
                             (1024 * 1024 * 1024) * 1e3;
        benchmark_im2col.set_param(arg.param);
        auto used_im2col =
                algo_benchmark<ConvBias>(benchmark_im2col,
                                         {arg.src, arg.filter, {}, {}, {}},
                                         im2col_name) /
                RUN;
        printf("%s %s: im2col: %f ms %f GFlops \n",
               arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
               used_im2col, computations / used_im2col);
    }
}
void BENCHMARK_IM2COL_NCHW44_VS_NCHW(const char* algo_name,
                                     const char* im2col_name, Handle* handle,
                                     size_t kernel, size_t pack_size = 1) {
    auto&& args = get_winograd_benchmark_args(kernel, pack_size);
    using namespace conv_bias;
    constexpr size_t RUN = 10;
    Benchmarker<ConvBias> benchmark(handle);
    benchmark.set_display(false);
    benchmark.set_times(RUN);
    benchmark.set_dtype(0, dtype::Int8());
    benchmark.set_dtype(1, dtype::Int8());
    benchmark.set_dtype(2, dtype::Int32());
    benchmark.set_dtype(4, dtype::Int32());
    Benchmarker<ConvBias> benchmark_im2col(handle);
    benchmark_im2col.set_display(false);
    benchmark_im2col.set_times(RUN);
    benchmark_im2col.set_dtype(0, dtype::Int8());
    benchmark_im2col.set_dtype(1, dtype::Int8());
    benchmark_im2col.set_dtype(2, dtype::Int32());
    benchmark_im2col.set_dtype(4, dtype::Int32());
    for (auto&& arg : args) {
        TensorLayout dst_layout;
        auto opr = handle->create_operator<ConvBias>();
        opr->param() = arg.param;
        opr->deduce_layout({arg.src, dtype::Float32()},
                           {arg.filter, dtype::Float32()},
                           {arg.bias, dtype::Float32()}, {}, dst_layout);
        //! dst.nr_elems * IC * FH * FW * 2
        float computations = dst_layout.total_nr_elems() * arg.filter[1] *
                             arg.filter[2] * arg.filter[3] * 2.0 /
                             (1024 * 1024 * 1024) * 1e3;
        std::vector<conv_bias::TestArg> nchw44param;
        benchmark.set_param(arg.param);
        auto used = algo_benchmark<ConvBias>(benchmark,
                                             {arg.src, arg.filter, {}, {}, {}},
                                             algo_name) /
                    RUN;
        arg.param.nonlineMode = param::ConvBias::NonlineMode::IDENTITY;
        arg.param.format = param::ConvBias::Format::NCHW44;
        benchmark_im2col.set_param(arg.param);
        nchw44param.push_back(conv_bias::TestArg{
                arg.param,
                TensorShape{arg.src.shape[0], arg.src.shape[1] / 4,
                            arg.src.shape[2], arg.src.shape[3], 4},
                TensorShape{arg.filter.shape[0] / 4, arg.filter.shape[1] / 4,
                            kernel, kernel, 4, 4},
                TensorShape{}});
        auto used_im2col =
                algo_benchmark<ConvBias>(
                        benchmark_im2col,
                        {nchw44param[0].src, nchw44param[0].filter, {}, {}, {}},
                        im2col_name) /
                RUN;
        printf("nchw44 shape src %s filter %s\n",
               nchw44param[0].src.to_string().c_str(),
               nchw44param[0].filter.to_string().c_str());
        printf("%s %s: normal: %f ms %f Gflops im2col: %f ms %f GFlops "
               "speedup: "
               "%f\n",
               arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
               used, computations / used, used_im2col,
               computations / used_im2col, used / used_im2col);
    }
}
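//! NCHW44 packs channels in groups of four, so the shapes above transform as
//! follows (a sketch using the rewrite done when building `nchw44param`):
//!
//!   src    {N, IC, H, W}     -> {N, IC/4, H, W, 4}
//!   filter {OC, IC, FH, FW}  -> {OC/4, IC/4, FH, FW, 4, 4}
//!
//! The trailing 4s are the packed input/output channel sub-blocks that let
//! the MK4 matmul kernels read contiguous vectors.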
TEST_F(ARM_COMMON, BENCHMARK_NCHW_VS_NCHW44_INT8x8x32) {
    printf("=========================compare "
           "IM2COLMATMUL:AARCH64_INT8X8X32_K4X4X16, "
           "IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16 \n");
    BENCHMARK_IM2COL_NCHW44_VS_NCHW("IM2COLMATMUL:AARCH64_INT8X8X32_K4X4X16",
                                    "IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16",
                                    handle(), 3, 4);
}
TEST_F(ARM_COMMON, BENCHMARK_GROUP_CONVBIAS_QUANTIZED) {
    constexpr size_t RUNS = 50;
    param::ConvBias param;
    param.sparse = param::ConvBias::Sparse::GROUP;
    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
    Benchmarker<ConvBias> benchmarker_int(handle());
    benchmarker_int.set_times(RUNS)
            .set_dtype(0, dtype::QuantizedS8(2.5f))
            .set_dtype(1, dtype::QuantizedS8(2.5f))
            .set_dtype(2, dtype::QuantizedS32(6.25f))
            .set_dtype(4, dtype::QuantizedS8(40.25f))
            .set_display(false);
    Benchmarker<ConvBias> benchmarker_float(handle());
    benchmarker_float.set_display(false).set_times(RUNS);
    auto run = [&](size_t N, size_t GROUP, size_t IC, size_t OC, size_t H,
                   size_t W, size_t FS, size_t STRD) {
        megdnn_assert(IC % GROUP == 0 && OC % GROUP == 0);
        TensorShape src({N, IC, H, W}),
                filter({GROUP, OC / GROUP, IC / GROUP, FS, FS}),
                bias({1, OC, 1, 1}), dst({N, OC, H / STRD, W / STRD});
        param.pad_h = FS / 2;
        param.pad_w = FS / 2;
        param.stride_h = STRD;
        param.stride_w = STRD;
        auto int_used = benchmarker_int.set_param(param).exec(
                                {src, filter, bias, {}, dst}) /
                        RUNS;
        auto float_used = benchmarker_float.set_param(param).exec(
                                  {src, filter, bias, {}, dst}) /
                          RUNS;
        float computations = (IC / GROUP * FS * FS * dst.total_nr_elems() * 2 +
                              dst.total_nr_elems()) *
                             1e-6;
        printf("run: %s %s %s->%s \nfloat: %f ms %f Gflops int: %f ms "
               "%f Gflops speedup: %f\n",
               src.to_string().c_str(), filter.to_string().c_str(),
               bias.to_string().c_str(), dst.to_string().c_str(), float_used,
               computations / float_used, int_used, computations / int_used,
               float_used / int_used);
    };
    run(1, 1, 28, 28, 28, 28, 3, 1);
    run(1, 68, 68, 68, 14, 14, 3, 2);
    run(1, 96, 96, 96, 14, 14, 3, 2);
    run(1, 100, 100, 100, 7, 7, 3, 1);
}
#endif
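//! FLOP accounting for the grouped case above: each output element only sees
//! IC / GROUP input channels, and the formula also counts the bias add once
//! per output element:
//!
//!   FLOPs = (2 * (IC / GROUP) * FS * FS + 1) * dst.total_nr_elems();
//!
//! which matches the `computations` expression in `run` (again scaled by
//! 1e-6 so that dividing by milliseconds prints Gflops).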
#if MEGDNN_WITH_BENCHMARK
TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_MATMUL) {
    constexpr size_t RUNS = 10;
    param::ConvBias param;
    param.stride_h = 1;
    param.stride_w = 1;
    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
    Benchmarker<ConvBias> benchmarker(handle()), benchmarker_fused(handle());
    benchmarker.set_times(RUNS)
            .set_dtype(0, dtype::QuantizedS8(2.5f))
            .set_dtype(1, dtype::QuantizedS8(2.5f))
            .set_dtype(2, dtype::QuantizedS32(6.25f))
            .set_dtype(4, dtype::QuantizedS8(40.25f))
            .set_display(false);
    benchmarker_fused.set_times(RUNS)
            .set_dtype(0, dtype::QuantizedS8(2.5f))
            .set_dtype(1, dtype::QuantizedS8(2.5f))
            .set_dtype(2, dtype::QuantizedS32(6.25f))
            .set_dtype(4, dtype::QuantizedS8(40.25f))
            .set_display(false);
    benchmarker_fused.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBias>("S8MATMUL"));
    auto run = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W,
                   size_t FS) {
        TensorShape src({N, IC, H, W}), filter({OC, IC, FS, FS}),
                bias({1, OC, 1, 1}), dst({N, OC, H, W});
        param.pad_h = FS / 2;
        param.pad_w = FS / 2;
        auto default_used = benchmarker.set_param(param).exec(
                                    {src, filter, bias, {}, dst}) /
                            RUNS;
        auto fused_used = benchmarker_fused.set_param(param).exec(
                                  {src, filter, bias, {}, dst}) /
                          RUNS;
        float computations =
                IC * (FS * FS + 1) * dst.total_nr_elems() * 2 * 1e-6;
        printf("run: %s %s %s->%s \ndefault: %f ms %f Gflops fused: %f ms "
               "%f Gflops speedup: %f\n",
               src.to_string().c_str(), filter.to_string().c_str(),
               bias.to_string().c_str(), dst.to_string().c_str(),
               default_used, computations / default_used, fused_used,
               computations / fused_used, default_used / fused_used);
    };
    run(1, 128, 128, 32, 32, 3);
    for (size_t IC : {36, 48}) {
        for (size_t OC : {36, 48, 64}) {
            for (size_t size : {56, 128, 256}) {
                for (size_t FS : {1, 3, 5}) {
                    run(1, IC, OC, size, size, FS);
                }
            }
        }
    }
}
#endif
#if MEGDNN_WITH_BENCHMARK
TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F23) {
#if MEGDNN_AARCH64
    benchmark_winograd("WINOGRAD:AARCH64_F32:1:2", handle(), 3);
#else
    benchmark_winograd("WINOGRAD:ARMV7_F32_:1:2", handle(), 3);
#endif
}
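//! Naming convention in the WINOGRAD algo strings used here, as far as these
//! tests rely on it: "WINOGRAD:<matmul algo>:<pack size>:<output tile m>".
//! With kernel r = 3, ":1:2" selects F(2, 3) (the F23 tests) and ":1:6"
//! selects F(6, 3) (F63); the MK4/MK8 variants use pack sizes 4 and 8.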
TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F23_4x4) {
#if MEGDNN_AARCH64
    benchmark_winograd("WINOGRAD:AARCH64_F32_MK4_4x16:4:2", handle(), 3, 4);
#else
    benchmark_winograd("WINOGRAD:ARMV7_F32_MK4_4x8:4:2", handle(), 3, 4);
#endif
}
TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F63) {
#if MEGDNN_AARCH64
    benchmark_winograd("WINOGRAD:AARCH64_F32K8X12X1:1:6", handle(), 3);
#else
    benchmark_winograd("WINOGRAD:ARMV7_F32:1:6", handle(), 3);
#endif
}
TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F63_4x4) {
#if MEGDNN_AARCH64
    benchmark_winograd("WINOGRAD:AARCH64_F32_MK4_4x16:4:6", handle(), 3, 4);
#else
    benchmark_winograd("WINOGRAD:ARMV7_F32_MK4_4x8:4:6", handle(), 3, 4);
#endif
}
TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F54) {
#if MEGDNN_AARCH64
    benchmark_winograd("WINOGRAD:AARCH64_F32K8X12X1:1:5", handle(), 4);
#else
    benchmark_winograd("WINOGRAD:ARMV7_F32:1:5", handle(), 4);
#endif
}
TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F45) {
#if MEGDNN_AARCH64
    benchmark_winograd("WINOGRAD:AARCH64_F32K8X12X1:1:4", handle(), 5);
#else
    benchmark_winograd("WINOGRAD:ARMV7_F32:1:4", handle(), 5);
#endif
}
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F16_F23) {
#if MEGDNN_AARCH64
    benchmark_winograd_fp16("WINOGRAD:AARCH64_F32_MK4_4x16:4:2",
                            "WINOGRAD:AARCH64_F16_K8X24X1:1:6", handle(), 3,
                            4);
#else
    benchmark_winograd_fp16("WINOGRAD:ARMV7_F32:1:2",
                            "WINOGRAD:AARCH32_F16_K4X16X1:1:2", handle(), 3);
#endif
}
TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F16_F45) {
#if MEGDNN_AARCH64
    benchmark_winograd_fp16("WINOGRAD:AARCH64_F32K8X12X1:1:4",
                            "WINOGRAD:AARCH64_F16_K8X24X1:1:4", handle(), 5);
#else
    benchmark_winograd_fp16("WINOGRAD:ARMV7_F32:1:4",
                            "WINOGRAD:AARCH32_F16_K4X16X1:1:4", handle(), 5);
#endif
}
TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F16_F63) {
#if MEGDNN_AARCH64
    benchmark_winograd_fp16("WINOGRAD:AARCH64_F32K8X12X1:1:6",
                            "WINOGRAD:AARCH64_F16_K8X24X1:1:6", handle(), 3);
#else
    benchmark_winograd_fp16("WINOGRAD:ARMV7_F32:1:6",
                            "WINOGRAD:AARCH32_F16_K4X16X1:1:6", handle(), 3);
#endif
}
TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F16_F23_8x8) {
#if MEGDNN_AARCH64
    benchmark_winograd_fp16("WINOGRAD:AARCH64_F32_MK4_4x16:4:2",
                            "WINOGRAD:AARCH64_F16_MK8_8X8:8:2", handle(), 3,
                            8);
#else
    benchmark_winograd_fp16("WINOGRAD:ARMV7_F32_MK4_4x8:4:2",
                            "WINOGRAD:AARCH32_F16_MK8_4X8:8:2", handle(), 3,
                            8);
#endif
}
#endif
void benchmark_winograd_nchw_vs_nchw44(const char* algo_name, Handle* handle) {
    using namespace conv_bias;
    using NLMode = param::ConvBias::NonlineMode;
    std::vector<conv_bias::TestArg> args_nchw44;
    std::vector<conv_bias::TestArg> args_nchw;
    auto pack = [&](size_t n, size_t oc, size_t ic, size_t h, size_t w,
                    size_t group, NLMode nlmode) {
        param::ConvBias param;
        param.format = param::ConvBias::Format::NCHW44;
        param.stride_h = 1;
        param.stride_w = 1;
        param.pad_h = 1;
        param.pad_w = 1;
        param.nonlineMode = nlmode;
        if (group == 1) {
            param.sparse = param::ConvBias::Sparse::DENSE;
            args_nchw44.emplace_back(param, TensorShape{n, ic / 4, h, w, 4},
                                     TensorShape{oc / 4, ic / 4, 3, 3, 4, 4},
                                     TensorShape{});
            param.format = param::ConvBias::Format::NCHW;
            args_nchw.emplace_back(param, TensorShape{n, ic, h, w},
                                   TensorShape{oc, ic, 3, 3}, TensorShape{});
        } else {
            auto oc_per_group = oc / group;
            auto ic_per_group = ic / group;
            param.sparse = param::ConvBias::Sparse::GROUP;
            args_nchw44.emplace_back(param,
                                     TensorShape{n, ic_per_group / 4, h, w, 4},
                                     TensorShape{group, oc_per_group / 4,
                                                 ic_per_group / 4, 3, 3, 4, 4},
                                     TensorShape{});
            param.format = param::ConvBias::Format::NCHW;
            args_nchw.emplace_back(
                    param, TensorShape{n, ic, h, w},
                    TensorShape{group, oc_per_group, ic_per_group, 3, 3},
                    TensorShape{});
        }
    };
    std::vector<NLMode> nonlinemode = {NLMode::IDENTITY};
    for (auto nlmode : nonlinemode)
        for (size_t n : {1, 2})
            for (size_t group = 1; group <= 2; ++group) {
                pack(n, 512, 512, 15, 15, group, nlmode);
                pack(n, 512, 256, 15, 15, group, nlmode);
                pack(n, 256, 256, 29, 29, group, nlmode);
                pack(n, 256, 128, 29, 29, group, nlmode);
                pack(n, 128, 128, 57, 57, group, nlmode);
                pack(n, 128, 64, 57, 57, group, nlmode);
                pack(n, 24, 24, 224, 224, group, nlmode);
                pack(n, 64, 24, 123, 123, group, nlmode);
                pack(n, 64, 64, 56, 56, group, nlmode);
                pack(n, 128, 128, 28, 28, group, nlmode);
                pack(n, 256, 256, 14, 14, group, nlmode);
                pack(n, 512, 512, 7, 7, group, nlmode);
            }
    using namespace conv_bias;
    constexpr size_t RUN = 10;
    Benchmarker<ConvBias> benchmark_winograd_nchw(handle);
    benchmark_winograd_nchw.set_display(false);
    benchmark_winograd_nchw.set_times(RUN);
    Benchmarker<ConvBias> benchmark_winograd_nchw44(handle);
    benchmark_winograd_nchw44.set_display(false);
    benchmark_winograd_nchw44.set_times(RUN);
    std::string winograd_nchw_algo_name = ssprintf("WINOGRAD:%s", algo_name);
    std::string winograd_nchw44_algo_name =
            ssprintf("WINOGRAD_NCHW44:%s", algo_name);
    for (size_t i = 0; i < args_nchw.size(); ++i) {
        auto arg_nchw = args_nchw[i];
        auto arg_nchw44 = args_nchw44[i];
        TensorLayout dst_layout;
        auto opr = handle->create_operator<ConvBias>();
        opr->param() = arg_nchw.param;
        opr->deduce_layout({arg_nchw.src, dtype::Float32()},
                           {arg_nchw.filter, dtype::Float32()},
                           {arg_nchw.bias, dtype::Float32()}, {}, dst_layout);
        //! dst.nr_elems * IC * FH * FW * 2
        float computations = dst_layout.total_nr_elems() * arg_nchw.filter[1] *
                             arg_nchw.filter[2] * arg_nchw.filter[3] * 2.0 /
                             (1024 * 1024 * 1024) * 1e3;
        benchmark_winograd_nchw.set_param(arg_nchw.param);
        auto nchw_used = algo_benchmark<ConvBias>(
                                 benchmark_winograd_nchw,
                                 {arg_nchw.src, arg_nchw.filter, {}, {}, {}},
                                 winograd_nchw_algo_name.c_str()) /
                         RUN;
        benchmark_winograd_nchw44.set_param(arg_nchw44.param);
        auto nchw44_used =
                algo_benchmark<ConvBias>(
                        benchmark_winograd_nchw44,
                        {arg_nchw44.src, arg_nchw44.filter, {}, {}, {}},
                        winograd_nchw44_algo_name.c_str()) /
                RUN;
        printf("%s %s: nchw: %f ms %f Gflops nchw44: %f ms %f GFlops "
               "speedup: "
               "%f\n",
               arg_nchw.src.to_string().c_str(),
               arg_nchw.filter.to_string().c_str(), nchw_used,
               computations / nchw_used, nchw44_used,
               computations / nchw44_used, nchw_used / nchw44_used);
    }
}
TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F23_MK4_NCHW_VS_NCHW44) {
#if MEGDNN_AARCH64
    benchmark_winograd_nchw_vs_nchw44("AARCH64_F32_MK4_4x16:4:2", handle());
#else
    benchmark_winograd_nchw_vs_nchw44("ARMV7_F32_MK4_4x8:4:2", handle());
#endif
}
TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F63_MK4_NCHW_VS_NCHW44) {
#if MEGDNN_AARCH64
    benchmark_winograd_nchw_vs_nchw44("AARCH64_F32_MK4_4x16:4:6", handle());
#else
    benchmark_winograd_nchw_vs_nchw44("ARMV7_F32_MK4_4x8:4:6", handle());
#endif
}
TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F23_8x8) {
    auto benchmark_winograd_quantized = [](const char* algo_name_fp32,
                                           const char* algo_name_quantized,
                                           Handle* handle, size_t kernel) {
        auto&& args = get_winograd_benchmark_args(kernel);
        using namespace conv_bias;
        constexpr size_t RUN = 10;
        Benchmarker<ConvBias> benchmark(handle);
        benchmark.set_display(false);
        benchmark.set_times(RUN);
        Benchmarker<ConvBias> benchmark_winograd(handle);
        benchmark_winograd.set_display(false).set_times(RUN);
        benchmark_winograd.set_dtype(0, dtype::QuantizedS8(2.5f))
                .set_dtype(1, dtype::QuantizedS8(2.5f))
                .set_dtype(2, dtype::QuantizedS32(6.25f))
                .set_dtype(4, dtype::QuantizedS8(60.25f));
        for (auto&& arg : args) {
            TensorLayout dst_layout;
            auto opr = handle->create_operator<ConvBias>();
            opr->param() = arg.param;
            opr->deduce_layout({arg.src, dtype::Float32()},
                               {arg.filter, dtype::Float32()},
                               {arg.bias, dtype::Float32()}, {}, dst_layout);
            //! dst.nr_elems * IC * FH * FW * 2
            float computations = dst_layout.total_nr_elems() *
                                 arg.filter[1] * arg.filter[2] *
                                 arg.filter[3] * 2.0 /
                                 (1024 * 1024 * 1024) * 1e3;
            benchmark.set_param(arg.param);
            auto used = algo_benchmark<ConvBias>(
                                benchmark, {arg.src, arg.filter, {}, {}, {}},
                                algo_name_fp32) /
                        RUN;
            benchmark_winograd.set_param(arg.param);
            auto used_winograd =
                    algo_benchmark<ConvBias>(benchmark_winograd,
                                             {arg.src, arg.filter, {}, {}, {}},
                                             algo_name_quantized) /
                    RUN;
            printf("%s %s: normal: %f ms %f Gflops winograd: %f ms %f GFlops "
                   "speedup: "
                   "%f\n",
                   arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
                   used, computations / used, used_winograd,
                   computations / used_winograd, used / used_winograd);
        }
    };
#if MEGDNN_AARCH64
    benchmark_winograd_quantized("WINOGRAD:AARCH64_F32_MK4_4x16:4:2",
                                 "WINOGRAD:AARCH64_INT16X16X32_MK8_8X8:8:2",
                                 handle(), 3);
#else
    benchmark_winograd_quantized("WINOGRAD:ARMV7_F32_MK4_4x8:4:2",
                                 "WINOGRAD:ARMV7_INT16X16X32_MK8_4X8:8:2",
                                 handle(), 3);
#endif
}
TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_INT8_STRIDE1) {
    // have to remove the preferred restriction in the usable func before
    // running the benchmark
    using namespace conv_bias;
    std::vector<TestArg> args;
    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
                   size_t p, NonlineMode nonline_mode) {
        if (w + 2 * p < kernel || h + 2 * p < kernel)
            return;
        param::ConvBias param;
        param.stride_h = 1;
        param.stride_w = 1;
        param.pad_h = p;
        param.pad_w = p;
        param.nonlineMode = nonline_mode;
        //! channel bias
        args.emplace_back(param, TensorShape{2, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel},
                          TensorShape{1, oc, 1, 1});
    };
    for (size_t kernel : {2, 3, 5, 7})
        for (size_t ic : {1, 8, 16, 32})
            for (size_t oc : {1, 8, 16, 32})
                for (size_t p : {1})
                    for (NonlineMode nonline_mode : {NonlineMode::RELU}) {
                        run(oc, ic, 56, 56, kernel, p, nonline_mode);
                        run(oc, ic, 128, 128, kernel, p, nonline_mode);
                        run(oc, ic, 256, 256, kernel, p, nonline_mode);
                    }
    constexpr size_t RUN = 50;
    Benchmarker<ConvBias> benchmark0(handle());
    benchmark0.set_dtype(0, dtype::QuantizedS8(2.5f))
            .set_dtype(1, dtype::QuantizedS8(2.5f))
            .set_dtype(2, dtype::QuantizedS32(6.25f))
            .set_dtype(4, dtype::QuantizedS8(60.25f));
    benchmark0.set_display(false);
    benchmark0.set_times(RUN);
    benchmark0.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("S8STRD1"));
    Benchmarker<ConvBias> benchmark1(handle());
    benchmark1.set_dtype(0, dtype::QuantizedS8(2.5f))
            .set_dtype(1, dtype::QuantizedS8(2.5f))
            .set_dtype(2, dtype::QuantizedS32(6.25f))
            .set_dtype(4, dtype::QuantizedS8(60.25f));
    benchmark1.set_display(false);
    benchmark1.set_times(RUN);
    for (auto&& arg : args) {
        TensorLayout dst_layout;
        auto opr = handle()->create_operator<ConvBias>();
        opr->param() = arg.param;
        opr->deduce_layout({arg.src, dtype::Int8()},
                           {arg.filter, dtype::Int8()},
                           {arg.bias, dtype::Int32()}, {}, dst_layout);
        //! dst.nr_elems * IC * FH * FW * 2
        float computations = dst_layout.total_nr_elems() * arg.filter[1] *
                             arg.filter[2] * arg.filter[3] * 2.0 /
                             (1024 * 1024 * 1024) * 1e3;
        auto used0 = benchmark0.set_param(arg.param).exec(
                             {arg.src, arg.filter, arg.bias, {}, {}}) /
                     RUN;
        auto used1 = benchmark1.set_param(arg.param).exec(
                             {arg.src, arg.filter, arg.bias, {}, {}}) /
                     RUN;
        printf("%s %s: conv_bias: %f ms %f Gflops conv_elem: %f ms %f GFlops "
               "speedup: %f\n",
               arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
               used0, computations / used0, used1, computations / used1,
               used1 / used0);
    }
}
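//! In the STRIDE1/STRIDE2 comparisons here, benchmark0 pins the direct
//! algorithm via the algo checker (e.g. "S8STRD1"), while benchmark1 sets no
//! callback and therefore runs whatever the heuristic picks; the printed
//! speedup is used1 / used0, i.e. the direct algorithm's gain over the
//! default choice.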
TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_INT8_STRIDE2) {
    // have to remove the preferred restriction in the usable func before
    // running the benchmark
    using namespace conv_bias;
    std::vector<TestArg> args;
    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
                   size_t p, NonlineMode nonline_mode) {
        if (w + 2 * p < kernel || h + 2 * p < kernel)
            return;
        param::ConvBias param;
        param.stride_h = 2;
        param.stride_w = 2;
        param.pad_h = p;
        param.pad_w = p;
        param.nonlineMode = nonline_mode;
        //! channel bias
        args.emplace_back(param, TensorShape{2, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel},
                          TensorShape{1, oc, 1, 1});
    };
    for (size_t kernel : {2, 3, 5, 7})
        for (size_t ic : {1, 8, 16, 32})
            for (size_t oc : {1, 8, 16, 32})
                for (size_t p : {1})
                    for (NonlineMode nonline_mode : {NonlineMode::RELU}) {
                        run(oc, ic, 56, 56, kernel, p, nonline_mode);
                        run(oc, ic, 128, 128, kernel, p, nonline_mode);
                        run(oc, ic, 256, 256, kernel, p, nonline_mode);
                    }
    constexpr size_t RUN = 50;
    Benchmarker<ConvBias> benchmark0(handle());
    benchmark0.set_dtype(0, dtype::QuantizedS8(2.5f))
            .set_dtype(1, dtype::QuantizedS8(2.5f))
            .set_dtype(2, dtype::QuantizedS32(6.25f))
            .set_dtype(4, dtype::QuantizedS8(60.25f));
    benchmark0.set_display(false);
    benchmark0.set_times(RUN);
    benchmark0.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("S8STRD2"));
    Benchmarker<ConvBias> benchmark1(handle());
    benchmark1.set_dtype(0, dtype::QuantizedS8(2.5f))
            .set_dtype(1, dtype::QuantizedS8(2.5f))
            .set_dtype(2, dtype::QuantizedS32(6.25f))
            .set_dtype(4, dtype::QuantizedS8(60.25f));
    benchmark1.set_display(false);
    benchmark1.set_times(RUN);
    for (auto&& arg : args) {
        TensorLayout dst_layout;
        auto opr = handle()->create_operator<ConvBias>();
        opr->param() = arg.param;
        opr->deduce_layout({arg.src, dtype::Int8()},
                           {arg.filter, dtype::Int8()},
                           {arg.bias, dtype::Int32()}, {}, dst_layout);
        //! dst.nr_elems * IC * FH * FW * 2
        float computations = dst_layout.total_nr_elems() * arg.filter[1] *
                             arg.filter[2] * arg.filter[3] * 2.0 /
                             (1024 * 1024 * 1024) * 1e3;
        auto used0 = benchmark0.set_param(arg.param).exec(
                             {arg.src, arg.filter, arg.bias, {}, {}}) /
                     RUN;
        auto used1 = benchmark1.set_param(arg.param).exec(
                             {arg.src, arg.filter, arg.bias, {}, {}}) /
                     RUN;
        printf("%s %s: conv_bias: %f ms %f Gflops conv_elem: %f ms %f GFlops "
               "speedup: %f\n",
               arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
               used0, computations / used0, used1, computations / used1,
               used1 / used0);
    }
}
TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_QUINT8_STRIDE1) {
    // have to remove the preferred restriction in the usable func before
    // running the benchmark
    using namespace conv_bias;
    std::vector<TestArg> args;
    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
                   size_t p, NonlineMode nonline_mode) {
        if (w + 2 * p < kernel || h + 2 * p < kernel)
            return;
        param::ConvBias param;
        param.stride_h = 1;
        param.stride_w = 1;
        param.pad_h = p;
        param.pad_w = p;
        param.nonlineMode = nonline_mode;
        //! channel bias
        args.emplace_back(param, TensorShape{2, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel},
                          TensorShape{1, oc, 1, 1});
    };
    for (size_t kernel : {2, 3, 5, 7})
        for (size_t ic : {1, 8, 16, 32})
            for (size_t oc : {1, 8, 16, 32})
                for (size_t p : {1})
                    for (NonlineMode nonline_mode : {NonlineMode::RELU}) {
                        run(oc, ic, 56, 56, kernel, p, nonline_mode);
                        run(oc, ic, 128, 128, kernel, p, nonline_mode);
                        run(oc, ic, 256, 256, kernel, p, nonline_mode);
                    }
    constexpr size_t RUN = 50;
    Benchmarker<ConvBias> benchmark0(handle());
    benchmark0
            .set_dtype(0,
                       dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(100)))
            .set_dtype(1,
                       dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(120)))
            .set_dtype(2, dtype::QuantizedS32(0.04f))
            .set_dtype(4,
                       dtype::Quantized8Asymm(1.4f, static_cast<uint8_t>(110)));
    benchmark0.set_display(false);
    benchmark0.set_times(RUN);
    benchmark0.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("QU8STRD1"));
    Benchmarker<ConvBias> benchmark1(handle());
    benchmark1
            .set_dtype(0,
                       dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(100)))
            .set_dtype(1,
                       dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(120)))
            .set_dtype(2, dtype::QuantizedS32(0.04f))
            .set_dtype(4,
                       dtype::Quantized8Asymm(1.4f, static_cast<uint8_t>(110)));
    benchmark1.set_display(false);
    benchmark1.set_times(RUN);
    for (auto&& arg : args) {
        TensorLayout dst_layout;
        auto opr = handle()->create_operator<ConvBias>();
        opr->param() = arg.param;
        opr->deduce_layout({arg.src, dtype::Int8()},
                           {arg.filter, dtype::Int8()},
                           {arg.bias, dtype::Int32()}, {}, dst_layout);
        //! dst.nr_elems * IC * FH * FW * 2
        float computations = dst_layout.total_nr_elems() * arg.filter[1] *
                             arg.filter[2] * arg.filter[3] * 2.0 /
                             (1024 * 1024 * 1024) * 1e3;
        auto used0 = benchmark0.set_param(arg.param).exec(
                             {arg.src, arg.filter, arg.bias, {}, {}}) /
                     RUN;
        auto used1 = benchmark1.set_param(arg.param).exec(
                             {arg.src, arg.filter, arg.bias, {}, {}}) /
                     RUN;
        printf("%s %s: conv_bias: %f ms %f Gflops conv_elem: %f ms %f GFlops "
               "speedup: %f\n",
               arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
               used0, computations / used0, used1, computations / used1,
               used1 / used0);
    }
}
TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_QUINT8_STRIDE2) {
    // have to remove the preferred restriction in the usable func before
    // running the benchmark
    using namespace conv_bias;
    std::vector<TestArg> args;
    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
                   size_t p, NonlineMode nonline_mode) {
        if (w + 2 * p < kernel || h + 2 * p < kernel)
            return;
        param::ConvBias param;
        param.stride_h = 2;
        param.stride_w = 2;
        param.pad_h = p;
        param.pad_w = p;
        param.nonlineMode = nonline_mode;
        //! channel bias
        args.emplace_back(param, TensorShape{2, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel},
                          TensorShape{1, oc, 1, 1});
    };
    for (size_t kernel : {2, 3, 5, 7})
        for (size_t ic : {1, 8, 16, 32})
            for (size_t oc : {1, 8, 16, 32})
                for (size_t p : {1})
                    for (NonlineMode nonline_mode : {NonlineMode::RELU}) {
                        run(oc, ic, 56, 56, kernel, p, nonline_mode);
                        run(oc, ic, 128, 128, kernel, p, nonline_mode);
                        run(oc, ic, 256, 256, kernel, p, nonline_mode);
                    }
    constexpr size_t RUN = 50;
    Benchmarker<ConvBias> benchmark0(handle());
    benchmark0
            .set_dtype(0,
                       dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(100)))
            .set_dtype(1,
                       dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(120)))
            .set_dtype(2, dtype::QuantizedS32(0.04f))
            .set_dtype(4,
                       dtype::Quantized8Asymm(1.4f, static_cast<uint8_t>(110)));
    benchmark0.set_display(false);
    benchmark0.set_times(RUN);
    benchmark0.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("QU8STRD2"));
    Benchmarker<ConvBias> benchmark1(handle());
    benchmark1
            .set_dtype(0,
                       dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(100)))
            .set_dtype(1,
                       dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(120)))
            .set_dtype(2, dtype::QuantizedS32(0.04f))
            .set_dtype(4,
                       dtype::Quantized8Asymm(1.4f, static_cast<uint8_t>(110)));
    benchmark1.set_display(false);
    benchmark1.set_times(RUN);
    for (auto&& arg : args) {
        TensorLayout dst_layout;
        auto opr = handle()->create_operator<ConvBias>();
        opr->param() = arg.param;
        opr->deduce_layout({arg.src, dtype::Int8()},
                           {arg.filter, dtype::Int8()},
                           {arg.bias, dtype::Int32()}, {}, dst_layout);
        //! dst.nr_elems * IC * FH * FW * 2
        float computations = dst_layout.total_nr_elems() * arg.filter[1] *
                             arg.filter[2] * arg.filter[3] * 2.0 /
                             (1024 * 1024 * 1024) * 1e3;
        auto used0 = benchmark0.set_param(arg.param).exec(
                             {arg.src, arg.filter, arg.bias, {}, {}}) /
                     RUN;
        auto used1 = benchmark1.set_param(arg.param).exec(
                             {arg.src, arg.filter, arg.bias, {}, {}}) /
                     RUN;
        printf("%s %s: conv_bias: %f ms %f Gflops conv_elem: %f ms %f GFlops "
               "speedup: %f\n",
               arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
               used0, computations / used0, used1, computations / used1,
               used1 / used0);
    }
}
TEST_F(ARM_COMMON, BENCHMARK_CHANNEL_WISE_F32_STRIDE1_NCHW44) {
    // have to remove the preferred restriction in the usable func before
    // running the benchmark
    using namespace conv_bias;
    param::ConvBias param;
    param.stride_h = 1;
    param.stride_w = 1;
    param.pad_h = 1;
    param.pad_w = 1;
    param.nonlineMode = NonlineMode::RELU;
    param.sparse = param::ConvBias::Sparse::GROUP;
    constexpr size_t RUN = 50;
    Benchmarker<ConvBias> benchmark0(handle());
    benchmark0.set_display(false);
    benchmark0.set_param(param);
    benchmark0.set_times(RUN);
    benchmark0.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "F32STRD1_LARGE_GROUP"));
    auto opr = handle()->create_operator<ConvBias>();
    opr->param() = param;
    param.format = param::ConvBias::Format::NCHW44;
    Benchmarker<ConvBias> benchmark1(handle());
    benchmark1.set_display(false);
    benchmark1.set_param(param);
    benchmark1.set_times(RUN);
    benchmark1.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "F32_CHANNEL_WISE_NCHW44"));
    auto run = [&](size_t group, size_t w, size_t h, size_t kernel) {
        TensorLayout dst_layout;
        opr->deduce_layout({{1, group * 4, h, w}, dtype::Int8()},
                           {{group * 4, 1, 1, kernel, kernel}, dtype::Int8()},
                           {{1, group * 4, 1, 1}, dtype::Int32()}, {},
                           dst_layout);
        //! dst.nr_elems * IC * FH * FW * 2
        float computations = dst_layout.total_nr_elems() * kernel * kernel *
                             2.0 / (1024 * 1024 * 1024) * 1e3;
        auto used0 = benchmark0.exec({{1, group * 4, h, w},
                                      {group * 4, 1, 1, kernel, kernel},
                                      {1, group * 4, 1, 1},
                                      {},
                                      {}}) /
                     RUN;
        auto used1 = benchmark1.exec({{1, group, h, w, 4},
                                      {group, 1, 1, kernel, kernel, 4},
                                      {1, group, 1, 1, 4},
                                      {},
                                      {}}) /
                     RUN;
        printf("group/h/w/kernel:%zu,%zu,%zu,%zu: nchw: %f ms %f Gflops "
               "nchw44: "
               "%f ms %f GFlops "
               "speedup: %f\n",
               group, h, w, kernel, used0, computations / used0, used1,
               computations / used1, used0 / used1);
    };
    for (size_t group : {8, 16, 32, 64}) {
        for (size_t kernel : {2, 3, 5}) {
            run(group, 112, 112, kernel);
            run(group, 56, 56, kernel);
            run(group, 48, 48, kernel);
            run(group, 28, 28, kernel);
            run(group, 14, 14, kernel);
        }
    }
    run(8, 112, 112, 3);
    run(32, 56, 56, 3);
    run(64, 28, 28, 3);
    run(128, 14, 14, 3);
}
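//! Shape mapping for the channel-wise (depthwise) comparison above: with
//! GROUP sparse mode and one channel per group, NCHW uses
//! src {1, G*4, H, W} with filter {G*4, 1, 1, K, K}, while NCHW44 packs four
//! groups per block: src {1, G, H, W, 4} with filter {G, 1, 1, K, K, 4}.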

TEST_F(ARM_COMMON, BENCHMARK_CHANNEL_WISE_F32_STRIDE2_NCHW44) {
    // have to remove the "preferred" restriction in the usable() function
    // before running the benchmark
    using namespace conv_bias;
    param::ConvBias param;
    param.stride_h = 2;
    param.stride_w = 2;
    param.pad_h = 1;
    param.pad_w = 1;
    param.nonlineMode = NonlineMode::RELU;
    param.sparse = param::ConvBias::Sparse::GROUP;
    constexpr size_t RUN = 50;

    Benchmarker<ConvBias> benchmark0(handle());
    benchmark0.set_display(false);
    benchmark0.set_param(param);
    benchmark0.set_times(RUN);
    benchmark0.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "F32STRD2_LARGE_GROUP"));

    auto opr = handle()->create_operator<ConvBias>();
    opr->param() = param;

    param.format = param::ConvBias::Format::NCHW44;
    Benchmarker<ConvBias> benchmark1(handle());
    benchmark1.set_display(false);
    benchmark1.set_param(param);
    benchmark1.set_times(RUN);
    benchmark1.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "F32_CHANNEL_WISE_NCHW44"));

    auto run = [&](size_t group, size_t w, size_t h, size_t kernel) {
        TensorLayout dst_layout;
        opr->deduce_layout({{1, group * 4, h, w}, dtype::Int8()},
                           {{group * 4, 1, 1, kernel, kernel}, dtype::Int8()},
                           {{1, group * 4, 1, 1}, dtype::Int32()}, {},
                           dst_layout);
        //! dst.nr_elems * IC * FH * FW * 2
        float computations = dst_layout.total_nr_elems() * kernel * kernel *
                             2.0 / (1024 * 1024 * 1024) * 1e3;

        auto used0 = benchmark0.exec({{1, group * 4, h, w},
                                      {group * 4, 1, 1, kernel, kernel},
                                      {1, group * 4, 1, 1},
                                      {},
                                      {}}) /
                     RUN;
        auto used1 = benchmark1.exec({{1, group, h, w, 4},
                                      {group, 1, 1, kernel, kernel, 4},
                                      {1, group, 1, 1, 4},
                                      {},
                                      {}}) /
                     RUN;
        printf("group/h/w/kernel:%zu,%zu,%zu,%zu: nchw: %f ms %f Gflops "
               "nchw44: %f ms %f GFlops speedup: %f\n",
               group, h, w, kernel, used0, computations / used0, used1,
               computations / used1, used0 / used1);
    };
    for (size_t group : {8, 16, 32, 64}) {
        for (size_t kernel : {2, 3, 5}) {
            run(group, 112, 112, kernel);
            run(group, 56, 56, kernel);
            run(group, 48, 48, kernel);
            run(group, 28, 28, kernel);
            run(group, 14, 14, kernel);
        }
    }
    run(8, 112, 112, 3);
    run(32, 56, 56, 3);
    run(64, 28, 28, 3);
    run(128, 14, 14, 3);
}
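
//! Quantized flavour of the channel-wise comparison above. The dtype scales
//! follow the usual requantization rule: the QuantizedS32 bias/accumulator
//! scale is the product of the src and filter scales (0.2f * 0.2f = 0.04f).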
TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_QINT8_STRIDE1_NCHW44) {
    // have to remove the "preferred" restriction in the usable() function
    // before running the benchmark
    using namespace conv_bias;
    param::ConvBias param;
    param.stride_h = 1;
    param.stride_w = 1;
    param.pad_h = 1;
    param.pad_w = 1;
    param.nonlineMode = NonlineMode::RELU;
    param.sparse = param::ConvBias::Sparse::GROUP;
    constexpr size_t RUN = 50;

    Benchmarker<ConvBias> benchmark0(handle());
    benchmark0.set_dtype(0, dtype::QuantizedS8(0.2f))
            .set_dtype(1, dtype::QuantizedS8(0.2f))
            .set_dtype(2, dtype::QuantizedS32(0.04f))
            .set_dtype(4, dtype::QuantizedS8(1.4f));
    benchmark0.set_display(false);
    benchmark0.set_param(param);
    benchmark0.set_times(RUN);
    benchmark0.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "S8STRD1_LARGE_GROUP"));

    auto opr = handle()->create_operator<ConvBias>();
    opr->param() = param;

    param.format = param::ConvBias::Format::NCHW44;
    Benchmarker<ConvBias> benchmark1(handle());
    benchmark1.set_dtype(0, dtype::QuantizedS8(0.2f))
            .set_dtype(1, dtype::QuantizedS8(0.2f))
            .set_dtype(2, dtype::QuantizedS32(0.04f))
            .set_dtype(4, dtype::QuantizedS8(1.4f));
    benchmark1.set_display(false);
    benchmark1.set_param(param);
    benchmark1.set_times(RUN);
    benchmark1.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "S8_CHAN_WISE_STRD1_NCHW44"));

    auto run = [&](size_t group, size_t w, size_t h, size_t kernel) {
        TensorLayout dst_layout;
        opr->deduce_layout({{1, group * 4, h, w}, dtype::Int8()},
                           {{group * 4, 1, 1, kernel, kernel}, dtype::Int8()},
                           {{1, group * 4, 1, 1}, dtype::Int32()}, {},
                           dst_layout);
        //! dst.nr_elems * IC * FH * FW * 2
        float computations = dst_layout.total_nr_elems() * kernel * kernel *
                             2.0 / (1024 * 1024 * 1024) * 1e3;

        auto used0 = benchmark0.exec({{1, group * 4, h, w},
                                      {group * 4, 1, 1, kernel, kernel},
                                      {1, group * 4, 1, 1},
                                      {},
                                      {}}) /
                     RUN;
        auto used1 = benchmark1.exec({{1, group, h, w, 4},
                                      {group, 1, 1, kernel, kernel, 4},
                                      {1, group, 1, 1, 4},
                                      {},
                                      {}}) /
                     RUN;
        printf("group/h/w/kernel:%zu,%zu,%zu,%zu: nchw: %f ms %f Gflops "
               "nchw44: %f ms %f GFlops speedup: %f\n",
               group, h, w, kernel, used0, computations / used0, used1,
               computations / used1, used0 / used1);
    };
    for (size_t group : {8, 16, 32, 64, 128}) {
        for (size_t kernel : {2, 3, 5}) {
            run(group, 112, 112, kernel);
            run(group, 56, 56, kernel);
            run(group, 48, 48, kernel);
            run(group, 28, 28, kernel);
            run(group, 14, 14, kernel);
        }
    }
}
#endif

#if __ARM_FEATURE_DOTPROD
#if MEGDNN_WITH_BENCHMARK
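
//! In the dot-product benchmarks below, benchmark0 is pinned to an ARMDOT*
//! kernel through ConvBiasAlgoChecker, while benchmark1 sets no checker and
//! lets the dispatcher pick its default algorithm. The printed speedup is
//! used1 / used0 (default time over dot-product time), so values above 1
//! mean the dot-product kernel is faster.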
TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_INT8_STRIDE1_WITHDOTPROD) {
    // have to remove the "preferred" restriction in the usable() function
    // before running the benchmark
    using namespace conv_bias;
    std::vector<TestArg> args;
    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
                   size_t p, NonlineMode nonline_mode) {
        if (w + 2 * p < kernel || h + 2 * p < kernel)
            return;
        param::ConvBias param;
        param.stride_h = 1;
        param.stride_w = 1;
        param.pad_h = p;
        param.pad_w = p;
        param.nonlineMode = nonline_mode;
        //! channel bias
        args.emplace_back(param, TensorShape{2, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel},
                          TensorShape{1, oc, 1, 1});
    };
    for (size_t kernel : {2, 3, 5, 7})
        for (size_t ic : {1, 8, 16, 32})
            for (size_t oc : {1, 8, 16, 32})
                for (size_t p : {1})
                    for (NonlineMode nonline_mode : {NonlineMode::RELU}) {
                        run(oc, ic, 56, 56, kernel, p, nonline_mode);
                        run(oc, ic, 128, 128, kernel, p, nonline_mode);
                        run(oc, ic, 256, 256, kernel, p, nonline_mode);
                    }

    constexpr size_t RUN = 50;
    Benchmarker<ConvBias> benchmark0(handle());
    benchmark0.set_dtype(0, dtype::QuantizedS8(2.5f))
            .set_dtype(1, dtype::QuantizedS8(2.5f))
            .set_dtype(2, dtype::QuantizedS32(6.25f))
            .set_dtype(4, dtype::QuantizedS8(60.25f));
    benchmark0.set_display(false);
    benchmark0.set_times(RUN);
    benchmark0.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("ARMDOTS8STRD1"));

    Benchmarker<ConvBias> benchmark1(handle());
    benchmark1.set_dtype(0, dtype::QuantizedS8(2.5f))
            .set_dtype(1, dtype::QuantizedS8(2.5f))
            .set_dtype(2, dtype::QuantizedS32(6.25f))
            .set_dtype(4, dtype::QuantizedS8(60.25f));
    benchmark1.set_display(false);
    benchmark1.set_times(RUN);

    for (auto&& arg : args) {
        TensorLayout dst_layout;
        auto opr = handle()->create_operator<ConvBias>();
        opr->param() = arg.param;
        opr->deduce_layout({arg.src, dtype::Int8()},
                           {arg.filter, dtype::Int8()},
                           {arg.bias, dtype::Int32()}, {}, dst_layout);
        //! dst.nr_elems * IC * FH * FW * 2
        float computations = dst_layout.total_nr_elems() * arg.filter[1] *
                             arg.filter[2] * arg.filter[3] * 2.0 /
                             (1024 * 1024 * 1024) * 1e3;

        auto used0 = benchmark0.set_param(arg.param).exec(
                             {arg.src, arg.filter, arg.bias, {}, {}}) /
                     RUN;
        auto used1 = benchmark1.set_param(arg.param).exec(
                             {arg.src, arg.filter, arg.bias, {}, {}}) /
                     RUN;
        printf("%s %s: conv_bias: %f ms %f Gflops conv_elem: %f ms %f GFlops "
               "speedup: %f\n",
               arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
               used0, computations / used0, used1, computations / used1,
               used1 / used0);
    }
}

TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_INT8_STRIDE2_WITHDOTPROD) {
    // have to remove the "preferred" restriction in the usable() function
    // before running the benchmark
    using namespace conv_bias;
    std::vector<TestArg> args;
    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
                   size_t p, NonlineMode nonline_mode) {
        if (w + 2 * p < kernel || h + 2 * p < kernel)
            return;
        param::ConvBias param;
        param.stride_h = 2;
        param.stride_w = 2;
        param.pad_h = p;
        param.pad_w = p;
        param.nonlineMode = nonline_mode;
        //! channel bias
        args.emplace_back(param, TensorShape{2, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel},
                          TensorShape{1, oc, 1, 1});
    };
    for (size_t kernel : {2, 3, 5, 7})
        for (size_t ic : {1, 8, 16, 32})
            for (size_t oc : {1, 8, 16, 32})
                for (size_t p : {1})
                    for (NonlineMode nonline_mode : {NonlineMode::RELU}) {
                        run(oc, ic, 56, 56, kernel, p, nonline_mode);
                        run(oc, ic, 128, 128, kernel, p, nonline_mode);
                        run(oc, ic, 256, 256, kernel, p, nonline_mode);
                    }

    constexpr size_t RUN = 50;
    Benchmarker<ConvBias> benchmark0(handle());
    benchmark0.set_dtype(0, dtype::QuantizedS8(2.5f))
            .set_dtype(1, dtype::QuantizedS8(2.5f))
            .set_dtype(2, dtype::QuantizedS32(6.25f))
            .set_dtype(4, dtype::QuantizedS8(60.25f));
    benchmark0.set_display(false);
    benchmark0.set_times(RUN);
    benchmark0.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("ARMDOTS8STRD2"));

    Benchmarker<ConvBias> benchmark1(handle());
    benchmark1.set_dtype(0, dtype::QuantizedS8(2.5f))
            .set_dtype(1, dtype::QuantizedS8(2.5f))
            .set_dtype(2, dtype::QuantizedS32(6.25f))
            .set_dtype(4, dtype::QuantizedS8(60.25f));
    benchmark1.set_display(false);
    benchmark1.set_times(RUN);

    for (auto&& arg : args) {
        TensorLayout dst_layout;
        auto opr = handle()->create_operator<ConvBias>();
        opr->param() = arg.param;
        opr->deduce_layout({arg.src, dtype::Int8()},
                           {arg.filter, dtype::Int8()},
                           {arg.bias, dtype::Int32()}, {}, dst_layout);
        //! dst.nr_elems * IC * FH * FW * 2
        float computations = dst_layout.total_nr_elems() * arg.filter[1] *
                             arg.filter[2] * arg.filter[3] * 2.0 /
                             (1024 * 1024 * 1024) * 1e3;

        auto used0 = benchmark0.set_param(arg.param).exec(
                             {arg.src, arg.filter, arg.bias, {}, {}}) /
                     RUN;
        auto used1 = benchmark1.set_param(arg.param).exec(
                             {arg.src, arg.filter, arg.bias, {}, {}}) /
                     RUN;
        printf("%s %s: conv_bias: %f ms %f Gflops conv_elem: %f ms %f GFlops "
               "speedup: %f\n",
               arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
               used0, computations / used0, used1, computations / used1,
               used1 / used0);
    }
}

TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_QUINT8_STRIDE1_WITHDOTPROD) {
    // have to remove the "preferred" restriction in the usable() function
    // before running the benchmark
    using namespace conv_bias;
    std::vector<TestArg> args;
    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
                   size_t p, NonlineMode nonline_mode) {
        if (w + 2 * p < kernel || h + 2 * p < kernel)
            return;
        param::ConvBias param;
        param.stride_h = 1;
        param.stride_w = 1;
        param.pad_h = p;
        param.pad_w = p;
        param.nonlineMode = nonline_mode;
        //! channel bias
        args.emplace_back(param, TensorShape{2, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel},
                          TensorShape{1, oc, 1, 1});
    };
    // clang-format off
    for (size_t kernel : {2, 3, 5, 7})
    for (size_t ic : {1, 8, 16, 32})
    for (size_t oc : {1, 8, 16, 32})
    for (size_t p : {1})
    for (NonlineMode nonline_mode : {NonlineMode::RELU}) {
        run(oc, ic, 56, 56, kernel, p, nonline_mode);
        run(oc, ic, 128, 128, kernel, p, nonline_mode);
        run(oc, ic, 256, 256, kernel, p, nonline_mode);
    }
    // clang-format on

    constexpr size_t RUN = 50;
    Benchmarker<ConvBias> benchmark0(handle());
    benchmark0
            .set_dtype(0,
                       dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(100)))
            .set_dtype(1,
                       dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(120)))
            .set_dtype(2, dtype::QuantizedS32(0.04f))
            .set_dtype(4,
                       dtype::Quantized8Asymm(1.4f, static_cast<uint8_t>(110)));
    benchmark0.set_display(false);
    benchmark0.set_times(RUN);
    benchmark0.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("ARMDOTU8STRD1"));

    Benchmarker<ConvBias> benchmark1(handle());
    benchmark1
            .set_dtype(0,
                       dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(100)))
            .set_dtype(1,
                       dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(120)))
            .set_dtype(2, dtype::QuantizedS32(0.04f))
            .set_dtype(4,
                       dtype::Quantized8Asymm(1.4f, static_cast<uint8_t>(110)));
    benchmark1.set_display(false);
    benchmark1.set_times(RUN);

    for (auto&& arg : args) {
        TensorLayout dst_layout;
        auto opr = handle()->create_operator<ConvBias>();
        opr->param() = arg.param;
        opr->deduce_layout({arg.src, dtype::Int8()},
                           {arg.filter, dtype::Int8()},
                           {arg.bias, dtype::Int32()}, {}, dst_layout);
        //! dst.nr_elems * IC * FH * FW * 2
        float computations = dst_layout.total_nr_elems() * arg.filter[1] *
                             arg.filter[2] * arg.filter[3] * 2.0 /
                             (1024 * 1024 * 1024) * 1e3;

        auto used0 = benchmark0.set_param(arg.param).exec(
                             {arg.src, arg.filter, arg.bias, {}, {}}) /
                     RUN;
        auto used1 = benchmark1.set_param(arg.param).exec(
                             {arg.src, arg.filter, arg.bias, {}, {}}) /
                     RUN;
        printf("%s %s: conv_bias: %f ms %f Gflops conv_elem: %f ms %f GFlops "
               "speedup: %f\n",
               arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
               used0, computations / used0, used1, computations / used1,
               used1 / used0);
    }
}

TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_QUINT8_STRIDE2_WITHDOTPROD) {
    // have to remove the "preferred" restriction in the usable() function
    // before running the benchmark
    using namespace conv_bias;
    std::vector<TestArg> args;
    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
                   size_t p, NonlineMode nonline_mode) {
        if (w + 2 * p < kernel || h + 2 * p < kernel)
            return;
        param::ConvBias param;
        param.stride_h = 2;
        param.stride_w = 2;
        param.pad_h = p;
        param.pad_w = p;
        param.nonlineMode = nonline_mode;
        //! channel bias
        args.emplace_back(param, TensorShape{2, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel},
                          TensorShape{1, oc, 1, 1});
    };
    // clang-format off
    for (size_t kernel : {2, 3, 5, 7})
    for (size_t ic : {1, 8, 16, 32})
    for (size_t oc : {1, 8, 16, 32})
    for (size_t p : {1})
    for (NonlineMode nonline_mode : {NonlineMode::RELU}) {
        run(oc, ic, 56, 56, kernel, p, nonline_mode);
        run(oc, ic, 128, 128, kernel, p, nonline_mode);
        run(oc, ic, 256, 256, kernel, p, nonline_mode);
    }
    // clang-format on

    constexpr size_t RUN = 50;
    Benchmarker<ConvBias> benchmark0(handle());
    benchmark0
            .set_dtype(0,
                       dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(100)))
            .set_dtype(1,
                       dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(120)))
            .set_dtype(2, dtype::QuantizedS32(0.04f))
            .set_dtype(4,
                       dtype::Quantized8Asymm(1.4f, static_cast<uint8_t>(110)));
    benchmark0.set_display(false);
    benchmark0.set_times(RUN);
    benchmark0.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("ARMDOTU8STRD2"));

    Benchmarker<ConvBias> benchmark1(handle());
    benchmark1
            .set_dtype(0,
                       dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(100)))
            .set_dtype(1,
                       dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(120)))
            .set_dtype(2, dtype::QuantizedS32(0.04f))
            .set_dtype(4,
                       dtype::Quantized8Asymm(1.4f, static_cast<uint8_t>(110)));
    benchmark1.set_display(false);
    benchmark1.set_times(RUN);

    for (auto&& arg : args) {
        TensorLayout dst_layout;
        auto opr = handle()->create_operator<ConvBias>();
        opr->param() = arg.param;
        opr->deduce_layout({arg.src, dtype::Int8()},
                           {arg.filter, dtype::Int8()},
                           {arg.bias, dtype::Int32()}, {}, dst_layout);
        //! dst.nr_elems * IC * FH * FW * 2
        float computations = dst_layout.total_nr_elems() * arg.filter[1] *
                             arg.filter[2] * arg.filter[3] * 2.0 /
                             (1024 * 1024 * 1024) * 1e3;

        auto used0 = benchmark0.set_param(arg.param).exec(
                             {arg.src, arg.filter, arg.bias, {}, {}}) /
                     RUN;
        auto used1 = benchmark1.set_param(arg.param).exec(
                             {arg.src, arg.filter, arg.bias, {}, {}}) /
                     RUN;
        printf("%s %s: conv_bias: %f ms %f Gflops conv_elem: %f ms %f GFlops "
               "speedup: %f\n",
               arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
               used0, computations / used0, used1, computations / used1,
               used1 / used0);
    }
}
#endif
#endif

/*====================== BENCHMARK CONV1X1 ===========================*/
#if MEGDNN_WITH_BENCHMARK
namespace {
std::vector<conv_bias::TestArg> get_conv_bias_1x1_benchmark_args(
        size_t pack_size = 1) {
    using namespace conv_bias;
    std::vector<TestArg> args;
    param::ConvBias param;
    param.stride_h = 1;
    param.stride_w = 1;
    param.pad_h = 0;
    param.pad_w = 0;
    param.nonlineMode = param::ConvBias::NonlineMode::IDENTITY;
    auto bench_case = [&](size_t OC, size_t IC, size_t H, size_t W) {
        if (pack_size == 1)
            args.emplace_back(param, TensorShape{1, IC, H, W},
                              TensorShape{OC, IC, 1, 1}, TensorShape{});
        else {
            if (pack_size == 4)
                param.format = param::ConvBias::Format::NCHW44;
            args.emplace_back(param,
                              TensorShape{1, IC / pack_size, H, W, pack_size},
                              TensorShape{OC / pack_size, IC / pack_size, 1, 1,
                                          pack_size, pack_size},
                              TensorShape{});
        }
    };

    //! MobileNetV1
    bench_case(64, 32, 112, 112);
    bench_case(128, 64, 56, 56);
    bench_case(128, 128, 56, 56);
    bench_case(256, 128, 28, 28);
    bench_case(256, 256, 28, 28);
    bench_case(512, 256, 14, 14);
    bench_case(512, 512, 14, 14);
    bench_case(1024, 512, 7, 7);
    bench_case(1024, 1024, 7, 7);
    //! MobileNetV2
    bench_case(16, 32, 112, 112);
    bench_case(96, 16, 112, 112);
    bench_case(144, 24, 56, 56);
    bench_case(192, 32, 28, 28);
    bench_case(384, 64, 28, 28);
    bench_case(576, 96, 14, 14);
    bench_case(960, 160, 7, 7);
    bench_case(320, 960, 7, 7);
    bench_case(1280, 320, 7, 7);
    //! MobileNetV3-Large
    bench_case(64, 16, 112, 112);
    bench_case(72, 24, 56, 56);
    bench_case(120, 40, 28, 28);
    bench_case(240, 40, 28, 28);
    bench_case(200, 80, 14, 14);
    bench_case(184, 80, 14, 14);
    bench_case(480, 80, 14, 14);
    bench_case(672, 112, 14, 14);
    //! MobileNetV3-Small
    bench_case(72, 16, 56, 56);
    bench_case(88, 24, 28, 28);
    bench_case(96, 24, 28, 28);
    bench_case(240, 40, 14, 14);
    bench_case(120, 40, 14, 14);
    bench_case(144, 48, 14, 14);
    bench_case(288, 48, 14, 14);
    bench_case(576, 96, 7, 7);
    //! resnet50
    bench_case(256, 64, 56, 56);
    bench_case(512, 128, 28, 28);
    bench_case(1024, 256, 14, 14);
    bench_case(2048, 512, 7, 7);
    return args;
}
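
//! Illustrative sketch only -- this helper is not referenced by the tests.
//! A 1x1 stride-1 convolution on an NCHW tensor is exactly a matrix
//! multiplication with M = OC, K = IC and N = OH * OW, which is why
//! benchmark_conv1x1() below can compare the two primitives on equal FLOP
//! counts. The 2 * M * N * K FLOPs are pre-scaled so that dividing by a
//! runtime in milliseconds directly yields GFLOP/s.
inline float conv1x1_computations_scaled(size_t OC, size_t IC, size_t OH,
                                         size_t OW) {
    //! e.g. bench_case(64, 32, 112, 112): 2 * 64 * 32 * (112 * 112) FLOPs
    return OC * (OH * OW) * IC * 2.f / (1024 * 1024 * 1024) * 1e3;
}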

void benchmark_conv1x1(const char* matmul_algo_name, Handle* handle,
                       DType stype, DType matmul_dtype, DType bias_type,
                       DType conv_dtype) {
    using namespace conv_bias;
    std::vector<TestArg> conv_bias_1x1_args =
            get_conv_bias_1x1_benchmark_args();
    constexpr size_t RUNS = 50;

    param::MatrixMul param;
    param.transposeA = false;
    param.transposeB = false;
    Benchmarker<MatrixMul> benchmark_matmul(handle);
    benchmark_matmul.set_before_exec_callback(
            AlgoChecker<MatrixMul>(matmul_algo_name));
    benchmark_matmul.set_times(RUNS)
            .set_dtype(0, stype)
            .set_dtype(1, stype)
            .set_dtype(2, matmul_dtype)
            .set_param(param)
            .set_display(false);

    std::string conv1x1_algo_name = ssprintf("CONV1x1:%s:24", matmul_algo_name);
    Benchmarker<ConvBias> benchmark_conv1x1(handle);
    benchmark_conv1x1.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBias>(
                    conv1x1_algo_name.c_str()));
    benchmark_conv1x1.set_times(RUNS)
            .set_dtype(0, stype)
            .set_dtype(1, stype)
            .set_dtype(2, bias_type)
            .set_dtype(4, conv_dtype)
            .set_display(false);

    for (auto&& arg : conv_bias_1x1_args) {
        size_t IC = arg.src[1];
        size_t OH = arg.src[2];
        size_t OW = arg.src[3];
        size_t OC = arg.filter[0];
        size_t M = OC;
        size_t K = IC;
        size_t N = OH * OW;
        float computations = M * N * K * 2.f / (1024 * 1024 * 1024) * 1e3;
        TensorShape A, B;
        A = TensorShape{M, K};
        B = TensorShape{K, N};
        auto conv1x1_used = benchmark_conv1x1.set_param(arg.param).exec(
                                    {arg.src, arg.filter, arg.bias, {}, {}}) /
                            RUNS;
        auto matmul_used = benchmark_matmul.exec({A, B, {}}) / RUNS;
        printf("\n%s: ", matmul_algo_name);
        printf("%s %s:\n matmul: %f ms %f Gflops\nconv1x1: %f ms %f GFlops "
               "speedup: %f\n",
               arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
               matmul_used, computations / matmul_used, conv1x1_used,
               computations / conv1x1_used, matmul_used / conv1x1_used);
    }
}
}  // namespace
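
//! Each CONV1X1 test below pins a matmul kernel that matches the build
//! target: the MEGDNN_AARCH64 / MEGDNN_ARMV7 and __ARM_FEATURE_DOTPROD
//! guards select among the aarch64, armv7 and dot-product variants of the
//! same dtype combination.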
TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_CONV1X1_S1_F32) {
#if MEGDNN_AARCH64
    benchmark_conv1x1("AARCH64_F32K8X12X1", handle(), dtype::Float32{},
                      dtype::Float32{}, dtype::Float32{}, dtype::Float32{});
#else
    benchmark_conv1x1("ARMV7_F32", handle(), dtype::Float32{}, dtype::Float32{},
                      dtype::Float32{}, dtype::Float32{});
#endif
}

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_CONV1X1_S1_F16) {
#if MEGDNN_AARCH64
    benchmark_conv1x1("AARCH64_F16_K8X24X1", handle(), dtype::Float16{},
                      dtype::Float16{}, dtype::Float16{}, dtype::Float16{});
#else
    benchmark_conv1x1("AARCH32_F16_K4X16X1", handle(), dtype::Float16{},
                      dtype::Float16{}, dtype::Float16{}, dtype::Float16{});
#endif
}
#endif

TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_CONV1X1_S1_QUANTIZEDSYM) {
    dtype::QuantizedS8 stype(2.5f);
    dtype::QuantizedS32 dtype(6.25f);
#if MEGDNN_AARCH64
#if __ARM_FEATURE_DOTPROD
    benchmark_conv1x1("AARCH64_INT8X8X32_K8X12X4_DOTPROD", handle(), stype,
                      dtype, dtype, dtype);
#else
    benchmark_conv1x1("AARCH64_INT8X8X32_K8X8X8", handle(), stype, dtype, dtype,
                      dtype);
    benchmark_conv1x1("AARCH64_INT8X8X32_K4X4X16", handle(), stype, dtype,
                      dtype, dtype);
#endif
#elif MEGDNN_ARMV7
    benchmark_conv1x1("ARMV7_INT8X8X32_K4X8X8", handle(), stype, dtype, dtype,
                      dtype);
#endif
}

TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_CONV1X1_S1_QUANTIZEDASYM) {
    dtype::Quantized8Asymm stype(1.2f, (uint8_t)125);
    dtype::QuantizedS32 dtype(1.2 * 1.2);
#if MEGDNN_AARCH64
#if __ARM_FEATURE_DOTPROD
    benchmark_conv1x1("AARCH64_QUINT8_K8X8X4_DOTPROD", handle(), stype, dtype,
                      dtype, dtype);
#else
    benchmark_conv1x1("AARCH64_QUINT8_K8X8X8", handle(), stype, dtype, dtype,
                      dtype);
#endif
#elif MEGDNN_ARMV7
    benchmark_conv1x1("ARMV7_QUINT8_K4X8X8", handle(), stype, dtype, dtype,
                      dtype);
#endif
}

TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_CONV1X1_S1_INT8x8x16) {
#if MEGDNN_AARCH64
    benchmark_conv1x1("AARCH64_INT8X8X16_K8X8X8", handle(), dtype::Int8{},
                      dtype::Int16{}, dtype::Int16{}, dtype::Int16{});
    benchmark_conv1x1("AARCH64_INT8X8X16_K4X4X16", handle(), dtype::Int8{},
                      dtype::Int16{}, dtype::Int16{}, dtype::Int16{});
#elif MEGDNN_ARMV7
    benchmark_conv1x1("ARMV7_INT8X8X16_K4X8X8", handle(), dtype::Int8{},
                      dtype::Int16{}, dtype::Int16{}, dtype::Int16{});
    benchmark_conv1x1("ARMV7_INT8X8X16_K4X2X16", handle(), dtype::Int8{},
                      dtype::Int16{}, dtype::Int16{}, dtype::Int16{});
#endif
}

#ifndef __ARM_FEATURE_DOTPROD
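//! Index-matched shape pairs from get_conv_bias_1x1_benchmark_args(4) and
//! (1) compare CONV1x1 backed by the MK4-packed int8 matmul (NCHW44 layout)
//! against the plain K4X4X16 matmul (NCHW layout). The test is compiled out
//! on dot-product builds, where these non-dot kernels are presumably not the
//! baseline of interest.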
TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_1X1_S1_NCHW_VS_NCHW44_INT8x8x32) {
    using namespace conv_bias;
    std::vector<TestArg> conv_bias_1x1_args_nchw44 =
            get_conv_bias_1x1_benchmark_args(4);
    std::vector<TestArg> conv_bias_1x1_args_nchw =
            get_conv_bias_1x1_benchmark_args(1);
    constexpr size_t RUNS = 50;

    Benchmarker<ConvBias> benchmark_conv1x1_nchw44(handle());
    benchmark_conv1x1_nchw44.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBias>(
                    "CONV1x1:AARCH64_INT8X8X32_MK4_4X4X16:24"));
    benchmark_conv1x1_nchw44.set_times(RUNS)
            .set_dtype(0, dtype::Int8())
            .set_dtype(1, dtype::Int8())
            .set_dtype(2, dtype::Int32())
            .set_dtype(4, dtype::Int32())
            .set_display(false);

    Benchmarker<ConvBias> benchmark_conv1x1_nchw(handle());
    benchmark_conv1x1_nchw.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBias>(
                    "CONV1x1:AARCH64_INT8X8X32_K4X4X16:24"));
    benchmark_conv1x1_nchw.set_times(RUNS)
            .set_dtype(0, dtype::Int8())
            .set_dtype(1, dtype::Int8())
            .set_dtype(2, dtype::Int32())
            .set_dtype(4, dtype::Int32())
            .set_display(false);

    for (size_t i = 0; i < conv_bias_1x1_args_nchw44.size(); ++i) {
        auto&& arg_nchw = conv_bias_1x1_args_nchw[i];
        auto&& arg_nchw44 = conv_bias_1x1_args_nchw44[i];

        size_t IC = arg_nchw.src[1];
        size_t OH = arg_nchw.src[2];
        size_t OW = arg_nchw.src[3];
        size_t OC = arg_nchw.filter[0];
        size_t M = OC;
        size_t K = IC;
        size_t N = OH * OW;
        float computations = M * N * K * 2.f / (1024 * 1024 * 1024) * 1e3;

        auto conv1x1_nchw = benchmark_conv1x1_nchw.set_param(arg_nchw.param)
                                    .exec({arg_nchw.src,
                                           arg_nchw.filter,
                                           arg_nchw.bias,
                                           {},
                                           {}}) /
                            RUNS;
        auto conv1x1_nchw44 =
                benchmark_conv1x1_nchw44.set_param(arg_nchw44.param)
                        .exec({arg_nchw44.src,
                               arg_nchw44.filter,
                               arg_nchw44.bias,
                               {},
                               {}}) /
                RUNS;
        printf("%s %s:\n conv_1x1_nchw: %f ms %f Gflops\nconv1x1_nchw44: %f ms "
               "%f GFlops speedup: %f\n",
               arg_nchw.src.to_string().c_str(),
               arg_nchw.filter.to_string().c_str(), conv1x1_nchw,
               computations / conv1x1_nchw, conv1x1_nchw44,
               computations / conv1x1_nchw44, conv1x1_nchw / conv1x1_nchw44);
    }
}
#endif
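
//! Benchmarks quantized int8 im2col+matmul against the int16x16x32 MK8
//! winograd implementation on 3x3 filters; in the algorithm name the
//! trailing ":8:2" appears to select pack size 8 and output block size 2,
//! i.e. an F(2x2, 3x3) winograd transform. The printed speedup is im2col
//! time over winograd time.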
TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_WINOGRAD_VS_IM2COL_INT8) {
    auto&& args = get_winograd_benchmark_args(3, 8);
    using namespace conv_bias;
    constexpr size_t RUN = 10;

    Benchmarker<ConvBias> benchmark_im2col(handle());
    benchmark_im2col.set_display(false);
    benchmark_im2col.set_times(RUN);
    benchmark_im2col.set_dtype(0, dtype::QuantizedS8(2.5f))
            .set_dtype(1, dtype::QuantizedS8(2.5f))
            .set_dtype(2, dtype::QuantizedS32(6.25f))
            .set_dtype(4, dtype::QuantizedS8(60.25f));

    Benchmarker<ConvBias> benchmark_winograd(handle());
    benchmark_winograd.set_display(false);
    benchmark_winograd.set_times(RUN);
    benchmark_winograd.set_dtype(0, dtype::QuantizedS8(2.5f))
            .set_dtype(1, dtype::QuantizedS8(2.5f))
            .set_dtype(2, dtype::QuantizedS32(6.25f))
            .set_dtype(4, dtype::QuantizedS8(60.25f));

    for (auto&& arg : args) {
        TensorLayout dst_layout;
        auto opr = handle()->create_operator<ConvBias>();
        opr->param() = arg.param;
        opr->deduce_layout({arg.src, dtype::Float32()},
                           {arg.filter, dtype::Float32()},
                           {arg.bias, dtype::Float32()}, {}, dst_layout);
        //! dst.nr_elems * IC * FH * FW * 2
        float computations = dst_layout.total_nr_elems() * arg.filter[1] *
                             arg.filter[2] * arg.filter[3] * 2.0 /
                             (1024 * 1024 * 1024) * 1e3;

        benchmark_im2col.set_param(arg.param);
        auto im2col_used =
                algo_benchmark<ConvBias>(
                        benchmark_im2col, {arg.src, arg.filter, {}, {}, {}},
                        "IM2COLMATMUL:AARCH64_INT8X8X32_K4X4X16") /
                RUN;

        benchmark_winograd.set_param(arg.param);
        auto winograd_used =
                algo_benchmark<ConvBias>(
                        benchmark_winograd, {arg.src, arg.filter, {}, {}, {}},
                        "WINOGRAD:AARCH64_INT16X16X32_MK8_8X8:8:2") /
                RUN;

        printf("%s %s: im2col: %f ms %f Gflops winograd: %f ms %f GFlops "
               "speedup: %f\n",
               arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
               im2col_used, computations / im2col_used, winograd_used,
               computations / winograd_used, im2col_used / winograd_used);
    }
}
#endif

// vim: syntax=cpp.doxygen
