conv_bias.cpp

  1. /**
  2. * \file dnn/test/arm_common/conv_bias.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
  10. * implied.
  11. */
  12. #include "megdnn/dtype.h"
  13. #include "test/arm_common/fixture.h"
  14. #include "megdnn/opr_param_defs.h"
  15. #include "megdnn/oprs.h"
  16. #include "src/fallback/conv_bias/common.h"
  17. #include "test/common/benchmarker.h"
  18. #include "test/common/checker.h"
  19. #include "test/common/conv_bias.h"
  20. #include "test/common/rng.h"
  21. #include "test/common/tensor.h"
  22. #include "test/common/workspace_wrapper.h"
  23. using namespace megdnn;
  24. using namespace test;
  25. using namespace conv_bias;
  26. //! TODO: this algo currently does not support multithreading
  27. TEST_F(ARM_COMMON, CONVBIAS_INT8_INT8_INT16_STRIDE2F2) {
  28. checker_conv_bias_int8x8x16(get_conv_bias_args({2}, 2, true, true, true),
  29. handle(), "I8816STRD2F2");
  30. }
  31. TEST_F(ARM_COMMON, CONV_BIAS_MATMUL) {
  32. using namespace conv_bias;
  33. std::vector<TestArg> args = get_quantized_args();
  34. Checker<ConvBiasForward> checker(handle());
  35. checker.set_before_exec_callback(
  36. conv_bias::ConvBiasAlgoChecker<ConvBias>("S8MATMUL"));
  37. #if MEGDNN_ARMV7
  38. checker.set_epsilon(1);
  39. #endif
  40. UniformIntRNG rng{-50, 50};
  41. for (auto&& arg : args) {
  42. if (arg.bias.ndim == 4 && arg.bias[2] != 1 && arg.bias[3] != 1)
  43. continue;
  44. checker.set_dtype(0, dtype::QuantizedS8(0.41113496f))
  45. .set_dtype(1, dtype::QuantizedS8(0.01887994f))
  46. .set_dtype(2, dtype::QuantizedS32(0.41113496f * 0.01887994f))
  47. .set_dtype(4, dtype::QuantizedS8(0.49550694f))
  48. .set_rng(0, &rng)
  49. .set_rng(1, &rng)
  50. .set_rng(2, &rng)
  51. .set_param(arg.param)
  52. .execs({arg.src, arg.filter, arg.bias, {}, {}});
  53. }
  54. }
  55. #define CONV_BIAS_MATMUL_QU8_MODE(MODE) \
  56. using namespace conv_bias; \
  57. std::vector<TestArg> args = get_quantized_args_with_nlmode(MODE); \
  58. Checker<ConvBiasForward> checker(handle()); \
  59. checker.set_before_exec_callback( \
  60. conv_bias::ConvBiasAlgoChecker<ConvBias>("QU8MATMUL")); \
  61. UniformIntRNG rng{0, 127}; \
  62. for (auto&& arg : args) { \
  63. if (arg.bias.ndim == 4 && arg.bias[2] != 1 && arg.bias[3] != 1) \
  64. continue; \
  65. checker.set_dtype(0, dtype::Quantized8Asymm( \
  66. 2.5f, static_cast<uint8_t>(127))) \
  67. .set_dtype(1, dtype::Quantized8Asymm( \
  68. 2.7f, static_cast<uint8_t>(126))) \
  69. .set_dtype(2, dtype::QuantizedS32(6.75f)) \
  70. .set_dtype(4, dtype::Quantized8Asymm( \
  71. 60.25f, static_cast<uint8_t>(125))) \
  72. .set_rng(0, &rng) \
  73. .set_rng(1, &rng) \
  74. .set_rng(2, &rng) \
  75. .set_param(arg.param) \
  76. .execs({arg.src, arg.filter, arg.bias, {}, {}}); \
  77. }
  78. #define MODE_STR(mode) param::ConvBias::NonlineMode::mode
  79. #define CB_TEST(MODE) \
  80. TEST_F(ARM_COMMON, CONV_BIAS_MATMUL_QU8_##MODE) { \
  81. CONV_BIAS_MATMUL_QU8_MODE(MODE_STR(MODE)); \
  82. }
  83. CB_TEST(IDENTITY);
  84. CB_TEST(RELU);
  85. CB_TEST(H_SWISH);
  86. #undef MODE_STR
  87. #undef CB_TEST
  88. #undef CONV_BIAS_MATMUL_QU8_MODE
  89. #if MEGDNN_WITH_BENCHMARK
  90. static void benchmark_convbias(Handle* handle, bool is_fp32 = false) {
  91. constexpr size_t RUNS = 30;
  92. Benchmarker<ConvBias> benchmarker_int(handle);
  93. benchmarker_int.set_times(RUNS)
  94. .set_dtype(0, dtype::QuantizedS8(2.5))
  95. .set_dtype(1, dtype::QuantizedS8(2.5))
  96. .set_dtype(2, dtype::QuantizedS32(6.25))
  97. .set_dtype(4, dtype::QuantizedS8(60.25))
  98. .set_display(false);
  99. benchmarker_int.set_before_exec_callback(
  100. conv_bias::ConvBiasAlgoChecker<ConvBias>(
  101. "IM2COLMATMUL:AARCH64_INT8X8X32_K4X4X16:384"));
  102. Benchmarker<ConvBias> benchmarker_float(handle);
  103. benchmarker_float.set_display(false).set_times(RUNS);
  104. benchmarker_float.set_before_exec_callback(
  105. conv_bias::ConvBiasAlgoChecker<ConvBias>(
  106. "IM2COLMATMUL:AARCH64_F32K8X12X1:192"));
  107. Benchmarker<ConvBias> benchmarker_int_nchw44(handle);
  108. if (is_fp32) {
  109. benchmarker_int_nchw44.set_times(RUNS)
  110. .set_dtype(0, dtype::Float32())
  111. .set_dtype(1, dtype::Float32())
  112. .set_dtype(2, dtype::Float32())
  113. .set_dtype(4, dtype::Float32())
  114. .set_display(false);
  115. } else {
  116. benchmarker_int_nchw44.set_times(RUNS)
  117. .set_dtype(0, dtype::QuantizedS8(2.5))
  118. .set_dtype(1, dtype::QuantizedS8(2.5))
  119. .set_dtype(2, dtype::QuantizedS32(6.25))
  120. .set_dtype(4, dtype::QuantizedS8(60.25))
  121. .set_display(false);
  122. }
  123. benchmarker_int_nchw44.set_before_exec_callback(
  124. conv_bias::ConvBiasAlgoChecker<ConvBias>(".+"));
  125. auto run = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W,
  126. size_t FS, size_t stride, bool input_nchw = false) {
  127. param::ConvBias param;
  128. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  129. param.stride_h = stride;
  130. param.stride_w = stride;
  131. param.pad_h = FS / 2;
  132. param.pad_w = FS / 2;
  133. auto OH = (H + 2 * param.pad_h - FS) /
  134. static_cast<size_t>(param.stride_h) +
  135. 1;
  136. auto OW = (W + 2 * param.pad_w - FS) /
  137. static_cast<size_t>(param.stride_w) +
  138. 1;
  139. TensorShape src({N, IC, H, W}), filter({OC, IC, FS, FS}),
  140. bias({1, OC, 1, 1}), dst({N, OC, OH, OW});
  141. param.format = param::ConvBias::Format::NCHW;
  142. auto int_used = benchmarker_int.set_param(param).exec(
  143. {src, filter, bias, {}, dst}) /
  144. RUNS;
  145. auto float_used = benchmarker_float.set_param(param).exec(
  146. {src, filter, bias, {}, dst}) /
  147. RUNS;
  148. param.format = param::ConvBias::Format::NCHW44;
  149. src = {N, IC / 4, H, W, 4};
  150. filter = {OC / 4, IC / 4, FS, FS, 4, 4};
  151. if (input_nchw) {
  152. src = {N, IC, H, W};
  153. filter = {OC / 4, FS, FS, IC, 4};
  154. }
  155. bias = {1, OC / 4, 1, 1, 4};
  156. dst = {N, OC / 4, OH, OW, 4};
  157. auto int_nchw44_used = benchmarker_int_nchw44.set_param(param).exec(
  158. {src, filter, bias, {}, dst}) /
  159. RUNS;
  160. float computations = IC * (FS * FS) * dst.total_nr_elems() * 2 * 1e-6;
  161. printf("run: %s %s %s->%s \n", src.to_string().c_str(),
  162. filter.to_string().c_str(), bias.to_string().c_str(),
  163. dst.to_string().c_str());
  164. printf("float: %f ms %f Gflops, ", float_used,
  165. computations / float_used);
  166. printf("int_nchw: %f ms %f Gflops, ", int_used,
  167. computations / int_used);
  168. auto speed_up = int_used / int_nchw44_used;
  169. if (is_fp32) {
  170. speed_up = float_used / int_nchw44_used;
  171. printf("fp32_nchw44: %f ms %f Gflops %f speedup, ", int_nchw44_used,
  172. computations / int_nchw44_used, speed_up);
  173. } else {
  174. printf("int_nchw44: %f ms %f Gflops %f speedup, ", int_nchw44_used,
  175. computations / int_nchw44_used, speed_up);
  176. }
  177. printf("\n");
  178. };
  179. if (is_fp32) {
  180. run(1, 1, 4, 112, 112, 2, 2, true);
  181. run(1, 3, 32, 224, 224, 3, 2, true);
  182. run(1, 3, 64, 224, 224, 7, 2, true);
  183. run(1, 64, 128, 56, 56, 3, 2, false);
  184. run(1, 128, 256, 28, 28, 3, 2, false);
  185. run(1, 256, 512, 14, 14, 3, 2, false);
  186. run(1, 64, 128, 56, 56, 7, 2, false);
  187. run(1, 128, 256, 28, 28, 7, 2, false);
  188. run(1, 256, 512, 14, 14, 7, 2, false);
  189. run(1, 64, 64, 48, 48, 3, 2, false);
  190. } else {
  191. for (size_t stride : {1, 2}) {
  192. printf("stride %zu\n", stride);
  193. for (size_t filter_size : {2, 3, 5, 7}) {
  194. for (size_t img_size : {32}) {
  195. for (size_t channel : {8, 16, 32, 64, 128, 256}) {
  196. run(1, channel, channel, img_size, img_size,
  197. filter_size, stride, false);
  198. }
  199. }
  200. }
  201. }
  202. }
  203. }
  204. TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_NCHW44) {
  205. benchmark_convbias(handle(), true);
  206. }
  207. TEST_F(ARM_COMMON_MULTI_THREADS, BENCHMARK_CONVBIAS_NCHW44) {
  208. benchmark_convbias(handle(), true);
  209. }
  210. #endif
  211. TEST_F(ARM_COMMON, CONV_BIAS_MATMUL_QS8) {
  212. using namespace conv_bias;
  213. std::vector<TestArg> args = get_quantized_args();
  214. Checker<ConvBiasForward> checker(handle());
  215. checker.set_before_exec_callback(
  216. conv_bias::ConvBiasAlgoChecker<ConvBias>("S8MATMUL"));
  217. #if MEGDNN_ARMV7
  218. checker.set_epsilon(1);
  219. #endif
  220. UniformIntRNG rng{0, 255};
  221. for (auto&& arg : args) {
  222. if (arg.bias.ndim == 4 && arg.bias[2] != 1 && arg.bias[3] != 1)
  223. continue;
  224. checker.set_dtype(0, dtype::QuantizedS8(2.5f))
  225. .set_dtype(1, dtype::QuantizedS8(2.7f))
  226. .set_dtype(2, dtype::QuantizedS32(6.75f))
  227. .set_dtype(4, dtype::QuantizedS8(60.25f))
  228. .set_rng(0, &rng)
  229. .set_rng(1, &rng)
  230. .set_rng(2, &rng)
  231. .set_param(arg.param)
  232. .execs({arg.src, arg.filter, arg.bias, {}, {}});
  233. }
  234. }
  235. #if MEGDNN_ARMV7
  236. TEST_F(ARM_COMMON, CONV_BIAS_RESCALE_OP) {
  237. using namespace conv_bias;
  238. Checker<ConvBias> checker(handle());
  239. checker.set_before_exec_callback(
  240. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("S8MATMUL"));
  241. checker.set_epsilon(1).set_max_avg_error(1e-2).set_max_avg_biased_error(
  242. 1e-3);
  243. UniformIntRNG rng{-128, 127};
  244. checker.set_dtype(0, dtype::QuantizedS8(0.41113496f))
  245. .set_dtype(1, dtype::QuantizedS8(0.01887994f))
  246. .set_dtype(2, dtype::QuantizedS32(0.41113496f * 0.01887994f))
  247. .set_dtype(4, dtype::QuantizedS8(0.49550694f))
  248. .set_rng(0, &rng)
  249. .set_rng(1, &rng)
  250. .set_rng(2, &rng);
  251. param::ConvBias param;
  252. param.stride_h = 1;
  253. param.stride_w = 1;
  254. param.pad_h = 0;
  255. param.pad_w = 0;
  256. param.nonlineMode = NonlineMode::IDENTITY;
  257. //! Unary op
  258. checker.set_param(param).exec({TensorShape{2, 1, 128, 128},
  259. TensorShape{16, 1, 2, 2},
  260. TensorShape{},
  261. TensorShape{},
  262. {}});
  263. //! Binary op
  264. checker.set_param(param).exec({TensorShape{2, 1, 128, 128},
  265. TensorShape{16, 1, 2, 2},
  266. TensorShape{1, 16, 1, 1},
  267. TensorShape{},
  268. {}});
  269. }
  270. #endif
  271. #if MEGDNN_WITH_BENCHMARK
  272. void benchmark_im2col(const char* algo_name, const char* im2col_name,
  273. Handle* handle, size_t kernel, size_t pack_size = 1) {
  274. auto&& args = get_winograd_benchmark_args(kernel, pack_size);
  275. using namespace conv_bias;
  276. constexpr size_t RUN = 10;
  277. Benchmarker<ConvBias> benchmark(handle);
  278. benchmark.set_display(false);
  279. benchmark.set_times(RUN);
  280. Benchmarker<ConvBias> benchmark_im2col(handle);
  281. benchmark_im2col.set_display(false);
  282. benchmark_im2col.set_times(RUN);
  283. for (auto&& arg : args) {
  284. TensorLayout dst_layout;
  285. auto opr = handle->create_operator<ConvBias>();
  286. opr->param() = arg.param;
  287. opr->deduce_layout({arg.src, dtype::Float32()},
  288. {arg.filter, dtype::Float32()},
  289. {arg.bias, dtype::Float32()}, {}, dst_layout);
  290. //! dst.nr_elems * IC * FH * FW * 2
  291. float computations = dst_layout.total_nr_elems() * arg.filter[1] *
  292. arg.filter[2] * arg.filter[3] * 2.0 /
  293. (1024 * 1024 * 1024) * 1e3;
  294. benchmark.set_param(arg.param);
  295. auto used = algo_benchmark<ConvBias>(benchmark,
  296. {arg.src, arg.filter, {}, {}, {}},
  297. algo_name) /
  298. RUN;
  299. benchmark_im2col.set_param(arg.param);
  300. auto used_im2col =
  301. algo_benchmark<ConvBias>(benchmark_im2col,
  302. {arg.src, arg.filter, {}, {}, {}},
  303. im2col_name) /
  304. RUN;
  305. printf("%s %s: normal: %f ms %f Gflops im2col: %f ms %f GFlops "
  306. "speedup: "
  307. "%f\n",
  308. arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
  309. used, computations / used, used_im2col,
  310. computations / used_im2col, used / used_im2col);
  311. }
  312. }
  313. void benchmark_im2col_single_algo(const char* im2col_name, Handle* handle,
  314. size_t kernel, size_t pack_size = 1) {
  315. std::vector<conv_bias::TestArg> args;
  316. auto pack = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  317. size_t p) {
  318. if (ic % pack_size != 0 || oc % pack_size != 0)
  319. return;
  320. if (w + 2 * p < kernel || h + 2 * p < kernel)
  321. return;
  322. param::ConvBias param;
  323. param.stride_h = 1;
  324. param.stride_w = 1;
  325. param.pad_h = p;
  326. param.pad_w = p;
  327. args.push_back(conv_bias::TestArg{param,
  328. TensorShape{1, ic, h, w},
  329. TensorShape{oc, ic, kernel, kernel},
  330. {1, oc, 1, 1}});
  331. };
  332. pack(1, 64, 100, 100, kernel, 1);
  333. pack(8, 64, 100, 100, kernel, 1);
  334. pack(16, 64, 100, 100, kernel, 1);
  335. pack(32, 64, 100, 100, kernel, 1);
  336. pack(64, 64, 100, 100, kernel, 1);
  337. pack(128, 64, 100, 100, kernel, 1);
  338. pack(256, 64, 100, 100, kernel, 1);
  339. pack(512, 64, 100, 100, kernel, 1);
  340. pack(1024, 64, 100, 100, kernel, 1);
  341. pack(1, 64, 10, 10, kernel, 1);
  342. pack(8, 64, 10, 10, kernel, 1);
  343. pack(16, 64, 10, 10, kernel, 1);
  344. pack(32, 64, 10, 10, kernel, 1);
  345. pack(64, 64, 10, 10, kernel, 1);
  346. pack(128, 64, 10, 10, kernel, 1);
  347. pack(256, 64, 10, 10, kernel, 1);
  348. pack(512, 64, 10, 10, kernel, 1);
  349. pack(1024, 64, 10, 10, kernel, 1);
  350. pack(1, 16, 10, 10, kernel, 1);
  351. pack(8, 16, 10, 10, kernel, 1);
  352. pack(16, 16, 10, 10, kernel, 1);
  353. pack(32, 16, 10, 10, kernel, 1);
  354. pack(64, 16, 10, 10, kernel, 1);
  355. pack(128, 16, 10, 10, kernel, 1);
  356. pack(256, 16, 10, 10, kernel, 1);
  357. pack(512, 16, 10, 10, kernel, 1);
  358. pack(1024, 16, 10, 10, kernel, 1);
  359. using namespace conv_bias;
  360. constexpr size_t RUN = 20;
  361. Benchmarker<ConvBias> benchmark_im2col(handle);
  362. benchmark_im2col.set_display(false);
  363. benchmark_im2col.set_times(RUN);
  364. for (auto&& arg : args) {
  365. TensorLayout dst_layout;
  366. auto opr = handle->create_operator<ConvBias>();
  367. opr->param() = arg.param;
  368. opr->deduce_layout({arg.src, dtype::Float32()},
  369. {arg.filter, dtype::Float32()},
  370. {arg.bias, dtype::Float32()}, {}, dst_layout);
  371. //! dst.nr_elems * IC * FH * FW * 2
  372. float computations = dst_layout.total_nr_elems() * arg.filter[1] *
  373. arg.filter[2] * arg.filter[3] * 2.0 /
  374. (1024 * 1024 * 1024) * 1e3;
  375. benchmark_im2col.set_param(arg.param);
  376. auto used_im2col =
  377. algo_benchmark<ConvBias>(benchmark_im2col,
  378. {arg.src, arg.filter, {}, {}, {}},
  379. im2col_name) /
  380. RUN;
  381. printf("%s %s: im2col: %f ms %f GFlops \n", arg.src.to_string().c_str(),
  382. arg.filter.to_string().c_str(), used_im2col,
  383. computations / used_im2col);
  384. }
  385. }
  386. void BENCHMARK_IM2COL_NCHW44_VS_NCHW(const char* algo_name,
  387. const char* im2col_name, Handle* handle,
  388. size_t kernel, size_t pack_size = 1) {
  389. auto&& args = get_winograd_benchmark_args(kernel, pack_size);
  390. using namespace conv_bias;
  391. constexpr size_t RUN = 10;
  392. Benchmarker<ConvBias> benchmark(handle);
  393. benchmark.set_display(false);
  394. benchmark.set_times(RUN);
  395. benchmark.set_dtype(0, dtype::Int8());
  396. benchmark.set_dtype(1, dtype::Int8());
  397. benchmark.set_dtype(2, dtype::Int32());
  398. benchmark.set_dtype(4, dtype::Int32());
  399. Benchmarker<ConvBias> benchmark_im2col(handle);
  400. benchmark_im2col.set_display(false);
  401. benchmark_im2col.set_times(RUN);
  402. benchmark_im2col.set_dtype(0, dtype::Int8());
  403. benchmark_im2col.set_dtype(1, dtype::Int8());
  404. benchmark_im2col.set_dtype(2, dtype::Int32());
  405. benchmark_im2col.set_dtype(4, dtype::Int32());
  406. for (auto&& arg : args) {
  407. TensorLayout dst_layout;
  408. auto opr = handle->create_operator<ConvBias>();
  409. opr->param() = arg.param;
  410. opr->deduce_layout({arg.src, dtype::Float32()},
  411. {arg.filter, dtype::Float32()},
  412. {arg.bias, dtype::Float32()}, {}, dst_layout);
  413. //! dst.nr_elems * IC * FH * FW * 2
  414. float computations = dst_layout.total_nr_elems() * arg.filter[1] *
  415. arg.filter[2] * arg.filter[3] * 2.0 /
  416. (1024 * 1024 * 1024) * 1e3;
  417. std::vector<conv_bias::TestArg> nchw44param;
  418. benchmark.set_param(arg.param);
  419. auto used = algo_benchmark<ConvBias>(benchmark,
  420. {arg.src, arg.filter, {}, {}, {}},
  421. algo_name) /
  422. RUN;
  423. arg.param.nonlineMode = param::ConvBias::NonlineMode::IDENTITY;
  424. arg.param.format = param::ConvBias::Format::NCHW44;
  425. benchmark_im2col.set_param(arg.param);
  426. nchw44param.push_back(conv_bias::TestArg{
  427. arg.param,
  428. TensorShape{arg.src.shape[0], arg.src.shape[1] / 4, arg.src[2],
  429. arg.src.shape[3], 4},
  430. TensorShape{arg.filter.shape[0] / 4, arg.filter.shape[1] / 4,
  431. kernel, kernel, 4, 4},
  432. TensorShape{}});
  433. auto used_im2col =
  434. algo_benchmark<ConvBias>(
  435. benchmark_im2col,
  436. {nchw44param[0].src, nchw44param[0].filter, {}, {}, {}},
  437. im2col_name) /
  438. RUN;
  439. printf("nchw44 shape src %s filter %s\n",
  440. nchw44param[0].src.to_string().c_str(),
  441. nchw44param[0].filter.to_string().c_str());
  442. printf("%s %s: normal: %f ms %f Gflops im2col: %f ms %f GFlops "
  443. "speedup: "
  444. "%f\n",
  445. arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
  446. used, computations / used, used_im2col,
  447. computations / used_im2col, used / used_im2col);
  448. }
  449. }
  450. TEST_F(ARM_COMMON, BENCHMARK_NCHW_VS_NCHW44_INT8x8x32) {
  451. printf("=========================compare "
  452. "IM2COLMATMUL:AARCH64_INT8X8X32_K4X4X16, "
  453. "IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16 \n");
  454. BENCHMARK_IM2COL_NCHW44_VS_NCHW("IM2COLMATMUL:AARCH64_INT8X8X32_K4X4X16",
  455. "IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16",
  456. handle(), 3, 4);
  457. }
  458. TEST_F(ARM_COMMON, BENCHMARK_GROUP_CONVBIAS_QUANTIZED) {
  459. constexpr size_t RUNS = 50;
  460. param::ConvBias param;
  461. param.sparse = param::ConvBias::Sparse::GROUP;
  462. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  463. Benchmarker<ConvBias> benchmarker_int(handle());
  464. benchmarker_int.set_times(RUNS)
  465. .set_dtype(0, dtype::QuantizedS8(2.5f))
  466. .set_dtype(1, dtype::QuantizedS8(2.5f))
  467. .set_dtype(2, dtype::QuantizedS32(6.25f))
  468. .set_dtype(4, dtype::QuantizedS8(40.25f))
  469. .set_display(false);
  470. Benchmarker<ConvBias> benchmarker_float(handle());
  471. benchmarker_float.set_display(false).set_times(RUNS);
  472. auto run = [&](size_t N, size_t GROUP, size_t IC, size_t OC, size_t H,
  473. size_t W, size_t FS, size_t STRD) {
  474. megdnn_assert(IC % GROUP == 0 && OC % GROUP == 0);
  475. TensorShape src({N, IC, H, W}),
  476. filter({GROUP, OC / GROUP, IC / GROUP, FS, FS}),
  477. bias({1, OC, 1, 1}), dst({N, OC, H / STRD, W / STRD});
  478. param.pad_h = FS / 2;
  479. param.pad_w = FS / 2;
  480. param.stride_h = STRD;
  481. param.stride_w = STRD;
  482. auto int_used = benchmarker_int.set_param(param).exec(
  483. {src, filter, bias, {}, dst}) /
  484. RUNS;
  485. auto float_used = benchmarker_float.set_param(param).exec(
  486. {src, filter, bias, {}, dst}) /
  487. RUNS;
  488. float computations = (IC / GROUP * FS * FS * dst.total_nr_elems() * 2 +
  489. dst.total_nr_elems()) *
  490. 1e-6;
  491. printf("run: %s %s %s->%s \nfloat: %f ms %f Gflops int: %f ms "
  492. "%f Gflops speedup: %f\n",
  493. src.to_string().c_str(), filter.to_string().c_str(),
  494. bias.to_string().c_str(), dst.to_string().c_str(), float_used,
  495. computations / float_used, int_used, computations / int_used,
  496. float_used / int_used);
  497. };
  498. run(1, 1, 28, 28, 28, 28, 3, 1);
  499. run(1, 68, 68, 68, 14, 14, 3, 2);
  500. run(1, 96, 96, 96, 14, 14, 3, 2);
  501. run(1, 100, 100, 100, 7, 7, 3, 1);
  502. }
  503. #endif
  504. #if MEGDNN_WITH_BENCHMARK
  505. TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_MATMUL) {
  506. constexpr size_t RUNS = 10;
  507. param::ConvBias param;
  508. param.stride_h = 1;
  509. param.stride_w = 1;
  510. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  511. Benchmarker<ConvBias> benchmarker(handle()), benchmarker_fused(handle());
  512. benchmarker.set_times(RUNS)
  513. .set_dtype(0, dtype::QuantizedS8(2.5f))
  514. .set_dtype(1, dtype::QuantizedS8(2.5f))
  515. .set_dtype(2, dtype::QuantizedS32(6.25f))
  516. .set_dtype(4, dtype::QuantizedS8(40.25f))
  517. .set_display(false);
  518. benchmarker_fused.set_times(RUNS)
  519. .set_dtype(0, dtype::QuantizedS8(2.5f))
  520. .set_dtype(1, dtype::QuantizedS8(2.5f))
  521. .set_dtype(2, dtype::QuantizedS32(6.25f))
  522. .set_dtype(4, dtype::QuantizedS8(40.25f))
  523. .set_display(false);
  524. benchmarker_fused.set_before_exec_callback(
  525. conv_bias::ConvBiasAlgoChecker<ConvBias>("S8MATMUL"));
  526. auto run = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W,
  527. size_t FS) {
  528. TensorShape src({N, IC, H, W}), filter({OC, IC, FS, FS}),
  529. bias({1, OC, 1, 1}), dst({N, OC, H, W});
  530. param.pad_h = FS / 2;
  531. param.pad_w = FS / 2;
  532. auto default_used = benchmarker.set_param(param).exec(
  533. {src, filter, bias, {}, dst}) /
  534. RUNS;
  535. auto fused_used = benchmarker_fused.set_param(param).exec(
  536. {src, filter, bias, {}, dst}) /
  537. RUNS;
  538. float computations =
  539. IC * (FS * FS + 1) * dst.total_nr_elems() * 2 * 1e-6;
  540. printf("run: %s %s %s->%s \ndefault: %f ms %f Gflops fused: %f ms "
  541. "%f Gflops speedup: %f\n",
  542. src.to_string().c_str(), filter.to_string().c_str(),
  543. bias.to_string().c_str(), dst.to_string().c_str(), default_used,
  544. computations / default_used, fused_used,
  545. computations / fused_used, default_used / fused_used);
  546. };
  547. run(1, 128, 128, 32, 32, 3);
  548. for (size_t IC : {36, 48}) {
  549. for (size_t OC : {36, 48, 64}) {
  550. for (size_t size : {56, 128, 256}) {
  551. for (size_t FS : {1, 3, 5}) {
  552. run(1, IC, OC, size, size, FS);
  553. }
  554. }
  555. }
  556. }
  557. }
  558. #endif
  559. #if MEGDNN_WITH_BENCHMARK
  560. TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F23) {
  561. #if MEGDNN_AARCH64
  562. benchmark_winograd("WINOGRAD:AARCH64_F32:1:2", handle(), 3);
  563. #else
  564. benchmark_winograd("WINOGRAD:ARMV7_F32_:1:2", handle(), 3);
  565. #endif
  566. }
  567. TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F23_4x4) {
  568. #if MEGDNN_AARCH64
  569. benchmark_winograd("WINOGRAD:AARCH64_F32_MK4_4x16:4:2", handle(), 3, 4);
  570. #else
  571. benchmark_winograd("WINOGRAD:ARMV7_F32_MK4_4x8:4:2", handle(), 3, 4);
  572. #endif
  573. }
  574. TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F63) {
  575. #if MEGDNN_AARCH64
  576. benchmark_winograd("WINOGRAD:AARCH64_F32K8X12X1:1:6", handle(), 3);
  577. #else
  578. benchmark_winograd("WINOGRAD:ARMV7_F32:1:6", handle(), 3);
  579. #endif
  580. }
  581. TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F63_4x4) {
  582. #if MEGDNN_AARCH64
  583. benchmark_winograd("WINOGRAD:AARCH64_F32_MK4_4x16:4:6", handle(), 3, 4);
  584. #else
  585. benchmark_winograd("WINOGRAD:ARMV7_F32_MK4_4x8:4:6", handle(), 3, 4);
  586. #endif
  587. }
  588. TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F54) {
  589. #if MEGDNN_AARCH64
  590. benchmark_winograd("WINOGRAD:AARCH64_F32K8X12X1:1:5", handle(), 4);
  591. #else
  592. benchmark_winograd("WINOGRAD:ARMV7_F32:1:5", handle(), 4);
  593. #endif
  594. }
  595. TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F45) {
  596. #if MEGDNN_AARCH64
  597. benchmark_winograd("WINOGRAD:AARCH64_F32K8X12X1:1:4", handle(), 5);
  598. #else
  599. benchmark_winograd("WINOGRAD:ARMV7_F32:1:4", handle(), 5);
  600. #endif
  601. }
  602. #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
  603. TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F16_F23) {
  604. #if MEGDNN_AARCH64
  605. benchmark_winograd_fp16("WINOGRAD:AARCH64_F32_MK4_4x16:4:2",
  606. "WINOGRAD:AARCH64_F16_K8X24X1:1:6", handle(), 3, 4);
  607. #else
  608. benchmark_winograd_fp16("WINOGRAD:ARMV7_F32:1:2",
  609. "WINOGRAD:AARCH32_F16_K4X16X1:1:2", handle(), 3);
  610. #endif
  611. }
  612. TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F16_F45) {
  613. #if MEGDNN_AARCH64
  614. benchmark_winograd_fp16("WINOGRAD:AARCH64_F32K8X12X1:1:4",
  615. "WINOGRAD:AARCH64_F16_K8X24X1:1:4", handle(), 5);
  616. #else
  617. benchmark_winograd_fp16("WINOGRAD:ARMV7_F32:1:4",
  618. "WINOGRAD:AARCH32_F16_K4X16X1:1:4", handle(), 5);
  619. #endif
  620. }
  621. TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F16_F63) {
  622. #if MEGDNN_AARCH64
  623. benchmark_winograd_fp16("WINOGRAD:AARCH64_F32K8X12X1:1:6",
  624. "WINOGRAD:AARCH64_F16_K8X24X1:1:6", handle(), 3);
  625. #else
  626. benchmark_winograd_fp16("WINOGRAD:ARMV7_F32:1:6",
  627. "WINOGRAD:AARCH32_F16_K4X16X1:1:6", handle(), 3);
  628. #endif
  629. }
  630. TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F16_F23_8x8) {
  631. #if MEGDNN_AARCH64
  632. benchmark_winograd_fp16("WINOGRAD:AARCH64_F32_MK4_4x16:4:2",
  633. "WINOGRAD:AARCH64_F16_MK8_8X8:8:2", handle(), 3, 8);
  634. #else
  635. benchmark_winograd_fp16("WINOGRAD:ARMV7_F32_MK4_4x8:4:2",
  636. "WINOGRAD:AARCH32_F16_MK8_4X8:8:2", handle(), 3, 8);
  637. #endif
  638. }
  639. #endif
  640. TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F23_8x8) {
  641. auto benchmark_winograd_quantized = [](const char* algo_name_fp32,
  642. const char* algo_name_quantized,
  643. Handle* handle, size_t kernel) {
  644. auto&& args = get_winograd_benchmark_args(kernel);
  645. using namespace conv_bias;
  646. constexpr size_t RUN = 10;
  647. Benchmarker<ConvBias> benchmark(handle);
  648. benchmark.set_display(false);
  649. benchmark.set_times(RUN);
  650. Benchmarker<ConvBias> benchmark_winograd(handle);
  651. benchmark_winograd.set_display(false).set_times(RUN);
  652. benchmark_winograd.set_dtype(0, dtype::QuantizedS8(2.5f))
  653. .set_dtype(1, dtype::QuantizedS8(2.5f))
  654. .set_dtype(2, dtype::QuantizedS32(6.25f))
  655. .set_dtype(4, dtype::QuantizedS8(60.25f));
  656. for (auto&& arg : args) {
  657. TensorLayout dst_layout;
  658. auto opr = handle->create_operator<ConvBias>();
  659. opr->param() = arg.param;
  660. opr->deduce_layout({arg.src, dtype::Float32()},
  661. {arg.filter, dtype::Float32()},
  662. {arg.bias, dtype::Float32()}, {}, dst_layout);
  663. //! dst.nr_elems * IC * FH * FW * 2
  664. float computations = dst_layout.total_nr_elems() * arg.filter[1] *
  665. arg.filter[2] * arg.filter[3] * 2.0 /
  666. (1024 * 1024 * 1024) * 1e3;
  667. benchmark.set_param(arg.param);
  668. auto used = algo_benchmark<ConvBias>(
  669. benchmark, {arg.src, arg.filter, {}, {}, {}},
  670. algo_name_fp32) /
  671. RUN;
  672. benchmark_winograd.set_param(arg.param);
  673. auto used_winograd =
  674. algo_benchmark<ConvBias>(benchmark_winograd,
  675. {arg.src, arg.filter, {}, {}, {}},
  676. algo_name_quantized) /
  677. RUN;
  678. printf("%s %s: normal: %f ms %f Gflops winograd: %f ms %f GFlops "
  679. "speedup: "
  680. "%f\n",
  681. arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
  682. used, computations / used, used_winograd,
  683. computations / used_winograd, used / used_winograd);
  684. }
  685. };
  686. #if MEGDNN_AARCH64
  687. benchmark_winograd_quantized("WINOGRAD:AARCH64_F32_MK4_4x16:4:2",
  688. "WINOGRAD:AARCH64_INT16X16X32_MK8_8X8:8:2",
  689. handle(), 3);
  690. #else
  691. benchmark_winograd_quantized("WINOGRAD:ARMV7_F32_MK4_4x8:4:2",
  692. "WINOGRAD:ARMV7_INT16X16X32_MK8_4X8:8:2",
  693. handle(), 3);
  694. #endif
  695. }
  696. TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_INT8_STRIDE1) {
  697. // have to remove the preferred restriction in the usable func before running the benchmark
  698. using namespace conv_bias;
  699. std::vector<TestArg> args;
  700. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  701. size_t p, NonlineMode nonline_mode) {
  702. if (w + 2 * p < kernel || h + 2 * p < kernel)
  703. return;
  704. param::ConvBias param;
  705. param.stride_h = 1;
  706. param.stride_w = 1;
  707. param.pad_h = p;
  708. param.pad_w = p;
  709. param.nonlineMode = nonline_mode;
  710. //! channel bias
  711. args.emplace_back(param, TensorShape{2, ic, h, w},
  712. TensorShape{oc, ic, kernel, kernel},
  713. TensorShape{1, oc, 1, 1});
  714. };
  715. for (size_t kernel : {2, 3, 5, 7})
  716. for (size_t ic : {1, 8, 16, 32})
  717. for (size_t oc : {1, 8, 16, 32})
  718. for (size_t p : {1})
  719. for (NonlineMode nonline_mode : {NonlineMode::RELU}) {
  720. run(oc, ic, 56, 56, kernel, p, nonline_mode);
  721. run(oc, ic, 128, 128, kernel, p, nonline_mode);
  722. run(oc, ic, 256, 256, kernel, p, nonline_mode);
  723. }
  724. constexpr size_t RUN = 50;
  725. Benchmarker<ConvBias> benchmark0(handle());
  726. benchmark0.set_dtype(0, dtype::QuantizedS8(2.5f))
  727. .set_dtype(1, dtype::QuantizedS8(2.5f))
  728. .set_dtype(2, dtype::QuantizedS32(6.25f))
  729. .set_dtype(4, dtype::QuantizedS8(60.25f));
  730. benchmark0.set_display(false);
  731. benchmark0.set_times(RUN);
  732. benchmark0.set_before_exec_callback(
  733. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("S8STRD1"));
  734. Benchmarker<ConvBias> benchmark1(handle());
  735. benchmark1.set_dtype(0, dtype::QuantizedS8(2.5f))
  736. .set_dtype(1, dtype::QuantizedS8(2.5f))
  737. .set_dtype(2, dtype::QuantizedS32(6.25f))
  738. .set_dtype(4, dtype::QuantizedS8(60.25f));
  739. benchmark1.set_display(false);
  740. benchmark1.set_times(RUN);
  741. for (auto&& arg : args) {
  742. TensorLayout dst_layout;
  743. auto opr = handle()->create_operator<ConvBias>();
  744. opr->param() = arg.param;
  745. opr->deduce_layout({arg.src, dtype::Int8()},
  746. {arg.filter, dtype::Int8()},
  747. {arg.bias, dtype::Int32()}, {}, dst_layout);
  748. //! dst.nr_elems * IC * FH * FW * 2
  749. float computations = dst_layout.total_nr_elems() * arg.filter[1] *
  750. arg.filter[2] * arg.filter[3] * 2.0 /
  751. (1024 * 1024 * 1024) * 1e3;
  752. auto used0 = benchmark0.set_param(arg.param).exec(
  753. {arg.src, arg.filter, arg.bias, {}, {}}) /
  754. RUN;
  755. auto used1 = benchmark1.set_param(arg.param).exec(
  756. {arg.src, arg.filter, arg.bias, {}, {}}) /
  757. RUN;
  758. printf("%s %s: conv_bias: %f ms %f Gflops conv_elem: %f ms %f GFlops "
  759. "speedup: %f\n",
  760. arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
  761. used0, computations / used0, used1, computations / used1,
  762. used1 / used0);
  763. }
  764. }
  765. TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_INT8_STRIDE2) {
  766. // have to remove the preferred restriction in the usable func before running the benchmark
  767. using namespace conv_bias;
  768. std::vector<TestArg> args;
  769. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  770. size_t p, NonlineMode nonline_mode) {
  771. if (w + 2 * p < kernel || h + 2 * p < kernel)
  772. return;
  773. param::ConvBias param;
  774. param.stride_h = 2;
  775. param.stride_w = 2;
  776. param.pad_h = p;
  777. param.pad_w = p;
  778. param.nonlineMode = nonline_mode;
  779. //! channel bias
  780. args.emplace_back(param, TensorShape{2, ic, h, w},
  781. TensorShape{oc, ic, kernel, kernel},
  782. TensorShape{1, oc, 1, 1});
  783. };
  784. for (size_t kernel : {2, 3, 5, 7})
  785. for (size_t ic : {1, 8, 16, 32})
  786. for (size_t oc : {1, 8, 16, 32})
  787. for (size_t p : {1})
  788. for (NonlineMode nonline_mode : {NonlineMode::RELU}) {
  789. run(oc, ic, 56, 56, kernel, p, nonline_mode);
  790. run(oc, ic, 128, 128, kernel, p, nonline_mode);
  791. run(oc, ic, 256, 256, kernel, p, nonline_mode);
  792. }
  793. constexpr size_t RUN = 50;
  794. Benchmarker<ConvBias> benchmark0(handle());
  795. benchmark0.set_dtype(0, dtype::QuantizedS8(2.5f))
  796. .set_dtype(1, dtype::QuantizedS8(2.5f))
  797. .set_dtype(2, dtype::QuantizedS32(6.25f))
  798. .set_dtype(4, dtype::QuantizedS8(60.25f));
  799. benchmark0.set_display(false);
  800. benchmark0.set_times(RUN);
  801. benchmark0.set_before_exec_callback(
  802. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("S8STRD2"));
  803. Benchmarker<ConvBias> benchmark1(handle());
  804. benchmark1.set_dtype(0, dtype::QuantizedS8(2.5f))
  805. .set_dtype(1, dtype::QuantizedS8(2.5f))
  806. .set_dtype(2, dtype::QuantizedS32(6.25f))
  807. .set_dtype(4, dtype::QuantizedS8(60.25f));
  808. benchmark1.set_display(false);
  809. benchmark1.set_times(RUN);
  810. for (auto&& arg : args) {
  811. TensorLayout dst_layout;
  812. auto opr = handle()->create_operator<ConvBias>();
  813. opr->param() = arg.param;
  814. opr->deduce_layout({arg.src, dtype::Int8()},
  815. {arg.filter, dtype::Int8()},
  816. {arg.bias, dtype::Int32()}, {}, dst_layout);
  817. //! dst.nr_elems * IC * FH * FW * 2
  818. float computations = dst_layout.total_nr_elems() * arg.filter[1] *
  819. arg.filter[2] * arg.filter[3] * 2.0 /
  820. (1024 * 1024 * 1024) * 1e3;
  821. auto used0 = benchmark0.set_param(arg.param).exec(
  822. {arg.src, arg.filter, arg.bias, {}, {}}) /
  823. RUN;
  824. auto used1 = benchmark1.set_param(arg.param).exec(
  825. {arg.src, arg.filter, arg.bias, {}, {}}) /
  826. RUN;
  827. printf("%s %s: conv_bias: %f ms %f Gflops conv_elem: %f ms %f GFlops "
  828. "speedup: %f\n",
  829. arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
  830. used0, computations / used0, used1, computations / used1,
  831. used1 / used0);
  832. }
  833. }
  834. TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_QUINT8_STRIDE1) {
  835. // have to remove the preferred restriction in the usable func before running the benchmark
  836. using namespace conv_bias;
  837. std::vector<TestArg> args;
  838. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  839. size_t p, NonlineMode nonline_mode) {
  840. if (w + 2 * p < kernel || h + 2 * p < kernel)
  841. return;
  842. param::ConvBias param;
  843. param.stride_h = 1;
  844. param.stride_w = 1;
  845. param.pad_h = p;
  846. param.pad_w = p;
  847. param.nonlineMode = nonline_mode;
  848. //! channel bias
  849. args.emplace_back(param, TensorShape{2, ic, h, w},
  850. TensorShape{oc, ic, kernel, kernel},
  851. TensorShape{1, oc, 1, 1});
  852. };
  853. for (size_t kernel : {2, 3, 5, 7})
  854. for (size_t ic : {1, 8, 16, 32})
  855. for (size_t oc : {1, 8, 16, 32})
  856. for (size_t p : {1})
  857. for (NonlineMode nonline_mode : {NonlineMode::RELU}) {
  858. run(oc, ic, 56, 56, kernel, p, nonline_mode);
  859. run(oc, ic, 128, 128, kernel, p, nonline_mode);
  860. run(oc, ic, 256, 256, kernel, p, nonline_mode);
  861. }
  862. constexpr size_t RUN = 50;
  863. Benchmarker<ConvBias> benchmark0(handle());
  864. benchmark0
  865. .set_dtype(0,
  866. dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(100)))
  867. .set_dtype(1,
  868. dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(120)))
  869. .set_dtype(2, dtype::QuantizedS32(0.04f))
  870. .set_dtype(4,
  871. dtype::Quantized8Asymm(1.4f, static_cast<uint8_t>(110)));
  872. benchmark0.set_display(false);
  873. benchmark0.set_times(RUN);
  874. benchmark0.set_before_exec_callback(
  875. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("QU8STRD1"));
  876. Benchmarker<ConvBias> benchmark1(handle());
  877. benchmark1
  878. .set_dtype(0,
  879. dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(100)))
  880. .set_dtype(1,
  881. dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(120)))
  882. .set_dtype(2, dtype::QuantizedS32(0.04f))
  883. .set_dtype(4,
  884. dtype::Quantized8Asymm(1.4f, static_cast<uint8_t>(110)));
  885. benchmark1.set_display(false);
  886. benchmark1.set_times(RUN);
  887. for (auto&& arg : args) {
  888. TensorLayout dst_layout;
  889. auto opr = handle()->create_operator<ConvBias>();
  890. opr->param() = arg.param;
  891. opr->deduce_layout({arg.src, dtype::Int8()},
  892. {arg.filter, dtype::Int8()},
  893. {arg.bias, dtype::Int32()}, {}, dst_layout);
  894. //! dst.nr_elems * IC * FH * FW * 2
  895. float computations = dst_layout.total_nr_elems() * arg.filter[1] *
  896. arg.filter[2] * arg.filter[3] * 2.0 /
  897. (1024 * 1024 * 1024) * 1e3;
  898. auto used0 = benchmark0.set_param(arg.param).exec(
  899. {arg.src, arg.filter, arg.bias, {}, {}}) /
  900. RUN;
  901. auto used1 = benchmark1.set_param(arg.param).exec(
  902. {arg.src, arg.filter, arg.bias, {}, {}}) /
  903. RUN;
  904. printf("%s %s: conv_bias: %f ms %f Gflops conv_elem: %f ms %f GFlops "
  905. "speedup: %f\n",
  906. arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
  907. used0, computations / used0, used1, computations / used1,
  908. used1 / used0);
  909. }
  910. }
  911. TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_QUINT8_STRIDE2) {
  912. // have to remove the preferred restriction in the usable func before running the benchmark
  913. using namespace conv_bias;
  914. std::vector<TestArg> args;
  915. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  916. size_t p, NonlineMode nonline_mode) {
  917. if (w + 2 * p < kernel || h + 2 * p < kernel)
  918. return;
  919. param::ConvBias param;
  920. param.stride_h = 2;
  921. param.stride_w = 2;
  922. param.pad_h = p;
  923. param.pad_w = p;
  924. param.nonlineMode = nonline_mode;
  925. //! channel bias
  926. args.emplace_back(param, TensorShape{2, ic, h, w},
  927. TensorShape{oc, ic, kernel, kernel},
  928. TensorShape{1, oc, 1, 1});
  929. };
  930. for (size_t kernel : {2, 3, 5, 7})
  931. for (size_t ic : {1, 8, 16, 32})
  932. for (size_t oc : {1, 8, 16, 32})
  933. for (size_t p : {1})
  934. for (NonlineMode nonline_mode : {NonlineMode::RELU}) {
  935. run(oc, ic, 56, 56, kernel, p, nonline_mode);
  936. run(oc, ic, 128, 128, kernel, p, nonline_mode);
  937. run(oc, ic, 256, 256, kernel, p, nonline_mode);
  938. }
  939. constexpr size_t RUN = 50;
  940. Benchmarker<ConvBias> benchmark0(handle());
  941. benchmark0
  942. .set_dtype(0,
  943. dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(100)))
  944. .set_dtype(1,
  945. dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(120)))
  946. .set_dtype(2, dtype::QuantizedS32(0.04f))
  947. .set_dtype(4,
  948. dtype::Quantized8Asymm(1.4f, static_cast<uint8_t>(110)));
  949. benchmark0.set_display(false);
  950. benchmark0.set_times(RUN);
  951. benchmark0.set_before_exec_callback(
  952. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("QU8STRD2"));
  953. Benchmarker<ConvBias> benchmark1(handle());
  954. benchmark1
  955. .set_dtype(0,
  956. dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(100)))
  957. .set_dtype(1,
  958. dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(120)))
  959. .set_dtype(2, dtype::QuantizedS32(0.04f))
  960. .set_dtype(4,
  961. dtype::Quantized8Asymm(1.4f, static_cast<uint8_t>(110)));
  962. benchmark1.set_display(false);
  963. benchmark1.set_times(RUN);
  964. for (auto&& arg : args) {
  965. TensorLayout dst_layout;
  966. auto opr = handle()->create_operator<ConvBias>();
  967. opr->param() = arg.param;
  968. opr->deduce_layout({arg.src, dtype::Int8()},
  969. {arg.filter, dtype::Int8()},
  970. {arg.bias, dtype::Int32()}, {}, dst_layout);
  971. //! dst.nr_elems * IC * FH * FW * 2
  972. float computations = dst_layout.total_nr_elems() * arg.filter[1] *
  973. arg.filter[2] * arg.filter[3] * 2.0 /
  974. (1024 * 1024 * 1024) * 1e3;
  975. auto used0 = benchmark0.set_param(arg.param).exec(
  976. {arg.src, arg.filter, arg.bias, {}, {}}) /
  977. RUN;
  978. auto used1 = benchmark1.set_param(arg.param).exec(
  979. {arg.src, arg.filter, arg.bias, {}, {}}) /
  980. RUN;
  981. printf("%s %s: conv_bias: %f ms %f Gflops conv_elem: %f ms %f GFlops "
  982. "speedup: %f\n",
  983. arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
  984. used0, computations / used0, used1, computations / used1,
  985. used1 / used0);
  986. }
  987. }
  988. TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_QINT8_STRIDE1_NCHW44) {
  989. // have to remove the preferred restriction in the usable func before running the benchmark
  990. using namespace conv_bias;
  991. param::ConvBias param;
  992. param.stride_h = 1;
  993. param.stride_w = 1;
  994. param.pad_h = 1;
  995. param.pad_w = 1;
  996. param.nonlineMode = NonlineMode::RELU;
  997. param.sparse = param::ConvBias::Sparse::GROUP;
  998. constexpr size_t RUN = 50;
  999. Benchmarker<ConvBias> benchmark0(handle());
  1000. benchmark0.set_dtype(0, dtype::QuantizedS8(0.2f))
  1001. .set_dtype(1, dtype::QuantizedS8(0.2f))
  1002. .set_dtype(2, dtype::QuantizedS32(0.04f))
  1003. .set_dtype(4, dtype::QuantizedS8(1.4f));
  1004. benchmark0.set_display(false);
  1005. benchmark0.set_param(param);
  1006. benchmark0.set_times(RUN);
  1007. benchmark0.set_before_exec_callback(
  1008. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  1009. "S8STRD1_LARGE_GROUP"));
  1010. auto opr = handle()->create_operator<ConvBias>();
  1011. opr->param() = param;
  1012. param.format = param::ConvBias::Format::NCHW44;
  1013. Benchmarker<ConvBias> benchmark1(handle());
  1014. benchmark1.set_dtype(0, dtype::QuantizedS8(0.2f))
  1015. .set_dtype(1, dtype::QuantizedS8(0.2f))
  1016. .set_dtype(2, dtype::QuantizedS32(0.04f))
  1017. .set_dtype(4, dtype::QuantizedS8(1.4f));
  1018. benchmark1.set_display(false);
  1019. benchmark1.set_param(param);
  1020. benchmark1.set_times(RUN);
  1021. benchmark1.set_before_exec_callback(
  1022. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  1023. "S8_CHAN_WISE_STRD1_NCHW44"));
  1024. auto run = [&](size_t group, size_t w, size_t h, size_t kernel) {
  1025. TensorLayout dst_layout;
  1026. opr->deduce_layout({{1, group * 4, h, w}, dtype::Int8()},
  1027. {{group * 4, 1, 1, kernel, kernel}, dtype::Int8()},
  1028. {{1, group * 4, 1, 1}, dtype::Int32()}, {},
  1029. dst_layout);
  1030. //! dst.nr_elems * IC * FH * FW * 2
  1031. float computations = dst_layout.total_nr_elems() * kernel * kernel *
  1032. 2.0 / (1024 * 1024 * 1024) * 1e3;
  1033. auto used0 = benchmark0.exec({{1, group * 4, h, w},
  1034. {group * 4, 1, 1, kernel, kernel},
  1035. {1, group * 4, 1, 1},
  1036. {},
  1037. {}}) /
  1038. RUN;
  1039. auto used1 = benchmark1.exec({{1, group, h, w, 4},
  1040. {group, 1, 1, kernel, kernel, 4},
  1041. {1, group, 1, 1, 4},
  1042. {},
  1043. {}}) /
  1044. RUN;
  1045. printf("group/h/w/kernel:%zu,%zu,%zu,%zu: nchw: %f ms %f Gflops "
  1046. "nchw44: "
  1047. "%f ms %f GFlops "
  1048. "speedup: %f\n",
  1049. group, h, w, kernel, used0, computations / used0, used1,
  1050. computations / used1, used0 / used1);
  1051. };
  1052. for (size_t group : {8, 16, 32, 64, 128}) {
  1053. for (size_t kernel : {2, 3, 5}) {
  1054. run(group, 112, 112, kernel);
  1055. run(group, 56, 56, kernel);
  1056. run(group, 48, 48, kernel);
  1057. run(group, 28, 28, kernel);
  1058. run(group, 14, 14, kernel);
  1059. }
  1060. }
  1061. }
  1062. #endif
  1063. #if __ARM_FEATURE_DOTPROD
  1064. #if MEGDNN_WITH_BENCHMARK
  1065. TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_INT8_STRIDE1_WITHDOTPROD) {
  1066. // have to remove the preferred restriction in the usable func before running the benchmark
  1067. using namespace conv_bias;
  1068. std::vector<TestArg> args;
  1069. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  1070. size_t p, NonlineMode nonline_mode) {
  1071. if (w + 2 * p < kernel || h + 2 * p < kernel)
  1072. return;
  1073. param::ConvBias param;
  1074. param.stride_h = 1;
  1075. param.stride_w = 1;
  1076. param.pad_h = p;
  1077. param.pad_w = p;
  1078. param.nonlineMode = nonline_mode;
  1079. //! channel bias
  1080. args.emplace_back(param, TensorShape{2, ic, h, w},
  1081. TensorShape{oc, ic, kernel, kernel},
  1082. TensorShape{1, oc, 1, 1});
  1083. };
  1084. for (size_t kernel : {2, 3, 5, 7})
  1085. for (size_t ic : {1, 8, 16, 32})
  1086. for (size_t oc : {1, 8, 16, 32})
  1087. for (size_t p : {1})
  1088. for (NonlineMode nonline_mode : {NonlineMode::RELU}) {
  1089. run(oc, ic, 56, 56, kernel, p, nonline_mode);
  1090. run(oc, ic, 128, 128, kernel, p, nonline_mode);
  1091. run(oc, ic, 256, 256, kernel, p, nonline_mode);
  1092. }
  1093. constexpr size_t RUN = 50;
  1094. Benchmarker<ConvBias> benchmark0(handle());
  1095. benchmark0.set_dtype(0, dtype::QuantizedS8(2.5f))
  1096. .set_dtype(1, dtype::QuantizedS8(2.5f))
  1097. .set_dtype(2, dtype::QuantizedS32(6.25f))
  1098. .set_dtype(4, dtype::QuantizedS8(60.25f));
  1099. benchmark0.set_display(false);
  1100. benchmark0.set_times(RUN);
  1101. benchmark0.set_before_exec_callback(
  1102. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("ARMDOTS8STRD1"));
  1103. Benchmarker<ConvBias> benchmark1(handle());
  1104. benchmark1.set_dtype(0, dtype::QuantizedS8(2.5f))
  1105. .set_dtype(1, dtype::QuantizedS8(2.5f))
  1106. .set_dtype(2, dtype::QuantizedS32(6.25f))
  1107. .set_dtype(4, dtype::QuantizedS8(60.25f));
  1108. benchmark1.set_display(false);
  1109. benchmark1.set_times(RUN);
  1110. for (auto&& arg : args) {
  1111. TensorLayout dst_layout;
  1112. auto opr = handle()->create_operator<ConvBias>();
  1113. opr->param() = arg.param;
  1114. opr->deduce_layout({arg.src, dtype::Int8()},
  1115. {arg.filter, dtype::Int8()},
  1116. {arg.bias, dtype::Int32()}, {}, dst_layout);
  1117. //! dst.nr_elems * IC * FH * FW * 2
  1118. float computations = dst_layout.total_nr_elems() * arg.filter[1] *
  1119. arg.filter[2] * arg.filter[3] * 2.0 /
  1120. (1024 * 1024 * 1024) * 1e3;
  1121. auto used0 = benchmark0.set_param(arg.param).exec(
  1122. {arg.src, arg.filter, arg.bias, {}, {}}) /
  1123. RUN;
  1124. auto used1 = benchmark1.set_param(arg.param).exec(
  1125. {arg.src, arg.filter, arg.bias, {}, {}}) /
  1126. RUN;
  1127. printf("%s %s: conv_bias: %f ms %f Gflops conv_elem: %f ms %f GFlops "
  1128. "speedup: %f\n",
  1129. arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
  1130. used0, computations / used0, used1, computations / used1,
  1131. used1 / used0);
  1132. }
  1133. }
  1134. TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_INT8_STRIDE2_WITHDOTPROD) {
  1135. // have to remove the preferred restriction in the usable func before running the benchmark
  1136. using namespace conv_bias;
  1137. std::vector<TestArg> args;
  1138. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  1139. size_t p, NonlineMode nonline_mode) {
  1140. if (w + 2 * p < kernel || h + 2 * p < kernel)
  1141. return;
  1142. param::ConvBias param;
  1143. param.stride_h = 2;
  1144. param.stride_w = 2;
  1145. param.pad_h = p;
  1146. param.pad_w = p;
  1147. param.nonlineMode = nonline_mode;
  1148. //! channel bias
  1149. args.emplace_back(param, TensorShape{2, ic, h, w},
  1150. TensorShape{oc, ic, kernel, kernel},
  1151. TensorShape{1, oc, 1, 1});
  1152. };
  1153. for (size_t kernel : {2, 3, 5, 7})
  1154. for (size_t ic : {1, 8, 16, 32})
  1155. for (size_t oc : {1, 8, 16, 32})
  1156. for (size_t p : {1})
  1157. for (NonlineMode nonline_mode : {NonlineMode::RELU}) {
  1158. run(oc, ic, 56, 56, kernel, p, nonline_mode);
  1159. run(oc, ic, 128, 128, kernel, p, nonline_mode);
  1160. run(oc, ic, 256, 256, kernel, p, nonline_mode);
  1161. }
  1162. constexpr size_t RUN = 50;
  1163. Benchmarker<ConvBias> benchmark0(handle());
  1164. benchmark0.set_dtype(0, dtype::QuantizedS8(2.5f))
  1165. .set_dtype(1, dtype::QuantizedS8(2.5f))
  1166. .set_dtype(2, dtype::QuantizedS32(6.25f))
  1167. .set_dtype(4, dtype::QuantizedS8(60.25f));
  1168. benchmark0.set_display(false);
  1169. benchmark0.set_times(RUN);
  1170. benchmark0.set_before_exec_callback(
  1171. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("ARMDOTS8STRD2"));
  1172. Benchmarker<ConvBias> benchmark1(handle());
  1173. benchmark1.set_dtype(0, dtype::QuantizedS8(2.5f))
  1174. .set_dtype(1, dtype::QuantizedS8(2.5f))
  1175. .set_dtype(2, dtype::QuantizedS32(6.25f))
  1176. .set_dtype(4, dtype::QuantizedS8(60.25f));
  1177. benchmark1.set_display(false);
  1178. benchmark1.set_times(RUN);
  1179. for (auto&& arg : args) {
  1180. TensorLayout dst_layout;
  1181. auto opr = handle()->create_operator<ConvBias>();
  1182. opr->param() = arg.param;
  1183. opr->deduce_layout({arg.src, dtype::Int8()},
  1184. {arg.filter, dtype::Int8()},
  1185. {arg.bias, dtype::Int32()}, {}, dst_layout);
  1186. //! dst.nr_elems * IC * FH * FW * 2
  1187. float computations = dst_layout.total_nr_elems() * arg.filter[1] *
  1188. arg.filter[2] * arg.filter[3] * 2.0 /
  1189. (1024 * 1024 * 1024) * 1e3;
  1190. auto used0 = benchmark0.set_param(arg.param).exec(
  1191. {arg.src, arg.filter, arg.bias, {}, {}}) /
  1192. RUN;
  1193. auto used1 = benchmark1.set_param(arg.param).exec(
  1194. {arg.src, arg.filter, arg.bias, {}, {}}) /
  1195. RUN;
  1196. printf("%s %s: conv_bias: %f ms %f Gflops conv_elem: %f ms %f GFlops "
  1197. "speedup: %f\n",
  1198. arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
  1199. used0, computations / used0, used1, computations / used1,
  1200. used1 / used0);
  1201. }
  1202. }
  1203. TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_QUINT8_STRIDE1_WITHDOTPROD) {
  1204. // have to remove the preferred restriction in the usable func before running the benchmark
  1205. using namespace conv_bias;
  1206. std::vector<TestArg> args;
  1207. auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  1208. size_t p, NonlineMode nonline_mode) {
  1209. if (w + 2 * p < kernel || h + 2 * p < kernel)
  1210. return;
  1211. param::ConvBias param;
  1212. param.stride_h = 1;
  1213. param.stride_w = 1;
  1214. param.pad_h = p;
  1215. param.pad_w = p;
  1216. param.nonlineMode = nonline_mode;
  1217. //! channel bias
  1218. args.emplace_back(param, TensorShape{2, ic, h, w},
  1219. TensorShape{oc, ic, kernel, kernel},
  1220. TensorShape{1, oc, 1, 1});
  1221. };
  1222. // clang-format off
  1223. for (size_t kernel : {2, 3, 5, 7})
  1224. for (size_t ic : {1, 8, 16, 32})
  1225. for (size_t oc : {1, 8, 16, 32})
  1226. for (size_t p : {1})
  1227. for (NonlineMode nonline_mode : {NonlineMode::RELU}) {
  1228. run(oc, ic, 56, 56, kernel, p, nonline_mode);
  1229. run(oc, ic, 128, 128, kernel, p, nonline_mode);
  1230. run(oc, ic, 256, 256, kernel, p, nonline_mode);
  1231. }
  1232. // clang-format on
  1233. constexpr size_t RUN = 50;
  1234. Benchmarker<ConvBias> benchmark0(handle());
  1235. benchmark0
  1236. .set_dtype(0,
  1237. dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(100)))
  1238. .set_dtype(1,
  1239. dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(120)))
  1240. .set_dtype(2, dtype::QuantizedS32(0.04f))
  1241. .set_dtype(4,
  1242. dtype::Quantized8Asymm(1.4f, static_cast<uint8_t>(110)));
  1243. benchmark0.set_display(false);
  1244. benchmark0.set_times(RUN);
  1245. benchmark0.set_before_exec_callback(
  1246. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("ARMDOTU8STRD1"));
  1247. Benchmarker<ConvBias> benchmark1(handle());
  1248. benchmark1
  1249. .set_dtype(0,
  1250. dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(100)))
  1251. .set_dtype(1,
  1252. dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(120)))
  1253. .set_dtype(2, dtype::QuantizedS32(0.04f))
  1254. .set_dtype(4,
  1255. dtype::Quantized8Asymm(1.4f, static_cast<uint8_t>(110)));
  1256. benchmark1.set_display(false);
  1257. benchmark1.set_times(RUN);
  1258. for (auto&& arg : args) {
  1259. TensorLayout dst_layout;
  1260. auto opr = handle()->create_operator<ConvBias>();
  1261. opr->param() = arg.param;
  1262. opr->deduce_layout({arg.src, dtype::Int8()},
  1263. {arg.filter, dtype::Int8()},
  1264. {arg.bias, dtype::Int32()}, {}, dst_layout);
  1265. //! dst.nr_elems * IC * FH * FW * 2
  1266. float computations = dst_layout.total_nr_elems() * arg.filter[1] *
  1267. arg.filter[2] * arg.filter[3] * 2.0 /
  1268. (1024 * 1024 * 1024) * 1e3;
  1269. auto used0 = benchmark0.set_param(arg.param).exec(
  1270. {arg.src, arg.filter, arg.bias, {}, {}}) /
  1271. RUN;
  1272. auto used1 = benchmark1.set_param(arg.param).exec(
  1273. {arg.src, arg.filter, arg.bias, {}, {}}) /
  1274. RUN;
  1275. printf("%s %s: conv_bias: %f ms %f Gflops conv_elem: %f ms %f GFlops "
  1276. "speedup: %f\n",
  1277. arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
  1278. used0, computations / used0, used1, computations / used1,
  1279. used1 / used0);
  1280. }
  1281. }
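//! Note on the quantized dtype wiring above: the indices follow the ConvBias
//! tensor order (0 = src, 1 = filter, 2 = bias, 4 = dst; 3 is the optional z
//! input). The int32 bias scale is the product of the src and filter scales,
//! e.g. 0.2f * 0.2f = 0.04f here and 2.5f * 2.5f = 6.25f in the QuantizedS8
//! benchmarks, which keeps the bias on the same scale as the accumulator.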
TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_QUINT8_STRIDE2_WITHDOTPROD) {
    // the preferred restriction in the usable() check has to be removed before
    // running this benchmark
    using namespace conv_bias;
    std::vector<TestArg> args;
    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
                   size_t p, NonlineMode nonline_mode) {
        if (w + 2 * p < kernel || h + 2 * p < kernel)
            return;
        param::ConvBias param;
        param.stride_h = 2;
        param.stride_w = 2;
        param.pad_h = p;
        param.pad_w = p;
        param.nonlineMode = nonline_mode;
        //! channel bias
        args.emplace_back(param, TensorShape{2, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel},
                          TensorShape{1, oc, 1, 1});
    };
    // clang-format off
    for (size_t kernel : {2, 3, 5, 7})
    for (size_t ic : {1, 8, 16, 32})
    for (size_t oc : {1, 8, 16, 32})
    for (size_t p : {1})
    for (NonlineMode nonline_mode : {NonlineMode::RELU}) {
        run(oc, ic, 56, 56, kernel, p, nonline_mode);
        run(oc, ic, 128, 128, kernel, p, nonline_mode);
        run(oc, ic, 256, 256, kernel, p, nonline_mode);
    }
    // clang-format on
    constexpr size_t RUN = 50;
    Benchmarker<ConvBias> benchmark0(handle());
    benchmark0
            .set_dtype(0,
                       dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(100)))
            .set_dtype(1,
                       dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(120)))
            .set_dtype(2, dtype::QuantizedS32(0.04f))
            .set_dtype(4,
                       dtype::Quantized8Asymm(1.4f, static_cast<uint8_t>(110)));
    benchmark0.set_display(false);
    benchmark0.set_times(RUN);
    benchmark0.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("ARMDOTU8STRD2"));
    Benchmarker<ConvBias> benchmark1(handle());
    benchmark1
            .set_dtype(0,
                       dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(100)))
            .set_dtype(1,
                       dtype::Quantized8Asymm(0.2f, static_cast<uint8_t>(120)))
            .set_dtype(2, dtype::QuantizedS32(0.04f))
            .set_dtype(4,
                       dtype::Quantized8Asymm(1.4f, static_cast<uint8_t>(110)));
    benchmark1.set_display(false);
    benchmark1.set_times(RUN);
    for (auto&& arg : args) {
        TensorLayout dst_layout;
        auto opr = handle()->create_operator<ConvBias>();
        opr->param() = arg.param;
        opr->deduce_layout({arg.src, dtype::Int8()},
                           {arg.filter, dtype::Int8()},
                           {arg.bias, dtype::Int32()}, {}, dst_layout);
        //! dst.nr_elems * IC * FH * FW * 2
        float computations = dst_layout.total_nr_elems() * arg.filter[1] *
                             arg.filter[2] * arg.filter[3] * 2.0 /
                             (1024 * 1024 * 1024) * 1e3;
        auto used0 = benchmark0.set_param(arg.param).exec(
                             {arg.src, arg.filter, arg.bias, {}, {}}) /
                     RUN;
        auto used1 = benchmark1.set_param(arg.param).exec(
                             {arg.src, arg.filter, arg.bias, {}, {}}) /
                     RUN;
        printf("%s %s: conv_bias: %f ms %f Gflops conv_elem: %f ms %f GFlops "
               "speedup: %f\n",
               arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
               used0, computations / used0, used1, computations / used1,
               used1 / used0);
    }
}
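//! In the dot-product benchmarks above, benchmark0 installs a
//! ConvBiasAlgoChecker, so the run is only valid if the named algorithm
//! ("ARMDOTS8STRD2", "ARMDOTU8STRD1", "ARMDOTU8STRD2") is actually selected,
//! while benchmark1 is left to the default heuristic; the printed speedup
//! therefore compares the dot-product kernel against whatever algorithm the
//! heuristic would pick for the same shapes.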
#endif
#endif
/*====================== BENCHMARK CONV1X1 ===========================*/
#if MEGDNN_WITH_BENCHMARK
namespace {
std::vector<conv_bias::TestArg> get_conv_bias_1x1_benchmark_args(
        size_t pack_size = 1) {
    using namespace conv_bias;
    std::vector<TestArg> args;
    param::ConvBias param;
    param.stride_h = 1;
    param.stride_w = 1;
    param.pad_h = 0;
    param.pad_w = 0;
    param.nonlineMode = param::ConvBias::NonlineMode::IDENTITY;
    auto bench_case = [&](size_t OC, size_t IC, size_t H, size_t W) {
        if (pack_size == 1)
            args.emplace_back(param, TensorShape{1, IC, H, W},
                              TensorShape{OC, IC, 1, 1}, TensorShape{});
        else {
            if (pack_size == 4)
                param.format = param::ConvBias::Format::NCHW44;
            args.emplace_back(param,
                              TensorShape{1, IC / pack_size, H, W, pack_size},
                              TensorShape{OC / pack_size, IC / pack_size, 1, 1,
                                          pack_size, pack_size},
                              TensorShape{});
        }
    };
    //! MobileNetV1
    bench_case(64, 32, 112, 112);
    bench_case(128, 64, 56, 56);
    bench_case(128, 128, 56, 56);
    bench_case(256, 128, 28, 28);
    bench_case(256, 256, 28, 28);
    bench_case(512, 256, 14, 14);
    bench_case(512, 512, 14, 14);
    bench_case(1024, 512, 7, 7);
    bench_case(1024, 1024, 7, 7);
    //! MobileNetV2
    bench_case(16, 32, 112, 112);
    bench_case(96, 16, 112, 112);
    bench_case(144, 24, 56, 56);
    bench_case(192, 32, 28, 28);
    bench_case(384, 64, 28, 28);
    bench_case(576, 96, 14, 14);
    bench_case(960, 160, 7, 7);
    bench_case(320, 960, 7, 7);
    bench_case(1280, 320, 7, 7);
    //! MobileNetV3-Large
    bench_case(64, 16, 112, 112);
    bench_case(72, 24, 56, 56);
    bench_case(120, 40, 28, 28);
    bench_case(240, 40, 28, 28);
    bench_case(200, 80, 14, 14);
    bench_case(184, 80, 14, 14);
    bench_case(480, 80, 14, 14);
    bench_case(672, 112, 14, 14);
    //! MobileNetV3-Small
    bench_case(72, 16, 56, 56);
    bench_case(88, 24, 28, 28);
    bench_case(96, 24, 28, 28);
    bench_case(240, 40, 14, 14);
    bench_case(120, 40, 14, 14);
    bench_case(144, 48, 14, 14);
    bench_case(288, 48, 14, 14);
    bench_case(576, 96, 7, 7);
    //! resnet50
    bench_case(256, 64, 56, 56);
    bench_case(512, 128, 28, 28);
    bench_case(1024, 256, 14, 14);
    bench_case(2048, 512, 7, 7);
    return args;
}
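//! When pack_size == 4, bench_case above emits the same workloads in NCHW44
//! layout: src becomes {N, IC/4, H, W, 4} and the filter
//! {OC/4, IC/4, 1, 1, 4, 4}, so entries at the same index in the NCHW and
//! NCHW44 argument lists describe the same 1x1 convolution.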
void benchmark_conv1x1(const char* matmul_algo_name, Handle* handle,
                       DType stype, DType matmul_dtype, DType bias_type,
                       DType conv_dtype) {
    using namespace conv_bias;
    std::vector<TestArg> conv_bias_1x1_args =
            get_conv_bias_1x1_benchmark_args();
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    param.transposeA = false;
    param.transposeB = false;
    Benchmarker<MatrixMul> benchmark_matmul(handle);
    benchmark_matmul.set_before_exec_callback(
            AlgoChecker<MatrixMul>(matmul_algo_name));
    benchmark_matmul.set_times(RUNS)
            .set_dtype(0, stype)
            .set_dtype(1, stype)
            .set_dtype(2, matmul_dtype)
            .set_param(param)
            .set_display(false);
    std::string conv1x1_algo_name = ssprintf("CONV1x1:%s:24", matmul_algo_name);
    Benchmarker<ConvBias> benchmark_conv1x1(handle);
    benchmark_conv1x1.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBias>(
                    conv1x1_algo_name.c_str()));
    benchmark_conv1x1.set_times(RUNS)
            .set_dtype(0, stype)
            .set_dtype(1, stype)
            .set_dtype(2, bias_type)
            .set_dtype(4, conv_dtype)
            .set_display(false);
    for (auto&& arg : conv_bias_1x1_args) {
        size_t IC = arg.src[1];
        size_t OH = arg.src[2];
        size_t OW = arg.src[3];
        size_t OC = arg.filter[0];
        size_t M = OC;
        size_t K = IC;
        size_t N = OH * OW;
        float computations = M * N * K * 2.f / (1024 * 1024 * 1024) * 1e3;
        TensorShape A, B;
        A = TensorShape{M, K};
        B = TensorShape{K, N};
        auto conv1x1_used = benchmark_conv1x1.set_param(arg.param).exec(
                                    {arg.src, arg.filter, arg.bias, {}, {}}) /
                            RUNS;
        auto matmul_used = benchmark_matmul.exec({A, B, {}}) / RUNS;
        printf("\n%s: ", matmul_algo_name);
        printf("%s %s:\n matmul: %f ms %f Gflops\nconv1x1: %f ms %f GFlops "
               "speedup: "
               "%f\n",
               arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
               matmul_used, computations / matmul_used, conv1x1_used,
               computations / conv1x1_used, matmul_used / conv1x1_used);
    }
}
}  // namespace
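//! A 1x1 convolution with stride 1 and no padding is a plain GEMM with
//! M = OC, K = IC and N = OH * OW; benchmark_conv1x1() above therefore times
//! the bare matmul on {M, K} x {K, N} against the CONV1x1 algorithm on the
//! equivalent convolution shapes and reports matmul_used / conv1x1_used as
//! the speedup.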
TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_CONV1X1_S1_F32) {
#if MEGDNN_AARCH64
    benchmark_conv1x1("AARCH64_F32K8X12X1", handle(), dtype::Float32{},
                      dtype::Float32{}, dtype::Float32{}, dtype::Float32{});
#else
    benchmark_conv1x1("ARMV7_F32", handle(), dtype::Float32{}, dtype::Float32{},
                      dtype::Float32{}, dtype::Float32{});
#endif
}
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_CONV1X1_S1_F16) {
#if MEGDNN_AARCH64
    benchmark_conv1x1("AARCH64_F16_K8X24X1", handle(), dtype::Float16{},
                      dtype::Float16{}, dtype::Float16{}, dtype::Float16{});
#else
    benchmark_conv1x1("AARCH32_F16_K4X16X1", handle(), dtype::Float16{},
                      dtype::Float16{}, dtype::Float16{}, dtype::Float16{});
#endif
}
#endif
TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_CONV1X1_S1_QUANTIZEDSYM) {
    dtype::QuantizedS8 stype(2.5f);
    dtype::QuantizedS32 dtype(6.25f);
#if MEGDNN_AARCH64
#if __ARM_FEATURE_DOTPROD
    benchmark_conv1x1("AARCH64_INT8X8X32_K8X12X4_DOTPROD", handle(), stype,
                      dtype, dtype, dtype);
#else
    benchmark_conv1x1("AARCH64_INT8X8X32_K8X8X8", handle(), stype, dtype, dtype,
                      dtype);
    benchmark_conv1x1("AARCH64_INT8X8X32_K4X4X16", handle(), stype, dtype,
                      dtype, dtype);
#endif
#elif MEGDNN_ARMV7
    benchmark_conv1x1("ARMV7_INT8X8X32_K4X8X8", handle(), stype, dtype, dtype,
                      dtype);
#endif
}
TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_CONV1X1_S1_QUANTIZEDASYM) {
    dtype::Quantized8Asymm stype(1.2f, (uint8_t)125);
    dtype::QuantizedS32 dtype(1.2 * 1.2);
#if MEGDNN_AARCH64
#if __ARM_FEATURE_DOTPROD
    benchmark_conv1x1("AARCH64_QUINT8_K8X8X4_DOTPROD", handle(), stype, dtype,
                      dtype, dtype);
#else
    benchmark_conv1x1("AARCH64_QUINT8_K8X8X8", handle(), stype, dtype, dtype,
                      dtype);
#endif
#elif MEGDNN_ARMV7
    benchmark_conv1x1("ARMV7_QUINT8_K4X8X8", handle(), stype, dtype, dtype,
                      dtype);
#endif
}
TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_CONV1X1_S1_INT8x8x16) {
#if MEGDNN_AARCH64
    benchmark_conv1x1("AARCH64_INT8X8X16_K8X8X8", handle(), dtype::Int8{},
                      dtype::Int16{}, dtype::Int16{}, dtype::Int16{});
    benchmark_conv1x1("AARCH64_INT8X8X16_K4X4X16", handle(), dtype::Int8{},
                      dtype::Int16{}, dtype::Int16{}, dtype::Int16{});
#elif MEGDNN_ARMV7
    benchmark_conv1x1("ARMV7_INT8X8X16_K4X8X8", handle(), dtype::Int8{},
                      dtype::Int16{}, dtype::Int16{}, dtype::Int16{});
    benchmark_conv1x1("ARMV7_INT8X8X16_K4X2X16", handle(), dtype::Int8{},
                      dtype::Int16{}, dtype::Int16{}, dtype::Int16{});
#endif
}
#ifndef __ARM_FEATURE_DOTPROD
TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_1X1_S1_NCHW_VS_NCHW44_INT8x8x32) {
    std::vector<TestArg> conv_bias_1x1_args_nchw44 =
            get_conv_bias_1x1_benchmark_args(4);
    std::vector<TestArg> conv_bias_1x1_args_nchw =
            get_conv_bias_1x1_benchmark_args(1);
    constexpr size_t RUNS = 50;
    Benchmarker<ConvBias> benchmark_conv1x1_nchw44(handle());
    benchmark_conv1x1_nchw44.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBias>(
                    "CONV1x1:AARCH64_INT8X8X32_MK4_4X4X16:24"));
    benchmark_conv1x1_nchw44.set_times(RUNS)
            .set_dtype(0, dtype::Int8())
            .set_dtype(1, dtype::Int8())
            .set_dtype(2, dtype::Int32())
            .set_dtype(4, dtype::Int32())
            .set_display(false);
    Benchmarker<ConvBias> benchmark_conv1x1_nchw(handle());
    benchmark_conv1x1_nchw.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBias>(
                    "CONV1x1:AARCH64_INT8X8X32_K4X4X16:24"));
    benchmark_conv1x1_nchw.set_times(RUNS)
            .set_dtype(0, dtype::Int8())
            .set_dtype(1, dtype::Int8())
            .set_dtype(2, dtype::Int32())
            .set_dtype(4, dtype::Int32())
            .set_display(false);
    for (size_t i = 0; i < conv_bias_1x1_args_nchw44.size(); ++i) {
        auto&& arg_nchw = conv_bias_1x1_args_nchw[i];
        auto&& arg_nchw44 = conv_bias_1x1_args_nchw44[i];
        size_t IC = arg_nchw.src[1];
        size_t OH = arg_nchw.src[2];
        size_t OW = arg_nchw.src[3];
        size_t OC = arg_nchw.filter[0];
        size_t M = OC;
        size_t K = IC;
        size_t N = OH * OW;
        float computations = M * N * K * 2.f / (1024 * 1024 * 1024) * 1e3;
        auto conv1x1_nchw = benchmark_conv1x1_nchw.set_param(arg_nchw.param)
                                    .exec({arg_nchw.src,
                                           arg_nchw.filter,
                                           arg_nchw.bias,
                                           {},
                                           {}}) /
                            RUNS;
        auto conv1x1_nchw44 =
                benchmark_conv1x1_nchw44.set_param(arg_nchw44.param)
                        .exec({arg_nchw44.src,
                               arg_nchw44.filter,
                               arg_nchw44.bias,
                               {},
                               {}}) /
                RUNS;
        printf("%s %s:\n conv_1x1_nchw: %f ms %f Gflops\nconv1x1_nchw44: %f ms "
               "%f GFlops "
               "speedup: "
               "%f\n",
               arg_nchw.src.to_string().c_str(),
               arg_nchw.filter.to_string().c_str(), conv1x1_nchw,
               computations / conv1x1_nchw, conv1x1_nchw44,
               computations / conv1x1_nchw44, conv1x1_nchw / conv1x1_nchw44);
    }
}
#endif
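//! The NCHW44 path in the test above is paired with the MK4-packed matmul
//! kernel (AARCH64_INT8X8X32_MK4_4X4X16) and the plain NCHW path with
//! AARCH64_INT8X8X32_K4X4X16, so the printed speedup is intended to isolate
//! the effect of the packed NCHW44 layout on an otherwise comparable 4x4x16
//! int8x8x32 kernel.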
TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_WINOGRAD_VS_IM2COL_INT8) {
    auto&& args = get_winograd_benchmark_args(3, 8);
    using namespace conv_bias;
    constexpr size_t RUN = 10;
    Benchmarker<ConvBias> benchmark_im2col(handle());
    benchmark_im2col.set_display(false);
    benchmark_im2col.set_times(RUN);
    benchmark_im2col.set_dtype(0, dtype::QuantizedS8(2.5f))
            .set_dtype(1, dtype::QuantizedS8(2.5f))
            .set_dtype(2, dtype::QuantizedS32(6.25f))
            .set_dtype(4, dtype::QuantizedS8(60.25f));
    Benchmarker<ConvBias> benchmark_winograd(handle());
    benchmark_winograd.set_display(false);
    benchmark_winograd.set_times(RUN);
    benchmark_winograd.set_dtype(0, dtype::QuantizedS8(2.5f))
            .set_dtype(1, dtype::QuantizedS8(2.5f))
            .set_dtype(2, dtype::QuantizedS32(6.25f))
            .set_dtype(4, dtype::QuantizedS8(60.25f));
    for (auto&& arg : args) {
        TensorLayout dst_layout;
        auto opr = handle()->create_operator<ConvBias>();
        opr->param() = arg.param;
        opr->deduce_layout({arg.src, dtype::Float32()},
                           {arg.filter, dtype::Float32()},
                           {arg.bias, dtype::Float32()}, {}, dst_layout);
        //! dst.nr_elems * IC * FH * FW * 2
        float computations = dst_layout.total_nr_elems() * arg.filter[1] *
                             arg.filter[2] * arg.filter[3] * 2.0 /
                             (1024 * 1024 * 1024) * 1e3;
        benchmark_im2col.set_param(arg.param);
        auto im2col_used =
                algo_benchmark<ConvBias>(
                        benchmark_im2col, {arg.src, arg.filter, {}, {}, {}},
                        "IM2COLMATMUL:AARCH64_INT8X8X32_K4X4X16") /
                RUN;
        benchmark_winograd.set_param(arg.param);
        auto winograd_used =
                algo_benchmark<ConvBias>(
                        benchmark_winograd, {arg.src, arg.filter, {}, {}, {}},
                        "WINOGRAD:AARCH64_INT16X16X32_MK8_8X8:8:2") /
                RUN;
        printf("%s %s: im2col: %f ms %f Gflops winograd: %f ms %f GFlops "
               "speedup: "
               "%f\n",
               arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
               im2col_used, computations / im2col_used, winograd_used,
               computations / winograd_used, im2col_used / winograd_used);
    }
}
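//! Unlike the CONV1X1 benchmarks, this test selects the algorithm per call via
//! algo_benchmark<ConvBias>(benchmarker, tensors, algo_name) rather than
//! installing a ConvBiasAlgoChecker on the Benchmarker up front; both
//! approaches pin the measurement to the named implementation.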
#endif
// vim: syntax=cpp.doxygen
