You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

conv_bias.cpp 47 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166
  1. #include "test/common/conv_bias.h"
  2. #include "megdnn/opr_param_defs.h"
  3. #include "megdnn/oprs.h"
  4. #include "test/common/benchmarker.h"
  5. #include "test/common/checker.h"
  6. #include "test/common/rng.h"
  7. #include "test/common/task_record_check.h"
  8. #include "test/common/tensor.h"
  9. #include "test/fallback/fixture.h"
  10. #if MEGDNN_X86
  11. #include "src/x86/utils.h"
  12. #endif
  13. namespace megdnn {
  14. namespace test {
  15. TEST_F(FALLBACK, CONV_BIAS_FORWARD) {
  16. using namespace conv_bias;
  17. std::vector<TestArg> args = get_args();
  18. Checker<ConvBiasForward> checker(handle());
  19. NormalRNG default_rng;
  20. UniformIntRNG int_rng{-50, 50};
  21. param::ConvBias param;
  22. {
  23. param.format = param::ConvBias::Format::NHWC;
  24. auto src_shape = TensorShape{2, 16, 32, 24};
  25. auto filter_shape = TensorShape{4, 3, 3, 24};
  26. auto bias_shape_channel = TensorShape{1, 1, 1, 4};
  27. checker.set_dtype(0, dtype::Float32())
  28. .set_dtype(1, dtype::Float32())
  29. .set_dtype(2, dtype::Float32())
  30. .set_rng(0, &default_rng)
  31. .set_rng(1, &default_rng)
  32. .set_rng(2, &default_rng)
  33. .set_param(param)
  34. .execs({src_shape, filter_shape, bias_shape_channel, {}, {}});
  35. }
  36. checker.set_before_exec_callback(
  37. conv_bias::ConvBiasAlgoChecker<ConvBias>("FALLBACK_NAIVE"));
  38. for (auto&& arg : args) {
  39. checker.set_dtype(0, dtype::Float32())
  40. .set_dtype(1, dtype::Float32())
  41. .set_dtype(2, dtype::Float32())
  42. .set_rng(0, &default_rng)
  43. .set_rng(1, &default_rng)
  44. .set_rng(2, &default_rng)
  45. .set_epsilon(1e-3)
  46. .set_param(arg.param)
  47. .execs({arg.src, arg.filter, arg.bias, {}, {}});
  48. }
  49. {
  50. param.format = param::ConvBias::Format::NCHW;
  51. param.sparse = ConvBias::Param::Sparse::GROUP;
  52. auto src_shape = TensorShape{2, 16, 32, 24};
  53. auto filter_shape = TensorShape{4, 4, 4, 1, 1};
  54. auto bias_shape_channel = TensorShape{1, 16, 1, 1};
  55. auto bias_shape = TensorShape{2, 16, 32, 24};
  56. checker.set_dtype(0, dtype::Float32())
  57. .set_dtype(1, dtype::Float32())
  58. .set_dtype(2, dtype::Float32())
  59. .set_rng(0, &default_rng)
  60. .set_rng(1, &default_rng)
  61. .set_rng(2, &default_rng)
  62. .set_param(param)
  63. .execs({src_shape, filter_shape, bias_shape, {}, {}})
  64. .execs({src_shape, filter_shape, bias_shape_channel, {}, {}});
  65. }
  66. }
  67. TEST_F(FALLBACK, CONV_BIAS_FORWARD_RECORD) {
  68. using namespace conv_bias;
  69. TaskRecordChecker<ConvBiasForward> checker(1);
  70. NormalRNG default_rng;
  71. UniformIntRNG int_rng{-50, 50};
  72. param::ConvBias param;
  73. {
  74. param.format = param::ConvBias::Format::NHWC;
  75. auto src_shape = TensorShape{2, 16, 32, 24};
  76. auto filter_shape = TensorShape{4, 3, 3, 24};
  77. auto bias_shape_channel = TensorShape{1, 1, 1, 4};
  78. checker.set_dtype(0, dtype::Float32())
  79. .set_dtype(1, dtype::Float32())
  80. .set_dtype(2, dtype::Float32())
  81. .set_rng(0, &default_rng)
  82. .set_rng(1, &default_rng)
  83. .set_rng(2, &default_rng)
  84. .set_param(param)
  85. .execs({src_shape, filter_shape, bias_shape_channel, {}, {}});
  86. }
  87. {
  88. param.format = param::ConvBias::Format::NCHW;
  89. param.sparse = ConvBias::Param::Sparse::GROUP;
  90. auto src_shape = TensorShape{2, 16, 32, 24};
  91. auto filter_shape = TensorShape{4, 4, 4, 1, 1};
  92. auto bias_shape_channel = TensorShape{1, 16, 1, 1};
  93. auto bias_shape = TensorShape{2, 16, 32, 24};
  94. checker.set_dtype(0, dtype::Float32())
  95. .set_dtype(1, dtype::Float32())
  96. .set_dtype(2, dtype::Float32())
  97. .set_rng(0, &default_rng)
  98. .set_rng(1, &default_rng)
  99. .set_rng(2, &default_rng)
  100. .set_param(param)
  101. .execs({src_shape, filter_shape, bias_shape, {}, {}})
  102. .execs({src_shape, filter_shape, bias_shape_channel, {}, {}});
  103. }
  104. }
  105. TEST_F(FALLBACK, FP32_GEMV_MK4_GI) {
  106. Checker<MatrixMul> checker(handle());
  107. using Param = MatrixMul::Param;
  108. checker.set_before_exec_callback(AlgoChecker<MatrixMul>("FB_GI_F32_GEMV_MK4"));
  109. checker.set_epsilon(1e-2);
  110. auto run = [&](size_t M, size_t K) {
  111. Param param;
  112. param.format = param::MatrixMul::Format::MK4;
  113. param.transposeA = false;
  114. param.transposeB = false;
  115. TensorShape A, B;
  116. A = TensorShape{M / 4, K / 4, 4, 4};
  117. B = TensorShape{K / 4, 1, 4};
  118. checker.set_param(param).execs({A, B, {}});
  119. };
  120. // N = 1
  121. for (size_t M : {4, 16, 128, 1024})
  122. for (size_t K : {4, 8, 12, 128, 256, 4096})
  123. run(M, K);
  124. }
  125. std::vector<conv_bias::TestArg> get_conv_bias_args(
  126. std::vector<size_t> kernel, std::vector<size_t> padv,
  127. std::vector<param::ConvBias::NonlineMode> nlmodev, std::vector<size_t> stridev,
  128. bool no_bias, bool only_broadbias) {
  129. using namespace conv_bias;
  130. using Param = param::ConvBias;
  131. using NLMode = param::ConvBias::NonlineMode;
  132. std::vector<TestArg> args;
  133. auto pack = [&](size_t n, size_t oc, size_t ic, size_t w, size_t h, size_t pad,
  134. size_t kernel, size_t stride, NLMode nonlinemode) {
  135. Param param;
  136. param.stride_h = stride;
  137. param.stride_w = stride;
  138. param.pad_h = pad;
  139. param.pad_w = pad;
  140. param.nonlineMode = nonlinemode;
  141. args.emplace_back(
  142. param, TensorShape{n, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
  143. TensorShape{});
  144. if (!no_bias) {
  145. args.emplace_back(
  146. param, TensorShape{n, ic, h, w},
  147. TensorShape{oc, ic, kernel, kernel}, TensorShape{1, oc, 1, 1});
  148. if (!only_broadbias) {
  149. args.emplace_back(
  150. param, TensorShape{n, ic, h, w},
  151. TensorShape{oc, ic, kernel, kernel},
  152. TensorShape{
  153. n, oc, (h + 2 * param.pad_h - kernel) / stride + 1,
  154. (w + 2 * param.pad_h - kernel) / stride + 1});
  155. }
  156. }
  157. };
  158. auto pack_group = [&](size_t n, size_t oc, size_t ic, size_t w, size_t h,
  159. size_t pad, size_t kernel, size_t stride,
  160. NLMode nonlinemode) {
  161. Param param;
  162. param.stride_h = stride;
  163. param.stride_w = stride;
  164. param.pad_h = pad;
  165. param.pad_w = pad;
  166. param.nonlineMode = nonlinemode;
  167. param.sparse = param::ConvBias::Sparse::GROUP;
  168. args.emplace_back(
  169. param, TensorShape{n, 2 * ic, h, w},
  170. TensorShape{2, oc, ic, kernel, kernel}, TensorShape{});
  171. if (!no_bias) {
  172. args.emplace_back(
  173. param, TensorShape{n, 2 * ic, h, w},
  174. TensorShape{2, oc, ic, kernel, kernel},
  175. TensorShape{1, oc * 2, 1, 1});
  176. if (!only_broadbias) {
  177. args.emplace_back(
  178. param, TensorShape{n, 2 * ic, h, w},
  179. TensorShape{2, oc, ic, kernel, kernel},
  180. TensorShape{
  181. n, 2 * oc, (h + 2 * param.pad_h - kernel) / stride + 1,
  182. (w + 2 * param.pad_h - kernel) / stride + 1});
  183. }
  184. }
  185. };
  186. for (size_t n : {1, 2}) {
  187. for (auto nlmode : nlmodev) {
  188. for (auto pad : padv) {
  189. for (auto stride : stridev) {
  190. for (size_t ic : {1, 5}) {
  191. for (size_t oc : {1, 11}) {
  192. for (size_t size : {9, 30}) {
  193. for (size_t kern : kernel) {
  194. pack(n, oc, ic, size + 4, size + 4, pad, kern,
  195. stride, nlmode);
  196. pack_group(
  197. n, oc, ic, size, size, pad, kern, stride,
  198. nlmode);
  199. }
  200. }
  201. }
  202. }
  203. }
  204. }
  205. }
  206. }
  207. return args;
  208. }
  209. void checker_conv_bias(
  210. std::vector<conv_bias::TestArg> args, Handle* handle, RNG* rng, float epsilon,
  211. DType type0, DType type1, DType type2, DType type3, const char* algo_name) {
  212. using namespace conv_bias;
  213. Checker<ConvBias> checker(handle);
  214. checker.set_before_exec_callback(
  215. conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name));
  216. checker.set_dtype(0, type0);
  217. checker.set_dtype(1, type1);
  218. checker.set_dtype(2, type2);
  219. checker.set_dtype(4, type3);
  220. checker.set_epsilon(epsilon);
  221. if (NULL != rng) {
  222. checker.set_rng(0, rng).set_rng(1, rng).set_rng(2, rng).set_rng(3, rng);
  223. }
  224. for (auto&& arg : args) {
  225. checker.set_param(arg.param).execs({arg.src, arg.filter, arg.bias, {}, {}});
  226. }
  227. }
  228. TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_GI_1X1_S1_MK4_PACK_F32) {
  229. using namespace conv_bias;
  230. std::vector<conv_bias::TestArg> args =
  231. get_nchw44_conv_bias_args({1}, FULL_NLMODE, ALL_BIASMODE, 1, true);
  232. check_conv_bias(args, handle(), "CONV1x1:FB_GI_F32_MK4_PACK_4x12:24");
  233. }
  234. TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_GI_IM2COL_S1_MK4_PACK_F32_PREPROCESS) {
  235. using namespace conv_bias;
  236. std::vector<conv_bias::TestArg> args =
  237. get_nchw44_conv_bias_args({2, 4, 7}, FULL_NLMODE, BR_AND_NO_BIASMODE, 1);
  238. #define cb(name) \
  239. check_conv_bias_preprocess( \
  240. args, handle(), nullptr, 0.001, dtype::Float32(), dtype::Float32(), \
  241. dtype::Float32(), dtype::Float32(), name);
  242. cb("IM2COLMATMUL:FB_GI_F32_MK4_PACK_4x12");
  243. #undef cb
  244. }
  245. TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_GI_IM2COL_S2_MK4_PACK_F32_FUSE_PREPROCESS) {
  246. using namespace conv_bias;
  247. std::vector<conv_bias::TestArg> args =
  248. get_nchw44_conv_bias_args({3}, FULL_NLMODE, BR_AND_BIAS_BIASMODE, 2);
  249. #define cb(name) \
  250. check_conv_bias_preprocess( \
  251. args, handle(), nullptr, 0.001, dtype::Float32(), dtype::Float32(), \
  252. dtype::Float32(), dtype::Float32(), name);
  253. cb("IM2COLMATMUL:FB_GI_F32_MK4_PACK_4x12");
  254. #undef cb
  255. }
  256. TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_GI_1X1_S1_MK4_PACK_F32_PREPROCESS) {
  257. using namespace conv_bias;
  258. std::vector<conv_bias::TestArg> args =
  259. get_nchw44_conv_bias_args({1}, FULL_NLMODE, ALL_BIASMODE, 1, true);
  260. #define cb(name) \
  261. check_conv_bias_preprocess( \
  262. args, handle(), nullptr, 0.001, dtype::Float32(), dtype::Float32(), \
  263. dtype::Float32(), dtype::Float32(), name);
  264. cb("CONV1x1:FB_GI_F32_MK4_PACK_4x12:24");
  265. #undef cb
  266. }
  267. TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_GI_IM2COL_S1_MK4_PACK_F32) {
  268. using namespace conv_bias;
  269. std::vector<conv_bias::TestArg> args =
  270. get_nchw44_conv_bias_args({2, 4, 7}, FULL_NLMODE, BR_AND_BIAS_BIASMODE, 1);
  271. check_conv_bias(args, handle(), "IM2COLMATMUL:FB_GI_F32_MK4_PACK_4x12");
  272. }
  273. TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_GI_IM2COL_S2_MK4_PACK_F32) {
  274. using namespace conv_bias;
  275. std::vector<conv_bias::TestArg> args =
  276. get_nchw44_conv_bias_args({3, 5, 6}, FULL_NLMODE, BR_AND_BIAS_BIASMODE, 2);
  277. #define cb(name) check_conv_bias(args, handle(), name);
  278. cb("IM2COLMATMUL:FB_GI_F32_MK4_PACK_4x12");
  279. #undef cb
  280. }
  281. TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_GI_IM2COL_S2_MK4_PACK_F32_FUSE) {
  282. using namespace conv_bias;
  283. std::vector<conv_bias::TestArg> args =
  284. get_nchw44_conv_bias_args({3}, FULL_NLMODE, ALL_BIASMODE, 2);
  285. #define cb(name) check_conv_bias(args, handle(), name);
  286. cb("IM2COLMATMUL:FB_GI_F32_MK4_PACK_4x12");
  287. #undef cb
  288. }
  289. TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_FORWARD_IM2COL_8X8X16) {
  290. using namespace conv_bias;
  291. param::ConvBias cur_param;
  292. using NLMode = param::ConvBias::NonlineMode;
  293. std::vector<conv_bias::TestArg> args = get_conv_bias_args(
  294. {1, 3}, {0}, {NLMode::IDENTITY, NLMode::RELU}, {1}, false, true);
  295. NormalRNG default_rng;
  296. Checker<ConvBias> checker(handle());
  297. checker.set_dtype(0, dtype::Int8{});
  298. checker.set_dtype(1, dtype::Int8{});
  299. checker.set_dtype(2, dtype::Int16{});
  300. checker.set_dtype(4, dtype::Int16{});
  301. for (auto&& arg : args) {
  302. checker.set_param(arg.param).execs({arg.src, arg.filter, arg.bias, {}, {}});
  303. }
  304. }
  305. TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_FORWARD) {
  306. using namespace conv_bias;
  307. param::ConvBias cur_param;
  308. using NLMode = param::ConvBias::NonlineMode;
  309. std::vector<conv_bias::TestArg> args = get_conv_bias_args(
  310. {1, 3, 5}, {0, 3},
  311. {NLMode::IDENTITY, NLMode::H_SWISH, NLMode::SIGMOID, NLMode::RELU}, {1, 2},
  312. false, false);
  313. NormalRNG default_rng;
  314. checker_conv_bias(
  315. args, handle(), &default_rng, 1e-3, dtype::Float32{}, dtype::Float32{},
  316. dtype::Float32{}, dtype::Float32{}, "FALLBACK_NAIVE");
  317. }
  318. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_NCHW_NCHW44_F32_S2) {
  319. check_conv_bias(
  320. conv_bias::get_nchw44_conv_bias_args(
  321. {2, 3, 5, 7}, ONLY_IDENTITY_NLMODE, ONLY_BR_BIASMODE, 2, false,
  322. true),
  323. handle(), "F32_CONV_NCHW_NCHW44");
  324. }
  325. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_NCHW_NCHW44_F32_S1) {
  326. check_conv_bias(
  327. conv_bias::get_nchw44_conv_bias_args(
  328. {2, 3, 5, 7}, ONLY_IDENTITY_NLMODE, ONLY_BR_BIASMODE, 1, false,
  329. true),
  330. handle(), "F32_CONV_NCHW_NCHW44");
  331. }
  332. std::vector<conv_bias::TestArg> get_nchw44_channel_wise_args(
  333. std::vector<size_t> kernel, size_t stride, bool no_bias, bool no_nonlinemode,
  334. bool no_full_bias) {
  335. using namespace conv_bias;
  336. using Param = param::ConvBias;
  337. using NLMode = param::ConvBias::NonlineMode;
  338. std::vector<TestArg> args;
  339. auto pack = [&](size_t n, size_t group, size_t w, size_t h, size_t kernel,
  340. size_t stride, NLMode nlmode, bool pad) {
  341. Param param;
  342. param.stride_h = stride;
  343. param.stride_w = stride;
  344. if (pad) {
  345. param.pad_h = kernel / 2;
  346. param.pad_w = kernel / 2;
  347. } else {
  348. param.pad_h = 0;
  349. param.pad_w = 0;
  350. }
  351. param.nonlineMode = nlmode;
  352. param.format = param::ConvBias::Format::NCHW44;
  353. param.sparse = param::ConvBias::Sparse::GROUP;
  354. args.emplace_back(
  355. param, TensorShape{n, group, h, w, 4},
  356. TensorShape{group, 1, 1, kernel, kernel, 4}, TensorShape{});
  357. if (!no_bias) {
  358. args.emplace_back(
  359. param, TensorShape{n, group, h, w, 4},
  360. TensorShape{group, 1, 1, kernel, kernel, 4},
  361. TensorShape{1, group, 1, 1, 4});
  362. }
  363. if (!no_full_bias) {
  364. args.emplace_back(
  365. param, TensorShape{n, group, h, w, 4},
  366. TensorShape{group, 1, 1, kernel, kernel, 4},
  367. TensorShape{
  368. n, group, (h + 2 * param.pad_w - kernel) / stride + 1,
  369. (w + 2 * param.pad_w - kernel) / stride + 1, 4});
  370. }
  371. };
  372. std::vector<NLMode> nonlinemode = {NLMode::IDENTITY};
  373. if (!no_nonlinemode) {
  374. nonlinemode.emplace_back(NLMode::RELU);
  375. nonlinemode.emplace_back(NLMode::H_SWISH);
  376. }
  377. for (size_t n : {1, 2}) {
  378. for (auto nlmode : nonlinemode) {
  379. for (bool pad : {true}) {
  380. for (size_t group : {1, 2, 4, 7, 16}) {
  381. for (size_t size : {4, 6, 7, 9, 20}) {
  382. for (size_t kern : kernel) {
  383. pack(n, group, size, size, kern, stride, nlmode, pad);
  384. }
  385. }
  386. }
  387. }
  388. for (bool pad : {false}) {
  389. for (size_t group : {1, 2, 7, 16}) {
  390. for (size_t size : {7, 9, 20}) {
  391. for (size_t kern : kernel) {
  392. pack(n, group, size, size, kern, stride, nlmode, pad);
  393. }
  394. }
  395. }
  396. }
  397. }
  398. }
  399. return args;
  400. }
  401. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_CHANNEL_WISE_STRIDE1_FP32_NCHW44_1) {
  402. check_conv_bias(
  403. get_nchw44_channel_wise_args({2, 3}, 1, false, false, false), handle(),
  404. "F32_CHANNEL_WISE_NCHW44");
  405. }
  406. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_CHANNEL_WISE_STRIDE1_FP32_NCHW44_2) {
  407. check_conv_bias(
  408. get_nchw44_channel_wise_args({5}, 1, false, false, false), handle(),
  409. "F32_CHANNEL_WISE_NCHW44");
  410. }
  411. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_CHANNEL_WISE_STRIDE2_FP32_NCHW44) {
  412. check_conv_bias(
  413. get_nchw44_channel_wise_args({2, 3, 5}, 2, false, false, false), handle(),
  414. "F32_CHANNEL_WISE_NCHW44");
  415. }
  416. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_DIRECT_FP32_NCHW44_S1_K7) {
  417. //! k=7 s=1
  418. check_conv_bias(
  419. conv_bias::get_nchw44_conv_bias_args(
  420. {7}, ONLY_IDENTITY_NLMODE, BR_AND_NO_BIASMODE, 1),
  421. handle(), "F32_CONV_NCHW44_DIRECT");
  422. }
  423. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_DIRECT_FP32_NCHW44_S1_K2K3) {
  424. check_conv_bias(
  425. conv_bias::get_nchw44_conv_bias_args(
  426. {2, 3}, FULL_NLMODE, ONLY_BR_BIASMODE, 1),
  427. handle(), "F32_CONV_NCHW44_DIRECT");
  428. }
  429. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_DIRECT_FP32_NCHW44_S1_K5) {
  430. check_conv_bias(
  431. conv_bias::get_nchw44_conv_bias_args({5}, FULL_NLMODE, ONLY_BR_BIASMODE, 1),
  432. handle(), "F32_CONV_NCHW44_DIRECT");
  433. }
  434. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_DIRECT_FP32_NCHW44_S2) {
  435. check_conv_bias(
  436. conv_bias::get_nchw44_conv_bias_args(
  437. {2, 3, 5, 7}, FULL_NLMODE, ONLY_BR_BIASMODE, 2),
  438. handle(), "F32_CONV_NCHW44_DIRECT");
  439. }
  440. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_DIRECT_FP32) {
  441. check_conv_bias(
  442. conv_bias::get_conv_bias_args(
  443. {1, 2, 3, 4, 5, 6, 7}, 1, false, false, false),
  444. handle(), "F32DIRECT");
  445. }
  446. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_DIRECT_FP32_STR2) {
  447. check_conv_bias(
  448. conv_bias::get_conv_bias_args({2, 3, 5, 7}, 2, false, false, false),
  449. handle(), "F32STRD2");
  450. }
  451. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_DIRECT_FP32_STR1) {
  452. check_conv_bias(
  453. conv_bias::get_conv_bias_args({2, 3, 5, 7}, 1, false, false, false),
  454. handle(), "F32STRD1");
  455. }
  456. TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_PREPROCESS_NCHW44) {
  457. using namespace conv_bias;
  458. std::vector<TestArg> nchw44_args = conv_bias::get_nchw44_conv_bias_args(
  459. {3}, QUAN_NLMODE, BR_AND_NO_BIASMODE, 1);
  460. Checker<ConvBiasForward> checker(handle());
  461. auto run = [&checker](
  462. const std::vector<TestArg>& args, DType A_dtype, DType B_dtype,
  463. DType C_dtype, DType D_dtype, const float eps) {
  464. for (auto&& arg : args) {
  465. checker.set_dtype(0, A_dtype)
  466. .set_dtype(1, B_dtype)
  467. .set_dtype(2, C_dtype)
  468. .set_dtype(4, D_dtype)
  469. .set_epsilon(eps)
  470. .set_param(arg.param)
  471. .execs({arg.src, arg.filter, arg.bias, {}, {}});
  472. }
  473. };
  474. //! uncomment this when low precision mode is ok
  475. // run(handle(), nchw44_args, {2, 6, 7}, dtype::Float32(), dtype::Float32(),
  476. // dtype::Float32(), dtype::Float32(), 1e-2f);
  477. //! remove this when low precision mode is ok
  478. run(nchw44_args, dtype::Float32(), dtype::Float32(), dtype::Float32(),
  479. dtype::Float32(), 1e-3f);
  480. }
  481. TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_FORWARD_QUANTIZED) {
  482. using namespace conv_bias;
  483. param::ConvBias cur_param;
  484. using NLMode = param::ConvBias::NonlineMode;
  485. std::vector<conv_bias::TestArg> args = get_conv_bias_args(
  486. {1, 3, 5, 7}, {0, 3}, {NLMode::IDENTITY, NLMode::H_SWISH, NLMode::RELU},
  487. {1, 2}, false, false);
  488. UniformIntRNG int_rng{-50, 50};
  489. float epsilon = 1e-3;
  490. checker_conv_bias(
  491. args, handle(), &int_rng, epsilon, dtype::QuantizedS8(2.5f),
  492. dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f),
  493. dtype::QuantizedS8(60.25f), "FALLBACK_NAIVE");
  494. }
  495. #if MEGDNN_WITH_BENCHMARK
  496. namespace {
  497. void benchmark_impl(
  498. const param::ConvBias param,
  499. std::vector<std::pair<SmallVector<TensorShape>, float>>& shapes_and_computation,
  500. const std::string algo_name, size_t RUNS,
  501. TaskExecutorConfig&& multi_thread_config,
  502. TaskExecutorConfig&& single_thread_config, std::vector<DType>& data_type) {
  503. std::vector<float> multi_thread_times, single_thread_times;
  504. {
  505. auto multi_thread_hanle = create_cpu_handle(0, true, &multi_thread_config);
  506. auto benchmarker = Benchmarker<ConvBias>(multi_thread_hanle.get());
  507. benchmarker.set_times(RUNS)
  508. .set_display(false)
  509. .set_param(param)
  510. .set_dtype(0, data_type[0])
  511. .set_dtype(1, data_type[1])
  512. .set_dtype(2, data_type[2])
  513. .set_dtype(4, data_type[3])
  514. .set_before_exec_callback(
  515. conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name.c_str()));
  516. for (auto shape : shapes_and_computation) {
  517. multi_thread_times.push_back(benchmarker.exec(shape.first) / RUNS);
  518. }
  519. }
  520. {
  521. auto single_thread_handle = create_cpu_handle(0, true, &single_thread_config);
  522. auto benchmarker = Benchmarker<ConvBias>(single_thread_handle.get());
  523. benchmarker.set_times(RUNS)
  524. .set_display(false)
  525. .set_param(param)
  526. .set_dtype(0, data_type[0])
  527. .set_dtype(1, data_type[1])
  528. .set_dtype(2, data_type[2])
  529. .set_dtype(4, data_type[3])
  530. .set_before_exec_callback(
  531. conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name.c_str()));
  532. for (auto shape : shapes_and_computation) {
  533. single_thread_times.push_back(benchmarker.exec(shape.first) / RUNS);
  534. }
  535. }
  536. printf("Benchmark : Multi threads %zu, ", multi_thread_config.nr_thread);
  537. printf("core_ids:");
  538. for (size_t i = 0; i < multi_thread_config.affinity_core_set.size(); i++) {
  539. printf("%zu ", multi_thread_config.affinity_core_set[i]);
  540. }
  541. printf(", Single thread core_id %zu\n", single_thread_config.affinity_core_set[0]);
  542. for (size_t i = 0; i < shapes_and_computation.size(); i++) {
  543. auto shapes = shapes_and_computation[i];
  544. printf("Bench case: ");
  545. for (auto&& shape : shapes.first) {
  546. printf("%s ", shape.to_string().c_str());
  547. }
  548. float computations = shapes.second;
  549. printf("%zu threads gflops: %f,\n single thread gflops: "
  550. "%f. spead up = %f, speedup/cores=%f\n",
  551. multi_thread_config.nr_thread, computations / multi_thread_times[i],
  552. computations / single_thread_times[i],
  553. single_thread_times[i] / multi_thread_times[i],
  554. single_thread_times[i] / multi_thread_times[i] /
  555. multi_thread_config.nr_thread);
  556. }
  557. }
  558. } // namespace
TEST_F(FALLBACK_MULTI_THREADS, BENCHMARK_GI_CONVBIAS_DIRECTF32) {
    //! Benchmark the F32DIRECT algo on grouped 3x3 s1 p1 RELU conv_bias:
    //! first many-group cases, then group == 1, each against three thread
    //! configurations via benchmark_impl.
    constexpr size_t RUNS = 50;
    param::ConvBias param;
    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
    param.pad_h = 1;
    param.pad_w = 1;
    param.stride_h = 1;
    param.stride_w = 1;
    param.sparse = param::ConvBias::Sparse::GROUP;
    std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
    //! Queue one case: shapes are {src, filter, bias, z-placeholder, dst}.
    //! Cost = (2 * IC/group * FS^2 + 1) per output element, scaled by 1e-6 —
    //! presumably Mflops so that Mflops/ms prints as gflops in benchmark_impl;
    //! confirm against Benchmarker::exec's time unit.
    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
                          size_t group) {
        SmallVector<TensorShape> shapes{
                {N, IC, H, W},
                {group, OC / group, IC / group, FS, FS},
                {1, OC, 1, 1},
                {},
                {N, OC, H, W}};
        TensorShape dst{N, OC, H, W};
        float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
                              dst.total_nr_elems()) *
                             1e-6;
        shapes_and_computation.push_back(std::make_pair(shapes, computations));
    };
    //! large-group cases (group 4 and 32)
    bench_case(1, 32, 32, 200, 200, 3, 4);
    bench_case(1, 32, 32, 200, 200, 3, 32);
    bench_case(1, 32, 32, 128, 128, 3, 4);
    bench_case(1, 32, 32, 128, 128, 3, 32);
    bench_case(1, 32, 32, 100, 100, 3, 4);
    bench_case(1, 32, 32, 100, 100, 3, 32);
    bench_case(1, 32, 32, 80, 80, 3, 4);
    bench_case(1, 32, 32, 80, 80, 3, 32);
    std::string algo_name = "F32DIRECT";
    printf("Benchmark F32DIRECT_LARGE_GROUP algo\n");
    std::vector<DType> data_type = {
            dtype::Float32(), dtype::Float32(), dtype::Float32(), dtype::Float32()};
    //! 4 threads vs 1 thread, on two different core sets, then 2 vs 1
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
    shapes_and_computation.clear();
    //! small-group cases (group == 1) with the same algo
    algo_name = "F32DIRECT";
    printf("Benchmark F32DIRECT_SMALL_GROUP algo\n");
    bench_case(1, 32, 32, 200, 200, 3, 1);
    bench_case(1, 32, 32, 128, 128, 3, 1);
    bench_case(1, 32, 32, 100, 100, 3, 1);
    bench_case(1, 32, 32, 80, 80, 3, 1);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
}
TEST_F(FALLBACK_MULTI_THREADS, BENCHMARK_GI_CONVBIAS_DIRECTF32_STR1) {
    //! Benchmark the F32STRD1 algo; identical shape set and thread configs as
    //! BENCHMARK_GI_CONVBIAS_DIRECTF32, only the algorithm name differs.
    constexpr size_t RUNS = 50;
    param::ConvBias param;
    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
    param.pad_h = 1;
    param.pad_w = 1;
    param.stride_h = 1;
    param.stride_w = 1;
    param.sparse = param::ConvBias::Sparse::GROUP;
    std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
    //! Queue one case: shapes are {src, filter, bias, z-placeholder, dst};
    //! cost formula matches the DIRECTF32 benchmark above (scaled by 1e-6).
    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
                          size_t group) {
        SmallVector<TensorShape> shapes{
                {N, IC, H, W},
                {group, OC / group, IC / group, FS, FS},
                {1, OC, 1, 1},
                {},
                {N, OC, H, W}};
        TensorShape dst{N, OC, H, W};
        float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
                              dst.total_nr_elems()) *
                             1e-6;
        shapes_and_computation.push_back(std::make_pair(shapes, computations));
    };
    //! large-group cases (group 4 and 32)
    bench_case(1, 32, 32, 200, 200, 3, 4);
    bench_case(1, 32, 32, 200, 200, 3, 32);
    bench_case(1, 32, 32, 128, 128, 3, 4);
    bench_case(1, 32, 32, 128, 128, 3, 32);
    bench_case(1, 32, 32, 100, 100, 3, 4);
    bench_case(1, 32, 32, 100, 100, 3, 32);
    bench_case(1, 32, 32, 80, 80, 3, 4);
    bench_case(1, 32, 32, 80, 80, 3, 32);
    std::string algo_name = "F32STRD1";
    printf("Benchmark F32STRD1_LARGE_GROUP algo\n");
    std::vector<DType> data_type = {
            dtype::Float32(), dtype::Float32(), dtype::Float32(), dtype::Float32()};
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
    shapes_and_computation.clear();
    //! small-group cases (group == 1)
    algo_name = "F32STRD1";
    printf("Benchmark F32STRD1_SMALL_GROUP algo\n");
    bench_case(1, 32, 32, 200, 200, 3, 1);
    bench_case(1, 32, 32, 128, 128, 3, 1);
    bench_case(1, 32, 32, 100, 100, 3, 1);
    bench_case(1, 32, 32, 80, 80, 3, 1);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
            data_type);
    benchmark_impl(
            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
            data_type);
}
  683. TEST_F(FALLBACK_MULTI_THREADS, BENCHMARK_GI_CONVBIAS_DIRECTF32_STR2) {
  684. constexpr size_t RUNS = 50;
  685. param::ConvBias param;
  686. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  687. param.pad_h = 1;
  688. param.pad_w = 1;
  689. param.stride_h = 2;
  690. param.stride_w = 2;
  691. param.sparse = param::ConvBias::Sparse::GROUP;
  692. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  693. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  694. size_t group, size_t P, size_t S) {
  695. SmallVector<TensorShape> shapes{
  696. {N, IC, H, W},
  697. {group, OC / group, IC / group, FS, FS},
  698. {1, OC, 1, 1},
  699. {},
  700. {N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1}};
  701. TensorShape dst{N, OC, H, W};
  702. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  703. dst.total_nr_elems()) *
  704. 1e-6;
  705. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  706. };
  707. bench_case(1, 32, 32, 200, 200, 3, 4, 1, 2);
  708. bench_case(1, 32, 32, 200, 200, 3, 32, 1, 2);
  709. bench_case(1, 32, 32, 128, 128, 3, 4, 1, 2);
  710. bench_case(1, 32, 32, 128, 128, 3, 32, 1, 2);
  711. bench_case(1, 32, 32, 100, 100, 3, 4, 1, 2);
  712. bench_case(1, 32, 32, 100, 100, 3, 32, 1, 2);
  713. bench_case(1, 32, 32, 80, 80, 3, 4, 1, 2);
  714. bench_case(1, 32, 32, 80, 80, 3, 32, 1, 2);
  715. std::string algo_name = "F32STRD2";
  716. printf("Benchmark F32STRD2_LARGE_GROUP algo\n");
  717. std::vector<DType> data_type = {
  718. dtype::Float32(), dtype::Float32(), dtype::Float32(), dtype::Float32()};
  719. benchmark_impl(
  720. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  721. data_type);
  722. benchmark_impl(
  723. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  724. data_type);
  725. benchmark_impl(
  726. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  727. data_type);
  728. shapes_and_computation.clear();
  729. algo_name = "F32STRD2";
  730. printf("Benchmark F32STRD2_SMALL_GROUP algo\n");
  731. bench_case(1, 32, 32, 200, 200, 3, 1, 1, 2);
  732. bench_case(1, 32, 32, 128, 128, 3, 1, 1, 2);
  733. bench_case(1, 32, 32, 100, 100, 3, 1, 1, 2);
  734. bench_case(1, 32, 32, 80, 80, 3, 1, 1, 2);
  735. benchmark_impl(
  736. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  737. data_type);
  738. benchmark_impl(
  739. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  740. data_type);
  741. benchmark_impl(
  742. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  743. data_type);
  744. }
  745. TEST_F(FALLBACK, BENCHMARK_GI_CHANNEL_WISE_F32_STRIDE1_NCHW44) {
  746. // have to remove preferred restrict in usable func before run the benchmark
  747. using namespace conv_bias;
  748. param::ConvBias param;
  749. param.stride_h = 1;
  750. param.stride_w = 1;
  751. param.pad_h = 1;
  752. param.pad_w = 1;
  753. param.nonlineMode = NonlineMode::RELU;
  754. param.sparse = param::ConvBias::Sparse::GROUP;
  755. constexpr size_t RUN = 50;
  756. Benchmarker<ConvBias> benchmark0(handle());
  757. benchmark0.set_display(false);
  758. benchmark0.set_param(param);
  759. benchmark0.set_times(RUN);
  760. benchmark0.set_before_exec_callback(
  761. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("F32STRD1"));
  762. auto opr = handle()->create_operator<ConvBias>();
  763. opr->param() = param;
  764. param.format = param::ConvBias::Format::NCHW44;
  765. Benchmarker<ConvBias> benchmark1(handle());
  766. benchmark1.set_display(false);
  767. benchmark1.set_param(param);
  768. benchmark1.set_times(RUN);
  769. benchmark1.set_before_exec_callback(
  770. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("F32_CHANNEL_WISE_NCHW44"));
  771. auto run = [&](size_t group, size_t w, size_t h, size_t kernel) {
  772. TensorLayout dst_layout;
  773. opr->deduce_layout(
  774. {{1, group * 4, h, w}, dtype::Int8()},
  775. {{group * 4, 1, 1, kernel, kernel}, dtype::Int8()},
  776. {{1, group * 4, 1, 1}, dtype::Int32()}, {}, dst_layout);
  777. //! dst.nr_elems * IC * FH * FW * 2
  778. float computations = dst_layout.total_nr_elems() * kernel * kernel * 2.0 /
  779. (1024 * 1024 * 1024) * 1e3;
  780. auto used0 = benchmark0.exec(
  781. {{1, group * 4, h, w},
  782. {group * 4, 1, 1, kernel, kernel},
  783. {1, group * 4, 1, 1},
  784. {},
  785. {}}) /
  786. RUN;
  787. auto used1 = benchmark1.exec(
  788. {{1, group, h, w, 4},
  789. {group, 1, 1, kernel, kernel, 4},
  790. {1, group, 1, 1, 4},
  791. {},
  792. {}}) /
  793. RUN;
  794. printf("group/h/w/kernel:%zu,%zu,%zu,%zu: nchw: %f ms %f Gflops "
  795. "nchw44: "
  796. "%f ms %f GFlops "
  797. "speedup: %f\n",
  798. group, h, w, kernel, used0, computations / used0, used1,
  799. computations / used1, used0 / used1);
  800. };
  801. for (size_t group : {8, 16, 32, 64}) {
  802. for (size_t kerenl : {2, 3, 5}) {
  803. run(group, 112, 112, kerenl);
  804. run(group, 56, 56, kerenl);
  805. run(group, 48, 48, kerenl);
  806. run(group, 28, 28, kerenl);
  807. run(group, 14, 14, kerenl);
  808. }
  809. }
  810. run(8, 112, 112, 3);
  811. run(32, 56, 56, 3);
  812. run(64, 28, 28, 3);
  813. run(128, 14, 14, 3);
  814. }
  815. TEST_F(FALLBACK, BENCHMARK_GI_CHANNEL_WISE_F32_STRIDE2_NCHW44) {
  816. // have to remove preferred restrict in usable func before run the benchmark
  817. using namespace conv_bias;
  818. param::ConvBias param;
  819. param.stride_h = 2;
  820. param.stride_w = 2;
  821. param.pad_h = 1;
  822. param.pad_w = 1;
  823. param.nonlineMode = NonlineMode::RELU;
  824. param.sparse = param::ConvBias::Sparse::GROUP;
  825. constexpr size_t RUN = 50;
  826. Benchmarker<ConvBias> benchmark0(handle());
  827. benchmark0.set_display(false);
  828. benchmark0.set_param(param);
  829. benchmark0.set_times(RUN);
  830. benchmark0.set_before_exec_callback(
  831. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("F32STRD2"));
  832. auto opr = handle()->create_operator<ConvBias>();
  833. opr->param() = param;
  834. param.format = param::ConvBias::Format::NCHW44;
  835. Benchmarker<ConvBias> benchmark1(handle());
  836. benchmark1.set_display(false);
  837. benchmark1.set_param(param);
  838. benchmark1.set_times(RUN);
  839. benchmark1.set_before_exec_callback(
  840. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("F32_CHANNEL_WISE_NCHW44"));
  841. auto run = [&](size_t group, size_t w, size_t h, size_t kernel) {
  842. TensorLayout dst_layout;
  843. opr->deduce_layout(
  844. {{1, group * 4, h, w}, dtype::Int8()},
  845. {{group * 4, 1, 1, kernel, kernel}, dtype::Int8()},
  846. {{1, group * 4, 1, 1}, dtype::Int32()}, {}, dst_layout);
  847. //! dst.nr_elems * IC * FH * FW * 2
  848. float computations = dst_layout.total_nr_elems() * kernel * kernel * 2.0 /
  849. (1024 * 1024 * 1024) * 1e3;
  850. auto used0 = benchmark0.exec(
  851. {{1, group * 4, h, w},
  852. {group * 4, 1, 1, kernel, kernel},
  853. {1, group * 4, 1, 1},
  854. {},
  855. {}}) /
  856. RUN;
  857. auto used1 = benchmark1.exec(
  858. {{1, group, h, w, 4},
  859. {group, 1, 1, kernel, kernel, 4},
  860. {1, group, 1, 1, 4},
  861. {},
  862. {}}) /
  863. RUN;
  864. printf("group/h/w/kernel:%zu,%zu,%zu,%zu: nchw: %f ms %f Gflops "
  865. "nchw44: "
  866. "%f ms %f GFlops "
  867. "speedup: %f\n",
  868. group, h, w, kernel, used0, computations / used0, used1,
  869. computations / used1, used0 / used1);
  870. };
  871. for (size_t group : {8, 16, 32, 64}) {
  872. for (size_t kerenl : {2, 3, 5}) {
  873. run(group, 112, 112, kerenl);
  874. run(group, 56, 56, kerenl);
  875. run(group, 48, 48, kerenl);
  876. run(group, 28, 28, kerenl);
  877. run(group, 14, 14, kerenl);
  878. }
  879. }
  880. run(8, 112, 112, 3);
  881. run(32, 56, 56, 3);
  882. run(64, 28, 28, 3);
  883. run(128, 14, 14, 3);
  884. }
  885. TEST_F(FALLBACK, BENCHMARK_CONVBIAS) {
  886. constexpr size_t RUNS = 10;
  887. param::ConvBias param;
  888. param.stride_h = 1;
  889. param.stride_w = 1;
  890. Benchmarker<ConvBias> benchmarker_int(handle());
  891. benchmarker_int.set_times(RUNS)
  892. .set_dtype(0, dtype::QuantizedS8(2.5f))
  893. .set_dtype(1, dtype::QuantizedS8(2.5f))
  894. .set_dtype(2, dtype::QuantizedS32(6.25f))
  895. .set_dtype(4, dtype::QuantizedS8(40.25f))
  896. .set_display(false);
  897. Benchmarker<ConvBias> benchmarker_float(handle());
  898. benchmarker_float.set_display(false).set_times(RUNS);
  899. auto run = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS) {
  900. TensorShape src({N, IC, H, W}), filter({OC, IC, FS, FS}), bias({N, OC, 1, 1}),
  901. z({}), dst({N, OC, H, W});
  902. param.pad_h = FS / 2;
  903. param.pad_w = FS / 2;
  904. auto int_used =
  905. benchmarker_int.set_param(param).exec({src, filter, bias, z, dst}) /
  906. RUNS;
  907. auto float_used =
  908. benchmarker_float.set_param(param).exec({src, filter, bias, z, dst}) /
  909. RUNS;
  910. float computations = IC * (FS * FS + 1) * dst.total_nr_elems() * 2 * 1e-6;
  911. printf("run: %s %s %s->%s \nfloat: %f ms %f Gflops int: %f ms "
  912. "%f Gflops speedup: %f\n",
  913. src.to_string().c_str(), filter.to_string().c_str(),
  914. bias.to_string().c_str(), dst.to_string().c_str(), float_used,
  915. computations / float_used, int_used, computations / int_used,
  916. float_used / int_used);
  917. };
  918. run(1, 128, 128, 32, 32, 3);
  919. for (size_t IC : {32, 64, 128}) {
  920. for (size_t OC : {32, 64, 128}) {
  921. for (size_t size : {28, 56}) {
  922. for (size_t FS : {3, 5}) {
  923. run(1, IC, OC, size, size, FS);
  924. }
  925. }
  926. }
  927. }
  928. }
TEST_F(FALLBACK, BENCHMARK_GI_CONVBIAS_WINOGRAD_F23_4x4) {
    //! Benchmark a winograd conv_bias algo for 3x3 filters (args: filter size
    //! 3, pack size 4), selecting the matmul backend per target architecture.
    //! The ":4:2" suffix presumably encodes pack-size 4 / output-tile 2 --
    //! confirm against the winograd algo-name parser.
#if MEGDNN_AARCH64
    conv_bias::benchmark_winograd("WINOGRAD:AARCH64_F32_MK4_4x16:4:2", handle(), 3, 4);
#elif MEGDNN_ARMV7
    conv_bias::benchmark_winograd("WINOGRAD:ARMV7_F32_MK4_4x8:4:2", handle(), 3, 4);
#else
    //! non-ARM fallback backend (FB_GI = fallback generic intrinsics)
    conv_bias::benchmark_winograd("WINOGRAD:FB_GI_F32_MK4_4x8:4:2", handle(), 3, 4);
#endif
}
  938. void benchmark_winograd_nchw_vs_nchw44(
  939. const char* algo_name0, const char* algo_name1, Handle* handle) {
  940. using namespace conv_bias;
  941. using NLMode = param::ConvBias::NonlineMode;
  942. std::vector<conv_bias::TestArg> args_nchw44;
  943. std::vector<conv_bias::TestArg> args_nchw;
  944. auto pack = [&](size_t n, size_t oc, size_t ic, size_t h, size_t w, size_t group,
  945. NLMode nlmode) {
  946. param::ConvBias param;
  947. param.format = param::ConvBias::Format::NCHW44;
  948. param.stride_h = 1;
  949. param.stride_w = 1;
  950. param.pad_h = 1;
  951. param.pad_w = 1;
  952. param.nonlineMode = nlmode;
  953. if (group == 1) {
  954. param.sparse = param::ConvBias::Sparse::DENSE;
  955. args_nchw44.emplace_back(
  956. param, TensorShape{n, ic / 4, h, w, 4},
  957. TensorShape{oc / 4, ic / 4, 3, 3, 4, 4}, TensorShape{});
  958. param.format = param::ConvBias::Format::NCHW;
  959. args_nchw.emplace_back(
  960. param, TensorShape{n, ic, h, w}, TensorShape{oc, ic, 3, 3},
  961. TensorShape{});
  962. } else {
  963. auto oc_per_group = oc / group;
  964. auto ic_per_group = ic / group;
  965. param.sparse = param::ConvBias::Sparse::GROUP;
  966. args_nchw44.emplace_back(
  967. param, TensorShape{n, ic_per_group / 4, h, w, 4},
  968. TensorShape{group, oc_per_group / 4, ic_per_group / 4, 3, 3, 4, 4},
  969. TensorShape{});
  970. param.format = param::ConvBias::Format::NCHW;
  971. args_nchw.emplace_back(
  972. param, TensorShape{n, ic, h, w},
  973. TensorShape{group, oc_per_group, ic_per_group, 3, 3},
  974. TensorShape{});
  975. }
  976. };
  977. std::vector<NLMode> nonlinemode = {NLMode::IDENTITY};
  978. for (auto nlmode : nonlinemode)
  979. for (size_t n : {1})
  980. for (size_t group = 1; group <= 1; ++group) {
  981. pack(n, 512, 512, 15, 15, group, nlmode);
  982. pack(n, 512, 256, 15, 15, group, nlmode);
  983. pack(n, 256, 256, 29, 29, group, nlmode);
  984. pack(n, 256, 128, 29, 29, group, nlmode);
  985. pack(n, 128, 128, 57, 57, group, nlmode);
  986. pack(n, 128, 64, 57, 57, group, nlmode);
  987. pack(n, 24, 24, 224, 224, group, nlmode);
  988. pack(n, 64, 24, 123, 123, group, nlmode);
  989. pack(n, 64, 64, 56, 56, group, nlmode);
  990. pack(n, 128, 128, 28, 28, group, nlmode);
  991. pack(n, 256, 256, 14, 14, group, nlmode);
  992. pack(n, 512, 512, 7, 7, group, nlmode);
  993. }
  994. using namespace conv_bias;
  995. constexpr size_t RUN = 10;
  996. Benchmarker<ConvBias> benchmark_winograd_nchw(handle);
  997. benchmark_winograd_nchw.set_display(false);
  998. benchmark_winograd_nchw.set_times(RUN);
  999. Benchmarker<ConvBias> benchmark_winograd_nchw44(handle);
  1000. benchmark_winograd_nchw44.set_display(false);
  1001. benchmark_winograd_nchw44.set_times(RUN);
  1002. std::string winograd_nchw_algo_name = ssprintf("WINOGRAD:%s", algo_name0);
  1003. std::string winograd_nchw44_algo_name = ssprintf("WINOGRAD_NCHW44:%s", algo_name1);
  1004. for (size_t i = 0; i < args_nchw.size(); ++i) {
  1005. auto arg_nchw = args_nchw[i];
  1006. auto arg_nchw44 = args_nchw44[i];
  1007. TensorLayout dst_layout;
  1008. auto opr = handle->create_operator<ConvBias>();
  1009. opr->param() = arg_nchw.param;
  1010. opr->deduce_layout(
  1011. {arg_nchw.src, dtype::Float32()}, {arg_nchw.filter, dtype::Float32()},
  1012. {arg_nchw.bias, dtype::Float32()}, {}, dst_layout);
  1013. //! dst.nr_elems * IC * FH * FW * 2
  1014. float computations = dst_layout.total_nr_elems() * arg_nchw.filter[1] *
  1015. arg_nchw.filter[2] * arg_nchw.filter[3] * 2.0 /
  1016. (1024 * 1024 * 1024) * 1e3;
  1017. benchmark_winograd_nchw.set_param(arg_nchw.param);
  1018. auto nchw_used = algo_benchmark<ConvBias>(
  1019. benchmark_winograd_nchw,
  1020. {arg_nchw.src, arg_nchw.filter, {}, {}, {}},
  1021. winograd_nchw_algo_name.c_str()) /
  1022. RUN;
  1023. benchmark_winograd_nchw44.set_param(arg_nchw44.param);
  1024. auto nchw44_used = algo_benchmark<ConvBias>(
  1025. benchmark_winograd_nchw44,
  1026. {arg_nchw44.src, arg_nchw44.filter, {}, {}, {}},
  1027. winograd_nchw44_algo_name.c_str()) /
  1028. RUN;
  1029. printf("%s %s: nchw: %f ms %f Gflops nchw44: %f ms %f GFlops "
  1030. "speedup: "
  1031. "%f\n",
  1032. arg_nchw.src.to_string().c_str(), arg_nchw.filter.to_string().c_str(),
  1033. nchw_used, computations / nchw_used, nchw44_used,
  1034. computations / nchw44_used, nchw_used / nchw44_used);
  1035. }
  1036. }
TEST_F(FALLBACK, BENCHMARK_GI_CONVBIAS_WINOGRAD_F23_MK4_NCHW_VS_NCHW44) {
    //! Compare NCHW vs NCHW44 winograd using the same per-arch matmul backend
    //! for both layouts; the ":4:2" suffix matches the F23 variant above.
#if MEGDNN_AARCH64
    benchmark_winograd_nchw_vs_nchw44(
            "AARCH64_F32_MK4_4x16:4:2", "AARCH64_F32_MK4_4x16:4:2", handle());
#elif MEGDNN_ARMV7
    benchmark_winograd_nchw_vs_nchw44(
            "ARMV7_F32_MK4_4x8:4:2", "ARMV7_F32_MK4_4x8:4:2", handle());
#else
    //! non-ARM fallback backend (FB_GI = fallback generic intrinsics)
    benchmark_winograd_nchw_vs_nchw44(
            "FB_GI_F32_MK4_4x8:4:2", "FB_GI_F32_MK4_4x8:4:2", handle());
#endif
}
TEST_F(FALLBACK, BENCHMARK_GI_CONVBIAS_WINOGRAD_F63_4x4) {
    //! Same as the F23 benchmark above but with the ":4:6" algo suffix
    //! (presumably output-tile 6 -- confirm against the algo-name parser);
    //! backend chosen per target architecture.
#if MEGDNN_AARCH64
    conv_bias::benchmark_winograd("WINOGRAD:AARCH64_F32_MK4_4x16:4:6", handle(), 3, 4);
#elif MEGDNN_ARMV7
    conv_bias::benchmark_winograd("WINOGRAD:ARMV7_F32_MK4_4x8:4:6", handle(), 3, 4);
#else
    //! non-ARM fallback backend (FB_GI = fallback generic intrinsics)
    conv_bias::benchmark_winograd("WINOGRAD:FB_GI_F32_MK4_4x8:4:6", handle(), 3, 4);
#endif
}
TEST_F(FALLBACK, BENCHMARK_GI_CONVBIAS_WINOGRAD_F63_MK4_NCHW_VS_NCHW44) {
    //! NCHW vs NCHW44 comparison for the ":4:6" winograd variant; same
    //! per-arch backend selection as the F23 comparison above.
#if MEGDNN_AARCH64
    benchmark_winograd_nchw_vs_nchw44(
            "AARCH64_F32_MK4_4x16:4:6", "AARCH64_F32_MK4_4x16:4:6", handle());
#elif MEGDNN_ARMV7
    benchmark_winograd_nchw_vs_nchw44(
            "ARMV7_F32_MK4_4x8:4:6", "ARMV7_F32_MK4_4x8:4:6", handle());
#else
    //! non-ARM fallback backend (FB_GI = fallback generic intrinsics)
    benchmark_winograd_nchw_vs_nchw44(
            "FB_GI_F32_MK4_4x8:4:6", "FB_GI_F32_MK4_4x8:4:6", handle());
#endif
}
  1070. #endif
  1071. } // namespace test
  1072. } // namespace megdnn
  1073. // vim: syntax=cpp.doxygen