You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

conv_bias.cpp 54 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331
  1. #include "test/common/conv_bias.h"
  2. #include "megdnn/opr_param_defs.h"
  3. #include "megdnn/oprs.h"
  4. #include "test/common/benchmarker.h"
  5. #include "test/common/checker.h"
  6. #include "test/common/rng.h"
  7. #include "test/common/task_record_check.h"
  8. #include "test/common/tensor.h"
  9. #include "test/fallback/fixture.h"
  10. #if MEGDNN_X86
  11. #include "src/x86/utils.h"
  12. #endif
  13. namespace megdnn {
  14. namespace test {
  15. TEST_F(FALLBACK, CONV_BIAS_FORWARD) {
  16. using namespace conv_bias;
  17. std::vector<TestArg> args = get_args();
  18. Checker<ConvBiasForward> checker(handle());
  19. NormalRNG default_rng;
  20. UniformIntRNG int_rng{-50, 50};
  21. param::ConvBias param;
  22. {
  23. param.format = param::ConvBias::Format::NHWC;
  24. auto src_shape = TensorShape{2, 16, 32, 24};
  25. auto filter_shape = TensorShape{4, 3, 3, 24};
  26. auto bias_shape_channel = TensorShape{1, 1, 1, 4};
  27. checker.set_dtype(0, dtype::Float32())
  28. .set_dtype(1, dtype::Float32())
  29. .set_dtype(2, dtype::Float32())
  30. .set_rng(0, &default_rng)
  31. .set_rng(1, &default_rng)
  32. .set_rng(2, &default_rng)
  33. .set_param(param)
  34. .execs({src_shape, filter_shape, bias_shape_channel, {}, {}});
  35. }
  36. checker.set_before_exec_callback(
  37. conv_bias::ConvBiasAlgoChecker<ConvBias>("FALLBACK_NAIVE"));
  38. for (auto&& arg : args) {
  39. checker.set_dtype(0, dtype::Float32())
  40. .set_dtype(1, dtype::Float32())
  41. .set_dtype(2, dtype::Float32())
  42. .set_rng(0, &default_rng)
  43. .set_rng(1, &default_rng)
  44. .set_rng(2, &default_rng)
  45. .set_epsilon(1e-3)
  46. .set_param(arg.param)
  47. .execs({arg.src, arg.filter, arg.bias, {}, {}});
  48. }
  49. {
  50. param.format = param::ConvBias::Format::NCHW;
  51. param.sparse = ConvBias::Param::Sparse::GROUP;
  52. auto src_shape = TensorShape{2, 16, 32, 24};
  53. auto filter_shape = TensorShape{4, 4, 4, 1, 1};
  54. auto bias_shape_channel = TensorShape{1, 16, 1, 1};
  55. auto bias_shape = TensorShape{2, 16, 32, 24};
  56. checker.set_dtype(0, dtype::Float32())
  57. .set_dtype(1, dtype::Float32())
  58. .set_dtype(2, dtype::Float32())
  59. .set_rng(0, &default_rng)
  60. .set_rng(1, &default_rng)
  61. .set_rng(2, &default_rng)
  62. .set_param(param)
  63. .execs({src_shape, filter_shape, bias_shape, {}, {}})
  64. .execs({src_shape, filter_shape, bias_shape_channel, {}, {}});
  65. }
  66. }
  67. TEST_F(FALLBACK, CONV_BIAS_FORWARD_RECORD) {
  68. using namespace conv_bias;
  69. TaskRecordChecker<ConvBiasForward> checker(1);
  70. NormalRNG default_rng;
  71. UniformIntRNG int_rng{-50, 50};
  72. param::ConvBias param;
  73. {
  74. param.format = param::ConvBias::Format::NHWC;
  75. auto src_shape = TensorShape{2, 16, 32, 24};
  76. auto filter_shape = TensorShape{4, 3, 3, 24};
  77. auto bias_shape_channel = TensorShape{1, 1, 1, 4};
  78. checker.set_dtype(0, dtype::Float32())
  79. .set_dtype(1, dtype::Float32())
  80. .set_dtype(2, dtype::Float32())
  81. .set_rng(0, &default_rng)
  82. .set_rng(1, &default_rng)
  83. .set_rng(2, &default_rng)
  84. .set_param(param)
  85. .execs({src_shape, filter_shape, bias_shape_channel, {}, {}});
  86. }
  87. {
  88. param.format = param::ConvBias::Format::NCHW;
  89. param.sparse = ConvBias::Param::Sparse::GROUP;
  90. auto src_shape = TensorShape{2, 16, 32, 24};
  91. auto filter_shape = TensorShape{4, 4, 4, 1, 1};
  92. auto bias_shape_channel = TensorShape{1, 16, 1, 1};
  93. auto bias_shape = TensorShape{2, 16, 32, 24};
  94. checker.set_dtype(0, dtype::Float32())
  95. .set_dtype(1, dtype::Float32())
  96. .set_dtype(2, dtype::Float32())
  97. .set_rng(0, &default_rng)
  98. .set_rng(1, &default_rng)
  99. .set_rng(2, &default_rng)
  100. .set_param(param)
  101. .execs({src_shape, filter_shape, bias_shape, {}, {}})
  102. .execs({src_shape, filter_shape, bias_shape_channel, {}, {}});
  103. }
  104. }
  105. TEST_F(FALLBACK, FP32_GEMV_MK4_GI) {
  106. Checker<MatrixMul> checker(handle());
  107. using Param = MatrixMul::Param;
  108. checker.set_before_exec_callback(AlgoChecker<MatrixMul>("FB_GI_F32_GEMV_MK4"));
  109. checker.set_epsilon(1e-2);
  110. auto run = [&](size_t M, size_t K) {
  111. Param param;
  112. param.format = param::MatrixMul::Format::MK4;
  113. param.transposeA = false;
  114. param.transposeB = false;
  115. TensorShape A, B;
  116. A = TensorShape{M / 4, K / 4, 4, 4};
  117. B = TensorShape{K / 4, 1, 4};
  118. checker.set_param(param).execs({A, B, {}});
  119. };
  120. // N = 1
  121. for (size_t M : {4, 16, 128, 1024})
  122. for (size_t K : {4, 8, 12, 128, 256, 4096})
  123. run(M, K);
  124. }
  125. std::vector<conv_bias::TestArg> get_conv_bias_args(
  126. std::vector<size_t> kernel, std::vector<size_t> padv,
  127. std::vector<param::ConvBias::NonlineMode> nlmodev, std::vector<size_t> stridev,
  128. bool no_bias, bool only_broadbias) {
  129. using namespace conv_bias;
  130. using Param = param::ConvBias;
  131. using NLMode = param::ConvBias::NonlineMode;
  132. std::vector<TestArg> args;
  133. auto pack = [&](size_t n, size_t oc, size_t ic, size_t w, size_t h, size_t pad,
  134. size_t kernel, size_t stride, NLMode nonlinemode) {
  135. Param param;
  136. param.stride_h = stride;
  137. param.stride_w = stride;
  138. param.pad_h = pad;
  139. param.pad_w = pad;
  140. param.nonlineMode = nonlinemode;
  141. args.emplace_back(
  142. param, TensorShape{n, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
  143. TensorShape{});
  144. if (!no_bias) {
  145. args.emplace_back(
  146. param, TensorShape{n, ic, h, w},
  147. TensorShape{oc, ic, kernel, kernel}, TensorShape{1, oc, 1, 1});
  148. if (!only_broadbias) {
  149. args.emplace_back(
  150. param, TensorShape{n, ic, h, w},
  151. TensorShape{oc, ic, kernel, kernel},
  152. TensorShape{
  153. n, oc, (h + 2 * param.pad_h - kernel) / stride + 1,
  154. (w + 2 * param.pad_h - kernel) / stride + 1});
  155. }
  156. }
  157. };
  158. auto pack_group = [&](size_t n, size_t oc, size_t ic, size_t w, size_t h,
  159. size_t pad, size_t kernel, size_t stride,
  160. NLMode nonlinemode) {
  161. Param param;
  162. param.stride_h = stride;
  163. param.stride_w = stride;
  164. param.pad_h = pad;
  165. param.pad_w = pad;
  166. param.nonlineMode = nonlinemode;
  167. param.sparse = param::ConvBias::Sparse::GROUP;
  168. args.emplace_back(
  169. param, TensorShape{n, 2 * ic, h, w},
  170. TensorShape{2, oc, ic, kernel, kernel}, TensorShape{});
  171. if (!no_bias) {
  172. args.emplace_back(
  173. param, TensorShape{n, 2 * ic, h, w},
  174. TensorShape{2, oc, ic, kernel, kernel},
  175. TensorShape{1, oc * 2, 1, 1});
  176. if (!only_broadbias) {
  177. args.emplace_back(
  178. param, TensorShape{n, 2 * ic, h, w},
  179. TensorShape{2, oc, ic, kernel, kernel},
  180. TensorShape{
  181. n, 2 * oc, (h + 2 * param.pad_h - kernel) / stride + 1,
  182. (w + 2 * param.pad_h - kernel) / stride + 1});
  183. }
  184. }
  185. };
  186. for (size_t n : {1, 2}) {
  187. for (auto nlmode : nlmodev) {
  188. for (auto pad : padv) {
  189. for (auto stride : stridev) {
  190. for (size_t ic : {1, 5}) {
  191. for (size_t oc : {1, 11}) {
  192. for (size_t size : {9, 30}) {
  193. for (size_t kern : kernel) {
  194. pack(n, oc, ic, size + 4, size + 4, pad, kern,
  195. stride, nlmode);
  196. pack_group(
  197. n, oc, ic, size, size, pad, kern, stride,
  198. nlmode);
  199. }
  200. }
  201. }
  202. }
  203. }
  204. }
  205. }
  206. }
  207. return args;
  208. }
  209. void checker_conv_bias(
  210. std::vector<conv_bias::TestArg> args, Handle* handle, RNG* rng, float epsilon,
  211. DType type0, DType type1, DType type2, DType type3, const char* algo_name) {
  212. using namespace conv_bias;
  213. Checker<ConvBias> checker(handle);
  214. checker.set_before_exec_callback(
  215. conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name));
  216. checker.set_dtype(0, type0);
  217. checker.set_dtype(1, type1);
  218. checker.set_dtype(2, type2);
  219. checker.set_dtype(4, type3);
  220. checker.set_epsilon(epsilon);
  221. if (NULL != rng) {
  222. checker.set_rng(0, rng).set_rng(1, rng).set_rng(2, rng).set_rng(3, rng);
  223. }
  224. for (auto&& arg : args) {
  225. checker.set_param(arg.param).execs({arg.src, arg.filter, arg.bias, {}, {}});
  226. }
  227. }
//! 1x1 stride-1 conv lowered onto the MK4-packed matmul, all bias modes.
TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_GI_1X1_S1_MK4_PACK_F32) {
    using namespace conv_bias;
    std::vector<conv_bias::TestArg> args =
            get_nchw44_conv_bias_args({1}, FULL_NLMODE, ALL_BIASMODE, 1, true);
    check_conv_bias(args, handle(), "CONV1x1:FB_GI_F32_MK4_PACK_4x12:24");
}
//! im2col + MK4-packed matmul, stride 1, with weight pre-processing.
TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_GI_IM2COL_S1_MK4_PACK_F32_PREPROCESS) {
    using namespace conv_bias;
    std::vector<conv_bias::TestArg> args =
            get_nchw44_conv_bias_args({2, 4, 7}, FULL_NLMODE, BR_AND_NO_BIASMODE, 1);
#define cb(name)                                                                \
    check_conv_bias_preprocess(                                                 \
            args, handle(), nullptr, 0.001, dtype::Float32(), dtype::Float32(), \
            dtype::Float32(), dtype::Float32(), name);
    cb("IM2COLMATMUL:FB_GI_F32_MK4_PACK_4x12");
#undef cb
}
//! im2col + MK4-packed matmul, stride 2 with fused bias, weight pre-processing.
TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_GI_IM2COL_S2_MK4_PACK_F32_FUSE_PREPROCESS) {
    using namespace conv_bias;
    std::vector<conv_bias::TestArg> args =
            get_nchw44_conv_bias_args({3}, FULL_NLMODE, BR_AND_BIAS_BIASMODE, 2);
#define cb(name)                                                                \
    check_conv_bias_preprocess(                                                 \
            args, handle(), nullptr, 0.001, dtype::Float32(), dtype::Float32(), \
            dtype::Float32(), dtype::Float32(), name);
    cb("IM2COLMATMUL:FB_GI_F32_MK4_PACK_4x12");
#undef cb
}
//! 1x1 stride-1 conv via MK4-packed matmul with weight pre-processing.
TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_GI_1X1_S1_MK4_PACK_F32_PREPROCESS) {
    using namespace conv_bias;
    std::vector<conv_bias::TestArg> args =
            get_nchw44_conv_bias_args({1}, FULL_NLMODE, ALL_BIASMODE, 1, true);
#define cb(name)                                                                \
    check_conv_bias_preprocess(                                                 \
            args, handle(), nullptr, 0.001, dtype::Float32(), dtype::Float32(), \
            dtype::Float32(), dtype::Float32(), name);
    cb("CONV1x1:FB_GI_F32_MK4_PACK_4x12:24");
#undef cb
}
//! im2col + MK4-packed matmul, stride 1 (no pre-processing).
TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_GI_IM2COL_S1_MK4_PACK_F32) {
    using namespace conv_bias;
    std::vector<conv_bias::TestArg> args =
            get_nchw44_conv_bias_args({2, 4, 7}, FULL_NLMODE, BR_AND_BIAS_BIASMODE, 1);
    check_conv_bias(args, handle(), "IM2COLMATMUL:FB_GI_F32_MK4_PACK_4x12");
}
//! im2col + MK4-packed matmul, stride 2.
TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_GI_IM2COL_S2_MK4_PACK_F32) {
    using namespace conv_bias;
    std::vector<conv_bias::TestArg> args =
            get_nchw44_conv_bias_args({3, 5, 6}, FULL_NLMODE, BR_AND_BIAS_BIASMODE, 2);
#define cb(name) check_conv_bias(args, handle(), name);
    cb("IM2COLMATMUL:FB_GI_F32_MK4_PACK_4x12");
#undef cb
}
//! im2col + MK4-packed matmul, stride 2, all bias modes (fused variants).
TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_GI_IM2COL_S2_MK4_PACK_F32_FUSE) {
    using namespace conv_bias;
    std::vector<conv_bias::TestArg> args =
            get_nchw44_conv_bias_args({3}, FULL_NLMODE, ALL_BIASMODE, 2);
#define cb(name) check_conv_bias(args, handle(), name);
    cb("IM2COLMATMUL:FB_GI_F32_MK4_PACK_4x12");
#undef cb
}
  289. TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_FORWARD_IM2COL_8X8X16) {
  290. using namespace conv_bias;
  291. param::ConvBias cur_param;
  292. using NLMode = param::ConvBias::NonlineMode;
  293. std::vector<conv_bias::TestArg> args = get_conv_bias_args(
  294. {1, 3}, {0}, {NLMode::IDENTITY, NLMode::RELU}, {1}, false, true);
  295. NormalRNG default_rng;
  296. Checker<ConvBias> checker(handle());
  297. checker.set_dtype(0, dtype::Int8{});
  298. checker.set_dtype(1, dtype::Int8{});
  299. checker.set_dtype(2, dtype::Int16{});
  300. checker.set_dtype(4, dtype::Int16{});
  301. for (auto&& arg : args) {
  302. checker.set_param(arg.param).execs({arg.src, arg.filter, arg.bias, {}, {}});
  303. }
  304. }
  305. TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_FORWARD) {
  306. using namespace conv_bias;
  307. param::ConvBias cur_param;
  308. using NLMode = param::ConvBias::NonlineMode;
  309. std::vector<conv_bias::TestArg> args = get_conv_bias_args(
  310. {1, 3, 5}, {0, 3},
  311. {NLMode::IDENTITY, NLMode::H_SWISH, NLMode::SIGMOID, NLMode::RELU}, {1, 2},
  312. false, false);
  313. NormalRNG default_rng;
  314. checker_conv_bias(
  315. args, handle(), &default_rng, 1e-3, dtype::Float32{}, dtype::Float32{},
  316. dtype::Float32{}, dtype::Float32{}, "FALLBACK_NAIVE");
  317. }
//! NCHW input converted to NCHW44 output by the direct conv, stride 2.
TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_NCHW_NCHW44_F32_S2) {
    check_conv_bias(
            conv_bias::get_nchw44_conv_bias_args(
                    {2, 3, 5, 7}, ONLY_IDENTITY_NLMODE, ONLY_BR_BIASMODE, 2, false,
                    true),
            handle(), "F32_CONV_NCHW_NCHW44");
}
//! NCHW input converted to NCHW44 output by the direct conv, stride 1.
TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_NCHW_NCHW44_F32_S1) {
    check_conv_bias(
            conv_bias::get_nchw44_conv_bias_args(
                    {2, 3, 5, 7}, ONLY_IDENTITY_NLMODE, ONLY_BR_BIASMODE, 1, false,
                    true),
            handle(), "F32_CONV_NCHW_NCHW44");
}
//! Stride-2 NCHW->NCHW44 agent conv: one generated test per bias mode.
#define CB(_MODE, _SUFFIX)                                                            \
    TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_NCHW_NCHW44_F32_S2_AGENT_##_SUFFIX) {  \
        check_conv_bias(                                                              \
                conv_bias::get_nchw44_conv_bias_args(                                 \
                        {2, 3, 5, 7}, ONLY_IDENTITY_NLMODE, {_MODE}, 2, false, true), \
                handle(), "F32_CONV_AGENT_NCHW_NCHW44");                              \
    }
CB(megdnn::BiasMode::NO_BIAS, NO_BIAS);
CB(megdnn::BiasMode::BROADCAST_CHANNEL_BIAS, BROADCAST_CHANNEL_BIAS);
#undef CB
//! Stride-1 NCHW->NCHW44 agent conv: one generated test per nonlinearity.
#define CB(_MODE, _SUFFIX)                                                        \
    TEST_F(FALLBACK_MULTI_THREADS,                                                \
           CONVBIAS_GI_NCHW_NCHW44_F32_S1_AGENT_IDENTITY_##_SUFFIX) {             \
        check_conv_bias(                                                          \
                conv_bias::get_nchw44_conv_bias_args(                             \
                        {2, 3, 5, 7}, {_MODE}, ONLY_BR_BIASMODE, 1, false, true), \
                handle(), "F32_CONV_AGENT_NCHW_NCHW44");                          \
    }
CB(param::ConvBias::NonlineMode::IDENTITY, IDENTITY);
CB(param::ConvBias::NonlineMode::RELU, RELU);
CB(param::ConvBias::NonlineMode::H_SWISH, H_SWISH);
CB(param::ConvBias::NonlineMode::SIGMOID, SIGMOID);
#undef CB
  355. std::vector<conv_bias::TestArg> get_nchw44_channel_wise_args(
  356. std::vector<size_t> kernel, size_t stride, bool no_bias, bool no_nonlinemode,
  357. bool no_full_bias) {
  358. using namespace conv_bias;
  359. using Param = param::ConvBias;
  360. using NLMode = param::ConvBias::NonlineMode;
  361. std::vector<TestArg> args;
  362. auto pack = [&](size_t n, size_t group, size_t w, size_t h, size_t kernel,
  363. size_t stride, NLMode nlmode, bool pad) {
  364. Param param;
  365. param.stride_h = stride;
  366. param.stride_w = stride;
  367. if (pad) {
  368. param.pad_h = kernel / 2;
  369. param.pad_w = kernel / 2;
  370. } else {
  371. param.pad_h = 0;
  372. param.pad_w = 0;
  373. }
  374. param.nonlineMode = nlmode;
  375. param.format = param::ConvBias::Format::NCHW44;
  376. param.sparse = param::ConvBias::Sparse::GROUP;
  377. args.emplace_back(
  378. param, TensorShape{n, group, h, w, 4},
  379. TensorShape{group, 1, 1, kernel, kernel, 4}, TensorShape{});
  380. if (!no_bias) {
  381. args.emplace_back(
  382. param, TensorShape{n, group, h, w, 4},
  383. TensorShape{group, 1, 1, kernel, kernel, 4},
  384. TensorShape{1, group, 1, 1, 4});
  385. }
  386. if (!no_full_bias) {
  387. args.emplace_back(
  388. param, TensorShape{n, group, h, w, 4},
  389. TensorShape{group, 1, 1, kernel, kernel, 4},
  390. TensorShape{
  391. n, group, (h + 2 * param.pad_w - kernel) / stride + 1,
  392. (w + 2 * param.pad_w - kernel) / stride + 1, 4});
  393. }
  394. };
  395. std::vector<NLMode> nonlinemode = {NLMode::IDENTITY};
  396. if (!no_nonlinemode) {
  397. nonlinemode.emplace_back(NLMode::RELU);
  398. nonlinemode.emplace_back(NLMode::H_SWISH);
  399. }
  400. for (size_t n : {1, 2}) {
  401. for (auto nlmode : nonlinemode) {
  402. for (bool pad : {true}) {
  403. for (size_t group : {1, 2, 4, 7, 16}) {
  404. for (size_t size : {4, 6, 7, 9, 20}) {
  405. for (size_t kern : kernel) {
  406. pack(n, group, size, size, kern, stride, nlmode, pad);
  407. }
  408. }
  409. }
  410. }
  411. for (bool pad : {false}) {
  412. for (size_t group : {1, 2, 7, 16}) {
  413. for (size_t size : {7, 9, 20}) {
  414. for (size_t kern : kernel) {
  415. pack(n, group, size, size, kern, stride, nlmode, pad);
  416. }
  417. }
  418. }
  419. }
  420. }
  421. }
  422. return args;
  423. }
//! Channel-wise NCHW44, stride 1, small kernels.
TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_CHANNEL_WISE_STRIDE1_FP32_NCHW44_1) {
    check_conv_bias(
            get_nchw44_channel_wise_args({2, 3}, 1, false, false, false), handle(),
            "F32_CHANNEL_WISE_NCHW44");
}
//! Channel-wise NCHW44, stride 1, k = 5.
TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_CHANNEL_WISE_STRIDE1_FP32_NCHW44_2) {
    check_conv_bias(
            get_nchw44_channel_wise_args({5}, 1, false, false, false), handle(),
            "F32_CHANNEL_WISE_NCHW44");
}
//! Channel-wise NCHW44, stride 2.
TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_CHANNEL_WISE_STRIDE2_FP32_NCHW44) {
    check_conv_bias(
            get_nchw44_channel_wise_args({2, 3, 5}, 2, false, false, false), handle(),
            "F32_CHANNEL_WISE_NCHW44");
}
//! Direct NCHW44 conv, stride 1.
TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_DIRECT_FP32_NCHW44_S1_K7) {
    //! k=7 s=1
    check_conv_bias(
            conv_bias::get_nchw44_conv_bias_args(
                    {7}, ONLY_IDENTITY_NLMODE, BR_AND_NO_BIASMODE, 1),
            handle(), "F32_CONV_NCHW44_DIRECT");
}
//! Direct NCHW44 conv, stride 1, k = 2 and 3, all nonlinearities.
TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_DIRECT_FP32_NCHW44_S1_K2K3) {
    check_conv_bias(
            conv_bias::get_nchw44_conv_bias_args(
                    {2, 3}, FULL_NLMODE, ONLY_BR_BIASMODE, 1),
            handle(), "F32_CONV_NCHW44_DIRECT");
}
//! Direct NCHW44 conv, stride 1, k = 5.
TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_DIRECT_FP32_NCHW44_S1_K5) {
    check_conv_bias(
            conv_bias::get_nchw44_conv_bias_args({5}, FULL_NLMODE, ONLY_BR_BIASMODE, 1),
            handle(), "F32_CONV_NCHW44_DIRECT");
}
//! Direct NCHW44 conv, stride 2.
TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_DIRECT_FP32_NCHW44_S2) {
    check_conv_bias(
            conv_bias::get_nchw44_conv_bias_args(
                    {2, 3, 5, 7}, FULL_NLMODE, ONLY_BR_BIASMODE, 2),
            handle(), "F32_CONV_NCHW44_DIRECT");
}
//! Direct NCHW fp32 conv over all kernel sizes 1..7.
TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_DIRECT_FP32) {
    check_conv_bias(
            conv_bias::get_conv_bias_args(
                    {1, 2, 3, 4, 5, 6, 7}, 1, false, false, false),
            handle(), "F32DIRECT");
}
//! Direct NCHW fp32 conv, stride 2 specialization.
TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_DIRECT_FP32_STR2) {
    check_conv_bias(
            conv_bias::get_conv_bias_args({2, 3, 5, 7}, 2, false, false, false),
            handle(), "F32STRD2");
}
//! Direct NCHW fp32 conv, stride 1 specialization.
TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_DIRECT_FP32_STR1) {
    check_conv_bias(
            conv_bias::get_conv_bias_args({2, 3, 5, 7}, 1, false, false, false),
            handle(), "F32STRD1");
}
//! Winograd ConvBias coverage. The string passed to check_winograd is
//! "<pack>:<m>:<tile>" — e.g. "4:2:32" is MK4-packed F(2x2, 3x3) with tile 32,
//! "1:6:32" is unpacked F(6x6, 3x3). *_WEIGHT_PREPROCESS variants run the same
//! cases through OprWeightPreprocessProxy.
TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F23_4) {
    using namespace conv_bias;
    std::vector<TestArg> args = get_winograd_mk_packed_args();
    Checker<ConvBiasForward> checker(handle());
    check_winograd("4:2:32", checker, args, param::MatrixMul::Format::MK4);
}
//! F23 MK4 on NCHW44-format ConvBias args.
TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F23_4_NCHW44) {
    using namespace conv_bias;
    std::vector<TestArg> args =
            get_nchw44_conv_bias_args({3}, QUAN_NLMODE, BR_AND_NO_BIASMODE, 1);
    Checker<ConvBiasForward> checker(handle());
    check_winograd(
            "4:2:32", checker, args, param::MatrixMul::Format::MK4,
            param::ConvBias::Format::NCHW44);
}
TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F23_4_WEIGHT_PREPROCESS) {
    using namespace conv_bias;
    std::vector<TestArg> args = get_winograd_mk_packed_args();
    Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
            handle());
    check_winograd("4:2:32", checker, args, param::MatrixMul::Format::MK4);
}
TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F23_4_NCHW44_WEIGHT_PREPROCESS) {
    using namespace conv_bias;
    std::vector<TestArg> args =
            get_nchw44_conv_bias_args({3}, QUAN_NLMODE, BR_AND_NO_BIASMODE, 1);
    Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
            handle());
    check_winograd(
            "4:2:32", checker, args, param::MatrixMul::Format::MK4,
            param::ConvBias::Format::NCHW44);
}
//! F63 MK4, single-threaded fallback fixture.
TEST_F(FALLBACK, CONVBIAS_GI_WINOGRAD_F63_4) {
    using namespace conv_bias;
    std::vector<TestArg> args = get_winograd_mk_packed_args();
    Checker<ConvBiasForward> checker(handle());
    check_winograd("4:6:16", checker, args, param::MatrixMul::Format::MK4);
}
TEST_F(FALLBACK, CONVBIAS_GI_WINOGRAD_F63_4_WEIGHT_PREPROCESS) {
    using namespace conv_bias;
    std::vector<TestArg> args = get_winograd_mk_packed_args();
    Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
            handle());
    check_winograd("4:6:16", checker, args, param::MatrixMul::Format::MK4);
}
//! Unpacked F63.
TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F63) {
    using namespace conv_bias;
    std::vector<TestArg> args = get_winograd_args(3);
    Checker<ConvBiasForward> checker(handle());
    check_winograd("1:6:32", checker, args);
}
TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F63_4) {
    using namespace conv_bias;
    std::vector<TestArg> args = get_winograd_mk_packed_args();
    Checker<ConvBiasForward> checker(handle());
    check_winograd("4:6:16", checker, args, param::MatrixMul::Format::MK4);
}
TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F63_4_NCHW44) {
    using namespace conv_bias;
    std::vector<TestArg> args =
            get_nchw44_conv_bias_args({3}, QUAN_NLMODE, BR_AND_NO_BIASMODE, 1);
    Checker<ConvBiasForward> checker(handle());
    check_winograd(
            "4:6:16", checker, args, param::MatrixMul::Format::MK4,
            param::ConvBias::Format::NCHW44);
}
//! Unpacked F54 (5x5 output tile, 4x4 kernel).
TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F54) {
    using namespace conv_bias;
    std::vector<TestArg> args = get_winograd_args(4);
    Checker<ConvBiasForward> checker(handle());
    check_winograd("1:5:32", checker, args);
}
//! Unpacked F45 (4x4 output tile, 5x5 kernel).
TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F45) {
    using namespace conv_bias;
    std::vector<TestArg> args = get_winograd_args(5);
    Checker<ConvBiasForward> checker(handle());
    check_winograd("1:4:32", checker, args);
}
TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F63_WEIGHT_PREPROCESS) {
    using namespace conv_bias;
    std::vector<TestArg> args = get_winograd_args(3);
    Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
            handle());
    check_winograd("1:6:32", checker, args);
}
TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F63_4_WEIGHT_PREPROCESS) {
    using namespace conv_bias;
    std::vector<TestArg> args = get_winograd_mk_packed_args();
    Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
            handle());
    check_winograd("4:6:16", checker, args, param::MatrixMul::Format::MK4);
}
TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F63_4_NCHW44_WEIGHT_PREPROCESS) {
    using namespace conv_bias;
    std::vector<TestArg> args =
            get_nchw44_conv_bias_args({3}, QUAN_NLMODE, BR_AND_NO_BIASMODE, 1);
    Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
            handle());
    check_winograd(
            "4:6:16", checker, args, param::MatrixMul::Format::MK4,
            param::ConvBias::Format::NCHW44);
}
TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F54_WEIGHT_PREPROCESS) {
    using namespace conv_bias;
    std::vector<TestArg> args = get_winograd_args(4);
    Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
            handle());
    check_winograd("1:5:32", checker, args);
}
TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F45_WEIGHT_PREPROCESS) {
    using namespace conv_bias;
    std::vector<TestArg> args = get_winograd_args(5);
    Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
            handle());
    check_winograd("1:4:32", checker, args);
}
//! Winograd pre-processed NCHW44 cases executed through a local dtype/epsilon
//! runner (currently fp32 only; see the comments about low-precision mode).
TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_PREPROCESS_NCHW44) {
    using namespace conv_bias;
    std::vector<TestArg> nchw44_args = conv_bias::get_nchw44_conv_bias_args(
            {3}, QUAN_NLMODE, BR_AND_NO_BIASMODE, 1);
    Checker<ConvBiasForward> checker(handle());
    //! run all args with the given src/filter/bias/dst dtypes and tolerance
    auto run = [&checker](
                       const std::vector<TestArg>& args, DType A_dtype, DType B_dtype,
                       DType C_dtype, DType D_dtype, const float eps) {
        for (auto&& arg : args) {
            checker.set_dtype(0, A_dtype)
                    .set_dtype(1, B_dtype)
                    .set_dtype(2, C_dtype)
                    .set_dtype(4, D_dtype)
                    .set_epsilon(eps)
                    .set_param(arg.param)
                    .execs({arg.src, arg.filter, arg.bias, {}, {}});
        }
    };
    //! uncomment this when low precision mode is ok
    // run(handle(), nchw44_args, {2, 6, 7}, dtype::Float32(), dtype::Float32(),
    //     dtype::Float32(), dtype::Float32(), 1e-2f);
    //! remove this when low precision mode is ok
    run(nchw44_args, dtype::Float32(), dtype::Float32(), dtype::Float32(),
        dtype::Float32(), 1e-3f);
}
  620. TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_FORWARD_QUANTIZED) {
  621. using namespace conv_bias;
  622. param::ConvBias cur_param;
  623. using NLMode = param::ConvBias::NonlineMode;
  624. std::vector<conv_bias::TestArg> args = get_conv_bias_args(
  625. {1, 3, 5, 7}, {0, 3}, {NLMode::IDENTITY, NLMode::H_SWISH, NLMode::RELU},
  626. {1, 2}, false, false);
  627. UniformIntRNG int_rng{-50, 50};
  628. float epsilon = 1e-3;
  629. checker_conv_bias(
  630. args, handle(), &int_rng, epsilon, dtype::QuantizedS8(2.5f),
  631. dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f),
  632. dtype::QuantizedS8(60.25f), "FALLBACK_NAIVE");
  633. }
  634. #if MEGDNN_WITH_BENCHMARK
  635. namespace {
  636. void benchmark_impl(
  637. const param::ConvBias param,
  638. std::vector<std::pair<SmallVector<TensorShape>, float>>& shapes_and_computation,
  639. const std::string algo_name, size_t RUNS,
  640. TaskExecutorConfig&& multi_thread_config,
  641. TaskExecutorConfig&& single_thread_config, std::vector<DType>& data_type) {
  642. std::vector<float> multi_thread_times, single_thread_times;
  643. {
  644. auto multi_thread_hanle = create_cpu_handle(0, true, &multi_thread_config);
  645. auto benchmarker = Benchmarker<ConvBias>(multi_thread_hanle.get());
  646. benchmarker.set_times(RUNS)
  647. .set_display(false)
  648. .set_param(param)
  649. .set_dtype(0, data_type[0])
  650. .set_dtype(1, data_type[1])
  651. .set_dtype(2, data_type[2])
  652. .set_dtype(4, data_type[3])
  653. .set_before_exec_callback(
  654. conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name.c_str()));
  655. for (auto shape : shapes_and_computation) {
  656. multi_thread_times.push_back(benchmarker.exec(shape.first) / RUNS);
  657. }
  658. }
  659. {
  660. auto single_thread_handle = create_cpu_handle(0, true, &single_thread_config);
  661. auto benchmarker = Benchmarker<ConvBias>(single_thread_handle.get());
  662. benchmarker.set_times(RUNS)
  663. .set_display(false)
  664. .set_param(param)
  665. .set_dtype(0, data_type[0])
  666. .set_dtype(1, data_type[1])
  667. .set_dtype(2, data_type[2])
  668. .set_dtype(4, data_type[3])
  669. .set_before_exec_callback(
  670. conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name.c_str()));
  671. for (auto shape : shapes_and_computation) {
  672. single_thread_times.push_back(benchmarker.exec(shape.first) / RUNS);
  673. }
  674. }
  675. printf("Benchmark : Multi threads %zu, ", multi_thread_config.nr_thread);
  676. printf("core_ids:");
  677. for (size_t i = 0; i < multi_thread_config.affinity_core_set.size(); i++) {
  678. printf("%zu ", multi_thread_config.affinity_core_set[i]);
  679. }
  680. printf(", Single thread core_id %zu\n", single_thread_config.affinity_core_set[0]);
  681. for (size_t i = 0; i < shapes_and_computation.size(); i++) {
  682. auto shapes = shapes_and_computation[i];
  683. printf("Bench case: ");
  684. for (auto&& shape : shapes.first) {
  685. printf("%s ", shape.to_string().c_str());
  686. }
  687. float computations = shapes.second;
  688. printf("%zu threads gflops: %f,\n single thread gflops: "
  689. "%f. spead up = %f, speedup/cores=%f\n",
  690. multi_thread_config.nr_thread, computations / multi_thread_times[i],
  691. computations / single_thread_times[i],
  692. single_thread_times[i] / multi_thread_times[i],
  693. single_thread_times[i] / multi_thread_times[i] /
  694. multi_thread_config.nr_thread);
  695. }
  696. }
  697. } // namespace
  698. TEST_F(FALLBACK_MULTI_THREADS, BENCHMARK_GI_CONVBIAS_DIRECTF32) {
  699. constexpr size_t RUNS = 50;
  700. param::ConvBias param;
  701. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  702. param.pad_h = 1;
  703. param.pad_w = 1;
  704. param.stride_h = 1;
  705. param.stride_w = 1;
  706. param.sparse = param::ConvBias::Sparse::GROUP;
  707. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  708. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  709. size_t group) {
  710. SmallVector<TensorShape> shapes{
  711. {N, IC, H, W},
  712. {group, OC / group, IC / group, FS, FS},
  713. {1, OC, 1, 1},
  714. {},
  715. {N, OC, H, W}};
  716. TensorShape dst{N, OC, H, W};
  717. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  718. dst.total_nr_elems()) *
  719. 1e-6;
  720. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  721. };
  722. bench_case(1, 32, 32, 200, 200, 3, 4);
  723. bench_case(1, 32, 32, 200, 200, 3, 32);
  724. bench_case(1, 32, 32, 128, 128, 3, 4);
  725. bench_case(1, 32, 32, 128, 128, 3, 32);
  726. bench_case(1, 32, 32, 100, 100, 3, 4);
  727. bench_case(1, 32, 32, 100, 100, 3, 32);
  728. bench_case(1, 32, 32, 80, 80, 3, 4);
  729. bench_case(1, 32, 32, 80, 80, 3, 32);
  730. std::string algo_name = "F32DIRECT";
  731. printf("Benchmark F32DIRECT_LARGE_GROUP algo\n");
  732. std::vector<DType> data_type = {
  733. dtype::Float32(), dtype::Float32(), dtype::Float32(), dtype::Float32()};
  734. benchmark_impl(
  735. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  736. data_type);
  737. benchmark_impl(
  738. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  739. data_type);
  740. benchmark_impl(
  741. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  742. data_type);
  743. shapes_and_computation.clear();
  744. algo_name = "F32DIRECT";
  745. printf("Benchmark F32DIRECT_SMALL_GROUP algo\n");
  746. bench_case(1, 32, 32, 200, 200, 3, 1);
  747. bench_case(1, 32, 32, 128, 128, 3, 1);
  748. bench_case(1, 32, 32, 100, 100, 3, 1);
  749. bench_case(1, 32, 32, 80, 80, 3, 1);
  750. benchmark_impl(
  751. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  752. data_type);
  753. benchmark_impl(
  754. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  755. data_type);
  756. benchmark_impl(
  757. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  758. data_type);
  759. }
  760. TEST_F(FALLBACK_MULTI_THREADS, BENCHMARK_GI_CONVBIAS_DIRECTF32_STR1) {
  761. constexpr size_t RUNS = 50;
  762. param::ConvBias param;
  763. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  764. param.pad_h = 1;
  765. param.pad_w = 1;
  766. param.stride_h = 1;
  767. param.stride_w = 1;
  768. param.sparse = param::ConvBias::Sparse::GROUP;
  769. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  770. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  771. size_t group) {
  772. SmallVector<TensorShape> shapes{
  773. {N, IC, H, W},
  774. {group, OC / group, IC / group, FS, FS},
  775. {1, OC, 1, 1},
  776. {},
  777. {N, OC, H, W}};
  778. TensorShape dst{N, OC, H, W};
  779. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  780. dst.total_nr_elems()) *
  781. 1e-6;
  782. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  783. };
  784. bench_case(1, 32, 32, 200, 200, 3, 4);
  785. bench_case(1, 32, 32, 200, 200, 3, 32);
  786. bench_case(1, 32, 32, 128, 128, 3, 4);
  787. bench_case(1, 32, 32, 128, 128, 3, 32);
  788. bench_case(1, 32, 32, 100, 100, 3, 4);
  789. bench_case(1, 32, 32, 100, 100, 3, 32);
  790. bench_case(1, 32, 32, 80, 80, 3, 4);
  791. bench_case(1, 32, 32, 80, 80, 3, 32);
  792. std::string algo_name = "F32STRD1";
  793. printf("Benchmark F32STRD1_LARGE_GROUP algo\n");
  794. std::vector<DType> data_type = {
  795. dtype::Float32(), dtype::Float32(), dtype::Float32(), dtype::Float32()};
  796. benchmark_impl(
  797. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  798. data_type);
  799. benchmark_impl(
  800. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  801. data_type);
  802. benchmark_impl(
  803. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  804. data_type);
  805. shapes_and_computation.clear();
  806. algo_name = "F32STRD1";
  807. printf("Benchmark F32STRD1_SMALL_GROUP algo\n");
  808. bench_case(1, 32, 32, 200, 200, 3, 1);
  809. bench_case(1, 32, 32, 128, 128, 3, 1);
  810. bench_case(1, 32, 32, 100, 100, 3, 1);
  811. bench_case(1, 32, 32, 80, 80, 3, 1);
  812. benchmark_impl(
  813. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  814. data_type);
  815. benchmark_impl(
  816. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  817. data_type);
  818. benchmark_impl(
  819. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  820. data_type);
  821. }
  822. TEST_F(FALLBACK_MULTI_THREADS, BENCHMARK_GI_CONVBIAS_DIRECTF32_STR2) {
  823. constexpr size_t RUNS = 50;
  824. param::ConvBias param;
  825. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  826. param.pad_h = 1;
  827. param.pad_w = 1;
  828. param.stride_h = 2;
  829. param.stride_w = 2;
  830. param.sparse = param::ConvBias::Sparse::GROUP;
  831. std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
  832. auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
  833. size_t group, size_t P, size_t S) {
  834. SmallVector<TensorShape> shapes{
  835. {N, IC, H, W},
  836. {group, OC / group, IC / group, FS, FS},
  837. {1, OC, 1, 1},
  838. {},
  839. {N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1}};
  840. TensorShape dst{N, OC, H, W};
  841. float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
  842. dst.total_nr_elems()) *
  843. 1e-6;
  844. shapes_and_computation.push_back(std::make_pair(shapes, computations));
  845. };
  846. bench_case(1, 32, 32, 200, 200, 3, 4, 1, 2);
  847. bench_case(1, 32, 32, 200, 200, 3, 32, 1, 2);
  848. bench_case(1, 32, 32, 128, 128, 3, 4, 1, 2);
  849. bench_case(1, 32, 32, 128, 128, 3, 32, 1, 2);
  850. bench_case(1, 32, 32, 100, 100, 3, 4, 1, 2);
  851. bench_case(1, 32, 32, 100, 100, 3, 32, 1, 2);
  852. bench_case(1, 32, 32, 80, 80, 3, 4, 1, 2);
  853. bench_case(1, 32, 32, 80, 80, 3, 32, 1, 2);
  854. std::string algo_name = "F32STRD2";
  855. printf("Benchmark F32STRD2_LARGE_GROUP algo\n");
  856. std::vector<DType> data_type = {
  857. dtype::Float32(), dtype::Float32(), dtype::Float32(), dtype::Float32()};
  858. benchmark_impl(
  859. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  860. data_type);
  861. benchmark_impl(
  862. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  863. data_type);
  864. benchmark_impl(
  865. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  866. data_type);
  867. shapes_and_computation.clear();
  868. algo_name = "F32STRD2";
  869. printf("Benchmark F32STRD2_SMALL_GROUP algo\n");
  870. bench_case(1, 32, 32, 200, 200, 3, 1, 1, 2);
  871. bench_case(1, 32, 32, 128, 128, 3, 1, 1, 2);
  872. bench_case(1, 32, 32, 100, 100, 3, 1, 1, 2);
  873. bench_case(1, 32, 32, 80, 80, 3, 1, 1, 2);
  874. benchmark_impl(
  875. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
  876. data_type);
  877. benchmark_impl(
  878. param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
  879. data_type);
  880. benchmark_impl(
  881. param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
  882. data_type);
  883. }
  884. TEST_F(FALLBACK, BENCHMARK_GI_CHANNEL_WISE_F32_STRIDE1_NCHW44) {
  885. // have to remove preferred restrict in usable func before run the benchmark
  886. using namespace conv_bias;
  887. param::ConvBias param;
  888. param.stride_h = 1;
  889. param.stride_w = 1;
  890. param.pad_h = 1;
  891. param.pad_w = 1;
  892. param.nonlineMode = NonlineMode::RELU;
  893. param.sparse = param::ConvBias::Sparse::GROUP;
  894. constexpr size_t RUN = 50;
  895. Benchmarker<ConvBias> benchmark0(handle());
  896. benchmark0.set_display(false);
  897. benchmark0.set_param(param);
  898. benchmark0.set_times(RUN);
  899. benchmark0.set_before_exec_callback(
  900. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("F32STRD1"));
  901. auto opr = handle()->create_operator<ConvBias>();
  902. opr->param() = param;
  903. param.format = param::ConvBias::Format::NCHW44;
  904. Benchmarker<ConvBias> benchmark1(handle());
  905. benchmark1.set_display(false);
  906. benchmark1.set_param(param);
  907. benchmark1.set_times(RUN);
  908. benchmark1.set_before_exec_callback(
  909. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("F32_CHANNEL_WISE_NCHW44"));
  910. auto run = [&](size_t group, size_t w, size_t h, size_t kernel) {
  911. TensorLayout dst_layout;
  912. opr->deduce_layout(
  913. {{1, group * 4, h, w}, dtype::Int8()},
  914. {{group * 4, 1, 1, kernel, kernel}, dtype::Int8()},
  915. {{1, group * 4, 1, 1}, dtype::Int32()}, {}, dst_layout);
  916. //! dst.nr_elems * IC * FH * FW * 2
  917. float computations = dst_layout.total_nr_elems() * kernel * kernel * 2.0 /
  918. (1024 * 1024 * 1024) * 1e3;
  919. auto used0 = benchmark0.exec(
  920. {{1, group * 4, h, w},
  921. {group * 4, 1, 1, kernel, kernel},
  922. {1, group * 4, 1, 1},
  923. {},
  924. {}}) /
  925. RUN;
  926. auto used1 = benchmark1.exec(
  927. {{1, group, h, w, 4},
  928. {group, 1, 1, kernel, kernel, 4},
  929. {1, group, 1, 1, 4},
  930. {},
  931. {}}) /
  932. RUN;
  933. printf("group/h/w/kernel:%zu,%zu,%zu,%zu: nchw: %f ms %f Gflops "
  934. "nchw44: "
  935. "%f ms %f GFlops "
  936. "speedup: %f\n",
  937. group, h, w, kernel, used0, computations / used0, used1,
  938. computations / used1, used0 / used1);
  939. };
  940. for (size_t group : {8, 16, 32, 64}) {
  941. for (size_t kerenl : {2, 3, 5}) {
  942. run(group, 112, 112, kerenl);
  943. run(group, 56, 56, kerenl);
  944. run(group, 48, 48, kerenl);
  945. run(group, 28, 28, kerenl);
  946. run(group, 14, 14, kerenl);
  947. }
  948. }
  949. run(8, 112, 112, 3);
  950. run(32, 56, 56, 3);
  951. run(64, 28, 28, 3);
  952. run(128, 14, 14, 3);
  953. }
  954. TEST_F(FALLBACK, BENCHMARK_GI_CHANNEL_WISE_F32_STRIDE2_NCHW44) {
  955. // have to remove preferred restrict in usable func before run the benchmark
  956. using namespace conv_bias;
  957. param::ConvBias param;
  958. param.stride_h = 2;
  959. param.stride_w = 2;
  960. param.pad_h = 1;
  961. param.pad_w = 1;
  962. param.nonlineMode = NonlineMode::RELU;
  963. param.sparse = param::ConvBias::Sparse::GROUP;
  964. constexpr size_t RUN = 50;
  965. Benchmarker<ConvBias> benchmark0(handle());
  966. benchmark0.set_display(false);
  967. benchmark0.set_param(param);
  968. benchmark0.set_times(RUN);
  969. benchmark0.set_before_exec_callback(
  970. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("F32STRD2"));
  971. auto opr = handle()->create_operator<ConvBias>();
  972. opr->param() = param;
  973. param.format = param::ConvBias::Format::NCHW44;
  974. Benchmarker<ConvBias> benchmark1(handle());
  975. benchmark1.set_display(false);
  976. benchmark1.set_param(param);
  977. benchmark1.set_times(RUN);
  978. benchmark1.set_before_exec_callback(
  979. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>("F32_CHANNEL_WISE_NCHW44"));
  980. auto run = [&](size_t group, size_t w, size_t h, size_t kernel) {
  981. TensorLayout dst_layout;
  982. opr->deduce_layout(
  983. {{1, group * 4, h, w}, dtype::Int8()},
  984. {{group * 4, 1, 1, kernel, kernel}, dtype::Int8()},
  985. {{1, group * 4, 1, 1}, dtype::Int32()}, {}, dst_layout);
  986. //! dst.nr_elems * IC * FH * FW * 2
  987. float computations = dst_layout.total_nr_elems() * kernel * kernel * 2.0 /
  988. (1024 * 1024 * 1024) * 1e3;
  989. auto used0 = benchmark0.exec(
  990. {{1, group * 4, h, w},
  991. {group * 4, 1, 1, kernel, kernel},
  992. {1, group * 4, 1, 1},
  993. {},
  994. {}}) /
  995. RUN;
  996. auto used1 = benchmark1.exec(
  997. {{1, group, h, w, 4},
  998. {group, 1, 1, kernel, kernel, 4},
  999. {1, group, 1, 1, 4},
  1000. {},
  1001. {}}) /
  1002. RUN;
  1003. printf("group/h/w/kernel:%zu,%zu,%zu,%zu: nchw: %f ms %f Gflops "
  1004. "nchw44: "
  1005. "%f ms %f GFlops "
  1006. "speedup: %f\n",
  1007. group, h, w, kernel, used0, computations / used0, used1,
  1008. computations / used1, used0 / used1);
  1009. };
  1010. for (size_t group : {8, 16, 32, 64}) {
  1011. for (size_t kerenl : {2, 3, 5}) {
  1012. run(group, 112, 112, kerenl);
  1013. run(group, 56, 56, kerenl);
  1014. run(group, 48, 48, kerenl);
  1015. run(group, 28, 28, kerenl);
  1016. run(group, 14, 14, kerenl);
  1017. }
  1018. }
  1019. run(8, 112, 112, 3);
  1020. run(32, 56, 56, 3);
  1021. run(64, 28, 28, 3);
  1022. run(128, 14, 14, 3);
  1023. }
  1024. TEST_F(FALLBACK, BENCHMARK_CONVBIAS) {
  1025. constexpr size_t RUNS = 10;
  1026. param::ConvBias param;
  1027. param.stride_h = 1;
  1028. param.stride_w = 1;
  1029. Benchmarker<ConvBias> benchmarker_int(handle());
  1030. benchmarker_int.set_times(RUNS)
  1031. .set_dtype(0, dtype::QuantizedS8(2.5f))
  1032. .set_dtype(1, dtype::QuantizedS8(2.5f))
  1033. .set_dtype(2, dtype::QuantizedS32(6.25f))
  1034. .set_dtype(4, dtype::QuantizedS8(40.25f))
  1035. .set_display(false);
  1036. Benchmarker<ConvBias> benchmarker_float(handle());
  1037. benchmarker_float.set_display(false).set_times(RUNS);
  1038. auto run = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS) {
  1039. TensorShape src({N, IC, H, W}), filter({OC, IC, FS, FS}), bias({N, OC, 1, 1}),
  1040. z({}), dst({N, OC, H, W});
  1041. param.pad_h = FS / 2;
  1042. param.pad_w = FS / 2;
  1043. auto int_used =
  1044. benchmarker_int.set_param(param).exec({src, filter, bias, z, dst}) /
  1045. RUNS;
  1046. auto float_used =
  1047. benchmarker_float.set_param(param).exec({src, filter, bias, z, dst}) /
  1048. RUNS;
  1049. float computations = IC * (FS * FS + 1) * dst.total_nr_elems() * 2 * 1e-6;
  1050. printf("run: %s %s %s->%s \nfloat: %f ms %f Gflops int: %f ms "
  1051. "%f Gflops speedup: %f\n",
  1052. src.to_string().c_str(), filter.to_string().c_str(),
  1053. bias.to_string().c_str(), dst.to_string().c_str(), float_used,
  1054. computations / float_used, int_used, computations / int_used,
  1055. float_used / int_used);
  1056. };
  1057. run(1, 128, 128, 32, 32, 3);
  1058. for (size_t IC : {32, 64, 128}) {
  1059. for (size_t OC : {32, 64, 128}) {
  1060. for (size_t size : {28, 56}) {
  1061. for (size_t FS : {3, 5}) {
  1062. run(1, IC, OC, size, size, FS);
  1063. }
  1064. }
  1065. }
  1066. }
  1067. }
  1068. TEST_F(FALLBACK, BENCHMARK_GI_CONVBIAS_WINOGRAD_F23_4x4) {
  1069. #if MEGDNN_AARCH64
  1070. conv_bias::benchmark_winograd("WINOGRAD:AARCH64_F32_MK4_4x16:4:2", handle(), 3, 4);
  1071. #elif MEGDNN_ARMV7
  1072. conv_bias::benchmark_winograd("WINOGRAD:ARMV7_F32_MK4_4x8:4:2", handle(), 3, 4);
  1073. #else
  1074. conv_bias::benchmark_winograd("WINOGRAD:FB_GI_F32_MK4_4x8:4:2", handle(), 3, 4);
  1075. #endif
  1076. }
  1077. void benchmark_winograd_nchw_vs_nchw44(
  1078. const char* algo_name0, const char* algo_name1, Handle* handle) {
  1079. using namespace conv_bias;
  1080. using NLMode = param::ConvBias::NonlineMode;
  1081. std::vector<conv_bias::TestArg> args_nchw44;
  1082. std::vector<conv_bias::TestArg> args_nchw;
  1083. auto pack = [&](size_t n, size_t oc, size_t ic, size_t h, size_t w, size_t group,
  1084. NLMode nlmode) {
  1085. param::ConvBias param;
  1086. param.format = param::ConvBias::Format::NCHW44;
  1087. param.stride_h = 1;
  1088. param.stride_w = 1;
  1089. param.pad_h = 1;
  1090. param.pad_w = 1;
  1091. param.nonlineMode = nlmode;
  1092. if (group == 1) {
  1093. param.sparse = param::ConvBias::Sparse::DENSE;
  1094. args_nchw44.emplace_back(
  1095. param, TensorShape{n, ic / 4, h, w, 4},
  1096. TensorShape{oc / 4, ic / 4, 3, 3, 4, 4}, TensorShape{});
  1097. param.format = param::ConvBias::Format::NCHW;
  1098. args_nchw.emplace_back(
  1099. param, TensorShape{n, ic, h, w}, TensorShape{oc, ic, 3, 3},
  1100. TensorShape{});
  1101. } else {
  1102. auto oc_per_group = oc / group;
  1103. auto ic_per_group = ic / group;
  1104. param.sparse = param::ConvBias::Sparse::GROUP;
  1105. args_nchw44.emplace_back(
  1106. param, TensorShape{n, ic_per_group / 4, h, w, 4},
  1107. TensorShape{group, oc_per_group / 4, ic_per_group / 4, 3, 3, 4, 4},
  1108. TensorShape{});
  1109. param.format = param::ConvBias::Format::NCHW;
  1110. args_nchw.emplace_back(
  1111. param, TensorShape{n, ic, h, w},
  1112. TensorShape{group, oc_per_group, ic_per_group, 3, 3},
  1113. TensorShape{});
  1114. }
  1115. };
  1116. std::vector<NLMode> nonlinemode = {NLMode::IDENTITY};
  1117. for (auto nlmode : nonlinemode)
  1118. for (size_t n : {1})
  1119. for (size_t group = 1; group <= 1; ++group) {
  1120. pack(n, 512, 512, 15, 15, group, nlmode);
  1121. pack(n, 512, 256, 15, 15, group, nlmode);
  1122. pack(n, 256, 256, 29, 29, group, nlmode);
  1123. pack(n, 256, 128, 29, 29, group, nlmode);
  1124. pack(n, 128, 128, 57, 57, group, nlmode);
  1125. pack(n, 128, 64, 57, 57, group, nlmode);
  1126. pack(n, 24, 24, 224, 224, group, nlmode);
  1127. pack(n, 64, 24, 123, 123, group, nlmode);
  1128. pack(n, 64, 64, 56, 56, group, nlmode);
  1129. pack(n, 128, 128, 28, 28, group, nlmode);
  1130. pack(n, 256, 256, 14, 14, group, nlmode);
  1131. pack(n, 512, 512, 7, 7, group, nlmode);
  1132. }
  1133. using namespace conv_bias;
  1134. constexpr size_t RUN = 10;
  1135. Benchmarker<ConvBias> benchmark_winograd_nchw(handle);
  1136. benchmark_winograd_nchw.set_display(false);
  1137. benchmark_winograd_nchw.set_times(RUN);
  1138. Benchmarker<ConvBias> benchmark_winograd_nchw44(handle);
  1139. benchmark_winograd_nchw44.set_display(false);
  1140. benchmark_winograd_nchw44.set_times(RUN);
  1141. std::string winograd_nchw_algo_name = ssprintf("WINOGRAD:%s", algo_name0);
  1142. std::string winograd_nchw44_algo_name = ssprintf("WINOGRAD_NCHW44:%s", algo_name1);
  1143. for (size_t i = 0; i < args_nchw.size(); ++i) {
  1144. auto arg_nchw = args_nchw[i];
  1145. auto arg_nchw44 = args_nchw44[i];
  1146. TensorLayout dst_layout;
  1147. auto opr = handle->create_operator<ConvBias>();
  1148. opr->param() = arg_nchw.param;
  1149. opr->deduce_layout(
  1150. {arg_nchw.src, dtype::Float32()}, {arg_nchw.filter, dtype::Float32()},
  1151. {arg_nchw.bias, dtype::Float32()}, {}, dst_layout);
  1152. //! dst.nr_elems * IC * FH * FW * 2
  1153. float computations = dst_layout.total_nr_elems() * arg_nchw.filter[1] *
  1154. arg_nchw.filter[2] * arg_nchw.filter[3] * 2.0 /
  1155. (1024 * 1024 * 1024) * 1e3;
  1156. benchmark_winograd_nchw.set_param(arg_nchw.param);
  1157. auto nchw_used = algo_benchmark<ConvBias>(
  1158. benchmark_winograd_nchw,
  1159. {arg_nchw.src, arg_nchw.filter, {}, {}, {}},
  1160. winograd_nchw_algo_name.c_str()) /
  1161. RUN;
  1162. benchmark_winograd_nchw44.set_param(arg_nchw44.param);
  1163. auto nchw44_used = algo_benchmark<ConvBias>(
  1164. benchmark_winograd_nchw44,
  1165. {arg_nchw44.src, arg_nchw44.filter, {}, {}, {}},
  1166. winograd_nchw44_algo_name.c_str()) /
  1167. RUN;
  1168. printf("%s %s: nchw: %f ms %f Gflops nchw44: %f ms %f GFlops "
  1169. "speedup: "
  1170. "%f\n",
  1171. arg_nchw.src.to_string().c_str(), arg_nchw.filter.to_string().c_str(),
  1172. nchw_used, computations / nchw_used, nchw44_used,
  1173. computations / nchw44_used, nchw_used / nchw44_used);
  1174. }
  1175. }
  1176. TEST_F(FALLBACK, BENCHMARK_GI_CONVBIAS_WINOGRAD_F23_MK4_NCHW_VS_NCHW44) {
  1177. #if MEGDNN_AARCH64
  1178. benchmark_winograd_nchw_vs_nchw44(
  1179. "AARCH64_F32_MK4_4x16:4:2", "AARCH64_F32_MK4_4x16:4:2", handle());
  1180. #elif MEGDNN_ARMV7
  1181. benchmark_winograd_nchw_vs_nchw44(
  1182. "ARMV7_F32_MK4_4x8:4:2", "ARMV7_F32_MK4_4x8:4:2", handle());
  1183. #else
  1184. benchmark_winograd_nchw_vs_nchw44(
  1185. "FB_GI_F32_MK4_4x8:4:2", "FB_GI_F32_MK4_4x8:4:2", handle());
  1186. #endif
  1187. }
  1188. TEST_F(FALLBACK, BENCHMARK_GI_CONVBIAS_WINOGRAD_F63_4x4) {
  1189. #if MEGDNN_AARCH64
  1190. conv_bias::benchmark_winograd("WINOGRAD:AARCH64_F32_MK4_4x16:4:6", handle(), 3, 4);
  1191. #elif MEGDNN_ARMV7
  1192. conv_bias::benchmark_winograd("WINOGRAD:ARMV7_F32_MK4_4x8:4:6", handle(), 3, 4);
  1193. #else
  1194. conv_bias::benchmark_winograd("WINOGRAD:FB_GI_F32_MK4_4x8:4:6", handle(), 3, 4);
  1195. #endif
  1196. }
  1197. TEST_F(FALLBACK, BENCHMARK_GI_CONVBIAS_WINOGRAD_F63_MK4_NCHW_VS_NCHW44) {
  1198. #if MEGDNN_AARCH64
  1199. benchmark_winograd_nchw_vs_nchw44(
  1200. "AARCH64_F32_MK4_4x16:4:6", "AARCH64_F32_MK4_4x16:4:6", handle());
  1201. #elif MEGDNN_ARMV7
  1202. benchmark_winograd_nchw_vs_nchw44(
  1203. "ARMV7_F32_MK4_4x8:4:6", "ARMV7_F32_MK4_4x8:4:6", handle());
  1204. #else
  1205. benchmark_winograd_nchw_vs_nchw44(
  1206. "FB_GI_F32_MK4_4x8:4:6", "FB_GI_F32_MK4_4x8:4:6", handle());
  1207. #endif
  1208. }
  1209. #endif
  1210. } // namespace test
  1211. } // namespace megdnn
  1212. // vim: syntax=cpp.doxygen