You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

local_share.cpp 47 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215
  1. /**
  2. * \file dnn/test/cuda/local_share.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. */
  11. #include "megdnn/oprs/nn.h"
  12. #include "src/common/utils.h"
  13. #include "test/common/checker.h"
  14. #include "test/common/convolution.h"
  15. #include "test/common/tensor.h"
  16. #include "test/common/workspace_wrapper.h"
  17. #include "test/cuda/benchmark.h"
  18. #include "test/cuda/fixture.h"
  19. #include "test/cuda/utils.h"
  20. using namespace megdnn;
  21. using namespace test;
  22. namespace {
// Parameter bundle describing one local-share convolution test case.
// b: batch size; c: output channels (filter shape is {sg, sg, ic, f, f, c});
// f: square kernel size; p: padding; s: stride; h/w: input spatial size;
// sg: number of spatial groups along each spatial dimension.
struct LocalShareArgs {
    size_t b, c, f, p, s, h, w, sg;
};
  26. std::vector<LocalShareArgs> get_local_share_conv_1x1_args_lar_bs() {
  27. std::vector<LocalShareArgs> ret;
  28. // clang-format off
  29. for (size_t b : {32, 64}) {
  30. for (size_t c : {32, 16, 8}) {
  31. for (size_t f : {1}) {
  32. for (int p : {0}) {
  33. for (size_t s : {1, 2}) {
  34. for (size_t h : {8, 16}) {
  35. for (size_t w : {2, 3, 4, 5, 6, 7, 8, 9, 10, 16, 24, 32, 33}) {
  36. for (size_t sg : {3, 2}) {
  37. size_t ho = infer_conv_shape(h, f, s, p);
  38. size_t wo = infer_conv_shape(w, f, s, p);
  39. if (ho % sg != 0 || wo % sg != 0)
  40. continue;
  41. ret.emplace_back(LocalShareArgs{b, c, f, static_cast<size_t>(p),
  42. s, h, w, sg});
  43. } } } } } } } }
  44. // clang-format on
  45. return ret;
  46. }
  47. std::vector<LocalShareArgs> get_local_share_conv_3x3_args_lar_bs() {
  48. std::vector<LocalShareArgs> ret;
  49. // clang-format off
  50. for (size_t b : {32, 64}) {
  51. for (size_t c : {32, 16, 8}) {
  52. for (size_t f : {3}) {
  53. for (int p : {static_cast<int>(f / 2), 0}) {
  54. for (size_t s : {1, 2}) {
  55. for (size_t h : {8, 16}) {
  56. for (size_t w : {3, 4, 5, 6, 7, 8, 9, 10, 16, 24, 32, 33}) {
  57. for (size_t sg : {3, 2}) {
  58. size_t ho = infer_conv_shape(h, f, s, p);
  59. size_t wo = infer_conv_shape(w, f, s, p);
  60. if (ho % sg != 0 || wo % sg != 0)
  61. continue;
  62. ret.emplace_back(LocalShareArgs{b, c, f, static_cast<size_t>(p),
  63. s, h, w, sg});
  64. } } } } } } } }
  65. // clang-format on
  66. return ret;
  67. }
  68. std::vector<LocalShareArgs> get_local_share_conv_5x5_args_lar_bs() {
  69. std::vector<LocalShareArgs> ret;
  70. // clang-format off
  71. for (size_t b : {32, 64}) {
  72. for (size_t c : {32, 16, 8}) {
  73. for (size_t f : {5}) {
  74. for (int p : {static_cast<int>(f / 2), 0}) {
  75. for (size_t s : {1, 2}) {
  76. for (size_t h : {8, 16}) {
  77. for (size_t w : {8, 9, 10, 16, 24, 32, 33}) {
  78. for (size_t sg : {3, 2}) {
  79. size_t ho = infer_conv_shape(h, f, s, p);
  80. size_t wo = infer_conv_shape(w, f, s, p);
  81. if (ho % sg != 0 || wo % sg != 0)
  82. continue;
  83. ret.emplace_back(LocalShareArgs{b, c, f, static_cast<size_t>(p), s,
  84. h, w, sg});
  85. } } } } } } } }
  86. // clang-format on
  87. return ret;
  88. }
  89. std::vector<LocalShareArgs> get_local_share_conv_7x7_args_lar_bs() {
  90. std::vector<LocalShareArgs> ret;
  91. // clang-format off
  92. for (size_t b : {32, 64}) {
  93. for (size_t c : {32, 16, 8}) {
  94. for (size_t f : {7}) {
  95. for (int p : {static_cast<int>(f / 2), 0}) {
  96. for (size_t s : {1, 2}) {
  97. for (size_t h : {8, 16}) {
  98. for (size_t w : {8, 9, 10, 16, 24, 32, 33}) {
  99. for (size_t sg : {3, 2}) {
  100. size_t ho = infer_conv_shape(h, f, s, p);
  101. size_t wo = infer_conv_shape(w, f, s, p);
  102. if (ho % sg != 0 || wo % sg != 0)
  103. continue;
  104. ret.emplace_back(LocalShareArgs{b, c, f, static_cast<size_t>(p), s,
  105. h, w, sg});
  106. } } } } } } } }
  107. // clang-format on
  108. return ret;
  109. }
  110. std::vector<LocalShareArgs> get_local_share_conv_small_image(size_t kernel_size) {
  111. size_t f = kernel_size;
  112. std::vector<LocalShareArgs> ret;
  113. // clang-format off
  114. for (size_t b : {8, 16, 32, 48, 64}) {
  115. for (size_t c : {8, 16, 32, 48, 64, 96, 128}) {
  116. for (int p : {static_cast<int>(f / 2), 0}) {
  117. for (size_t s : {1, 2}) {
  118. for (size_t h : {12}) {
  119. for (size_t w : {12}) {
  120. for (size_t sg : {3, 2}) {
  121. size_t ho = infer_conv_shape(h, f, s, p);
  122. size_t wo = infer_conv_shape(w, f, s, p);
  123. if (ho % sg != 0 || wo % sg != 0)
  124. continue;
  125. ret.emplace_back(LocalShareArgs{b, c, f, static_cast<size_t>(p), s,
  126. h, w, sg});
  127. } } } } } } }
  128. // clang-format on
  129. return ret;
  130. }
  131. std::vector<LocalShareArgs> get_local_share_conv_small_image() {
  132. std::vector<LocalShareArgs> ret = get_local_share_conv_small_image(3);
  133. auto ret1 = get_local_share_conv_small_image(5);
  134. auto ret2 = get_local_share_conv_small_image(7);
  135. ret.insert(ret.begin(), ret1.begin(), ret1.end());
  136. ret.insert(ret.begin(), ret2.begin(), ret2.end());
  137. return ret;
  138. }
  139. void test_local_share_bwd_data_implicit_gemm(size_t kernel_size,
  140. Handle* handle) {
  141. Checker<LocalShareBackwardData> checker(handle);
  142. bool require_algo = false;
  143. checker.set_before_exec_callback(AlgoChecker<LocalShareBackwardData>(
  144. "LOCAL_SHARE_IMPLICIT_GEMM", &require_algo));
  145. using Param = LocalShare::Param;
  146. ConstValue const_0{0};
  147. auto args = get_local_share_conv_small_image(kernel_size);
  148. for (auto&& arg : args) {
  149. static_cast<void>(arg);
  150. size_t b = arg.b, c = arg.c, f = arg.f, p = arg.p, s = arg.s, h = arg.h,
  151. w = arg.w, sg = arg.sg;
  152. size_t ho = infer_conv_shape(h, f, s, p),
  153. wo = infer_conv_shape(w, f, s, p);
  154. Param param;
  155. param.stride_h = param.stride_w = s;
  156. param.pad_h = param.pad_w = p;
  157. param.spatial_groups_h = param.spatial_groups_w = sg;
  158. checker.set_param(param);
  159. checker.set_rng(2, &const_0);
  160. TensorShape diff{b, c, ho, wo}, filter{sg, sg, 4, f, f, c},
  161. grad{b, 4, h, w};
  162. checker.execs({filter, diff, grad});
  163. diff = TensorShape{b, c, ho, wo},
  164. filter = TensorShape{sg, sg, 8, f, f, c};
  165. grad = {b, 8, h, w};
  166. checker.exec({filter, diff, grad});
  167. }
  168. }
  169. } // namespace
  170. TEST_F(CUDA, LOCAL_SHARE_FORWARD_1x1_LAR_BS) {
  171. require_compute_capability(6, 0);
  172. Checker<LocalShare> checker(handle_cuda());
  173. bool require_algo = false;
  174. checker.set_before_exec_callback(AlgoChecker<LocalShare>(
  175. "LOCAL_SHARE_CHWN_BATCH_SIZE_AWARE", &require_algo));
  176. using Param = LocalShare::Param;
  177. auto args = get_local_share_conv_1x1_args_lar_bs();
  178. for (auto&& arg : args) {
  179. size_t b = arg.b, c = arg.c, f = arg.f, p = arg.p, s = arg.s, h = arg.h,
  180. w = arg.w, sg = arg.sg;
  181. Param param;
  182. param.stride_h = param.stride_w = s;
  183. param.pad_h = param.pad_w = p;
  184. param.spatial_groups_h = param.spatial_groups_w = sg;
  185. checker.set_param(param);
  186. TensorShape src{b, 4, h, w}, filter{sg, sg, 4, f, f, c};
  187. checker.execs({src, filter, {}});
  188. src = TensorShape{b, 8, h, w}, filter = TensorShape{sg, sg, 8, f, f, c};
  189. checker.exec({src, filter, {}});
  190. }
  191. }
  192. TEST_F(CUDA, LOCAL_SHARE_FORWARD_3x3_LAR_BS) {
  193. require_compute_capability(6, 0);
  194. Checker<LocalShare> checker(handle_cuda());
  195. bool require_algo = false;
  196. checker.set_before_exec_callback(AlgoChecker<LocalShare>(
  197. "LOCAL_SHARE_CHWN_BATCH_SIZE_AWARE", &require_algo));
  198. using Param = LocalShare::Param;
  199. auto args = get_local_share_conv_3x3_args_lar_bs();
  200. ConstValue const_1{1};
  201. for (auto&& arg : args) {
  202. size_t b = arg.b, c = arg.c, f = arg.f, p = arg.p, s = arg.s, h = arg.h,
  203. w = arg.w, sg = arg.sg;
  204. Param param;
  205. param.stride_h = param.stride_w = s;
  206. param.pad_h = param.pad_w = p;
  207. param.spatial_groups_h = param.spatial_groups_w = sg;
  208. checker.set_param(param);
  209. TensorShape src{b, 4, h, w}, filter{sg, sg, 4, f, f, c};
  210. checker.execs({src, filter, {}});
  211. src = TensorShape{b, 8, h, w}, filter = TensorShape{sg, sg, 8, f, f, c};
  212. checker.exec({src, filter, {}});
  213. }
  214. }
  215. TEST_F(CUDA, LOCAL_SHARE_FORWARD_5x5_LAR_BS) {
  216. require_compute_capability(6, 0);
  217. Checker<LocalShare> checker(handle_cuda());
  218. bool require_algo = false;
  219. checker.set_before_exec_callback(AlgoChecker<LocalShare>(
  220. "LOCAL_SHARE_CHWN_BATCH_SIZE_AWARE", &require_algo));
  221. using Param = LocalShare::Param;
  222. auto args = get_local_share_conv_5x5_args_lar_bs();
  223. for (auto&& arg : args) {
  224. size_t b = arg.b, c = arg.c, f = arg.f, p = arg.p, s = arg.s, h = arg.h,
  225. w = arg.w, sg = arg.sg;
  226. Param param;
  227. param.stride_h = param.stride_w = s;
  228. param.pad_h = param.pad_w = p;
  229. param.spatial_groups_h = param.spatial_groups_w = sg;
  230. checker.set_param(param);
  231. TensorShape src{b, 4, h, w}, filter{sg, sg, 4, f, f, c};
  232. checker.execs({src, filter, {}});
  233. src = TensorShape{b, 8, h, w}, filter = TensorShape{sg, sg, 8, f, f, c};
  234. checker.exec({src, filter, {}});
  235. }
  236. }
  237. TEST_F(CUDA, LOCAL_SHARE_FORWARD_7x7_LAR_BS) {
  238. require_compute_capability(6, 0);
  239. Checker<LocalShare> checker(handle_cuda());
  240. bool require_algo = false;
  241. checker.set_before_exec_callback(AlgoChecker<LocalShare>(
  242. "LOCAL_SHARE_CHWN_BATCH_SIZE_AWARE", &require_algo));
  243. using Param = LocalShare::Param;
  244. auto args = get_local_share_conv_7x7_args_lar_bs();
  245. for (auto&& arg : args) {
  246. size_t b = arg.b, c = arg.c, f = arg.f, p = arg.p, s = arg.s, h = arg.h,
  247. w = arg.w, sg = arg.sg;
  248. Param param;
  249. param.stride_h = param.stride_w = s;
  250. param.pad_h = param.pad_w = p;
  251. param.spatial_groups_h = param.spatial_groups_w = sg;
  252. checker.set_param(param);
  253. TensorShape src{b, 4, h, w}, filter{sg, sg, 4, f, f, c};
  254. checker.execs({src, filter, {}});
  255. src = TensorShape{b, 8, h, w}, filter = TensorShape{sg, sg, 8, f, f, c};
  256. checker.exec({src, filter, {}});
  257. }
  258. }
  259. TEST_F(CUDA, LOCAL_SHARE_BATCHED_MATMUL) {
  260. Checker<LocalShare> checker(handle_cuda());
  261. bool require_algo = false;
  262. checker.set_before_exec_callback(AlgoChecker<LocalShare>(
  263. "LOCAL_SHARE_BATCHED_MATMUL", &require_algo));
  264. using Param = LocalShare::Param;
  265. auto args = convolution::get_args();
  266. for (size_t sg : {2, 3}) {
  267. for (auto&& arg : args) {
  268. if (arg.param.sparse != LocalShare::Param::Sparse::DENSE)
  269. continue;
  270. if (arg.param.format != LocalShare::Param::Format::NCHW)
  271. continue;
  272. if (arg.param.dilate_h != 1 || arg.param.dilate_w != 1)
  273. continue;
  274. Param param;
  275. param.sparse = arg.param.sparse;
  276. param.stride_h = arg.param.stride_h,
  277. param.stride_w = arg.param.stride_w;
  278. param.pad_h = arg.param.pad_h, param.pad_w = arg.param.pad_w;
  279. param.dilate_h = arg.param.dilate_h,
  280. param.dilate_w = arg.param.dilate_w;
  281. param.spatial_groups_h = param.spatial_groups_w = sg;
  282. size_t ho = infer_conv_shape(arg.src[2], arg.filter[2],
  283. param.stride_h, param.pad_h);
  284. size_t wo = infer_conv_shape(arg.src[3], arg.filter[3],
  285. param.stride_w, param.pad_w);
  286. if (ho % sg != 0 || wo % sg != 0)
  287. continue;
  288. TensorShape filter{sg,
  289. sg,
  290. arg.filter[1],
  291. arg.filter[2],
  292. arg.filter[3],
  293. arg.filter[0]};
  294. checker.set_param(param);
  295. checker.exec({arg.src, filter, {}});
  296. }
  297. }
  298. }
  299. TEST_F(CUDA, GROUP_LOCAL_SHARE_BATCHED_MATMUL) {
  300. Checker<LocalShare> checker(handle_cuda());
  301. bool require_algo = false;
  302. checker.set_before_exec_callback(AlgoChecker<LocalShare>(
  303. "LOCAL_SHARE_BATCHED_MATMUL", &require_algo));
  304. using Param = LocalShare::Param;
  305. auto args = convolution::get_args();
  306. for (size_t sg : {2, 3}) {
  307. for (auto&& arg : args) {
  308. if (arg.param.sparse != LocalShare::Param::Sparse::DENSE)
  309. continue;
  310. if (arg.param.format != LocalShare::Param::Format::NCHW)
  311. continue;
  312. if (arg.param.dilate_h != 1 || arg.param.dilate_w != 1)
  313. continue;
  314. if (arg.filter.ndim != 4)
  315. continue;
  316. Param param;
  317. param.sparse = Param::Sparse::GROUP;
  318. param.stride_h = arg.param.stride_h,
  319. param.stride_w = arg.param.stride_w;
  320. param.pad_h = arg.param.pad_h, param.pad_w = arg.param.pad_w;
  321. param.dilate_h = arg.param.dilate_h,
  322. param.dilate_w = arg.param.dilate_w;
  323. param.spatial_groups_h = param.spatial_groups_w = sg;
  324. size_t ho = infer_conv_shape(arg.src[2], arg.filter[2],
  325. param.stride_h, param.pad_h);
  326. size_t wo = infer_conv_shape(arg.src[3], arg.filter[3],
  327. param.stride_w, param.pad_w);
  328. if (ho % sg != 0 || wo % sg != 0)
  329. continue;
  330. size_t nr_groups = 3;
  331. TensorShape filter{nr_groups,
  332. sg,
  333. sg,
  334. arg.filter[1],
  335. arg.filter[2],
  336. arg.filter[3],
  337. arg.filter[0]};
  338. TensorShape src{arg.src[0], arg.src[1] * nr_groups, arg.src[2],
  339. arg.src[3]};
  340. checker.set_param(param);
  341. checker.exec({src, filter, {}});
  342. }
  343. }
  344. }
// Forward local-share via LOCAL_SHARE_CHWN_BATCH_SIZE_AWARE_SMALL_IMAGE,
// driven by the generic convolution test-case list.
TEST_F(CUDA, LOCAL_SHARE_FORWARD_SMALL_IMAGE_GENERAL) {
    require_compute_capability(6, 0);
    Checker<LocalShare> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(AlgoChecker<LocalShare>(
            "LOCAL_SHARE_CHWN_BATCH_SIZE_AWARE_SMALL_IMAGE", &require_algo));
    using Param = LocalShare::Param;
    auto args = convolution::get_args();
    for (size_t sg : {2, 3}) {
        for (auto&& arg : args) {
            // only dense NCHW cases without dilation map onto local-share
            if (arg.param.sparse != LocalShare::Param::Sparse::DENSE)
                continue;
            if (arg.param.format != LocalShare::Param::Format::NCHW)
                continue;
            if (arg.param.dilate_h != 1 || arg.param.dilate_w != 1)
                continue;
            Param param;
            param.stride_h = arg.param.stride_h,
            param.stride_w = arg.param.stride_w;
            param.pad_h = arg.param.pad_h, param.pad_w = arg.param.pad_w;
            param.dilate_h = arg.param.dilate_h,
            param.dilate_w = arg.param.dilate_w;
            param.spatial_groups_h = param.spatial_groups_w = sg;
            size_t ho = infer_conv_shape(arg.src[2], arg.filter[2],
                                         param.stride_h, param.pad_h);
            size_t wo = infer_conv_shape(arg.src[3], arg.filter[3],
                                         param.stride_w, param.pad_w);
            // output must split evenly into spatial groups
            if (ho % sg != 0 || wo % sg != 0)
                continue;
            // round input channels up to a multiple of 4; note this adds 4
            // even when already aligned -- presumably an algorithm channel
            // requirement, TODO confirm
            arg.filter[1] = arg.filter[1] + (4 - arg.filter[1] % 4);
            arg.src[1] = arg.filter[1];
            // local-share filter layout: {sgh, sgw, ic, fh, fw, oc}
            TensorShape filter{sg,
                               sg,
                               arg.filter[1],
                               arg.filter[2],
                               arg.filter[3],
                               arg.filter[0]};
            checker.set_param(param);
            checker.exec({arg.src, filter, {}});
        }
    }
}
  387. TEST_F(CUDA, LOCAL_SHARE_FORWARD_SMALL_IMAGE_SPECIAL) {
  388. require_compute_capability(6, 0);
  389. Checker<LocalShare> checker(handle_cuda());
  390. bool require_algo = false;
  391. checker.set_before_exec_callback(AlgoChecker<LocalShare>(
  392. "LOCAL_SHARE_CHWN_BATCH_SIZE_AWARE_SMALL_IMAGE", &require_algo));
  393. using Param = LocalShare::Param;
  394. auto args = get_local_share_conv_small_image();
  395. for (auto&& arg : args) {
  396. size_t b = arg.b, c = arg.c, f = arg.f, p = arg.p, s = arg.s, h = arg.h,
  397. w = arg.w, sg = arg.sg;
  398. Param param;
  399. param.stride_h = param.stride_w = s;
  400. param.pad_h = param.pad_w = p;
  401. param.spatial_groups_h = param.spatial_groups_w = sg;
  402. checker.set_param(param);
  403. TensorShape src{b, 4, h, w}, filter{sg, sg, 4, f, f, c};
  404. checker.execs({src, filter, {}});
  405. src = TensorShape{b, 8, h, w}, filter = TensorShape{sg, sg, 8, f, f, c};
  406. checker.exec({src, filter, {}});
  407. }
  408. }
// Backward-data local-share via LOCAL_SHARE_IMPLICIT_GEMM, driven by the
// generic convolution test-case list.
TEST_F(CUDA, LOCAL_SHARE_BWD_DATA_IMPLICIT_GEMM_GENERAL) {
    require_compute_capability(6, 0);
    Checker<LocalShareBackwardData> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(AlgoChecker<LocalShareBackwardData>(
            "LOCAL_SHARE_IMPLICIT_GEMM", &require_algo));
    using Param = LocalShare::Param;
    auto args = convolution::get_args();
    ConstValue const_0{0};
    for (size_t sg : {2, 3}) {
        for (auto&& arg : args) {
            // only dense NCHW cases without dilation map onto local-share
            if (arg.param.sparse != LocalShare::Param::Sparse::DENSE)
                continue;
            if (arg.param.format != LocalShare::Param::Format::NCHW)
                continue;
            if (arg.param.dilate_h != 1 || arg.param.dilate_w != 1)
                continue;
            Param param;
            param.stride_h = arg.param.stride_h,
            param.stride_w = arg.param.stride_w;
            param.pad_h = arg.param.pad_h, param.pad_w = arg.param.pad_w;
            param.dilate_h = arg.param.dilate_h,
            param.dilate_w = arg.param.dilate_w;
            param.spatial_groups_h = param.spatial_groups_w = sg;
            size_t ho = infer_conv_shape(arg.src[2], arg.filter[2],
                                         param.stride_h, param.pad_h);
            size_t wo = infer_conv_shape(arg.src[3], arg.filter[3],
                                         param.stride_w, param.pad_w);
            // output must split evenly into spatial groups
            if (ho % sg != 0 || wo % sg != 0)
                continue;
            // round output channels up to a multiple of 4; note this adds 4
            // even when already aligned -- presumably an algorithm channel
            // requirement, TODO confirm
            arg.filter[0] = arg.filter[0] + (4 - arg.filter[0] % 4);
            // local-share filter layout: {sgh, sgw, ic, fh, fw, oc}
            TensorShape filter{sg,
                               sg,
                               arg.filter[1],
                               arg.filter[2],
                               arg.filter[3],
                               arg.filter[0]};
            TensorShape diff{arg.src[0], arg.filter[0], ho, wo};
            checker.set_param(param);
            // tensor 2 (grad) is output-only; initialize with zeros
            checker.set_rng(2, &const_0);
            checker.exec({filter, diff, arg.src});
        }
    }
}
// Backward-data implicit-GEMM on small-image cases, 3x3 kernels.
TEST_F(CUDA, LOCAL_SHARE_BWD_DATA_IMPLICIT_GEMM_SPECIAL_PART1) {
    require_compute_capability(6, 0);
    test_local_share_bwd_data_implicit_gemm(3, handle_cuda());
}
// Backward-data implicit-GEMM on small-image cases, 5x5 kernels.
TEST_F(CUDA, LOCAL_SHARE_BWD_DATA_IMPLICIT_GEMM_SPECIAL_PART2) {
    require_compute_capability(6, 0);
    test_local_share_bwd_data_implicit_gemm(5, handle_cuda());
}
// Backward-data implicit-GEMM on small-image cases, 7x7 kernels.
TEST_F(CUDA, LOCAL_SHARE_BWD_DATA_IMPLICIT_GEMM_SPECIAL_PART3) {
    require_compute_capability(6, 0);
    test_local_share_bwd_data_implicit_gemm(7, handle_cuda());
}
// Backward-data local-share via LOCAL_SHARE_BATCHED_MATMUL, driven by
// the generic convolution test-case list.
TEST_F(CUDA, LOCAL_SHARE_BWD_DATA_BATCHED_MATMUL) {
    Checker<LocalShareBackwardData> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(AlgoChecker<LocalShareBackwardData>(
            "LOCAL_SHARE_BATCHED_MATMUL", &require_algo));
    using Param = LocalShare::Param;
    auto args = convolution::get_args();
    ConstValue const_0{0};
    for (size_t sg : {2, 3}) {
        for (auto&& arg : args) {
            // only dense NCHW cases without dilation map onto local-share
            if (arg.param.sparse != LocalShare::Param::Sparse::DENSE)
                continue;
            if (arg.param.format != LocalShare::Param::Format::NCHW)
                continue;
            if (arg.param.dilate_h != 1 || arg.param.dilate_w != 1)
                continue;
            Param param;
            param.stride_h = arg.param.stride_h,
            param.stride_w = arg.param.stride_w;
            param.pad_h = arg.param.pad_h, param.pad_w = arg.param.pad_w;
            param.dilate_h = arg.param.dilate_h,
            param.dilate_w = arg.param.dilate_w;
            param.spatial_groups_h = param.spatial_groups_w = sg;
            size_t ho = infer_conv_shape(arg.src[2], arg.filter[2],
                                         param.stride_h, param.pad_h);
            size_t wo = infer_conv_shape(arg.src[3], arg.filter[3],
                                         param.stride_w, param.pad_w);
            // output must split evenly into spatial groups
            if (ho % sg != 0 || wo % sg != 0)
                continue;
            // local-share filter layout: {sgh, sgw, ic, fh, fw, oc}
            TensorShape filter{sg,
                               sg,
                               arg.filter[1],
                               arg.filter[2],
                               arg.filter[3],
                               arg.filter[0]};
            TensorShape diff{arg.src[0], arg.filter[0], ho, wo};
            // tensor 2 (grad) is output-only; initialize with zeros
            checker.set_rng(2, &const_0);
            checker.set_param(param);
            checker.exec({filter, diff, arg.src});
        }
    }
}
// Grouped (sparse=GROUP) backward-data local-share via
// LOCAL_SHARE_BATCHED_MATMUL, with a fixed group count of 3.
TEST_F(CUDA, GROUP_LOCAL_SHARE_BWD_DATA_BATCHED_MATMUL) {
    Checker<LocalShareBackwardData> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(AlgoChecker<LocalShareBackwardData>(
            "LOCAL_SHARE_BATCHED_MATMUL", &require_algo));
    using Param = LocalShare::Param;
    auto args = convolution::get_args();
    ConstValue const_0{0};
    for (size_t sg : {2, 3}) {
        for (auto&& arg : args) {
            // only dense NCHW cases without dilation map onto local-share
            if (arg.param.sparse != LocalShare::Param::Sparse::DENSE)
                continue;
            if (arg.param.format != LocalShare::Param::Format::NCHW)
                continue;
            if (arg.param.dilate_h != 1 || arg.param.dilate_w != 1)
                continue;
            Param param;
            param.sparse = Param::Sparse::GROUP;
            param.stride_h = arg.param.stride_h,
            param.stride_w = arg.param.stride_w;
            param.pad_h = arg.param.pad_h, param.pad_w = arg.param.pad_w;
            param.dilate_h = arg.param.dilate_h,
            param.dilate_w = arg.param.dilate_w;
            param.spatial_groups_h = param.spatial_groups_w = sg;
            size_t ho = infer_conv_shape(arg.src[2], arg.filter[2],
                                         param.stride_h, param.pad_h);
            size_t wo = infer_conv_shape(arg.src[3], arg.filter[3],
                                         param.stride_w, param.pad_w);
            // output must split evenly into spatial groups
            if (ho % sg != 0 || wo % sg != 0)
                continue;
            size_t nr_groups = 3;
            // grouped filter layout: {groups, sgh, sgw, ic, fh, fw, oc}
            TensorShape filter{nr_groups,
                               sg,
                               sg,
                               arg.filter[1],
                               arg.filter[2],
                               arg.filter[3],
                               arg.filter[0]};
            // per-group channels are replicated nr_groups times in diff/grad
            TensorShape diff{arg.src[0], arg.filter[0] * nr_groups, ho, wo};
            TensorShape grad{arg.src[0], arg.src[1] * nr_groups, arg.src[2],
                             arg.src[3]};
            // tensor 2 (grad) is output-only; initialize with zeros
            checker.set_rng(2, &const_0);
            checker.set_param(param);
            checker.exec({filter, diff, grad});
        }
    }
}
// Backward-filter local-share via LOCAL_SHARE_IMPLICIT_GEMM, driven by
// the generic convolution test-case list.
TEST_F(CUDA, LOCAL_SHARE_BWD_FILTER_IMPLICIT_GEMM_GENERAL) {
    require_compute_capability(6, 0);
    Checker<LocalShareBackwardFilter> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(AlgoChecker<LocalShareBackwardFilter>(
            "LOCAL_SHARE_IMPLICIT_GEMM", &require_algo));
    using Param = LocalShare::Param;
    auto args = convolution::get_args();
    ConstValue const_0{0};
    for (size_t sg : {2, 3}) {
        for (auto&& arg : args) {
            // only dense NCHW cases without dilation map onto local-share
            if (arg.param.sparse != LocalShare::Param::Sparse::DENSE)
                continue;
            if (arg.param.format != LocalShare::Param::Format::NCHW)
                continue;
            if (arg.param.dilate_h != 1 || arg.param.dilate_w != 1)
                continue;
            Param param;
            param.stride_h = arg.param.stride_h,
            param.stride_w = arg.param.stride_w;
            param.pad_h = arg.param.pad_h, param.pad_w = arg.param.pad_w;
            param.dilate_h = arg.param.dilate_h,
            param.dilate_w = arg.param.dilate_w;
            param.spatial_groups_h = param.spatial_groups_w = sg;
            size_t ho = infer_conv_shape(arg.src[2], arg.filter[2],
                                         param.stride_h, param.pad_h);
            size_t wo = infer_conv_shape(arg.src[3], arg.filter[3],
                                         param.stride_w, param.pad_w);
            // output must split evenly into spatial groups
            if (ho % sg != 0 || wo % sg != 0)
                continue;
            // round batch size up to a multiple of 4; note this adds 4 even
            // when already aligned -- presumably an algorithm batch
            // requirement, TODO confirm
            arg.src[0] = arg.src[0] + (4 - arg.src[0] % 4);
            // grad here is the filter gradient: {sgh, sgw, ic, fh, fw, oc}
            TensorShape grad{sg,
                             sg,
                             arg.filter[1],
                             arg.filter[2],
                             arg.filter[3],
                             arg.filter[0]};
            TensorShape diff{arg.src[0], arg.filter[0], ho, wo};
            checker.set_param(param);
            // tensor 2 (grad) is output-only; initialize with zeros
            checker.set_rng(2, &const_0);
            checker.exec({arg.src, diff, grad});
        }
    }
}
  598. TEST_F(CUDA, LOCAL_SHARE_BWD_FILTER_IMPLICIT_GEMM_SPECIAL) {
  599. require_compute_capability(6, 0);
  600. Checker<LocalShareBackwardFilter> checker(handle_cuda());
  601. bool require_algo = false;
  602. checker.set_before_exec_callback(AlgoChecker<LocalShareBackwardFilter>(
  603. "LOCAL_SHARE_IMPLICIT_GEMM", &require_algo));
  604. using Param = LocalShare::Param;
  605. ConstValue const_0{0};
  606. auto args = get_local_share_conv_small_image();
  607. for (auto&& arg : args) {
  608. static_cast<void>(arg);
  609. size_t b = arg.b, c = arg.c, f = arg.f, p = arg.p, s = arg.s, h = arg.h,
  610. w = arg.w, sg = arg.sg;
  611. size_t ho = infer_conv_shape(h, f, s, p),
  612. wo = infer_conv_shape(w, f, s, p);
  613. Param param;
  614. param.stride_h = param.stride_w = s;
  615. param.pad_h = param.pad_w = p;
  616. param.spatial_groups_h = param.spatial_groups_w = sg;
  617. checker.set_param(param);
  618. checker.set_rng(2, &const_0);
  619. TensorShape diff{b, c, ho, wo}, grad{sg, sg, 4, f, f, c},
  620. src{b, 4, h, w};
  621. checker.execs({src, diff, grad});
  622. src = {b, 8, h, w};
  623. diff = TensorShape{b, c, ho, wo},
  624. grad = TensorShape{sg, sg, 8, f, f, c};
  625. checker.exec({src, diff, grad});
  626. }
  627. }
// Backward-filter local-share via LOCAL_SHARE_BATCHED_MATMUL, driven by
// the generic convolution test-case list.
TEST_F(CUDA, LOCAL_SHARE_BWD_FILTER_BATCHED_MATMUL) {
    Checker<LocalShareBackwardFilter> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(AlgoChecker<LocalShareBackwardFilter>(
            "LOCAL_SHARE_BATCHED_MATMUL", &require_algo));
    using Param = LocalShare::Param;
    auto args = convolution::get_args();
    ConstValue const_0{0};
    for (size_t sg : {2, 3}) {
        for (auto&& arg : args) {
            // only dense NCHW cases without dilation map onto local-share
            if (arg.param.sparse != LocalShare::Param::Sparse::DENSE)
                continue;
            if (arg.param.format != LocalShare::Param::Format::NCHW)
                continue;
            if (arg.param.dilate_h != 1 || arg.param.dilate_w != 1)
                continue;
            Param param;
            param.stride_h = arg.param.stride_h,
            param.stride_w = arg.param.stride_w;
            param.pad_h = arg.param.pad_h, param.pad_w = arg.param.pad_w;
            param.dilate_h = arg.param.dilate_h,
            param.dilate_w = arg.param.dilate_w;
            param.spatial_groups_h = param.spatial_groups_w = sg;
            size_t ho = infer_conv_shape(arg.src[2], arg.filter[2],
                                         param.stride_h, param.pad_h);
            size_t wo = infer_conv_shape(arg.src[3], arg.filter[3],
                                         param.stride_w, param.pad_w);
            // output must split evenly into spatial groups
            if (ho % sg != 0 || wo % sg != 0)
                continue;
            // grad here is the filter gradient: {sgh, sgw, ic, fh, fw, oc}
            TensorShape grad{sg,
                             sg,
                             arg.filter[1],
                             arg.filter[2],
                             arg.filter[3],
                             arg.filter[0]};
            TensorShape diff{arg.src[0], arg.filter[0], ho, wo};
            // tensor 2 (grad) is output-only; initialize with zeros
            checker.set_rng(2, &const_0);
            checker.set_param(param);
            checker.exec({arg.src, diff, grad});
        }
    }
}
// Grouped (sparse=GROUP) backward-filter local-share via
// LOCAL_SHARE_BATCHED_MATMUL, with a fixed group count of 3.
TEST_F(CUDA, GROUP_LOCAL_SHARE_BWD_FILTER_BATCHED_MATMUL) {
    Checker<LocalShareBackwardFilter> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(AlgoChecker<LocalShareBackwardFilter>(
            "LOCAL_SHARE_BATCHED_MATMUL", &require_algo));
    using Param = LocalShare::Param;
    auto args = convolution::get_args();
    ConstValue const_0{0};
    for (size_t sg : {2, 3}) {
        for (auto&& arg : args) {
            // only dense NCHW cases without dilation map onto local-share
            if (arg.param.sparse != LocalShare::Param::Sparse::DENSE)
                continue;
            if (arg.param.format != LocalShare::Param::Format::NCHW)
                continue;
            if (arg.param.dilate_h != 1 || arg.param.dilate_w != 1)
                continue;
            Param param;
            param.sparse = Param::Sparse::GROUP;
            param.stride_h = arg.param.stride_h,
            param.stride_w = arg.param.stride_w;
            param.pad_h = arg.param.pad_h, param.pad_w = arg.param.pad_w;
            param.dilate_h = arg.param.dilate_h,
            param.dilate_w = arg.param.dilate_w;
            param.spatial_groups_h = param.spatial_groups_w = sg;
            size_t ho = infer_conv_shape(arg.src[2], arg.filter[2],
                                         param.stride_h, param.pad_h);
            size_t wo = infer_conv_shape(arg.src[3], arg.filter[3],
                                         param.stride_w, param.pad_w);
            // output must split evenly into spatial groups
            if (ho % sg != 0 || wo % sg != 0)
                continue;
            size_t nr_groups = 3;
            // grouped filter gradient: {groups, sgh, sgw, ic, fh, fw, oc}
            TensorShape grad{nr_groups,
                             sg,
                             sg,
                             arg.filter[1],
                             arg.filter[2],
                             arg.filter[3],
                             arg.filter[0]};
            // per-group channels are replicated nr_groups times in diff/src
            TensorShape diff{arg.src[0], arg.filter[0] * nr_groups, ho, wo};
            TensorShape src{arg.src[0], arg.src[1] * nr_groups, arg.src[2],
                            arg.src[3]};
            // tensor 2 (grad) is output-only; initialize with zeros
            checker.set_rng(2, &const_0);
            checker.set_param(param);
            checker.exec({src, diff, grad});
        }
    }
}
  717. #if MEGDNN_WITH_BENCHMARK
  718. TEST_F(CUDA, BENCHMARK_LOCAL_SHARE_BWD_FILTER) {
  719. CUBenchmarker<LocalShareBackwardFilter> bencher(handle_cuda());
  720. size_t RUNS = 1000;
  721. bencher.set_display(false).set_times(RUNS);
  722. std::unique_ptr<OprProxy<LocalShareBackwardFilter>> proxy{
  723. new OprProxy<LocalShareBackwardFilter>{true}};
  724. bencher.set_proxy(proxy);
  725. LocalShare::Param param;
  726. NormalRNG rng;
  727. auto run = [&](size_t batch, size_t ic, size_t ih, size_t iw, size_t oc,
  728. size_t f, size_t s, size_t sg) {
  729. param.pad_h = f / 2;
  730. param.pad_w = f / 2;
  731. param.stride_h = s;
  732. param.stride_w = s;
  733. param.spatial_groups_h = sg;
  734. param.spatial_groups_w = sg;
  735. TensorShape src = {batch, ic, ih, iw}, grad = {sg, sg, ic, f, f, oc};
  736. size_t ho = infer_conv_shape(ih, f, s, f / 2);
  737. size_t wo = infer_conv_shape(iw, f, s, f / 2);
  738. TensorShape diff = {batch, oc, ho, wo};
  739. float flo = 2.0 * batch * oc * ho * wo * ic * f * f / (1e12);
  740. bencher.set_param(param)
  741. .set_dtype(0, dtype::Float32())
  742. .set_dtype(1, dtype::Float32())
  743. .set_dtype(2, dtype::Float32())
  744. .set_rng(0, &rng)
  745. .set_rng(1, &rng);
  746. bencher.proxy()->target_execution_policy.algo.reset();
  747. auto time_in_ms = bencher.execs({src, diff, grad}) / RUNS;
  748. printf("src=%s, diff=%s, grad=%s, float32: %.2fms "
  749. "%.2fTFlops\n",
  750. src.to_string().c_str(), diff.to_string().c_str(),
  751. grad.to_string().c_str(), time_in_ms,
  752. (flo / (time_in_ms * 1e-3)));
  753. };
  754. // stride = 1
  755. run(32, 128, 24, 24, 128, 1, 1, 3);
  756. run(32, 256, 12, 12, 256, 1, 1, 3);
  757. // stride = 2
  758. run(32, 256, 12, 12, 512, 1, 2, 3);
  759. run(32, 512, 6, 6, 1024, 1, 2, 3);
  760. // stride = 1
  761. run(32, 128, 24, 24, 128, 3, 1, 3);
  762. run(32, 256, 12, 12, 256, 3, 1, 3);
  763. // stride = 2
  764. run(32, 128, 24, 24, 256, 3, 2, 3);
  765. run(32, 256, 12, 12, 512, 3, 2, 3);
  766. // stride = 1
  767. run(64, 128, 24, 24, 128, 1, 1, 3);
  768. run(64, 256, 12, 12, 256, 1, 1, 3);
  769. // stride = 2
  770. run(64, 256, 12, 12, 512, 1, 2, 3);
  771. run(64, 512, 6, 6, 1024, 1, 2, 3);
  772. // stride = 1
  773. run(64, 128, 24, 24, 128, 3, 1, 3);
  774. run(64, 256, 12, 12, 256, 3, 1, 3);
  775. // stride = 2
  776. run(64, 128, 24, 24, 256, 3, 2, 3);
  777. run(64, 256, 12, 12, 512, 3, 2, 3);
  778. }
  779. TEST_F(CUDA, BENCHMARK_GROUP_LOCAL_SHARE_FORWARD) {
  780. CUBenchmarker<LocalShare> bencher(handle_cuda());
  781. size_t RUNS = 1000;
  782. bencher.set_display(false).set_times(RUNS);
  783. std::unique_ptr<OprProxy<LocalShareForward>> proxy{
  784. new OprProxy<LocalShareForward>{true}};
  785. bencher.set_proxy(proxy);
  786. LocalShare::Param param;
  787. NormalRNG rng;
  788. auto run = [&](size_t batch, size_t ic, size_t ih, size_t iw, size_t oc,
  789. size_t f, size_t s, size_t sg) {
  790. param.pad_h = f / 2;
  791. param.pad_w = f / 2;
  792. param.stride_h = s;
  793. param.stride_w = s;
  794. param.spatial_groups_h = sg;
  795. param.spatial_groups_w = sg;
  796. param.sparse = LocalShare::Param::Sparse::GROUP;
  797. TensorShape src = {1, batch * ic, ih, iw},
  798. filter = {batch, sg, sg, ic, f, f, oc};
  799. size_t ho = infer_conv_shape(ih, f, s, f / 2);
  800. size_t wo = infer_conv_shape(iw, f, s, f / 2);
  801. float flo = 2.0 * batch * oc * ho * wo * ic * f * f / (1e12);
  802. bencher.set_param(param)
  803. .set_dtype(0, dtype::Float32())
  804. .set_dtype(1, dtype::Float32())
  805. .set_dtype(2, dtype::Float32())
  806. .set_rng(0, &rng)
  807. .set_rng(1, &rng);
  808. bencher.proxy()->target_execution_policy.algo.reset();
  809. auto time_in_ms = bencher.execs({src, filter, {}}) / RUNS;
  810. ;
  811. printf("src=%s, filter=%s, float32: %.2fms %.2fTFlops\n",
  812. src.to_string().c_str(), filter.to_string().c_str(), time_in_ms,
  813. (flo / (time_in_ms * 1e-3)));
  814. };
  815. // stride = 1
  816. run(32, 128, 24, 24, 128, 1, 1, 3);
  817. run(32, 256, 12, 12, 256, 1, 1, 3);
  818. // stride = 2
  819. run(32, 256, 12, 12, 512, 1, 2, 3);
  820. run(32, 512, 6, 6, 1024, 1, 2, 3);
  821. // stride = 1
  822. run(64, 128, 24, 24, 128, 1, 1, 3);
  823. run(64, 256, 12, 12, 256, 1, 1, 3);
  824. // stride = 2
  825. run(64, 256, 12, 12, 512, 1, 2, 3);
  826. run(64, 512, 6, 6, 1024, 1, 2, 3);
  827. }
  828. TEST_F(CUDA, BENCHMARK_LOCAL_SHARE_BWD_DATA) {
  829. CUBenchmarker<LocalShareBackwardData> bencher(handle_cuda());
  830. size_t RUNS = 1000;
  831. bencher.set_display(false).set_times(RUNS);
  832. std::unique_ptr<OprProxy<LocalShareBackwardData>> proxy{
  833. new OprProxy<LocalShareBackwardData>{true}};
  834. bencher.set_proxy(proxy);
  835. LocalShare::Param param;
  836. NormalRNG rng;
  837. auto run = [&](size_t batch, size_t ic, size_t ih, size_t iw, size_t oc,
  838. size_t f, size_t s, size_t sg) {
  839. param.pad_h = f / 2;
  840. param.pad_w = f / 2;
  841. param.stride_h = s;
  842. param.stride_w = s;
  843. param.spatial_groups_h = sg;
  844. param.spatial_groups_w = sg;
  845. TensorShape grad = {batch, ic, ih, iw}, filter = {sg, sg, ic, f, f, oc};
  846. size_t ho = infer_conv_shape(ih, f, s, f / 2);
  847. size_t wo = infer_conv_shape(iw, f, s, f / 2);
  848. TensorShape diff = {batch, oc, ho, wo};
  849. float flo = 2.0 * batch * oc * ho * wo * ic * f * f / (1e12);
  850. bencher.set_param(param)
  851. .set_dtype(0, dtype::Float32())
  852. .set_dtype(1, dtype::Float32())
  853. .set_dtype(2, dtype::Float32())
  854. .set_rng(0, &rng)
  855. .set_rng(1, &rng);
  856. bencher.proxy()->target_execution_policy.algo.reset();
  857. auto time_in_ms = bencher.execs({filter, diff, grad}) / RUNS;
  858. printf("filter=%s, diff=%s, grad=%s, float32: %.2fms "
  859. "%.2fTFlops\n",
  860. filter.to_string().c_str(), diff.to_string().c_str(),
  861. grad.to_string().c_str(), time_in_ms,
  862. (flo / (time_in_ms * 1e-3)));
  863. };
  864. // stride = 1
  865. run(32, 128, 24, 24, 128, 1, 1, 3);
  866. run(32, 256, 12, 12, 256, 1, 1, 3);
  867. // stride = 2
  868. run(32, 256, 12, 12, 512, 1, 2, 3);
  869. run(32, 512, 6, 6, 1024, 1, 2, 3);
  870. // stride = 1
  871. run(32, 128, 24, 24, 128, 3, 1, 3);
  872. run(32, 256, 12, 12, 256, 3, 1, 3);
  873. // stride = 2
  874. run(32, 128, 24, 24, 256, 3, 2, 3);
  875. run(32, 256, 12, 12, 512, 3, 2, 3);
  876. // stride = 1
  877. run(64, 128, 24, 24, 128, 1, 1, 3);
  878. run(64, 256, 12, 12, 256, 1, 1, 3);
  879. // stride = 2
  880. run(64, 256, 12, 12, 512, 1, 2, 3);
  881. run(64, 512, 6, 6, 1024, 1, 2, 3);
  882. // stride = 1
  883. run(64, 128, 24, 24, 128, 3, 1, 3);
  884. run(64, 256, 12, 12, 256, 3, 1, 3);
  885. // stride = 2
  886. run(64, 128, 24, 24, 256, 3, 2, 3);
  887. run(64, 256, 12, 12, 512, 3, 2, 3);
  888. }
  889. TEST_F(CUDA, BENCHMARK_LOCAL_SHARE_FORWARD_BOTTLENECK) {
  890. CUBenchmarker<LocalShare> bencher(handle_cuda());
  891. CUBenchmarker<Convolution> bencher_conv(handle_cuda());
  892. size_t RUNS = 1000;
  893. bencher.set_display(false).set_times(RUNS);
  894. std::unique_ptr<OprProxy<LocalShareForward>> proxy{
  895. new OprProxy<LocalShareForward>{true}};
  896. bencher.set_proxy(proxy);
  897. bencher_conv.set_display(false).set_times(RUNS);
  898. std::unique_ptr<OprProxy<Convolution>> conv_proxy{
  899. new OprProxy<Convolution>{true}};
  900. bencher_conv.set_proxy(conv_proxy);
  901. LocalShare::Param param;
  902. Convolution::Param conv_param;
  903. NormalRNG rng;
  904. auto run = [&](size_t batch, size_t ic, size_t ih, size_t iw, size_t oc,
  905. size_t f, size_t s, size_t sg) {
  906. param.pad_h = f / 2;
  907. param.pad_w = f / 2;
  908. param.stride_h = s;
  909. param.stride_w = s;
  910. param.spatial_groups_h = sg;
  911. param.spatial_groups_w = sg;
  912. conv_param.pad_h = f / 2;
  913. conv_param.pad_w = f / 2;
  914. conv_param.stride_h = s;
  915. conv_param.stride_w = s;
  916. TensorShape src = {batch, ic, ih, iw}, filter = {sg, sg, ic, f, f, oc};
  917. size_t ho = infer_conv_shape(ih, f, s, f / 2);
  918. size_t wo = infer_conv_shape(iw, f, s, f / 2);
  919. float flo = 2.0 * batch * oc * ho * wo * ic * f * f / (1e12);
  920. bencher.set_param(param)
  921. .set_dtype(0, dtype::Float32())
  922. .set_dtype(1, dtype::Float32())
  923. .set_dtype(2, dtype::Float32())
  924. .set_rng(0, &rng)
  925. .set_rng(1, &rng);
  926. bencher.proxy()->target_execution_policy.algo.reset();
  927. auto time_in_ms = bencher.execs({src, filter, {}}) / RUNS;
  928. bencher_conv.set_param(conv_param);
  929. bencher_conv.proxy()->target_execution_policy.algo.reset();
  930. auto time_in_ms_conv =
  931. bencher_conv.execs({src, {oc, ic, f, f}, {}}) / RUNS;
  932. printf("src=%s, filter=%s, float32: %.2fms %.2fTFlops, "
  933. "conv(float32): %.2fms %.2fTFlops, local_share/conv=%.2f\n",
  934. src.to_string().c_str(), filter.to_string().c_str(), time_in_ms,
  935. (flo / (time_in_ms * 1e-3)), time_in_ms_conv,
  936. (flo / (time_in_ms_conv * 1e-3)), time_in_ms / time_in_ms_conv);
  937. };
  938. // stride = 1
  939. run(32, 128, 24, 24, 128, 1, 1, 3);
  940. run(32, 256, 12, 12, 256, 1, 1, 3);
  941. // stride = 2
  942. run(32, 256, 12, 12, 512, 1, 2, 3);
  943. run(32, 512, 6, 6, 1024, 1, 2, 3);
  944. // stride = 1
  945. run(32, 128, 24, 24, 128, 3, 1, 3);
  946. run(32, 256, 12, 12, 256, 3, 1, 3);
  947. // stride = 2
  948. run(32, 128, 24, 24, 256, 3, 2, 3);
  949. run(32, 256, 12, 12, 512, 3, 2, 3);
  950. // stride = 1
  951. run(64, 128, 24, 24, 128, 1, 1, 3);
  952. run(64, 256, 12, 12, 256, 1, 1, 3);
  953. // stride = 2
  954. run(64, 256, 12, 12, 512, 1, 2, 3);
  955. run(64, 512, 6, 6, 1024, 1, 2, 3);
  956. // stride = 1
  957. run(64, 128, 24, 24, 128, 3, 1, 3);
  958. run(64, 256, 12, 12, 256, 3, 1, 3);
  959. // stride = 2
  960. run(64, 128, 24, 24, 256, 3, 2, 3);
  961. run(64, 256, 12, 12, 512, 3, 2, 3);
  962. }
  963. TEST_F(CUDA, BENCHMARK_LOCAL_SHARE_FORWARD_FROM_RESEARCH) {
  964. CUBenchmarker<LocalShare> bencher(handle_cuda());
  965. CUBenchmarker<Convolution> bencher_conv(handle_cuda());
  966. size_t RUNS = 1000;
  967. bencher.set_display(false).set_times(RUNS);
  968. std::unique_ptr<OprProxy<LocalShareForward>> proxy{
  969. new OprProxy<LocalShareForward>{true}};
  970. bencher.set_proxy(proxy);
  971. bencher_conv.set_display(false).set_times(RUNS);
  972. std::unique_ptr<OprProxy<Convolution>> conv_proxy{
  973. new OprProxy<Convolution>{true}};
  974. bencher_conv.set_proxy(conv_proxy);
  975. LocalShare::Param param;
  976. Convolution::Param conv_param;
  977. NormalRNG rng;
  978. auto run = [&](size_t batch, size_t ic, size_t ih, size_t iw, size_t oc,
  979. size_t f, size_t s, size_t sg) {
  980. param.pad_h = f / 2;
  981. param.pad_w = f / 2;
  982. param.stride_h = s;
  983. param.stride_w = s;
  984. param.spatial_groups_h = sg;
  985. param.spatial_groups_w = sg;
  986. conv_param.pad_h = f / 2;
  987. conv_param.pad_w = f / 2;
  988. conv_param.stride_h = s;
  989. conv_param.stride_w = s;
  990. TensorShape src = {batch, ic, ih, iw}, filter = {sg, sg, ic, f, f, oc};
  991. size_t ho = infer_conv_shape(ih, f, s, f / 2);
  992. size_t wo = infer_conv_shape(iw, f, s, f / 2);
  993. float flo = 2.0 * batch * oc * ho * wo * ic * f * f / (1e12);
  994. bencher.set_param(param)
  995. .set_dtype(0, dtype::Float32())
  996. .set_dtype(1, dtype::Float32())
  997. .set_dtype(2, dtype::Float32())
  998. .set_rng(0, &rng)
  999. .set_rng(1, &rng);
  1000. bencher.proxy()->target_execution_policy.algo.reset();
  1001. auto time_in_ms = bencher.execs({src, filter, {}}) / RUNS;
  1002. bencher_conv.set_param(conv_param);
  1003. bencher_conv.proxy()->target_execution_policy.algo.reset();
  1004. auto time_in_ms_conv =
  1005. bencher_conv.execs({src, {oc, ic, f, f}, {}}) / RUNS;
  1006. printf("src=%s, filter=%s, float32: %.2fms %.2fTFlops, "
  1007. "conv(float32): %.2fms %.2fTFlops, local_share/conv=%.2f\n",
  1008. src.to_string().c_str(), filter.to_string().c_str(), time_in_ms,
  1009. (flo / (time_in_ms * 1e-3)), time_in_ms_conv,
  1010. (flo / (time_in_ms_conv * 1e-3)), time_in_ms / time_in_ms_conv);
  1011. };
  1012. // stride = 1
  1013. run(64, 128, 24, 24, 128, 1, 1, 3);
  1014. run(64, 256, 12, 12, 256, 1, 1, 3);
  1015. run(64, 512, 6, 6, 512, 1, 1, 3);
  1016. run(64, 1024, 3, 3, 1024, 1, 1, 3);
  1017. // stride = 2
  1018. run(64, 128, 24, 24, 256, 1, 2, 3);
  1019. run(64, 256, 12, 12, 512, 1, 2, 3);
  1020. run(64, 512, 6, 6, 1024, 1, 2, 3);
  1021. // stride = 1
  1022. run(64, 128, 24, 24, 128, 3, 1, 3);
  1023. run(64, 256, 12, 12, 256, 3, 1, 3);
  1024. run(64, 512, 6, 6, 512, 3, 1, 3);
  1025. run(64, 1024, 3, 3, 1024, 3, 1, 3);
  1026. // stride = 2
  1027. run(64, 128, 24, 24, 256, 3, 2, 3);
  1028. run(64, 256, 12, 12, 512, 3, 2, 3);
  1029. run(64, 512, 6, 6, 1024, 3, 2, 3);
  1030. }
  1031. TEST_F(CUDA, BENCHMARK_LOCAL_SHARE_FORWARD) {
  1032. require_compute_capability(6, 0);
  1033. CUBenchmarker<LocalShare> bencher(handle_cuda());
  1034. CUBenchmarker<Convolution> bencher_conv(handle_cuda());
  1035. size_t RUNS = 200;
  1036. bencher.set_display(false).set_times(RUNS);
  1037. std::unique_ptr<OprProxy<LocalShareForward>> proxy{
  1038. new OprProxy<LocalShareForward>{true}};
  1039. bencher.set_proxy(proxy);
  1040. bencher_conv.set_display(false).set_times(RUNS);
  1041. std::unique_ptr<OprProxy<Convolution>> conv_proxy{
  1042. new OprProxy<Convolution>{true}};
  1043. bencher_conv.set_proxy(conv_proxy);
  1044. LocalShare::Param param;
  1045. Convolution::Param conv_param;
  1046. NormalRNG rng;
  1047. auto run = [&](size_t batch, size_t ic, size_t ih, size_t iw, size_t oc,
  1048. size_t f, size_t s, size_t sg) {
  1049. param.pad_h = f / 2;
  1050. param.pad_w = f / 2;
  1051. param.stride_h = s;
  1052. param.stride_w = s;
  1053. param.spatial_groups_h = sg;
  1054. param.spatial_groups_w = sg;
  1055. conv_param.pad_h = f / 2;
  1056. conv_param.pad_w = f / 2;
  1057. conv_param.stride_h = s;
  1058. conv_param.stride_w = s;
  1059. TensorShape src = {batch, ic, ih, iw}, filter = {sg, sg, ic, f, f, oc};
  1060. size_t ho = infer_conv_shape(ih, f, s, f / 2);
  1061. size_t wo = infer_conv_shape(iw, f, s, f / 2);
  1062. float flo = 2.0 * batch * oc * ho * wo * ic * f * f / (1e12);
  1063. bencher.set_param(param)
  1064. .set_dtype(0, dtype::Float32())
  1065. .set_dtype(1, dtype::Float32())
  1066. .set_dtype(2, dtype::Float32())
  1067. .set_rng(0, &rng)
  1068. .set_rng(1, &rng);
  1069. bencher.proxy()->target_execution_policy.algo.reset();
  1070. auto time_in_ms = bencher.execs({src, filter, {}}) / RUNS;
  1071. bencher_conv.set_param(conv_param);
  1072. bencher_conv.proxy()->target_execution_policy.algo.reset();
  1073. auto time_in_ms_conv =
  1074. bencher_conv.execs({src, {oc, ic, f, f}, {}}) / RUNS;
  1075. printf("src=%s, filter=%s, float32: %.2fms %.2fTFlops, "
  1076. "conv(float32): %.2fms %.2fTFlops, local_share/conv=%.2f\n",
  1077. src.to_string().c_str(), filter.to_string().c_str(), time_in_ms,
  1078. (flo / (time_in_ms * 1e-3)), time_in_ms_conv,
  1079. (flo / (time_in_ms_conv * 1e-3)), time_in_ms / time_in_ms_conv);
  1080. };
  1081. run(64, 256, 48, 48, 256, 7, 1, 3);
  1082. run(64, 128, 24, 24, 128, 7, 1, 3);
  1083. run(64, 256, 12, 12, 256, 7, 1, 3);
  1084. run(64, 512, 6, 6, 512, 7, 1, 3);
  1085. run(64, 256, 48, 48, 256, 5, 1, 3);
  1086. run(64, 128, 24, 24, 128, 5, 1, 3);
  1087. run(64, 256, 12, 12, 256, 5, 1, 3);
  1088. run(64, 512, 6, 6, 512, 5, 1, 3);
  1089. run(32, 64, 96, 96, 256, 7, 2, 3);
  1090. run(32, 128, 24, 24, 128, 7, 2, 3);
  1091. run(32, 256, 12, 12, 256, 7, 2, 3);
  1092. run(32, 64, 96, 96, 256, 5, 2, 3);
  1093. run(32, 128, 24, 24, 128, 5, 2, 3);
  1094. run(32, 256, 12, 12, 256, 5, 2, 3);
  1095. }
  1096. #endif
  1097. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并已安装好驱动。如果你想体验在云端 GPU 算力平台进行深度学习开发,欢迎访问 MegStudio 平台。