You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

local_share.cpp 45 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126
  1. #include "megdnn/oprs/nn.h"
  2. #include "src/common/utils.h"
  3. #include "test/common/checker.h"
  4. #include "test/common/convolution.h"
  5. #include "test/common/tensor.h"
  6. #include "test/common/workspace_wrapper.h"
  7. #include "test/cuda/benchmark.h"
  8. #include "test/cuda/fixture.h"
  9. #include "test/cuda/utils.h"
  10. using namespace megdnn;
  11. using namespace test;
  12. namespace {
// Shape parameters for one local-share convolution testcase.
// Field order matters: callers aggregate-initialize with
// LocalShareArgs{b, c, f, p, s, h, w, sg}.
struct LocalShareArgs {
    // b: batch, c: channel count used in the filter's last dim,
    // f: square kernel size, p: padding, s: stride,
    // h/w: input spatial size, sg: spatial group count per dimension
    size_t b, c, f, p, s, h, w, sg;
};
  16. std::vector<LocalShareArgs> get_local_share_conv_1x1_args_lar_bs() {
  17. std::vector<LocalShareArgs> ret;
  18. // clang-format off
  19. for (size_t b : {32, 64}) {
  20. for (size_t c : {32, 16, 8}) {
  21. for (size_t f : {1}) {
  22. for (int p : {0}) {
  23. for (size_t s : {1, 2}) {
  24. for (size_t h : {8, 16}) {
  25. for (size_t w : {2, 3, 4, 5, 6, 7, 8, 9, 10, 16, 24, 32, 33}) {
  26. for (size_t sg : {3, 2}) {
  27. size_t ho = infer_conv_shape(h, f, s, p);
  28. size_t wo = infer_conv_shape(w, f, s, p);
  29. if (ho % sg != 0 || wo % sg != 0)
  30. continue;
  31. ret.emplace_back(LocalShareArgs{b, c, f, static_cast<size_t>(p),
  32. s, h, w, sg});
  33. } } } } } } } }
  34. // clang-format on
  35. return ret;
  36. }
  37. std::vector<LocalShareArgs> get_local_share_conv_3x3_args_lar_bs() {
  38. std::vector<LocalShareArgs> ret;
  39. // clang-format off
  40. for (size_t b : {32, 64}) {
  41. for (size_t c : {32, 16, 8}) {
  42. for (size_t f : {3}) {
  43. for (int p : {static_cast<int>(f / 2), 0}) {
  44. for (size_t s : {1, 2}) {
  45. for (size_t h : {8, 16}) {
  46. for (size_t w : {3, 4, 5, 6, 7, 8, 9, 10, 16, 24, 32, 33}) {
  47. for (size_t sg : {3, 2}) {
  48. size_t ho = infer_conv_shape(h, f, s, p);
  49. size_t wo = infer_conv_shape(w, f, s, p);
  50. if (ho % sg != 0 || wo % sg != 0)
  51. continue;
  52. ret.emplace_back(LocalShareArgs{b, c, f, static_cast<size_t>(p),
  53. s, h, w, sg});
  54. } } } } } } } }
  55. // clang-format on
  56. return ret;
  57. }
  58. std::vector<LocalShareArgs> get_local_share_conv_5x5_args_lar_bs() {
  59. std::vector<LocalShareArgs> ret;
  60. // clang-format off
  61. for (size_t b : {32, 64}) {
  62. for (size_t c : {32, 16, 8}) {
  63. for (size_t f : {5}) {
  64. for (int p : {static_cast<int>(f / 2), 0}) {
  65. for (size_t s : {1, 2}) {
  66. for (size_t h : {8, 16}) {
  67. for (size_t w : {8, 9, 10, 16, 24, 32, 33}) {
  68. for (size_t sg : {3, 2}) {
  69. size_t ho = infer_conv_shape(h, f, s, p);
  70. size_t wo = infer_conv_shape(w, f, s, p);
  71. if (ho % sg != 0 || wo % sg != 0)
  72. continue;
  73. ret.emplace_back(LocalShareArgs{b, c, f, static_cast<size_t>(p), s,
  74. h, w, sg});
  75. } } } } } } } }
  76. // clang-format on
  77. return ret;
  78. }
  79. std::vector<LocalShareArgs> get_local_share_conv_7x7_args_lar_bs() {
  80. std::vector<LocalShareArgs> ret;
  81. // clang-format off
  82. for (size_t b : {32, 64}) {
  83. for (size_t c : {32, 16, 8}) {
  84. for (size_t f : {7}) {
  85. for (int p : {static_cast<int>(f / 2), 0}) {
  86. for (size_t s : {1, 2}) {
  87. for (size_t h : {8, 16}) {
  88. for (size_t w : {8, 9, 10, 16, 24, 32, 33}) {
  89. for (size_t sg : {3, 2}) {
  90. size_t ho = infer_conv_shape(h, f, s, p);
  91. size_t wo = infer_conv_shape(w, f, s, p);
  92. if (ho % sg != 0 || wo % sg != 0)
  93. continue;
  94. ret.emplace_back(LocalShareArgs{b, c, f, static_cast<size_t>(p), s,
  95. h, w, sg});
  96. } } } } } } } }
  97. // clang-format on
  98. return ret;
  99. }
  100. std::vector<LocalShareArgs> get_local_share_conv_small_image(size_t kernel_size) {
  101. size_t f = kernel_size;
  102. std::vector<LocalShareArgs> ret;
  103. // clang-format off
  104. for (size_t b : {8, 16, 32, 48, 64}) {
  105. for (size_t c : {8, 16, 32, 48, 64, 96, 128}) {
  106. for (int p : {static_cast<int>(f / 2), 0}) {
  107. for (size_t s : {1, 2}) {
  108. for (size_t h : {12}) {
  109. for (size_t w : {12}) {
  110. for (size_t sg : {3, 2}) {
  111. size_t ho = infer_conv_shape(h, f, s, p);
  112. size_t wo = infer_conv_shape(w, f, s, p);
  113. if (ho % sg != 0 || wo % sg != 0)
  114. continue;
  115. ret.emplace_back(LocalShareArgs{b, c, f, static_cast<size_t>(p), s,
  116. h, w, sg});
  117. } } } } } } }
  118. // clang-format on
  119. return ret;
  120. }
  121. std::vector<LocalShareArgs> get_local_share_conv_small_image() {
  122. std::vector<LocalShareArgs> ret = get_local_share_conv_small_image(3);
  123. auto ret1 = get_local_share_conv_small_image(5);
  124. auto ret2 = get_local_share_conv_small_image(7);
  125. ret.insert(ret.begin(), ret1.begin(), ret1.end());
  126. ret.insert(ret.begin(), ret2.begin(), ret2.end());
  127. return ret;
  128. }
  129. void test_local_share_bwd_data_implicit_gemm(size_t kernel_size, Handle* handle) {
  130. Checker<LocalShareBackwardData> checker(handle);
  131. bool require_algo = false;
  132. checker.set_before_exec_callback(AlgoChecker<LocalShareBackwardData>(
  133. "LOCAL_SHARE_IMPLICIT_GEMM", &require_algo));
  134. using Param = LocalShare::Param;
  135. ConstValue const_0{0};
  136. auto args = get_local_share_conv_small_image(kernel_size);
  137. for (auto&& arg : args) {
  138. static_cast<void>(arg);
  139. size_t b = arg.b, c = arg.c, f = arg.f, p = arg.p, s = arg.s, h = arg.h,
  140. w = arg.w, sg = arg.sg;
  141. size_t ho = infer_conv_shape(h, f, s, p), wo = infer_conv_shape(w, f, s, p);
  142. Param param;
  143. param.stride_h = param.stride_w = s;
  144. param.pad_h = param.pad_w = p;
  145. param.spatial_groups_h = param.spatial_groups_w = sg;
  146. checker.set_param(param);
  147. checker.set_rng(2, &const_0);
  148. TensorShape diff{b, c, ho, wo}, filter{sg, sg, 4, f, f, c}, grad{b, 4, h, w};
  149. checker.execs({filter, diff, grad});
  150. diff = TensorShape{b, c, ho, wo}, filter = TensorShape{sg, sg, 8, f, f, c};
  151. grad = {b, 8, h, w};
  152. checker.exec({filter, diff, grad});
  153. }
  154. }
  155. } // namespace
  156. TEST_F(CUDA, LOCAL_SHARE_FORWARD_1x1_LAR_BS) {
  157. require_compute_capability(6, 0);
  158. Checker<LocalShare> checker(handle_cuda());
  159. bool require_algo = false;
  160. checker.set_before_exec_callback(AlgoChecker<LocalShare>(
  161. "LOCAL_SHARE_CHWN_BATCH_SIZE_AWARE", &require_algo));
  162. using Param = LocalShare::Param;
  163. auto args = get_local_share_conv_1x1_args_lar_bs();
  164. for (auto&& arg : args) {
  165. size_t b = arg.b, c = arg.c, f = arg.f, p = arg.p, s = arg.s, h = arg.h,
  166. w = arg.w, sg = arg.sg;
  167. Param param;
  168. param.stride_h = param.stride_w = s;
  169. param.pad_h = param.pad_w = p;
  170. param.spatial_groups_h = param.spatial_groups_w = sg;
  171. checker.set_param(param);
  172. TensorShape src{b, 4, h, w}, filter{sg, sg, 4, f, f, c};
  173. checker.execs({src, filter, {}});
  174. src = TensorShape{b, 8, h, w}, filter = TensorShape{sg, sg, 8, f, f, c};
  175. checker.exec({src, filter, {}});
  176. }
  177. }
  178. TEST_F(CUDA, LOCAL_SHARE_FORWARD_3x3_LAR_BS) {
  179. require_compute_capability(6, 0);
  180. Checker<LocalShare> checker(handle_cuda());
  181. bool require_algo = false;
  182. checker.set_before_exec_callback(AlgoChecker<LocalShare>(
  183. "LOCAL_SHARE_CHWN_BATCH_SIZE_AWARE", &require_algo));
  184. using Param = LocalShare::Param;
  185. auto args = get_local_share_conv_3x3_args_lar_bs();
  186. ConstValue const_1{1};
  187. for (auto&& arg : args) {
  188. size_t b = arg.b, c = arg.c, f = arg.f, p = arg.p, s = arg.s, h = arg.h,
  189. w = arg.w, sg = arg.sg;
  190. Param param;
  191. param.stride_h = param.stride_w = s;
  192. param.pad_h = param.pad_w = p;
  193. param.spatial_groups_h = param.spatial_groups_w = sg;
  194. checker.set_param(param);
  195. TensorShape src{b, 4, h, w}, filter{sg, sg, 4, f, f, c};
  196. checker.execs({src, filter, {}});
  197. src = TensorShape{b, 8, h, w}, filter = TensorShape{sg, sg, 8, f, f, c};
  198. checker.exec({src, filter, {}});
  199. }
  200. }
  201. TEST_F(CUDA, LOCAL_SHARE_FORWARD_5x5_LAR_BS) {
  202. require_compute_capability(6, 0);
  203. Checker<LocalShare> checker(handle_cuda());
  204. bool require_algo = false;
  205. checker.set_before_exec_callback(AlgoChecker<LocalShare>(
  206. "LOCAL_SHARE_CHWN_BATCH_SIZE_AWARE", &require_algo));
  207. using Param = LocalShare::Param;
  208. auto args = get_local_share_conv_5x5_args_lar_bs();
  209. for (auto&& arg : args) {
  210. size_t b = arg.b, c = arg.c, f = arg.f, p = arg.p, s = arg.s, h = arg.h,
  211. w = arg.w, sg = arg.sg;
  212. Param param;
  213. param.stride_h = param.stride_w = s;
  214. param.pad_h = param.pad_w = p;
  215. param.spatial_groups_h = param.spatial_groups_w = sg;
  216. checker.set_param(param);
  217. TensorShape src{b, 4, h, w}, filter{sg, sg, 4, f, f, c};
  218. checker.execs({src, filter, {}});
  219. src = TensorShape{b, 8, h, w}, filter = TensorShape{sg, sg, 8, f, f, c};
  220. checker.exec({src, filter, {}});
  221. }
  222. }
  223. TEST_F(CUDA, LOCAL_SHARE_FORWARD_7x7_LAR_BS) {
  224. require_compute_capability(6, 0);
  225. Checker<LocalShare> checker(handle_cuda());
  226. bool require_algo = false;
  227. checker.set_before_exec_callback(AlgoChecker<LocalShare>(
  228. "LOCAL_SHARE_CHWN_BATCH_SIZE_AWARE", &require_algo));
  229. using Param = LocalShare::Param;
  230. auto args = get_local_share_conv_7x7_args_lar_bs();
  231. for (auto&& arg : args) {
  232. size_t b = arg.b, c = arg.c, f = arg.f, p = arg.p, s = arg.s, h = arg.h,
  233. w = arg.w, sg = arg.sg;
  234. Param param;
  235. param.stride_h = param.stride_w = s;
  236. param.pad_h = param.pad_w = p;
  237. param.spatial_groups_h = param.spatial_groups_w = sg;
  238. checker.set_param(param);
  239. TensorShape src{b, 4, h, w}, filter{sg, sg, 4, f, f, c};
  240. checker.execs({src, filter, {}});
  241. src = TensorShape{b, 8, h, w}, filter = TensorShape{sg, sg, 8, f, f, c};
  242. checker.exec({src, filter, {}});
  243. }
  244. }
// Forward local-share conv via the BATCHED_MATMUL algorithm, driven by the
// generic convolution testcase list with sg in {2, 3}.
TEST_F(CUDA, LOCAL_SHARE_BATCHED_MATMUL) {
    Checker<LocalShare> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(
            AlgoChecker<LocalShare>("LOCAL_SHARE_BATCHED_MATMUL", &require_algo));
    using Param = LocalShare::Param;
    auto args = convolution::get_args();
    for (size_t sg : {2, 3}) {
        for (auto&& arg : args) {
            // only dense, NCHW, non-dilated convolution cases apply here
            if (arg.param.sparse != LocalShare::Param::Sparse::DENSE)
                continue;
            if (arg.param.format != LocalShare::Param::Format::NCHW)
                continue;
            if (arg.param.dilate_h != 1 || arg.param.dilate_w != 1)
                continue;
            Param param;
            param.sparse = arg.param.sparse;
            param.stride_h = arg.param.stride_h, param.stride_w = arg.param.stride_w;
            param.pad_h = arg.param.pad_h, param.pad_w = arg.param.pad_w;
            param.dilate_h = arg.param.dilate_h, param.dilate_w = arg.param.dilate_w;
            param.spatial_groups_h = param.spatial_groups_w = sg;
            size_t ho = infer_conv_shape(
                    arg.src[2], arg.filter[2], param.stride_h, param.pad_h);
            size_t wo = infer_conv_shape(
                    arg.src[3], arg.filter[3], param.stride_w, param.pad_w);
            // output spatial dims must split evenly into sg x sg groups
            if (ho % sg != 0 || wo % sg != 0)
                continue;
            // rebuild the conv filter in local-share layout:
            // {sg, sg, filter[1], filter[2], filter[3], filter[0]}
            // (presumably {sgh, sgw, ic, fh, fw, oc} — inferred from the dense
            // conv filter layout, confirm against the opr definition)
            TensorShape filter{
                    sg, sg, arg.filter[1], arg.filter[2], arg.filter[3], arg.filter[0]};
            checker.set_param(param);
            checker.exec({arg.src, filter, {}});
        }
    }
}
// Grouped (Sparse::GROUP) forward local-share conv via BATCHED_MATMUL,
// fabricating a 3-group case out of each generic dense convolution testcase.
TEST_F(CUDA, GROUP_LOCAL_SHARE_BATCHED_MATMUL) {
    Checker<LocalShare> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(
            AlgoChecker<LocalShare>("LOCAL_SHARE_BATCHED_MATMUL", &require_algo));
    using Param = LocalShare::Param;
    auto args = convolution::get_args();
    for (size_t sg : {2, 3}) {
        for (auto&& arg : args) {
            // only dense, NCHW, non-dilated, 4-D-filter cases can be regrouped
            if (arg.param.sparse != LocalShare::Param::Sparse::DENSE)
                continue;
            if (arg.param.format != LocalShare::Param::Format::NCHW)
                continue;
            if (arg.param.dilate_h != 1 || arg.param.dilate_w != 1)
                continue;
            if (arg.filter.ndim != 4)
                continue;
            Param param;
            param.sparse = Param::Sparse::GROUP;
            param.stride_h = arg.param.stride_h, param.stride_w = arg.param.stride_w;
            param.pad_h = arg.param.pad_h, param.pad_w = arg.param.pad_w;
            param.dilate_h = arg.param.dilate_h, param.dilate_w = arg.param.dilate_w;
            param.spatial_groups_h = param.spatial_groups_w = sg;
            size_t ho = infer_conv_shape(
                    arg.src[2], arg.filter[2], param.stride_h, param.pad_h);
            size_t wo = infer_conv_shape(
                    arg.src[3], arg.filter[3], param.stride_w, param.pad_w);
            // output spatial dims must split evenly into sg x sg groups
            if (ho % sg != 0 || wo % sg != 0)
                continue;
            // replicate the dense case across 3 channel groups: the filter
            // gains a leading group dim and src channels are multiplied
            size_t nr_groups = 3;
            TensorShape filter{
                    nr_groups, sg, sg, arg.filter[1], arg.filter[2],
                    arg.filter[3], arg.filter[0]};
            TensorShape src{arg.src[0], arg.src[1] * nr_groups, arg.src[2], arg.src[3]};
            checker.set_param(param);
            checker.exec({src, filter, {}});
        }
    }
}
// Forward local-share conv with the small-image batch-size-aware CHWN
// algorithm, over the generic convolution testcases.
TEST_F(CUDA, LOCAL_SHARE_FORWARD_SMALL_IMAGE_GENERAL) {
    require_compute_capability(6, 0);
    Checker<LocalShare> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(AlgoChecker<LocalShare>(
            "LOCAL_SHARE_CHWN_BATCH_SIZE_AWARE_SMALL_IMAGE", &require_algo));
    using Param = LocalShare::Param;
    auto args = convolution::get_args();
    for (size_t sg : {2, 3}) {
        for (auto&& arg : args) {
            // only dense, NCHW, non-dilated cases apply
            if (arg.param.sparse != LocalShare::Param::Sparse::DENSE)
                continue;
            if (arg.param.format != LocalShare::Param::Format::NCHW)
                continue;
            if (arg.param.dilate_h != 1 || arg.param.dilate_w != 1)
                continue;
            Param param;
            param.stride_h = arg.param.stride_h, param.stride_w = arg.param.stride_w;
            param.pad_h = arg.param.pad_h, param.pad_w = arg.param.pad_w;
            param.dilate_h = arg.param.dilate_h, param.dilate_w = arg.param.dilate_w;
            param.spatial_groups_h = param.spatial_groups_w = sg;
            size_t ho = infer_conv_shape(
                    arg.src[2], arg.filter[2], param.stride_h, param.pad_h);
            size_t wo = infer_conv_shape(
                    arg.src[3], arg.filter[3], param.stride_w, param.pad_w);
            // output spatial dims must split evenly into sg x sg groups
            if (ho % sg != 0 || wo % sg != 0)
                continue;
            // bump the channel count to a multiple of 4.
            // NOTE(review): when filter[1] is already a multiple of 4 this
            // still adds 4; also `arg` aliases the shared `args` vector, so
            // the sg=3 outer pass sees channels already bumped by the sg=2
            // pass — confirm both effects are intended.
            arg.filter[1] = arg.filter[1] + (4 - arg.filter[1] % 4);
            arg.src[1] = arg.filter[1];
            TensorShape filter{
                    sg, sg, arg.filter[1], arg.filter[2], arg.filter[3], arg.filter[0]};
            checker.set_param(param);
            checker.exec({arg.src, filter, {}});
        }
    }
}
  354. TEST_F(CUDA, LOCAL_SHARE_FORWARD_SMALL_IMAGE_SPECIAL) {
  355. require_compute_capability(6, 0);
  356. Checker<LocalShare> checker(handle_cuda());
  357. bool require_algo = false;
  358. checker.set_before_exec_callback(AlgoChecker<LocalShare>(
  359. "LOCAL_SHARE_CHWN_BATCH_SIZE_AWARE_SMALL_IMAGE", &require_algo));
  360. using Param = LocalShare::Param;
  361. auto args = get_local_share_conv_small_image();
  362. for (auto&& arg : args) {
  363. size_t b = arg.b, c = arg.c, f = arg.f, p = arg.p, s = arg.s, h = arg.h,
  364. w = arg.w, sg = arg.sg;
  365. Param param;
  366. param.stride_h = param.stride_w = s;
  367. param.pad_h = param.pad_w = p;
  368. param.spatial_groups_h = param.spatial_groups_w = sg;
  369. checker.set_param(param);
  370. TensorShape src{b, 4, h, w}, filter{sg, sg, 4, f, f, c};
  371. checker.execs({src, filter, {}});
  372. src = TensorShape{b, 8, h, w}, filter = TensorShape{sg, sg, 8, f, f, c};
  373. checker.exec({src, filter, {}});
  374. }
  375. }
// Backward-data local-share conv with IMPLICIT_GEMM over the generic
// convolution testcases.
TEST_F(CUDA, LOCAL_SHARE_BWD_DATA_IMPLICIT_GEMM_GENERAL) {
    require_compute_capability(6, 0);
    Checker<LocalShareBackwardData> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(AlgoChecker<LocalShareBackwardData>(
            "LOCAL_SHARE_IMPLICIT_GEMM", &require_algo));
    using Param = LocalShare::Param;
    auto args = convolution::get_args();
    ConstValue const_0{0};
    for (size_t sg : {2, 3}) {
        for (auto&& arg : args) {
            // only dense, NCHW, non-dilated cases apply
            if (arg.param.sparse != LocalShare::Param::Sparse::DENSE)
                continue;
            if (arg.param.format != LocalShare::Param::Format::NCHW)
                continue;
            if (arg.param.dilate_h != 1 || arg.param.dilate_w != 1)
                continue;
            Param param;
            param.stride_h = arg.param.stride_h, param.stride_w = arg.param.stride_w;
            param.pad_h = arg.param.pad_h, param.pad_w = arg.param.pad_w;
            param.dilate_h = arg.param.dilate_h, param.dilate_w = arg.param.dilate_w;
            param.spatial_groups_h = param.spatial_groups_w = sg;
            size_t ho = infer_conv_shape(
                    arg.src[2], arg.filter[2], param.stride_h, param.pad_h);
            size_t wo = infer_conv_shape(
                    arg.src[3], arg.filter[3], param.stride_w, param.pad_w);
            // output spatial dims must split evenly into sg x sg groups
            if (ho % sg != 0 || wo % sg != 0)
                continue;
            // bump filter[0] to a multiple of 4.
            // NOTE(review): adds 4 even when already a multiple of 4, and the
            // mutation persists into the sg=3 outer pass via the shared
            // `args` vector — confirm intended.
            arg.filter[0] = arg.filter[0] + (4 - arg.filter[0] % 4);
            TensorShape filter{
                    sg, sg, arg.filter[1], arg.filter[2], arg.filter[3], arg.filter[0]};
            TensorShape diff{arg.src[0], arg.filter[0], ho, wo};
            checker.set_param(param);
            // tensor index 2 (grad output) is zero-initialized
            checker.set_rng(2, &const_0);
            checker.exec({filter, diff, arg.src});
        }
    }
}
// Small-image bwd-data IMPLICIT_GEMM checks, 3x3-kernel slice.
TEST_F(CUDA, LOCAL_SHARE_BWD_DATA_IMPLICIT_GEMM_SPECIAL_PART1) {
    require_compute_capability(6, 0);
    test_local_share_bwd_data_implicit_gemm(3, handle_cuda());
}
// Small-image bwd-data IMPLICIT_GEMM checks, 5x5-kernel slice.
TEST_F(CUDA, LOCAL_SHARE_BWD_DATA_IMPLICIT_GEMM_SPECIAL_PART2) {
    require_compute_capability(6, 0);
    test_local_share_bwd_data_implicit_gemm(5, handle_cuda());
}
// Small-image bwd-data IMPLICIT_GEMM checks, 7x7-kernel slice.
TEST_F(CUDA, LOCAL_SHARE_BWD_DATA_IMPLICIT_GEMM_SPECIAL_PART3) {
    require_compute_capability(6, 0);
    test_local_share_bwd_data_implicit_gemm(7, handle_cuda());
}
// Backward-data local-share conv with BATCHED_MATMUL over the generic
// convolution testcases.
TEST_F(CUDA, LOCAL_SHARE_BWD_DATA_BATCHED_MATMUL) {
    Checker<LocalShareBackwardData> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(AlgoChecker<LocalShareBackwardData>(
            "LOCAL_SHARE_BATCHED_MATMUL", &require_algo));
    using Param = LocalShare::Param;
    auto args = convolution::get_args();
    ConstValue const_0{0};
    for (size_t sg : {2, 3}) {
        for (auto&& arg : args) {
            // only dense, NCHW, non-dilated cases apply
            if (arg.param.sparse != LocalShare::Param::Sparse::DENSE)
                continue;
            if (arg.param.format != LocalShare::Param::Format::NCHW)
                continue;
            if (arg.param.dilate_h != 1 || arg.param.dilate_w != 1)
                continue;
            Param param;
            param.stride_h = arg.param.stride_h, param.stride_w = arg.param.stride_w;
            param.pad_h = arg.param.pad_h, param.pad_w = arg.param.pad_w;
            param.dilate_h = arg.param.dilate_h, param.dilate_w = arg.param.dilate_w;
            param.spatial_groups_h = param.spatial_groups_w = sg;
            size_t ho = infer_conv_shape(
                    arg.src[2], arg.filter[2], param.stride_h, param.pad_h);
            size_t wo = infer_conv_shape(
                    arg.src[3], arg.filter[3], param.stride_w, param.pad_w);
            // output spatial dims must split evenly into sg x sg groups
            if (ho % sg != 0 || wo % sg != 0)
                continue;
            TensorShape filter{
                    sg, sg, arg.filter[1], arg.filter[2], arg.filter[3], arg.filter[0]};
            TensorShape diff{arg.src[0], arg.filter[0], ho, wo};
            // tensor index 2 (grad output) is zero-initialized
            checker.set_rng(2, &const_0);
            checker.set_param(param);
            checker.exec({filter, diff, arg.src});
        }
    }
}
// Grouped (Sparse::GROUP) backward-data local-share conv with BATCHED_MATMUL,
// fabricating a 3-group case out of each generic dense convolution testcase.
TEST_F(CUDA, GROUP_LOCAL_SHARE_BWD_DATA_BATCHED_MATMUL) {
    Checker<LocalShareBackwardData> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(AlgoChecker<LocalShareBackwardData>(
            "LOCAL_SHARE_BATCHED_MATMUL", &require_algo));
    using Param = LocalShare::Param;
    auto args = convolution::get_args();
    ConstValue const_0{0};
    for (size_t sg : {2, 3}) {
        for (auto&& arg : args) {
            // only dense, NCHW, non-dilated cases can be regrouped
            if (arg.param.sparse != LocalShare::Param::Sparse::DENSE)
                continue;
            if (arg.param.format != LocalShare::Param::Format::NCHW)
                continue;
            if (arg.param.dilate_h != 1 || arg.param.dilate_w != 1)
                continue;
            Param param;
            param.sparse = Param::Sparse::GROUP;
            param.stride_h = arg.param.stride_h, param.stride_w = arg.param.stride_w;
            param.pad_h = arg.param.pad_h, param.pad_w = arg.param.pad_w;
            param.dilate_h = arg.param.dilate_h, param.dilate_w = arg.param.dilate_w;
            param.spatial_groups_h = param.spatial_groups_w = sg;
            size_t ho = infer_conv_shape(
                    arg.src[2], arg.filter[2], param.stride_h, param.pad_h);
            size_t wo = infer_conv_shape(
                    arg.src[3], arg.filter[3], param.stride_w, param.pad_w);
            // output spatial dims must split evenly into sg x sg groups
            if (ho % sg != 0 || wo % sg != 0)
                continue;
            // replicate across 3 groups: filter gains a leading group dim,
            // diff/grad channel counts are multiplied accordingly
            size_t nr_groups = 3;
            TensorShape filter{
                    nr_groups, sg, sg, arg.filter[1], arg.filter[2],
                    arg.filter[3], arg.filter[0]};
            TensorShape diff{arg.src[0], arg.filter[0] * nr_groups, ho, wo};
            TensorShape grad{
                    arg.src[0], arg.src[1] * nr_groups, arg.src[2], arg.src[3]};
            // tensor index 2 (grad output) is zero-initialized
            checker.set_rng(2, &const_0);
            checker.set_param(param);
            checker.exec({filter, diff, grad});
        }
    }
}
// Backward-filter local-share conv with IMPLICIT_GEMM over the generic
// convolution testcases.
TEST_F(CUDA, LOCAL_SHARE_BWD_FILTER_IMPLICIT_GEMM_GENERAL) {
    require_compute_capability(6, 0);
    Checker<LocalShareBackwardFilter> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(AlgoChecker<LocalShareBackwardFilter>(
            "LOCAL_SHARE_IMPLICIT_GEMM", &require_algo));
    using Param = LocalShare::Param;
    auto args = convolution::get_args();
    ConstValue const_0{0};
    for (size_t sg : {2, 3}) {
        for (auto&& arg : args) {
            // only dense, NCHW, non-dilated cases apply
            if (arg.param.sparse != LocalShare::Param::Sparse::DENSE)
                continue;
            if (arg.param.format != LocalShare::Param::Format::NCHW)
                continue;
            if (arg.param.dilate_h != 1 || arg.param.dilate_w != 1)
                continue;
            Param param;
            param.stride_h = arg.param.stride_h, param.stride_w = arg.param.stride_w;
            param.pad_h = arg.param.pad_h, param.pad_w = arg.param.pad_w;
            param.dilate_h = arg.param.dilate_h, param.dilate_w = arg.param.dilate_w;
            param.spatial_groups_h = param.spatial_groups_w = sg;
            size_t ho = infer_conv_shape(
                    arg.src[2], arg.filter[2], param.stride_h, param.pad_h);
            size_t wo = infer_conv_shape(
                    arg.src[3], arg.filter[3], param.stride_w, param.pad_w);
            // output spatial dims must split evenly into sg x sg groups
            if (ho % sg != 0 || wo % sg != 0)
                continue;
            // bump the batch (src[0]) to a multiple of 4.
            // NOTE(review): adds 4 even when already a multiple of 4, and the
            // mutation persists into the sg=3 outer pass via the shared
            // `args` vector — confirm intended.
            arg.src[0] = arg.src[0] + (4 - arg.src[0] % 4);
            TensorShape grad{
                    sg, sg, arg.filter[1], arg.filter[2], arg.filter[3], arg.filter[0]};
            TensorShape diff{arg.src[0], arg.filter[0], ho, wo};
            checker.set_param(param);
            // tensor index 2 (grad output) is zero-initialized
            checker.set_rng(2, &const_0);
            checker.exec({arg.src, diff, grad});
        }
    }
}
  541. TEST_F(CUDA, LOCAL_SHARE_BWD_FILTER_IMPLICIT_GEMM_SPECIAL) {
  542. require_compute_capability(6, 0);
  543. Checker<LocalShareBackwardFilter> checker(handle_cuda());
  544. bool require_algo = false;
  545. checker.set_before_exec_callback(AlgoChecker<LocalShareBackwardFilter>(
  546. "LOCAL_SHARE_IMPLICIT_GEMM", &require_algo));
  547. using Param = LocalShare::Param;
  548. ConstValue const_0{0};
  549. auto args = get_local_share_conv_small_image();
  550. for (auto&& arg : args) {
  551. static_cast<void>(arg);
  552. size_t b = arg.b, c = arg.c, f = arg.f, p = arg.p, s = arg.s, h = arg.h,
  553. w = arg.w, sg = arg.sg;
  554. size_t ho = infer_conv_shape(h, f, s, p), wo = infer_conv_shape(w, f, s, p);
  555. Param param;
  556. param.stride_h = param.stride_w = s;
  557. param.pad_h = param.pad_w = p;
  558. param.spatial_groups_h = param.spatial_groups_w = sg;
  559. checker.set_param(param);
  560. checker.set_rng(2, &const_0);
  561. TensorShape diff{b, c, ho, wo}, grad{sg, sg, 4, f, f, c}, src{b, 4, h, w};
  562. checker.execs({src, diff, grad});
  563. src = {b, 8, h, w};
  564. diff = TensorShape{b, c, ho, wo}, grad = TensorShape{sg, sg, 8, f, f, c};
  565. checker.exec({src, diff, grad});
  566. }
  567. }
// Backward-filter local-share conv with BATCHED_MATMUL over the generic
// convolution testcases.
TEST_F(CUDA, LOCAL_SHARE_BWD_FILTER_BATCHED_MATMUL) {
    Checker<LocalShareBackwardFilter> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(AlgoChecker<LocalShareBackwardFilter>(
            "LOCAL_SHARE_BATCHED_MATMUL", &require_algo));
    using Param = LocalShare::Param;
    auto args = convolution::get_args();
    ConstValue const_0{0};
    for (size_t sg : {2, 3}) {
        for (auto&& arg : args) {
            // only dense, NCHW, non-dilated cases apply
            if (arg.param.sparse != LocalShare::Param::Sparse::DENSE)
                continue;
            if (arg.param.format != LocalShare::Param::Format::NCHW)
                continue;
            if (arg.param.dilate_h != 1 || arg.param.dilate_w != 1)
                continue;
            Param param;
            param.stride_h = arg.param.stride_h, param.stride_w = arg.param.stride_w;
            param.pad_h = arg.param.pad_h, param.pad_w = arg.param.pad_w;
            param.dilate_h = arg.param.dilate_h, param.dilate_w = arg.param.dilate_w;
            param.spatial_groups_h = param.spatial_groups_w = sg;
            size_t ho = infer_conv_shape(
                    arg.src[2], arg.filter[2], param.stride_h, param.pad_h);
            size_t wo = infer_conv_shape(
                    arg.src[3], arg.filter[3], param.stride_w, param.pad_w);
            // output spatial dims must split evenly into sg x sg groups
            if (ho % sg != 0 || wo % sg != 0)
                continue;
            TensorShape grad{
                    sg, sg, arg.filter[1], arg.filter[2], arg.filter[3], arg.filter[0]};
            TensorShape diff{arg.src[0], arg.filter[0], ho, wo};
            // tensor index 2 (grad output) is zero-initialized
            checker.set_rng(2, &const_0);
            checker.set_param(param);
            checker.exec({arg.src, diff, grad});
        }
    }
}
// Grouped (Sparse::GROUP) backward-filter local-share conv with
// BATCHED_MATMUL, fabricating a 3-group case out of each dense testcase.
TEST_F(CUDA, GROUP_LOCAL_SHARE_BWD_FILTER_BATCHED_MATMUL) {
    Checker<LocalShareBackwardFilter> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(AlgoChecker<LocalShareBackwardFilter>(
            "LOCAL_SHARE_BATCHED_MATMUL", &require_algo));
    using Param = LocalShare::Param;
    auto args = convolution::get_args();
    ConstValue const_0{0};
    for (size_t sg : {2, 3}) {
        for (auto&& arg : args) {
            // only dense, NCHW, non-dilated cases can be regrouped
            if (arg.param.sparse != LocalShare::Param::Sparse::DENSE)
                continue;
            if (arg.param.format != LocalShare::Param::Format::NCHW)
                continue;
            if (arg.param.dilate_h != 1 || arg.param.dilate_w != 1)
                continue;
            Param param;
            param.sparse = Param::Sparse::GROUP;
            param.stride_h = arg.param.stride_h, param.stride_w = arg.param.stride_w;
            param.pad_h = arg.param.pad_h, param.pad_w = arg.param.pad_w;
            param.dilate_h = arg.param.dilate_h, param.dilate_w = arg.param.dilate_w;
            param.spatial_groups_h = param.spatial_groups_w = sg;
            size_t ho = infer_conv_shape(
                    arg.src[2], arg.filter[2], param.stride_h, param.pad_h);
            size_t wo = infer_conv_shape(
                    arg.src[3], arg.filter[3], param.stride_w, param.pad_w);
            // output spatial dims must split evenly into sg x sg groups
            if (ho % sg != 0 || wo % sg != 0)
                continue;
            // replicate across 3 groups: grad (the filter) gains a leading
            // group dim; diff/src channels are multiplied accordingly
            size_t nr_groups = 3;
            TensorShape grad{
                    nr_groups, sg, sg, arg.filter[1], arg.filter[2],
                    arg.filter[3], arg.filter[0]};
            TensorShape diff{arg.src[0], arg.filter[0] * nr_groups, ho, wo};
            TensorShape src{arg.src[0], arg.src[1] * nr_groups, arg.src[2], arg.src[3]};
            // tensor index 2 (grad output) is zero-initialized
            checker.set_rng(2, &const_0);
            checker.set_param(param);
            checker.exec({src, diff, grad});
        }
    }
}
  644. #if MEGDNN_WITH_BENCHMARK
// Benchmark LocalShareBackwardFilter over representative shapes; prints
// per-config runtime (ms) and achieved TFLOPS.
TEST_F(CUDA, BENCHMARK_LOCAL_SHARE_BWD_FILTER) {
    CUBenchmarker<LocalShareBackwardFilter> bencher(handle_cuda());
    size_t RUNS = 1000;
    bencher.set_display(false).set_times(RUNS);
    // NOTE(review): the `{true}` ctor arg presumably enables algorithm
    // profiling/search in the proxy — confirm against OprProxy's definition
    std::unique_ptr<OprProxy<LocalShareBackwardFilter>> proxy{
            new OprProxy<LocalShareBackwardFilter>{true}};
    bencher.set_proxy(proxy);
    LocalShare::Param param;
    NormalRNG rng;
    auto run = [&](size_t batch, size_t ic, size_t ih, size_t iw, size_t oc, size_t f,
                   size_t s, size_t sg) {
        // f/2 padding keeps spatial dims when stride is 1
        param.pad_h = f / 2;
        param.pad_w = f / 2;
        param.stride_h = s;
        param.stride_w = s;
        param.spatial_groups_h = sg;
        param.spatial_groups_w = sg;
        TensorShape src = {batch, ic, ih, iw}, grad = {sg, sg, ic, f, f, oc};
        size_t ho = infer_conv_shape(ih, f, s, f / 2);
        size_t wo = infer_conv_shape(iw, f, s, f / 2);
        TensorShape diff = {batch, oc, ho, wo};
        // teraflop count: 2 * N * OC * HO * WO * IC * FH * FW
        float flo = 2.0 * batch * oc * ho * wo * ic * f * f / (1e12);
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        // clear any previously chosen algorithm so each shape re-selects
        bencher.proxy()->target_execution_policy.algo.reset();
        auto time_in_ms = bencher.execs({src, diff, grad}) / RUNS;
        printf("src=%s, diff=%s, grad=%s, float32: %.2fms "
               "%.2fTFlops\n",
               src.to_string().c_str(), diff.to_string().c_str(),
               grad.to_string().c_str(), time_in_ms, (flo / (time_in_ms * 1e-3)));
    };
    // stride = 1
    run(32, 128, 24, 24, 128, 1, 1, 3);
    run(32, 256, 12, 12, 256, 1, 1, 3);
    // stride = 2
    run(32, 256, 12, 12, 512, 1, 2, 3);
    run(32, 512, 6, 6, 1024, 1, 2, 3);
    // stride = 1
    run(32, 128, 24, 24, 128, 3, 1, 3);
    run(32, 256, 12, 12, 256, 3, 1, 3);
    // stride = 2
    run(32, 128, 24, 24, 256, 3, 2, 3);
    run(32, 256, 12, 12, 512, 3, 2, 3);
    // stride = 1
    run(64, 128, 24, 24, 128, 1, 1, 3);
    run(64, 256, 12, 12, 256, 1, 1, 3);
    // stride = 2
    run(64, 256, 12, 12, 512, 1, 2, 3);
    run(64, 512, 6, 6, 1024, 1, 2, 3);
    // stride = 1
    run(64, 128, 24, 24, 128, 3, 1, 3);
    run(64, 256, 12, 12, 256, 3, 1, 3);
    // stride = 2
    run(64, 128, 24, 24, 256, 3, 2, 3);
    run(64, 256, 12, 12, 512, 3, 2, 3);
}
  705. TEST_F(CUDA, BENCHMARK_GROUP_LOCAL_SHARE_FORWARD) {
  706. CUBenchmarker<LocalShare> bencher(handle_cuda());
  707. size_t RUNS = 1000;
  708. bencher.set_display(false).set_times(RUNS);
  709. std::unique_ptr<OprProxy<LocalShareForward>> proxy{
  710. new OprProxy<LocalShareForward>{true}};
  711. bencher.set_proxy(proxy);
  712. LocalShare::Param param;
  713. NormalRNG rng;
  714. auto run = [&](size_t batch, size_t ic, size_t ih, size_t iw, size_t oc, size_t f,
  715. size_t s, size_t sg) {
  716. param.pad_h = f / 2;
  717. param.pad_w = f / 2;
  718. param.stride_h = s;
  719. param.stride_w = s;
  720. param.spatial_groups_h = sg;
  721. param.spatial_groups_w = sg;
  722. param.sparse = LocalShare::Param::Sparse::GROUP;
  723. TensorShape src = {1, batch * ic, ih, iw},
  724. filter = {batch, sg, sg, ic, f, f, oc};
  725. size_t ho = infer_conv_shape(ih, f, s, f / 2);
  726. size_t wo = infer_conv_shape(iw, f, s, f / 2);
  727. float flo = 2.0 * batch * oc * ho * wo * ic * f * f / (1e12);
  728. bencher.set_param(param)
  729. .set_dtype(0, dtype::Float32())
  730. .set_dtype(1, dtype::Float32())
  731. .set_dtype(2, dtype::Float32())
  732. .set_rng(0, &rng)
  733. .set_rng(1, &rng);
  734. bencher.proxy()->target_execution_policy.algo.reset();
  735. auto time_in_ms = bencher.execs({src, filter, {}}) / RUNS;
  736. ;
  737. printf("src=%s, filter=%s, float32: %.2fms %.2fTFlops\n",
  738. src.to_string().c_str(), filter.to_string().c_str(), time_in_ms,
  739. (flo / (time_in_ms * 1e-3)));
  740. };
  741. // stride = 1
  742. run(32, 128, 24, 24, 128, 1, 1, 3);
  743. run(32, 256, 12, 12, 256, 1, 1, 3);
  744. // stride = 2
  745. run(32, 256, 12, 12, 512, 1, 2, 3);
  746. run(32, 512, 6, 6, 1024, 1, 2, 3);
  747. // stride = 1
  748. run(64, 128, 24, 24, 128, 1, 1, 3);
  749. run(64, 256, 12, 12, 256, 1, 1, 3);
  750. // stride = 2
  751. run(64, 256, 12, 12, 512, 1, 2, 3);
  752. run(64, 512, 6, 6, 1024, 1, 2, 3);
  753. }
  754. TEST_F(CUDA, BENCHMARK_LOCAL_SHARE_BWD_DATA) {
  755. CUBenchmarker<LocalShareBackwardData> bencher(handle_cuda());
  756. size_t RUNS = 1000;
  757. bencher.set_display(false).set_times(RUNS);
  758. std::unique_ptr<OprProxy<LocalShareBackwardData>> proxy{
  759. new OprProxy<LocalShareBackwardData>{true}};
  760. bencher.set_proxy(proxy);
  761. LocalShare::Param param;
  762. NormalRNG rng;
  763. auto run = [&](size_t batch, size_t ic, size_t ih, size_t iw, size_t oc, size_t f,
  764. size_t s, size_t sg) {
  765. param.pad_h = f / 2;
  766. param.pad_w = f / 2;
  767. param.stride_h = s;
  768. param.stride_w = s;
  769. param.spatial_groups_h = sg;
  770. param.spatial_groups_w = sg;
  771. TensorShape grad = {batch, ic, ih, iw}, filter = {sg, sg, ic, f, f, oc};
  772. size_t ho = infer_conv_shape(ih, f, s, f / 2);
  773. size_t wo = infer_conv_shape(iw, f, s, f / 2);
  774. TensorShape diff = {batch, oc, ho, wo};
  775. float flo = 2.0 * batch * oc * ho * wo * ic * f * f / (1e12);
  776. bencher.set_param(param)
  777. .set_dtype(0, dtype::Float32())
  778. .set_dtype(1, dtype::Float32())
  779. .set_dtype(2, dtype::Float32())
  780. .set_rng(0, &rng)
  781. .set_rng(1, &rng);
  782. bencher.proxy()->target_execution_policy.algo.reset();
  783. auto time_in_ms = bencher.execs({filter, diff, grad}) / RUNS;
  784. printf("filter=%s, diff=%s, grad=%s, float32: %.2fms "
  785. "%.2fTFlops\n",
  786. filter.to_string().c_str(), diff.to_string().c_str(),
  787. grad.to_string().c_str(), time_in_ms, (flo / (time_in_ms * 1e-3)));
  788. };
  789. // stride = 1
  790. run(32, 128, 24, 24, 128, 1, 1, 3);
  791. run(32, 256, 12, 12, 256, 1, 1, 3);
  792. // stride = 2
  793. run(32, 256, 12, 12, 512, 1, 2, 3);
  794. run(32, 512, 6, 6, 1024, 1, 2, 3);
  795. // stride = 1
  796. run(32, 128, 24, 24, 128, 3, 1, 3);
  797. run(32, 256, 12, 12, 256, 3, 1, 3);
  798. // stride = 2
  799. run(32, 128, 24, 24, 256, 3, 2, 3);
  800. run(32, 256, 12, 12, 512, 3, 2, 3);
  801. // stride = 1
  802. run(64, 128, 24, 24, 128, 1, 1, 3);
  803. run(64, 256, 12, 12, 256, 1, 1, 3);
  804. // stride = 2
  805. run(64, 256, 12, 12, 512, 1, 2, 3);
  806. run(64, 512, 6, 6, 1024, 1, 2, 3);
  807. // stride = 1
  808. run(64, 128, 24, 24, 128, 3, 1, 3);
  809. run(64, 256, 12, 12, 256, 3, 1, 3);
  810. // stride = 2
  811. run(64, 128, 24, 24, 256, 3, 2, 3);
  812. run(64, 256, 12, 12, 512, 3, 2, 3);
  813. }
  814. TEST_F(CUDA, BENCHMARK_LOCAL_SHARE_FORWARD_BOTTLENECK) {
  815. CUBenchmarker<LocalShare> bencher(handle_cuda());
  816. CUBenchmarker<Convolution> bencher_conv(handle_cuda());
  817. size_t RUNS = 1000;
  818. bencher.set_display(false).set_times(RUNS);
  819. std::unique_ptr<OprProxy<LocalShareForward>> proxy{
  820. new OprProxy<LocalShareForward>{true}};
  821. bencher.set_proxy(proxy);
  822. bencher_conv.set_display(false).set_times(RUNS);
  823. std::unique_ptr<OprProxy<Convolution>> conv_proxy{new OprProxy<Convolution>{true}};
  824. bencher_conv.set_proxy(conv_proxy);
  825. LocalShare::Param param;
  826. Convolution::Param conv_param;
  827. NormalRNG rng;
  828. auto run = [&](size_t batch, size_t ic, size_t ih, size_t iw, size_t oc, size_t f,
  829. size_t s, size_t sg) {
  830. param.pad_h = f / 2;
  831. param.pad_w = f / 2;
  832. param.stride_h = s;
  833. param.stride_w = s;
  834. param.spatial_groups_h = sg;
  835. param.spatial_groups_w = sg;
  836. conv_param.pad_h = f / 2;
  837. conv_param.pad_w = f / 2;
  838. conv_param.stride_h = s;
  839. conv_param.stride_w = s;
  840. TensorShape src = {batch, ic, ih, iw}, filter = {sg, sg, ic, f, f, oc};
  841. size_t ho = infer_conv_shape(ih, f, s, f / 2);
  842. size_t wo = infer_conv_shape(iw, f, s, f / 2);
  843. float flo = 2.0 * batch * oc * ho * wo * ic * f * f / (1e12);
  844. bencher.set_param(param)
  845. .set_dtype(0, dtype::Float32())
  846. .set_dtype(1, dtype::Float32())
  847. .set_dtype(2, dtype::Float32())
  848. .set_rng(0, &rng)
  849. .set_rng(1, &rng);
  850. bencher.proxy()->target_execution_policy.algo.reset();
  851. auto time_in_ms = bencher.execs({src, filter, {}}) / RUNS;
  852. bencher_conv.set_param(conv_param);
  853. bencher_conv.proxy()->target_execution_policy.algo.reset();
  854. auto time_in_ms_conv = bencher_conv.execs({src, {oc, ic, f, f}, {}}) / RUNS;
  855. printf("src=%s, filter=%s, float32: %.2fms %.2fTFlops, "
  856. "conv(float32): %.2fms %.2fTFlops, local_share/conv=%.2f\n",
  857. src.to_string().c_str(), filter.to_string().c_str(), time_in_ms,
  858. (flo / (time_in_ms * 1e-3)), time_in_ms_conv,
  859. (flo / (time_in_ms_conv * 1e-3)), time_in_ms / time_in_ms_conv);
  860. };
  861. // stride = 1
  862. run(32, 128, 24, 24, 128, 1, 1, 3);
  863. run(32, 256, 12, 12, 256, 1, 1, 3);
  864. // stride = 2
  865. run(32, 256, 12, 12, 512, 1, 2, 3);
  866. run(32, 512, 6, 6, 1024, 1, 2, 3);
  867. // stride = 1
  868. run(32, 128, 24, 24, 128, 3, 1, 3);
  869. run(32, 256, 12, 12, 256, 3, 1, 3);
  870. // stride = 2
  871. run(32, 128, 24, 24, 256, 3, 2, 3);
  872. run(32, 256, 12, 12, 512, 3, 2, 3);
  873. // stride = 1
  874. run(64, 128, 24, 24, 128, 1, 1, 3);
  875. run(64, 256, 12, 12, 256, 1, 1, 3);
  876. // stride = 2
  877. run(64, 256, 12, 12, 512, 1, 2, 3);
  878. run(64, 512, 6, 6, 1024, 1, 2, 3);
  879. // stride = 1
  880. run(64, 128, 24, 24, 128, 3, 1, 3);
  881. run(64, 256, 12, 12, 256, 3, 1, 3);
  882. // stride = 2
  883. run(64, 128, 24, 24, 256, 3, 2, 3);
  884. run(64, 256, 12, 12, 512, 3, 2, 3);
  885. }
  886. TEST_F(CUDA, BENCHMARK_LOCAL_SHARE_FORWARD_FROM_RESEARCH) {
  887. CUBenchmarker<LocalShare> bencher(handle_cuda());
  888. CUBenchmarker<Convolution> bencher_conv(handle_cuda());
  889. size_t RUNS = 1000;
  890. bencher.set_display(false).set_times(RUNS);
  891. std::unique_ptr<OprProxy<LocalShareForward>> proxy{
  892. new OprProxy<LocalShareForward>{true}};
  893. bencher.set_proxy(proxy);
  894. bencher_conv.set_display(false).set_times(RUNS);
  895. std::unique_ptr<OprProxy<Convolution>> conv_proxy{new OprProxy<Convolution>{true}};
  896. bencher_conv.set_proxy(conv_proxy);
  897. LocalShare::Param param;
  898. Convolution::Param conv_param;
  899. NormalRNG rng;
  900. auto run = [&](size_t batch, size_t ic, size_t ih, size_t iw, size_t oc, size_t f,
  901. size_t s, size_t sg) {
  902. param.pad_h = f / 2;
  903. param.pad_w = f / 2;
  904. param.stride_h = s;
  905. param.stride_w = s;
  906. param.spatial_groups_h = sg;
  907. param.spatial_groups_w = sg;
  908. conv_param.pad_h = f / 2;
  909. conv_param.pad_w = f / 2;
  910. conv_param.stride_h = s;
  911. conv_param.stride_w = s;
  912. TensorShape src = {batch, ic, ih, iw}, filter = {sg, sg, ic, f, f, oc};
  913. size_t ho = infer_conv_shape(ih, f, s, f / 2);
  914. size_t wo = infer_conv_shape(iw, f, s, f / 2);
  915. float flo = 2.0 * batch * oc * ho * wo * ic * f * f / (1e12);
  916. bencher.set_param(param)
  917. .set_dtype(0, dtype::Float32())
  918. .set_dtype(1, dtype::Float32())
  919. .set_dtype(2, dtype::Float32())
  920. .set_rng(0, &rng)
  921. .set_rng(1, &rng);
  922. bencher.proxy()->target_execution_policy.algo.reset();
  923. auto time_in_ms = bencher.execs({src, filter, {}}) / RUNS;
  924. bencher_conv.set_param(conv_param);
  925. bencher_conv.proxy()->target_execution_policy.algo.reset();
  926. auto time_in_ms_conv = bencher_conv.execs({src, {oc, ic, f, f}, {}}) / RUNS;
  927. printf("src=%s, filter=%s, float32: %.2fms %.2fTFlops, "
  928. "conv(float32): %.2fms %.2fTFlops, local_share/conv=%.2f\n",
  929. src.to_string().c_str(), filter.to_string().c_str(), time_in_ms,
  930. (flo / (time_in_ms * 1e-3)), time_in_ms_conv,
  931. (flo / (time_in_ms_conv * 1e-3)), time_in_ms / time_in_ms_conv);
  932. };
  933. // stride = 1
  934. run(64, 128, 24, 24, 128, 1, 1, 3);
  935. run(64, 256, 12, 12, 256, 1, 1, 3);
  936. run(64, 512, 6, 6, 512, 1, 1, 3);
  937. run(64, 1024, 3, 3, 1024, 1, 1, 3);
  938. // stride = 2
  939. run(64, 128, 24, 24, 256, 1, 2, 3);
  940. run(64, 256, 12, 12, 512, 1, 2, 3);
  941. run(64, 512, 6, 6, 1024, 1, 2, 3);
  942. // stride = 1
  943. run(64, 128, 24, 24, 128, 3, 1, 3);
  944. run(64, 256, 12, 12, 256, 3, 1, 3);
  945. run(64, 512, 6, 6, 512, 3, 1, 3);
  946. run(64, 1024, 3, 3, 1024, 3, 1, 3);
  947. // stride = 2
  948. run(64, 128, 24, 24, 256, 3, 2, 3);
  949. run(64, 256, 12, 12, 512, 3, 2, 3);
  950. run(64, 512, 6, 6, 1024, 3, 2, 3);
  951. }
  952. TEST_F(CUDA, BENCHMARK_LOCAL_SHARE_FORWARD) {
  953. require_compute_capability(6, 0);
  954. CUBenchmarker<LocalShare> bencher(handle_cuda());
  955. CUBenchmarker<Convolution> bencher_conv(handle_cuda());
  956. size_t RUNS = 200;
  957. bencher.set_display(false).set_times(RUNS);
  958. std::unique_ptr<OprProxy<LocalShareForward>> proxy{
  959. new OprProxy<LocalShareForward>{true}};
  960. bencher.set_proxy(proxy);
  961. bencher_conv.set_display(false).set_times(RUNS);
  962. std::unique_ptr<OprProxy<Convolution>> conv_proxy{new OprProxy<Convolution>{true}};
  963. bencher_conv.set_proxy(conv_proxy);
  964. LocalShare::Param param;
  965. Convolution::Param conv_param;
  966. NormalRNG rng;
  967. auto run = [&](size_t batch, size_t ic, size_t ih, size_t iw, size_t oc, size_t f,
  968. size_t s, size_t sg) {
  969. param.pad_h = f / 2;
  970. param.pad_w = f / 2;
  971. param.stride_h = s;
  972. param.stride_w = s;
  973. param.spatial_groups_h = sg;
  974. param.spatial_groups_w = sg;
  975. conv_param.pad_h = f / 2;
  976. conv_param.pad_w = f / 2;
  977. conv_param.stride_h = s;
  978. conv_param.stride_w = s;
  979. TensorShape src = {batch, ic, ih, iw}, filter = {sg, sg, ic, f, f, oc};
  980. size_t ho = infer_conv_shape(ih, f, s, f / 2);
  981. size_t wo = infer_conv_shape(iw, f, s, f / 2);
  982. float flo = 2.0 * batch * oc * ho * wo * ic * f * f / (1e12);
  983. bencher.set_param(param)
  984. .set_dtype(0, dtype::Float32())
  985. .set_dtype(1, dtype::Float32())
  986. .set_dtype(2, dtype::Float32())
  987. .set_rng(0, &rng)
  988. .set_rng(1, &rng);
  989. bencher.proxy()->target_execution_policy.algo.reset();
  990. auto time_in_ms = bencher.execs({src, filter, {}}) / RUNS;
  991. bencher_conv.set_param(conv_param);
  992. bencher_conv.proxy()->target_execution_policy.algo.reset();
  993. auto time_in_ms_conv = bencher_conv.execs({src, {oc, ic, f, f}, {}}) / RUNS;
  994. printf("src=%s, filter=%s, float32: %.2fms %.2fTFlops, "
  995. "conv(float32): %.2fms %.2fTFlops, local_share/conv=%.2f\n",
  996. src.to_string().c_str(), filter.to_string().c_str(), time_in_ms,
  997. (flo / (time_in_ms * 1e-3)), time_in_ms_conv,
  998. (flo / (time_in_ms_conv * 1e-3)), time_in_ms / time_in_ms_conv);
  999. };
  1000. run(64, 256, 48, 48, 256, 7, 1, 3);
  1001. run(64, 128, 24, 24, 128, 7, 1, 3);
  1002. run(64, 256, 12, 12, 256, 7, 1, 3);
  1003. run(64, 512, 6, 6, 512, 7, 1, 3);
  1004. run(64, 256, 48, 48, 256, 5, 1, 3);
  1005. run(64, 128, 24, 24, 128, 5, 1, 3);
  1006. run(64, 256, 12, 12, 256, 5, 1, 3);
  1007. run(64, 512, 6, 6, 512, 5, 1, 3);
  1008. run(32, 64, 96, 96, 256, 7, 2, 3);
  1009. run(32, 128, 24, 24, 128, 7, 2, 3);
  1010. run(32, 256, 12, 12, 256, 7, 2, 3);
  1011. run(32, 64, 96, 96, 256, 5, 2, 3);
  1012. run(32, 128, 24, 24, 128, 5, 2, 3);
  1013. run(32, 256, 12, 12, 256, 5, 2, 3);
  1014. }
  1015. #endif
  1016. // vim: syntax=cpp.doxygen