// Tests and benchmarks for the CUDA RegionRestrictedConvolution operators
// (forward, backward-data and backward-filter), comparing the
// region-restricted kernels against the plain depthwise large-filter kernels.
  1. #include "megdnn/dtype.h"
  2. #include "megdnn/opr_param_defs.h"
  3. #include "megdnn/oprs.h"
  4. #include "megdnn/oprs/nn.h"
  5. #include "test/common/checker.h"
  6. #include "test/common/conv_bias.h"
  7. #include "test/common/rng.h"
  8. #include "test/common/tensor.h"
  9. #include "test/common/workspace_wrapper.h"
  10. #include "test/cuda/benchmark.h"
  11. #include "test/cuda/fixture.h"
  12. #include "test/cuda/utils.h"
  13. #include <cudnn.h>
  14. #include <gtest/gtest.h>
  15. #define V1(x) #x
  16. #define V(x) V1(x)
  17. #define CUDNN_VERSION_STRING \
  18. "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V(CUDNN_PATCHLEVEL)
  19. namespace megdnn {
  20. namespace test {
  21. TEST_F(CUDA, REGION_RESTRICTED_CONV_FORWARD_LARGE_FILTER) {
  22. Checker<RegionRestrictedConvolutionForward> checker(handle_cuda());
  23. auto opr = handle_cuda()->create_operator<ConvolutionForward>();
  24. for (auto dt : std::vector<DType>{dtype::Int32(), dtype::Uint8()}) {
  25. auto run = [&checker, &dt, &opr](
  26. size_t n, size_t g, size_t h, size_t fh, size_t padding,
  27. size_t stride) {
  28. RegionRestrictedConvolution::Param cur_param;
  29. cur_param.mode =
  30. RegionRestrictedConvolution::Param::Mode::CROSS_CORRELATION;
  31. cur_param.sparse = RegionRestrictedConvolution::Param::Sparse::GROUP;
  32. checker.set_dtype(2, dt).set_dtype(3, dt);
  33. float scale = 64.f / sqrt(fh * fh);
  34. UniformFloatRNG rng(scale, 2 * scale);
  35. UniformIntRNG r_rng{0, 2};
  36. checker.set_rng(0, &rng).set_rng(1, &rng).set_rng(2, &r_rng).set_rng(
  37. 3, &r_rng);
  38. cur_param.pad_h = cur_param.pad_w = padding;
  39. cur_param.stride_h = cur_param.stride_w = stride;
  40. size_t ho = infer_conv_shape(h, fh, stride, padding);
  41. checker.set_param(cur_param).execs(
  42. {{n, g, h, h}, {g, 1, 1, fh, fh}, {n, h, h}, {n, ho, ho}, {}});
  43. };
  44. run(1, 1, 3, 2, 1, 1);
  45. run(1, 1, 5, 2, 1, 1);
  46. run(1, 1, 6, 2, 1, 1);
  47. run(1, 1, 7, 2, 1, 1);
  48. run(1, 1, 9, 2, 1, 1);
  49. run(1, 1, 10, 2, 1, 1);
  50. run(1, 1, 11, 2, 1, 1);
  51. run(1, 1, 13, 2, 1, 1);
  52. run(1, 1, 14, 2, 1, 1);
  53. run(1, 1, 15, 2, 1, 1);
  54. run(1, 1, 17, 2, 1, 1);
  55. run(1, 1, 18, 2, 1, 1);
  56. run(1, 1, 19, 2, 1, 1);
  57. run(1, 1, 21, 2, 1, 1);
  58. run(1, 1, 22, 2, 1, 1);
  59. run(1, 1, 23, 2, 1, 1);
  60. run(1, 1, 25, 2, 1, 1);
  61. run(1, 1, 26, 2, 1, 1);
  62. run(1, 1, 27, 2, 1, 1);
  63. run(1, 1, 29, 2, 1, 1);
  64. run(1, 1, 30, 2, 1, 1);
  65. run(1, 1, 31, 2, 1, 1);
  66. run(4, 8, 32, 3, 3 / 2, 1);
  67. run(4, 8, 32, 5, 5 / 2, 1);
  68. run(4, 8, 32, 7, 7 / 2, 1);
  69. run(4, 8, 32, 9, 9 / 2, 1);
  70. run(4, 8, 32, 11, 11 / 2, 1);
  71. run(4, 8, 32, 13, 13 / 2, 1);
  72. run(4, 8, 32, 15, 15 / 2, 1);
  73. run(4, 8, 32, 17, 17 / 2, 1);
  74. run(4, 8, 32, 19, 19 / 2, 1);
  75. run(4, 8, 32, 21, 21 / 2, 1);
  76. run(4, 8, 32, 23, 23 / 2, 1);
  77. run(4, 8, 32, 25, 25 / 2, 1);
  78. run(4, 8, 32, 27, 27 / 2, 1);
  79. run(4, 8, 32, 29, 29 / 2, 1);
  80. run(4, 8, 32, 31, 31 / 2, 1);
  81. run(4, 8, 31, 3, 3 / 2, 1);
  82. run(4, 8, 31, 5, 5 / 2, 1);
  83. run(4, 8, 31, 7, 7 / 2, 1);
  84. run(4, 8, 31, 9, 9 / 2, 1);
  85. run(4, 8, 31, 11, 11 / 2, 1);
  86. run(4, 8, 31, 13, 13 / 2, 1);
  87. run(4, 8, 31, 15, 15 / 2, 1);
  88. run(4, 8, 31, 17, 17 / 2, 1);
  89. run(4, 8, 31, 19, 19 / 2, 1);
  90. run(4, 8, 31, 21, 21 / 2, 1);
  91. run(4, 8, 31, 23, 23 / 2, 1);
  92. run(4, 8, 31, 25, 25 / 2, 1);
  93. run(4, 8, 31, 27, 27 / 2, 1);
  94. run(4, 8, 31, 29, 29 / 2, 1);
  95. run(4, 8, 31, 31, 31 / 2, 1);
  96. }
  97. }
  98. #if MEGDNN_WITH_BENCHMARK
  99. TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_FORWARD_LARGE_FILTER_FP32_INT32) {
  100. require_compute_capability(7, 5);
  101. Benchmarker<ConvBiasForward> bencher(handle_cuda());
  102. bencher.set_display(false);
  103. bencher.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  104. ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
  105. "DEPTHWISE_LARGE_FILTER", {})
  106. .c_str()));
  107. Benchmarker<RegionRestrictedConvolutionForward> rr_bencher(handle_cuda());
  108. rr_bencher.set_display(false);
  109. ConvBias::Param param;
  110. param.format = ConvBias::Param::Format::NCHW;
  111. using NonlineMode = ConvBias::Param::NonlineMode;
  112. param.nonlineMode = NonlineMode::IDENTITY;
  113. param.sparse = ConvBias::Param::Sparse::GROUP;
  114. RegionRestrictedConvolutionForward::Param rr_param;
  115. rr_param.format = RegionRestrictedConvolutionForward::Param::Format::NCHW;
  116. rr_param.sparse = RegionRestrictedConvolutionForward::Param::Sparse::GROUP;
  117. UniformIntRNG r_rng{0, 2};
  118. auto run_bench = [&](size_t batch, size_t g, size_t hi, size_t wi, size_t fh,
  119. size_t fw, size_t sh, size_t sw, size_t nr_times) {
  120. param.pad_h = fh / 2;
  121. param.pad_w = fw / 2;
  122. param.stride_h = sh;
  123. param.stride_w = sw;
  124. rr_param.pad_h = fh / 2;
  125. rr_param.pad_w = fw / 2;
  126. rr_param.stride_h = sh;
  127. rr_param.stride_w = sw;
  128. bencher.set_param(param)
  129. .set_dtype(0, dtype::Float32())
  130. .set_dtype(1, dtype::Float32())
  131. .set_dtype(2, dtype::Float32())
  132. .set_dtype(4, dtype::Float32());
  133. bencher.set_times(nr_times);
  134. rr_bencher.set_param(rr_param)
  135. .set_dtype(0, dtype::Float32())
  136. .set_dtype(1, dtype::Float32())
  137. .set_dtype(2, dtype::Int32())
  138. .set_dtype(3, dtype::Int32());
  139. rr_bencher.set_rng(2, &r_rng).set_rng(3, &r_rng);
  140. rr_bencher.set_times(nr_times);
  141. size_t ho = infer_conv_shape(hi, fh, sh, param.pad_h);
  142. size_t wo = infer_conv_shape(wi, fw, sw, param.pad_w);
  143. TensorShape inp{batch, g, hi, wi}, kern{g, 1, 1, fh, fw}, rin{batch, hi, wi},
  144. rout{batch, ho, wo}, out{batch, g, ho, wo};
  145. float bandwith = static_cast<float>(
  146. inp.total_nr_elems() + kern.total_nr_elems() +
  147. out.total_nr_elems()) /
  148. (1024 * 1024 * 1024) * 1e3;
  149. float rr_bandwith = static_cast<float>(
  150. inp.total_nr_elems() + kern.total_nr_elems() +
  151. rin.total_nr_elems() + rout.total_nr_elems() +
  152. out.total_nr_elems()) /
  153. (1024 * 1024 * 1024) * 1e3;
  154. auto time_in_ms = bencher.execs({inp, kern, {}, {}, out}) / nr_times;
  155. auto ops = 2.0 * batch * g * ho * wo * fh * fw / (time_in_ms * 1e-3) * 1e-12;
  156. auto rr_time_in_ms = rr_bencher.execs({inp, kern, rin, rout, out}) / nr_times;
  157. auto rr_ops =
  158. 2.0 * batch * g * ho * wo * fh * fw / (rr_time_in_ms * 1e-3) * 1e-12;
  159. printf("RegionRestrictedDepthwiseLargeFilter vs DepthwiseLargeFilter: inp=%s, "
  160. "kern=%s, out=%s\n"
  161. "time: %.2f ms, time(rr): %.2f ms, perf: %.2fTops, perf(rr): %.2f Tops\n"
  162. "bandwidth: %.2fGB/s, bandwidth(rr): %.2fGB/s, speedup: %.2f.\n",
  163. inp.to_string().c_str(), kern.to_string().c_str(),
  164. out.to_string().c_str(), time_in_ms, rr_time_in_ms, ops, rr_ops,
  165. bandwith * 4 / time_in_ms, rr_bandwith * 4 / rr_time_in_ms,
  166. time_in_ms / rr_time_in_ms);
  167. };
  168. run_bench(64, 384, 32, 32, 3, 3, 1, 1, 1000);
  169. run_bench(64, 384, 32, 32, 5, 5, 1, 1, 1000);
  170. run_bench(64, 384, 32, 32, 7, 7, 1, 1, 1000);
  171. run_bench(64, 384, 32, 32, 9, 9, 1, 1, 1000);
  172. run_bench(64, 384, 32, 32, 11, 11, 1, 1, 1000);
  173. run_bench(64, 384, 32, 32, 13, 13, 1, 1, 1000);
  174. run_bench(64, 384, 32, 32, 15, 15, 1, 1, 1000);
  175. run_bench(64, 384, 32, 32, 17, 17, 1, 1, 1000);
  176. run_bench(64, 384, 32, 32, 19, 19, 1, 1, 1000);
  177. run_bench(64, 384, 32, 32, 21, 21, 1, 1, 1000);
  178. run_bench(64, 384, 32, 32, 23, 23, 1, 1, 1000);
  179. run_bench(64, 384, 32, 32, 25, 25, 1, 1, 1000);
  180. run_bench(64, 384, 32, 32, 27, 27, 1, 1, 1000);
  181. run_bench(64, 384, 32, 32, 29, 29, 1, 1, 1000);
  182. run_bench(64, 384, 32, 32, 31, 31, 1, 1, 1000);
  183. }
  184. TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_BACKWARD_DATA_FP32_INT32) {
  185. require_compute_capability(7, 5);
  186. Benchmarker<ConvolutionBackwardData> bencher(handle_cuda());
  187. bencher.set_display(false);
  188. bencher.set_before_exec_callback(
  189. AlgoChecker<ConvolutionBackwardData>("DEPTHWISE_LARGE_FILTER"));
  190. Benchmarker<RegionRestrictedConvolutionBackwardData> rr_bencher(handle_cuda());
  191. rr_bencher.set_display(false);
  192. ConvolutionBackwardData::Param param;
  193. param.format = ConvolutionBackwardData::Param::Format::NCHW;
  194. param.sparse = ConvolutionBackwardData::Param::Sparse::GROUP;
  195. RegionRestrictedConvolutionBackwardData::Param rr_param;
  196. rr_param.format = RegionRestrictedConvolutionBackwardData::Param::Format::NCHW;
  197. rr_param.sparse = RegionRestrictedConvolutionBackwardData::Param::Sparse::GROUP;
  198. UniformIntRNG r_rng{1, 3};
  199. auto run_bench = [&](size_t batch, size_t g, size_t hi, size_t wi, size_t fh,
  200. size_t fw, size_t sh, size_t sw, size_t nr_times) {
  201. param.pad_h = fh / 2;
  202. param.pad_w = fw / 2;
  203. param.stride_h = sh;
  204. param.stride_w = sw;
  205. rr_param.pad_h = fh / 2;
  206. rr_param.pad_w = fw / 2;
  207. rr_param.stride_h = sh;
  208. rr_param.stride_w = sw;
  209. bencher.set_param(param)
  210. .set_dtype(0, dtype::Float32())
  211. .set_dtype(1, dtype::Float32())
  212. .set_dtype(2, dtype::Float32())
  213. .set_dtype(4, dtype::Float32());
  214. bencher.set_times(nr_times);
  215. rr_bencher.set_param(rr_param)
  216. .set_dtype(0, dtype::Float32())
  217. .set_dtype(1, dtype::Float32())
  218. .set_dtype(2, dtype::Int32())
  219. .set_dtype(3, dtype::Int32());
  220. rr_bencher.set_rng(2, &r_rng).set_rng(3, &r_rng);
  221. rr_bencher.set_times(nr_times);
  222. size_t ho = infer_conv_shape(hi, fh, sh, param.pad_h);
  223. size_t wo = infer_conv_shape(wi, fw, sw, param.pad_w);
  224. TensorShape inp{batch, g, hi, wi} /*src*/, kern{g, 1, 1, fh, fw} /*filter*/,
  225. rin{batch, hi, wi}, rout{batch, ho, wo},
  226. out{batch, g, ho, wo} /*output*/;
  227. float bandwith = static_cast<float>(
  228. inp.total_nr_elems() + kern.total_nr_elems() +
  229. out.total_nr_elems()) /
  230. (1024 * 1024 * 1024) * 1e3;
  231. float rr_bandwith = static_cast<float>(
  232. inp.total_nr_elems() + kern.total_nr_elems() +
  233. rin.total_nr_elems() + rout.total_nr_elems() +
  234. out.total_nr_elems()) /
  235. (1024 * 1024 * 1024) * 1e3;
  236. auto time_in_ms = bencher.execs({kern, out, inp}) / nr_times;
  237. auto ops = 2.0 * batch * g * ho * wo * fh * fw / (time_in_ms * 1e-3) * 1e-12;
  238. auto rr_time_in_ms = rr_bencher.execs({kern, out, rin, rout, inp}) / nr_times;
  239. auto rr_ops =
  240. 2.0 * batch * g * ho * wo * fh * fw / (rr_time_in_ms * 1e-3) * 1e-12;
  241. printf("[DGRAD]RegionRestrictedDepthwiseLargeFilter vs DepthwiseLargeFilter: "
  242. "grad=%s, "
  243. "kern=%s, diff=%s\n"
  244. "time: %.2f ms, time(rr): %.2f ms, perf: %.2fTops, perf(rr): %.2f Tops\n"
  245. "bandwidth: %.2fGB/s, bandwidth(rr): %.2fGB/s, speedup: %.2f.\n",
  246. inp.to_string().c_str(), kern.to_string().c_str(),
  247. out.to_string().c_str(), time_in_ms, rr_time_in_ms, ops, rr_ops,
  248. bandwith * 4 / time_in_ms, rr_bandwith * 4 / rr_time_in_ms,
  249. time_in_ms / rr_time_in_ms);
  250. };
  251. run_bench(64, 384, 32, 32, 3, 3, 1, 1, 1000);
  252. run_bench(64, 384, 32, 32, 5, 5, 1, 1, 1000);
  253. run_bench(64, 384, 32, 32, 7, 7, 1, 1, 1000);
  254. run_bench(64, 384, 32, 32, 9, 9, 1, 1, 1000);
  255. run_bench(64, 384, 32, 32, 11, 11, 1, 1, 1000);
  256. run_bench(64, 384, 32, 32, 13, 13, 1, 1, 1000);
  257. run_bench(64, 384, 32, 32, 15, 15, 1, 1, 1000);
  258. run_bench(64, 384, 32, 32, 17, 17, 1, 1, 1000);
  259. run_bench(64, 384, 32, 32, 19, 19, 1, 1, 1000);
  260. run_bench(64, 384, 32, 32, 21, 21, 1, 1, 1000);
  261. run_bench(64, 384, 32, 32, 23, 23, 1, 1, 1000);
  262. run_bench(64, 384, 32, 32, 25, 25, 1, 1, 1000);
  263. run_bench(64, 384, 32, 32, 27, 27, 1, 1, 1000);
  264. run_bench(64, 384, 32, 32, 29, 29, 1, 1, 1000);
  265. run_bench(64, 384, 32, 32, 31, 31, 1, 1, 1000);
  266. }
  267. TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_BACKWARD_DATA_FP32_UINT8) {
  268. require_compute_capability(7, 5);
  269. Benchmarker<ConvolutionBackwardData> bencher(handle_cuda());
  270. bencher.set_display(false);
  271. bencher.set_before_exec_callback(
  272. AlgoChecker<ConvolutionBackwardData>("DEPTHWISE_LARGE_FILTER"));
  273. Benchmarker<RegionRestrictedConvolutionBackwardData> rr_bencher(handle_cuda());
  274. rr_bencher.set_display(false);
  275. ConvolutionBackwardData::Param param;
  276. param.format = ConvolutionBackwardData::Param::Format::NCHW;
  277. param.sparse = ConvolutionBackwardData::Param::Sparse::GROUP;
  278. RegionRestrictedConvolutionBackwardData::Param rr_param;
  279. rr_param.format = RegionRestrictedConvolutionBackwardData::Param::Format::NCHW;
  280. rr_param.sparse = RegionRestrictedConvolutionBackwardData::Param::Sparse::GROUP;
  281. UniformIntRNG r_rng{1, 3};
  282. auto run_bench = [&](size_t batch, size_t g, size_t hi, size_t wi, size_t fh,
  283. size_t fw, size_t sh, size_t sw, size_t nr_times) {
  284. param.pad_h = fh / 2;
  285. param.pad_w = fw / 2;
  286. param.stride_h = sh;
  287. param.stride_w = sw;
  288. rr_param.pad_h = fh / 2;
  289. rr_param.pad_w = fw / 2;
  290. rr_param.stride_h = sh;
  291. rr_param.stride_w = sw;
  292. bencher.set_param(param)
  293. .set_dtype(0, dtype::Float32())
  294. .set_dtype(1, dtype::Float32())
  295. .set_dtype(2, dtype::Float32())
  296. .set_dtype(4, dtype::Float32());
  297. bencher.set_times(nr_times);
  298. rr_bencher.set_param(rr_param)
  299. .set_dtype(0, dtype::Float32())
  300. .set_dtype(1, dtype::Float32())
  301. .set_dtype(2, dtype::Uint8())
  302. .set_dtype(3, dtype::Uint8());
  303. rr_bencher.set_rng(2, &r_rng).set_rng(3, &r_rng);
  304. rr_bencher.set_times(nr_times);
  305. size_t ho = infer_conv_shape(hi, fh, sh, param.pad_h);
  306. size_t wo = infer_conv_shape(wi, fw, sw, param.pad_w);
  307. TensorShape inp{batch, g, hi, wi} /*src*/, kern{g, 1, 1, fh, fw} /*filter*/,
  308. rin{batch, hi, wi}, rout{batch, ho, wo},
  309. out{batch, g, ho, wo} /*output*/;
  310. float bandwith = static_cast<float>(
  311. inp.total_nr_elems() + kern.total_nr_elems() +
  312. out.total_nr_elems()) /
  313. (1024 * 1024 * 1024) * 1e3;
  314. float rr_bandwith = static_cast<float>(
  315. inp.total_nr_elems() + kern.total_nr_elems() +
  316. rin.total_nr_elems() + rout.total_nr_elems() +
  317. out.total_nr_elems()) /
  318. (1024 * 1024 * 1024) * 1e3;
  319. auto time_in_ms = bencher.execs({kern, out, inp}) / nr_times;
  320. auto ops = 2.0 * batch * g * ho * wo * fh * fw / (time_in_ms * 1e-3) * 1e-12;
  321. auto rr_time_in_ms = rr_bencher.execs({kern, out, rin, rout, inp}) / nr_times;
  322. auto rr_ops =
  323. 2.0 * batch * g * ho * wo * fh * fw / (rr_time_in_ms * 1e-3) * 1e-12;
  324. printf("[DGRAD]RegionRestrictedDepthwiseLargeFilter vs DepthwiseLargeFilter: "
  325. "grad=%s, "
  326. "kern=%s, diff=%s\n"
  327. "time: %.2f ms, time(rr): %.2f ms, perf: %.2fTops, perf(rr): %.2f Tops\n"
  328. "bandwidth: %.2fGB/s, bandwidth(rr): %.2fGB/s, speedup: %.2f.\n",
  329. inp.to_string().c_str(), kern.to_string().c_str(),
  330. out.to_string().c_str(), time_in_ms, rr_time_in_ms, ops, rr_ops,
  331. bandwith * 4 / time_in_ms, rr_bandwith * 4 / rr_time_in_ms,
  332. time_in_ms / rr_time_in_ms);
  333. };
  334. run_bench(64, 384, 32, 32, 3, 3, 1, 1, 1000);
  335. run_bench(64, 384, 32, 32, 5, 5, 1, 1, 1000);
  336. run_bench(64, 384, 32, 32, 7, 7, 1, 1, 1000);
  337. run_bench(64, 384, 32, 32, 9, 9, 1, 1, 1000);
  338. run_bench(64, 384, 32, 32, 11, 11, 1, 1, 1000);
  339. run_bench(64, 384, 32, 32, 13, 13, 1, 1, 1000);
  340. run_bench(64, 384, 32, 32, 15, 15, 1, 1, 1000);
  341. run_bench(64, 384, 32, 32, 17, 17, 1, 1, 1000);
  342. run_bench(64, 384, 32, 32, 19, 19, 1, 1, 1000);
  343. run_bench(64, 384, 32, 32, 21, 21, 1, 1, 1000);
  344. run_bench(64, 384, 32, 32, 23, 23, 1, 1, 1000);
  345. run_bench(64, 384, 32, 32, 25, 25, 1, 1, 1000);
  346. run_bench(64, 384, 32, 32, 27, 27, 1, 1, 1000);
  347. run_bench(64, 384, 32, 32, 29, 29, 1, 1, 1000);
  348. run_bench(64, 384, 32, 32, 31, 31, 1, 1, 1000);
  349. run_bench(64, 384, 31, 31, 3, 3, 1, 1, 1000);
  350. run_bench(64, 384, 31, 31, 5, 5, 1, 1, 1000);
  351. run_bench(64, 384, 31, 31, 7, 7, 1, 1, 1000);
  352. run_bench(64, 384, 31, 31, 9, 9, 1, 1, 1000);
  353. run_bench(64, 384, 31, 31, 11, 11, 1, 1, 1000);
  354. run_bench(64, 384, 31, 31, 13, 13, 1, 1, 1000);
  355. run_bench(64, 384, 31, 31, 15, 15, 1, 1, 1000);
  356. run_bench(64, 384, 31, 31, 17, 17, 1, 1, 1000);
  357. run_bench(64, 384, 31, 31, 19, 19, 1, 1, 1000);
  358. run_bench(64, 384, 31, 31, 21, 21, 1, 1, 1000);
  359. run_bench(64, 384, 31, 31, 23, 23, 1, 1, 1000);
  360. run_bench(64, 384, 31, 31, 25, 25, 1, 1, 1000);
  361. run_bench(64, 384, 31, 31, 27, 27, 1, 1, 1000);
  362. run_bench(64, 384, 31, 31, 29, 29, 1, 1, 1000);
  363. run_bench(64, 384, 31, 31, 31, 31, 1, 1, 1000);
  364. }
  365. TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_FORWARD_LARGE_FILTER_UINT8) {
  366. require_compute_capability(7, 5);
  367. Benchmarker<ConvBiasForward> bencher(handle_cuda());
  368. bencher.set_display(false);
  369. bencher.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  370. ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
  371. "DEPTHWISE_LARGE_FILTER", {})
  372. .c_str()));
  373. Benchmarker<RegionRestrictedConvolutionForward> rr_bencher(handle_cuda());
  374. rr_bencher.set_display(false);
  375. ConvBias::Param param;
  376. param.format = ConvBias::Param::Format::NCHW;
  377. using NonlineMode = ConvBias::Param::NonlineMode;
  378. param.nonlineMode = NonlineMode::IDENTITY;
  379. param.sparse = ConvBias::Param::Sparse::GROUP;
  380. RegionRestrictedConvolutionForward::Param rr_param;
  381. rr_param.format = RegionRestrictedConvolutionForward::Param::Format::NCHW;
  382. rr_param.sparse = RegionRestrictedConvolutionForward::Param::Sparse::GROUP;
  383. UniformIntRNG r_rng{0, 2};
  384. auto run_bench = [&](size_t batch, size_t g, size_t hi, size_t wi, size_t fh,
  385. size_t fw, size_t sh, size_t sw, size_t nr_times) {
  386. param.pad_h = fh / 2;
  387. param.pad_w = fw / 2;
  388. param.stride_h = sh;
  389. param.stride_w = sw;
  390. rr_param.pad_h = fh / 2;
  391. rr_param.pad_w = fw / 2;
  392. rr_param.stride_h = sh;
  393. rr_param.stride_w = sw;
  394. bencher.set_param(param)
  395. .set_dtype(0, dtype::Float32())
  396. .set_dtype(1, dtype::Float32())
  397. .set_dtype(2, dtype::Float32())
  398. .set_dtype(4, dtype::Float32());
  399. bencher.set_times(nr_times);
  400. rr_bencher.set_param(rr_param)
  401. .set_dtype(0, dtype::Float32())
  402. .set_dtype(1, dtype::Float32())
  403. .set_dtype(2, dtype::Uint8())
  404. .set_dtype(3, dtype::Uint8());
  405. rr_bencher.set_rng(2, &r_rng).set_rng(3, &r_rng).set_rng(0, &r_rng);
  406. rr_bencher.set_times(nr_times);
  407. size_t ho = infer_conv_shape(hi, fh, sh, param.pad_h);
  408. size_t wo = infer_conv_shape(wi, fw, sw, param.pad_w);
  409. TensorShape inp{batch, g, hi, wi}, kern{g, 1, 1, fh, fw}, rin{batch, hi, wi},
  410. rout{batch, ho, wo}, out{batch, g, ho, wo};
  411. float bandwith = static_cast<float>(
  412. inp.total_nr_elems() + kern.total_nr_elems() +
  413. out.total_nr_elems()) /
  414. (1024 * 1024 * 1024) * 1e3;
  415. float rr_bandwith = static_cast<float>(
  416. inp.total_nr_elems() + kern.total_nr_elems() +
  417. rin.total_nr_elems() + rout.total_nr_elems() +
  418. out.total_nr_elems()) /
  419. (1024 * 1024 * 1024) * 1e3;
  420. auto time_in_ms = bencher.execs({inp, kern, {}, {}, out}) / nr_times;
  421. auto ops = 2.0 * batch * g * ho * wo * fh * fw / (time_in_ms * 1e-3) * 1e-12;
  422. auto rr_time_in_ms = rr_bencher.execs({inp, kern, rin, rout, out}) / nr_times;
  423. auto rr_ops =
  424. 2.0 * batch * g * ho * wo * fh * fw / (rr_time_in_ms * 1e-3) * 1e-12;
  425. printf("RegionRestrictedDepthwiseLargeFilter vs DepthwiseLargeFilter: inp=%s, "
  426. "kern=%s, out=%s\n"
  427. "time: %.2f ms, time(rr): %.2f ms, perf: %.2fTops, perf(rr): %.2f Tops\n"
  428. "bandwidth: %.2fGB/s, bandwidth(rr): %.2fGB/s, speedup: %.2f.\n",
  429. inp.to_string().c_str(), kern.to_string().c_str(),
  430. out.to_string().c_str(), time_in_ms, rr_time_in_ms, ops, rr_ops,
  431. bandwith * 4 / time_in_ms, rr_bandwith * 4 / rr_time_in_ms,
  432. time_in_ms / rr_time_in_ms);
  433. };
  434. run_bench(64, 384, 32, 32, 3, 3, 1, 1, 1000);
  435. run_bench(64, 384, 32, 32, 5, 5, 1, 1, 1000);
  436. run_bench(64, 384, 32, 32, 7, 7, 1, 1, 1000);
  437. run_bench(64, 384, 32, 32, 9, 9, 1, 1, 1000);
  438. run_bench(64, 384, 32, 32, 11, 11, 1, 1, 1000);
  439. run_bench(64, 384, 32, 32, 13, 13, 1, 1, 1000);
  440. run_bench(64, 384, 32, 32, 15, 15, 1, 1, 1000);
  441. run_bench(64, 384, 32, 32, 17, 17, 1, 1, 1000);
  442. run_bench(64, 384, 32, 32, 19, 19, 1, 1, 1000);
  443. run_bench(64, 384, 32, 32, 21, 21, 1, 1, 1000);
  444. run_bench(64, 384, 32, 32, 23, 23, 1, 1, 1000);
  445. run_bench(64, 384, 32, 32, 25, 25, 1, 1, 1000);
  446. run_bench(64, 384, 32, 32, 27, 27, 1, 1, 1000);
  447. run_bench(64, 384, 32, 32, 29, 29, 1, 1, 1000);
  448. run_bench(64, 384, 32, 32, 31, 31, 1, 1, 1000);
  449. run_bench(64, 384, 31, 31, 3, 3, 1, 1, 1000);
  450. run_bench(64, 384, 31, 31, 5, 5, 1, 1, 1000);
  451. run_bench(64, 384, 31, 31, 7, 7, 1, 1, 1000);
  452. run_bench(64, 384, 31, 31, 9, 9, 1, 1, 1000);
  453. run_bench(64, 384, 31, 31, 11, 11, 1, 1, 1000);
  454. run_bench(64, 384, 31, 31, 13, 13, 1, 1, 1000);
  455. run_bench(64, 384, 31, 31, 15, 15, 1, 1, 1000);
  456. run_bench(64, 384, 31, 31, 17, 17, 1, 1, 1000);
  457. run_bench(64, 384, 31, 31, 19, 19, 1, 1, 1000);
  458. run_bench(64, 384, 31, 31, 21, 21, 1, 1, 1000);
  459. run_bench(64, 384, 31, 31, 23, 23, 1, 1, 1000);
  460. run_bench(64, 384, 31, 31, 25, 25, 1, 1, 1000);
  461. run_bench(64, 384, 31, 31, 27, 27, 1, 1, 1000);
  462. run_bench(64, 384, 31, 31, 29, 29, 1, 1, 1000);
  463. run_bench(64, 384, 31, 31, 31, 31, 1, 1, 1000);
  464. }
  465. TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_BACKWARD_FILTER_FP32) {
  466. require_compute_capability(7, 5);
  467. Benchmarker<ConvolutionBackwardFilter> bencher(handle_cuda());
  468. bencher.set_display(false);
  469. bencher.set_before_exec_callback(AlgoChecker<ConvolutionBackwardFilter>(
  470. "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_128X128X8_32X64X8_2stage"));
  471. Benchmarker<RegionRestrictedConvolutionBackwardFilter> rr_bencher(handle_cuda());
  472. rr_bencher.set_display(false);
  473. ConvolutionBackwardFilter::Param param;
  474. param.format = ConvolutionBackwardFilter::Param::Format::NCHW;
  475. param.sparse = ConvolutionBackwardFilter::Param::Sparse::GROUP;
  476. RegionRestrictedConvolutionBackwardFilter::Param rr_param;
  477. rr_param.format = RegionRestrictedConvolutionBackwardFilter::Param::Format::NCHW;
  478. rr_param.sparse = RegionRestrictedConvolutionBackwardFilter::Param::Sparse::GROUP;
  479. UniformIntRNG r_rng{1, 3};
  480. auto run_bench = [&](size_t batch, size_t g, size_t hi, size_t wi, size_t fh,
  481. size_t fw, size_t sh, size_t sw, size_t nr_times) {
  482. param.pad_h = fh / 2;
  483. param.pad_w = fw / 2;
  484. param.stride_h = sh;
  485. param.stride_w = sw;
  486. rr_param.pad_h = fh / 2;
  487. rr_param.pad_w = fw / 2;
  488. rr_param.stride_h = sh;
  489. rr_param.stride_w = sw;
  490. bencher.set_param(param)
  491. .set_dtype(0, dtype::Float32())
  492. .set_dtype(1, dtype::Float32())
  493. .set_dtype(2, dtype::Float32())
  494. .set_dtype(4, dtype::Float32());
  495. bencher.proxy()->target_execution_policy = {};
  496. bencher.set_times(nr_times);
  497. rr_bencher.set_param(rr_param)
  498. .set_dtype(0, dtype::Float32())
  499. .set_dtype(1, dtype::Float32())
  500. .set_dtype(2, dtype::Int32())
  501. .set_dtype(3, dtype::Int32());
  502. rr_bencher.set_rng(2, &r_rng).set_rng(3, &r_rng);
  503. rr_bencher.set_times(nr_times);
  504. size_t ho = infer_conv_shape(hi, fh, sh, param.pad_h);
  505. size_t wo = infer_conv_shape(wi, fw, sw, param.pad_w);
  506. TensorShape src{batch, g, hi, wi}, diff{batch, g, ho, wo}, rin{batch, hi, wi},
  507. rout{batch, ho, wo}, grad{g, 1, 1, fh, fw};
  508. float bandwith = static_cast<float>(
  509. src.total_nr_elems() + diff.total_nr_elems() +
  510. grad.total_nr_elems()) /
  511. (1024 * 1024 * 1024) * 1e3;
  512. float rr_bandwith = static_cast<float>(
  513. src.total_nr_elems() + diff.total_nr_elems() +
  514. rin.total_nr_elems() + rout.total_nr_elems() +
  515. grad.total_nr_elems()) /
  516. (1024 * 1024 * 1024) * 1e3;
  517. auto time_in_ms = bencher.execs({src, diff, grad}) / nr_times;
  518. auto ops = 2.0 * batch * g * hi * wi * fh * fw / (time_in_ms * 1e-3) * 1e-12;
  519. auto rr_time_in_ms = rr_bencher.execs({src, diff, rin, rout, grad}) / nr_times;
  520. auto rr_ops =
  521. 2.0 * batch * g * hi * wi * fh * fw / (rr_time_in_ms * 1e-3) * 1e-12;
  522. printf("[WGRAD]RegionRestrictedDepthwiseLargeFilter vs DepthwiseLargeFilter: "
  523. "src=%s, "
  524. "diff=%s, grad=%s\n"
  525. "time: %.2f ms, time(rr): %.2f ms, perf: %.2fTops, perf(rr): %.2f Tops\n"
  526. "bandwidth: %.2fGB/s, bandwidth(rr): %.2fGB/s, speedup: %.2f.\n",
  527. src.to_string().c_str(), diff.to_string().c_str(),
  528. grad.to_string().c_str(), time_in_ms, rr_time_in_ms, ops, rr_ops,
  529. bandwith * 4 / time_in_ms, rr_bandwith * 4 / rr_time_in_ms,
  530. time_in_ms / rr_time_in_ms);
  531. };
  532. run_bench(64, 384, 32, 32, 3, 3, 1, 1, 1000);
  533. run_bench(64, 384, 32, 32, 5, 5, 1, 1, 1000);
  534. run_bench(64, 384, 32, 32, 7, 7, 1, 1, 1000);
  535. run_bench(64, 384, 32, 32, 9, 9, 1, 1, 1000);
  536. run_bench(64, 384, 32, 32, 11, 11, 1, 1, 1000);
  537. run_bench(64, 384, 32, 32, 13, 13, 1, 1, 1000);
  538. run_bench(64, 384, 32, 32, 15, 15, 1, 1, 1000);
  539. run_bench(64, 384, 32, 32, 17, 17, 1, 1, 1000);
  540. run_bench(64, 384, 32, 32, 19, 19, 1, 1, 1000);
  541. run_bench(64, 384, 32, 32, 21, 21, 1, 1, 1000);
  542. run_bench(64, 384, 32, 32, 23, 23, 1, 1, 1000);
  543. run_bench(64, 384, 32, 32, 25, 25, 1, 1, 1000);
  544. run_bench(64, 384, 32, 32, 27, 27, 1, 1, 1000);
  545. run_bench(64, 384, 32, 32, 29, 29, 1, 1, 1000);
  546. run_bench(64, 384, 32, 32, 31, 31, 1, 1, 1000);
  547. }
  548. TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_BACKWARD_FILTER_FP32_RINT8) {
  549. require_compute_capability(7, 5);
  550. Benchmarker<ConvolutionBackwardFilter> bencher(handle_cuda());
  551. bencher.set_display(false);
  552. bencher.set_before_exec_callback(AlgoChecker<ConvolutionBackwardFilter>(
  553. "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_128X128X8_32X64X8_2stage"));
  554. Benchmarker<RegionRestrictedConvolutionBackwardFilter> rr_bencher(handle_cuda());
  555. rr_bencher.set_display(false);
  556. ConvolutionBackwardFilter::Param param;
  557. param.format = ConvolutionBackwardFilter::Param::Format::NCHW;
  558. param.sparse = ConvolutionBackwardFilter::Param::Sparse::GROUP;
  559. RegionRestrictedConvolutionBackwardFilter::Param rr_param;
  560. rr_param.format = RegionRestrictedConvolutionBackwardFilter::Param::Format::NCHW;
  561. rr_param.sparse = RegionRestrictedConvolutionBackwardFilter::Param::Sparse::GROUP;
  562. UniformIntRNG r_rng{1, 3};
  563. auto run_bench = [&](size_t batch, size_t g, size_t hi, size_t wi, size_t fh,
  564. size_t fw, size_t sh, size_t sw, size_t nr_times) {
  565. param.pad_h = fh / 2;
  566. param.pad_w = fw / 2;
  567. param.stride_h = sh;
  568. param.stride_w = sw;
  569. rr_param.pad_h = fh / 2;
  570. rr_param.pad_w = fw / 2;
  571. rr_param.stride_h = sh;
  572. rr_param.stride_w = sw;
  573. bencher.set_param(param)
  574. .set_dtype(0, dtype::Float32())
  575. .set_dtype(1, dtype::Float32())
  576. .set_dtype(2, dtype::Float32())
  577. .set_dtype(4, dtype::Float32());
  578. bencher.proxy()->target_execution_policy = {};
  579. bencher.set_times(nr_times);
  580. rr_bencher.set_param(rr_param)
  581. .set_dtype(0, dtype::Float32())
  582. .set_dtype(1, dtype::Float32())
  583. .set_dtype(2, dtype::Uint8())
  584. .set_dtype(3, dtype::Uint8());
  585. rr_bencher.set_rng(2, &r_rng).set_rng(3, &r_rng);
  586. rr_bencher.set_times(nr_times);
  587. size_t ho = infer_conv_shape(hi, fh, sh, param.pad_h);
  588. size_t wo = infer_conv_shape(wi, fw, sw, param.pad_w);
  589. TensorShape src{batch, g, hi, wi}, diff{batch, g, ho, wo}, rin{batch, hi, wi},
  590. rout{batch, ho, wo}, grad{g, 1, 1, fh, fw};
  591. float bandwith = static_cast<float>(
  592. src.total_nr_elems() + diff.total_nr_elems() +
  593. grad.total_nr_elems()) /
  594. (1024 * 1024 * 1024) * 1e3;
  595. float rr_bandwith = static_cast<float>(
  596. src.total_nr_elems() + diff.total_nr_elems() +
  597. rin.total_nr_elems() + rout.total_nr_elems() +
  598. grad.total_nr_elems()) /
  599. (1024 * 1024 * 1024) * 1e3;
  600. auto time_in_ms = bencher.execs({src, diff, grad}) / nr_times;
  601. auto ops = 2.0 * batch * g * hi * wi * fh * fw / (time_in_ms * 1e-3) * 1e-12;
  602. auto rr_time_in_ms = rr_bencher.execs({src, diff, rin, rout, grad}) / nr_times;
  603. auto rr_ops =
  604. 2.0 * batch * g * hi * wi * fh * fw / (rr_time_in_ms * 1e-3) * 1e-12;
  605. printf("[WGRAD]RegionRestrictedDepthwiseLargeFilter vs DepthwiseLargeFilter: "
  606. "src=%s, "
  607. "diff=%s, grad=%s\n"
  608. "time: %.2f ms, time(rr): %.2f ms, perf: %.2fTops, perf(rr): %.2f Tops\n"
  609. "bandwidth: %.2fGB/s, bandwidth(rr): %.2fGB/s, speedup: %.2f.\n",
  610. src.to_string().c_str(), diff.to_string().c_str(),
  611. grad.to_string().c_str(), time_in_ms, rr_time_in_ms, ops, rr_ops,
  612. bandwith * 4 / time_in_ms, rr_bandwith * 4 / rr_time_in_ms,
  613. time_in_ms / rr_time_in_ms);
  614. };
  615. run_bench(64, 384, 32, 32, 3, 3, 1, 1, 1000);
  616. run_bench(64, 384, 32, 32, 5, 5, 1, 1, 1000);
  617. run_bench(64, 384, 32, 32, 7, 7, 1, 1, 1000);
  618. run_bench(64, 384, 32, 32, 9, 9, 1, 1, 1000);
  619. run_bench(64, 384, 32, 32, 11, 11, 1, 1, 1000);
  620. run_bench(64, 384, 32, 32, 13, 13, 1, 1, 1000);
  621. run_bench(64, 384, 32, 32, 15, 15, 1, 1, 1000);
  622. run_bench(64, 384, 32, 32, 17, 17, 1, 1, 1000);
  623. run_bench(64, 384, 32, 32, 19, 19, 1, 1, 1000);
  624. run_bench(64, 384, 32, 32, 21, 21, 1, 1, 1000);
  625. run_bench(64, 384, 32, 32, 23, 23, 1, 1, 1000);
  626. run_bench(64, 384, 32, 32, 25, 25, 1, 1, 1000);
  627. run_bench(64, 384, 32, 32, 27, 27, 1, 1, 1000);
  628. run_bench(64, 384, 32, 32, 29, 29, 1, 1, 1000);
  629. run_bench(64, 384, 32, 32, 31, 31, 1, 1, 1000);
  630. }
  631. #endif
  632. TEST_F(CUDA, REGION_RESTRICTED_CONV_BWD_DATA_FP32) {
  633. Checker<RegionRestrictedConvolutionBackwardData> checker(handle_cuda());
  634. for (auto dt : std::vector<DType>{dtype::Int32(), dtype::Uint8()}) {
  635. auto run = [&checker, &dt](
  636. size_t n, size_t g, size_t ih, size_t fh, size_t padding,
  637. size_t stride) {
  638. RegionRestrictedConvolutionBackwardData::Param cur_param;
  639. cur_param.mode = RegionRestrictedConvolutionBackwardData::Param::Mode::
  640. CROSS_CORRELATION;
  641. cur_param.compute_mode = RegionRestrictedConvolutionBackwardData::Param::
  642. ComputeMode::DEFAULT;
  643. cur_param.sparse =
  644. RegionRestrictedConvolutionBackwardData::Param::Sparse::GROUP;
  645. checker.set_dtype(0, dtype::Float32())
  646. .set_dtype(1, dtype::Float32())
  647. .set_dtype(2, dt)
  648. .set_dtype(3, dt);
  649. float scale = 64.f / sqrt(fh * fh);
  650. UniformFloatRNG rng(scale, 2 * scale);
  651. UniformIntRNG r_rng{1, 2};
  652. checker.set_rng(0, &rng).set_rng(1, &rng).set_rng(2, &r_rng).set_rng(
  653. 3, &r_rng);
  654. cur_param.pad_h = cur_param.pad_w = padding;
  655. cur_param.stride_h = cur_param.stride_w = stride;
  656. size_t oh = (ih + 2 * padding - fh + 1) / stride;
  657. checker.set_param(cur_param).execs({
  658. {g, 1, 1, fh, fh}, // filter
  659. {n, g * 1, oh, oh}, // diff
  660. {n, ih, ih}, // rin
  661. {n, oh, oh}, // rout
  662. {n, g * 1, ih, ih} // grad
  663. });
  664. };
  665. run(1, 1, 3, 2, 1, 1);
  666. run(1, 1, 5, 2, 1, 1);
  667. run(1, 1, 6, 2, 1, 1);
  668. run(1, 1, 7, 2, 1, 1);
  669. run(1, 1, 9, 2, 1, 1);
  670. run(1, 1, 10, 2, 1, 1);
  671. run(1, 1, 11, 2, 1, 1);
  672. run(1, 1, 13, 2, 1, 1);
  673. run(1, 1, 14, 2, 1, 1);
  674. run(1, 1, 15, 2, 1, 1);
  675. run(1, 1, 17, 2, 1, 1);
  676. run(1, 1, 18, 2, 1, 1);
  677. run(1, 1, 19, 2, 1, 1);
  678. run(1, 1, 21, 2, 1, 1);
  679. run(1, 1, 22, 2, 1, 1);
  680. run(1, 1, 23, 2, 1, 1);
  681. run(1, 1, 25, 2, 1, 1);
  682. run(1, 1, 26, 2, 1, 1);
  683. run(1, 1, 27, 2, 1, 1);
  684. run(1, 1, 29, 2, 1, 1);
  685. run(1, 1, 30, 2, 1, 1);
  686. run(1, 1, 31, 2, 1, 1);
  687. run(4, 8, 32, 3, 3 / 2, 1);
  688. run(4, 8, 32, 5, 5 / 2, 1);
  689. run(4, 8, 32, 7, 7 / 2, 1);
  690. run(4, 8, 32, 9, 9 / 2, 1);
  691. run(4, 8, 32, 11, 11 / 2, 1);
  692. run(4, 8, 32, 13, 13 / 2, 1);
  693. run(4, 8, 32, 15, 15 / 2, 1);
  694. run(4, 8, 32, 17, 17 / 2, 1);
  695. run(4, 8, 32, 19, 19 / 2, 1);
  696. run(4, 8, 32, 21, 21 / 2, 1);
  697. run(4, 8, 32, 23, 23 / 2, 1);
  698. run(4, 8, 32, 25, 25 / 2, 1);
  699. run(4, 8, 32, 27, 27 / 2, 1);
  700. run(4, 8, 32, 29, 29 / 2, 1);
  701. run(4, 8, 32, 31, 31 / 2, 1);
  702. run(4, 8, 31, 3, 3 / 2, 1);
  703. run(4, 8, 31, 5, 5 / 2, 1);
  704. run(4, 8, 31, 7, 7 / 2, 1);
  705. run(4, 8, 31, 9, 9 / 2, 1);
  706. run(4, 8, 31, 11, 11 / 2, 1);
  707. run(4, 8, 31, 13, 13 / 2, 1);
  708. run(4, 8, 31, 15, 15 / 2, 1);
  709. run(4, 8, 31, 17, 17 / 2, 1);
  710. run(4, 8, 31, 19, 19 / 2, 1);
  711. run(4, 8, 31, 21, 21 / 2, 1);
  712. run(4, 8, 31, 23, 23 / 2, 1);
  713. run(4, 8, 31, 25, 25 / 2, 1);
  714. run(4, 8, 31, 27, 27 / 2, 1);
  715. run(4, 8, 31, 29, 29 / 2, 1);
  716. run(4, 8, 31, 31, 31 / 2, 1);
  717. }
  718. }
  719. TEST_F(CUDA, REGION_RESTRICTED_CONV_BWD_DATA_FP32_RIN_EQ_ROUT) {
  720. Checker<RegionRestrictedConvolutionBackwardData> checker(handle_cuda());
  721. for (auto dt : std::vector<DType>{dtype::Int32()}) {
  722. auto run = [&checker, &dt](
  723. size_t n, size_t g, size_t ih, size_t fh, size_t padding,
  724. size_t stride) {
  725. RegionRestrictedConvolutionBackwardData::Param cur_param;
  726. cur_param.mode = RegionRestrictedConvolutionBackwardData::Param::Mode::
  727. CROSS_CORRELATION;
  728. cur_param.compute_mode = RegionRestrictedConvolutionBackwardData::Param::
  729. ComputeMode::DEFAULT;
  730. cur_param.sparse =
  731. RegionRestrictedConvolutionBackwardData::Param::Sparse::GROUP;
  732. checker.set_dtype(2, dt).set_dtype(3, dt);
  733. float scale = 64.f / sqrt(fh * fh);
  734. UniformFloatRNG rng(scale, 2 * scale);
  735. // value 0 mask may cause unexpected behaviour.
  736. UniformIntRNG r_rng{1, 1};
  737. checker.set_rng(0, &rng).set_rng(1, &rng).set_rng(2, &r_rng).set_rng(
  738. 3, &r_rng);
  739. cur_param.pad_h = cur_param.pad_w = padding;
  740. cur_param.stride_h = cur_param.stride_w = stride;
  741. size_t oh = (ih + 2 * padding - fh + 1) / stride;
  742. checker.set_param(cur_param).execs(
  743. {/*filter*/ {g, 1, 1, fh, fh},
  744. /*diff*/ {n, g * 1, oh, oh},
  745. /*rin*/ {n, ih, ih},
  746. /*rout*/ {n, oh, oh},
  747. /*grad*/ {n, g * 1, ih, ih}});
  748. };
  749. run(1, 1, 3, 2, 1, 1);
  750. run(1, 1, 5, 2, 1, 1);
  751. run(1, 1, 6, 2, 1, 1);
  752. run(1, 1, 7, 2, 1, 1);
  753. run(1, 1, 9, 2, 1, 1);
  754. run(1, 1, 10, 2, 1, 1);
  755. run(1, 1, 11, 2, 1, 1);
  756. run(1, 1, 13, 2, 1, 1);
  757. run(1, 1, 14, 2, 1, 1);
  758. run(1, 1, 15, 2, 1, 1);
  759. run(1, 1, 17, 2, 1, 1);
  760. run(1, 1, 18, 2, 1, 1);
  761. run(1, 1, 19, 2, 1, 1);
  762. run(1, 1, 21, 2, 1, 1);
  763. run(1, 1, 22, 2, 1, 1);
  764. run(1, 1, 23, 2, 1, 1);
  765. run(1, 1, 25, 2, 1, 1);
  766. run(1, 1, 26, 2, 1, 1);
  767. run(1, 1, 27, 2, 1, 1);
  768. run(1, 1, 29, 2, 1, 1);
  769. run(1, 1, 30, 2, 1, 1);
  770. run(1, 1, 31, 2, 1, 1);
  771. run(4, 8, 32, 3, 3 / 2, 1);
  772. run(4, 8, 32, 5, 5 / 2, 1);
  773. run(4, 8, 32, 7, 7 / 2, 1);
  774. run(4, 8, 32, 9, 9 / 2, 1);
  775. run(4, 8, 32, 11, 11 / 2, 1);
  776. run(4, 8, 32, 13, 13 / 2, 1);
  777. run(4, 8, 32, 15, 15 / 2, 1);
  778. run(4, 8, 32, 17, 17 / 2, 1);
  779. run(4, 8, 32, 19, 19 / 2, 1);
  780. run(4, 8, 32, 21, 21 / 2, 1);
  781. run(4, 8, 32, 23, 23 / 2, 1);
  782. run(4, 8, 32, 25, 25 / 2, 1);
  783. run(4, 8, 32, 27, 27 / 2, 1);
  784. run(4, 8, 32, 29, 29 / 2, 1);
  785. run(4, 8, 32, 31, 31 / 2, 1);
  786. run(4, 8, 31, 3, 3 / 2, 1);
  787. run(4, 8, 31, 5, 5 / 2, 1);
  788. run(4, 8, 31, 7, 7 / 2, 1);
  789. run(4, 8, 31, 9, 9 / 2, 1);
  790. run(4, 8, 31, 11, 11 / 2, 1);
  791. run(4, 8, 31, 13, 13 / 2, 1);
  792. run(4, 8, 31, 15, 15 / 2, 1);
  793. run(4, 8, 31, 17, 17 / 2, 1);
  794. run(4, 8, 31, 19, 19 / 2, 1);
  795. run(4, 8, 31, 21, 21 / 2, 1);
  796. run(4, 8, 31, 23, 23 / 2, 1);
  797. run(4, 8, 31, 25, 25 / 2, 1);
  798. run(4, 8, 31, 27, 27 / 2, 1);
  799. run(4, 8, 31, 29, 29 / 2, 1);
  800. run(4, 8, 31, 31, 31 / 2, 1);
  801. }
  802. }
  803. TEST_F(CUDA, REGION_RESTRICTED_CONV_BWD_FILTER_FP32) {
  804. require_compute_capability(6, 1);
  805. Checker<RegionRestrictedConvolutionBackwardFilter> checker(handle_cuda());
  806. for (auto dt : std::vector<DType>{dtype::Int32(), dtype::Uint8()}) {
  807. auto run = [&checker, &dt](
  808. size_t n, size_t g, size_t ih, size_t fh, size_t padding,
  809. size_t stride) {
  810. RegionRestrictedConvolutionBackwardFilter::Param cur_param;
  811. cur_param.mode = RegionRestrictedConvolutionBackwardFilter::Param::Mode::
  812. CROSS_CORRELATION;
  813. cur_param.compute_mode = RegionRestrictedConvolutionBackwardFilter::Param::
  814. ComputeMode::DEFAULT;
  815. cur_param.sparse =
  816. RegionRestrictedConvolutionBackwardFilter::Param::Sparse::GROUP;
  817. checker.set_dtype(0, dtype::Float32())
  818. .set_dtype(1, dtype::Float32())
  819. .set_dtype(2, dt)
  820. .set_dtype(3, dt);
  821. float scale = 64.f / sqrt(fh * fh);
  822. UniformFloatRNG rng(scale, 2 * scale);
  823. UniformIntRNG r_rng{1, 2};
  824. checker.set_rng(0, &rng).set_rng(1, &rng).set_rng(2, &r_rng).set_rng(
  825. 3, &r_rng);
  826. cur_param.pad_h = cur_param.pad_w = padding;
  827. cur_param.stride_h = cur_param.stride_w = stride;
  828. size_t oh = (ih + 2 * padding - fh + 1) / stride;
  829. checker.set_param(cur_param).execs({
  830. {n, g * 1, ih, ih}, // src
  831. {n, g * 1, oh, oh}, // diff
  832. {n, ih, ih}, // rin
  833. {n, oh, oh}, // rout
  834. {g, 1, 1, fh, fh} // grad
  835. });
  836. };
  837. run(4, 8, 32, 5, 5 / 2, 1);
  838. run(1, 2, 2, 2, 0, 1);
  839. run(1, 2, 3, 3, 0, 1);
  840. run(1, 2, 4, 4, 0, 1);
  841. run(1, 2, 5, 5, 0, 1);
  842. run(1, 2, 6, 6, 0, 1);
  843. run(1, 2, 7, 7, 0, 1);
  844. run(4, 8, 32, 7, 7 / 2, 1);
  845. run(4, 8, 32, 9, 9 / 2, 1);
  846. run(4, 8, 32, 11, 11 / 2, 1);
  847. run(4, 8, 32, 13, 13 / 2, 1);
  848. run(4, 8, 32, 15, 15 / 2, 1);
  849. run(4, 8, 32, 17, 17 / 2, 1);
  850. run(4, 8, 32, 19, 19 / 2, 1);
  851. run(4, 8, 32, 21, 21 / 2, 1);
  852. run(4, 8, 32, 23, 23 / 2, 1);
  853. run(4, 8, 32, 25, 25 / 2, 1);
  854. run(4, 8, 32, 27, 27 / 2, 1);
  855. run(4, 1, 32, 27, 27 / 2, 1);
  856. run(4, 8, 32, 29, 29 / 2, 1);
  857. run(4, 8, 32, 31, 31 / 2, 1);
  858. }
  859. }
  860. TEST_F(CUDA, REGION_RESTRICTED_CONV_BWD_FILTER_FP32_RIN_EQ_ROUT) {
  861. require_compute_capability(6, 1);
  862. Checker<RegionRestrictedConvolutionBackwardFilter> checker(handle_cuda());
  863. for (auto dt : std::vector<DType>{dtype::Int32(), dtype::Uint8()}) {
  864. auto run = [&checker, &dt](
  865. size_t n, size_t g, size_t ih, size_t fh, size_t padding,
  866. size_t stride) {
  867. RegionRestrictedConvolutionBackwardFilter::Param cur_param;
  868. cur_param.mode = RegionRestrictedConvolutionBackwardFilter::Param::Mode::
  869. CROSS_CORRELATION;
  870. cur_param.compute_mode = RegionRestrictedConvolutionBackwardFilter::Param::
  871. ComputeMode::DEFAULT;
  872. cur_param.sparse =
  873. RegionRestrictedConvolutionBackwardFilter::Param::Sparse::GROUP;
  874. checker.set_dtype(0, dtype::Float32())
  875. .set_dtype(1, dtype::Float32())
  876. .set_dtype(2, dt)
  877. .set_dtype(3, dt);
  878. float scale = 64.f / sqrt(fh * fh);
  879. UniformFloatRNG rng(scale, 2 * scale);
  880. UniformIntRNG r_rng{1, 1};
  881. checker.set_rng(0, &rng).set_rng(1, &rng).set_rng(2, &r_rng).set_rng(
  882. 3, &r_rng);
  883. cur_param.pad_h = cur_param.pad_w = padding;
  884. cur_param.stride_h = cur_param.stride_w = stride;
  885. size_t oh = (ih + 2 * padding - fh + 1) / stride;
  886. checker.set_param(cur_param).execs({
  887. {n, g * 1, ih, ih}, // src
  888. {n, g * 1, oh, oh}, // diff
  889. {n, ih, ih}, // rin
  890. {n, oh, oh}, // rout
  891. {g, 1, 1, fh, fh} // grad
  892. });
  893. };
  894. run(4, 8, 32, 5, 5 / 2, 1);
  895. run(1, 2, 2, 2, 0, 1);
  896. run(1, 2, 3, 3, 0, 1);
  897. run(1, 2, 4, 4, 0, 1);
  898. run(1, 2, 5, 5, 0, 1);
  899. run(1, 2, 6, 6, 0, 1);
  900. run(1, 2, 7, 7, 0, 1);
  901. run(4, 8, 32, 7, 7 / 2, 1);
  902. run(4, 8, 32, 9, 9 / 2, 1);
  903. run(4, 8, 32, 11, 11 / 2, 1);
  904. run(4, 8, 32, 13, 13 / 2, 1);
  905. run(4, 8, 32, 15, 15 / 2, 1);
  906. run(4, 8, 32, 17, 17 / 2, 1);
  907. run(4, 8, 32, 19, 19 / 2, 1);
  908. run(4, 8, 32, 21, 21 / 2, 1);
  909. run(4, 1, 32, 21, 21 / 2, 1);
  910. run(4, 8, 32, 23, 23 / 2, 1);
  911. run(4, 8, 32, 25, 25 / 2, 1);
  912. run(4, 8, 32, 27, 27 / 2, 1);
  913. run(4, 8, 32, 29, 29 / 2, 1);
  914. run(4, 8, 32, 31, 31 / 2, 1);
  915. }
  916. }
  917. } // namespace test
  918. } // namespace megdnn
  919. // vim: syntax=cpp.doxygen