
// region_restricted_convolution.cpp

#include "megdnn/dtype.h"
#include "megdnn/opr_param_defs.h"
#include "megdnn/oprs.h"
#include "test/common/checker.h"
#include "test/common/conv_bias.h"
#include "test/common/rng.h"
#include "test/common/tensor.h"
#include "test/common/workspace_wrapper.h"
#include "test/cuda/benchmark.h"
#include "test/cuda/fixture.h"
#include "test/cuda/utils.h"

#include <cudnn.h>

#define V1(x) #x
#define V(x)  V1(x)
#define CUDNN_VERSION_STRING \
    "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V(CUDNN_PATCHLEVEL)

namespace megdnn {
namespace test {
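
// Correctness check for region-restricted depthwise (group) convolution
// forward with large filters. The execs() tensor order is
// {src, filter, rin, rout, dst}; as the shapes suggest, rin/rout carry one
// region-mask value per input/output pixel, and the dtype loop exercises both
// supported mask types.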
TEST_F(CUDA, REGION_RESTRICTED_CONV_FORWARD_LARGE_FILTER) {
    Checker<RegionRestrictedConvolutionForward> checker(handle_cuda());
    for (auto dt : std::vector<DType>{dtype::Int32(), dtype::Uint8()}) {
        auto run = [&checker, &dt](
                           size_t n, size_t g, size_t h, size_t fh, size_t padding,
                           size_t stride) {
            RegionRestrictedConvolution::Param cur_param;
            cur_param.mode =
                    RegionRestrictedConvolution::Param::Mode::CROSS_CORRELATION;
            cur_param.sparse = RegionRestrictedConvolution::Param::Sparse::GROUP;
            checker.set_dtype(2, dt).set_dtype(3, dt);
            float scale = 64.f / sqrt(fh * fh);
            UniformFloatRNG rng(scale, 2 * scale);
            UniformIntRNG r_rng{0, 2};
            checker.set_rng(0, &rng).set_rng(1, &rng).set_rng(2, &r_rng).set_rng(
                    3, &r_rng);
            if (dt.enumv() == DTypeEnum::Float16) {
                checker.set_epsilon(1e-1);
            }
            cur_param.pad_h = cur_param.pad_w = padding;
            cur_param.stride_h = cur_param.stride_w = stride;
            size_t ho = infer_conv_shape(h, fh, stride, padding);
            checker.set_param(cur_param).execs(
                    {{n, g, h, h}, {g, 1, 1, fh, fh}, {n, h, h}, {n, ho, ho}, {}});
        };
        run(4, 8, 32, 3, 3 / 2, 1);
        run(4, 8, 32, 5, 5 / 2, 1);
        run(4, 8, 32, 7, 7 / 2, 1);
        run(1, 2, 32, 9, 9 / 2, 1);
        run(4, 8, 32, 11, 11 / 2, 1);
        run(4, 8, 32, 13, 13 / 2, 1);
        run(4, 8, 32, 15, 15 / 2, 1);
        run(4, 8, 32, 17, 17 / 2, 1);
        run(4, 8, 32, 19, 19 / 2, 1);
        run(4, 8, 32, 21, 21 / 2, 1);
        run(4, 8, 32, 23, 23 / 2, 1);
        run(4, 8, 32, 25, 25 / 2, 1);
        run(4, 8, 32, 27, 27 / 2, 1);
        run(4, 8, 32, 29, 29 / 2, 1);
        run(4, 8, 32, 31, 31 / 2, 1);
    }
}

#if MEGDNN_WITH_BENCHMARK
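// Benchmark: region-restricted depthwise forward vs. the plain
// DEPTHWISE_LARGE_FILTER ConvBias algorithm (Float32 data, Int32 masks) over
// a range of filter sizes.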
TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_FORWARD_LARGE_FILTER_FP32) {
    require_compute_capability(7, 5);
    Benchmarker<ConvBiasForward> bencher(handle_cuda());
    bencher.set_display(false);
    bencher.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
            ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                    "DEPTHWISE_LARGE_FILTER", {})
                    .c_str()));
    Benchmarker<RegionRestrictedConvolutionForward> rr_bencher(handle_cuda());
    rr_bencher.set_display(false);

    ConvBias::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    using NonlineMode = ConvBias::Param::NonlineMode;
    param.nonlineMode = NonlineMode::IDENTITY;
    param.sparse = ConvBias::Param::Sparse::GROUP;

    RegionRestrictedConvolutionForward::Param rr_param;
    rr_param.format = RegionRestrictedConvolutionForward::Param::Format::NCHW;
    rr_param.sparse = RegionRestrictedConvolutionForward::Param::Sparse::GROUP;
    UniformIntRNG r_rng{0, 2};

    auto run_bench = [&](size_t batch, size_t g, size_t hi, size_t wi, size_t fh,
                         size_t fw, size_t sh, size_t sw, size_t nr_times) {
        param.pad_h = fh / 2;
        param.pad_w = fw / 2;
        param.stride_h = sh;
        param.stride_w = sw;
        rr_param.pad_h = fh / 2;
        rr_param.pad_w = fw / 2;
        rr_param.stride_h = sh;
        rr_param.stride_w = sw;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_dtype(4, dtype::Float32());
        bencher.set_times(nr_times);
        rr_bencher.set_param(rr_param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Int32())
                .set_dtype(3, dtype::Int32());
        rr_bencher.set_rng(2, &r_rng).set_rng(3, &r_rng);
        rr_bencher.set_times(nr_times);
        size_t ho = infer_conv_shape(hi, fh, sh, param.pad_h);
        size_t wo = infer_conv_shape(wi, fw, sw, param.pad_w);
        TensorShape inp{batch, g, hi, wi}, kern{g, 1, 1, fh, fw}, rin{batch, hi, wi},
                rout{batch, ho, wo}, out{batch, g, ho, wo};
        // traffic in Gi-elements scaled by 1e3, so dividing by a time in ms and
        // multiplying by the element size (4 bytes, at the printf) yields GB/s
        float bandwidth = static_cast<float>(
                                  inp.total_nr_elems() + kern.total_nr_elems() +
                                  out.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;
        float rr_bandwidth = static_cast<float>(
                                     inp.total_nr_elems() + kern.total_nr_elems() +
                                     rin.total_nr_elems() + rout.total_nr_elems() +
                                     out.total_nr_elems()) /
                             (1024 * 1024 * 1024) * 1e3;
        auto time_in_ms = bencher.execs({inp, kern, {}, {}, out}) / nr_times;
        auto ops = 2.0 * batch * g * ho * wo * fh * fw / (time_in_ms * 1e-3) * 1e-12;
        auto rr_time_in_ms = rr_bencher.execs({inp, kern, rin, rout, out}) / nr_times;
        auto rr_ops =
                2.0 * batch * g * ho * wo * fh * fw / (rr_time_in_ms * 1e-3) * 1e-12;
        printf("RegionRestrictedDepthwiseLargeFilter vs DepthwiseLargeFilter: "
               "inp=%s, kern=%s, out=%s\n"
               "time: %.2f ms, time(rr): %.2f ms, perf: %.2f Tops, "
               "perf(rr): %.2f Tops\n"
               "bandwidth: %.2f GB/s, bandwidth(rr): %.2f GB/s, speedup: %.2f.\n",
               inp.to_string().c_str(), kern.to_string().c_str(),
               out.to_string().c_str(), time_in_ms, rr_time_in_ms, ops, rr_ops,
               bandwidth * 4 / time_in_ms, rr_bandwidth * 4 / rr_time_in_ms,
               time_in_ms / rr_time_in_ms);
    };
    run_bench(64, 384, 32, 32, 3, 3, 1, 1, 10);
    run_bench(64, 384, 32, 32, 5, 5, 1, 1, 10);
    run_bench(64, 384, 32, 32, 7, 7, 1, 1, 10);
    run_bench(64, 384, 32, 32, 9, 9, 1, 1, 10);
    run_bench(64, 384, 32, 32, 11, 11, 1, 1, 10);
    run_bench(64, 384, 32, 32, 13, 13, 1, 1, 10);
    run_bench(64, 384, 32, 32, 15, 15, 1, 1, 10);
    run_bench(64, 384, 32, 32, 17, 17, 1, 1, 10);
    run_bench(64, 384, 32, 32, 19, 19, 1, 1, 10);
    run_bench(64, 384, 32, 32, 21, 21, 1, 1, 10);
    run_bench(64, 384, 32, 32, 23, 23, 1, 1, 10);
    run_bench(64, 384, 32, 32, 25, 25, 1, 1, 10);
    run_bench(64, 384, 32, 32, 27, 27, 1, 1, 10);
    run_bench(64, 384, 32, 32, 29, 29, 1, 1, 10);
    run_bench(64, 384, 32, 32, 31, 31, 1, 1, 10);
}
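
// Benchmark: region-restricted depthwise backward data (dgrad, Int32 masks)
// vs. the plain DEPTHWISE_LARGE_FILTER dgrad algorithm. Note the execs()
// argument order for backward data: {filter, diff, ...} producing grad.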
TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_BACKWARD_LARGE_FILTER_FP32) {
    require_compute_capability(7, 5);
    Benchmarker<ConvolutionBackwardData> bencher(handle_cuda());
    bencher.set_display(false);
    bencher.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("DEPTHWISE_LARGE_FILTER"));
    Benchmarker<RegionRestrictedConvolutionBackwardData> rr_bencher(handle_cuda());
    rr_bencher.set_display(false);

    ConvolutionBackwardData::Param param;
    param.format = ConvolutionBackwardData::Param::Format::NCHW;
    param.sparse = ConvolutionBackwardData::Param::Sparse::GROUP;

    RegionRestrictedConvolutionBackwardData::Param rr_param;
    rr_param.format = RegionRestrictedConvolutionBackwardData::Param::Format::NCHW;
    rr_param.sparse = RegionRestrictedConvolutionBackwardData::Param::Sparse::GROUP;
    UniformIntRNG r_rng{1, 3};

    auto run_bench = [&](size_t batch, size_t g, size_t hi, size_t wi, size_t fh,
                         size_t fw, size_t sh, size_t sw, size_t nr_times) {
        param.pad_h = fh / 2;
        param.pad_w = fw / 2;
        param.stride_h = sh;
        param.stride_w = sw;
        rr_param.pad_h = fh / 2;
        rr_param.pad_w = fw / 2;
        rr_param.stride_h = sh;
        rr_param.stride_w = sw;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_dtype(4, dtype::Float32());
        bencher.set_times(nr_times);
        rr_bencher.set_param(rr_param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Int32())
                .set_dtype(3, dtype::Int32());
        rr_bencher.set_rng(2, &r_rng).set_rng(3, &r_rng);
        rr_bencher.set_times(nr_times);
        size_t ho = infer_conv_shape(hi, fh, sh, param.pad_h);
        size_t wo = infer_conv_shape(wi, fw, sw, param.pad_w);
        TensorShape inp{batch, g, hi, wi} /*src*/, kern{g, 1, 1, fh, fw} /*filter*/,
                rin{batch, hi, wi}, rout{batch, ho, wo},
                out{batch, g, ho, wo} /*output*/;
        float bandwidth = static_cast<float>(
                                  inp.total_nr_elems() + kern.total_nr_elems() +
                                  out.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;
        float rr_bandwidth = static_cast<float>(
                                     inp.total_nr_elems() + kern.total_nr_elems() +
                                     rin.total_nr_elems() + rout.total_nr_elems() +
                                     out.total_nr_elems()) /
                             (1024 * 1024 * 1024) * 1e3;
        // backward data takes {filter, diff, grad}; the rr variant inserts the
        // region masks: {filter, diff, rin, rout, grad}
        auto time_in_ms = bencher.execs({kern, out, inp}) / nr_times;
        auto ops = 2.0 * batch * g * ho * wo * fh * fw / (time_in_ms * 1e-3) * 1e-12;
        auto rr_time_in_ms = rr_bencher.execs({kern, out, rin, rout, inp}) / nr_times;
        auto rr_ops =
                2.0 * batch * g * ho * wo * fh * fw / (rr_time_in_ms * 1e-3) * 1e-12;
        printf("[DGRAD]RegionRestrictedDepthwiseLargeFilter vs DepthwiseLargeFilter: "
               "grad=%s, kern=%s, diff=%s\n"
               "time: %.2f ms, time(rr): %.2f ms, perf: %.2f Tops, "
               "perf(rr): %.2f Tops\n"
               "bandwidth: %.2f GB/s, bandwidth(rr): %.2f GB/s, speedup: %.2f.\n",
               inp.to_string().c_str(), kern.to_string().c_str(),
               out.to_string().c_str(), time_in_ms, rr_time_in_ms, ops, rr_ops,
               bandwidth * 4 / time_in_ms, rr_bandwidth * 4 / rr_time_in_ms,
               time_in_ms / rr_time_in_ms);
    };
    run_bench(64, 384, 32, 32, 3, 3, 1, 1, 10);
    run_bench(64, 384, 32, 32, 5, 5, 1, 1, 10);
    run_bench(64, 384, 32, 32, 7, 7, 1, 1, 10);
    run_bench(64, 384, 32, 32, 9, 9, 1, 1, 10);
    run_bench(64, 384, 32, 32, 11, 11, 1, 1, 10);
    run_bench(64, 384, 32, 32, 13, 13, 1, 1, 10);
    run_bench(64, 384, 32, 32, 15, 15, 1, 1, 10);
    run_bench(64, 384, 32, 32, 17, 17, 1, 1, 10);
    run_bench(64, 384, 32, 32, 19, 19, 1, 1, 10);
    run_bench(64, 384, 32, 32, 21, 21, 1, 1, 10);
    run_bench(64, 384, 32, 32, 23, 23, 1, 1, 10);
    run_bench(64, 384, 32, 32, 25, 25, 1, 1, 10);
    run_bench(64, 384, 32, 32, 27, 27, 1, 1, 10);
    run_bench(64, 384, 32, 32, 29, 29, 1, 1, 10);
    run_bench(64, 384, 32, 32, 31, 31, 1, 1, 10);
}
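
// Same dgrad benchmark as above, but with Uint8 region masks; the Float32
// reference operator is unchanged, so only the rr operator's mask dtype
// differs.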
TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_BACKWARD_LARGE_FILTER_FP32_UINT8) {
    require_compute_capability(7, 5);
    Benchmarker<ConvolutionBackwardData> bencher(handle_cuda());
    bencher.set_display(false);
    bencher.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("DEPTHWISE_LARGE_FILTER"));
    Benchmarker<RegionRestrictedConvolutionBackwardData> rr_bencher(handle_cuda());
    rr_bencher.set_display(false);

    ConvolutionBackwardData::Param param;
    param.format = ConvolutionBackwardData::Param::Format::NCHW;
    param.sparse = ConvolutionBackwardData::Param::Sparse::GROUP;

    RegionRestrictedConvolutionBackwardData::Param rr_param;
    rr_param.format = RegionRestrictedConvolutionBackwardData::Param::Format::NCHW;
    rr_param.sparse = RegionRestrictedConvolutionBackwardData::Param::Sparse::GROUP;
    UniformIntRNG r_rng{1, 3};

    auto run_bench = [&](size_t batch, size_t g, size_t hi, size_t wi, size_t fh,
                         size_t fw, size_t sh, size_t sw, size_t nr_times) {
        param.pad_h = fh / 2;
        param.pad_w = fw / 2;
        param.stride_h = sh;
        param.stride_w = sw;
        rr_param.pad_h = fh / 2;
        rr_param.pad_w = fw / 2;
        rr_param.stride_h = sh;
        rr_param.stride_w = sw;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_dtype(4, dtype::Float32());
        bencher.set_times(nr_times);
        rr_bencher.set_param(rr_param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Uint8())
                .set_dtype(3, dtype::Uint8());
        rr_bencher.set_rng(2, &r_rng).set_rng(3, &r_rng);
        rr_bencher.set_times(nr_times);
        size_t ho = infer_conv_shape(hi, fh, sh, param.pad_h);
        size_t wo = infer_conv_shape(wi, fw, sw, param.pad_w);
        TensorShape inp{batch, g, hi, wi} /*src*/, kern{g, 1, 1, fh, fw} /*filter*/,
                rin{batch, hi, wi}, rout{batch, ho, wo},
                out{batch, g, ho, wo} /*output*/;
        // the GB/s figures below still assume 4-byte elements for every tensor,
        // so the rr bandwidth is slightly overstated for the Uint8 masks
        float bandwidth = static_cast<float>(
                                  inp.total_nr_elems() + kern.total_nr_elems() +
                                  out.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;
        float rr_bandwidth = static_cast<float>(
                                     inp.total_nr_elems() + kern.total_nr_elems() +
                                     rin.total_nr_elems() + rout.total_nr_elems() +
                                     out.total_nr_elems()) /
                             (1024 * 1024 * 1024) * 1e3;
        auto time_in_ms = bencher.execs({kern, out, inp}) / nr_times;
        auto ops = 2.0 * batch * g * ho * wo * fh * fw / (time_in_ms * 1e-3) * 1e-12;
        auto rr_time_in_ms = rr_bencher.execs({kern, out, rin, rout, inp}) / nr_times;
        auto rr_ops =
                2.0 * batch * g * ho * wo * fh * fw / (rr_time_in_ms * 1e-3) * 1e-12;
        printf("[DGRAD]RegionRestrictedDepthwiseLargeFilter vs DepthwiseLargeFilter: "
               "grad=%s, kern=%s, diff=%s\n"
               "time: %.2f ms, time(rr): %.2f ms, perf: %.2f Tops, "
               "perf(rr): %.2f Tops\n"
               "bandwidth: %.2f GB/s, bandwidth(rr): %.2f GB/s, speedup: %.2f.\n",
               inp.to_string().c_str(), kern.to_string().c_str(),
               out.to_string().c_str(), time_in_ms, rr_time_in_ms, ops, rr_ops,
               bandwidth * 4 / time_in_ms, rr_bandwidth * 4 / rr_time_in_ms,
               time_in_ms / rr_time_in_ms);
    };
    run_bench(64, 384, 32, 32, 3, 3, 1, 1, 10);
    run_bench(64, 384, 32, 32, 5, 5, 1, 1, 10);
    run_bench(64, 384, 32, 32, 7, 7, 1, 1, 10);
    run_bench(64, 384, 32, 32, 9, 9, 1, 1, 10);
    run_bench(64, 384, 32, 32, 11, 11, 1, 1, 10);
    run_bench(64, 384, 32, 32, 13, 13, 1, 1, 10);
    run_bench(64, 384, 32, 32, 15, 15, 1, 1, 10);
    run_bench(64, 384, 32, 32, 17, 17, 1, 1, 10);
    run_bench(64, 384, 32, 32, 19, 19, 1, 1, 10);
    run_bench(64, 384, 32, 32, 21, 21, 1, 1, 10);
    run_bench(64, 384, 32, 32, 23, 23, 1, 1, 10);
    run_bench(64, 384, 32, 32, 25, 25, 1, 1, 10);
    run_bench(64, 384, 32, 32, 27, 27, 1, 1, 10);
    run_bench(64, 384, 32, 32, 29, 29, 1, 1, 10);
    run_bench(64, 384, 32, 32, 31, 31, 1, 1, 10);
}
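
// Forward benchmark again, now with Uint8 region masks. Here rng slot 0 is
// also set to the small-integer RNG, so the Float32 src is filled with small
// integer values as well.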
TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_FORWARD_LARGE_FILTER_UINT8) {
    require_compute_capability(7, 5);
    Benchmarker<ConvBiasForward> bencher(handle_cuda());
    bencher.set_display(false);
    bencher.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
            ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                    "DEPTHWISE_LARGE_FILTER", {})
                    .c_str()));
    Benchmarker<RegionRestrictedConvolutionForward> rr_bencher(handle_cuda());
    rr_bencher.set_display(false);

    ConvBias::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    using NonlineMode = ConvBias::Param::NonlineMode;
    param.nonlineMode = NonlineMode::IDENTITY;
    param.sparse = ConvBias::Param::Sparse::GROUP;

    RegionRestrictedConvolutionForward::Param rr_param;
    rr_param.format = RegionRestrictedConvolutionForward::Param::Format::NCHW;
    rr_param.sparse = RegionRestrictedConvolutionForward::Param::Sparse::GROUP;
    UniformIntRNG r_rng{0, 2};

    auto run_bench = [&](size_t batch, size_t g, size_t hi, size_t wi, size_t fh,
                         size_t fw, size_t sh, size_t sw, size_t nr_times) {
        param.pad_h = fh / 2;
        param.pad_w = fw / 2;
        param.stride_h = sh;
        param.stride_w = sw;
        rr_param.pad_h = fh / 2;
        rr_param.pad_w = fw / 2;
        rr_param.stride_h = sh;
        rr_param.stride_w = sw;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_dtype(4, dtype::Float32());
        bencher.set_times(nr_times);
        rr_bencher.set_param(rr_param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Uint8())
                .set_dtype(3, dtype::Uint8());
        rr_bencher.set_rng(2, &r_rng).set_rng(3, &r_rng).set_rng(0, &r_rng);
        rr_bencher.set_times(nr_times);
        size_t ho = infer_conv_shape(hi, fh, sh, param.pad_h);
        size_t wo = infer_conv_shape(wi, fw, sw, param.pad_w);
        TensorShape inp{batch, g, hi, wi}, kern{g, 1, 1, fh, fw}, rin{batch, hi, wi},
                rout{batch, ho, wo}, out{batch, g, ho, wo};
        float bandwidth = static_cast<float>(
                                  inp.total_nr_elems() + kern.total_nr_elems() +
                                  out.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;
        float rr_bandwidth = static_cast<float>(
                                     inp.total_nr_elems() + kern.total_nr_elems() +
                                     rin.total_nr_elems() + rout.total_nr_elems() +
                                     out.total_nr_elems()) /
                             (1024 * 1024 * 1024) * 1e3;
        auto time_in_ms = bencher.execs({inp, kern, {}, {}, out}) / nr_times;
        auto ops = 2.0 * batch * g * ho * wo * fh * fw / (time_in_ms * 1e-3) * 1e-12;
        auto rr_time_in_ms = rr_bencher.execs({inp, kern, rin, rout, out}) / nr_times;
        auto rr_ops =
                2.0 * batch * g * ho * wo * fh * fw / (rr_time_in_ms * 1e-3) * 1e-12;
        printf("RegionRestrictedDepthwiseLargeFilter vs DepthwiseLargeFilter: "
               "inp=%s, kern=%s, out=%s\n"
               "time: %.2f ms, time(rr): %.2f ms, perf: %.2f Tops, "
               "perf(rr): %.2f Tops\n"
               "bandwidth: %.2f GB/s, bandwidth(rr): %.2f GB/s, speedup: %.2f.\n",
               inp.to_string().c_str(), kern.to_string().c_str(),
               out.to_string().c_str(), time_in_ms, rr_time_in_ms, ops, rr_ops,
               bandwidth * 4 / time_in_ms, rr_bandwidth * 4 / rr_time_in_ms,
               time_in_ms / rr_time_in_ms);
    };
    run_bench(64, 384, 32, 32, 3, 3, 1, 1, 10);
    run_bench(64, 384, 32, 32, 5, 5, 1, 1, 10);
    run_bench(64, 384, 32, 32, 7, 7, 1, 1, 10);
    run_bench(64, 384, 32, 32, 9, 9, 1, 1, 10);
    run_bench(64, 384, 32, 32, 11, 11, 1, 1, 10);
    run_bench(64, 384, 32, 32, 13, 13, 1, 1, 10);
    run_bench(64, 384, 32, 32, 15, 15, 1, 1, 10);
    run_bench(64, 384, 32, 32, 17, 17, 1, 1, 10);
    run_bench(64, 384, 32, 32, 19, 19, 1, 1, 10);
    run_bench(64, 384, 32, 32, 21, 21, 1, 1, 10);
    run_bench(64, 384, 32, 32, 23, 23, 1, 1, 10);
    run_bench(64, 384, 32, 32, 25, 25, 1, 1, 10);
    run_bench(64, 384, 32, 32, 27, 27, 1, 1, 10);
    run_bench(64, 384, 32, 32, 29, 29, 1, 1, 10);
    run_bench(64, 384, 32, 32, 31, 31, 1, 1, 10);
}
#endif
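
// Correctness check for region-restricted backward data, run for both mask
// dtypes. Masks are drawn from {1, 2}; 0 is avoided since a zero mask value
// may cause unexpected behaviour (see the note in the next test).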
TEST_F(CUDA, REGION_RESTRICTED_CONV_BWD_DATA_FP32) {
    Checker<RegionRestrictedConvolutionBackwardData> checker(handle_cuda());
    for (auto dt : std::vector<DType>{dtype::Int32(), dtype::Uint8()}) {
        auto run = [&checker, &dt](
                           size_t n, size_t g, size_t ih, size_t fh, size_t padding,
                           size_t stride) {
            RegionRestrictedConvolutionBackwardData::Param cur_param;
            cur_param.mode = RegionRestrictedConvolutionBackwardData::Param::Mode::
                    CROSS_CORRELATION;
            cur_param.compute_mode = RegionRestrictedConvolutionBackwardData::Param::
                    ComputeMode::DEFAULT;
            cur_param.sparse =
                    RegionRestrictedConvolutionBackwardData::Param::Sparse::GROUP;
            checker.set_dtype(0, dtype::Float32())
                    .set_dtype(1, dtype::Float32())
                    .set_dtype(2, dt)
                    .set_dtype(3, dt);
            float scale = 64.f / sqrt(fh * fh);
            UniformFloatRNG rng(scale, 2 * scale);
            UniformIntRNG r_rng{1, 2};
            checker.set_rng(0, &rng).set_rng(1, &rng).set_rng(2, &r_rng).set_rng(
                    3, &r_rng);
            cur_param.pad_h = cur_param.pad_w = padding;
            cur_param.stride_h = cur_param.stride_w = stride;
            // standard conv output size; identical to the former
            // (ih + 2 * padding - fh + 1) / stride for the stride == 1 cases
            // exercised below
            size_t oh = (ih + 2 * padding - fh) / stride + 1;
            checker.set_param(cur_param).execs({
                    {g, 1, 1, fh, fh},   // filter
                    {n, g * 1, oh, oh},  // diff
                    {n, ih, ih},         // rin
                    {n, oh, oh},         // rout
                    {n, g * 1, ih, ih}   // grad
            });
        };
        if (dt == dtype::Int32()) {
            // small spatial sizes only for Int32 masks: the Uint8 kernel
            // requires the spatial size of src/dst to be a multiple of 4
            // (see the note in the next test)
            run(4, 8, 32, 5, 5 / 2, 1);
            run(1, 2, 2, 2, 0, 1);
            run(1, 2, 3, 3, 0, 1);
            run(1, 2, 4, 4, 0, 1);
            run(1, 2, 5, 5, 0, 1);
            run(1, 2, 6, 6, 0, 1);
            run(1, 2, 7, 7, 0, 1);
        }
        run(4, 8, 32, 7, 7 / 2, 1);
        run(4, 8, 32, 9, 9 / 2, 1);
        run(4, 8, 32, 11, 11 / 2, 1);
        run(4, 8, 32, 13, 13 / 2, 1);
        run(4, 8, 32, 15, 15 / 2, 1);
        run(4, 8, 32, 17, 17 / 2, 1);
        run(4, 8, 32, 19, 19 / 2, 1);
        run(4, 8, 32, 21, 21 / 2, 1);
        run(4, 8, 32, 23, 23 / 2, 1);
        run(4, 8, 32, 25, 25 / 2, 1);
        run(4, 8, 32, 27, 27 / 2, 1);
        run(4, 8, 32, 29, 29 / 2, 1);
        run(4, 8, 32, 31, 31 / 2, 1);
    }
}
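
// Variant of the backward-data check with every mask value fixed to 1, so
// rin and rout agree everywhere and the region restriction is effectively a
// no-op; the result should match an ordinary depthwise dgrad.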
TEST_F(CUDA, REGION_RESTRICTED_CONV_BWD_DATA_FP32_RIN_EQ_ROUT) {
    Checker<RegionRestrictedConvolutionBackwardData> checker(handle_cuda());
    for (auto dt : std::vector<DType>{dtype::Int32()}) {
        auto run = [&checker, &dt](
                           size_t n, size_t g, size_t ih, size_t fh, size_t padding,
                           size_t stride) {
            RegionRestrictedConvolutionBackwardData::Param cur_param;
            cur_param.mode = RegionRestrictedConvolutionBackwardData::Param::Mode::
                    CROSS_CORRELATION;
            cur_param.compute_mode = RegionRestrictedConvolutionBackwardData::Param::
                    ComputeMode::DEFAULT;
            cur_param.sparse =
                    RegionRestrictedConvolutionBackwardData::Param::Sparse::GROUP;
            checker.set_dtype(2, dt).set_dtype(3, dt);
            float scale = 64.f / sqrt(fh * fh);
            UniformFloatRNG rng(scale, 2 * scale);
            // a mask value of 0 may cause unexpected behaviour, so draw the
            // constant value 1
            UniformIntRNG r_rng{1, 1};
            checker.set_rng(0, &rng).set_rng(1, &rng).set_rng(2, &r_rng).set_rng(
                    3, &r_rng);
            cur_param.pad_h = cur_param.pad_w = padding;
            cur_param.stride_h = cur_param.stride_w = stride;
            size_t oh = (ih + 2 * padding - fh) / stride + 1;
            checker.set_param(cur_param).execs(
                    {/*filter*/ {g, 1, 1, fh, fh},
                     /*diff*/ {n, g * 1, oh, oh},
                     /*rin*/ {n, ih, ih},
                     /*rout*/ {n, oh, oh},
                     /*grad*/ {n, g * 1, ih, ih}});
        };
        if (dt == dtype::Int32()) {
            // NOTE: the Uint8 kernel asserts that the spatial size of src and
            // dst is a multiple of 4, so these sizes run with Int32 masks only
            run(4, 8, 32, 5, 5 / 2, 1);
            run(1, 2, 2, 2, 0, 1);
            run(1, 2, 3, 3, 0, 1);
            run(1, 2, 4, 4, 0, 1);
            run(1, 2, 5, 5, 0, 1);
            run(1, 2, 6, 6, 0, 1);
            run(1, 2, 7, 7, 0, 1);
        }
        run(4, 8, 32, 7, 7 / 2, 1);
        run(4, 8, 32, 9, 9 / 2, 1);
        run(4, 8, 32, 11, 11 / 2, 1);
        run(4, 8, 32, 13, 13 / 2, 1);
        run(4, 8, 32, 15, 15 / 2, 1);
        run(4, 8, 32, 17, 17 / 2, 1);
        run(4, 8, 32, 19, 19 / 2, 1);
        run(4, 8, 32, 21, 21 / 2, 1);
        run(4, 8, 32, 23, 23 / 2, 1);
        run(4, 8, 32, 25, 25 / 2, 1);
        run(4, 8, 32, 27, 27 / 2, 1);
        run(4, 8, 32, 29, 29 / 2, 1);
        run(4, 8, 32, 31, 31 / 2, 1);
    }
}

} // namespace test
} // namespace megdnn

// vim: syntax=cpp.doxygen