
region_restricted_convolution.cpp

#include "megdnn/dtype.h"
#include "megdnn/opr_param_defs.h"
#include "megdnn/oprs.h"
#include "test/common/checker.h"
#include "test/common/conv_bias.h"
#include "test/common/rng.h"
#include "test/common/tensor.h"
#include "test/common/workspace_wrapper.h"
#include "test/cuda/benchmark.h"
#include "test/cuda/fixture.h"
#include "test/cuda/utils.h"

#include <cudnn.h>
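
// Two-level stringization: V1 stringizes its argument, while V expands the
// macro argument first, so CUDNN_VERSION_STRING evaluates to e.g. "v8.2.1".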
#define V1(x) #x
#define V(x) V1(x)
#define CUDNN_VERSION_STRING \
    "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V(CUDNN_PATCHLEVEL)

namespace megdnn {
namespace test {

TEST_F(CUDA, REGION_RESTRICTED_CONV_FORWARD_LARGE_FILTER) {
    Checker<RegionRestrictedConvolutionForward> checker(handle_cuda());
    auto opr = handle_cuda()->create_operator<ConvolutionForward>();
    for (auto dt : std::vector<DType>{dtype::Int32(), dtype::Uint8()}) {
        auto run = [&checker, &dt, &opr](
                           size_t n, size_t g, size_t h, size_t fh, size_t padding,
                           size_t stride) {
            RegionRestrictedConvolution::Param cur_param;
            cur_param.mode =
                    RegionRestrictedConvolution::Param::Mode::CROSS_CORRELATION;
            cur_param.sparse = RegionRestrictedConvolution::Param::Sparse::GROUP;
            checker.set_dtype(2, dt).set_dtype(3, dt);
            float scale = 64.f / sqrt(fh * fh);
            UniformFloatRNG rng(scale, 2 * scale);
            UniformIntRNG r_rng{0, 2};
            checker.set_rng(0, &rng).set_rng(1, &rng).set_rng(2, &r_rng).set_rng(
                    3, &r_rng);
            if (dt.enumv() == DTypeEnum::Float16) {
                checker.set_epsilon(1e-1);
            }
            cur_param.pad_h = cur_param.pad_w = padding;
            cur_param.stride_h = cur_param.stride_w = stride;
            size_t ho = infer_conv_shape(h, fh, stride, padding);
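            // execs() argument order: src {N, G, H, W} (one channel per group),
            // filter {G, 1, 1, FH, FW}, rin {N, H, W}, rout {N, HO, WO}, and an
            // empty dst shape that the checker deduces from the params.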
            checker.set_param(cur_param).execs(
                    {{n, g, h, h}, {g, 1, 1, fh, fh}, {n, h, h}, {n, ho, ho}, {}});
        };
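        // The padding argument fh / 2 uses integer division, which gives SAME
        // output size for odd filters at stride 1.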
        run(4, 8, 32, 3, 3 / 2, 1);
        run(4, 8, 32, 5, 5 / 2, 1);
        run(4, 8, 32, 7, 7 / 2, 1);
        run(1, 2, 32, 9, 9 / 2, 1);
        run(4, 8, 32, 11, 11 / 2, 1);
        run(4, 8, 32, 13, 13 / 2, 1);
        run(4, 8, 32, 15, 15 / 2, 1);
        run(4, 8, 32, 17, 17 / 2, 1);
        run(4, 8, 32, 19, 19 / 2, 1);
        run(4, 8, 32, 21, 21 / 2, 1);
        run(4, 8, 32, 23, 23 / 2, 1);
        run(4, 8, 32, 25, 25 / 2, 1);
        run(4, 8, 32, 27, 27 / 2, 1);
        run(4, 8, 32, 29, 29 / 2, 1);
        run(4, 8, 32, 31, 31 / 2, 1);
    }
}

#if MEGDNN_WITH_BENCHMARK
TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_FORWARD_LARGE_FILTER_FP32) {
    require_compute_capability(7, 5);
    Benchmarker<ConvBiasForward> bencher(handle_cuda());
    bencher.set_display(false);
    bencher.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
            ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                    "DEPTHWISE_LARGE_FILTER", {})
                    .c_str()));
    Benchmarker<RegionRestrictedConvolutionForward> rr_bencher(handle_cuda());
    rr_bencher.set_display(false);

    ConvBias::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    using NonlineMode = ConvBias::Param::NonlineMode;
    param.nonlineMode = NonlineMode::IDENTITY;
    param.sparse = ConvBias::Param::Sparse::GROUP;

    RegionRestrictedConvolutionForward::Param rr_param;
    rr_param.format = RegionRestrictedConvolutionForward::Param::Format::NCHW;
    rr_param.sparse = RegionRestrictedConvolutionForward::Param::Sparse::GROUP;
    UniformIntRNG r_rng{0, 2};

    auto run_bench = [&](size_t batch, size_t g, size_t hi, size_t wi, size_t fh,
                         size_t fw, size_t sh, size_t sw, size_t nr_times) {
        param.pad_h = fh / 2;
        param.pad_w = fw / 2;
        param.stride_h = sh;
        param.stride_w = sw;
        rr_param.pad_h = fh / 2;
        rr_param.pad_w = fw / 2;
        rr_param.stride_h = sh;
        rr_param.stride_w = sw;

        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_dtype(4, dtype::Float32());
        bencher.set_times(nr_times);

        rr_bencher.set_param(rr_param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Int32())
                .set_dtype(3, dtype::Int32());
        rr_bencher.set_rng(2, &r_rng).set_rng(3, &r_rng);
        rr_bencher.set_times(nr_times);

        size_t ho = infer_conv_shape(hi, fh, sh, param.pad_h);
        size_t wo = infer_conv_shape(wi, fw, sw, param.pad_w);
        TensorShape inp{batch, g, hi, wi}, kern{g, 1, 1, fh, fw}, rin{batch, hi, wi},
                rout{batch, ho, wo}, out{batch, g, ho, wo};

        float bandwidth = static_cast<float>(
                                  inp.total_nr_elems() + kern.total_nr_elems() +
                                  out.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;
        float rr_bandwidth = static_cast<float>(
                                     inp.total_nr_elems() + kern.total_nr_elems() +
                                     rin.total_nr_elems() + rout.total_nr_elems() +
                                     out.total_nr_elems()) /
                             (1024 * 1024 * 1024) * 1e3;
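        // Element counts are scaled to GiB and multiplied by 1e3 so that, after
        // the "* 4 bytes / time_in_ms" at the printf below, the figure comes out
        // in GB/s. The FLOP count assumes one MAC (2 FLOPs) per filter tap per
        // output element; * 1e-12 converts it to Tops.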
        auto time_in_ms = bencher.execs({inp, kern, {}, {}, out}) / nr_times;
        auto ops = 2.0 * batch * g * ho * wo * fh * fw / (time_in_ms * 1e-3) * 1e-12;
        auto rr_time_in_ms = rr_bencher.execs({inp, kern, rin, rout, out}) / nr_times;
        auto rr_ops =
                2.0 * batch * g * ho * wo * fh * fw / (rr_time_in_ms * 1e-3) * 1e-12;
        printf("RegionRestrictedDepthwiseLargeFilter vs DepthwiseLargeFilter: inp=%s, "
               "kern=%s, out=%s\n"
               "time: %.2f ms, time(rr): %.2f ms, perf: %.2f Tops, perf(rr): %.2f Tops\n"
               "bandwidth: %.2f GB/s, bandwidth(rr): %.2f GB/s, speedup: %.2f.\n",
               inp.to_string().c_str(), kern.to_string().c_str(),
               out.to_string().c_str(), time_in_ms, rr_time_in_ms, ops, rr_ops,
               bandwidth * 4 / time_in_ms, rr_bandwidth * 4 / rr_time_in_ms,
               time_in_ms / rr_time_in_ms);
    };
    run_bench(64, 384, 32, 32, 3, 3, 1, 1, 10);
    run_bench(64, 384, 32, 32, 5, 5, 1, 1, 10);
    run_bench(64, 384, 32, 32, 7, 7, 1, 1, 10);
    run_bench(64, 384, 32, 32, 9, 9, 1, 1, 10);
    run_bench(64, 384, 32, 32, 11, 11, 1, 1, 10);
    run_bench(64, 384, 32, 32, 13, 13, 1, 1, 10);
    run_bench(64, 384, 32, 32, 15, 15, 1, 1, 10);
    run_bench(64, 384, 32, 32, 17, 17, 1, 1, 10);
    run_bench(64, 384, 32, 32, 19, 19, 1, 1, 10);
    run_bench(64, 384, 32, 32, 21, 21, 1, 1, 10);
    run_bench(64, 384, 32, 32, 23, 23, 1, 1, 10);
    run_bench(64, 384, 32, 32, 25, 25, 1, 1, 10);
    run_bench(64, 384, 32, 32, 27, 27, 1, 1, 10);
    run_bench(64, 384, 32, 32, 29, 29, 1, 1, 10);
    run_bench(64, 384, 32, 32, 31, 31, 1, 1, 10);
}

TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_BACKWARD_LARGE_FILTER_FP32) {
    require_compute_capability(7, 5);
    Benchmarker<ConvolutionBackwardData> bencher(handle_cuda());
    bencher.set_display(false);
    bencher.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("DEPTHWISE_LARGE_FILTER"));
    Benchmarker<RegionRestrictedConvolutionBackwardData> rr_bencher(handle_cuda());
    rr_bencher.set_display(false);

    ConvolutionBackwardData::Param param;
    param.format = ConvolutionBackwardData::Param::Format::NCHW;
    param.sparse = ConvolutionBackwardData::Param::Sparse::GROUP;

    RegionRestrictedConvolutionBackwardData::Param rr_param;
    rr_param.format = RegionRestrictedConvolutionBackwardData::Param::Format::NCHW;
    rr_param.sparse = RegionRestrictedConvolutionBackwardData::Param::Sparse::GROUP;
    UniformIntRNG r_rng{1, 3};

    auto run_bench = [&](size_t batch, size_t g, size_t hi, size_t wi, size_t fh,
                         size_t fw, size_t sh, size_t sw, size_t nr_times) {
        param.pad_h = fh / 2;
        param.pad_w = fw / 2;
        param.stride_h = sh;
        param.stride_w = sw;
        rr_param.pad_h = fh / 2;
        rr_param.pad_w = fw / 2;
        rr_param.stride_h = sh;
        rr_param.stride_w = sw;

        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_dtype(4, dtype::Float32());
        bencher.set_times(nr_times);

        rr_bencher.set_param(rr_param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Int32())
                .set_dtype(3, dtype::Int32());
        rr_bencher.set_rng(2, &r_rng).set_rng(3, &r_rng);
        rr_bencher.set_times(nr_times);

        size_t ho = infer_conv_shape(hi, fh, sh, param.pad_h);
        size_t wo = infer_conv_shape(wi, fw, sw, param.pad_w);
        TensorShape inp{batch, g, hi, wi} /*src*/, kern{g, 1, 1, fh, fw} /*filter*/,
                rin{batch, hi, wi}, rout{batch, ho, wo},
                out{batch, g, ho, wo} /*output*/;

        float bandwidth = static_cast<float>(
                                  inp.total_nr_elems() + kern.total_nr_elems() +
                                  out.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;
        float rr_bandwidth = static_cast<float>(
                                     inp.total_nr_elems() + kern.total_nr_elems() +
                                     rin.total_nr_elems() + rout.total_nr_elems() +
                                     out.total_nr_elems()) /
                             (1024 * 1024 * 1024) * 1e3;
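        // dgrad argument order: {filter, diff, grad} for the dense benchmark;
        // the region-restricted variant additionally takes the rin/rout masks
        // as {filter, diff, rin, rout, grad}.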
        auto time_in_ms = bencher.execs({kern, out, inp}) / nr_times;
        auto ops = 2.0 * batch * g * ho * wo * fh * fw / (time_in_ms * 1e-3) * 1e-12;
        auto rr_time_in_ms = rr_bencher.execs({kern, out, rin, rout, inp}) / nr_times;
        auto rr_ops =
                2.0 * batch * g * ho * wo * fh * fw / (rr_time_in_ms * 1e-3) * 1e-12;
        printf("[DGRAD] RegionRestrictedDepthwiseLargeFilter vs DepthwiseLargeFilter: "
               "grad=%s, "
               "kern=%s, diff=%s\n"
               "time: %.2f ms, time(rr): %.2f ms, perf: %.2f Tops, perf(rr): %.2f Tops\n"
               "bandwidth: %.2f GB/s, bandwidth(rr): %.2f GB/s, speedup: %.2f.\n",
               inp.to_string().c_str(), kern.to_string().c_str(),
               out.to_string().c_str(), time_in_ms, rr_time_in_ms, ops, rr_ops,
               bandwidth * 4 / time_in_ms, rr_bandwidth * 4 / rr_time_in_ms,
               time_in_ms / rr_time_in_ms);
    };
    run_bench(64, 384, 32, 32, 3, 3, 1, 1, 10);
    run_bench(64, 384, 32, 32, 5, 5, 1, 1, 10);
    run_bench(64, 384, 32, 32, 7, 7, 1, 1, 10);
    run_bench(64, 384, 32, 32, 9, 9, 1, 1, 10);
    run_bench(64, 384, 32, 32, 11, 11, 1, 1, 10);
    run_bench(64, 384, 32, 32, 13, 13, 1, 1, 10);
    run_bench(64, 384, 32, 32, 15, 15, 1, 1, 10);
    run_bench(64, 384, 32, 32, 17, 17, 1, 1, 10);
    run_bench(64, 384, 32, 32, 19, 19, 1, 1, 10);
    run_bench(64, 384, 32, 32, 21, 21, 1, 1, 10);
    run_bench(64, 384, 32, 32, 23, 23, 1, 1, 10);
    run_bench(64, 384, 32, 32, 25, 25, 1, 1, 10);
    run_bench(64, 384, 32, 32, 27, 27, 1, 1, 10);
    run_bench(64, 384, 32, 32, 29, 29, 1, 1, 10);
    run_bench(64, 384, 32, 32, 31, 31, 1, 1, 10);
}

TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_BACKWARD_LARGE_FILTER_FP32_UINT8) {
    require_compute_capability(7, 5);
    Benchmarker<ConvolutionBackwardData> bencher(handle_cuda());
    bencher.set_display(false);
    bencher.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("DEPTHWISE_LARGE_FILTER"));
    Benchmarker<RegionRestrictedConvolutionBackwardData> rr_bencher(handle_cuda());
    rr_bencher.set_display(false);

    ConvolutionBackwardData::Param param;
    param.format = ConvolutionBackwardData::Param::Format::NCHW;
    param.sparse = ConvolutionBackwardData::Param::Sparse::GROUP;

    RegionRestrictedConvolutionBackwardData::Param rr_param;
    rr_param.format = RegionRestrictedConvolutionBackwardData::Param::Format::NCHW;
    rr_param.sparse = RegionRestrictedConvolutionBackwardData::Param::Sparse::GROUP;
    UniformIntRNG r_rng{1, 3};

    auto run_bench = [&](size_t batch, size_t g, size_t hi, size_t wi, size_t fh,
                         size_t fw, size_t sh, size_t sw, size_t nr_times) {
        param.pad_h = fh / 2;
        param.pad_w = fw / 2;
        param.stride_h = sh;
        param.stride_w = sw;
        rr_param.pad_h = fh / 2;
        rr_param.pad_w = fw / 2;
        rr_param.stride_h = sh;
        rr_param.stride_w = sw;

        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_dtype(4, dtype::Float32());
        bencher.set_times(nr_times);

        rr_bencher.set_param(rr_param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Uint8())
                .set_dtype(3, dtype::Uint8());
        rr_bencher.set_rng(2, &r_rng).set_rng(3, &r_rng);
        rr_bencher.set_times(nr_times);

        size_t ho = infer_conv_shape(hi, fh, sh, param.pad_h);
        size_t wo = infer_conv_shape(wi, fw, sw, param.pad_w);
        TensorShape inp{batch, g, hi, wi} /*src*/, kern{g, 1, 1, fh, fw} /*filter*/,
                rin{batch, hi, wi}, rout{batch, ho, wo},
                out{batch, g, ho, wo} /*output*/;

        float bandwidth = static_cast<float>(
                                  inp.total_nr_elems() + kern.total_nr_elems() +
                                  out.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;
        float rr_bandwidth = static_cast<float>(
                                     inp.total_nr_elems() + kern.total_nr_elems() +
                                     rin.total_nr_elems() + rout.total_nr_elems() +
                                     out.total_nr_elems()) /
                             (1024 * 1024 * 1024) * 1e3;

        auto time_in_ms = bencher.execs({kern, out, inp}) / nr_times;
        auto ops = 2.0 * batch * g * ho * wo * fh * fw / (time_in_ms * 1e-3) * 1e-12;
        auto rr_time_in_ms = rr_bencher.execs({kern, out, rin, rout, inp}) / nr_times;
        auto rr_ops =
                2.0 * batch * g * ho * wo * fh * fw / (rr_time_in_ms * 1e-3) * 1e-12;
        printf("[DGRAD] RegionRestrictedDepthwiseLargeFilter vs DepthwiseLargeFilter: "
               "grad=%s, "
               "kern=%s, diff=%s\n"
               "time: %.2f ms, time(rr): %.2f ms, perf: %.2f Tops, perf(rr): %.2f Tops\n"
               "bandwidth: %.2f GB/s, bandwidth(rr): %.2f GB/s, speedup: %.2f.\n",
               inp.to_string().c_str(), kern.to_string().c_str(),
               out.to_string().c_str(), time_in_ms, rr_time_in_ms, ops, rr_ops,
               bandwidth * 4 / time_in_ms, rr_bandwidth * 4 / rr_time_in_ms,
               time_in_ms / rr_time_in_ms);
    };
    run_bench(64, 384, 32, 32, 3, 3, 1, 1, 10);
    run_bench(64, 384, 32, 32, 5, 5, 1, 1, 10);
    run_bench(64, 384, 32, 32, 7, 7, 1, 1, 10);
    run_bench(64, 384, 32, 32, 9, 9, 1, 1, 10);
    run_bench(64, 384, 32, 32, 11, 11, 1, 1, 10);
    run_bench(64, 384, 32, 32, 13, 13, 1, 1, 10);
    run_bench(64, 384, 32, 32, 15, 15, 1, 1, 10);
    run_bench(64, 384, 32, 32, 17, 17, 1, 1, 10);
    run_bench(64, 384, 32, 32, 19, 19, 1, 1, 10);
    run_bench(64, 384, 32, 32, 21, 21, 1, 1, 10);
    run_bench(64, 384, 32, 32, 23, 23, 1, 1, 10);
    run_bench(64, 384, 32, 32, 25, 25, 1, 1, 10);
    run_bench(64, 384, 32, 32, 27, 27, 1, 1, 10);
    run_bench(64, 384, 32, 32, 29, 29, 1, 1, 10);
    run_bench(64, 384, 32, 32, 31, 31, 1, 1, 10);
}

TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_FORWARD_LARGE_FILTER_UINT8) {
    require_compute_capability(7, 5);
    Benchmarker<ConvBiasForward> bencher(handle_cuda());
    bencher.set_display(false);
    bencher.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
            ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                    "DEPTHWISE_LARGE_FILTER", {})
                    .c_str()));
    Benchmarker<RegionRestrictedConvolutionForward> rr_bencher(handle_cuda());
    rr_bencher.set_display(false);

    ConvBias::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    using NonlineMode = ConvBias::Param::NonlineMode;
    param.nonlineMode = NonlineMode::IDENTITY;
    param.sparse = ConvBias::Param::Sparse::GROUP;

    RegionRestrictedConvolutionForward::Param rr_param;
    rr_param.format = RegionRestrictedConvolutionForward::Param::Format::NCHW;
    rr_param.sparse = RegionRestrictedConvolutionForward::Param::Sparse::GROUP;
    UniformIntRNG r_rng{0, 2};

    auto run_bench = [&](size_t batch, size_t g, size_t hi, size_t wi, size_t fh,
                         size_t fw, size_t sh, size_t sw, size_t nr_times) {
        param.pad_h = fh / 2;
        param.pad_w = fw / 2;
        param.stride_h = sh;
        param.stride_w = sw;
        rr_param.pad_h = fh / 2;
        rr_param.pad_w = fw / 2;
        rr_param.stride_h = sh;
        rr_param.stride_w = sw;

        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_dtype(4, dtype::Float32());
        bencher.set_times(nr_times);

        rr_bencher.set_param(rr_param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Uint8())
                .set_dtype(3, dtype::Uint8());
        rr_bencher.set_rng(2, &r_rng).set_rng(3, &r_rng).set_rng(0, &r_rng);
        rr_bencher.set_times(nr_times);

        size_t ho = infer_conv_shape(hi, fh, sh, param.pad_h);
        size_t wo = infer_conv_shape(wi, fw, sw, param.pad_w);
        TensorShape inp{batch, g, hi, wi}, kern{g, 1, 1, fh, fw}, rin{batch, hi, wi},
                rout{batch, ho, wo}, out{batch, g, ho, wo};

        float bandwidth = static_cast<float>(
                                  inp.total_nr_elems() + kern.total_nr_elems() +
                                  out.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;
        float rr_bandwidth = static_cast<float>(
                                     inp.total_nr_elems() + kern.total_nr_elems() +
                                     rin.total_nr_elems() + rout.total_nr_elems() +
                                     out.total_nr_elems()) /
                             (1024 * 1024 * 1024) * 1e3;

        auto time_in_ms = bencher.execs({inp, kern, {}, {}, out}) / nr_times;
        auto ops = 2.0 * batch * g * ho * wo * fh * fw / (time_in_ms * 1e-3) * 1e-12;
        auto rr_time_in_ms = rr_bencher.execs({inp, kern, rin, rout, out}) / nr_times;
        auto rr_ops =
                2.0 * batch * g * ho * wo * fh * fw / (rr_time_in_ms * 1e-3) * 1e-12;
        printf("RegionRestrictedDepthwiseLargeFilter vs DepthwiseLargeFilter: inp=%s, "
               "kern=%s, out=%s\n"
               "time: %.2f ms, time(rr): %.2f ms, perf: %.2f Tops, perf(rr): %.2f Tops\n"
               "bandwidth: %.2f GB/s, bandwidth(rr): %.2f GB/s, speedup: %.2f.\n",
               inp.to_string().c_str(), kern.to_string().c_str(),
               out.to_string().c_str(), time_in_ms, rr_time_in_ms, ops, rr_ops,
               bandwidth * 4 / time_in_ms, rr_bandwidth * 4 / rr_time_in_ms,
               time_in_ms / rr_time_in_ms);
    };
    run_bench(64, 384, 32, 32, 3, 3, 1, 1, 10);
    run_bench(64, 384, 32, 32, 5, 5, 1, 1, 10);
    run_bench(64, 384, 32, 32, 7, 7, 1, 1, 10);
    run_bench(64, 384, 32, 32, 9, 9, 1, 1, 10);
    run_bench(64, 384, 32, 32, 11, 11, 1, 1, 10);
    run_bench(64, 384, 32, 32, 13, 13, 1, 1, 10);
    run_bench(64, 384, 32, 32, 15, 15, 1, 1, 10);
    run_bench(64, 384, 32, 32, 17, 17, 1, 1, 10);
    run_bench(64, 384, 32, 32, 19, 19, 1, 1, 10);
    run_bench(64, 384, 32, 32, 21, 21, 1, 1, 10);
    run_bench(64, 384, 32, 32, 23, 23, 1, 1, 10);
    run_bench(64, 384, 32, 32, 25, 25, 1, 1, 10);
    run_bench(64, 384, 32, 32, 27, 27, 1, 1, 10);
    run_bench(64, 384, 32, 32, 29, 29, 1, 1, 10);
    run_bench(64, 384, 32, 32, 31, 31, 1, 1, 10);
}

TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_BACKWARD_FILTER_FP32) {
    require_compute_capability(7, 5);
    Benchmarker<ConvolutionBackwardFilter> bencher(handle_cuda());
    bencher.set_display(false);
    bencher.set_before_exec_callback(AlgoChecker<ConvolutionBackwardFilter>(
            "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_128X128X8_32X64X8_2stage"));
    Benchmarker<RegionRestrictedConvolutionBackwardFilter> rr_bencher(handle_cuda());
    rr_bencher.set_display(false);

    ConvolutionBackwardFilter::Param param;
    param.format = ConvolutionBackwardFilter::Param::Format::NCHW;
    param.sparse = ConvolutionBackwardFilter::Param::Sparse::GROUP;

    RegionRestrictedConvolutionBackwardFilter::Param rr_param;
    rr_param.format = RegionRestrictedConvolutionBackwardFilter::Param::Format::NCHW;
    rr_param.sparse = RegionRestrictedConvolutionBackwardFilter::Param::Sparse::GROUP;
    UniformIntRNG r_rng{1, 3};

    auto run_bench = [&](size_t batch, size_t g, size_t hi, size_t wi, size_t fh,
                         size_t fw, size_t sh, size_t sw, size_t nr_times) {
        param.pad_h = fh / 2;
        param.pad_w = fw / 2;
        param.stride_h = sh;
        param.stride_w = sw;
        rr_param.pad_h = fh / 2;
        rr_param.pad_w = fw / 2;
        rr_param.stride_h = sh;
        rr_param.stride_w = sw;

        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_dtype(4, dtype::Float32());
        bencher.proxy()->target_execution_policy = {};
        bencher.set_times(nr_times);

        rr_bencher.set_param(rr_param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Int32())
                .set_dtype(3, dtype::Int32());
        rr_bencher.set_rng(2, &r_rng).set_rng(3, &r_rng);
        rr_bencher.set_times(nr_times);

        size_t ho = infer_conv_shape(hi, fh, sh, param.pad_h);
        size_t wo = infer_conv_shape(wi, fw, sw, param.pad_w);
        TensorShape src{batch, g, hi, wi}, diff{batch, g, ho, wo}, rin{batch, hi, wi},
                rout{batch, ho, wo}, grad{g, 1, 1, fh, fw};

        float bandwidth = static_cast<float>(
                                  src.total_nr_elems() + diff.total_nr_elems() +
                                  grad.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;
        float rr_bandwidth = static_cast<float>(
                                     src.total_nr_elems() + diff.total_nr_elems() +
                                     rin.total_nr_elems() + rout.total_nr_elems() +
                                     grad.total_nr_elems()) /
                             (1024 * 1024 * 1024) * 1e3;
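        // wgrad argument order: {src, diff, grad} for the dense benchmark; the
        // region-restricted variant inserts the rin/rout masks before grad.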
        auto time_in_ms = bencher.execs({src, diff, grad}) / nr_times;
        auto ops = 2.0 * batch * g * hi * wi * fh * fw / (time_in_ms * 1e-3) * 1e-12;
        auto rr_time_in_ms = rr_bencher.execs({src, diff, rin, rout, grad}) / nr_times;
        auto rr_ops =
                2.0 * batch * g * hi * wi * fh * fw / (rr_time_in_ms * 1e-3) * 1e-12;
        printf("[WGRAD] RegionRestrictedDepthwiseLargeFilter vs DepthwiseLargeFilter: "
               "src=%s, "
               "diff=%s, grad=%s\n"
               "time: %.2f ms, time(rr): %.2f ms, perf: %.2f Tops, perf(rr): %.2f Tops\n"
               "bandwidth: %.2f GB/s, bandwidth(rr): %.2f GB/s, speedup: %.2f.\n",
               src.to_string().c_str(), diff.to_string().c_str(),
               grad.to_string().c_str(), time_in_ms, rr_time_in_ms, ops, rr_ops,
               bandwidth * 4 / time_in_ms, rr_bandwidth * 4 / rr_time_in_ms,
               time_in_ms / rr_time_in_ms);
    };
    run_bench(64, 384, 32, 32, 3, 3, 1, 1, 1000);
    run_bench(64, 384, 32, 32, 5, 5, 1, 1, 1000);
    run_bench(64, 384, 32, 32, 7, 7, 1, 1, 1000);
    run_bench(64, 384, 32, 32, 9, 9, 1, 1, 1000);
    run_bench(64, 384, 32, 32, 11, 11, 1, 1, 1000);
    run_bench(64, 384, 32, 32, 13, 13, 1, 1, 1000);
    run_bench(64, 384, 32, 32, 15, 15, 1, 1, 1000);
    run_bench(64, 384, 32, 32, 17, 17, 1, 1, 1000);
    run_bench(64, 384, 32, 32, 19, 19, 1, 1, 1000);
    run_bench(64, 384, 32, 32, 21, 21, 1, 1, 1000);
    run_bench(64, 384, 32, 32, 23, 23, 1, 1, 1000);
    run_bench(64, 384, 32, 32, 25, 25, 1, 1, 1000);
    run_bench(64, 384, 32, 32, 27, 27, 1, 1, 1000);
    run_bench(64, 384, 32, 32, 29, 29, 1, 1, 1000);
    run_bench(64, 384, 32, 32, 31, 31, 1, 1, 1000);
}

TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_BACKWARD_FILTER_FP32_RINT8) {
    require_compute_capability(7, 5);
    Benchmarker<ConvolutionBackwardFilter> bencher(handle_cuda());
    bencher.set_display(false);
    bencher.set_before_exec_callback(AlgoChecker<ConvolutionBackwardFilter>(
            "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_128X128X8_32X64X8_2stage"));
    Benchmarker<RegionRestrictedConvolutionBackwardFilter> rr_bencher(handle_cuda());
    rr_bencher.set_display(false);

    ConvolutionBackwardFilter::Param param;
    param.format = ConvolutionBackwardFilter::Param::Format::NCHW;
    param.sparse = ConvolutionBackwardFilter::Param::Sparse::GROUP;

    RegionRestrictedConvolutionBackwardFilter::Param rr_param;
    rr_param.format = RegionRestrictedConvolutionBackwardFilter::Param::Format::NCHW;
    rr_param.sparse = RegionRestrictedConvolutionBackwardFilter::Param::Sparse::GROUP;
    UniformIntRNG r_rng{1, 3};

    auto run_bench = [&](size_t batch, size_t g, size_t hi, size_t wi, size_t fh,
                         size_t fw, size_t sh, size_t sw, size_t nr_times) {
        param.pad_h = fh / 2;
        param.pad_w = fw / 2;
        param.stride_h = sh;
        param.stride_w = sw;
        rr_param.pad_h = fh / 2;
        rr_param.pad_w = fw / 2;
        rr_param.stride_h = sh;
        rr_param.stride_w = sw;

        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_dtype(4, dtype::Float32());
        bencher.proxy()->target_execution_policy = {};
        bencher.set_times(nr_times);

        rr_bencher.set_param(rr_param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Uint8())
                .set_dtype(3, dtype::Uint8());
        rr_bencher.set_rng(2, &r_rng).set_rng(3, &r_rng);
        rr_bencher.set_times(nr_times);

        size_t ho = infer_conv_shape(hi, fh, sh, param.pad_h);
        size_t wo = infer_conv_shape(wi, fw, sw, param.pad_w);
        TensorShape src{batch, g, hi, wi}, diff{batch, g, ho, wo}, rin{batch, hi, wi},
                rout{batch, ho, wo}, grad{g, 1, 1, fh, fw};

        float bandwidth = static_cast<float>(
                                  src.total_nr_elems() + diff.total_nr_elems() +
                                  grad.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;
        float rr_bandwidth = static_cast<float>(
                                     src.total_nr_elems() + diff.total_nr_elems() +
                                     rin.total_nr_elems() + rout.total_nr_elems() +
                                     grad.total_nr_elems()) /
                             (1024 * 1024 * 1024) * 1e3;

        auto time_in_ms = bencher.execs({src, diff, grad}) / nr_times;
        auto ops = 2.0 * batch * g * hi * wi * fh * fw / (time_in_ms * 1e-3) * 1e-12;
        auto rr_time_in_ms = rr_bencher.execs({src, diff, rin, rout, grad}) / nr_times;
        auto rr_ops =
                2.0 * batch * g * hi * wi * fh * fw / (rr_time_in_ms * 1e-3) * 1e-12;
        printf("[WGRAD] RegionRestrictedDepthwiseLargeFilter vs DepthwiseLargeFilter: "
               "src=%s, "
               "diff=%s, grad=%s\n"
               "time: %.2f ms, time(rr): %.2f ms, perf: %.2f Tops, perf(rr): %.2f Tops\n"
               "bandwidth: %.2f GB/s, bandwidth(rr): %.2f GB/s, speedup: %.2f.\n",
               src.to_string().c_str(), diff.to_string().c_str(),
               grad.to_string().c_str(), time_in_ms, rr_time_in_ms, ops, rr_ops,
               bandwidth * 4 / time_in_ms, rr_bandwidth * 4 / rr_time_in_ms,
               time_in_ms / rr_time_in_ms);
    };
    run_bench(64, 384, 32, 32, 3, 3, 1, 1, 1000);
    run_bench(64, 384, 32, 32, 5, 5, 1, 1, 1000);
    run_bench(64, 384, 32, 32, 7, 7, 1, 1, 1000);
    run_bench(64, 384, 32, 32, 9, 9, 1, 1, 1000);
    run_bench(64, 384, 32, 32, 11, 11, 1, 1, 1000);
    run_bench(64, 384, 32, 32, 13, 13, 1, 1, 1000);
    run_bench(64, 384, 32, 32, 15, 15, 1, 1, 1000);
    run_bench(64, 384, 32, 32, 17, 17, 1, 1, 1000);
    run_bench(64, 384, 32, 32, 19, 19, 1, 1, 1000);
    run_bench(64, 384, 32, 32, 21, 21, 1, 1, 1000);
    run_bench(64, 384, 32, 32, 23, 23, 1, 1, 1000);
    run_bench(64, 384, 32, 32, 25, 25, 1, 1, 1000);
    run_bench(64, 384, 32, 32, 27, 27, 1, 1, 1000);
    run_bench(64, 384, 32, 32, 29, 29, 1, 1, 1000);
    run_bench(64, 384, 32, 32, 31, 31, 1, 1, 1000);
}
#endif

TEST_F(CUDA, REGION_RESTRICTED_CONV_BWD_DATA_FP32) {
    Checker<RegionRestrictedConvolutionBackwardData> checker(handle_cuda());
    for (auto dt : std::vector<DType>{dtype::Int32(), dtype::Uint8()}) {
        auto run = [&checker, &dt](
                           size_t n, size_t g, size_t ih, size_t fh, size_t padding,
                           size_t stride) {
            RegionRestrictedConvolutionBackwardData::Param cur_param;
            cur_param.mode = RegionRestrictedConvolutionBackwardData::Param::Mode::
                    CROSS_CORRELATION;
            cur_param.compute_mode = RegionRestrictedConvolutionBackwardData::Param::
                    ComputeMode::DEFAULT;
            cur_param.sparse =
                    RegionRestrictedConvolutionBackwardData::Param::Sparse::GROUP;
            checker.set_dtype(0, dtype::Float32())
                    .set_dtype(1, dtype::Float32())
                    .set_dtype(2, dt)
                    .set_dtype(3, dt);
            float scale = 64.f / sqrt(fh * fh);
            UniformFloatRNG rng(scale, 2 * scale);
            UniformIntRNG r_rng{1, 2};
            checker.set_rng(0, &rng).set_rng(1, &rng).set_rng(2, &r_rng).set_rng(
                    3, &r_rng);
            cur_param.pad_h = cur_param.pad_w = padding;
            cur_param.stride_h = cur_param.stride_w = stride;
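            // valid-mode output size; for stride == 1 (all cases below) this
            // matches infer_conv_shape(ih, fh, stride, padding).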
            size_t oh = (ih + 2 * padding - fh + 1) / stride;
            checker.set_param(cur_param).execs({
                    {g, 1, 1, fh, fh},  // filter
                    {n, g * 1, oh, oh}, // diff
                    {n, ih, ih},        // rin
                    {n, oh, oh},        // rout
                    {n, g * 1, ih, ih}  // grad
            });
        };
        if (dt == dtype::Int32()) {
            run(4, 8, 32, 5, 5 / 2, 1);
            run(1, 2, 2, 2, 0, 1);
            run(1, 2, 3, 3, 0, 1);
            run(1, 2, 4, 4, 0, 1);
            run(1, 2, 5, 5, 0, 1);
            run(1, 2, 6, 6, 0, 1);
            run(1, 2, 7, 7, 0, 1);
        }
        run(4, 8, 32, 7, 7 / 2, 1);
        run(4, 8, 32, 9, 9 / 2, 1);
        run(4, 8, 32, 11, 11 / 2, 1);
        run(4, 8, 32, 13, 13 / 2, 1);
        run(4, 8, 32, 15, 15 / 2, 1);
        run(4, 8, 32, 17, 17 / 2, 1);
        run(4, 8, 32, 19, 19 / 2, 1);
        run(4, 8, 32, 21, 21 / 2, 1);
        run(4, 8, 32, 23, 23 / 2, 1);
        run(4, 8, 32, 25, 25 / 2, 1);
        run(4, 8, 32, 27, 27 / 2, 1);
        run(4, 8, 32, 29, 29 / 2, 1);
        run(4, 8, 32, 31, 31 / 2, 1);
    }
}

TEST_F(CUDA, REGION_RESTRICTED_CONV_BWD_DATA_FP32_RIN_EQ_ROUT) {
    Checker<RegionRestrictedConvolutionBackwardData> checker(handle_cuda());
    for (auto dt : std::vector<DType>{dtype::Int32()}) {
        auto run = [&checker, &dt](
                           size_t n, size_t g, size_t ih, size_t fh, size_t padding,
                           size_t stride) {
            RegionRestrictedConvolutionBackwardData::Param cur_param;
            cur_param.mode = RegionRestrictedConvolutionBackwardData::Param::Mode::
                    CROSS_CORRELATION;
            cur_param.compute_mode = RegionRestrictedConvolutionBackwardData::Param::
                    ComputeMode::DEFAULT;
            cur_param.sparse =
                    RegionRestrictedConvolutionBackwardData::Param::Sparse::GROUP;
            checker.set_dtype(2, dt).set_dtype(3, dt);
            float scale = 64.f / sqrt(fh * fh);
            UniformFloatRNG rng(scale, 2 * scale);
            // A mask value of 0 may cause unexpected behaviour, so keep every
            // region value at 1 (rin == rout everywhere).
            UniformIntRNG r_rng{1, 1};
            checker.set_rng(0, &rng).set_rng(1, &rng).set_rng(2, &r_rng).set_rng(
                    3, &r_rng);
            cur_param.pad_h = cur_param.pad_w = padding;
            cur_param.stride_h = cur_param.stride_w = stride;
            size_t oh = (ih + 2 * padding - fh + 1) / stride;
            checker.set_param(cur_param).execs(
                    {/*filter*/ {g, 1, 1, fh, fh},
                     /*diff*/ {n, g * 1, oh, oh},
                     /*rin*/ {n, ih, ih},
                     /*rout*/ {n, oh, oh},
                     /*grad*/ {n, g * 1, ih, ih}});
        };
        if (dt == dtype::Int32()) {
            // NOTE: the UINT8 kernel requires the spatial sizes of src and dst
            // to be multiples of 4, so the small cases run only for Int32.
            run(4, 8, 32, 5, 5 / 2, 1);
            run(1, 2, 2, 2, 0, 1);
            run(1, 2, 3, 3, 0, 1);
            run(1, 2, 4, 4, 0, 1);
            run(1, 2, 5, 5, 0, 1);
            run(1, 2, 6, 6, 0, 1);
            run(1, 2, 7, 7, 0, 1);
        }
        run(4, 8, 32, 7, 7 / 2, 1);
        run(4, 8, 32, 9, 9 / 2, 1);
        run(4, 8, 32, 11, 11 / 2, 1);
        run(4, 8, 32, 13, 13 / 2, 1);
        run(4, 8, 32, 15, 15 / 2, 1);
        run(4, 8, 32, 17, 17 / 2, 1);
        run(4, 8, 32, 19, 19 / 2, 1);
        run(4, 8, 32, 21, 21 / 2, 1);
        run(4, 8, 32, 23, 23 / 2, 1);
        run(4, 8, 32, 25, 25 / 2, 1);
        run(4, 8, 32, 27, 27 / 2, 1);
        run(4, 8, 32, 29, 29 / 2, 1);
        run(4, 8, 32, 31, 31 / 2, 1);
    }
}

TEST_F(CUDA, REGION_RESTRICTED_CONV_BWD_FILTER_FP32) {
    require_compute_capability(6, 1);
    Checker<RegionRestrictedConvolutionBackwardFilter> checker(handle_cuda());
    for (auto dt : std::vector<DType>{dtype::Int32(), dtype::Uint8()}) {
        auto run = [&checker, &dt](
                           size_t n, size_t g, size_t ih, size_t fh, size_t padding,
                           size_t stride) {
            RegionRestrictedConvolutionBackwardFilter::Param cur_param;
            cur_param.mode = RegionRestrictedConvolutionBackwardFilter::Param::Mode::
                    CROSS_CORRELATION;
            cur_param.compute_mode = RegionRestrictedConvolutionBackwardFilter::Param::
                    ComputeMode::DEFAULT;
            cur_param.sparse =
                    RegionRestrictedConvolutionBackwardFilter::Param::Sparse::GROUP;
            checker.set_dtype(0, dtype::Float32())
                    .set_dtype(1, dtype::Float32())
                    .set_dtype(2, dt)
                    .set_dtype(3, dt);
            float scale = 64.f / sqrt(fh * fh);
            UniformFloatRNG rng(scale, 2 * scale);
            UniformIntRNG r_rng{1, 2};
            checker.set_rng(0, &rng).set_rng(1, &rng).set_rng(2, &r_rng).set_rng(
                    3, &r_rng);
            cur_param.pad_h = cur_param.pad_w = padding;
            cur_param.stride_h = cur_param.stride_w = stride;
            size_t oh = (ih + 2 * padding - fh + 1) / stride;
            checker.set_param(cur_param).execs({
                    {n, g * 1, ih, ih}, // src
                    {n, g * 1, oh, oh}, // diff
                    {n, ih, ih},        // rin
                    {n, oh, oh},        // rout
                    {g, 1, 1, fh, fh}   // grad
            });
        };
        if (dt == dtype::Int32()) {
            run(4, 8, 32, 5, 5 / 2, 1);
            run(1, 2, 2, 2, 0, 1);
            run(1, 2, 3, 3, 0, 1);
            run(1, 2, 4, 4, 0, 1);
            run(1, 2, 5, 5, 0, 1);
            run(1, 2, 6, 6, 0, 1);
            run(1, 2, 7, 7, 0, 1);
        }
        run(4, 8, 32, 7, 7 / 2, 1);
        run(4, 8, 32, 9, 9 / 2, 1);
        run(4, 8, 32, 11, 11 / 2, 1);
        run(4, 8, 32, 13, 13 / 2, 1);
        run(4, 8, 32, 15, 15 / 2, 1);
        run(4, 8, 32, 17, 17 / 2, 1);
        run(4, 8, 32, 19, 19 / 2, 1);
        run(4, 8, 32, 21, 21 / 2, 1);
        run(4, 8, 32, 23, 23 / 2, 1);
        run(4, 8, 32, 25, 25 / 2, 1);
        run(4, 8, 32, 27, 27 / 2, 1);
        run(4, 8, 32, 29, 29 / 2, 1);
        run(4, 8, 32, 31, 31 / 2, 1);
    }
}

TEST_F(CUDA, REGION_RESTRICTED_CONV_BWD_FILTER_FP32_RIN_EQ_ROUT) {
    require_compute_capability(6, 1);
    Checker<RegionRestrictedConvolutionBackwardFilter> checker(handle_cuda());
    for (auto dt : std::vector<DType>{dtype::Int32(), dtype::Uint8()}) {
        auto run = [&checker, &dt](
                           size_t n, size_t g, size_t ih, size_t fh, size_t padding,
                           size_t stride) {
            RegionRestrictedConvolutionBackwardFilter::Param cur_param;
            cur_param.mode = RegionRestrictedConvolutionBackwardFilter::Param::Mode::
                    CROSS_CORRELATION;
            cur_param.compute_mode = RegionRestrictedConvolutionBackwardFilter::Param::
                    ComputeMode::DEFAULT;
            cur_param.sparse =
                    RegionRestrictedConvolutionBackwardFilter::Param::Sparse::GROUP;
            checker.set_dtype(0, dtype::Float32())
                    .set_dtype(1, dtype::Float32())
                    .set_dtype(2, dt)
                    .set_dtype(3, dt);
            float scale = 64.f / sqrt(fh * fh);
            UniformFloatRNG rng(scale, 2 * scale);
            UniformIntRNG r_rng{1, 1};
            checker.set_rng(0, &rng).set_rng(1, &rng).set_rng(2, &r_rng).set_rng(
                    3, &r_rng);
            cur_param.pad_h = cur_param.pad_w = padding;
            cur_param.stride_h = cur_param.stride_w = stride;
            size_t oh = (ih + 2 * padding - fh + 1) / stride;
            checker.set_param(cur_param).execs({
                    {n, g * 1, ih, ih}, // src
                    {n, g * 1, oh, oh}, // diff
                    {n, ih, ih},        // rin
                    {n, oh, oh},        // rout
                    {g, 1, 1, fh, fh}   // grad
            });
        };
        if (dt == dtype::Int32()) {
            run(4, 8, 32, 5, 5 / 2, 1);
            run(1, 2, 2, 2, 0, 1);
            run(1, 2, 3, 3, 0, 1);
            run(1, 2, 4, 4, 0, 1);
            run(1, 2, 5, 5, 0, 1);
            run(1, 2, 6, 6, 0, 1);
            run(1, 2, 7, 7, 0, 1);
        }
        run(4, 8, 32, 7, 7 / 2, 1);
        run(4, 8, 32, 9, 9 / 2, 1);
        run(4, 8, 32, 11, 11 / 2, 1);
        run(4, 8, 32, 13, 13 / 2, 1);
        run(4, 8, 32, 15, 15 / 2, 1);
        run(4, 8, 32, 17, 17 / 2, 1);
        run(4, 8, 32, 19, 19 / 2, 1);
        run(4, 8, 32, 21, 21 / 2, 1);
        run(4, 8, 32, 23, 23 / 2, 1);
        run(4, 8, 32, 25, 25 / 2, 1);
        run(4, 8, 32, 27, 27 / 2, 1);
        run(4, 8, 32, 29, 29 / 2, 1);
        run(4, 8, 32, 31, 31 / 2, 1);
    }
}

} // namespace test
} // namespace megdnn

// vim: syntax=cpp.doxygen