You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

region_restricted_convolution.cpp 40 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917
  1. #include "megdnn/dtype.h"
  2. #include "megdnn/opr_param_defs.h"
  3. #include "megdnn/oprs.h"
  4. #include "test/common/checker.h"
  5. #include "test/common/conv_bias.h"
  6. #include "test/common/rng.h"
  7. #include "test/common/tensor.h"
  8. #include "test/common/workspace_wrapper.h"
  9. #include "test/cuda/benchmark.h"
  10. #include "test/cuda/fixture.h"
  11. #include "test/cuda/utils.h"
  12. #include <cudnn.h>
  13. #define V1(x) #x
  14. #define V(x) V1(x)
  15. #define CUDNN_VERSION_STRING \
  16. "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V(CUDNN_PATCHLEVEL)
  17. namespace megdnn {
  18. namespace test {
  19. TEST_F(CUDA, REGION_RESTRICTED_CONV_FORWARD_LARGE_FILTER) {
  20. Checker<RegionRestrictedConvolutionForward> checker(handle_cuda());
  21. auto opr = handle_cuda()->create_operator<ConvolutionForward>();
  22. for (auto dt : std::vector<DType>{dtype::Int32(), dtype::Uint8()}) {
  23. auto run = [&checker, &dt, &opr](
  24. size_t n, size_t g, size_t h, size_t fh, size_t padding,
  25. size_t stride) {
  26. RegionRestrictedConvolution::Param cur_param;
  27. cur_param.mode =
  28. RegionRestrictedConvolution::Param::Mode::CROSS_CORRELATION;
  29. cur_param.sparse = RegionRestrictedConvolution::Param::Sparse::GROUP;
  30. checker.set_dtype(2, dt).set_dtype(3, dt);
  31. float scale = 64.f / sqrt(fh * fh);
  32. UniformFloatRNG rng(scale, 2 * scale);
  33. UniformIntRNG r_rng{0, 2};
  34. checker.set_rng(0, &rng).set_rng(1, &rng).set_rng(2, &r_rng).set_rng(
  35. 3, &r_rng);
  36. if (dt.enumv() == DTypeEnum::Float16) {
  37. checker.set_epsilon(1e-1);
  38. }
  39. cur_param.pad_h = cur_param.pad_w = padding;
  40. cur_param.stride_h = cur_param.stride_w = stride;
  41. size_t ho = infer_conv_shape(h, fh, stride, padding);
  42. checker.set_param(cur_param).execs(
  43. {{n, g, h, h}, {g, 1, 1, fh, fh}, {n, h, h}, {n, ho, ho}, {}});
  44. };
  45. run(4, 8, 32, 3, 3 / 2, 1);
  46. run(4, 8, 32, 5, 5 / 2, 1);
  47. run(4, 8, 32, 7, 7 / 2, 1);
  48. run(1, 2, 32, 9, 9 / 2, 1);
  49. run(4, 1, 32, 9, 9 / 2, 1);
  50. run(4, 8, 32, 11, 11 / 2, 1);
  51. run(4, 8, 32, 13, 13 / 2, 1);
  52. run(4, 8, 32, 15, 15 / 2, 1);
  53. run(4, 8, 32, 17, 17 / 2, 1);
  54. run(4, 8, 32, 19, 19 / 2, 1);
  55. run(4, 8, 32, 21, 21 / 2, 1);
  56. run(4, 8, 32, 23, 23 / 2, 1);
  57. run(4, 8, 32, 25, 25 / 2, 1);
  58. run(4, 8, 32, 27, 27 / 2, 1);
  59. run(4, 8, 32, 29, 29 / 2, 1);
  60. run(4, 8, 32, 31, 31 / 2, 1);
  61. }
  62. }
  63. #if MEGDNN_WITH_BENCHMARK
  64. TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_FORWARD_LARGE_FILTER_FP32) {
  65. require_compute_capability(7, 5);
  66. Benchmarker<ConvBiasForward> bencher(handle_cuda());
  67. bencher.set_display(false);
  68. bencher.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  69. ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
  70. "DEPTHWISE_LARGE_FILTER", {})
  71. .c_str()));
  72. Benchmarker<RegionRestrictedConvolutionForward> rr_bencher(handle_cuda());
  73. rr_bencher.set_display(false);
  74. ConvBias::Param param;
  75. param.format = ConvBias::Param::Format::NCHW;
  76. using NonlineMode = ConvBias::Param::NonlineMode;
  77. param.nonlineMode = NonlineMode::IDENTITY;
  78. param.sparse = ConvBias::Param::Sparse::GROUP;
  79. RegionRestrictedConvolutionForward::Param rr_param;
  80. rr_param.format = RegionRestrictedConvolutionForward::Param::Format::NCHW;
  81. rr_param.sparse = RegionRestrictedConvolutionForward::Param::Sparse::GROUP;
  82. UniformIntRNG r_rng{0, 2};
  83. auto run_bench = [&](size_t batch, size_t g, size_t hi, size_t wi, size_t fh,
  84. size_t fw, size_t sh, size_t sw, size_t nr_times) {
  85. param.pad_h = fh / 2;
  86. param.pad_w = fw / 2;
  87. param.stride_h = sh;
  88. param.stride_w = sw;
  89. rr_param.pad_h = fh / 2;
  90. rr_param.pad_w = fw / 2;
  91. rr_param.stride_h = sh;
  92. rr_param.stride_w = sw;
  93. bencher.set_param(param)
  94. .set_dtype(0, dtype::Float32())
  95. .set_dtype(1, dtype::Float32())
  96. .set_dtype(2, dtype::Float32())
  97. .set_dtype(4, dtype::Float32());
  98. bencher.set_times(nr_times);
  99. rr_bencher.set_param(rr_param)
  100. .set_dtype(0, dtype::Float32())
  101. .set_dtype(1, dtype::Float32())
  102. .set_dtype(2, dtype::Int32())
  103. .set_dtype(3, dtype::Int32());
  104. rr_bencher.set_rng(2, &r_rng).set_rng(3, &r_rng);
  105. rr_bencher.set_times(nr_times);
  106. size_t ho = infer_conv_shape(hi, fh, sh, param.pad_h);
  107. size_t wo = infer_conv_shape(wi, fw, sw, param.pad_w);
  108. TensorShape inp{batch, g, hi, wi}, kern{g, 1, 1, fh, fw}, rin{batch, hi, wi},
  109. rout{batch, ho, wo}, out{batch, g, ho, wo};
  110. float bandwith = static_cast<float>(
  111. inp.total_nr_elems() + kern.total_nr_elems() +
  112. out.total_nr_elems()) /
  113. (1024 * 1024 * 1024) * 1e3;
  114. float rr_bandwith = static_cast<float>(
  115. inp.total_nr_elems() + kern.total_nr_elems() +
  116. rin.total_nr_elems() + rout.total_nr_elems() +
  117. out.total_nr_elems()) /
  118. (1024 * 1024 * 1024) * 1e3;
  119. auto time_in_ms = bencher.execs({inp, kern, {}, {}, out}) / nr_times;
  120. auto ops = 2.0 * batch * g * ho * wo * fh * fw / (time_in_ms * 1e-3) * 1e-12;
  121. auto rr_time_in_ms = rr_bencher.execs({inp, kern, rin, rout, out}) / nr_times;
  122. auto rr_ops =
  123. 2.0 * batch * g * ho * wo * fh * fw / (rr_time_in_ms * 1e-3) * 1e-12;
  124. printf("RegionRestrictedDepthwiseLargeFilter vs DepthwiseLargeFilter: inp=%s, "
  125. "kern=%s, out=%s\n"
  126. "time: %.2f ms, time(rr): %.2f ms, perf: %.2fTops, perf(rr): %.2f Tops\n"
  127. "bandwidth: %.2fGB/s, bandwidth(rr): %.2fGB/s, speedup: %.2f.\n",
  128. inp.to_string().c_str(), kern.to_string().c_str(),
  129. out.to_string().c_str(), time_in_ms, rr_time_in_ms, ops, rr_ops,
  130. bandwith * 4 / time_in_ms, rr_bandwith * 4 / rr_time_in_ms,
  131. time_in_ms / rr_time_in_ms);
  132. };
  133. run_bench(64, 384, 32, 32, 3, 3, 1, 1, 10);
  134. run_bench(64, 384, 32, 32, 5, 5, 1, 1, 10);
  135. run_bench(64, 384, 32, 32, 7, 7, 1, 1, 10);
  136. run_bench(64, 384, 32, 32, 9, 9, 1, 1, 10);
  137. run_bench(64, 384, 32, 32, 11, 11, 1, 1, 10);
  138. run_bench(64, 384, 32, 32, 13, 13, 1, 1, 10);
  139. run_bench(64, 384, 32, 32, 15, 15, 1, 1, 10);
  140. run_bench(64, 384, 32, 32, 17, 17, 1, 1, 10);
  141. run_bench(64, 384, 32, 32, 19, 19, 1, 1, 10);
  142. run_bench(64, 384, 32, 32, 21, 21, 1, 1, 10);
  143. run_bench(64, 384, 32, 32, 23, 23, 1, 1, 10);
  144. run_bench(64, 384, 32, 32, 25, 25, 1, 1, 10);
  145. run_bench(64, 384, 32, 32, 27, 27, 1, 1, 10);
  146. run_bench(64, 384, 32, 32, 29, 29, 1, 1, 10);
  147. run_bench(64, 384, 32, 32, 31, 31, 1, 1, 10);
  148. }
  149. TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_BACKWARD_LARGE_FILTER_FP32) {
  150. require_compute_capability(7, 5);
  151. Benchmarker<ConvolutionBackwardData> bencher(handle_cuda());
  152. bencher.set_display(false);
  153. bencher.set_before_exec_callback(
  154. AlgoChecker<ConvolutionBackwardData>("DEPTHWISE_LARGE_FILTER"));
  155. Benchmarker<RegionRestrictedConvolutionBackwardData> rr_bencher(handle_cuda());
  156. rr_bencher.set_display(false);
  157. ConvolutionBackwardData::Param param;
  158. param.format = ConvolutionBackwardData::Param::Format::NCHW;
  159. param.sparse = ConvolutionBackwardData::Param::Sparse::GROUP;
  160. RegionRestrictedConvolutionBackwardData::Param rr_param;
  161. rr_param.format = RegionRestrictedConvolutionBackwardData::Param::Format::NCHW;
  162. rr_param.sparse = RegionRestrictedConvolutionBackwardData::Param::Sparse::GROUP;
  163. UniformIntRNG r_rng{1, 3};
  164. auto run_bench = [&](size_t batch, size_t g, size_t hi, size_t wi, size_t fh,
  165. size_t fw, size_t sh, size_t sw, size_t nr_times) {
  166. param.pad_h = fh / 2;
  167. param.pad_w = fw / 2;
  168. param.stride_h = sh;
  169. param.stride_w = sw;
  170. rr_param.pad_h = fh / 2;
  171. rr_param.pad_w = fw / 2;
  172. rr_param.stride_h = sh;
  173. rr_param.stride_w = sw;
  174. bencher.set_param(param)
  175. .set_dtype(0, dtype::Float32())
  176. .set_dtype(1, dtype::Float32())
  177. .set_dtype(2, dtype::Float32())
  178. .set_dtype(4, dtype::Float32());
  179. bencher.set_times(nr_times);
  180. rr_bencher.set_param(rr_param)
  181. .set_dtype(0, dtype::Float32())
  182. .set_dtype(1, dtype::Float32())
  183. .set_dtype(2, dtype::Int32())
  184. .set_dtype(3, dtype::Int32());
  185. rr_bencher.set_rng(2, &r_rng).set_rng(3, &r_rng);
  186. rr_bencher.set_times(nr_times);
  187. size_t ho = infer_conv_shape(hi, fh, sh, param.pad_h);
  188. size_t wo = infer_conv_shape(wi, fw, sw, param.pad_w);
  189. TensorShape inp{batch, g, hi, wi} /*src*/, kern{g, 1, 1, fh, fw} /*filter*/,
  190. rin{batch, hi, wi}, rout{batch, ho, wo},
  191. out{batch, g, ho, wo} /*output*/;
  192. float bandwith = static_cast<float>(
  193. inp.total_nr_elems() + kern.total_nr_elems() +
  194. out.total_nr_elems()) /
  195. (1024 * 1024 * 1024) * 1e3;
  196. float rr_bandwith = static_cast<float>(
  197. inp.total_nr_elems() + kern.total_nr_elems() +
  198. rin.total_nr_elems() + rout.total_nr_elems() +
  199. out.total_nr_elems()) /
  200. (1024 * 1024 * 1024) * 1e3;
  201. auto time_in_ms = bencher.execs({kern, out, inp}) / nr_times;
  202. auto ops = 2.0 * batch * g * ho * wo * fh * fw / (time_in_ms * 1e-3) * 1e-12;
  203. auto rr_time_in_ms = rr_bencher.execs({kern, out, rin, rout, inp}) / nr_times;
  204. auto rr_ops =
  205. 2.0 * batch * g * ho * wo * fh * fw / (rr_time_in_ms * 1e-3) * 1e-12;
  206. printf("[DGRAD]RegionRestrictedDepthwiseLargeFilter vs DepthwiseLargeFilter: "
  207. "grad=%s, "
  208. "kern=%s, diff=%s\n"
  209. "time: %.2f ms, time(rr): %.2f ms, perf: %.2fTops, perf(rr): %.2f Tops\n"
  210. "bandwidth: %.2fGB/s, bandwidth(rr): %.2fGB/s, speedup: %.2f.\n",
  211. inp.to_string().c_str(), kern.to_string().c_str(),
  212. out.to_string().c_str(), time_in_ms, rr_time_in_ms, ops, rr_ops,
  213. bandwith * 4 / time_in_ms, rr_bandwith * 4 / rr_time_in_ms,
  214. time_in_ms / rr_time_in_ms);
  215. };
  216. run_bench(64, 384, 32, 32, 3, 3, 1, 1, 10);
  217. run_bench(64, 384, 32, 32, 5, 5, 1, 1, 10);
  218. run_bench(64, 384, 32, 32, 7, 7, 1, 1, 10);
  219. run_bench(64, 384, 32, 32, 9, 9, 1, 1, 10);
  220. run_bench(64, 384, 32, 32, 11, 11, 1, 1, 10);
  221. run_bench(64, 384, 32, 32, 13, 13, 1, 1, 10);
  222. run_bench(64, 384, 32, 32, 15, 15, 1, 1, 10);
  223. run_bench(64, 384, 32, 32, 17, 17, 1, 1, 10);
  224. run_bench(64, 384, 32, 32, 19, 19, 1, 1, 10);
  225. run_bench(64, 384, 32, 32, 21, 21, 1, 1, 10);
  226. run_bench(64, 384, 32, 32, 23, 23, 1, 1, 10);
  227. run_bench(64, 384, 32, 32, 25, 25, 1, 1, 10);
  228. run_bench(64, 384, 32, 32, 27, 27, 1, 1, 10);
  229. run_bench(64, 384, 32, 32, 29, 29, 1, 1, 10);
  230. run_bench(64, 384, 32, 32, 31, 31, 1, 1, 10);
  231. }
  232. TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_BACKWARD_LARGE_FILTER_FP32_UINT8) {
  233. require_compute_capability(7, 5);
  234. Benchmarker<ConvolutionBackwardData> bencher(handle_cuda());
  235. bencher.set_display(false);
  236. bencher.set_before_exec_callback(
  237. AlgoChecker<ConvolutionBackwardData>("DEPTHWISE_LARGE_FILTER"));
  238. Benchmarker<RegionRestrictedConvolutionBackwardData> rr_bencher(handle_cuda());
  239. rr_bencher.set_display(false);
  240. ConvolutionBackwardData::Param param;
  241. param.format = ConvolutionBackwardData::Param::Format::NCHW;
  242. param.sparse = ConvolutionBackwardData::Param::Sparse::GROUP;
  243. RegionRestrictedConvolutionBackwardData::Param rr_param;
  244. rr_param.format = RegionRestrictedConvolutionBackwardData::Param::Format::NCHW;
  245. rr_param.sparse = RegionRestrictedConvolutionBackwardData::Param::Sparse::GROUP;
  246. UniformIntRNG r_rng{1, 3};
  247. auto run_bench = [&](size_t batch, size_t g, size_t hi, size_t wi, size_t fh,
  248. size_t fw, size_t sh, size_t sw, size_t nr_times) {
  249. param.pad_h = fh / 2;
  250. param.pad_w = fw / 2;
  251. param.stride_h = sh;
  252. param.stride_w = sw;
  253. rr_param.pad_h = fh / 2;
  254. rr_param.pad_w = fw / 2;
  255. rr_param.stride_h = sh;
  256. rr_param.stride_w = sw;
  257. bencher.set_param(param)
  258. .set_dtype(0, dtype::Float32())
  259. .set_dtype(1, dtype::Float32())
  260. .set_dtype(2, dtype::Float32())
  261. .set_dtype(4, dtype::Float32());
  262. bencher.set_times(nr_times);
  263. rr_bencher.set_param(rr_param)
  264. .set_dtype(0, dtype::Float32())
  265. .set_dtype(1, dtype::Float32())
  266. .set_dtype(2, dtype::Uint8())
  267. .set_dtype(3, dtype::Uint8());
  268. rr_bencher.set_rng(2, &r_rng).set_rng(3, &r_rng);
  269. rr_bencher.set_times(nr_times);
  270. size_t ho = infer_conv_shape(hi, fh, sh, param.pad_h);
  271. size_t wo = infer_conv_shape(wi, fw, sw, param.pad_w);
  272. TensorShape inp{batch, g, hi, wi} /*src*/, kern{g, 1, 1, fh, fw} /*filter*/,
  273. rin{batch, hi, wi}, rout{batch, ho, wo},
  274. out{batch, g, ho, wo} /*output*/;
  275. float bandwith = static_cast<float>(
  276. inp.total_nr_elems() + kern.total_nr_elems() +
  277. out.total_nr_elems()) /
  278. (1024 * 1024 * 1024) * 1e3;
  279. float rr_bandwith = static_cast<float>(
  280. inp.total_nr_elems() + kern.total_nr_elems() +
  281. rin.total_nr_elems() + rout.total_nr_elems() +
  282. out.total_nr_elems()) /
  283. (1024 * 1024 * 1024) * 1e3;
  284. auto time_in_ms = bencher.execs({kern, out, inp}) / nr_times;
  285. auto ops = 2.0 * batch * g * ho * wo * fh * fw / (time_in_ms * 1e-3) * 1e-12;
  286. auto rr_time_in_ms = rr_bencher.execs({kern, out, rin, rout, inp}) / nr_times;
  287. auto rr_ops =
  288. 2.0 * batch * g * ho * wo * fh * fw / (rr_time_in_ms * 1e-3) * 1e-12;
  289. printf("[DGRAD]RegionRestrictedDepthwiseLargeFilter vs DepthwiseLargeFilter: "
  290. "grad=%s, "
  291. "kern=%s, diff=%s\n"
  292. "time: %.2f ms, time(rr): %.2f ms, perf: %.2fTops, perf(rr): %.2f Tops\n"
  293. "bandwidth: %.2fGB/s, bandwidth(rr): %.2fGB/s, speedup: %.2f.\n",
  294. inp.to_string().c_str(), kern.to_string().c_str(),
  295. out.to_string().c_str(), time_in_ms, rr_time_in_ms, ops, rr_ops,
  296. bandwith * 4 / time_in_ms, rr_bandwith * 4 / rr_time_in_ms,
  297. time_in_ms / rr_time_in_ms);
  298. };
  299. run_bench(64, 384, 32, 32, 3, 3, 1, 1, 10);
  300. run_bench(64, 384, 32, 32, 5, 5, 1, 1, 10);
  301. run_bench(64, 384, 32, 32, 7, 7, 1, 1, 10);
  302. run_bench(64, 384, 32, 32, 9, 9, 1, 1, 10);
  303. run_bench(64, 384, 32, 32, 11, 11, 1, 1, 10);
  304. run_bench(64, 384, 32, 32, 13, 13, 1, 1, 10);
  305. run_bench(64, 384, 32, 32, 15, 15, 1, 1, 10);
  306. run_bench(64, 384, 32, 32, 17, 17, 1, 1, 10);
  307. run_bench(64, 384, 32, 32, 19, 19, 1, 1, 10);
  308. run_bench(64, 384, 32, 32, 21, 21, 1, 1, 10);
  309. run_bench(64, 384, 32, 32, 23, 23, 1, 1, 10);
  310. run_bench(64, 384, 32, 32, 25, 25, 1, 1, 10);
  311. run_bench(64, 384, 32, 32, 27, 27, 1, 1, 10);
  312. run_bench(64, 384, 32, 32, 29, 29, 1, 1, 10);
  313. run_bench(64, 384, 32, 32, 31, 31, 1, 1, 10);
  314. }
  315. TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_FORWARD_LARGE_FILTER_UINT8) {
  316. require_compute_capability(7, 5);
  317. Benchmarker<ConvBiasForward> bencher(handle_cuda());
  318. bencher.set_display(false);
  319. bencher.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  320. ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
  321. "DEPTHWISE_LARGE_FILTER", {})
  322. .c_str()));
  323. Benchmarker<RegionRestrictedConvolutionForward> rr_bencher(handle_cuda());
  324. rr_bencher.set_display(false);
  325. ConvBias::Param param;
  326. param.format = ConvBias::Param::Format::NCHW;
  327. using NonlineMode = ConvBias::Param::NonlineMode;
  328. param.nonlineMode = NonlineMode::IDENTITY;
  329. param.sparse = ConvBias::Param::Sparse::GROUP;
  330. RegionRestrictedConvolutionForward::Param rr_param;
  331. rr_param.format = RegionRestrictedConvolutionForward::Param::Format::NCHW;
  332. rr_param.sparse = RegionRestrictedConvolutionForward::Param::Sparse::GROUP;
  333. UniformIntRNG r_rng{0, 2};
  334. auto run_bench = [&](size_t batch, size_t g, size_t hi, size_t wi, size_t fh,
  335. size_t fw, size_t sh, size_t sw, size_t nr_times) {
  336. param.pad_h = fh / 2;
  337. param.pad_w = fw / 2;
  338. param.stride_h = sh;
  339. param.stride_w = sw;
  340. rr_param.pad_h = fh / 2;
  341. rr_param.pad_w = fw / 2;
  342. rr_param.stride_h = sh;
  343. rr_param.stride_w = sw;
  344. bencher.set_param(param)
  345. .set_dtype(0, dtype::Float32())
  346. .set_dtype(1, dtype::Float32())
  347. .set_dtype(2, dtype::Float32())
  348. .set_dtype(4, dtype::Float32());
  349. bencher.set_times(nr_times);
  350. rr_bencher.set_param(rr_param)
  351. .set_dtype(0, dtype::Float32())
  352. .set_dtype(1, dtype::Float32())
  353. .set_dtype(2, dtype::Uint8())
  354. .set_dtype(3, dtype::Uint8());
  355. rr_bencher.set_rng(2, &r_rng).set_rng(3, &r_rng).set_rng(0, &r_rng);
  356. rr_bencher.set_times(nr_times);
  357. size_t ho = infer_conv_shape(hi, fh, sh, param.pad_h);
  358. size_t wo = infer_conv_shape(wi, fw, sw, param.pad_w);
  359. TensorShape inp{batch, g, hi, wi}, kern{g, 1, 1, fh, fw}, rin{batch, hi, wi},
  360. rout{batch, ho, wo}, out{batch, g, ho, wo};
  361. float bandwith = static_cast<float>(
  362. inp.total_nr_elems() + kern.total_nr_elems() +
  363. out.total_nr_elems()) /
  364. (1024 * 1024 * 1024) * 1e3;
  365. float rr_bandwith = static_cast<float>(
  366. inp.total_nr_elems() + kern.total_nr_elems() +
  367. rin.total_nr_elems() + rout.total_nr_elems() +
  368. out.total_nr_elems()) /
  369. (1024 * 1024 * 1024) * 1e3;
  370. auto time_in_ms = bencher.execs({inp, kern, {}, {}, out}) / nr_times;
  371. auto ops = 2.0 * batch * g * ho * wo * fh * fw / (time_in_ms * 1e-3) * 1e-12;
  372. auto rr_time_in_ms = rr_bencher.execs({inp, kern, rin, rout, out}) / nr_times;
  373. auto rr_ops =
  374. 2.0 * batch * g * ho * wo * fh * fw / (rr_time_in_ms * 1e-3) * 1e-12;
  375. printf("RegionRestrictedDepthwiseLargeFilter vs DepthwiseLargeFilter: inp=%s, "
  376. "kern=%s, out=%s\n"
  377. "time: %.2f ms, time(rr): %.2f ms, perf: %.2fTops, perf(rr): %.2f Tops\n"
  378. "bandwidth: %.2fGB/s, bandwidth(rr): %.2fGB/s, speedup: %.2f.\n",
  379. inp.to_string().c_str(), kern.to_string().c_str(),
  380. out.to_string().c_str(), time_in_ms, rr_time_in_ms, ops, rr_ops,
  381. bandwith * 4 / time_in_ms, rr_bandwith * 4 / rr_time_in_ms,
  382. time_in_ms / rr_time_in_ms);
  383. };
  384. run_bench(64, 384, 32, 32, 3, 3, 1, 1, 10);
  385. run_bench(64, 384, 32, 32, 5, 5, 1, 1, 10);
  386. run_bench(64, 384, 32, 32, 7, 7, 1, 1, 10);
  387. run_bench(64, 384, 32, 32, 9, 9, 1, 1, 10);
  388. run_bench(64, 384, 32, 32, 11, 11, 1, 1, 10);
  389. run_bench(64, 384, 32, 32, 13, 13, 1, 1, 10);
  390. run_bench(64, 384, 32, 32, 15, 15, 1, 1, 10);
  391. run_bench(64, 384, 32, 32, 17, 17, 1, 1, 10);
  392. run_bench(64, 384, 32, 32, 19, 19, 1, 1, 10);
  393. run_bench(64, 384, 32, 32, 21, 21, 1, 1, 10);
  394. run_bench(64, 384, 32, 32, 23, 23, 1, 1, 10);
  395. run_bench(64, 384, 32, 32, 25, 25, 1, 1, 10);
  396. run_bench(64, 384, 32, 32, 27, 27, 1, 1, 10);
  397. run_bench(64, 384, 32, 32, 29, 29, 1, 1, 10);
  398. run_bench(64, 384, 32, 32, 31, 31, 1, 1, 10);
  399. }
  400. TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_BACKWARD_FILTER_FP32) {
  401. require_compute_capability(7, 5);
  402. Benchmarker<ConvolutionBackwardFilter> bencher(handle_cuda());
  403. bencher.set_display(false);
  404. bencher.set_before_exec_callback(AlgoChecker<ConvolutionBackwardFilter>(
  405. "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_128X128X8_32X64X8_2stage"));
  406. Benchmarker<RegionRestrictedConvolutionBackwardFilter> rr_bencher(handle_cuda());
  407. rr_bencher.set_display(false);
  408. ConvolutionBackwardFilter::Param param;
  409. param.format = ConvolutionBackwardFilter::Param::Format::NCHW;
  410. param.sparse = ConvolutionBackwardFilter::Param::Sparse::GROUP;
  411. RegionRestrictedConvolutionBackwardFilter::Param rr_param;
  412. rr_param.format = RegionRestrictedConvolutionBackwardFilter::Param::Format::NCHW;
  413. rr_param.sparse = RegionRestrictedConvolutionBackwardFilter::Param::Sparse::GROUP;
  414. UniformIntRNG r_rng{1, 3};
  415. auto run_bench = [&](size_t batch, size_t g, size_t hi, size_t wi, size_t fh,
  416. size_t fw, size_t sh, size_t sw, size_t nr_times) {
  417. param.pad_h = fh / 2;
  418. param.pad_w = fw / 2;
  419. param.stride_h = sh;
  420. param.stride_w = sw;
  421. rr_param.pad_h = fh / 2;
  422. rr_param.pad_w = fw / 2;
  423. rr_param.stride_h = sh;
  424. rr_param.stride_w = sw;
  425. bencher.set_param(param)
  426. .set_dtype(0, dtype::Float32())
  427. .set_dtype(1, dtype::Float32())
  428. .set_dtype(2, dtype::Float32())
  429. .set_dtype(4, dtype::Float32());
  430. bencher.proxy()->target_execution_policy = {};
  431. bencher.set_times(nr_times);
  432. rr_bencher.set_param(rr_param)
  433. .set_dtype(0, dtype::Float32())
  434. .set_dtype(1, dtype::Float32())
  435. .set_dtype(2, dtype::Int32())
  436. .set_dtype(3, dtype::Int32());
  437. rr_bencher.set_rng(2, &r_rng).set_rng(3, &r_rng);
  438. rr_bencher.set_times(nr_times);
  439. size_t ho = infer_conv_shape(hi, fh, sh, param.pad_h);
  440. size_t wo = infer_conv_shape(wi, fw, sw, param.pad_w);
  441. TensorShape src{batch, g, hi, wi}, diff{batch, g, ho, wo}, rin{batch, hi, wi},
  442. rout{batch, ho, wo}, grad{g, 1, 1, fh, fw};
  443. float bandwith = static_cast<float>(
  444. src.total_nr_elems() + diff.total_nr_elems() +
  445. grad.total_nr_elems()) /
  446. (1024 * 1024 * 1024) * 1e3;
  447. float rr_bandwith = static_cast<float>(
  448. src.total_nr_elems() + diff.total_nr_elems() +
  449. rin.total_nr_elems() + rout.total_nr_elems() +
  450. grad.total_nr_elems()) /
  451. (1024 * 1024 * 1024) * 1e3;
  452. auto time_in_ms = bencher.execs({src, diff, grad}) / nr_times;
  453. auto ops = 2.0 * batch * g * hi * wi * fh * fw / (time_in_ms * 1e-3) * 1e-12;
  454. auto rr_time_in_ms = rr_bencher.execs({src, diff, rin, rout, grad}) / nr_times;
  455. auto rr_ops =
  456. 2.0 * batch * g * hi * wi * fh * fw / (rr_time_in_ms * 1e-3) * 1e-12;
  457. printf("[DGRAD]RegionRestrictedDepthwiseLargeFilter vs DepthwiseLargeFilter: "
  458. "src=%s, "
  459. "diff=%s, grad=%s\n"
  460. "time: %.2f ms, time(rr): %.2f ms, perf: %.2fTops, perf(rr): %.2f Tops\n"
  461. "bandwidth: %.2fGB/s, bandwidth(rr): %.2fGB/s, speedup: %.2f.\n",
  462. src.to_string().c_str(), diff.to_string().c_str(),
  463. grad.to_string().c_str(), time_in_ms, rr_time_in_ms, ops, rr_ops,
  464. bandwith * 4 / time_in_ms, rr_bandwith * 4 / rr_time_in_ms,
  465. time_in_ms / rr_time_in_ms);
  466. };
  467. run_bench(64, 384, 32, 32, 3, 3, 1, 1, 1000);
  468. run_bench(64, 384, 32, 32, 5, 5, 1, 1, 1000);
  469. run_bench(64, 384, 32, 32, 7, 7, 1, 1, 1000);
  470. run_bench(64, 384, 32, 32, 9, 9, 1, 1, 1000);
  471. run_bench(64, 384, 32, 32, 11, 11, 1, 1, 1000);
  472. run_bench(64, 384, 32, 32, 13, 13, 1, 1, 1000);
  473. run_bench(64, 384, 32, 32, 15, 15, 1, 1, 1000);
  474. run_bench(64, 384, 32, 32, 17, 17, 1, 1, 1000);
  475. run_bench(64, 384, 32, 32, 19, 19, 1, 1, 1000);
  476. run_bench(64, 384, 32, 32, 21, 21, 1, 1, 1000);
  477. run_bench(64, 384, 32, 32, 23, 23, 1, 1, 1000);
  478. run_bench(64, 384, 32, 32, 25, 25, 1, 1, 1000);
  479. run_bench(64, 384, 32, 32, 27, 27, 1, 1, 1000);
  480. run_bench(64, 384, 32, 32, 29, 29, 1, 1, 1000);
  481. run_bench(64, 384, 32, 32, 31, 31, 1, 1, 1000);
  482. }
  483. TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_BACKWARD_FILTER_FP32_RINT8) {
  484. require_compute_capability(7, 5);
  485. Benchmarker<ConvolutionBackwardFilter> bencher(handle_cuda());
  486. bencher.set_display(false);
  487. bencher.set_before_exec_callback(AlgoChecker<ConvolutionBackwardFilter>(
  488. "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_128X128X8_32X64X8_2stage"));
  489. Benchmarker<RegionRestrictedConvolutionBackwardFilter> rr_bencher(handle_cuda());
  490. rr_bencher.set_display(false);
  491. ConvolutionBackwardFilter::Param param;
  492. param.format = ConvolutionBackwardFilter::Param::Format::NCHW;
  493. param.sparse = ConvolutionBackwardFilter::Param::Sparse::GROUP;
  494. RegionRestrictedConvolutionBackwardFilter::Param rr_param;
  495. rr_param.format = RegionRestrictedConvolutionBackwardFilter::Param::Format::NCHW;
  496. rr_param.sparse = RegionRestrictedConvolutionBackwardFilter::Param::Sparse::GROUP;
  497. UniformIntRNG r_rng{1, 3};
  498. auto run_bench = [&](size_t batch, size_t g, size_t hi, size_t wi, size_t fh,
  499. size_t fw, size_t sh, size_t sw, size_t nr_times) {
  500. param.pad_h = fh / 2;
  501. param.pad_w = fw / 2;
  502. param.stride_h = sh;
  503. param.stride_w = sw;
  504. rr_param.pad_h = fh / 2;
  505. rr_param.pad_w = fw / 2;
  506. rr_param.stride_h = sh;
  507. rr_param.stride_w = sw;
  508. bencher.set_param(param)
  509. .set_dtype(0, dtype::Float32())
  510. .set_dtype(1, dtype::Float32())
  511. .set_dtype(2, dtype::Float32())
  512. .set_dtype(4, dtype::Float32());
  513. bencher.proxy()->target_execution_policy = {};
  514. bencher.set_times(nr_times);
  515. rr_bencher.set_param(rr_param)
  516. .set_dtype(0, dtype::Float32())
  517. .set_dtype(1, dtype::Float32())
  518. .set_dtype(2, dtype::Uint8())
  519. .set_dtype(3, dtype::Uint8());
  520. rr_bencher.set_rng(2, &r_rng).set_rng(3, &r_rng);
  521. rr_bencher.set_times(nr_times);
  522. size_t ho = infer_conv_shape(hi, fh, sh, param.pad_h);
  523. size_t wo = infer_conv_shape(wi, fw, sw, param.pad_w);
  524. TensorShape src{batch, g, hi, wi}, diff{batch, g, ho, wo}, rin{batch, hi, wi},
  525. rout{batch, ho, wo}, grad{g, 1, 1, fh, fw};
  526. float bandwith = static_cast<float>(
  527. src.total_nr_elems() + diff.total_nr_elems() +
  528. grad.total_nr_elems()) /
  529. (1024 * 1024 * 1024) * 1e3;
  530. float rr_bandwith = static_cast<float>(
  531. src.total_nr_elems() + diff.total_nr_elems() +
  532. rin.total_nr_elems() + rout.total_nr_elems() +
  533. grad.total_nr_elems()) /
  534. (1024 * 1024 * 1024) * 1e3;
  535. auto time_in_ms = bencher.execs({src, diff, grad}) / nr_times;
  536. auto ops = 2.0 * batch * g * hi * wi * fh * fw / (time_in_ms * 1e-3) * 1e-12;
  537. auto rr_time_in_ms = rr_bencher.execs({src, diff, rin, rout, grad}) / nr_times;
  538. auto rr_ops =
  539. 2.0 * batch * g * hi * wi * fh * fw / (rr_time_in_ms * 1e-3) * 1e-12;
  540. printf("[DGRAD]RegionRestrictedDepthwiseLargeFilter vs DepthwiseLargeFilter: "
  541. "src=%s, "
  542. "diff=%s, grad=%s\n"
  543. "time: %.2f ms, time(rr): %.2f ms, perf: %.2fTops, perf(rr): %.2f Tops\n"
  544. "bandwidth: %.2fGB/s, bandwidth(rr): %.2fGB/s, speedup: %.2f.\n",
  545. src.to_string().c_str(), diff.to_string().c_str(),
  546. grad.to_string().c_str(), time_in_ms, rr_time_in_ms, ops, rr_ops,
  547. bandwith * 4 / time_in_ms, rr_bandwith * 4 / rr_time_in_ms,
  548. time_in_ms / rr_time_in_ms);
  549. };
  550. run_bench(64, 384, 32, 32, 3, 3, 1, 1, 1000);
  551. run_bench(64, 384, 32, 32, 5, 5, 1, 1, 1000);
  552. run_bench(64, 384, 32, 32, 7, 7, 1, 1, 1000);
  553. run_bench(64, 384, 32, 32, 9, 9, 1, 1, 1000);
  554. run_bench(64, 384, 32, 32, 11, 11, 1, 1, 1000);
  555. run_bench(64, 384, 32, 32, 13, 13, 1, 1, 1000);
  556. run_bench(64, 384, 32, 32, 15, 15, 1, 1, 1000);
  557. run_bench(64, 384, 32, 32, 17, 17, 1, 1, 1000);
  558. run_bench(64, 384, 32, 32, 19, 19, 1, 1, 1000);
  559. run_bench(64, 384, 32, 32, 21, 21, 1, 1, 1000);
  560. run_bench(64, 384, 32, 32, 23, 23, 1, 1, 1000);
  561. run_bench(64, 384, 32, 32, 25, 25, 1, 1, 1000);
  562. run_bench(64, 384, 32, 32, 27, 27, 1, 1, 1000);
  563. run_bench(64, 384, 32, 32, 29, 29, 1, 1, 1000);
  564. run_bench(64, 384, 32, 32, 31, 31, 1, 1, 1000);
  565. }
  566. #endif
  567. TEST_F(CUDA, REGION_RESTRICTED_CONV_BWD_DATA_FP32) {
  568. Checker<RegionRestrictedConvolutionBackwardData> checker(handle_cuda());
  569. for (auto dt : std::vector<DType>{dtype::Int32(), dtype::Uint8()}) {
  570. auto run = [&checker, &dt](
  571. size_t n, size_t g, size_t ih, size_t fh, size_t padding,
  572. size_t stride) {
  573. RegionRestrictedConvolutionBackwardData::Param cur_param;
  574. cur_param.mode = RegionRestrictedConvolutionBackwardData::Param::Mode::
  575. CROSS_CORRELATION;
  576. cur_param.compute_mode = RegionRestrictedConvolutionBackwardData::Param::
  577. ComputeMode::DEFAULT;
  578. cur_param.sparse =
  579. RegionRestrictedConvolutionBackwardData::Param::Sparse::GROUP;
  580. checker.set_dtype(0, dtype::Float32())
  581. .set_dtype(1, dtype::Float32())
  582. .set_dtype(2, dt)
  583. .set_dtype(3, dt);
  584. float scale = 64.f / sqrt(fh * fh);
  585. UniformFloatRNG rng(scale, 2 * scale);
  586. UniformIntRNG r_rng{1, 2};
  587. checker.set_rng(0, &rng).set_rng(1, &rng).set_rng(2, &r_rng).set_rng(
  588. 3, &r_rng);
  589. cur_param.pad_h = cur_param.pad_w = padding;
  590. cur_param.stride_h = cur_param.stride_w = stride;
  591. size_t oh = (ih + 2 * padding - fh + 1) / stride;
  592. checker.set_param(cur_param).execs({
  593. {g, 1, 1, fh, fh}, // filter
  594. {n, g * 1, oh, oh}, // diff
  595. {n, ih, ih}, // rin
  596. {n, oh, oh}, // rout
  597. {n, g * 1, ih, ih} // grad
  598. });
  599. };
  600. if (dt == dtype::Int32()) {
  601. run(4, 8, 32, 5, 5 / 2, 1);
  602. run(1, 2, 2, 2, 0, 1);
  603. run(1, 2, 3, 3, 0, 1);
  604. run(1, 2, 4, 4, 0, 1);
  605. run(1, 2, 5, 5, 0, 1);
  606. run(1, 2, 6, 6, 0, 1);
  607. run(1, 2, 7, 7, 0, 1);
  608. }
  609. run(4, 8, 32, 7, 7 / 2, 1);
  610. run(4, 8, 32, 9, 9 / 2, 1);
  611. run(4, 8, 32, 11, 11 / 2, 1);
  612. run(4, 8, 32, 13, 13 / 2, 1);
  613. run(4, 8, 32, 15, 15 / 2, 1);
  614. run(4, 8, 32, 17, 17 / 2, 1);
  615. run(4, 8, 32, 19, 19 / 2, 1);
  616. run(4, 8, 32, 21, 21 / 2, 1);
  617. run(4, 8, 32, 23, 23 / 2, 1);
  618. run(4, 8, 32, 25, 25 / 2, 1);
  619. run(4, 8, 32, 27, 27 / 2, 1);
  620. run(4, 8, 32, 29, 29 / 2, 1);
  621. run(4, 1, 32, 29, 29 / 2, 1);
  622. run(4, 8, 32, 31, 31 / 2, 1);
  623. }
  624. }
  625. TEST_F(CUDA, REGION_RESTRICTED_CONV_BWD_DATA_FP32_RIN_EQ_ROUT) {
  626. Checker<RegionRestrictedConvolutionBackwardData> checker(handle_cuda());
  627. for (auto dt : std::vector<DType>{dtype::Int32()}) {
  628. auto run = [&checker, &dt](
  629. size_t n, size_t g, size_t ih, size_t fh, size_t padding,
  630. size_t stride) {
  631. RegionRestrictedConvolutionBackwardData::Param cur_param;
  632. cur_param.mode = RegionRestrictedConvolutionBackwardData::Param::Mode::
  633. CROSS_CORRELATION;
  634. cur_param.compute_mode = RegionRestrictedConvolutionBackwardData::Param::
  635. ComputeMode::DEFAULT;
  636. cur_param.sparse =
  637. RegionRestrictedConvolutionBackwardData::Param::Sparse::GROUP;
  638. checker.set_dtype(2, dt).set_dtype(3, dt);
  639. float scale = 64.f / sqrt(fh * fh);
  640. UniformFloatRNG rng(scale, 2 * scale);
  641. // value 0 mask may cause unexpected behaviour.
  642. UniformIntRNG r_rng{1, 1};
  643. checker.set_rng(0, &rng).set_rng(1, &rng).set_rng(2, &r_rng).set_rng(
  644. 3, &r_rng);
  645. cur_param.pad_h = cur_param.pad_w = padding;
  646. cur_param.stride_h = cur_param.stride_w = stride;
  647. size_t oh = (ih + 2 * padding - fh + 1) / stride;
  648. checker.set_param(cur_param).execs(
  649. {/*filter*/ {g, 1, 1, fh, fh},
  650. /*diff*/ {n, g * 1, oh, oh},
  651. /*rin*/ {n, ih, ih},
  652. /*rout*/ {n, oh, oh},
  653. /*grad*/ {n, g * 1, ih, ih}});
  654. };
  655. if (dt == dtype::Int32()) {
  656. // NOTE: UINT8 assert the spatial size of src&dst is 4*N
  657. run(4, 8, 32, 5, 5 / 2, 1);
  658. run(1, 2, 2, 2, 0, 1);
  659. run(1, 2, 3, 3, 0, 1);
  660. run(1, 2, 4, 4, 0, 1);
  661. run(1, 2, 5, 5, 0, 1);
  662. run(1, 2, 6, 6, 0, 1);
  663. run(1, 2, 7, 7, 0, 1);
  664. }
  665. run(4, 8, 32, 7, 7 / 2, 1);
  666. run(4, 8, 32, 9, 9 / 2, 1);
  667. run(4, 8, 32, 11, 11 / 2, 1);
  668. run(4, 8, 32, 13, 13 / 2, 1);
  669. run(4, 8, 32, 15, 15 / 2, 1);
  670. run(4, 8, 32, 17, 17 / 2, 1);
  671. run(4, 8, 32, 19, 19 / 2, 1);
  672. run(4, 8, 32, 21, 21 / 2, 1);
  673. run(4, 8, 32, 23, 23 / 2, 1);
  674. run(4, 8, 32, 25, 25 / 2, 1);
  675. run(4, 1, 32, 25, 25 / 2, 1);
  676. run(4, 8, 32, 27, 27 / 2, 1);
  677. run(4, 8, 32, 29, 29 / 2, 1);
  678. run(4, 8, 32, 31, 31 / 2, 1);
  679. }
  680. }
  681. TEST_F(CUDA, REGION_RESTRICTED_CONV_BWD_FILTER_FP32) {
  682. require_compute_capability(6, 1);
  683. Checker<RegionRestrictedConvolutionBackwardFilter> checker(handle_cuda());
  684. for (auto dt : std::vector<DType>{dtype::Int32(), dtype::Uint8()}) {
  685. auto run = [&checker, &dt](
  686. size_t n, size_t g, size_t ih, size_t fh, size_t padding,
  687. size_t stride) {
  688. RegionRestrictedConvolutionBackwardFilter::Param cur_param;
  689. cur_param.mode = RegionRestrictedConvolutionBackwardFilter::Param::Mode::
  690. CROSS_CORRELATION;
  691. cur_param.compute_mode = RegionRestrictedConvolutionBackwardFilter::Param::
  692. ComputeMode::DEFAULT;
  693. cur_param.sparse =
  694. RegionRestrictedConvolutionBackwardFilter::Param::Sparse::GROUP;
  695. checker.set_dtype(0, dtype::Float32())
  696. .set_dtype(1, dtype::Float32())
  697. .set_dtype(2, dt)
  698. .set_dtype(3, dt);
  699. float scale = 64.f / sqrt(fh * fh);
  700. UniformFloatRNG rng(scale, 2 * scale);
  701. UniformIntRNG r_rng{1, 2};
  702. checker.set_rng(0, &rng).set_rng(1, &rng).set_rng(2, &r_rng).set_rng(
  703. 3, &r_rng);
  704. cur_param.pad_h = cur_param.pad_w = padding;
  705. cur_param.stride_h = cur_param.stride_w = stride;
  706. size_t oh = (ih + 2 * padding - fh + 1) / stride;
  707. checker.set_param(cur_param).execs({
  708. {n, g * 1, ih, ih}, // src
  709. {n, g * 1, oh, oh}, // diff
  710. {n, ih, ih}, // rin
  711. {n, oh, oh}, // rout
  712. {g, 1, 1, fh, fh} // grad
  713. });
  714. };
  715. if (dt == dtype::Int32()) {
  716. run(4, 8, 32, 5, 5 / 2, 1);
  717. run(1, 2, 2, 2, 0, 1);
  718. run(1, 2, 3, 3, 0, 1);
  719. run(1, 2, 4, 4, 0, 1);
  720. run(1, 2, 5, 5, 0, 1);
  721. run(1, 2, 6, 6, 0, 1);
  722. run(1, 2, 7, 7, 0, 1);
  723. }
  724. run(4, 8, 32, 7, 7 / 2, 1);
  725. run(4, 8, 32, 9, 9 / 2, 1);
  726. run(4, 8, 32, 11, 11 / 2, 1);
  727. run(4, 8, 32, 13, 13 / 2, 1);
  728. run(4, 8, 32, 15, 15 / 2, 1);
  729. run(4, 8, 32, 17, 17 / 2, 1);
  730. run(4, 8, 32, 19, 19 / 2, 1);
  731. run(4, 8, 32, 21, 21 / 2, 1);
  732. run(4, 8, 32, 23, 23 / 2, 1);
  733. run(4, 8, 32, 25, 25 / 2, 1);
  734. run(4, 8, 32, 27, 27 / 2, 1);
  735. run(4, 1, 32, 27, 27 / 2, 1);
  736. run(4, 8, 32, 29, 29 / 2, 1);
  737. run(4, 8, 32, 31, 31 / 2, 1);
  738. }
  739. }
  740. TEST_F(CUDA, REGION_RESTRICTED_CONV_BWD_FILTER_FP32_RIN_EQ_ROUT) {
  741. require_compute_capability(6, 1);
  742. Checker<RegionRestrictedConvolutionBackwardFilter> checker(handle_cuda());
  743. for (auto dt : std::vector<DType>{dtype::Int32(), dtype::Uint8()}) {
  744. auto run = [&checker, &dt](
  745. size_t n, size_t g, size_t ih, size_t fh, size_t padding,
  746. size_t stride) {
  747. RegionRestrictedConvolutionBackwardFilter::Param cur_param;
  748. cur_param.mode = RegionRestrictedConvolutionBackwardFilter::Param::Mode::
  749. CROSS_CORRELATION;
  750. cur_param.compute_mode = RegionRestrictedConvolutionBackwardFilter::Param::
  751. ComputeMode::DEFAULT;
  752. cur_param.sparse =
  753. RegionRestrictedConvolutionBackwardFilter::Param::Sparse::GROUP;
  754. checker.set_dtype(0, dtype::Float32())
  755. .set_dtype(1, dtype::Float32())
  756. .set_dtype(2, dt)
  757. .set_dtype(3, dt);
  758. float scale = 64.f / sqrt(fh * fh);
  759. UniformFloatRNG rng(scale, 2 * scale);
  760. UniformIntRNG r_rng{1, 1};
  761. checker.set_rng(0, &rng).set_rng(1, &rng).set_rng(2, &r_rng).set_rng(
  762. 3, &r_rng);
  763. cur_param.pad_h = cur_param.pad_w = padding;
  764. cur_param.stride_h = cur_param.stride_w = stride;
  765. size_t oh = (ih + 2 * padding - fh + 1) / stride;
  766. checker.set_param(cur_param).execs({
  767. {n, g * 1, ih, ih}, // src
  768. {n, g * 1, oh, oh}, // diff
  769. {n, ih, ih}, // rin
  770. {n, oh, oh}, // rout
  771. {g, 1, 1, fh, fh} // grad
  772. });
  773. };
  774. if (dt == dtype::Int32()) {
  775. run(4, 8, 32, 5, 5 / 2, 1);
  776. run(1, 2, 2, 2, 0, 1);
  777. run(1, 2, 3, 3, 0, 1);
  778. run(1, 2, 4, 4, 0, 1);
  779. run(1, 2, 5, 5, 0, 1);
  780. run(1, 2, 6, 6, 0, 1);
  781. run(1, 2, 7, 7, 0, 1);
  782. }
  783. run(4, 8, 32, 7, 7 / 2, 1);
  784. run(4, 8, 32, 9, 9 / 2, 1);
  785. run(4, 8, 32, 11, 11 / 2, 1);
  786. run(4, 8, 32, 13, 13 / 2, 1);
  787. run(4, 8, 32, 15, 15 / 2, 1);
  788. run(4, 8, 32, 17, 17 / 2, 1);
  789. run(4, 8, 32, 19, 19 / 2, 1);
  790. run(4, 8, 32, 21, 21 / 2, 1);
  791. run(4, 1, 32, 21, 21 / 2, 1);
  792. run(4, 8, 32, 23, 23 / 2, 1);
  793. run(4, 8, 32, 25, 25 / 2, 1);
  794. run(4, 8, 32, 27, 27 / 2, 1);
  795. run(4, 8, 32, 29, 29 / 2, 1);
  796. run(4, 8, 32, 31, 31 / 2, 1);
  797. }
  798. }
  799. } // namespace test
  800. } // namespace megdnn
  801. // vim: syntax=cpp.doxygen