
region_restricted_convolution.cpp

  1. #include "megdnn/dtype.h"
  2. #include "megdnn/opr_param_defs.h"
  3. #include "megdnn/oprs.h"
  4. #include "test/common/checker.h"
  5. #include "test/common/conv_bias.h"
  6. #include "test/common/rng.h"
  7. #include "test/common/tensor.h"
  8. #include "test/common/workspace_wrapper.h"
  9. #include "test/cuda/benchmark.h"
  10. #include "test/cuda/fixture.h"
  11. #include "test/cuda/utils.h"
  12. #include <cudnn.h>
  13. #define V1(x) #x
  14. #define V(x) V1(x)
  15. #define CUDNN_VERSION_STRING \
  16. "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V(CUDNN_PATCHLEVEL)
  17. namespace megdnn {
  18. namespace test {
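
// Correctness check: forward pass of region-restricted depthwise (group)
// convolution with large odd filters, run for both Int32 and Uint8 region-mask
// dtypes.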
TEST_F(CUDA, REGION_RESTRICTED_CONV_FORWARD_LARGE_FILTER) {
    Checker<RegionRestrictedConvolutionForward> checker(handle_cuda());
    auto opr = handle_cuda()->create_operator<ConvolutionForward>();
    for (auto dt : std::vector<DType>{dtype::Int32(), dtype::Uint8()}) {
        auto run = [&checker, &dt, &opr](
                           size_t n, size_t g, size_t h, size_t fh, size_t padding,
                           size_t stride) {
            RegionRestrictedConvolution::Param cur_param;
            cur_param.mode =
                    RegionRestrictedConvolution::Param::Mode::CROSS_CORRELATION;
            cur_param.sparse = RegionRestrictedConvolution::Param::Sparse::GROUP;
            checker.set_dtype(2, dt).set_dtype(3, dt);
            float scale = 64.f / sqrt(fh * fh);
            UniformFloatRNG rng(scale, 2 * scale);
            UniformIntRNG r_rng{0, 2};
            checker.set_rng(0, &rng).set_rng(1, &rng).set_rng(2, &r_rng).set_rng(
                    3, &r_rng);
            if (dt.enumv() == DTypeEnum::Float16) {
                checker.set_epsilon(1e-1);
            }
            cur_param.pad_h = cur_param.pad_w = padding;
            cur_param.stride_h = cur_param.stride_w = stride;
            size_t ho = infer_conv_shape(h, fh, stride, padding);
            checker.set_param(cur_param).execs(
                    {{n, g, h, h}, {g, 1, 1, fh, fh}, {n, h, h}, {n, ho, ho}, {}});
        };
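        // Sweep odd filter sizes from 3x3 up to 31x31 with fh/2 padding and stride 1.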
        run(4, 8, 32, 3, 3 / 2, 1);
        run(4, 8, 32, 5, 5 / 2, 1);
        run(4, 8, 32, 7, 7 / 2, 1);
        run(1, 2, 32, 9, 9 / 2, 1);
        run(4, 8, 32, 11, 11 / 2, 1);
        run(4, 8, 32, 13, 13 / 2, 1);
        run(4, 8, 32, 15, 15 / 2, 1);
        run(4, 8, 32, 17, 17 / 2, 1);
        run(4, 8, 32, 19, 19 / 2, 1);
        run(4, 8, 32, 21, 21 / 2, 1);
        run(4, 8, 32, 23, 23 / 2, 1);
        run(4, 8, 32, 25, 25 / 2, 1);
        run(4, 8, 32, 27, 27 / 2, 1);
        run(4, 8, 32, 29, 29 / 2, 1);
        run(4, 8, 32, 31, 31 / 2, 1);
    }
}

#if MEGDNN_WITH_BENCHMARK
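// Benchmark: region-restricted depthwise convolution (Int32 region masks) against
// the plain DEPTHWISE_LARGE_FILTER depthwise ConvBias algorithm on the same
// problem sizes.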
TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_FORWARD_LARGE_FILTER_FP32) {
    require_compute_capability(7, 5);
    Benchmarker<ConvBiasForward> bencher(handle_cuda());
    bencher.set_display(false);
    bencher.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
            ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                    "DEPTHWISE_LARGE_FILTER", {})
                    .c_str()));

    Benchmarker<RegionRestrictedConvolutionForward> rr_bencher(handle_cuda());
    rr_bencher.set_display(false);

    ConvBias::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    using NonlineMode = ConvBias::Param::NonlineMode;
    param.nonlineMode = NonlineMode::IDENTITY;
    param.sparse = ConvBias::Param::Sparse::GROUP;

    RegionRestrictedConvolutionForward::Param rr_param;
    rr_param.format = RegionRestrictedConvolutionForward::Param::Format::NCHW;
    rr_param.sparse = RegionRestrictedConvolutionForward::Param::Sparse::GROUP;

    UniformIntRNG r_rng{0, 2};
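
    // run_bench: time both operators on one depthwise problem (region labels drawn
    // from {0, 1, 2}) and report per-run time, throughput, bandwidth and speedup.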
    auto run_bench = [&](size_t batch, size_t g, size_t hi, size_t wi, size_t fh,
                         size_t fw, size_t sh, size_t sw, size_t nr_times) {
        param.pad_h = fh / 2;
        param.pad_w = fw / 2;
        param.stride_h = sh;
        param.stride_w = sw;
        rr_param.pad_h = fh / 2;
        rr_param.pad_w = fw / 2;
        rr_param.stride_h = sh;
        rr_param.stride_w = sw;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_dtype(4, dtype::Float32());
        bencher.set_times(nr_times);
        rr_bencher.set_param(rr_param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Int32())
                .set_dtype(3, dtype::Int32());
        rr_bencher.set_rng(2, &r_rng).set_rng(3, &r_rng).set_rng(0, &r_rng);
        rr_bencher.set_times(nr_times);
        size_t ho = infer_conv_shape(hi, fh, sh, param.pad_h);
        size_t wo = infer_conv_shape(wi, fw, sw, param.pad_w);
        TensorShape inp{batch, g, hi, wi}, kern{g, 1, 1, fh, fw}, rin{batch, hi, wi},
                rout{batch, ho, wo}, out{batch, g, ho, wo};
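        // Rough memory-traffic estimate in element counts; the "* 4" at the printf
        // converts to bytes (4-byte dtypes) and the 1e3 factor turns the later
        // division by milliseconds into GiB/s.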
        float bandwidth = static_cast<float>(
                                  inp.total_nr_elems() + kern.total_nr_elems() +
                                  out.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;
        float rr_bandwidth = static_cast<float>(
                                     inp.total_nr_elems() + kern.total_nr_elems() +
                                     rin.total_nr_elems() + rout.total_nr_elems() +
                                     out.total_nr_elems()) /
                             (1024 * 1024 * 1024) * 1e3;
        auto time_in_ms = bencher.execs({inp, kern, {}, {}, out}) / nr_times;
        auto ops = 2.0 * batch * g * ho * wo * fh * fw / (time_in_ms * 1e-3) * 1e-12;
        auto rr_time_in_ms = rr_bencher.execs({inp, kern, rin, rout, out}) / nr_times;
        auto rr_ops =
                2.0 * batch * g * ho * wo * fh * fw / (rr_time_in_ms * 1e-3) * 1e-12;
        printf("RegionRestrictedDepthwiseLargeFilter vs DepthwiseLargeFilter: inp=%s, "
               "kern=%s, out=%s\n"
               "time: %.2f ms, time(rr): %.2f ms, perf: %.2f TOps, perf(rr): %.2f TOps\n"
               "bandwidth: %.2f GB/s, bandwidth(rr): %.2f GB/s, speedup: %.2f.\n",
               inp.to_string().c_str(), kern.to_string().c_str(),
               out.to_string().c_str(), time_in_ms, rr_time_in_ms, ops, rr_ops,
               bandwidth * 4 / time_in_ms, rr_bandwidth * 4 / rr_time_in_ms,
               time_in_ms / rr_time_in_ms);
    };
    run_bench(64, 384, 32, 32, 3, 3, 1, 1, 10);
    run_bench(64, 384, 32, 32, 5, 5, 1, 1, 10);
    run_bench(64, 384, 32, 32, 7, 7, 1, 1, 10);
    run_bench(64, 384, 32, 32, 9, 9, 1, 1, 10);
    run_bench(64, 384, 32, 32, 11, 11, 1, 1, 10);
    run_bench(64, 384, 32, 32, 13, 13, 1, 1, 10);
    run_bench(64, 384, 32, 32, 15, 15, 1, 1, 10);
    run_bench(64, 384, 32, 32, 17, 17, 1, 1, 10);
    run_bench(64, 384, 32, 32, 19, 19, 1, 1, 10);
    run_bench(64, 384, 32, 32, 21, 21, 1, 1, 10);
    run_bench(64, 384, 32, 32, 23, 23, 1, 1, 10);
    run_bench(64, 384, 32, 32, 25, 25, 1, 1, 10);
    run_bench(64, 384, 32, 32, 27, 27, 1, 1, 10);
    run_bench(64, 384, 32, 32, 29, 29, 1, 1, 10);
    run_bench(64, 384, 32, 32, 31, 31, 1, 1, 10);
}
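
// Same comparison as the FP32 benchmark above, but with Uint8 region masks for the
// region-restricted operator.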
TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_FORWARD_LARGE_FILTER_UINT8) {
    require_compute_capability(7, 5);
    Benchmarker<ConvBiasForward> bencher(handle_cuda());
    bencher.set_display(false);
    bencher.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
            ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                    "DEPTHWISE_LARGE_FILTER", {})
                    .c_str()));

    Benchmarker<RegionRestrictedConvolutionForward> rr_bencher(handle_cuda());
    rr_bencher.set_display(false);

    ConvBias::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    using NonlineMode = ConvBias::Param::NonlineMode;
    param.nonlineMode = NonlineMode::IDENTITY;
    param.sparse = ConvBias::Param::Sparse::GROUP;

    RegionRestrictedConvolutionForward::Param rr_param;
    rr_param.format = RegionRestrictedConvolutionForward::Param::Format::NCHW;
    rr_param.sparse = RegionRestrictedConvolutionForward::Param::Sparse::GROUP;

    UniformIntRNG r_rng{0, 2};

    auto run_bench = [&](size_t batch, size_t g, size_t hi, size_t wi, size_t fh,
                         size_t fw, size_t sh, size_t sw, size_t nr_times) {
        param.pad_h = fh / 2;
        param.pad_w = fw / 2;
        param.stride_h = sh;
        param.stride_w = sw;
        rr_param.pad_h = fh / 2;
        rr_param.pad_w = fw / 2;
        rr_param.stride_h = sh;
        rr_param.stride_w = sw;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_dtype(4, dtype::Float32());
        bencher.set_times(nr_times);
        rr_bencher.set_param(rr_param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Uint8())
                .set_dtype(3, dtype::Uint8());
        rr_bencher.set_rng(2, &r_rng).set_rng(3, &r_rng).set_rng(0, &r_rng);
        rr_bencher.set_times(nr_times);
        size_t ho = infer_conv_shape(hi, fh, sh, param.pad_h);
        size_t wo = infer_conv_shape(wi, fw, sw, param.pad_w);
        TensorShape inp{batch, g, hi, wi}, kern{g, 1, 1, fh, fw}, rin{batch, hi, wi},
                rout{batch, ho, wo}, out{batch, g, ho, wo};
        float bandwidth = static_cast<float>(
                                  inp.total_nr_elems() + kern.total_nr_elems() +
                                  out.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;
        float rr_bandwidth = static_cast<float>(
                                     inp.total_nr_elems() + kern.total_nr_elems() +
                                     rin.total_nr_elems() + rout.total_nr_elems() +
                                     out.total_nr_elems()) /
                             (1024 * 1024 * 1024) * 1e3;
        auto time_in_ms = bencher.execs({inp, kern, {}, {}, out}) / nr_times;
        auto ops = 2.0 * batch * g * ho * wo * fh * fw / (time_in_ms * 1e-3) * 1e-12;
        auto rr_time_in_ms = rr_bencher.execs({inp, kern, rin, rout, out}) / nr_times;
        auto rr_ops =
                2.0 * batch * g * ho * wo * fh * fw / (rr_time_in_ms * 1e-3) * 1e-12;
        printf("RegionRestrictedDepthwiseLargeFilter vs DepthwiseLargeFilter: inp=%s, "
               "kern=%s, out=%s\n"
               "time: %.2f ms, time(rr): %.2f ms, perf: %.2f TOps, perf(rr): %.2f TOps\n"
               "bandwidth: %.2f GB/s, bandwidth(rr): %.2f GB/s, speedup: %.2f.\n",
               inp.to_string().c_str(), kern.to_string().c_str(),
               out.to_string().c_str(), time_in_ms, rr_time_in_ms, ops, rr_ops,
               bandwidth * 4 / time_in_ms, rr_bandwidth * 4 / rr_time_in_ms,
               time_in_ms / rr_time_in_ms);
    };
    run_bench(64, 384, 32, 32, 3, 3, 1, 1, 10);
    run_bench(64, 384, 32, 32, 5, 5, 1, 1, 10);
    run_bench(64, 384, 32, 32, 7, 7, 1, 1, 10);
    run_bench(64, 384, 32, 32, 9, 9, 1, 1, 10);
    run_bench(64, 384, 32, 32, 11, 11, 1, 1, 10);
    run_bench(64, 384, 32, 32, 13, 13, 1, 1, 10);
    run_bench(64, 384, 32, 32, 15, 15, 1, 1, 10);
    run_bench(64, 384, 32, 32, 17, 17, 1, 1, 10);
    run_bench(64, 384, 32, 32, 19, 19, 1, 1, 10);
    run_bench(64, 384, 32, 32, 21, 21, 1, 1, 10);
    run_bench(64, 384, 32, 32, 23, 23, 1, 1, 10);
    run_bench(64, 384, 32, 32, 25, 25, 1, 1, 10);
    run_bench(64, 384, 32, 32, 27, 27, 1, 1, 10);
    run_bench(64, 384, 32, 32, 29, 29, 1, 1, 10);
    run_bench(64, 384, 32, 32, 31, 31, 1, 1, 10);
}
#endif

}  // namespace test
}  // namespace megdnn

// vim: syntax=cpp.doxygen