#include "test/common/resize.h"
#include "src/common/cv/enums.h"
#include "test/common/benchmarker.h"
#include "test/common/checker.h"
#include "test/cuda/fixture.h"

namespace megdnn {
namespace test {
namespace resize {
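
// Forward resize on the CV-style cases from get_cv_args(): uint8 with an
// absolute error budget plus a max-average-error bound, then float32 with a
// tight epsilon.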
TEST_F(CUDA, RESIZE_CV) {
    using namespace resize;
    std::vector<TestArg> args = get_cv_args();
    Checker<Resize> checker(handle_cuda());

    for (auto&& arg : args) {
        checker.set_param(arg.param)
                .set_dtype(0, dtype::Uint8())
                .set_dtype(1, dtype::Uint8())
                .set_epsilon(1)
                .set_max_avg_error(0.4)
                .execs({arg.src, arg.dst});
    }

    for (auto&& arg : args) {
        checker.set_param(arg.param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_epsilon(1e-3)
                .execs({arg.src, arg.dst});
    }
}
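
// Forward resize over the linear / nearest / cubic interpolation modes,
// covering uint8, float32, int8 and float16 inputs.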
TEST_F(CUDA, RESIZE_FORWARD) {
    using namespace resize;
    IMode modes[] = {IMode::INTER_LINEAR, IMode::NEAREST, IMode::INTER_CUBIC};
    for (auto imode : modes) {
        std::vector<TestArg> args = get_args(imode);
        Checker<Resize> checker(handle_cuda());

        for (auto&& arg : args) {
            checker.set_param(arg.param)
                    .set_dtype(0, dtype::Uint8())
                    .set_dtype(1, dtype::Uint8())
                    .set_epsilon(1)
                    .execs({arg.src, arg.dst});
        }

        for (auto&& arg : args) {
            checker.set_param(arg.param)
                    .set_dtype(0, dtype::Float32())
                    .set_dtype(1, dtype::Float32())
                    .set_epsilon(1e-3)
                    .execs({arg.src, arg.dst});
        }

        for (auto&& arg : args) {
            checker.set_param(arg.param)
                    .set_dtype(0, dtype::Int8())
                    .set_dtype(1, dtype::Int8())
                    .set_epsilon(1)
                    .execs({arg.src, arg.dst});
        }

        for (auto&& arg : args) {
            checker.set_param(arg.param)
                    .set_dtype(0, dtype::Float16())
                    .set_dtype(1, dtype::Float16())
                    .set_epsilon(1e-3)
                    .execs({arg.src, arg.dst});
        }
    }
}
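
// NHWC layout: run both ResizeBackward and ResizeForward on the same
// hand-picked shapes, in float32 and float16.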
TEST_F(CUDA, RESIZE_NHWC) {
    using namespace resize;
    std::vector<TestArg> args;
    param::Resize param;
    param.format = param::Resize::Format::NHWC;
    param.imode = param::Resize::InterpolationMode::LINEAR;
    args.emplace_back(param, TensorShape{1, 1, 4, 5}, TensorShape{1, 1, 8, 5});
    args.emplace_back(param, TensorShape{2, 6, 4, 5}, TensorShape{2, 3, 8, 5});
    args.emplace_back(param, TensorShape{1, 2, 2, 2}, TensorShape{1, 4, 3, 2});

    Checker<ResizeBackward> checkerBackward(handle_cuda());
    for (auto&& arg : args) {
        checkerBackward.set_param(arg.param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_epsilon(1e-3)
                .execs({arg.src, arg.dst});
    }
    for (auto&& arg : args) {
        checkerBackward.set_param(arg.param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_epsilon(1e-3)
                .execs({arg.src, arg.dst});
    }

    Checker<ResizeForward> checkerForward(handle_cuda());
    for (auto&& arg : args) {
        checkerForward.set_param(arg.param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_epsilon(1e-3)
                .execs({arg.src, arg.dst});
    }
    for (auto&& arg : args) {
        checkerForward.set_param(arg.param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_epsilon(1e-3)
                .execs({arg.src, arg.dst});
    }
}
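
// NCHW4 layout with QuantizedS8 data; an epsilon just above 1 allows
// off-by-one rounding differences in the quantized output.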
TEST_F(CUDA, RESIZE_NCHW4) {
    using namespace resize;
    Checker<Resize> checker(handle_cuda());
    auto args = get_nchw4_args();
    for (auto&& arg : args) {
        checker.set_param(arg.param)
                .set_dtype(0, dtype::QuantizedS8(0.1f))
                .set_dtype(1, dtype::QuantizedS8(0.1f))
                .set_epsilon(1 + 1e-3)
                .execs({arg.src, arg.dst});
    }
}
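
// Non-contiguous and negatively-strided NCHW inputs, passed to the checker as
// explicit tensor layouts via execl().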
TEST_F(CUDA, RESIZE_NCHW_WITH_STRIDE) {
    IMode modes[] = {IMode::INTER_LINEAR, IMode::NEAREST, IMode::INTER_CUBIC};
    for (auto imode : modes) {
        param::Resize param;
        param.format = param::Resize::Format::NCHW;
        param.imode = imode;
        Checker<Resize> checker(handle_cuda());
        checker.set_epsilon(1 + 1e-3).set_param(param);

        auto run = [&](TensorShape src_shape, std::vector<ptrdiff_t> src_layout,
                       TensorShape dst_shape, DType dtype) {
            checker.set_dtype(0, dtype).set_dtype(1, dtype).execl(
                    {{src_shape, src_layout, dtype}, {dst_shape, dtype}});
        };

        for (DType& dtype :
             std::vector<DType>{dtype::Float32(), dtype::Uint8(), dtype::Int8()}) {
            run({2, 3, 4, 4}, {256, 32, 8, 1}, {2, 3, 3, 3}, dtype);
            run({1, 3, 4, 3}, {105, 35, 7, 2}, {1, 3, 5, 5}, dtype);
            run({1, 3, 40, 40}, {25600, 3200, 80, 1}, {1, 3, 30, 30}, dtype);
            run({2, 3, 4, 4}, {-256, 32, -8, 1}, {2, 3, 3, 3}, dtype);
            run({2, 3, 4, 4}, {256, -32, 8, -1}, {2, 3, 3, 3}, dtype);
            run({2, 3, 4, 4}, {-256, -32, -8, -1}, {2, 3, 3, 3}, dtype);
        }
    }
}
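
// Backward (gradient) resize in NCHW for each interpolation mode, in float16
// (with a relaxed epsilon) and float32.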
TEST_F(CUDA, RESIZE_BACKWARD) {
    IMode modes[] = {IMode::INTER_LINEAR, IMode::NEAREST, IMode::INTER_CUBIC};
    for (auto imode : modes) {
        Checker<ResizeBackward> checker(handle_cuda());
        param::Resize param;
        param.format = param::Resize::Format::NCHW;
        param.imode = imode;
        checker.set_param(param);
        checker.set_dtype(0, dtype::Float16());
        checker.set_dtype(1, dtype::Float16());
        checker.set_epsilon(1 + 1e-3);
        checker.execs({{2, 3, 4, 5}, {2, 3, 8, 9}});
        checker.execs({{2, 5, 8, 9}, {2, 5, 4, 5}});
        checker.execs({{2, 5, 8, 5}, {2, 5, 4, 9}});
        checker.execs({{2, 5, 4, 9}, {2, 5, 8, 5}});
    }
    for (auto imode : modes) {
        Checker<ResizeBackward> checker(handle_cuda());
        param::Resize param;
        param.format = param::Resize::Format::NCHW;
        param.imode = imode;
        checker.set_param(param);
        checker.set_dtype(0, dtype::Float32());
        checker.set_dtype(1, dtype::Float32());
        checker.execs({{2, 3, 4, 5}, {2, 3, 8, 9}});
        checker.execs({{2, 5, 8, 9}, {2, 5, 4, 5}});
        checker.execs({{2, 5, 8, 5}, {2, 5, 4, 9}});
        checker.execs({{2, 5, 4, 9}, {2, 5, 8, 5}});
    }
}

#if MEGDNN_WITH_BENCHMARK
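// LANCZOS4 NHWC resize benchmark; GBPS/Gflops figures follow the per-element
// cost model noted inside run().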
TEST_F(CUDA, BENCHMARK_RESIZE_CV) {
    Benchmarker<Resize> benchmarker(handle_cuda());
    param::Resize param;
    param.format = param::Resize::Format::NHWC;
    param.imode = param::Resize::InterpolationMode::LANCZOS4;
    benchmarker.set_param(param);
    benchmarker.set_display(false);
    auto run = [&benchmarker](const TensorShape& src, const TensorShape& dst) {
        auto used = benchmarker.execs({src, dst});
        //! bandwidth: each dst elem requires 4 reads and 1 write
        //! gflops: each dst elem requires 4 mul + 3 add
        printf("run %s->%s used: %f ms %f GBPS %f Gflops\n", src.to_string().c_str(),
               dst.to_string().c_str(), used,
               dst.total_nr_elems() * (4.f + 1.f) * sizeof(float) /
                       (1024 * 1024 * 1024) / used * 1e3,
               dst.total_nr_elems() * (4.f + 3.f) / (1024 * 1024 * 1024) / used * 1e3);
    };

    run({1, 128, 128, 3}, {1, 256, 256, 3});
}
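
// NCHW linear-interpolation forward benchmark; the bandwidth estimate assumes
// float32 elements.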
TEST_F(CUDA, BENCHMARK_RESIZE_FORWARD) {
    Benchmarker<Resize> benchmarker(handle_cuda());
    param::Resize param;
    param.format = param::Resize::Format::NCHW;
    param.imode = param::Resize::InterpolationMode::LINEAR;
    benchmarker.set_param(param);
    benchmarker.set_display(false);
    auto run = [&benchmarker](const TensorShape& src, const TensorShape& dst) {
        auto used = benchmarker.execs({src, dst});
        //! bandwidth: each dst elem requires 4 reads and 1 write
        //! gflops: each dst elem requires 4 mul + 3 add
        printf("run %s->%s used: %f ms %f GBPS %f Gflops\n", src.to_string().c_str(),
               dst.to_string().c_str(), used,
               dst.total_nr_elems() * (4.f + 1.f) * sizeof(float) /
                       (1024 * 1024 * 1024) / used * 1e3,
               dst.total_nr_elems() * (4.f + 3.f) / (1024 * 1024 * 1024) / used * 1e3);
    };

    run({1, 100, 256, 256}, {1, 100, 256, 5120});
    run({1, 100, 256, 5120}, {1, 100, 256, 256});
    run({1, 100, 256, 256}, {1, 100, 512, 512});
    run({1, 100, 512, 512}, {1, 100, 256, 256});
}
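
// Compares int8 NCHW against QuantizedS8 NCHW4 on shapes with the same total
// element count.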
TEST_F(CUDA, BENCHMARK_RESIZE_FORWARD_NCHW4) {
    Benchmarker<Resize> benchmarker(handle_cuda());
    param::Resize param;
    param.imode = param::Resize::InterpolationMode::LINEAR;
    benchmarker.set_display(false);
    auto run = [&benchmarker](const TensorShape& src, const TensorShape& dst) {
        auto used = benchmarker.execs({src, dst});
        //! bandwidth: each dst elem requires 4 reads and 1 write
        //! gflops: each dst elem requires 4 mul + 3 add
        printf("run %s->%s used: %f ms %f GBPS %f Gflops\n", src.to_string().c_str(),
               dst.to_string().c_str(), used,
               dst.total_nr_elems() * (4.f + 1.f) / (1024 * 1024 * 1024) / used * 1e3,
               dst.total_nr_elems() * (4.f + 3.f) / (1024 * 1024 * 1024) / used * 1e3);
    };

    param.format = param::Resize::Format::NCHW;
    benchmarker.set_param(param);
    benchmarker.set_dtype(0, dtype::Int8());
    benchmarker.set_dtype(1, dtype::Int8());
    run({1, 100, 256, 256}, {1, 100, 256, 5120});
    run({1, 100, 256, 5120}, {1, 100, 256, 256});
    run({1, 100, 256, 256}, {1, 100, 512, 512});
    run({1, 100, 512, 512}, {1, 100, 256, 256});

    param.format = param::Resize::Format::NCHW4;
    benchmarker.set_param(param);
    benchmarker.set_dtype(0, dtype::QuantizedS8(1.0f));
    benchmarker.set_dtype(1, dtype::QuantizedS8(1.0f));
    run({1, 25, 256, 256, 4}, {1, 25, 256, 5120, 4});
    run({1, 25, 256, 5120, 4}, {1, 25, 256, 256, 4});
    run({1, 25, 256, 256, 4}, {1, 25, 512, 512, 4});
    run({1, 25, 512, 512, 4}, {1, 25, 256, 256, 4});
}
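
// ResizeBackward benchmark, averaging the reported time over RUNS executions.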
TEST_F(CUDA, BENCHMARK_RESIZE_BACKWARD) {
    Benchmarker<ResizeBackward> benchmarker(handle_cuda());
    param::Resize param;
    param.format = param::Resize::Format::NCHW;
    param.imode = param::Resize::InterpolationMode::LINEAR;
    benchmarker.set_param(param);
    benchmarker.set_display(false);
    const size_t RUNS = 5;
    benchmarker.set_times(RUNS);
    auto run = [&benchmarker](const TensorShape& diff, const TensorShape& grad) {
        auto used = benchmarker.execs({diff, grad});
        used /= RUNS;
        //! bandwidth: each dst elem requires 1 read and 4 writes
        //! gflops: each dst elem requires 4 add
        printf("run %s<-%s used: %f ms %f GBPS %f Gflops\n", diff.to_string().c_str(),
               grad.to_string().c_str(), used,
               diff.total_nr_elems() * (4.f + 1.f) * sizeof(float) /
                       (1024 * 1024 * 1024) / used * 1e3,
               diff.total_nr_elems() * 4.f / (1024 * 1024 * 1024) / used * 1e3);
    };

    run({1, 100, 256, 256}, {1, 100, 256, 5120});
    run({1, 100, 256, 5120}, {1, 100, 256, 256});
    run({1, 100, 256, 256}, {1, 100, 512, 512});
    run({1, 100, 512, 512}, {1, 100, 256, 256});
}
#endif

} // namespace resize
} // namespace test
} // namespace megdnn

// vim: syntax=cpp.doxygen