You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

resize.cpp 9.7 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264
  1. /**
  2. * \file dnn/test/cuda/resize.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. */
  11. #include "test/common/resize.h"
  12. #include "src/common/cv/enums.h"
  13. #include "test/common/benchmarker.h"
  14. #include "test/common/checker.h"
  15. #include "test/cuda/fixture.h"
  16. namespace megdnn {
  17. namespace test {
  18. namespace resize {
  19. TEST_F(CUDA, RESIZE_CV) {
  20. using namespace resize;
  21. std::vector<TestArg> args = get_cv_args();
  22. Checker<Resize> checker(handle_cuda());
  23. for (auto&& arg : args) {
  24. checker.set_param(arg.param)
  25. .set_dtype(0, dtype::Uint8())
  26. .set_dtype(1, dtype::Uint8())
  27. .set_epsilon(1)
  28. .set_max_avg_error(0.4)
  29. .execs({arg.src, arg.dst});
  30. }
  31. for (auto&& arg : args) {
  32. checker.set_param(arg.param)
  33. .set_dtype(0, dtype::Float32())
  34. .set_dtype(1, dtype::Float32())
  35. .set_epsilon(1e-3)
  36. .execs({arg.src, arg.dst});
  37. }
  38. }
  39. TEST_F(CUDA, RESIZE_FORWARD) {
  40. using namespace resize;
  41. IMode modes[] = {IMode::INTER_LINEAR, IMode::NEAREST, IMode::INTER_CUBIC};
  42. for (auto imode : modes) {
  43. std::vector<TestArg> args = get_args(imode);
  44. Checker<Resize> checker(handle_cuda());
  45. for (auto&& arg : args) {
  46. checker.set_param(arg.param)
  47. .set_dtype(0, dtype::Uint8())
  48. .set_dtype(1, dtype::Uint8())
  49. .set_epsilon(1)
  50. .execs({arg.src, arg.dst});
  51. }
  52. for (auto&& arg : args) {
  53. checker.set_param(arg.param)
  54. .set_dtype(0, dtype::Float32())
  55. .set_dtype(1, dtype::Float32())
  56. .set_epsilon(1e-3)
  57. .execs({arg.src, arg.dst});
  58. }
  59. for (auto&& arg : args) {
  60. checker.set_param(arg.param)
  61. .set_dtype(0, dtype::Int8())
  62. .set_dtype(1, dtype::Int8())
  63. .set_epsilon(1)
  64. .execs({arg.src, arg.dst});
  65. }
  66. }
  67. }
  68. TEST_F(CUDA, RESIZE_NCHW4) {
  69. using namespace resize;
  70. Checker<Resize> checker(handle_cuda());
  71. auto args = get_nchw4_args();
  72. for (auto&& arg : args) {
  73. checker.set_param(arg.param)
  74. .set_dtype(0, dtype::QuantizedS8(0.1f))
  75. .set_dtype(1, dtype::QuantizedS8(0.1f))
  76. .set_epsilon(1 + 1e-3)
  77. .execs({arg.src, arg.dst});
  78. }
  79. }
  80. TEST_F(CUDA, RESIZE_NCHW_WITH_STRIDE) {
  81. IMode modes[] = {IMode::INTER_LINEAR, IMode::NEAREST, IMode::INTER_CUBIC};
  82. for (auto imode : modes) {
  83. param::Resize param;
  84. param.format = param::Resize::Format::NCHW;
  85. param.imode = imode;
  86. Checker<Resize> checker(handle_cuda());
  87. checker.set_epsilon(1 + 1e-3)
  88. .set_param(param);
  89. auto run = [&](TensorShape src_shape, std::vector<ptrdiff_t> src_layout,
  90. TensorShape dst_shape, DType dtype) {
  91. checker.set_dtype(0, dtype)
  92. .set_dtype(1, dtype)
  93. .execl({{src_shape, src_layout, dtype}, {dst_shape, dtype}});
  94. };
  95. for (DType& dtype : std::vector<DType>{dtype::Float32(), dtype::Uint8(),
  96. dtype::Int8()}) {
  97. run({2, 3, 4, 4}, {256, 32, 8, 1}, {2, 3, 3, 3}, dtype);
  98. run({1, 3, 4, 3}, {105, 35, 7, 2}, {1, 3, 5, 5}, dtype);
  99. run({1, 3, 40, 40}, {25600, 3200, 80, 1}, {1, 3, 30, 30}, dtype);
  100. run({2, 3, 4, 4}, {-256, 32, -8, 1}, {2, 3, 3, 3}, dtype);
  101. run({2, 3, 4, 4}, {256, -32, 8, -1}, {2, 3, 3, 3}, dtype);
  102. run({2, 3, 4, 4}, {-256, -32, -8, -1}, {2, 3, 3, 3}, dtype);
  103. }
  104. }
  105. }
  106. TEST_F(CUDA, RESIZE_BACKWARD) {
  107. IMode modes[] = {IMode::INTER_LINEAR, IMode::NEAREST, IMode::INTER_CUBIC};
  108. for (auto imode : modes) {
  109. Checker<ResizeBackward> checker(handle_cuda());
  110. param::Resize param;
  111. param.format = param::Resize::Format::NCHW;
  112. param.imode = imode;
  113. checker.set_param(param);
  114. checker.execs({{2, 3, 4, 5}, {2, 3, 8, 9}});
  115. checker.execs({{2, 5, 8, 9}, {2, 5, 4, 5}});
  116. checker.execs({{2, 5, 8, 5}, {2, 5, 4, 9}});
  117. checker.execs({{2, 5, 4, 9}, {2, 5, 8, 5}});
  118. }
  119. }
  120. #if MEGDNN_WITH_BENCHMARK
  121. TEST_F(CUDA, BENCHMARK_RESIZE_CV) {
  122. Benchmarker<Resize> benchmarker(handle_cuda());
  123. param::Resize param;
  124. param.format = param::Resize::Format::NHWC;
  125. param.imode = param::Resize::InterpolationMode::LANCZOS4;
  126. benchmarker.set_param(param);
  127. benchmarker.set_display(false);
  128. auto run = [&benchmarker](const TensorShape& src, const TensorShape& dst) {
  129. auto used = benchmarker.execs({src, dst});
  130. //! bandwith: each dst elem require 4 read and 1 write
  131. //! gflops: each dst elem require 4 mul + 3 add
  132. printf("run %s->%s used: %f ms %f GBPS %f Gflops\n",
  133. src.to_string().c_str(), dst.to_string().c_str(), used,
  134. dst.total_nr_elems() * (4.f + 1.f) * sizeof(float) /
  135. (1024 * 1024 * 1024) / used * 1e3,
  136. dst.total_nr_elems() * (4.f + 3.f) / (1024 * 1024 * 1024) /
  137. used * 1e3);
  138. };
  139. run({1, 128, 128, 3}, {1, 256, 256, 3});
  140. }
  141. TEST_F(CUDA, BENCHMARK_RESIZE_FORWARD) {
  142. Benchmarker<Resize> benchmarker(handle_cuda());
  143. param::Resize param;
  144. param.format = param::Resize::Format::NCHW;
  145. param.imode = param::Resize::InterpolationMode::LINEAR;
  146. benchmarker.set_param(param);
  147. benchmarker.set_display(false);
  148. auto run = [&benchmarker](const TensorShape& src, const TensorShape& dst) {
  149. auto used = benchmarker.execs({src, dst});
  150. //! bandwith: each dst elem require 4 read and 1 write
  151. //! gflops: each dst elem require 4 mul + 3 add
  152. printf("run %s->%s used: %f ms %f GBPS %f Gflops\n",
  153. src.to_string().c_str(), dst.to_string().c_str(), used,
  154. dst.total_nr_elems() * (4.f + 1.f) * sizeof(float) /
  155. (1024 * 1024 * 1024) / used * 1e3,
  156. dst.total_nr_elems() * (4.f + 3.f) / (1024 * 1024 * 1024) /
  157. used * 1e3);
  158. };
  159. run({1, 100, 256, 256}, {1, 100, 256, 5120});
  160. run({1, 100, 256, 5120}, {1, 100, 256, 256});
  161. run({1, 100, 256, 256}, {1, 100, 512, 512});
  162. run({1, 100, 512, 512}, {1, 100, 256, 256});
  163. }
  164. TEST_F(CUDA, BENCHMARK_RESIZE_FORWARD_NCHW4) {
  165. Benchmarker<Resize> benchmarker(handle_cuda());
  166. param::Resize param;
  167. param.imode = param::Resize::InterpolationMode::LINEAR;
  168. benchmarker.set_display(false);
  169. auto run = [&benchmarker](const TensorShape& src, const TensorShape& dst) {
  170. auto used = benchmarker.execs({src, dst});
  171. //! bandwith: each dst elem require 4 read and 1 write
  172. //! gflops: each dst elem require 4 mul + 3 add
  173. printf("run %s->%s used: %f ms %f GBPS %f Gflops\n",
  174. src.to_string().c_str(), dst.to_string().c_str(), used,
  175. dst.total_nr_elems() * (4.f + 1.f) /
  176. (1024 * 1024 * 1024) / used * 1e3,
  177. dst.total_nr_elems() * (4.f + 3.f) / (1024 * 1024 * 1024) /
  178. used * 1e3);
  179. };
  180. param.format = param::Resize::Format::NCHW;
  181. benchmarker.set_param(param);
  182. benchmarker.set_dtype(0, dtype::Int8());
  183. benchmarker.set_dtype(1, dtype::Int8());
  184. run({1, 100, 256, 256}, {1, 100, 256, 5120});
  185. run({1, 100, 256, 5120}, {1, 100, 256, 256});
  186. run({1, 100, 256, 256}, {1, 100, 512, 512});
  187. run({1, 100, 512, 512}, {1, 100, 256, 256});
  188. param.format = param::Resize::Format::NCHW4;
  189. benchmarker.set_param(param);
  190. benchmarker.set_dtype(0, dtype::QuantizedS8(1.0f));
  191. benchmarker.set_dtype(1, dtype::QuantizedS8(1.0f));
  192. run({1, 25, 256, 256, 4}, {1, 25, 256, 5120, 4});
  193. run({1, 25, 256, 5120, 4}, {1, 25, 256, 256, 4});
  194. run({1, 25, 256, 256, 4}, {1, 25, 512, 512, 4});
  195. run({1, 25, 512, 512, 4}, {1, 25, 256, 256, 4});
  196. }
  197. TEST_F(CUDA, BENCHMARK_RESIZE_BACKWARD) {
  198. Benchmarker<ResizeBackward> benchmarker(handle_cuda());
  199. param::Resize param;
  200. param.format = param::Resize::Format::NCHW;
  201. param.imode = param::Resize::InterpolationMode::LINEAR;
  202. benchmarker.set_param(param);
  203. benchmarker.set_display(false);
  204. const size_t RUNS = 5;
  205. benchmarker.set_times(RUNS);
  206. auto run = [&benchmarker](const TensorShape& diff,
  207. const TensorShape& grad) {
  208. auto used = benchmarker.execs({diff, grad});
  209. used /= RUNS;
  210. //! bandwith: each dst elem require 1 read and 4 write
  211. //! gflops: each dst elem require 4 add
  212. printf("run %s<-%s used: %f ms %f GBPS %f Gflops\n",
  213. diff.to_string().c_str(), grad.to_string().c_str(), used,
  214. diff.total_nr_elems() * (4.f + 1.f) * sizeof(float) /
  215. (1024 * 1024 * 1024) / used * 1e3,
  216. diff.total_nr_elems() * 4.f / (1024 * 1024 * 1024) / used * 1e3);
  217. };
  218. run({1, 100, 256, 256}, {1, 100, 256, 5120});
  219. run({1, 100, 256, 5120}, {1, 100, 256, 256});
  220. run({1, 100, 256, 256}, {1, 100, 512, 512});
  221. run({1, 100, 512, 512}, {1, 100, 256, 256});
  222. }
  223. #endif
  224. } // namespace resize
  225. } // namespace test
  226. } // namespace megdnn
  227. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台