You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

resize.cpp 9.2 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253
  1. /**
  2. * \file dnn/test/cuda/resize.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. */
  11. #include "test/common/resize.h"
  12. #include "test/common/benchmarker.h"
  13. #include "test/common/checker.h"
  14. #include "test/cuda/fixture.h"
  15. namespace megdnn {
  16. namespace test {
  17. namespace resize {
  18. TEST_F(CUDA, RESIZE_CV) {
  19. using namespace resize;
  20. std::vector<TestArg> args = get_cv_args();
  21. Checker<Resize> checker(handle_cuda());
  22. for (auto&& arg : args) {
  23. checker.set_param(arg.param)
  24. .set_dtype(0, dtype::Uint8())
  25. .set_dtype(1, dtype::Uint8())
  26. .set_epsilon(1)
  27. .set_max_avg_error(0.4)
  28. .execs({arg.src, arg.dst});
  29. }
  30. for (auto&& arg : args) {
  31. checker.set_param(arg.param)
  32. .set_dtype(0, dtype::Float32())
  33. .set_dtype(1, dtype::Float32())
  34. .set_epsilon(1e-3)
  35. .execs({arg.src, arg.dst});
  36. }
  37. }
  38. TEST_F(CUDA, RESIZE_FORWARD) {
  39. using namespace resize;
  40. std::vector<TestArg> args = get_args();
  41. Checker<Resize> checker(handle_cuda());
  42. for (auto&& arg : args) {
  43. checker.set_param(arg.param)
  44. .set_dtype(0, dtype::Uint8())
  45. .set_dtype(1, dtype::Uint8())
  46. .execs({arg.src, arg.dst});
  47. }
  48. for (auto&& arg : args) {
  49. checker.set_param(arg.param)
  50. .set_dtype(0, dtype::Float32())
  51. .set_dtype(1, dtype::Float32())
  52. .set_epsilon(1e-3)
  53. .execs({arg.src, arg.dst});
  54. }
  55. for (auto&& arg : args) {
  56. checker.set_param(arg.param)
  57. .set_dtype(0, dtype::Int8())
  58. .set_dtype(1, dtype::Int8())
  59. .set_epsilon(1e-3)
  60. .execs({arg.src, arg.dst});
  61. }
  62. }
  63. TEST_F(CUDA, RESIZE_NCHW4) {
  64. using namespace resize;
  65. Checker<Resize> checker(handle_cuda());
  66. auto args = get_nchw4_args();
  67. for (auto&& arg : args) {
  68. checker.set_param(arg.param)
  69. .set_dtype(0, dtype::QuantizedS8(0.1f))
  70. .set_dtype(1, dtype::QuantizedS8(0.1f))
  71. .set_epsilon(1 + 1e-3)
  72. .execs({arg.src, arg.dst});
  73. }
  74. }
  75. TEST_F(CUDA, RESIZE_NCHW_WITH_STRIDE) {
  76. param::Resize param;
  77. param.format = param::Resize::Format::NCHW;
  78. param.imode = param::Resize::InterpolationMode::LINEAR;
  79. Checker<Resize> checker(handle_cuda());
  80. checker.set_epsilon(1 + 1e-3)
  81. .set_param(param);
  82. auto run = [&](TensorShape src_shape, std::vector<ptrdiff_t> src_layout,
  83. TensorShape dst_shape, DType dtype) {
  84. checker.set_dtype(0, dtype)
  85. .set_dtype(1, dtype)
  86. .execl({{src_shape, src_layout, dtype}, {dst_shape, dtype}});
  87. };
  88. for (DType& dtype : std::vector<DType>{dtype::Float32(), dtype::Uint8(),
  89. dtype::Int8()}) {
  90. run({2, 3, 4, 4}, {256, 32, 8, 1}, {2, 3, 3, 3}, dtype);
  91. run({1, 3, 4, 3}, {105, 35, 7, 2}, {1, 3, 5, 5}, dtype);
  92. run({1, 3, 40, 40}, {25600, 3200, 80, 1}, {1, 3, 30, 30}, dtype);
  93. run({2, 3, 4, 4}, {-256, 32, -8, 1}, {2, 3, 3, 3}, dtype);
  94. run({2, 3, 4, 4}, {256, -32, 8, -1}, {2, 3, 3, 3}, dtype);
  95. run({2, 3, 4, 4}, {-256, -32, -8, -1}, {2, 3, 3, 3}, dtype);
  96. }
  97. }
  98. TEST_F(CUDA, RESIZE_BACKWARD) {
  99. Checker<ResizeBackward> checker(handle_cuda());
  100. param::Resize param;
  101. param.format = param::Resize::Format::NCHW;
  102. param.imode = param::Resize::InterpolationMode::LINEAR;
  103. checker.set_param(param);
  104. checker.execs({{2, 3, 4, 5}, {2, 3, 8, 9}});
  105. checker.execs({{2, 5, 8, 9}, {2, 5, 4, 5}});
  106. checker.execs({{2, 5, 8, 5}, {2, 5, 4, 9}});
  107. checker.execs({{2, 5, 4, 9}, {2, 5, 8, 5}});
  108. }
  109. #if MEGDNN_WITH_BENCHMARK
  110. TEST_F(CUDA, BENCHMARK_RESIZE_CV) {
  111. Benchmarker<Resize> benchmarker(handle_cuda());
  112. param::Resize param;
  113. param.format = param::Resize::Format::NHWC;
  114. param.imode = param::Resize::InterpolationMode::LANCZOS4;
  115. benchmarker.set_param(param);
  116. benchmarker.set_display(false);
  117. auto run = [&benchmarker](const TensorShape& src, const TensorShape& dst) {
  118. auto used = benchmarker.execs({src, dst});
  119. //! bandwith: each dst elem require 4 read and 1 write
  120. //! gflops: each dst elem require 4 mul + 3 add
  121. printf("run %s->%s used: %f ms %f GBPS %f Gflops\n",
  122. src.to_string().c_str(), dst.to_string().c_str(), used,
  123. dst.total_nr_elems() * (4.f + 1.f) * sizeof(float) /
  124. (1024 * 1024 * 1024) / used * 1e3,
  125. dst.total_nr_elems() * (4.f + 3.f) / (1024 * 1024 * 1024) /
  126. used * 1e3);
  127. };
  128. run({1, 128, 128, 3}, {1, 256, 256, 3});
  129. }
  130. TEST_F(CUDA, BENCHMARK_RESIZE_FORWARD) {
  131. Benchmarker<Resize> benchmarker(handle_cuda());
  132. param::Resize param;
  133. param.format = param::Resize::Format::NCHW;
  134. param.imode = param::Resize::InterpolationMode::LINEAR;
  135. benchmarker.set_param(param);
  136. benchmarker.set_display(false);
  137. auto run = [&benchmarker](const TensorShape& src, const TensorShape& dst) {
  138. auto used = benchmarker.execs({src, dst});
  139. //! bandwith: each dst elem require 4 read and 1 write
  140. //! gflops: each dst elem require 4 mul + 3 add
  141. printf("run %s->%s used: %f ms %f GBPS %f Gflops\n",
  142. src.to_string().c_str(), dst.to_string().c_str(), used,
  143. dst.total_nr_elems() * (4.f + 1.f) * sizeof(float) /
  144. (1024 * 1024 * 1024) / used * 1e3,
  145. dst.total_nr_elems() * (4.f + 3.f) / (1024 * 1024 * 1024) /
  146. used * 1e3);
  147. };
  148. run({1, 100, 256, 256}, {1, 100, 256, 5120});
  149. run({1, 100, 256, 5120}, {1, 100, 256, 256});
  150. run({1, 100, 256, 256}, {1, 100, 512, 512});
  151. run({1, 100, 512, 512}, {1, 100, 256, 256});
  152. }
  153. TEST_F(CUDA, BENCHMARK_RESIZE_FORWARD_NCHW4) {
  154. Benchmarker<Resize> benchmarker(handle_cuda());
  155. param::Resize param;
  156. param.imode = param::Resize::InterpolationMode::LINEAR;
  157. benchmarker.set_display(false);
  158. auto run = [&benchmarker](const TensorShape& src, const TensorShape& dst) {
  159. auto used = benchmarker.execs({src, dst});
  160. //! bandwith: each dst elem require 4 read and 1 write
  161. //! gflops: each dst elem require 4 mul + 3 add
  162. printf("run %s->%s used: %f ms %f GBPS %f Gflops\n",
  163. src.to_string().c_str(), dst.to_string().c_str(), used,
  164. dst.total_nr_elems() * (4.f + 1.f) /
  165. (1024 * 1024 * 1024) / used * 1e3,
  166. dst.total_nr_elems() * (4.f + 3.f) / (1024 * 1024 * 1024) /
  167. used * 1e3);
  168. };
  169. param.format = param::Resize::Format::NCHW;
  170. benchmarker.set_param(param);
  171. benchmarker.set_dtype(0, dtype::Int8());
  172. benchmarker.set_dtype(1, dtype::Int8());
  173. run({1, 100, 256, 256}, {1, 100, 256, 5120});
  174. run({1, 100, 256, 5120}, {1, 100, 256, 256});
  175. run({1, 100, 256, 256}, {1, 100, 512, 512});
  176. run({1, 100, 512, 512}, {1, 100, 256, 256});
  177. param.format = param::Resize::Format::NCHW4;
  178. benchmarker.set_param(param);
  179. benchmarker.set_dtype(0, dtype::QuantizedS8(1.0f));
  180. benchmarker.set_dtype(1, dtype::QuantizedS8(1.0f));
  181. run({1, 25, 256, 256, 4}, {1, 25, 256, 5120, 4});
  182. run({1, 25, 256, 5120, 4}, {1, 25, 256, 256, 4});
  183. run({1, 25, 256, 256, 4}, {1, 25, 512, 512, 4});
  184. run({1, 25, 512, 512, 4}, {1, 25, 256, 256, 4});
  185. }
  186. TEST_F(CUDA, BENCHMARK_RESIZE_BACKWARD) {
  187. Benchmarker<ResizeBackward> benchmarker(handle_cuda());
  188. param::Resize param;
  189. param.format = param::Resize::Format::NCHW;
  190. param.imode = param::Resize::InterpolationMode::LINEAR;
  191. benchmarker.set_param(param);
  192. benchmarker.set_display(false);
  193. const size_t RUNS = 5;
  194. benchmarker.set_times(RUNS);
  195. auto run = [&benchmarker](const TensorShape& diff,
  196. const TensorShape& grad) {
  197. auto used = benchmarker.execs({diff, grad});
  198. used /= RUNS;
  199. //! bandwith: each dst elem require 1 read and 4 write
  200. //! gflops: each dst elem require 4 add
  201. printf("run %s<-%s used: %f ms %f GBPS %f Gflops\n",
  202. diff.to_string().c_str(), grad.to_string().c_str(), used,
  203. diff.total_nr_elems() * (4.f + 1.f) * sizeof(float) /
  204. (1024 * 1024 * 1024) / used * 1e3,
  205. diff.total_nr_elems() * 4.f / (1024 * 1024 * 1024) / used * 1e3);
  206. };
  207. run({1, 100, 256, 256}, {1, 100, 256, 5120});
  208. run({1, 100, 256, 5120}, {1, 100, 256, 256});
  209. run({1, 100, 256, 256}, {1, 100, 512, 512});
  210. run({1, 100, 512, 512}, {1, 100, 256, 256});
  211. }
  212. #endif
  213. } // namespace resize
  214. } // namespace test
  215. } // namespace megdnn
  216. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台