/**
 * \file dnn/test/cuda/remap.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#include "test/common/remap.h"
#include "test/common/benchmarker.h"
#include "test/common/checker.h"
#include "test/common/rng.h"
#include "test/cuda/benchmark.h"
#include "test/cuda/fixture.h"

namespace megdnn {
namespace test {
namespace remap {
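// Forward remap, NCHW layout, floating-point data. The map_xy coordinates
// are drawn from [-2, max(H, W) + 2], so out-of-bound sampling (and hence
// border-mode handling) is exercised too. The cb macro runs the checker once
// per test case for a given (dtype, rng) pair.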
TEST_F(CUDA, REMAP_NCHW_FLOAT) {
    Checker<Remap> checker(handle_cuda());
    std::vector<TestArg> args = get_nchw_args();
    UniformFloatRNG float_rng(0, 255);

#define cb(data_type, data_rng) \
    for (auto arg : args) { \
        UniformFloatRNG map_rng( \
                -2, std::max(arg.map_xy.shape[2], arg.map_xy.shape[1]) + 2); \
        checker.set_dtype(0, data_type) \
                .set_dtype(1, dtype::Float32()) \
                .set_dtype(2, data_type) \
                .set_rng(0, &data_rng) \
                .set_rng(1, &map_rng) \
                .set_rng(2, &data_rng) \
                .set_param(arg.param) \
                .execs({arg.src, arg.map_xy, arg.dst}); \
    }
    cb(dtype::Float32(), float_rng);
    cb(dtype::Float16(), float_rng);
#undef cb

#define cb(data_type, data_rng) \
    for (auto arg : args) { \
        UniformFloatRNG map_rng( \
                -2, std::max(arg.map_xy.shape[2], arg.map_xy.shape[1]) + 2); \
        checker.set_dtype(0, data_type) \
                .set_dtype(1, dtype::Float32()) \
                .set_dtype(2, data_type) \
                .set_rng(0, &data_rng) \
                .set_rng(1, &map_rng) \
                .set_rng(2, &data_rng) \
                .set_param(arg.param) \
                .set_epsilon(1e-2) \
                .execs({arg.src, arg.map_xy, arg.dst}); \
    }
    cb(dtype::BFloat16(), float_rng);
#undef cb
}
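// Forward remap, NCHW layout, 8-bit integer data. Bilinear results are
// rounded to integers, so an absolute tolerance of 1 is allowed.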
TEST_F(CUDA, REMAP_NCHW_INT) {
    Checker<Remap> checker(handle_cuda());
    std::vector<TestArg> args = get_nchw_args();
    UniformIntRNG uint8_rng(0, 255);
    UniformIntRNG int8_rng(-128, 127);

#define cb(data_type, data_rng) \
    for (auto arg : args) { \
        UniformFloatRNG map_rng( \
                -2, std::max(arg.map_xy.shape[2], arg.map_xy.shape[1]) + 2); \
        checker.set_dtype(0, data_type) \
                .set_dtype(1, dtype::Float32()) \
                .set_dtype(2, data_type) \
                .set_rng(0, &data_rng) \
                .set_rng(1, &map_rng) \
                .set_rng(2, &data_rng) \
                .set_epsilon(1) \
                .set_param(arg.param) \
                .execs({arg.src, arg.map_xy, arg.dst}); \
    }
    cb(dtype::Int8(), int8_rng);
    cb(dtype::Uint8(), uint8_rng);
#undef cb
}
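// The same forward floating-point coverage, now for the NHWC layout.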
TEST_F(CUDA, REMAP_NHWC_FLOAT) {
    Checker<Remap> checker(handle_cuda());
    std::vector<TestArg> args = get_nhwc_args();
    UniformFloatRNG float_rng(0, 255);

#define cb(data_type, data_rng) \
    for (auto arg : args) { \
        UniformFloatRNG map_rng( \
                -2, std::max(arg.map_xy.shape[2], arg.map_xy.shape[1]) + 2); \
        checker.set_dtype(0, data_type) \
                .set_dtype(1, dtype::Float32()) \
                .set_dtype(2, data_type) \
                .set_rng(0, &data_rng) \
                .set_rng(1, &map_rng) \
                .set_rng(2, &data_rng) \
                .set_param(arg.param) \
                .execs({arg.src, arg.map_xy, arg.dst}); \
    }
    cb(dtype::Float32(), float_rng);
    cb(dtype::Float16(), float_rng);
#undef cb

#define cb(data_type, data_rng) \
    for (auto arg : args) { \
        UniformFloatRNG map_rng( \
                -2, std::max(arg.map_xy.shape[2], arg.map_xy.shape[1]) + 2); \
        checker.set_dtype(0, data_type) \
                .set_dtype(1, dtype::Float32()) \
                .set_dtype(2, data_type) \
                .set_rng(0, &data_rng) \
                .set_rng(1, &map_rng) \
                .set_rng(2, &data_rng) \
                .set_param(arg.param) \
                .set_epsilon(1e-2) \
                .execs({arg.src, arg.map_xy, arg.dst}); \
    }
    cb(dtype::BFloat16(), float_rng);
#undef cb
}
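// NHWC layout with 8-bit integer data, again with tolerance 1 for rounding.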
TEST_F(CUDA, REMAP_NHWC_INT) {
    Checker<Remap> checker(handle_cuda());
    std::vector<TestArg> args = get_nhwc_args();
    UniformIntRNG uint8_rng(0, 255);
    UniformIntRNG int8_rng(-128, 127);

#define cb(data_type, data_rng) \
    for (auto arg : args) { \
        UniformFloatRNG map_rng( \
                -2, std::max(arg.map_xy.shape[2], arg.map_xy.shape[1]) + 2); \
        checker.set_dtype(0, data_type) \
                .set_dtype(1, dtype::Float32()) \
                .set_dtype(2, data_type) \
                .set_rng(0, &data_rng) \
                .set_rng(1, &map_rng) \
                .set_rng(2, &data_rng) \
                .set_epsilon(1) \
                .set_param(arg.param) \
                .execs({arg.src, arg.map_xy, arg.dst}); \
    }
    cb(dtype::Int8(), int8_rng);
    cb(dtype::Uint8(), uint8_rng);
#undef cb
}
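// Gradient w.r.t. the source tensor: the inputs are the coordinate map and
// the output gradient, and the result is the src-shaped input gradient,
// hence the reordered operands passed to execs.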
TEST_F(CUDA, REMAP_BACKWARD_DATA) {
    Checker<RemapBackwardData> checker(handle_cuda());
    std::vector<TestArg> args = get_nchw_args();
    UniformFloatRNG float_rng(0, 255);

#define cb(data_type, data_rng) \
    for (auto arg : args) { \
        UniformFloatRNG map_rng( \
                -2, std::max(arg.map_xy.shape[2], arg.map_xy.shape[1]) + 2); \
        checker.set_dtype(1, data_type) \
                .set_dtype(0, dtype::Float32()) \
                .set_dtype(2, data_type) \
                .set_rng(1, &data_rng) \
                .set_rng(0, &map_rng) \
                .set_rng(2, &data_rng) \
                .set_param(arg.param) \
                .execs({arg.map_xy, arg.dst, arg.src}); \
    }
    cb(dtype::Float32(), float_rng);
#undef cb

#define cb(data_type, data_rng) \
    for (auto arg : args) { \
        UniformFloatRNG map_rng( \
                -2, std::max(arg.map_xy.shape[2], arg.map_xy.shape[1]) + 2); \
        checker.set_dtype(1, data_type) \
                .set_dtype(0, dtype::Float32()) \
                .set_dtype(2, data_type) \
                .set_rng(1, &data_rng) \
                .set_rng(0, &map_rng) \
                .set_rng(2, &data_rng) \
                .set_param(arg.param) \
                .set_epsilon(1e-1) \
                .execs({arg.map_xy, arg.dst, arg.src}); \
    }
    cb(dtype::BFloat16(), float_rng);
#undef cb
}
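// Gradient w.r.t. the coordinate map: execs takes {src, map_xy, output
// gradient, map-shaped gradient}. Interpolation-weight gradients are
// compared with looser tolerances (2e-2 for float32, 1e-1 for bfloat16).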
TEST_F(CUDA, REMAP_BACKWARD_MAT) {
    Checker<RemapBackwardMat> checker(handle_cuda());
    std::vector<TestArg> args = get_nchw_args();
    UniformFloatRNG float_rng(0, 255);

#define cb(data_type, data_rng) \
    for (auto arg : args) { \
        UniformFloatRNG map_rng( \
                -2, std::max(arg.map_xy.shape[2], arg.map_xy.shape[1]) + 2); \
        checker.set_dtype(0, data_type) \
                .set_dtype(1, dtype::Float32()) \
                .set_dtype(2, data_type) \
                .set_dtype(3, dtype::Float32()) \
                .set_rng(0, &data_rng) \
                .set_rng(1, &map_rng) \
                .set_rng(2, &data_rng) \
                .set_rng(3, &map_rng) \
                .set_param(arg.param) \
                .set_epsilon(2e-2) \
                .execs({arg.src, arg.map_xy, arg.dst, arg.map_xy}); \
    }
    cb(dtype::Float32(), float_rng);
#undef cb

#define cb(data_type, data_rng) \
    for (auto arg : args) { \
        UniformFloatRNG map_rng( \
                -2, std::max(arg.map_xy.shape[2], arg.map_xy.shape[1]) + 2); \
        checker.set_dtype(0, data_type) \
                .set_dtype(1, dtype::Float32()) \
                .set_dtype(2, data_type) \
                .set_dtype(3, dtype::Float32()) \
                .set_rng(0, &data_rng) \
                .set_rng(1, &map_rng) \
                .set_rng(2, &data_rng) \
                .set_rng(3, &map_rng) \
                .set_param(arg.param) \
                .set_epsilon(1e-1) \
                .execs({arg.src, arg.map_xy, arg.dst, arg.map_xy}); \
    }
    cb(dtype::BFloat16(), float_rng);
#undef cb
}

#if MEGDNN_WITH_BENCHMARK
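// Benchmark the forward remap: a naive CPU reference handle vs. the CUDA
// implementation, reporting per-run time and effective memory bandwidth for
// several dtypes, layouts, and border modes.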
TEST_F(CUDA, BENCHMARK_REMAP) {
    using Param = param::Remap;
    auto run = [&](const TensorShapeArray& shapes, Param param, DType dtype) {
        auto handle_cpu = create_cpu_handle(2);
        Benchmarker<Remap> benchmarker_naive(handle_cpu.get());
        CUBenchmarker<Remap> benchmarker_cuda(handle_cuda());
        UniformIntRNG rng(0, 0xff);
        UniformFloatRNG map_rng(
                -2, std::max(shapes[1].shape[1], shapes[1].shape[2]) + 2);
        benchmarker_naive.set_rng(0, &rng);
        benchmarker_cuda.set_rng(0, &rng);
        benchmarker_naive.set_rng(1, &map_rng);
        benchmarker_cuda.set_rng(1, &map_rng);
        benchmarker_naive.set_rng(2, &rng);
        benchmarker_cuda.set_rng(2, &rng);
        benchmarker_naive.set_dtype(1, dtype::Float32());
        benchmarker_cuda.set_dtype(1, dtype::Float32());
        benchmarker_naive.set_dtype(0, dtype).set_dtype(2, dtype);
        benchmarker_cuda.set_dtype(0, dtype).set_dtype(2, dtype);
        size_t RUN = 10;
        auto t1 = benchmarker_naive.set_display(false)
                          .set_times(RUN)
                          .set_param(param)
                          .execs(shapes);
        auto t2 = benchmarker_cuda.set_display(false).set_param(param).execs(
                shapes);
        int size = 0;
        if (dtype == dtype::Float32{}) {
            size = sizeof(float);
            printf("float32: ");
        } else if (dtype == dtype::Float16{}) {
            size = sizeof(dt_float16);
            printf("float16: ");
        } else if (dtype == dtype::Int8{}) {
            size = sizeof(dt_int8);
            printf("int8: ");
        } else if (dtype == dtype::Uint8{}) {
            size = sizeof(dt_uint8);
            printf("uint8: ");
        }
        const TensorShape map_xy = shapes[1];
        const TensorShape dst_layout = shapes[2];
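        // Traffic model: each output element reads 4 source elements
        // (bilinear taps) and writes 1 element of `size` bytes, and the
        // float32 map_xy is read once; divide by 1024^3 to get GB.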
        float calc_amount = (dst_layout.total_nr_elems() * (4.f + 1.f) * size +
                             map_xy.total_nr_elems() * sizeof(float)) /
                            (1024 * 1024 * 1024);
        printf("naive={%.3fms, %.3fGBPS}, "
               "cuda={%.3fms, %.3fGBPS}\n",
               t1 / RUN, calc_amount / (t1 / RUN) * 1e3, t2,
               calc_amount / t2 * 1e3);
    };
    Param param;
    param.imode = param::Remap::InterpolationMode::LINEAR;
    param.format = param::Remap::Format::NHWC;
    param.border_type = param::Remap::BorderMode::CONSTANT;
    run({{4, 200, 300, 10}, {4, 200, 300, 2}, {4, 200, 300, 10}}, param,
        dtype::Float32{});
    run({{4, 200, 300, 10}, {4, 200, 300, 2}, {4, 200, 300, 10}}, param,
        dtype::Float16{});
    run({{4, 200, 300, 10}, {4, 200, 300, 2}, {4, 200, 300, 10}}, param,
        dtype::Uint8{});
    run({{4, 200, 300, 10}, {4, 200, 300, 2}, {4, 200, 300, 10}}, param,
        dtype::Int8{});
    param.border_type = param::Remap::BorderMode::REPLICATE;
    run({{4, 200, 300, 10}, {4, 200, 300, 2}, {4, 200, 300, 10}}, param,
        dtype::Float32{});
    run({{4, 200, 300, 10}, {4, 200, 300, 2}, {4, 200, 300, 10}}, param,
        dtype::Float16{});
    run({{4, 200, 300, 10}, {4, 200, 300, 2}, {4, 200, 300, 10}}, param,
        dtype::Uint8{});
    run({{4, 200, 300, 10}, {4, 200, 300, 2}, {4, 200, 300, 10}}, param,
        dtype::Int8{});
    param.format = param::Remap::Format::NCHW;
    param.border_type = param::Remap::BorderMode::CONSTANT;
    run({{4, 10, 200, 300}, {4, 200, 300, 2}, {4, 10, 200, 300}}, param,
        dtype::Float32{});
    run({{4, 10, 200, 300}, {4, 200, 300, 2}, {4, 10, 200, 300}}, param,
        dtype::Float16{});
    run({{4, 10, 200, 300}, {4, 200, 300, 2}, {4, 10, 200, 300}}, param,
        dtype::Uint8{});
    run({{4, 10, 200, 300}, {4, 200, 300, 2}, {4, 10, 200, 300}}, param,
        dtype::Int8{});
    param.border_type = param::Remap::BorderMode::REPLICATE;
    run({{4, 10, 200, 300}, {4, 200, 300, 2}, {4, 10, 200, 300}}, param,
        dtype::Float32{});
    run({{4, 10, 200, 300}, {4, 200, 300, 2}, {4, 10, 200, 300}}, param,
        dtype::Float16{});
    run({{4, 10, 200, 300}, {4, 200, 300, 2}, {4, 10, 200, 300}}, param,
        dtype::Uint8{});
    run({{4, 10, 200, 300}, {4, 200, 300, 2}, {4, 10, 200, 300}}, param,
        dtype::Int8{});
}
#endif

}  // namespace remap
}  // namespace test
}  // namespace megdnn

// vim: syntax=cpp.doxygen

The MegEngine installation package ships with the CUDA environment needed to run code on a GPU, so there is no separate CPU or GPU build to choose between. To run GPU programs, make sure the machine actually has a GPU and that the driver is installed. If you would like to try deep-learning development on a cloud GPU platform, visit the MegStudio platform.
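As a quick sanity check before running the CUDA tests above, a minimal standalone sketch can query the driver for visible devices via the standard CUDA runtime API. Only cudaGetDeviceCount, cudaGetDeviceProperties, and cudaGetErrorString are used; the program itself is illustrative and not part of the test suite.

#include <cstdio>
#include <cuda_runtime.h>

int main() {
    int count = 0;
    cudaError_t err = cudaGetDeviceCount(&count);
    if (err != cudaSuccess || count == 0) {
        // Either the driver is missing/too old or no GPU is present.
        std::printf("no usable CUDA device (%s)\n", cudaGetErrorString(err));
        return 1;
    }
    for (int i = 0; i < count; ++i) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, i);
        std::printf("device %d: %s (compute capability %d.%d)\n", i,
                    prop.name, prop.major, prop.minor);
    }
    return 0;
}

If this reports at least one device, the CUDA tests in this file should be able to acquire a handle via handle_cuda().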