You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

remap.cpp 17 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317
  1. #include "test/common/remap.h"
  2. #include "test/common/benchmarker.h"
  3. #include "test/common/checker.h"
  4. #include "test/common/rng.h"
  5. #include "test/cuda/benchmark.h"
  6. #include "test/cuda/fixture.h"
  7. namespace megdnn {
  8. namespace test {
  9. namespace remap {
  10. TEST_F(CUDA, REMAP_NCHW_FLOAT) {
  11. Checker<Remap> checker(handle_cuda());
  12. std::vector<TestArg> args = get_nchw_args();
  13. UniformFloatRNG float_rng(0, 255);
  14. #define cb(data_type, data_rng) \
  15. for (auto arg : args) { \
  16. UniformFloatRNG map_rng( \
  17. -2, std::max(arg.map_xy.shape[2], arg.map_xy.shape[1]) + 2); \
  18. checker.set_dtype(0, data_type) \
  19. .set_dtype(1, dtype::Float32()) \
  20. .set_dtype(2, data_type) \
  21. .set_rng(0, &data_rng) \
  22. .set_rng(1, &map_rng) \
  23. .set_rng(2, &data_rng) \
  24. .set_param(arg.param) \
  25. .execs({arg.src, arg.map_xy, arg.dst}); \
  26. }
  27. cb(dtype::Float32(), float_rng);
  28. cb(dtype::Float16(), float_rng);
  29. #undef cb
  30. #define cb(data_type, data_rng) \
  31. for (auto arg : args) { \
  32. UniformFloatRNG map_rng( \
  33. -2, std::max(arg.map_xy.shape[2], arg.map_xy.shape[1]) + 2); \
  34. checker.set_dtype(0, data_type) \
  35. .set_dtype(1, dtype::Float32()) \
  36. .set_dtype(2, data_type) \
  37. .set_rng(0, &data_rng) \
  38. .set_rng(1, &map_rng) \
  39. .set_rng(2, &data_rng) \
  40. .set_param(arg.param) \
  41. .set_epsilon(1e-2) \
  42. .execs({arg.src, arg.map_xy, arg.dst}); \
  43. }
  44. cb(dtype::BFloat16(), float_rng);
  45. #undef cb
  46. }
  47. TEST_F(CUDA, REMAP_NCHW_INT) {
  48. Checker<Remap> checker(handle_cuda());
  49. std::vector<TestArg> args = get_nchw_args();
  50. UniformIntRNG uint8_rng(0, 255);
  51. UniformIntRNG int8_rng(-128, 127);
  52. #define cb(data_type, data_rng) \
  53. for (auto arg : args) { \
  54. UniformFloatRNG map_rng( \
  55. -2, std::max(arg.map_xy.shape[2], arg.map_xy.shape[1]) + 2); \
  56. checker.set_dtype(0, data_type) \
  57. .set_dtype(1, dtype::Float32()) \
  58. .set_dtype(2, data_type) \
  59. .set_rng(0, &data_rng) \
  60. .set_rng(1, &map_rng) \
  61. .set_rng(2, &data_rng) \
  62. .set_epsilon(1) \
  63. .set_param(arg.param) \
  64. .execs({arg.src, arg.map_xy, arg.dst}); \
  65. }
  66. cb(dtype::Int8(), int8_rng);
  67. cb(dtype::Uint8(), uint8_rng);
  68. #undef cb
  69. }
  70. TEST_F(CUDA, REMAP_NHWC_FLOAT) {
  71. Checker<Remap> checker(handle_cuda());
  72. std::vector<TestArg> args = get_nhwc_args();
  73. UniformFloatRNG float_rng(0, 255);
  74. #define cb(data_type, data_rng) \
  75. for (auto arg : args) { \
  76. UniformFloatRNG map_rng( \
  77. -2, std::max(arg.map_xy.shape[2], arg.map_xy.shape[1]) + 2); \
  78. checker.set_dtype(0, data_type) \
  79. .set_dtype(1, dtype::Float32()) \
  80. .set_dtype(2, data_type) \
  81. .set_rng(0, &data_rng) \
  82. .set_rng(1, &map_rng) \
  83. .set_rng(2, &data_rng) \
  84. .set_param(arg.param) \
  85. .execs({arg.src, arg.map_xy, arg.dst}); \
  86. }
  87. cb(dtype::Float32(), float_rng);
  88. cb(dtype::Float16(), float_rng);
  89. #undef cb
  90. #define cb(data_type, data_rng) \
  91. for (auto arg : args) { \
  92. UniformFloatRNG map_rng( \
  93. -2, std::max(arg.map_xy.shape[2], arg.map_xy.shape[1]) + 2); \
  94. checker.set_dtype(0, data_type) \
  95. .set_dtype(1, dtype::Float32()) \
  96. .set_dtype(2, data_type) \
  97. .set_rng(0, &data_rng) \
  98. .set_rng(1, &map_rng) \
  99. .set_rng(2, &data_rng) \
  100. .set_param(arg.param) \
  101. .set_epsilon(1e-2) \
  102. .execs({arg.src, arg.map_xy, arg.dst}); \
  103. }
  104. cb(dtype::BFloat16(), float_rng);
  105. #undef cb
  106. }
  107. TEST_F(CUDA, REMAP_NHWC_INT) {
  108. Checker<Remap> checker(handle_cuda());
  109. std::vector<TestArg> args = get_nhwc_args();
  110. UniformIntRNG uint8_rng(0, 255);
  111. UniformIntRNG int8_rng(-128, 127);
  112. #define cb(data_type, data_rng) \
  113. for (auto arg : args) { \
  114. UniformFloatRNG map_rng( \
  115. -2, std::max(arg.map_xy.shape[2], arg.map_xy.shape[1]) + 2); \
  116. checker.set_dtype(0, data_type) \
  117. .set_dtype(1, dtype::Float32()) \
  118. .set_dtype(2, data_type) \
  119. .set_rng(0, &data_rng) \
  120. .set_rng(1, &map_rng) \
  121. .set_rng(2, &data_rng) \
  122. .set_epsilon(1) \
  123. .set_param(arg.param) \
  124. .execs({arg.src, arg.map_xy, arg.dst}); \
  125. }
  126. cb(dtype::Int8(), int8_rng);
  127. cb(dtype::Uint8(), uint8_rng);
  128. #undef cb
  129. }
  130. TEST_F(CUDA, REMAP_BACKWARD_DATA) {
  131. Checker<RemapBackwardData> checker(handle_cuda());
  132. std::vector<TestArg> args = get_nchw_args();
  133. UniformFloatRNG float_rng(0, 255);
  134. #define cb(data_type, data_rng) \
  135. for (auto arg : args) { \
  136. UniformFloatRNG map_rng( \
  137. -2, std::max(arg.map_xy.shape[2], arg.map_xy.shape[1]) + 2); \
  138. checker.set_dtype(1, data_type) \
  139. .set_dtype(0, dtype::Float32()) \
  140. .set_dtype(2, data_type) \
  141. .set_rng(1, &data_rng) \
  142. .set_rng(0, &map_rng) \
  143. .set_rng(2, &data_rng) \
  144. .set_param(arg.param) \
  145. .execs({arg.map_xy, arg.dst, arg.src}); \
  146. }
  147. cb(dtype::Float32(), float_rng);
  148. #undef cb
  149. #define cb(data_type, data_rng) \
  150. for (auto arg : args) { \
  151. UniformFloatRNG map_rng( \
  152. -2, std::max(arg.map_xy.shape[2], arg.map_xy.shape[1]) + 2); \
  153. checker.set_dtype(1, data_type) \
  154. .set_dtype(0, dtype::Float32()) \
  155. .set_dtype(2, data_type) \
  156. .set_rng(1, &data_rng) \
  157. .set_rng(0, &map_rng) \
  158. .set_rng(2, &data_rng) \
  159. .set_param(arg.param) \
  160. .set_epsilon(1e-1) \
  161. .execs({arg.map_xy, arg.dst, arg.src}); \
  162. }
  163. cb(dtype::BFloat16(), float_rng);
  164. cb(dtype::Float16(), float_rng);
  165. #undef cb
  166. }
  167. TEST_F(CUDA, REMAP_BACKWARD_MAT) {
  168. Checker<RemapBackwardMat> checker(handle_cuda());
  169. std::vector<TestArg> args = get_nchw_args();
  170. UniformFloatRNG float_rng(0, 255);
  171. #define cb(data_type, data_rng) \
  172. for (auto arg : args) { \
  173. UniformFloatRNG map_rng( \
  174. -2, std::max(arg.map_xy.shape[2], arg.map_xy.shape[1]) + 2); \
  175. checker.set_dtype(0, data_type) \
  176. .set_dtype(1, dtype::Float32()) \
  177. .set_dtype(2, data_type) \
  178. .set_dtype(3, dtype::Float32()) \
  179. .set_rng(0, &data_rng) \
  180. .set_rng(1, &map_rng) \
  181. .set_rng(2, &data_rng) \
  182. .set_rng(3, &map_rng) \
  183. .set_param(arg.param) \
  184. .set_epsilon(2e-2) \
  185. .execs({arg.src, arg.map_xy, arg.dst, arg.map_xy}); \
  186. }
  187. cb(dtype::Float32(), float_rng);
  188. #undef cb
  189. #define cb(data_type, data_rng) \
  190. for (auto arg : args) { \
  191. UniformFloatRNG map_rng( \
  192. -2, std::max(arg.map_xy.shape[2], arg.map_xy.shape[1]) + 2); \
  193. checker.set_dtype(0, data_type) \
  194. .set_dtype(1, dtype::Float32()) \
  195. .set_dtype(2, data_type) \
  196. .set_dtype(3, dtype::Float32()) \
  197. .set_rng(0, &data_rng) \
  198. .set_rng(1, &map_rng) \
  199. .set_rng(2, &data_rng) \
  200. .set_rng(3, &map_rng) \
  201. .set_param(arg.param) \
  202. .set_epsilon(1e-1) \
  203. .execs({arg.src, arg.map_xy, arg.dst, arg.map_xy}); \
  204. }
  205. cb(dtype::BFloat16(), float_rng);
  206. cb(dtype::Float16(), float_rng);
  207. #undef cb
  208. }
  209. #if MEGDNN_WITH_BENCHMARK
  210. TEST_F(CUDA, BENCHMARK_REMAP) {
  211. using Param = param::Remap;
  212. auto run = [&](const TensorShapeArray& shapes, Param param, DType dtype) {
  213. auto handle_cpu = create_cpu_handle(2);
  214. Benchmarker<Remap> benchmarker_naive(handle_cpu.get());
  215. CUBenchmarker<Remap> benchmarker_cuda(handle_cuda());
  216. UniformIntRNG rng(0, 0xff);
  217. UniformFloatRNG map_rng(
  218. -2, std::max(shapes[1].shape[1], shapes[1].shape[2]) + 2);
  219. benchmarker_naive.set_rng(0, &rng);
  220. benchmarker_cuda.set_rng(0, &rng);
  221. benchmarker_naive.set_rng(1, &map_rng);
  222. benchmarker_cuda.set_rng(1, &map_rng);
  223. benchmarker_naive.set_rng(2, &rng);
  224. benchmarker_cuda.set_rng(2, &rng);
  225. benchmarker_naive.set_dtype(1, dtype::Float32());
  226. benchmarker_cuda.set_dtype(1, dtype::Float32());
  227. benchmarker_naive.set_dtype(0, dtype).set_dtype(2, dtype);
  228. benchmarker_cuda.set_dtype(0, dtype).set_dtype(2, dtype);
  229. size_t RUN = 10;
  230. auto t1 = benchmarker_naive.set_display(false)
  231. .set_times(RUN)
  232. .set_param(param)
  233. .execs(shapes);
  234. auto t2 = benchmarker_cuda.set_display(false).set_param(param).execs(shapes);
  235. int size = 0;
  236. if (dtype == dtype::Float32{}) {
  237. size = sizeof(float);
  238. printf("float32: ");
  239. } else if (dtype == dtype::Float16{}) {
  240. size = sizeof(dt_float16);
  241. printf("float16: ");
  242. } else if (dtype == dtype::Int8{}) {
  243. size = sizeof(dt_int8);
  244. printf("int8: ");
  245. } else if (dtype == dtype::Uint8{}) {
  246. size = sizeof(dt_uint8);
  247. printf("uint8: ");
  248. }
  249. const TensorShape map_xy = shapes[1];
  250. const TensorShape dst_layout = shapes[2];
  251. float calc_amount = (dst_layout.total_nr_elems() * (4.f + 1.f) * size +
  252. map_xy.total_nr_elems() * sizeof(float)) /
  253. (1024 * 1024 * 1024);
  254. printf("naive={%.3fms, %.3fGBPS}, "
  255. "cuda={%.3fms, %.3fGBPS}\n",
  256. t1 / RUN, calc_amount / (t1 / RUN) * 1e3, t2, calc_amount / t2 * 1e3);
  257. };
  258. Param param;
  259. param.imode = param::Remap::InterpolationMode::LINEAR;
  260. param.format = param::Remap::Format::NHWC;
  261. param.border_type = param::Remap::BorderMode::CONSTANT;
  262. run({{4, 200, 300, 10}, {4, 200, 300, 2}, {4, 200, 300, 10}}, param,
  263. dtype::Float32{});
  264. run({{4, 200, 300, 10}, {4, 200, 300, 2}, {4, 200, 300, 10}}, param,
  265. dtype::Float16{});
  266. run({{4, 200, 300, 10}, {4, 200, 300, 2}, {4, 200, 300, 10}}, param,
  267. dtype::Uint8{});
  268. run({{4, 200, 300, 10}, {4, 200, 300, 2}, {4, 200, 300, 10}}, param, dtype::Int8{});
  269. param.border_type = param::Remap::BorderMode::REPLICATE;
  270. run({{4, 200, 300, 10}, {4, 200, 300, 2}, {4, 200, 300, 10}}, param,
  271. dtype::Float32{});
  272. run({{4, 200, 300, 10}, {4, 200, 300, 2}, {4, 200, 300, 10}}, param,
  273. dtype::Float16{});
  274. run({{4, 200, 300, 10}, {4, 200, 300, 2}, {4, 200, 300, 10}}, param,
  275. dtype::Uint8{});
  276. run({{4, 200, 300, 10}, {4, 200, 300, 2}, {4, 200, 300, 10}}, param, dtype::Int8{});
  277. param.format = param::Remap::Format::NCHW;
  278. param.border_type = param::Remap::BorderMode::CONSTANT;
  279. run({{4, 10, 200, 300}, {4, 200, 300, 2}, {4, 10, 200, 300}}, param,
  280. dtype::Float32{});
  281. run({{4, 10, 200, 300}, {4, 200, 300, 2}, {4, 10, 200, 300}}, param,
  282. dtype::Float16{});
  283. run({{4, 10, 200, 300}, {4, 200, 300, 2}, {4, 10, 200, 300}}, param,
  284. dtype::Uint8{});
  285. run({{4, 10, 200, 300}, {4, 200, 300, 2}, {4, 10, 200, 300}}, param, dtype::Int8{});
  286. param.border_type = param::Remap::BorderMode::REPLICATE;
  287. run({{4, 10, 200, 300}, {4, 200, 300, 2}, {4, 10, 200, 300}}, param,
  288. dtype::Float32{});
  289. run({{4, 10, 200, 300}, {4, 200, 300, 2}, {4, 10, 200, 300}}, param,
  290. dtype::Float16{});
  291. run({{4, 10, 200, 300}, {4, 200, 300, 2}, {4, 10, 200, 300}}, param,
  292. dtype::Uint8{});
  293. run({{4, 10, 200, 300}, {4, 200, 300, 2}, {4, 10, 200, 300}}, param, dtype::Int8{});
  294. }
  295. #endif
  296. } // namespace remap
  297. } // namespace test
  298. } // namespace megdnn
  299. // vim: syntax=cpp.doxygen