You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

elemwise.cpp 13 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332
  1. /**
  2. * \file dnn/test/cuda/elemwise.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. */
  11. #include "test/common/elemwise.h"
  12. #include "test/cuda/fixture.h"
  13. #include "megdnn/oprs.h"
  14. #include "test/common/tensor.h"
  15. #include "test/common/rng.h"
  16. #include "./utils.h"
  17. #include "test/common/benchmarker.h"
  18. #include "test/common/checker.h"
  19. #include <cudnn.h>
  20. #include <cuda_profiler_api.h>
  21. using namespace megdnn;
  22. using namespace test;
  23. #define cudnn_check(e) megdnn_assert((e) == CUDNN_STATUS_SUCCESS)
namespace {

//! Build a float32 cudnn tensor descriptor describing \p ly.
//!
//! \p ly must be contiguous with 1 <= ndim <= 4. cudnn requires tensors to
//! be at-least 4D, so missing trailing dims/strides are padded with 1.
//! The caller owns the returned descriptor and must release it with
//! cudnnDestroyTensorDescriptor().
__attribute__((unused))
cudnnTensorDescriptor_t make_cudnn_tensor_desc(const TensorLayout &ly) {
    megdnn_assert(ly.ndim && ly.ndim <= 4 && ly.is_contiguous());
    int dim[4] = {1, 1, 1, 1}, stride[4] = {1, 1, 1, 1};
    for (size_t i = 0; i < ly.ndim; ++ i) {
        dim[i] = ly.shape[i];
        stride[i] = ly.stride[i];
    }
    cudnnTensorDescriptor_t ret;
    cudnn_check(cudnnCreateTensorDescriptor(&ret));
    // cudnn requires tensors to be at-least 4D
    cudnn_check(cudnnSetTensor4dDescriptorEx(ret,
                CUDNN_DATA_FLOAT,
                dim[0], dim[1], dim[2], dim[3],
                stride[0], stride[1], stride[2], stride[3]));
    return ret;
}

//! Compute c = a + b twice so both kernels show up under a profiler:
//! first via cudnnAddTensor, then via megdnn ElemwiseForward(ADD). Each run
//! is bracketed by cudaProfilerStart/Stop. The cudnn result is discarded
//! (c is memset to 0) so only the megdnn exec determines the final value
//! of \p c that callers verify.
void run_tensor_add(
        Handle *handle_cuda,
        const TensorND &a, const TensorND &b,
        const TensorND &c) {
#if 1
    cudnnHandle_t cudnn_handle;
    cudnn_check(cudnnCreate(&cudnn_handle));
    cuda_check(cudaDeviceSynchronize());
    // cudnnAddTensor computes c = alpha*b + beta*c, so seed c with a first
    cuda_check(cudaMemcpy(c.raw_ptr, a.raw_ptr, a.layout.span().dist_byte(),
                cudaMemcpyDeviceToDevice));
    auto bdesc = make_cudnn_tensor_desc(b.layout),
         cdesc = make_cudnn_tensor_desc(c.layout);
    float alpha = 1, beta = 1;
    cudaProfilerStart();
    cudnn_check(cudnnAddTensor(cudnn_handle,
                &alpha, bdesc, b.raw_ptr,
                &beta, cdesc, c.raw_ptr));
    cudaProfilerStop();
    cudnn_check(cudnnDestroyTensorDescriptor(cdesc));
    cudnn_check(cudnnDestroyTensorDescriptor(bdesc));
    cudnn_check(cudnnDestroy(cudnn_handle));
    // wipe the cudnn result so correctness is checked against megdnn only
    cuda_check(cudaMemset(c.raw_ptr, 0, c.layout.span().dist_byte()));
    cuda_check(cudaDeviceSynchronize());
#endif
    auto opr = handle_cuda->create_operator<ElemwiseForward>();
    opr->param().mode = ElemwiseForward::Mode::ADD;
    cudaProfilerStart();
    opr->exec({a, b}, c);
    cudaProfilerStop();
}

} // anonymous namespace
//! Typed test fixture: runs the shared elemwise test suite on the CUDA
//! handle once for every type tag listed in elemwise::test_types.
template<typename tag>
class CUDA_ELEMWISE: public CUDA {
};
TYPED_TEST_CASE(CUDA_ELEMWISE, elemwise::test_types);
TYPED_TEST(CUDA_ELEMWISE, run) {
    elemwise::run_test<TypeParam>(this->handle_cuda());
}
  80. TEST_F(CUDA, ELEMWISE_IBYTE) {
  81. Checker<ElemwiseForward> checker(handle_cuda());
  82. using Mode = ElemwiseForward::Param::Mode;
  83. UniformIntRNG i_rng{-128, 127};
  84. UniformIntRNG ui_rng{0, 255};
  85. checker.set_rng(0, &i_rng);
  86. auto run_unary = [&](size_t N, Mode mode, DType dtype) {
  87. checker.set_param(mode).set_dtype(0, dtype);
  88. checker.execs({{N}, {}});
  89. };
  90. #define RUN_UNARY_IBYTE(_dt) \
  91. run_unary(100, Mode::RELU, _dt); \
  92. run_unary(100, Mode::ABS, _dt);
  93. RUN_UNARY_IBYTE(dtype::Int8());
  94. checker.set_rng(0, &i_rng);
  95. RUN_UNARY_IBYTE(dtype::Uint8());
  96. #undef RUN_UNARY_IBYTE
  97. auto run_binary = [&](size_t N, size_t C, size_t H, size_t W, Mode mode,
  98. DType dtype) {
  99. checker.set_param(mode).set_dtype(0, dtype).set_dtype(1, dtype);
  100. checker.execs({{5}, {5}, {}});
  101. checker.execs({{4}, {4}, {}});
  102. checker.execs({{4}, {1}, {}});
  103. checker.execs({{N, C / 4, H, W, 4}, {N, C / 4, H, W, 4}, {}});
  104. checker.execs({{N, C / 4, H, W, 4}, {1, C / 4, 1, 1, 4}, {}});
  105. checker.execs({{N, C / 32, H, W, 32}, {N, C / 32, H, W, 32}, {}});
  106. checker.execs({{N, C / 32, H, W, 32}, {1, C / 32, 1, 1, 32}, {}});
  107. checker.execs({{3, 5, 7}, {3, 5, 7}, {}});
  108. checker.execs({{3, 5, 7}, {3, 5, 1}, {}});
  109. checker.execs({{3, 5, 1}, {3, 5, 7}, {}});
  110. checker.execs({{1}, {3, 5, 7}, {}});
  111. checker.execs({{3, 5, 7}, {1}, {}});
  112. };
  113. #define RUN_BINARY_IBYTE(_dt) \
  114. run_binary(4, 32, 10, 10, Mode::ADD, _dt); \
  115. run_binary(4, 32, 10, 10, Mode::MUL, _dt); \
  116. run_binary(4, 32, 10, 10, Mode::MAX, _dt); \
  117. run_binary(4, 32, 10, 10, Mode::MIN, _dt); \
  118. run_binary(4, 32, 10, 10, Mode::SUB, _dt);
  119. checker.set_rng(0, &i_rng).set_rng(1, &i_rng);
  120. RUN_BINARY_IBYTE(dtype::Int8());
  121. checker.set_rng(0, &ui_rng).set_rng(1, &ui_rng);
  122. RUN_BINARY_IBYTE(dtype::Uint8());
  123. #undef RUN_BINARY_IBYTE
  124. auto run_ternary = [&](size_t N, size_t C, size_t H, size_t W, Mode mode,
  125. DType dtype) {
  126. checker.set_param(mode)
  127. .set_dtype(0, dtype)
  128. .set_dtype(1, dtype)
  129. .set_dtype(2, dtype);
  130. checker.execs({{5}, {5}, {5}, {}});
  131. checker.execs({{4}, {4}, {1}, {}});
  132. checker.execs({{N, C / 4, H, W, 4},
  133. {N, C / 4, H, W, 4},
  134. {N, C / 4, H, W, 4},
  135. {}});
  136. checker.execs({{N, C / 4, H, W, 4},
  137. {1, C / 4, 1, 1, 4},
  138. {1, C / 4, 1, 1, 4},
  139. {}});
  140. checker.execs({{N, C / 32, H, W, 32},
  141. {N, C / 32, H, W, 32},
  142. {N, C / 32, H, W, 32},
  143. {}});
  144. checker.execs({{N, C / 32, H, W, 32},
  145. {1, C / 32, 1, 1, 32},
  146. {1, C / 32, 1, 1, 32},
  147. {}});
  148. checker.execs({{1}, {3, 5, 7}, {3, 5, 7}, {}});
  149. checker.execs({{3, 5, 7}, {3, 5, 1}, {3, 5, 1}, {}});
  150. checker.execs({{3, 5, 1}, {3, 5, 7}, {3, 5, 1}, {}});
  151. checker.execs({{1}, {3, 5, 7}, {1}, {}});
  152. checker.execs({{3, 5, 7}, {1}, {3, 5, 7}, {}});
  153. };
  154. #define RUN_TERNARY_IBYTE(_dt) \
  155. run_ternary(4, 32, 10, 10, Mode::FUSE_MUL_ADD3, _dt);
  156. checker.set_rng(0, &i_rng).set_rng(1, &i_rng);
  157. RUN_TERNARY_IBYTE(dtype::Int8());
  158. checker.set_rng(0, &ui_rng).set_rng(1, &ui_rng);
  159. RUN_TERNARY_IBYTE(dtype::Uint8());
  160. #undef RUN_TERNARY_IBYTE
  161. }
  162. //! the memory of this test case is too large, sometimes will fail on tx1
  163. TEST_F(CUDA, ELEMWISE_BENCHMARK_DENSE) {
  164. constexpr size_t A = 256 * 1024 * 64,
  165. S0 = 16, S1 = 256, S2 = 64, S3 = 64;
  166. static_assert(A == S0 * S1 * S2 * S3, "bad value");
  167. SyncedTensor<>
  168. t0(handle_cuda(), {TensorShape{S0, S1, S2, S3}, dtype::Float32()}),
  169. t1(handle_cuda(), {TensorShape{S0, S1, S2, S3}, dtype::Float32()});
  170. UniformFloatRNG rng{-2.f, 2.f};
  171. rng.gen(t0.tensornd_host());
  172. run_tensor_add(handle_cuda(),
  173. t0.tensornd_dev(), t0.tensornd_dev(), t1.tensornd_dev());
  174. auto p0 = t0.ptr_host(), p1 = t1.ptr_host();
  175. for (size_t i = 0; i < A; ++ i) {
  176. ASSERT_EQ(p0[i] + p0[i], p1[i]) << "at index " << i << "/" << A;
  177. }
  178. }
  179. #if MEGDNN_WITH_BENCHMARK
  180. TEST_F(CUDA, ELEMWISE_BENCHMARK_BCAST_101) {
  181. constexpr size_t A = 511, B = 509, C0 = 23, C1 = 23, C = C0 * C1;
  182. SyncedTensor<>
  183. t0(handle_cuda(), {TensorShape{A, B, C0, C1}, dtype::Float32()}),
  184. t1(handle_cuda(), {TensorShape{1, B, 1, 1}, dtype::Float32()}),
  185. t2(handle_cuda(), {TensorShape{A, B, C0, C1}, dtype::Float32()});
  186. UniformFloatRNG rng{-2.f, 2.f};
  187. rng.gen(t0.tensornd_host());
  188. rng.gen(t1.tensornd_host());
  189. run_tensor_add(handle_cuda(),
  190. t0.tensornd_dev(), t1.tensornd_dev(), t2.tensornd_dev());
  191. auto p0 = t0.ptr_host(), p1 = t1.ptr_host(), p2 = t2.ptr_host();
  192. for (size_t i = 0; i < A; ++ i) {
  193. for (size_t j = 0; j < B; ++ j) {
  194. for (size_t k = 0; k < C; ++ k) {
  195. auto off = i * B * C + j * C + k;
  196. ASSERT_EQ(p0[off] + p1[j], p2[off]);
  197. }
  198. }
  199. }
  200. }
  201. TEST_F(CUDA, ELEMWISE_BENCHMARK_BCAST_10) {
  202. constexpr size_t A = 11583, B = 11587;
  203. SyncedTensor<> t0(handle_cuda(), {TensorShape{A, B}, dtype::Float32()}),
  204. t1(handle_cuda(), {TensorShape{1, B}, dtype::Float32()}),
  205. t2(handle_cuda(), {TensorShape{A, B}, dtype::Float32()});
  206. UniformFloatRNG rng{-2.f, 2.f};
  207. rng.gen(t0.tensornd_host());
  208. rng.gen(t1.tensornd_host());
  209. run_tensor_add(handle_cuda(),
  210. t0.tensornd_dev(), t1.tensornd_dev(), t2.tensornd_dev());
  211. auto p0 = t0.ptr_host(), p1 = t1.ptr_host(), p2 = t2.ptr_host();
  212. for (size_t i = 0; i < A; ++ i) {
  213. for (size_t j = 0; j < B; ++ j) {
  214. auto off = i * B + j;
  215. ASSERT_EQ(p0[off] + p1[j], p2[off]);
  216. }
  217. }
  218. }
  219. TEST_F(CUDA, ELEMWISE_BENCHMARK_BCAST_01) {
  220. constexpr size_t A = 11583, B = 11587;
  221. SyncedTensor<> t0(handle_cuda(), {TensorShape{1, A, B}, dtype::Float32()}),
  222. t1(handle_cuda(), {TensorShape{1, A, 1}, dtype::Float32()}),
  223. t2(handle_cuda(), {TensorShape{1, A, B}, dtype::Float32()});
  224. UniformFloatRNG rng{-2.f, 2.f};
  225. rng.gen(t0.tensornd_host());
  226. rng.gen(t1.tensornd_host());
  227. run_tensor_add(handle_cuda(),
  228. t0.tensornd_dev(), t1.tensornd_dev(), t2.tensornd_dev());
  229. auto p0 = t0.ptr_host(), p1 = t1.ptr_host(), p2 = t2.ptr_host();
  230. for (size_t i = 0; i < A; ++ i) {
  231. for (size_t j = 0; j < B; ++ j) {
  232. auto off = i * B + j;
  233. ASSERT_EQ(p0[off] + p1[i], p2[off]);
  234. }
  235. }
  236. }
  237. TEST_F(CUDA, BENCHMARK_ELEMWISE_IBYTE) {
  238. Benchmarker<ElemwiseForward> bencher(handle_cuda());
  239. using Mode = ElemwiseForward::Param::Mode;
  240. auto run_bench = [&](size_t N, size_t C, size_t H, size_t W) {
  241. size_t nr_times = 100;
  242. bencher.set_times(nr_times)
  243. .set_param(Mode::FUSE_ADD_RELU)
  244. .set_dtype(0, dtype::Int8())
  245. .set_dtype(1, dtype::Int8());
  246. auto time = bencher.execs({{N * C * H * W + 1}, {N * C * H * W + 1}, {}}) /
  247. nr_times;
  248. printf("time = %.2fms, bandwidth = %.2fGB/s\n", time,
  249. (3.0 * (N * C * H * W + 1)) / (time * 1e6));
  250. time = bencher.execs({{N, C / 4, H, W, 4}, {N, C / 4, H, W, 4}, {}}) /
  251. nr_times;
  252. printf("time = %.2fms, bandwidth = %.2fGB/s\n", time,
  253. (3.0 * N * C * H * W) / (time * 1e6));
  254. time = bencher.execs({{N, C / 4, H, W, 4}, {1, C / 4, 1, 1, 4}, {}}) /
  255. nr_times;
  256. printf("time = %.2fms, bandwidth = %.2fGB/s\n", time,
  257. (C + 2.0 * N * C * H * W) / (time * 1e6));
  258. time = bencher.execs({{N, C / 4, H, W, 4}, {1}, {}}) / nr_times;
  259. printf("time = %.2fms, bandwidth = %.2fGB/s\n", time,
  260. (2.0 * N * C * H * W + 1) / (time * 1e6));
  261. time = bencher.execs(
  262. {{N, C / 32, H, W, 32}, {N, C / 32, H, W, 32}, {}}) /
  263. nr_times;
  264. printf("time = %.2fms, bandwidth = %.2fGB/s\n", time,
  265. (3.0 * N * C * H * W) / (time * 1e6));
  266. time = bencher.execs(
  267. {{N, C / 32, H, W, 32}, {1, C / 32, 1, 1, 32}, {}}) /
  268. nr_times;
  269. printf("time = %.2fms, bandwidth = %.2fGB/s\n", time,
  270. (C + 2.0 * N * C * H * W) / (time * 1e6));
  271. bencher.set_dtype(0, dtype::Float32()).set_dtype(1, dtype::Float32());
  272. time = bencher.execs({{N, C / 4, H, W}, {N, C / 4, H, W}, {}}) /
  273. nr_times;
  274. printf("time = %.2fms, bandwidth = %.2fGB/s\n", time,
  275. (3.0 * N * C * H * W) / (time * 1e6));
  276. time = bencher.execs({{N, C / 4, H, W}, {1, C / 4, 1, 1}, {}}) /
  277. nr_times;
  278. printf("time = %.2fms, bandwidth = %.2fGB/s\n", time,
  279. (C + 2.0 * N * C * H * W) / (time * 1e6));
  280. };
  281. run_bench(256, 256, 56, 56);
  282. }
  283. TEST_F(CUDA, BENCHMARK_ELEMWISE_MIN_MAX) {
  284. Benchmarker<ElemwiseForward> bencher(handle_cuda());
  285. using Mode = ElemwiseForward::Param::Mode;
  286. UniformIntRNG const_1{1, 1}, rng{-128, 127};
  287. auto run_bench = [&](size_t N, size_t C, size_t H, size_t W, DType dtype) {
  288. size_t nr_times = 1000;
  289. bencher.set_times(nr_times)
  290. .set_param(Mode::MIN)
  291. .set_rng(0, &rng)
  292. .set_rng(1, &rng)
  293. .set_dtype(0, dtype)
  294. .set_dtype(1, dtype);
  295. auto time =
  296. bencher.execs({{N, C / 4, H, W, 4}, {N, C / 4, H, W, 4}, {}}) /
  297. nr_times;
  298. printf("time = %.2fms, bandwidth = %.2fGB/s\n", time,
  299. (3.0 * N * C * H * W) / (time * 1e6));
  300. bencher.set_param(Mode::MAX).set_rng(0, &const_1).set_rng(1, &const_1);
  301. time = bencher.execs({{N, C / 4, H, W, 4}, {N, C / 4, H, W, 4}, {}}) /
  302. nr_times;
  303. printf("time = %.2fms, bandwidth = %.2fGB/s\n", time,
  304. (3.0 * N * C * H * W) / (time * 1e6));
  305. };
  306. run_bench(256, 256, 56, 56, dtype::Int8());
  307. }
  308. #endif
  309. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台