
elemwise.cpp

#include "test/common/elemwise.h"
#include "./utils.h"
#include "megdnn/oprs.h"
#include "test/common/benchmarker.h"
#include "test/common/checker.h"
#include "test/common/rng.h"
#include "test/common/tensor.h"
#include "test/cuda/fixture.h"

#include <cuda_profiler_api.h>
#include <cudnn.h>

using namespace megdnn;
using namespace test;

#define cudnn_check(e) megdnn_assert((e) == CUDNN_STATUS_SUCCESS)

namespace {
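
// Wrap a contiguous megdnn TensorLayout (ndim <= 4) in a cuDNN tensor
// descriptor; missing trailing dimensions are padded with 1.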
__attribute__((unused)) cudnnTensorDescriptor_t make_cudnn_tensor_desc(
        const TensorLayout& ly) {
    megdnn_assert(ly.ndim && ly.ndim <= 4 && ly.is_contiguous());
    int dim[4] = {1, 1, 1, 1}, stride[4] = {1, 1, 1, 1};
    for (size_t i = 0; i < ly.ndim; ++i) {
        dim[i] = ly.shape[i];
        stride[i] = ly.stride[i];
    }
    cudnnTensorDescriptor_t ret;
    cudnn_check(cudnnCreateTensorDescriptor(&ret));
    // cudnn requires tensors to be at least 4D
    cudnn_check(cudnnSetTensor4dDescriptorEx(
            ret, CUDNN_DATA_FLOAT, dim[0], dim[1], dim[2], dim[3], stride[0], stride[1],
            stride[2], stride[3]));
    return ret;
}
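
// Compute c = a + b. The cuDNN path (copy a into c, then cudnnAddTensor with b)
// is run first purely for profiling comparison; c is then cleared and the
// megdnn ElemwiseForward ADD operator produces the actual result.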
void run_tensor_add(
        Handle* handle_cuda, const TensorND& a, const TensorND& b, const TensorND& c) {
#if 1
    cudnnHandle_t cudnn_handle;
    cudnn_check(cudnnCreate(&cudnn_handle));
    cuda_check(cudaDeviceSynchronize());
    cuda_check(cudaMemcpy(
            c.raw_ptr(), a.raw_ptr(), a.layout.span().dist_byte(),
            cudaMemcpyDeviceToDevice));
    auto bdesc = make_cudnn_tensor_desc(b.layout),
         cdesc = make_cudnn_tensor_desc(c.layout);
    float alpha = 1, beta = 1;
    cudaProfilerStart();
    cudnn_check(cudnnAddTensor(
            cudnn_handle, &alpha, bdesc, b.raw_ptr(), &beta, cdesc, c.raw_ptr()));
    cudaProfilerStop();
    cudnn_check(cudnnDestroyTensorDescriptor(cdesc));
    cudnn_check(cudnnDestroyTensorDescriptor(bdesc));
    cudnn_check(cudnnDestroy(cudnn_handle));
    cuda_check(cudaMemset(c.raw_ptr(), 0, c.layout.span().dist_byte()));
    cuda_check(cudaDeviceSynchronize());
#endif
    auto opr = handle_cuda->create_operator<ElemwiseForward>();
    opr->param().mode = ElemwiseForward::Mode::ADD;
    cudaProfilerStart();
    opr->exec({a, b}, c);
    cudaProfilerStop();
}

}  // anonymous namespace
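
// Instantiate the shared elementwise test suite from test/common/elemwise.h
// for each tag in elemwise::test_types on the CUDA handle.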
template <typename tag>
class CUDA_ELEMWISE : public CUDA {};
TYPED_TEST_CASE(CUDA_ELEMWISE, elemwise::test_types);
TYPED_TEST(CUDA_ELEMWISE, run) {
    elemwise::run_test<TypeParam>(this->handle_cuda());
}
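
// int8/uint8 coverage: unary (RELU, ABS), binary (ADD/MUL/MAX/MIN/SUB) and
// ternary (FUSE_MUL_ADD3) modes over contiguous, NCHW4/NCHW32-like and
// broadcast layouts.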
TEST_F(CUDA, ELEMWISE_IBYTE) {
    Checker<ElemwiseForward> checker(handle_cuda());
    using Mode = ElemwiseForward::Param::Mode;
    UniformIntRNG i_rng{-128, 127};
    UniformIntRNG ui_rng{0, 255};
    checker.set_rng(0, &i_rng);
    auto run_unary = [&](size_t N, Mode mode, DType dtype) {
        checker.set_param(mode).set_dtype(0, dtype);
        checker.execs({{N}, {}});
    };
#define RUN_UNARY_IBYTE(_dt)         \
    run_unary(100, Mode::RELU, _dt); \
    run_unary(100, Mode::ABS, _dt);
    RUN_UNARY_IBYTE(dtype::Int8());
    checker.set_rng(0, &ui_rng);
    RUN_UNARY_IBYTE(dtype::Uint8());
#undef RUN_UNARY_IBYTE
    auto run_binary = [&](size_t N, size_t C, size_t H, size_t W, Mode mode,
                          DType dtype) {
        checker.set_param(mode).set_dtype(0, dtype).set_dtype(1, dtype);
        checker.execs({{5}, {5}, {}});
        checker.execs({{4}, {4}, {}});
        checker.execs({{4}, {1}, {}});
        checker.execs({{N, C / 4, H, W, 4}, {N, C / 4, H, W, 4}, {}});
        checker.execs({{N, C / 4, H, W, 4}, {1, C / 4, 1, 1, 4}, {}});
        checker.execs({{N, C / 32, H, W, 32}, {N, C / 32, H, W, 32}, {}});
        checker.execs({{N, C / 32, H, W, 32}, {1, C / 32, 1, 1, 32}, {}});
        checker.execs({{3, 5, 7}, {3, 5, 7}, {}});
        checker.execs({{3, 5, 7}, {3, 5, 1}, {}});
        checker.execs({{3, 5, 1}, {3, 5, 7}, {}});
        checker.execs({{1}, {3, 5, 7}, {}});
        checker.execs({{3, 5, 7}, {1}, {}});
    };
#define RUN_BINARY_IBYTE(_dt)                  \
    run_binary(4, 32, 10, 10, Mode::ADD, _dt); \
    run_binary(4, 32, 10, 10, Mode::MUL, _dt); \
    run_binary(4, 32, 10, 10, Mode::MAX, _dt); \
    run_binary(4, 32, 10, 10, Mode::MIN, _dt); \
    run_binary(4, 32, 10, 10, Mode::SUB, _dt);
    checker.set_rng(0, &i_rng).set_rng(1, &i_rng);
    RUN_BINARY_IBYTE(dtype::Int8());
    checker.set_rng(0, &ui_rng).set_rng(1, &ui_rng);
    RUN_BINARY_IBYTE(dtype::Uint8());
#undef RUN_BINARY_IBYTE
    auto run_ternary = [&](size_t N, size_t C, size_t H, size_t W, Mode mode,
                           DType dtype) {
        checker.set_param(mode).set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(
                2, dtype);
        checker.execs({{5}, {5}, {5}, {}});
        checker.execs({{4}, {4}, {1}, {}});
        checker.execs(
                {{N, C / 4, H, W, 4}, {N, C / 4, H, W, 4}, {N, C / 4, H, W, 4}, {}});
        checker.execs(
                {{N, C / 4, H, W, 4}, {1, C / 4, 1, 1, 4}, {1, C / 4, 1, 1, 4}, {}});
        checker.execs(
                {{N, C / 32, H, W, 32},
                 {N, C / 32, H, W, 32},
                 {N, C / 32, H, W, 32},
                 {}});
        checker.execs(
                {{N, C / 32, H, W, 32},
                 {1, C / 32, 1, 1, 32},
                 {1, C / 32, 1, 1, 32},
                 {}});
        checker.execs({{1}, {3, 5, 7}, {3, 5, 7}, {}});
        checker.execs({{3, 5, 7}, {3, 5, 1}, {3, 5, 1}, {}});
        checker.execs({{3, 5, 1}, {3, 5, 7}, {3, 5, 1}, {}});
        checker.execs({{1}, {3, 5, 7}, {1}, {}});
        checker.execs({{3, 5, 7}, {1}, {3, 5, 7}, {}});
    };
#define RUN_TERNARY_IBYTE(_dt) run_ternary(4, 32, 10, 10, Mode::FUSE_MUL_ADD3, _dt);
    checker.set_rng(0, &i_rng).set_rng(1, &i_rng);
    RUN_TERNARY_IBYTE(dtype::Int8());
    checker.set_rng(0, &ui_rng).set_rng(1, &ui_rng);
    RUN_TERNARY_IBYTE(dtype::Uint8());
#undef RUN_TERNARY_IBYTE
}
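
// BFloat16 versions of the generic float unary/binary/ternary cases, run with
// a relaxed epsilon (1e-2) to absorb the reduced mantissa precision.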
// from common/elemwise.cpp
TEST_F(CUDA, ELEMWISE_BFLOAT16) {
    using Mode = ElemwiseForward::Param::Mode;
    Checker<ElemwiseForward> checker(handle_cuda());

    // unary
#define UNARY_TEST_CASE(_optr)                            \
    checker.set_param(Mode::_optr).execs({{1, 127}, {}}); \
    checker.set_param(Mode::_optr).execs({{1, 7}, {}});

#define BUILD_UNARY_TEST_CASE_FLOAT \
    UNARY_TEST_CASE(ABS)            \
    UNARY_TEST_CASE(LOG)            \
    UNARY_TEST_CASE(COS)            \
    UNARY_TEST_CASE(SIN)            \
    UNARY_TEST_CASE(FLOOR)          \
    UNARY_TEST_CASE(CEIL)           \
    UNARY_TEST_CASE(SIGMOID)        \
    UNARY_TEST_CASE(EXP)            \
    UNARY_TEST_CASE(TANH)           \
    UNARY_TEST_CASE(FAST_TANH)      \
    UNARY_TEST_CASE(RELU)           \
    UNARY_TEST_CASE(ROUND)

    checker.set_dtype(0, dtype::BFloat16());
    checker.set_dtype(1, dtype::BFloat16());
    UniformFloatRNG rng0(1e-2, 6e1);
    checker.set_rng(0, &rng0);
    checker.set_epsilon(1e-2);
    BUILD_UNARY_TEST_CASE_FLOAT

#undef UNARY_TEST_CASE
#undef BUILD_UNARY_TEST_CASE_FLOAT

    // binary
#define BINARY_COMPLATE_TEST_CASE(_optr)                                    \
    checker.set_param(Mode::_optr).execs({{3, 4, 7}, {3, 4, 7}, {}});       \
    checker.set_param(Mode::_optr).execs({{3, 4, 5, 7}, {1, 4, 1, 1}, {}}); \
    checker.set_param(Mode::_optr).execs({{1, 4, 1, 1}, {3, 4, 5, 7}, {}}); \
    checker.set_param(Mode::_optr).execs({{3, 4, 7}, {1, 4, 1}, {}});       \
    checker.set_param(Mode::_optr).execs({{1, 4, 1}, {3, 4, 7}, {}});       \
    checker.set_param(Mode::_optr).execs({{3, 4, 5, 7}, {1, 1, 1, 1}, {}}); \
    checker.set_param(Mode::_optr).execs({{1, 1, 1, 1}, {3, 4, 5, 7}, {}}); \
    checker.set_param(Mode::_optr).execs({{1, 7}, {1, 7}, {}});             \
    checker.set_param(Mode::_optr).execs({{1, 2, 2}, {1, 2, 1}, {}});       \
    checker.set_param(Mode::_optr).execs({{1, 2, 1}, {1, 2, 2}, {}});       \
    checker.set_param(Mode::_optr).execs({{1, 2, 2}, {1, 1, 1}, {}});       \
    checker.set_param(Mode::_optr).execs({{1, 1, 1}, {1, 2, 2}, {}});       \
    checker.set_param(Mode::_optr).execs({{3, 4, 1}, {3, 4, 1}, {}});

#define BUILD_BINARY_COMPLATE_TEST_CASE \
    BINARY_COMPLATE_TEST_CASE(ADD)      \
    BINARY_COMPLATE_TEST_CASE(MUL)      \
    BINARY_COMPLATE_TEST_CASE(MAX)      \
    BINARY_COMPLATE_TEST_CASE(MIN)      \
    BINARY_COMPLATE_TEST_CASE(SUB)

    UniformFloatRNG rng1(1e-5, 7e1);
    checker.set_rng(0, &rng1);
    checker.set_epsilon(1e-2);
    checker.set_dtype(0, dtype::BFloat16());
    checker.set_dtype(1, dtype::BFloat16());
    BUILD_BINARY_COMPLATE_TEST_CASE

#undef BINARY_COMPLATE_TEST_CASE
#undef BUILD_BINARY_COMPLATE_TEST_CASE

    // ternary
#define TERNARY_COMPLATE_TEST_CASE(_optr)                                        \
    checker.set_param(Mode::_optr).execs({{3, 4, 7}, {3, 4, 7}, {3, 4, 7}, {}}); \
    checker.set_param(Mode::_optr)                                               \
            .execs({{1, 4, 1, 1}, {3, 4, 5, 7}, {1, 4, 1, 1}, {}});              \
    checker.set_param(Mode::_optr).execs({{1, 4, 1}, {3, 4, 7}, {1, 4, 1}, {}}); \
    checker.set_param(Mode::_optr)                                               \
            .execs({{3, 4, 5, 7}, {3, 4, 5, 7}, {1, 1, 1, 1}, {}});              \
    checker.set_param(Mode::_optr).execs({{1, 7}, {1, 7}, {1, 7}, {}});          \
    checker.set_param(Mode::_optr).execs({{1, 2, 1}, {1, 2, 2}, {1, 2, 1}, {}}); \
    checker.set_param(Mode::_optr).execs({{1, 2, 2}, {1, 2, 2}, {1, 1, 1}, {}}); \
    checker.set_param(Mode::_optr).execs({{3, 4, 1}, {3, 4, 1}, {3, 4, 1}, {}});

#define BUILD_TERNARY_COMPLATE_TEST_CASE TERNARY_COMPLATE_TEST_CASE(FUSE_MUL_ADD3)

    UniformFloatRNG rng2(1e-5, 7e1);
    checker.set_rng(0, &rng2);
    checker.set_epsilon(1e-2);
    checker.set_dtype(0, dtype::BFloat16());
    checker.set_dtype(1, dtype::BFloat16());
    checker.set_dtype(2, dtype::BFloat16());
    BUILD_TERNARY_COMPLATE_TEST_CASE

#undef TERNARY_COMPLATE_TEST_CASE
#undef BUILD_TERNARY_COMPLATE_TEST_CASE
}
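
// Broadcast-add with the first input reused as the output tensor; the result
// is verified element by element against a reference computed on the host.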
TEST_F(CUDA, ELEMWISE_ADD_BCAST_10_INT8_INPLACE) {
    constexpr size_t A = 2, B = 48, C0 = 14, C1 = 14, C = C0 * C1;
    SyncedTensor<dt_int8> t0(handle_cuda(), {TensorShape{A, B, C0, C1}, dtype::Int8()}),
            t1(handle_cuda(), {TensorShape{1, B, C0, C1}, dtype::Int8()}),
            t2(handle_cuda(), {TensorShape{A, B, C0, C1}, dtype::Int8()});
    UniformIntRNG rng{-128, 127};
    rng.gen(t0.tensornd_host());
    rng.gen(t1.tensornd_host());
    auto p0 = t0.ptr_host(), p1 = t1.ptr_host();
    auto p2 = t2.ptr_mutable_host();
    for (size_t i = 0; i < A; ++i) {
        for (size_t j = 0; j < B; ++j) {
            for (size_t k = 0; k < C; ++k) {
                auto off0 = j * C + k;
                auto off1 = i * B * C + j * C + k;
                p2[off1] = p0[off1] + p1[off0];
            }
        }
    }
    auto opr = handle_cuda()->create_operator<ElemwiseForward>();
    opr->param().mode = ElemwiseForward::Mode::ADD;
    opr->exec({t0.tensornd_dev(), t1.tensornd_dev()}, t0.tensornd_dev());
    auto pt = t0.ptr_host();
    for (size_t i = 0; i < A; ++i) {
        for (size_t j = 0; j < B; ++j) {
            for (size_t k = 0; k < C; ++k) {
                auto off = i * B * C + j * C + k;
                ASSERT_EQ(pt[off], p2[off]);
            }
        }
    }
}
//! this test case uses a large amount of memory and sometimes fails on TX1
TEST_F(CUDA, ELEMWISE_BENCHMARK_DENSE) {
    constexpr size_t A = 256 * 1024 * 64, S0 = 16, S1 = 256, S2 = 64, S3 = 64;
    static_assert(A == S0 * S1 * S2 * S3, "bad value");
    SyncedTensor<> t0(handle_cuda(), {TensorShape{S0, S1, S2, S3}, dtype::Float32()}),
            t1(handle_cuda(), {TensorShape{S0, S1, S2, S3}, dtype::Float32()});
    UniformFloatRNG rng{-2.f, 2.f};
    rng.gen(t0.tensornd_host());
    run_tensor_add(
            handle_cuda(), t0.tensornd_dev(), t0.tensornd_dev(), t1.tensornd_dev());
    auto p0 = t0.ptr_host(), p1 = t1.ptr_host();
    for (size_t i = 0; i < A; ++i) {
        ASSERT_EQ(p0[i] + p0[i], p1[i]) << "at index " << i << "/" << A;
    }
}
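
// The tests below are compiled only with MEGDNN_WITH_BENCHMARK. The BCAST_*
// cases reuse run_tensor_add and check the result against a host reference;
// the remaining cases report timings and effective bandwidth via Benchmarker.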
#if MEGDNN_WITH_BENCHMARK
TEST_F(CUDA, ELEMWISE_BENCHMARK_BCAST_101) {
    constexpr size_t A = 511, B = 509, C0 = 23, C1 = 23, C = C0 * C1;
    SyncedTensor<> t0(handle_cuda(), {TensorShape{A, B, C0, C1}, dtype::Float32()}),
            t1(handle_cuda(), {TensorShape{1, B, 1, 1}, dtype::Float32()}),
            t2(handle_cuda(), {TensorShape{A, B, C0, C1}, dtype::Float32()});
    UniformFloatRNG rng{-2.f, 2.f};
    rng.gen(t0.tensornd_host());
    rng.gen(t1.tensornd_host());
    run_tensor_add(
            handle_cuda(), t0.tensornd_dev(), t1.tensornd_dev(), t2.tensornd_dev());
    auto p0 = t0.ptr_host(), p1 = t1.ptr_host(), p2 = t2.ptr_host();
    for (size_t i = 0; i < A; ++i) {
        for (size_t j = 0; j < B; ++j) {
            for (size_t k = 0; k < C; ++k) {
                auto off = i * B * C + j * C + k;
                ASSERT_EQ(p0[off] + p1[j], p2[off]);
            }
        }
    }
}
TEST_F(CUDA, ELEMWISE_BENCHMARK_BCAST_10) {
    constexpr size_t A = 11583, B = 11587;
    SyncedTensor<> t0(handle_cuda(), {TensorShape{A, B}, dtype::Float32()}),
            t1(handle_cuda(), {TensorShape{1, B}, dtype::Float32()}),
            t2(handle_cuda(), {TensorShape{A, B}, dtype::Float32()});
    UniformFloatRNG rng{-2.f, 2.f};
    rng.gen(t0.tensornd_host());
    rng.gen(t1.tensornd_host());
    run_tensor_add(
            handle_cuda(), t0.tensornd_dev(), t1.tensornd_dev(), t2.tensornd_dev());
    auto p0 = t0.ptr_host(), p1 = t1.ptr_host(), p2 = t2.ptr_host();
    for (size_t i = 0; i < A; ++i) {
        for (size_t j = 0; j < B; ++j) {
            auto off = i * B + j;
            ASSERT_EQ(p0[off] + p1[j], p2[off]);
        }
    }
}
TEST_F(CUDA, ELEMWISE_BENCHMARK_BCAST_01) {
    constexpr size_t A = 11583, B = 11587;
    SyncedTensor<> t0(handle_cuda(), {TensorShape{1, A, B}, dtype::Float32()}),
            t1(handle_cuda(), {TensorShape{1, A, 1}, dtype::Float32()}),
            t2(handle_cuda(), {TensorShape{1, A, B}, dtype::Float32()});
    UniformFloatRNG rng{-2.f, 2.f};
    rng.gen(t0.tensornd_host());
    rng.gen(t1.tensornd_host());
    run_tensor_add(
            handle_cuda(), t0.tensornd_dev(), t1.tensornd_dev(), t2.tensornd_dev());
    auto p0 = t0.ptr_host(), p1 = t1.ptr_host(), p2 = t2.ptr_host();
    for (size_t i = 0; i < A; ++i) {
        for (size_t j = 0; j < B; ++j) {
            auto off = i * B + j;
            ASSERT_EQ(p0[off] + p1[i], p2[off]);
        }
    }
}
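
// FUSE_ADD_RELU bandwidth benchmark for int8 layouts. The reported bandwidth
// is (bytes read + bytes written) / time: with the time in milliseconds,
// bytes / (time * 1e6) yields GB/s.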
TEST_F(CUDA, BENCHMARK_ELEMWISE_IBYTE) {
    Benchmarker<ElemwiseForward> bencher(handle_cuda());
    using Mode = ElemwiseForward::Param::Mode;
    auto run_bench = [&](size_t N, size_t C, size_t H, size_t W) {
        size_t nr_times = 100;
        bencher.set_times(nr_times)
                .set_param(Mode::FUSE_ADD_RELU)
                .set_dtype(0, dtype::Int8())
                .set_dtype(1, dtype::Int8());
        auto time = bencher.execs({{N * C * H * W + 1}, {N * C * H * W + 1}, {}}) /
                    nr_times;
        printf("time = %.2fms, bandwidth = %.2fGB/s\n", time,
               (3.0 * (N * C * H * W + 1)) / (time * 1e6));
        time = bencher.execs({{N, C / 4, H, W, 4}, {N, C / 4, H, W, 4}, {}}) / nr_times;
        printf("time = %.2fms, bandwidth = %.2fGB/s\n", time,
               (3.0 * N * C * H * W) / (time * 1e6));
        time = bencher.execs({{N, C / 4, H, W, 4}, {1, C / 4, 1, 1, 4}, {}}) / nr_times;
        printf("time = %.2fms, bandwidth = %.2fGB/s\n", time,
               (C + 2.0 * N * C * H * W) / (time * 1e6));
        time = bencher.execs({{N, C / 4, H, W, 4}, {1}, {}}) / nr_times;
        printf("time = %.2fms, bandwidth = %.2fGB/s\n", time,
               (2.0 * N * C * H * W + 1) / (time * 1e6));
        time = bencher.execs({{N, C / 32, H, W, 32}, {N, C / 32, H, W, 32}, {}}) /
               nr_times;
        printf("time = %.2fms, bandwidth = %.2fGB/s\n", time,
               (3.0 * N * C * H * W) / (time * 1e6));
        time = bencher.execs({{N, C / 32, H, W, 32}, {1, C / 32, 1, 1, 32}, {}}) /
               nr_times;
        printf("time = %.2fms, bandwidth = %.2fGB/s\n", time,
               (C + 2.0 * N * C * H * W) / (time * 1e6));
        bencher.set_dtype(0, dtype::Float32()).set_dtype(1, dtype::Float32());
        time = bencher.execs({{N, C / 4, H, W}, {N, C / 4, H, W}, {}}) / nr_times;
        printf("time = %.2fms, bandwidth = %.2fGB/s\n", time,
               (3.0 * N * C * H * W) / (time * 1e6));
        time = bencher.execs({{N, C / 4, H, W}, {1, C / 4, 1, 1}, {}}) / nr_times;
        printf("time = %.2fms, bandwidth = %.2fGB/s\n", time,
               (C + 2.0 * N * C * H * W) / (time * 1e6));
    };
    run_bench(256, 256, 56, 56);
}
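
// Benchmark MIN on random int8 data and MAX on constant-1 data over the same
// NCHW4-like shape; both report effective bandwidth.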
TEST_F(CUDA, BENCHMARK_ELEMWISE_MIN_MAX) {
    Benchmarker<ElemwiseForward> bencher(handle_cuda());
    using Mode = ElemwiseForward::Param::Mode;
    UniformIntRNG const_1{1, 1}, rng{-128, 127};
    auto run_bench = [&](size_t N, size_t C, size_t H, size_t W, DType dtype) {
        size_t nr_times = 1000;
        bencher.set_times(nr_times)
                .set_param(Mode::MIN)
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_dtype(0, dtype)
                .set_dtype(1, dtype);
        auto time = bencher.execs({{N, C / 4, H, W, 4}, {N, C / 4, H, W, 4}, {}}) /
                    nr_times;
        printf("time = %.2fms, bandwidth = %.2fGB/s\n", time,
               (3.0 * N * C * H * W) / (time * 1e6));
        bencher.set_param(Mode::MAX).set_rng(0, &const_1).set_rng(1, &const_1);
        time = bencher.execs({{N, C / 4, H, W, 4}, {N, C / 4, H, W, 4}, {}}) / nr_times;
        printf("time = %.2fms, bandwidth = %.2fGB/s\n", time,
               (3.0 * N * C * H * W) / (time * 1e6));
    };
    run_bench(256, 256, 56, 56, dtype::Int8());
}
#endif

// vim: syntax=cpp.doxygen