You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

elemwise.cpp 6.0 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161
  1. #include "hcc_detail/hcc_defs_prologue.h"
  2. #include "megdnn/oprs.h"
  3. #include "test/common/elemwise.h"
  4. #include "test/common/rng.h"
  5. #include "test/common/tensor.h"
  6. #include "test/rocm/fixture.h"
  7. #include "hip_header.h"
  8. #include "src/rocm/miopen_with_check.h"
  9. #include "test/rocm/benchmarker.h"
  10. using namespace megdnn;
  11. using namespace test;
  12. namespace {
  13. void run_tensor_add(
  14. Handle* handle_rocm, const TensorND& a, const TensorND& b, const TensorND& c) {
  15. auto opr = handle_rocm->create_operator<ElemwiseForward>();
  16. opr->param().mode = ElemwiseForward::Mode::ADD;
  17. hipProfilerStart();
  18. opr->exec({a, b}, c);
  19. hipProfilerStop();
  20. }
  21. using Mode = ElemwiseForward::Mode;
  22. template <Mode mode>
  23. void run_elemwise_benchmark(
  24. Handle* handle_rocm, Handle* handle_naive, TensorShapeArray shapes,
  25. DType dtype) {
  26. auto benchmarker = ROCMBenchmarker<ElemwiseForward>(handle_rocm, handle_naive);
  27. benchmarker.set_display(true);
  28. ElemwiseForward::Param param;
  29. param.mode = mode;
  30. benchmarker.set_param(param);
  31. TensorShape dst_shp;
  32. ElemwiseForward::deduce_shape(shapes, dst_shp);
  33. shapes.push_back(dst_shp);
  34. for (size_t i = 0; i < shapes.size(); i++) {
  35. benchmarker.set_dtype(i, dtype);
  36. }
  37. float io = 0.f;
  38. for (auto&& shp : shapes) {
  39. io += 1.f * shp.total_nr_elems() * dtype.size();
  40. }
  41. auto time_ms = benchmarker.execs(shapes);
  42. printf("io = %.3f GB, bandwidth = %.3f GB/s\n", io / 1e9, io / (1e6 * time_ms));
  43. }
  44. } // anonymous namespace
  45. template <typename tag>
  46. class ROCM_ELEMWISE : public ROCM {};
  47. TYPED_TEST_CASE(ROCM_ELEMWISE, elemwise::test_types);
  48. TYPED_TEST(ROCM_ELEMWISE, run) {
  49. elemwise::run_test<TypeParam>(this->handle_rocm());
  50. }
  51. //! the memory of this test case is too large, sometimes will fail on tx1
  52. TEST_F(ROCM, ELEMWISE_BENCHMARK_DENSE) {
  53. constexpr size_t A = 1024 * 1024 * 64, S0 = 64, S1 = 256, S2 = 64, S3 = 64;
  54. static_assert(A == S0 * S1 * S2 * S3, "bad value");
  55. SyncedTensor<> t0(handle_rocm(), {TensorShape{S0, S1, S2, S3}, dtype::Float32()}),
  56. t1(handle_rocm(), {TensorShape{S0, S1, S2, S3}, dtype::Float32()});
  57. UniformFloatRNG rng{-2.f, 2.f};
  58. rng.gen(t0.tensornd_host());
  59. run_tensor_add(
  60. handle_rocm(), t0.tensornd_dev(), t0.tensornd_dev(), t1.tensornd_dev());
  61. auto p0 = t0.ptr_host(), p1 = t1.ptr_host();
  62. for (size_t i = 0; i < A; ++i) {
  63. ASSERT_EQ(p0[i] + p0[i], p1[i]) << "at index " << i << "/" << A;
  64. }
  65. }
  66. #if MEGDNN_WITH_BENCHMARK
  67. TEST_F(ROCM, ELEMWISE_BENCHMARK_BCAST_101) {
  68. constexpr size_t A = 511, B = 509, C0 = 23, C1 = 23, C = C0 * C1;
  69. SyncedTensor<> t0(handle_rocm(), {TensorShape{A, B, C0, C1}, dtype::Float32()}),
  70. t1(handle_rocm(), {TensorShape{1, B, 1, 1}, dtype::Float32()}),
  71. t2(handle_rocm(), {TensorShape{A, B, C0, C1}, dtype::Float32()});
  72. UniformFloatRNG rng{-2.f, 2.f};
  73. rng.gen(t0.tensornd_host());
  74. rng.gen(t1.tensornd_host());
  75. run_tensor_add(
  76. handle_rocm(), t0.tensornd_dev(), t1.tensornd_dev(), t2.tensornd_dev());
  77. auto p0 = t0.ptr_host(), p1 = t1.ptr_host(), p2 = t2.ptr_host();
  78. for (size_t i = 0; i < A; ++i) {
  79. for (size_t j = 0; j < B; ++j) {
  80. for (size_t k = 0; k < C; ++k) {
  81. auto off = i * B * C + j * C + k;
  82. ASSERT_EQ(p0[off] + p1[j], p2[off]);
  83. }
  84. }
  85. }
  86. }
  87. TEST_F(ROCM, ELEMWISE_BENCHMARK_BCAST_10) {
  88. constexpr size_t A = 11583, B = 11587;
  89. SyncedTensor<> t0(handle_rocm(), {TensorShape{A, B}, dtype::Float32()}),
  90. t1(handle_rocm(), {TensorShape{1, B}, dtype::Float32()}),
  91. t2(handle_rocm(), {TensorShape{A, B}, dtype::Float32()});
  92. UniformFloatRNG rng{-2.f, 2.f};
  93. rng.gen(t0.tensornd_host());
  94. rng.gen(t1.tensornd_host());
  95. run_tensor_add(
  96. handle_rocm(), t0.tensornd_dev(), t1.tensornd_dev(), t2.tensornd_dev());
  97. auto p0 = t0.ptr_host(), p1 = t1.ptr_host(), p2 = t2.ptr_host();
  98. for (size_t i = 0; i < A; ++i) {
  99. for (size_t j = 0; j < B; ++j) {
  100. auto off = i * B + j;
  101. ASSERT_EQ(p0[off] + p1[j], p2[off]);
  102. }
  103. }
  104. }
  105. TEST_F(ROCM, ELEMWISE_BENCHMARK_BCAST_01) {
  106. constexpr size_t A = 11583, B = 11587;
  107. SyncedTensor<> t0(handle_rocm(), {TensorShape{1, A, B}, dtype::Float32()}),
  108. t1(handle_rocm(), {TensorShape{1, A, 1}, dtype::Float32()}),
  109. t2(handle_rocm(), {TensorShape{1, A, B}, dtype::Float32()});
  110. UniformFloatRNG rng{-2.f, 2.f};
  111. rng.gen(t0.tensornd_host());
  112. rng.gen(t1.tensornd_host());
  113. run_tensor_add(
  114. handle_rocm(), t0.tensornd_dev(), t1.tensornd_dev(), t2.tensornd_dev());
  115. auto p0 = t0.ptr_host(), p1 = t1.ptr_host(), p2 = t2.ptr_host();
  116. for (size_t i = 0; i < A; ++i) {
  117. for (size_t j = 0; j < B; ++j) {
  118. auto off = i * B + j;
  119. ASSERT_EQ(p0[off] + p1[i], p2[off]);
  120. }
  121. }
  122. }
  123. TEST_F(ROCM, ELEMWISE_BENCHMARK) {
  124. using Mode = ElemwiseForward::Mode;
  125. run_elemwise_benchmark<Mode::ADD>(
  126. handle_rocm(), handle_naive(false), {{32, 128, 56, 56}, {32, 128, 56, 56}},
  127. dtype::Float32());
  128. run_elemwise_benchmark<Mode::ADD>(
  129. handle_rocm(), handle_naive(false), {{32, 128, 56, 56}, {1, 128, 1, 1}},
  130. dtype::Float32());
  131. run_elemwise_benchmark<Mode::FUSE_ADD_RELU>(
  132. handle_rocm(), handle_naive(false), {{32, 128, 56, 56}, {1, 128, 1, 1}},
  133. dtype::Float32());
  134. run_elemwise_benchmark<Mode::FUSE_MUL_ADD3>(
  135. handle_rocm(), handle_naive(false),
  136. {{32, 128, 56, 56}, {1, 128, 1, 1}, {32, 128, 56, 56}}, dtype::Float32());
  137. }
  138. TEST_F(ROCM, ELEMWISE_BENCHMARK_PEAK_BANDWIDTH) {
  139. using Mode = ElemwiseForward::Mode;
  140. run_elemwise_benchmark<Mode::FUSE_MUL_ADD4>(
  141. handle_rocm(), handle_naive(false),
  142. {{10000, 10000}, {10000, 10000}, {10000, 10000}, {10000, 10000}},
  143. dtype::Float32());
  144. }
  145. #endif
  146. // vim: syntax=cpp.doxygen