You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

elemwise.cpp 6.4 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171
  1. /**
  2. * \file dnn/test/rocm/elemwise.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. */
  11. #include "hcc_detail/hcc_defs_prologue.h"
  12. #include "megdnn/oprs.h"
  13. #include "test/common/elemwise.h"
  14. #include "test/common/rng.h"
  15. #include "test/common/tensor.h"
  16. #include "test/rocm/fixture.h"
  17. #include "hip_header.h"
  18. #include "src/rocm/miopen_with_check.h"
  19. #include "test/rocm/benchmarker.h"
  20. using namespace megdnn;
  21. using namespace test;
  22. namespace {
  23. void run_tensor_add(
  24. Handle* handle_rocm, const TensorND& a, const TensorND& b, const TensorND& c) {
  25. auto opr = handle_rocm->create_operator<ElemwiseForward>();
  26. opr->param().mode = ElemwiseForward::Mode::ADD;
  27. hipProfilerStart();
  28. opr->exec({a, b}, c);
  29. hipProfilerStop();
  30. }
  31. using Mode = ElemwiseForward::Mode;
  32. template <Mode mode>
  33. void run_elemwise_benchmark(
  34. Handle* handle_rocm, Handle* handle_naive, TensorShapeArray shapes,
  35. DType dtype) {
  36. auto benchmarker = ROCMBenchmarker<ElemwiseForward>(handle_rocm, handle_naive);
  37. benchmarker.set_display(true);
  38. ElemwiseForward::Param param;
  39. param.mode = mode;
  40. benchmarker.set_param(param);
  41. TensorShape dst_shp;
  42. ElemwiseForward::deduce_shape(shapes, dst_shp);
  43. shapes.push_back(dst_shp);
  44. for (size_t i = 0; i < shapes.size(); i++) {
  45. benchmarker.set_dtype(i, dtype);
  46. }
  47. float io = 0.f;
  48. for (auto&& shp : shapes) {
  49. io += 1.f * shp.total_nr_elems() * dtype.size();
  50. }
  51. auto time_ms = benchmarker.execs(shapes);
  52. printf("io = %.3f GB, bandwidth = %.3f GB/s\n", io / 1e9, io / (1e6 * time_ms));
  53. }
  54. } // anonymous namespace
  55. template <typename tag>
  56. class ROCM_ELEMWISE : public ROCM {};
  57. TYPED_TEST_CASE(ROCM_ELEMWISE, elemwise::test_types);
  58. TYPED_TEST(ROCM_ELEMWISE, run) {
  59. elemwise::run_test<TypeParam>(this->handle_rocm());
  60. }
  61. //! the memory of this test case is too large, sometimes will fail on tx1
  62. TEST_F(ROCM, ELEMWISE_BENCHMARK_DENSE) {
  63. constexpr size_t A = 1024 * 1024 * 64, S0 = 64, S1 = 256, S2 = 64, S3 = 64;
  64. static_assert(A == S0 * S1 * S2 * S3, "bad value");
  65. SyncedTensor<> t0(handle_rocm(), {TensorShape{S0, S1, S2, S3}, dtype::Float32()}),
  66. t1(handle_rocm(), {TensorShape{S0, S1, S2, S3}, dtype::Float32()});
  67. UniformFloatRNG rng{-2.f, 2.f};
  68. rng.gen(t0.tensornd_host());
  69. run_tensor_add(
  70. handle_rocm(), t0.tensornd_dev(), t0.tensornd_dev(), t1.tensornd_dev());
  71. auto p0 = t0.ptr_host(), p1 = t1.ptr_host();
  72. for (size_t i = 0; i < A; ++i) {
  73. ASSERT_EQ(p0[i] + p0[i], p1[i]) << "at index " << i << "/" << A;
  74. }
  75. }
  76. #if MEGDNN_WITH_BENCHMARK
  77. TEST_F(ROCM, ELEMWISE_BENCHMARK_BCAST_101) {
  78. constexpr size_t A = 511, B = 509, C0 = 23, C1 = 23, C = C0 * C1;
  79. SyncedTensor<> t0(handle_rocm(), {TensorShape{A, B, C0, C1}, dtype::Float32()}),
  80. t1(handle_rocm(), {TensorShape{1, B, 1, 1}, dtype::Float32()}),
  81. t2(handle_rocm(), {TensorShape{A, B, C0, C1}, dtype::Float32()});
  82. UniformFloatRNG rng{-2.f, 2.f};
  83. rng.gen(t0.tensornd_host());
  84. rng.gen(t1.tensornd_host());
  85. run_tensor_add(
  86. handle_rocm(), t0.tensornd_dev(), t1.tensornd_dev(), t2.tensornd_dev());
  87. auto p0 = t0.ptr_host(), p1 = t1.ptr_host(), p2 = t2.ptr_host();
  88. for (size_t i = 0; i < A; ++i) {
  89. for (size_t j = 0; j < B; ++j) {
  90. for (size_t k = 0; k < C; ++k) {
  91. auto off = i * B * C + j * C + k;
  92. ASSERT_EQ(p0[off] + p1[j], p2[off]);
  93. }
  94. }
  95. }
  96. }
  97. TEST_F(ROCM, ELEMWISE_BENCHMARK_BCAST_10) {
  98. constexpr size_t A = 11583, B = 11587;
  99. SyncedTensor<> t0(handle_rocm(), {TensorShape{A, B}, dtype::Float32()}),
  100. t1(handle_rocm(), {TensorShape{1, B}, dtype::Float32()}),
  101. t2(handle_rocm(), {TensorShape{A, B}, dtype::Float32()});
  102. UniformFloatRNG rng{-2.f, 2.f};
  103. rng.gen(t0.tensornd_host());
  104. rng.gen(t1.tensornd_host());
  105. run_tensor_add(
  106. handle_rocm(), t0.tensornd_dev(), t1.tensornd_dev(), t2.tensornd_dev());
  107. auto p0 = t0.ptr_host(), p1 = t1.ptr_host(), p2 = t2.ptr_host();
  108. for (size_t i = 0; i < A; ++i) {
  109. for (size_t j = 0; j < B; ++j) {
  110. auto off = i * B + j;
  111. ASSERT_EQ(p0[off] + p1[j], p2[off]);
  112. }
  113. }
  114. }
  115. TEST_F(ROCM, ELEMWISE_BENCHMARK_BCAST_01) {
  116. constexpr size_t A = 11583, B = 11587;
  117. SyncedTensor<> t0(handle_rocm(), {TensorShape{1, A, B}, dtype::Float32()}),
  118. t1(handle_rocm(), {TensorShape{1, A, 1}, dtype::Float32()}),
  119. t2(handle_rocm(), {TensorShape{1, A, B}, dtype::Float32()});
  120. UniformFloatRNG rng{-2.f, 2.f};
  121. rng.gen(t0.tensornd_host());
  122. rng.gen(t1.tensornd_host());
  123. run_tensor_add(
  124. handle_rocm(), t0.tensornd_dev(), t1.tensornd_dev(), t2.tensornd_dev());
  125. auto p0 = t0.ptr_host(), p1 = t1.ptr_host(), p2 = t2.ptr_host();
  126. for (size_t i = 0; i < A; ++i) {
  127. for (size_t j = 0; j < B; ++j) {
  128. auto off = i * B + j;
  129. ASSERT_EQ(p0[off] + p1[i], p2[off]);
  130. }
  131. }
  132. }
  133. TEST_F(ROCM, ELEMWISE_BENCHMARK) {
  134. using Mode = ElemwiseForward::Mode;
  135. run_elemwise_benchmark<Mode::ADD>(
  136. handle_rocm(), handle_naive(false), {{32, 128, 56, 56}, {32, 128, 56, 56}},
  137. dtype::Float32());
  138. run_elemwise_benchmark<Mode::ADD>(
  139. handle_rocm(), handle_naive(false), {{32, 128, 56, 56}, {1, 128, 1, 1}},
  140. dtype::Float32());
  141. run_elemwise_benchmark<Mode::FUSE_ADD_RELU>(
  142. handle_rocm(), handle_naive(false), {{32, 128, 56, 56}, {1, 128, 1, 1}},
  143. dtype::Float32());
  144. run_elemwise_benchmark<Mode::FUSE_MUL_ADD3>(
  145. handle_rocm(), handle_naive(false),
  146. {{32, 128, 56, 56}, {1, 128, 1, 1}, {32, 128, 56, 56}}, dtype::Float32());
  147. }
  148. TEST_F(ROCM, ELEMWISE_BENCHMARK_PEAK_BANDWIDTH) {
  149. using Mode = ElemwiseForward::Mode;
  150. run_elemwise_benchmark<Mode::FUSE_MUL_ADD4>(
  151. handle_rocm(), handle_naive(false),
  152. {{10000, 10000}, {10000, 10000}, {10000, 10000}, {10000, 10000}},
  153. dtype::Float32());
  154. }
  155. #endif
  156. // vim: syntax=cpp.doxygen