You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

elemwise.cpp 6.8 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179
/**
 * \file dnn/test/rocm/elemwise.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
  11. #include "hcc_detail/hcc_defs_prologue.h"
  12. #include "test/common/elemwise.h"
  13. #include "test/rocm/fixture.h"
  14. #include "megdnn/oprs.h"
  15. #include "test/common/tensor.h"
  16. #include "test/common/rng.h"
  17. #include "hip_header.h"
  18. #include "src/rocm/miopen_with_check.h"
  19. #include "test/rocm/benchmarker.h"
  20. using namespace megdnn;
  21. using namespace test;
  22. namespace {
  23. void run_tensor_add(
  24. Handle *handle_rocm,
  25. const TensorND &a, const TensorND &b,
  26. const TensorND &c) {
  27. auto opr = handle_rocm->create_operator<ElemwiseForward>();
  28. opr->param().mode = ElemwiseForward::Mode::ADD;
  29. hipProfilerStart();
  30. opr->exec({a, b}, c);
  31. hipProfilerStop();
  32. }
  33. using Mode = ElemwiseForward::Mode;
  34. template <Mode mode>
  35. void run_elemwise_benchmark(Handle* handle_rocm, Handle* handle_naive,
  36. TensorShapeArray shapes, DType dtype) {
  37. auto benchmarker =
  38. ROCMBenchmarker<ElemwiseForward>(handle_rocm, handle_naive);
  39. benchmarker.set_display(true);
  40. ElemwiseForward::Param param;
  41. param.mode = mode;
  42. benchmarker.set_param(param);
  43. TensorShape dst_shp;
  44. ElemwiseForward::deduce_shape(shapes, dst_shp);
  45. shapes.push_back(dst_shp);
  46. for (size_t i = 0; i < shapes.size(); i++) {
  47. benchmarker.set_dtype(i, dtype);
  48. }
  49. float io = 0.f;
  50. for (auto&& shp : shapes) {
  51. io += 1.f * shp.total_nr_elems() * dtype.size();
  52. }
  53. auto time_ms = benchmarker.execs(shapes);
  54. printf("io = %.3f GB, bandwidth = %.3f GB/s\n", io / 1e9,
  55. io / (1e6 * time_ms));
  56. }
  57. } // anonymous namespace
//! typed-test fixture: instantiates the shared elemwise test suite once
//! per dtype tag listed in elemwise::test_types, on the ROCm backend
template <typename tag>
class ROCM_ELEMWISE : public ROCM {};
TYPED_TEST_CASE(ROCM_ELEMWISE, elemwise::test_types);
TYPED_TEST(ROCM_ELEMWISE, run) {
    // delegate to the backend-agnostic elemwise correctness tests
    elemwise::run_test<TypeParam>(this->handle_rocm());
}
  64. //! the memory of this test case is too large, sometimes will fail on tx1
  65. TEST_F(ROCM, ELEMWISE_BENCHMARK_DENSE) {
  66. constexpr size_t A = 1024 * 1024 * 64,
  67. S0 = 64, S1 = 256, S2 = 64, S3 = 64;
  68. static_assert(A == S0 * S1 * S2 * S3, "bad value");
  69. SyncedTensor<>
  70. t0(handle_rocm(), {TensorShape{S0, S1, S2, S3}, dtype::Float32()}),
  71. t1(handle_rocm(), {TensorShape{S0, S1, S2, S3}, dtype::Float32()});
  72. UniformFloatRNG rng{-2.f, 2.f};
  73. rng.gen(t0.tensornd_host());
  74. run_tensor_add(handle_rocm(),
  75. t0.tensornd_dev(), t0.tensornd_dev(), t1.tensornd_dev());
  76. auto p0 = t0.ptr_host(), p1 = t1.ptr_host();
  77. for (size_t i = 0; i < A; ++ i) {
  78. ASSERT_EQ(p0[i] + p0[i], p1[i]) << "at index " << i << "/" << A;
  79. }
  80. }
  81. #if MEGDNN_WITH_BENCHMARK
  82. TEST_F(ROCM, ELEMWISE_BENCHMARK_BCAST_101) {
  83. constexpr size_t A = 511, B = 509, C0 = 23, C1 = 23, C = C0 * C1;
  84. SyncedTensor<>
  85. t0(handle_rocm(), {TensorShape{A, B, C0, C1}, dtype::Float32()}),
  86. t1(handle_rocm(), {TensorShape{1, B, 1, 1}, dtype::Float32()}),
  87. t2(handle_rocm(), {TensorShape{A, B, C0, C1}, dtype::Float32()});
  88. UniformFloatRNG rng{-2.f, 2.f};
  89. rng.gen(t0.tensornd_host());
  90. rng.gen(t1.tensornd_host());
  91. run_tensor_add(handle_rocm(),
  92. t0.tensornd_dev(), t1.tensornd_dev(), t2.tensornd_dev());
  93. auto p0 = t0.ptr_host(), p1 = t1.ptr_host(), p2 = t2.ptr_host();
  94. for (size_t i = 0; i < A; ++ i) {
  95. for (size_t j = 0; j < B; ++ j) {
  96. for (size_t k = 0; k < C; ++ k) {
  97. auto off = i * B * C + j * C + k;
  98. ASSERT_EQ(p0[off] + p1[j], p2[off]);
  99. }
  100. }
  101. }
  102. }
  103. TEST_F(ROCM, ELEMWISE_BENCHMARK_BCAST_10) {
  104. constexpr size_t A = 11583, B = 11587;
  105. SyncedTensor<> t0(handle_rocm(), {TensorShape{A, B}, dtype::Float32()}),
  106. t1(handle_rocm(), {TensorShape{1, B}, dtype::Float32()}),
  107. t2(handle_rocm(), {TensorShape{A, B}, dtype::Float32()});
  108. UniformFloatRNG rng{-2.f, 2.f};
  109. rng.gen(t0.tensornd_host());
  110. rng.gen(t1.tensornd_host());
  111. run_tensor_add(handle_rocm(),
  112. t0.tensornd_dev(), t1.tensornd_dev(), t2.tensornd_dev());
  113. auto p0 = t0.ptr_host(), p1 = t1.ptr_host(), p2 = t2.ptr_host();
  114. for (size_t i = 0; i < A; ++ i) {
  115. for (size_t j = 0; j < B; ++ j) {
  116. auto off = i * B + j;
  117. ASSERT_EQ(p0[off] + p1[j], p2[off]);
  118. }
  119. }
  120. }
  121. TEST_F(ROCM, ELEMWISE_BENCHMARK_BCAST_01) {
  122. constexpr size_t A = 11583, B = 11587;
  123. SyncedTensor<> t0(handle_rocm(), {TensorShape{1, A, B}, dtype::Float32()}),
  124. t1(handle_rocm(), {TensorShape{1, A, 1}, dtype::Float32()}),
  125. t2(handle_rocm(), {TensorShape{1, A, B}, dtype::Float32()});
  126. UniformFloatRNG rng{-2.f, 2.f};
  127. rng.gen(t0.tensornd_host());
  128. rng.gen(t1.tensornd_host());
  129. run_tensor_add(handle_rocm(),
  130. t0.tensornd_dev(), t1.tensornd_dev(), t2.tensornd_dev());
  131. auto p0 = t0.ptr_host(), p1 = t1.ptr_host(), p2 = t2.ptr_host();
  132. for (size_t i = 0; i < A; ++ i) {
  133. for (size_t j = 0; j < B; ++ j) {
  134. auto off = i * B + j;
  135. ASSERT_EQ(p0[off] + p1[i], p2[off]);
  136. }
  137. }
  138. }
  139. TEST_F(ROCM, ELEMWISE_BENCHMARK) {
  140. using Mode = ElemwiseForward::Mode;
  141. run_elemwise_benchmark<Mode::ADD>(handle_rocm(), handle_naive(false),
  142. {{32, 128, 56, 56}, {32, 128, 56, 56}},
  143. dtype::Float32());
  144. run_elemwise_benchmark<Mode::ADD>(handle_rocm(), handle_naive(false),
  145. {{32, 128, 56, 56}, {1, 128, 1, 1}},
  146. dtype::Float32());
  147. run_elemwise_benchmark<Mode::FUSE_ADD_RELU>(handle_rocm(), handle_naive(false),
  148. {{32, 128, 56, 56}, {1, 128, 1, 1}},
  149. dtype::Float32());
  150. run_elemwise_benchmark<Mode::FUSE_MUL_ADD3>(
  151. handle_rocm(), handle_naive(false),
  152. {{32, 128, 56, 56}, {1, 128, 1, 1}, {32, 128, 56, 56}},
  153. dtype::Float32());
  154. }
  155. TEST_F(ROCM, ELEMWISE_BENCHMARK_PEAK_BANDWIDTH) {
  156. using Mode = ElemwiseForward::Mode;
  157. run_elemwise_benchmark<Mode::FUSE_MUL_ADD4>(
  158. handle_rocm(), handle_naive(false),
  159. {{10000, 10000}, {10000, 10000}, {10000, 10000}, {10000, 10000}},
  160. dtype::Float32());
  161. }
  162. #endif
  163. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台