You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

powc.cpp.hip 5.7 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232
  1. /**
  2. * \file dnn/src/rocm/powc/powc.cpp.hip
  3. *
  4. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  5. *
  6. * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  7. *
  8. * Unless required by applicable law or agreed to in writing,
  9. * software distributed under the License is distributed on an
  10. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  11. */
  12. #include "hcc_detail/hcc_defs_prologue.h"
  13. #include "src/rocm/powc/powc.h.hip"
  14. #include "megdnn/dtype.h"
  15. #include "src/rocm/elemwise_helper.h.hip"
  16. #include <cmath>
  17. #include <limits>
  18. namespace megdnn {
  19. namespace rocm {
  20. // use a namespace (but not an anonymous namespace) to avoid name conflicts while
  21. // maintaining readability of HIP kernel names
  22. namespace hip_kern {
  23. template <int>
  24. struct PowCIntSmall;
  25. template <>
  26. struct PowCIntSmall<0> {
  27. template <typename T>
  28. static __device__ __forceinline__ T apply(T) {
  29. return static_cast<T>(1);
  30. }
  31. };
  32. template <>
  33. struct PowCIntSmall<1> {
  34. template <typename T>
  35. static __device__ __forceinline__ T apply(T x) {
  36. return x;
  37. }
  38. };
  39. template <>
  40. struct PowCIntSmall<2> {
  41. template <typename T>
  42. static __device__ __forceinline__ T apply(T x) {
  43. return x * x;
  44. }
  45. };
  46. template <>
  47. struct PowCIntSmall<3> {
  48. template <typename T>
  49. static __device__ __forceinline__ T apply(T x) {
  50. return x * x * x;
  51. }
  52. };
  53. template <>
  54. struct PowCIntSmall<4> {
  55. template <typename T>
  56. static __device__ __forceinline__ T apply(T x) {
  57. x = x * x;
  58. return x * x;
  59. }
  60. };
  61. template <int n>
  62. struct PowCIntSmall {
  63. template <typename T>
  64. static __device__ __forceinline__ T apply(T x) {
  65. return PowCIntSmall<-n>::apply(static_cast<T>(1) / x);
  66. }
  67. };
  68. template <typename T>
  69. struct PowCIntOdd {
  70. T exp;
  71. __device__ __forceinline__ T apply(T x) {
  72. return static_cast<T>(copysignf(powf(fabsf(x), exp), x));
  73. }
  74. };
  75. template <typename T>
  76. struct PowCIntEven {
  77. T exp;
  78. __device__ __forceinline__ T apply(T x) {
  79. return static_cast<T>(powf(fabsf(x), exp));
  80. }
  81. };
  82. struct PowCFloatSqrt {
  83. template <typename T>
  84. static __device__ __forceinline__ T apply(T x) {
  85. return static_cast<T>(sqrtf(x));
  86. }
  87. };
  88. struct PowCFloatCbrt {
  89. template <typename T>
  90. static __device__ __forceinline__ T apply(T x) {
  91. return static_cast<T>(cbrtf(x));
  92. }
  93. };
  94. struct PowCFloatRSqrt {
  95. template <typename T>
  96. static __device__ __forceinline__ T apply(T x) {
  97. return static_cast<T>(rsqrtf(x));
  98. }
  99. };
  100. struct PowCFloatRCbrt {
  101. template <typename T>
  102. static __device__ __forceinline__ T apply(T x) {
  103. return static_cast<T>(rcbrtf(x));
  104. }
  105. };
  106. template <typename T>
  107. struct PowCFloat {
  108. T exp;
  109. __device__ __forceinline__ T apply(T x) {
  110. return static_cast<T>(powf(x, exp));
  111. }
  112. };
  113. template <typename T, typename PowOp>
  114. struct PowCOp {
  115. T* dest;
  116. PowOp pow_op;
  117. __device__ __forceinline__ void operator()(uint32_t idx, T src) {
  118. dest[idx] = pow_op.apply(src);
  119. }
  120. };
  121. } // namespace hip_kern
  122. namespace {
  123. template <typename T, typename PowOp>
  124. void invoke(const TensorND& dest, const TensorND& src, PowOp pow_op,
  125. hipStream_t stream) {
  126. ElemwiseOpParamN<1> param;
  127. param[0] = src;
  128. param.init_from_given_tensor();
  129. typedef hip_kern::PowCOp<T, PowOp> Op;
  130. Op op;
  131. op.dest = dest.ptr<T>();
  132. op.pow_op = pow_op;
  133. run_elemwise<Op, T, 1>(param, stream, op);
  134. }
  135. bool feq(float a, float b) {
  136. return std::abs(a - b) < std::numeric_limits<float>::epsilon();
  137. }
// Select and launch the kernel matching a constant exponent.
// Exactly one of exp_f / exp_i is expected to be non-null (callers pass
// the other as nullptr — TODO confirm against powc_kern's callers).
// Float exponents are first checked against special values (±0.5, ±1/3)
// that have cheaper dedicated kernels; small integer exponents in
// [-4, 4] get fully unrolled multiply kernels; everything else falls
// back to the powf-based functors.
template <typename T>
void dispatch_op(const TensorND& dest, const TensorND& src, const float* exp_f,
                 const int* exp_i, hipStream_t stream) {
// launch _op over src -> dest on the given stream
#define CALL(_op) invoke<T>(dest, src, _op, stream)
    if (exp_f) {
        float exp = *exp_f;
// launch _op and return iff the exponent equals _v (within float epsilon)
#define CALL_IF(_v, _op)    \
    do {                    \
        if (feq(exp, _v)) { \
            CALL(_op);      \
            return;         \
        }                   \
    } while (0)
        CALL_IF(.5f, hip_kern::PowCFloatSqrt());
        CALL_IF(1.f / 3.f, hip_kern::PowCFloatCbrt());
        CALL_IF(-.5f, hip_kern::PowCFloatRSqrt());
        CALL_IF(-1.f / 3.f, hip_kern::PowCFloatRCbrt());
        // no special case matched: general powf-based kernel
        hip_kern::PowCFloat<T> op;
        op.exp = exp;
        CALL(op);
        return;
#undef CALL_IF
    }
    int exp = *exp_i;
    // small integer exponents in [-4, 4]: unrolled multiply kernels
    switch (exp) {
#define CASE(v)                            \
    case v:                                \
        CALL(hip_kern::PowCIntSmall<v>()); \
        return
        CASE(0);
        CASE(1);
        CASE(2);
        CASE(3);
        CASE(4);
        CASE(-1);
        CASE(-2);
        CASE(-3);
        CASE(-4);
#undef CASE
    }
    // remaining integer exponents: odd powers must preserve the sign of
    // x, even powers discard it (exp & 1 is 1 for negative odd exp too)
    if (exp & 1) {
        hip_kern::PowCIntOdd<T> op;
        op.exp = exp;
        CALL(op);
    } else {
        hip_kern::PowCIntEven<T> op;
        op.exp = exp;
        CALL(op);
    }
#undef CALL
}
  189. } // anonymous namespace
// Entry point for the ROCm PowC kernel: computes x^exp elementwise from
// src into dest on the given HIP stream.
// The exponent is carried by exactly one of exp_f (float) / exp_i (int)
// — presumably the other is nullptr; TODO confirm with callers.
// Dispatches on the source dtype over the float computing dtypes only;
// any other dtype throws via megdnn_throw.
void powc_kern(const TensorND& dest, const TensorND& src,
               const float* exp_f, const int* exp_i,
               hipStream_t stream) {
    switch (src.layout.dtype.enumv().ev) {
// one case per float computing dtype, forwarding to the typed dispatcher
#define cb(dt)                                                             \
    case DTypeTrait<dt>::enumv:                                            \
        return dispatch_op<DTypeTrait<dt>::ctype>(dest, src, exp_f, exp_i, \
                                                  stream);
        MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb)
#undef cb
        default:
            megdnn_throw("unsupported dtype for PowC");
    }
}
  204. } // namespace rocm
  205. } // namespace megdnn
  206. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台