You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

neon_mathfun.h 2.6 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394
  1. /**
  2. * \file dnn/src/arm_common/elemwise/neon_mathfun.h
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
  10. * implied.
  11. */
  12. #pragma once
  13. #include "src/arm_common/simd_macro/marm_neon.h"
  14. namespace megdnn {
  15. namespace arm_common {
  16. typedef float32x4_t v4sf; // vector of 4 float
  17. typedef uint32x4_t v4su; // vector of 4 uint32
  18. typedef int32x4_t v4si; // vector of 4 int32
  19. /**
  20. * \brief natural logarithm computed for 4 simultaneous float
  21. * return NaN for x <= 0
  22. */
  23. v4sf log_ps_f32(v4sf x);
  24. //! exp() computed for 4 float at once
  25. v4sf exp_ps_f32(v4sf x);
  26. /**
  27. * \brief evaluation of 4 sines & cosines at once.
  28. *
  29. * The code is the exact rewriting of the cephes sinf function.
  30. * Precision is excellent as long as x < 8192 (I did not bother to
  31. * take into account the special handling they have for greater values
  32. * -- it does not return garbage for arguments over 8192, though, but
  33. * the extra precision is missing).
  34. *
  35. * Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
  36. * surprising but correct result.
  37. *
  38. * Note also that when you compute sin(x), cos(x) is available at
  39. * almost no extra price so both sin_ps_f32 and cos_ps_f32 make use of
  40. * sincos_ps_f32.
  41. */
  42. void sincos_ps_f32(v4sf x, v4sf* ysin, v4sf* ycos);
  43. v4sf sin_ps_f32(v4sf x);
  44. v4sf cos_ps_f32(v4sf x);
  45. v4sf tan_ps_f32(v4sf x);
  46. static inline v4sf div_ps_f32(v4sf x, v4sf y) {
  47. #if MEGDNN_AARCH64
  48. return vdivq_f32(x, y);
  49. #else
  50. //! armv7 not support vdiv, so compute the reciprocal and iterate again
  51. float32x4_t recp = vrecpeq_f32(y);
  52. recp = vmulq_f32(vrecpsq_f32(y, recp), recp);
  53. return vmulq_f32(x, recp);
  54. #endif
  55. }
  56. v4sf sigmoid_ps_f32(v4sf x);
  57. #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
  58. /**
  59. * \brief exp() computed for 8 half at once; internally it just invokes exp_ps_f32 twice
  60. */
  61. float16x8_t exp_ps_f16(float16x8_t x);
  62. static inline float16x8_t div_ps_f16(float16x8_t x, float16x8_t y) {
  63. #if MEGDNN_AARCH64
  64. return vdivq_f16(x, y);
  65. #else
  66. //! armv7 not support vdiv, so compute the reciprocal and iterate again
  67. float16x8_t recp = vrecpeq_f16(y);
  68. recp = vmulq_f16(vrecpsq_f16(y, recp), recp);
  69. return vmulq_f16(x, recp);
  70. #endif
  71. }
  72. float16x8_t sigmoid_ps_f16(float16x8_t x);
  73. #endif
  74. } // namespace arm_common
  75. } // namespace megdnn
  76. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台