|
- /**
- * \file dnn/src/arm_common/elemwise/neon_mathfun.h
- * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
- *
- * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
- * implied.
- */
-
- #pragma once
-
- #include "src/arm_common/simd_macro/marm_neon.h"
-
- namespace megdnn {
- namespace arm_common {
-
- //! short aliases for the 4-lane, 32-bit-per-lane NEON vector types
- typedef float32x4_t v4sf; // vector of 4 float
- typedef uint32x4_t v4su; // vector of 4 uint32
- typedef int32x4_t v4si; // vector of 4 int32
-
- /**
- * \brief natural logarithm computed for 4 floats simultaneously
- *
- * \param x vector of 4 float inputs
- * \return vector with the logarithm of each element of \p x; elements
- * with x <= 0 yield NaN
- */
- v4sf log_ps_f32(v4sf x);
-
- //! exp() computed for 4 floats at once (one result per element of x)
- v4sf exp_ps_f32(v4sf x);
-
- /**
- * \brief evaluation of 4 sines & cosines at once.
- *
- * The code is an exact rewrite of the cephes sinf function.
- * Precision is excellent as long as x < 8192 (the special handling cephes
- * has for greater values is not implemented -- the function does not
- * return garbage for arguments over 8192, but the extra precision is
- * missing).
- *
- * Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
- * surprising but correct result.
- *
- * Note also that when you compute sin(x), cos(x) is available at
- * almost no extra price, so both sin_ps_f32 and cos_ps_f32 make use of
- * sincos_ps_f32.
- *
- * \param x vector of 4 float inputs
- * \param ysin output pointer for the 4 sines
- * \param ycos output pointer for the 4 cosines
- */
- void sincos_ps_f32(v4sf x, v4sf* ysin, v4sf* ycos);
-
- //! sine of 4 floats at once; per the sincos_ps_f32 notes it is built on sincos_ps_f32
- v4sf sin_ps_f32(v4sf x);
-
- //! cosine of 4 floats at once; per the sincos_ps_f32 notes it is built on sincos_ps_f32
- v4sf cos_ps_f32(v4sf x);
-
- //! NOTE(review): presumably the tangent of 4 floats at once; the definition
- //! is not part of this header -- confirm against the implementation
- v4sf tan_ps_f32(v4sf x);
-
- /**
- * \brief element-wise division x / y on 4 floats
- *
- * When MEGDNN_AARCH64 is set the native vector divide is used. Otherwise
- * 1 / y is taken from the hardware reciprocal estimate, refined by a single
- * reciprocal step, and multiplied by x, so the quotient on that path is an
- * approximation.
- *
- * \param x dividend vector
- * \param y divisor vector
- * \return vector of per-element quotients
- */
- static inline v4sf div_ps_f32(v4sf x, v4sf y) {
- #if MEGDNN_AARCH64
- return vdivq_f32(x, y);
- #else
- //! armv7 has no vector divide: start from the reciprocal estimate of y
- //! and refine it with one reciprocal step before multiplying by x
- const float32x4_t estimate = vrecpeq_f32(y);
- const float32x4_t correction = vrecpsq_f32(y, estimate);
- const float32x4_t inv_y = vmulq_f32(correction, estimate);
- return vmulq_f32(x, inv_y);
- #endif
- }
-
- //! NOTE(review): presumably the logistic sigmoid of 4 floats at once; the
- //! definition is not part of this header -- confirm against the implementation
- v4sf sigmoid_ps_f32(v4sf x);
-
- #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- /**
- * \brief exp() computed for 8 halves at once; internally it just invokes
- * exp_ps_f32 twice
- */
- float16x8_t exp_ps_f16(float16x8_t x);
-
- /**
- * \brief element-wise division x / y on 8 halves
- *
- * When MEGDNN_AARCH64 is set the native vector divide is used. Otherwise
- * 1 / y is taken from the hardware reciprocal estimate, refined by a single
- * reciprocal step, and multiplied by x.
- *
- * \param x dividend vector
- * \param y divisor vector
- * \return vector of per-element quotients
- */
- static inline float16x8_t div_ps_f16(float16x8_t x, float16x8_t y) {
- #if MEGDNN_AARCH64
- return vdivq_f16(x, y);
- #else
- //! armv7 has no vector divide: start from the reciprocal estimate of y
- //! and refine it with one reciprocal step before multiplying by x
- const float16x8_t estimate = vrecpeq_f16(y);
- const float16x8_t correction = vrecpsq_f16(y, estimate);
- const float16x8_t inv_y = vmulq_f16(correction, estimate);
- return vmulq_f16(x, inv_y);
- #endif
- }
-
- //! NOTE(review): presumably the logistic sigmoid of 8 halves at once; the
- //! definition is not part of this header -- confirm against the implementation
- float16x8_t sigmoid_ps_f16(float16x8_t x);
-
- #endif
-
- } // namespace arm_common
- } // namespace megdnn
-
- // vim: syntax=cpp.doxygen
|