@@ -17,6 +17,10 @@ | |||||
#endif | #endif | ||||
#endif | #endif | ||||
#if defined(__riscv_vector) | |||||
#include <riscv_vector.h> | |||||
#endif | |||||
#if defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64) | #if defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64) | ||||
#define GI_TARGET_X86 | #define GI_TARGET_X86 | ||||
#endif | #endif | ||||
@@ -47,7 +51,7 @@ | |||||
#define GI_INTERNAL_DATA extern "C" __attribute((visibility("hidden"))) | #define GI_INTERNAL_DATA extern "C" __attribute((visibility("hidden"))) | ||||
#endif | #endif | ||||
#if defined(GI_TARGET_ARM) | |||||
#if defined(GI_TARGET_ARM) && defined(__ARM_NEON) | |||||
#define GI_NEON_INTRINSICS | #define GI_NEON_INTRINSICS | ||||
#if defined(__aarch64__) | #if defined(__aarch64__) | ||||
#define GI_NEON64_INTRINSICS | #define GI_NEON64_INTRINSICS | ||||
@@ -72,6 +76,9 @@ | |||||
#define GI_SSE2_INTRINSICS | #define GI_SSE2_INTRINSICS | ||||
#endif | #endif | ||||
#endif | #endif | ||||
#if defined(__riscv_vector) | |||||
#define GI_RVV_INTRINSICS | |||||
#endif | |||||
#if defined(GI_TEST_NAIVE) | #if defined(GI_TEST_NAIVE) | ||||
#undef GI_NEON_INTRINSICS | #undef GI_NEON_INTRINSICS | ||||
@@ -82,6 +89,7 @@ | |||||
#undef GI_AVX_INTRINSICS | #undef GI_AVX_INTRINSICS | ||||
#undef GI_SSE42_INTRINSICS | #undef GI_SSE42_INTRINSICS | ||||
#undef GI_SSE2_INTRINSICS | #undef GI_SSE2_INTRINSICS | ||||
#undef GI_RVV_INTRINSICS | |||||
#endif | #endif | ||||
//! general intrinsic support dynamic length simd, if avx or avx2 the simd | //! general intrinsic support dynamic length simd, if avx or avx2 the simd | ||||
@@ -95,6 +103,10 @@ | |||||
defined(GI_SSE42_INTRINSICS) | defined(GI_SSE42_INTRINSICS) | ||||
#define GI_SIMD_LEN 128 | #define GI_SIMD_LEN 128 | ||||
#define GI_SIMD_LEN_BYTE 16 | #define GI_SIMD_LEN_BYTE 16 | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
//! TODO: make the GI algorithms usable for other GI_SIMD_LEN/GI_SIMD_LEN_BYTE values
#define GI_SIMD_LEN 128 | |||||
#define GI_SIMD_LEN_BYTE 16 | |||||
#else | #else | ||||
//! if there is no SIMD hardware support, the SIMD is implemented in plain C and
//! the length defaults to 128
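//! e.g. with GI_SIMD_LEN_BYTE == 16, a GI_FLOAT32_t carries
//! GI_SIMD_LEN_BYTE / sizeof(float) == 4 lanes and a GI_INT8_t carries 16 lanes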
@@ -112,6 +124,7 @@ enum GiSimdType { | |||||
GI_SSE42, | GI_SSE42, | ||||
GI_SSE2, | GI_SSE2, | ||||
GI_NEON, | GI_NEON, | ||||
GI_RVV, | |||||
}; | }; | ||||
#if defined(GI_AVX_INTRINSICS) || defined(GI_AVX2_INTRINSICS) || \ | #if defined(GI_AVX_INTRINSICS) || defined(GI_AVX2_INTRINSICS) || \ | ||||
@@ -246,17 +259,41 @@ typedef __m64_128 float32x2_t; | |||||
return res64; | return res64; | ||||
#define _sse_vextq_s32(a, b, c) _MM_ALIGNR_EPI8(b, a, c * 4) | #define _sse_vextq_s32(a, b, c) _MM_ALIGNR_EPI8(b, a, c * 4) | ||||
#define _sse_vget_lane_f32(vec, lane) vec.m64_f32[lane] | #define _sse_vget_lane_f32(vec, lane) vec.m64_f32[lane] | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
#define __gi_simd_type GI_RVV | |||||
typedef vfloat32m1_t GI_FLOAT32_t; | |||||
typedef vuint8m1_t GI_UINT8_t; | |||||
typedef vint8m1_t GI_INT8_t; | |||||
typedef vint16m1_t GI_INT16_t; | |||||
typedef vint32m1_t GI_INT32_t; | |||||
typedef vuint32m1_t GI_UINT32_t; | |||||
//! FIXME: the Nezha D1 does not support the vmv.x.s instruction,
//! so as a workaround we define GI_INT64_t as a naive vector type
typedef int64_t GI_INT64_RVV_WORKAROUND_t | |||||
__attribute__((vector_size(GI_SIMD_LEN_BYTE))); | |||||
typedef GI_INT64_RVV_WORKAROUND_t GI_INT64_t; | |||||
typedef vfloat32m1x2_t GI_FLOAT32_V2_t; | |||||
typedef vfloat32m1x3_t GI_FLOAT32_V3_t; | |||||
typedef vfloat32m1x4_t GI_FLOAT32_V4_t; | |||||
typedef vint32m1x2_t GI_INT32_V2_t; | |||||
typedef vint32m1x4_t GI_INT32_V4_t; | |||||
typedef vint16m1x2_t GI_INT16_V2_t; | |||||
typedef vint8m1x2_t GI_INT8_V2_t; | |||||
//! vfloat32mf2_t is only available in RVV 1.0 and we currently support RVV 0.7,
//! so as a workaround we use vfloat32m1_t instead
typedef vfloat32m1_t float32x2_t; | |||||
#else | #else | ||||
#define __gi_simd_type GI_NAIVE | #define __gi_simd_type GI_NAIVE | ||||
typedef float GI_FLOAT32_t __attribute__((vector_size(16))); | |||||
typedef uint8_t GI_UINT8_t __attribute__((vector_size(16))); | |||||
typedef int8_t GI_INT8_t __attribute__((vector_size(16))); | |||||
typedef int16_t GI_INT16_t __attribute__((vector_size(16))); | |||||
typedef int32_t GI_INT32_t __attribute__((vector_size(16))); | |||||
typedef uint32_t GI_UINT32_t __attribute__((vector_size(16))); | |||||
typedef int64_t GI_INT64_t __attribute__((vector_size(16))); | |||||
#if !defined(__arm__) && !defined(__aarch64__) | |||||
typedef float float32x2_t __attribute__((vector_size(8))); | |||||
typedef float GI_FLOAT32_t __attribute__((vector_size(GI_SIMD_LEN_BYTE))); | |||||
typedef uint8_t GI_UINT8_t __attribute__((vector_size(GI_SIMD_LEN_BYTE))); | |||||
typedef int8_t GI_INT8_t __attribute__((vector_size(GI_SIMD_LEN_BYTE))); | |||||
typedef int16_t GI_INT16_t __attribute__((vector_size(GI_SIMD_LEN_BYTE))); | |||||
typedef int32_t GI_INT32_t __attribute__((vector_size(GI_SIMD_LEN_BYTE))); | |||||
typedef uint32_t GI_UINT32_t __attribute__((vector_size(GI_SIMD_LEN_BYTE))); | |||||
typedef int64_t GI_INT64_t __attribute__((vector_size(GI_SIMD_LEN_BYTE))); | |||||
#if !defined(__arm__) && !defined(__aarch64__) || !defined(__ARM_NEON) | |||||
typedef float float32x2_t __attribute__((vector_size(GI_SIMD_LEN_BYTE / 2))); | |||||
#endif | #endif | ||||
typedef float float32_t; | typedef float float32_t; | ||||
#endif | #endif | ||||
@@ -265,14 +302,14 @@ typedef float float32_t; | |||||
//! for example: GiAbsInt32 does not implement the SSE2 case;
//! when *_t is defined as _m128* (which may be long long),
//! vector indexing does not follow the same logic as a naive vector
typedef float GI_FLOAT32_NAIVE_t __attribute__((vector_size(16))); | |||||
typedef uint8_t GI_UINT8_NAIVE_t __attribute__((vector_size(16))); | |||||
typedef int8_t GI_INT8_NAIVE_t __attribute__((vector_size(16))); | |||||
typedef int16_t GI_INT16_NAIVE_t __attribute__((vector_size(16))); | |||||
typedef int32_t GI_INT32_NAIVE_t __attribute__((vector_size(16))); | |||||
typedef uint32_t GI_UINT32_NAIVE_t __attribute__((vector_size(16))); | |||||
typedef int64_t GI_INT64_NAIVE_t __attribute__((vector_size(16))); | |||||
typedef float float32x2_NAIVE_t __attribute__((vector_size(8))); | |||||
typedef float GI_FLOAT32_NAIVE_t __attribute__((vector_size(GI_SIMD_LEN_BYTE))); | |||||
typedef uint8_t GI_UINT8_NAIVE_t __attribute__((vector_size(GI_SIMD_LEN_BYTE))); | |||||
typedef int8_t GI_INT8_NAIVE_t __attribute__((vector_size(GI_SIMD_LEN_BYTE))); | |||||
typedef int16_t GI_INT16_NAIVE_t __attribute__((vector_size(GI_SIMD_LEN_BYTE))); | |||||
typedef int32_t GI_INT32_NAIVE_t __attribute__((vector_size(GI_SIMD_LEN_BYTE))); | |||||
typedef uint32_t GI_UINT32_NAIVE_t __attribute__((vector_size(GI_SIMD_LEN_BYTE))); | |||||
typedef int64_t GI_INT64_NAIVE_t __attribute__((vector_size(GI_SIMD_LEN_BYTE))); | |||||
typedef float float32x2_NAIVE_t __attribute__((vector_size(GI_SIMD_LEN_BYTE / 2))); | |||||
typedef struct { | typedef struct { | ||||
GI_INT32_NAIVE_t val[2]; | GI_INT32_NAIVE_t val[2]; | ||||
} GI_INT32_V2_NAIVE_t; | } GI_INT32_V2_NAIVE_t; | ||||
@@ -301,22 +338,7 @@ typedef struct { | |||||
GI_INT8_NAIVE_t val[2]; | GI_INT8_NAIVE_t val[2]; | ||||
} GI_INT8_V2_NAIVE_t; | } GI_INT8_V2_NAIVE_t; | ||||
#define Max(a, b) (a) > (b) ? (a) : (b) | |||||
#define Min(a, b) (a) < (b) ? (a) : (b) | |||||
#if defined(GI_NEON_INTRINSICS) | |||||
#if defined(__ARM_FEATURE_FMA) && defined(GI_NEON64_INTRINSICS) | |||||
#define v_fma_ps_f32(c, b, a) vfmaq_f32((c), (b), (a)) | |||||
#define v_fma_n_f32(c, b, a) vfmaq_n_f32((c), (b), (a)) | |||||
#define v_fma_lane_f32(c, b, a, lane) vfmaq_lane_f32((c), (b), (a), (lane)) | |||||
#else | |||||
#define v_fma_ps_f32(c, b, a) vmlaq_f32((c), (b), (a)) | |||||
#define v_fma_n_f32(c, b, a) vmlaq_n_f32((c), (b), (a)) | |||||
#define v_fma_lane_f32(c, b, a, lane) vmlaq_lane_f32((c), (b), (a), (lane)) | |||||
#endif | |||||
#endif | |||||
#if !defined(GI_NEON_INTRINSICS) | |||||
#if !defined(GI_NEON_INTRINSICS) && !defined(GI_RVV_INTRINSICS) | |||||
typedef struct { | typedef struct { | ||||
GI_INT32_t val[2]; | GI_INT32_t val[2]; | ||||
} GI_INT32_V2_t; | } GI_INT32_V2_t; | ||||
@@ -344,61 +366,272 @@ typedef struct { | |||||
typedef struct { | typedef struct { | ||||
GI_INT8_t val[2]; | GI_INT8_t val[2]; | ||||
} GI_INT8_V2_t; | } GI_INT8_V2_t; | ||||
#endif | |||||
GI_FORCEINLINE | |||||
GI_INT32_t GiAndInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) { | |||||
#if defined(GI_NEON_INTRINSICS) | |||||
return vandq_s32(Vector1, Vector2); | |||||
#elif defined(GI_SSE2_INTRINSICS) | |||||
return _mm_and_si128(Vector1, Vector2); | |||||
#endif | |||||
//! Variable-length intrinsic types (for example RVV and SVE) can not be members
//! of a C++ class, because sizeof can not be evaluated for them at build time,
//! so we define fixed-length *_FIXLEN_t companion types to cover this case.
//! Some variable-length intrinsic types (for example RVV) also do not support
//! array subscripting, so we define GiGetSubVector_xx/GiSetSubVector_xx helpers
//! for this case; for fixed-length types these helpers degenerate to plain
//! member access (see the usage sketch after the helpers below).
#if defined(GI_RVV_INTRINSICS) | |||||
typedef GI_FLOAT32_NAIVE_t GI_FLOAT32_FIXLEN_t; | |||||
typedef GI_FLOAT32_V2_NAIVE_t GI_FLOAT32_FIXLEN_V2_t; | |||||
typedef GI_UINT8_NAIVE_t GI_UINT8_FIXLEN_t; | |||||
typedef GI_INT8_NAIVE_t GI_INT8_FIXLEN_t; | |||||
typedef GI_INT16_NAIVE_t GI_INT16_FIXLEN_t; | |||||
typedef GI_INT32_NAIVE_t GI_INT32_FIXLEN_t; | |||||
typedef GI_UINT32_NAIVE_t GI_UINT32_FIXLEN_t; | |||||
//! get subvector | |||||
#define GiGetSubVectorFloat32V2(s, index) vget_f32m1x2_f32m1(s, index) | |||||
#define GiGetSubVectorFloat32V3(s, index) vget_f32m1x3_f32m1(s, index) | |||||
#define GiGetSubVectorFloat32V4(s, index) vget_f32m1x4_f32m1(s, index) | |||||
#define GiGetSubVectorInt32V2(s, index) vget_i32m1x2_i32m1(s, index) | |||||
#define GiGetSubVectorInt32V4(s, index) vget_i32m1x4_i32m1(s, index) | |||||
#define GiGetSubVectorInt16V2(s, index) vget_i16m1x2_i16m1(s, index) | |||||
#define GiGetSubVectorInt8V2(s, index) vget_i8m1x2_i8m1(s, index) | |||||
//! insert subvector | |||||
#define GiSetSubVectorFloat32V2(d, index, s) d = vset_f32m1x2(d, index, s) | |||||
#define GiSetSubVectorFloat32V3(d, index, s) d = vset_f32m1x3(d, index, s) | |||||
#define GiSetSubVectorFloat32V4(d, index, s) d = vset_f32m1x4(d, index, s) | |||||
#define GiSetSubVectorInt32V2(d, index, s) d = vset_i32m1x2(d, index, s) | |||||
#define GiSetSubVectorInt32V4(d, index, s) d = vset_i32m1x4(d, index, s) | |||||
#define GiSetSubVectorInt16V2(d, index, s) d = vset_i16m1x2(d, index, s) | |||||
#define GiSetSubVectorInt8V2(d, index, s) d = vset_i8m1x2(d, index, s) | |||||
//! convert | |||||
#define GiFloat32Type2FixLenType(s) \ | |||||
__extension__({ \ | |||||
GI_FLOAT32_FIXLEN_t d; \ | |||||
vse32_v_f32m1((float*)&d, s, GI_SIMD_LEN_BYTE / sizeof(float)); \ | |||||
d; \ | |||||
}) | |||||
#define GiFixLenType2GiFloat32Type(s) \ | |||||
__extension__({ \ | |||||
GI_FLOAT32_t d; \ | |||||
d = vle32_v_f32m1((float*)&s, GI_SIMD_LEN_BYTE / sizeof(float)); \ | |||||
d; \ | |||||
}) | |||||
#define GiFloat32Type2FixLenV2Type(s) \ | |||||
__extension__({ \ | |||||
GI_FLOAT32_FIXLEN_V2_t d; \ | |||||
d.val[0] = GiFloat32Type2FixLenType(GiGetSubVectorFloat32V2(s, 0)); \ | |||||
d.val[1] = GiFloat32Type2FixLenType(GiGetSubVectorFloat32V2(s, 1)); \ | |||||
d; \ | |||||
}) | |||||
#define GiFixLenType2GiFloat32V2Type(s) \ | |||||
__extension__({ \ | |||||
GI_FLOAT32_V2_t d; \ | |||||
GiSetSubVectorFloat32V2(d, 0, GiFixLenType2GiFloat32Type(s.val[0])); \ | |||||
GiSetSubVectorFloat32V2(d, 1, GiFixLenType2GiFloat32Type(s.val[1])); \ | |||||
d; \ | |||||
}) | |||||
#define GiUint8Type2FixLenType(s) \ | |||||
__extension__({ \ | |||||
GI_UINT8_FIXLEN_t d; \ | |||||
vse8_v_u8m1((uint8_t*)&d, s, GI_SIMD_LEN_BYTE / sizeof(uint8_t)); \ | |||||
d; \ | |||||
}) | |||||
#define GiFixLenType2GiUint8Type(s) \ | |||||
__extension__({ \ | |||||
GI_UINT8_t d; \ | |||||
d = vle8_v_u8m1((uint8_t*)&s, GI_SIMD_LEN_BYTE / sizeof(uint8_t)); \ | |||||
d; \ | |||||
}) | |||||
#define GiInt8Type2FixLenType(s) \ | |||||
__extension__({ \ | |||||
GI_INT8_FIXLEN_t d; \ | |||||
vse8_v_i8m1((int8_t*)&d, s, GI_SIMD_LEN_BYTE / sizeof(int8_t)); \ | |||||
d; \ | |||||
}) | |||||
#define GiFixLenType2GiInt8Type(s) \ | |||||
__extension__({ \ | |||||
GI_INT8_t d; \ | |||||
d = vle8_v_i8m1((int8_t*)&s, GI_SIMD_LEN_BYTE / sizeof(int8_t)); \ | |||||
d; \ | |||||
}) | |||||
#define GiInt16Type2FixLenType(s) \ | |||||
__extension__({ \ | |||||
GI_INT16_FIXLEN_t d; \ | |||||
vse16_v_i16m1((int16_t*)&d, s, GI_SIMD_LEN_BYTE / sizeof(int16_t)); \ | |||||
d; \ | |||||
}) | |||||
#define GiFixLenType2GiInt16Type(s) \ | |||||
__extension__({ \ | |||||
GI_INT16_t d; \ | |||||
d = vle16_v_i16m1((int16_t*)&s, GI_SIMD_LEN_BYTE / sizeof(int16_t)); \ | |||||
d; \ | |||||
}) | |||||
#define GiInt32Type2FixLenType(s) \ | |||||
__extension__({ \ | |||||
GI_INT32_FIXLEN_t d; \ | |||||
vse32_v_i32m1((int32_t*)&d, s, GI_SIMD_LEN_BYTE / sizeof(int32_t)); \ | |||||
d; \ | |||||
}) | |||||
#define GiFixLenType2GiInt32Type(s) \ | |||||
__extension__({ \ | |||||
GI_INT32_t d; \ | |||||
d = vle32_v_i32m1((int32_t*)&s, GI_SIMD_LEN_BYTE / sizeof(int32_t)); \ | |||||
d; \ | |||||
}) | |||||
#define GiUint32Type2FixLenType(s) \ | |||||
__extension__({ \ | |||||
GI_UINT32_FIXLEN_t d; \ | |||||
vse32_v_u32m1((uint32_t*)&d, s, GI_SIMD_LEN_BYTE / sizeof(uint32_t)); \ | |||||
d; \ | |||||
}) | |||||
#define GiFixLenType2GiUint32Type(s) \ | |||||
__extension__({ \ | |||||
GI_UINT32_t d; \ | |||||
d = vle32_v_u32m1((uint32_t*)&s, GI_SIMD_LEN_BYTE / sizeof(uint32_t)); \ | |||||
d; \ | |||||
}) | |||||
#else | #else | ||||
return Vector1 & Vector2; | |||||
typedef GI_FLOAT32_t GI_FLOAT32_FIXLEN_t; | |||||
typedef GI_FLOAT32_V2_t GI_FLOAT32_FIXLEN_V2_t; | |||||
typedef GI_UINT8_t GI_UINT8_FIXLEN_t; | |||||
typedef GI_INT8_t GI_INT8_FIXLEN_t; | |||||
typedef GI_INT16_t GI_INT16_FIXLEN_t; | |||||
typedef GI_INT32_t GI_INT32_FIXLEN_t; | |||||
typedef GI_UINT32_t GI_UINT32_FIXLEN_t; | |||||
#define GiFloat32Type2FixLenType(s) (s) | |||||
#define GiFixLenType2GiFloat32Type(s) (s) | |||||
#define GiFloat32Type2FixLenV2Type(s) (s) | |||||
#define GiFixLenType2GiFloat32V2Type(s) (s) | |||||
#define GiUint8Type2FixLenType(s) (s) | |||||
#define GiFixLenType2GiUint8Type(s) (s) | |||||
#define GiInt8Type2FixLenType(s) (s) | |||||
#define GiFixLenType2GiInt8Type(s) (s) | |||||
#define GiInt16Type2FixLenType(s) (s) | |||||
#define GiFixLenType2GiInt16Type(s) (s) | |||||
#define GiInt32Type2FixLenType(s) (s) | |||||
#define GiFixLenType2GiInt32Type(s) (s) | |||||
#define GiUint32Type2FixLenType(s) (s) | |||||
#define GiFixLenType2GiUint32Type(s) (s) | |||||
//! get subvector | |||||
#define GiGetSubVectorFloat32V2(s, index) s.val[index] | |||||
#define GiGetSubVectorFloat32V3(s, index) s.val[index] | |||||
#define GiGetSubVectorFloat32V4(s, index) s.val[index] | |||||
#define GiGetSubVectorInt32V2(s, index) s.val[index] | |||||
#define GiGetSubVectorInt32V4(s, index) s.val[index] | |||||
#define GiGetSubVectorInt16V2(s, index) s.val[index] | |||||
#define GiGetSubVectorInt8V2(s, index) s.val[index] | |||||
//! insert subvector | |||||
#define GiSetSubVectorFloat32V2(d, index, s) d.val[index] = s | |||||
#define GiSetSubVectorFloat32V3(d, index, s) d.val[index] = s | |||||
#define GiSetSubVectorFloat32V4(d, index, s) d.val[index] = s | |||||
#define GiSetSubVectorInt32V2(d, index, s) d.val[index] = s | |||||
#define GiSetSubVectorInt32V4(d, index, s) d.val[index] = s | |||||
#define GiSetSubVectorInt16V2(d, index, s) d.val[index] = s | |||||
#define GiSetSubVectorInt8V2(d, index, s) d.val[index] = s | |||||
#endif | #endif | ||||
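//! Usage sketch (illustrative, not part of this patch; the struct and function
//! names are assumptions): persistent state is stored as the fixed-length
//! companion type and converted back to the native vector type at the point of
//! use, so the same code builds for RVV as well as the fixed-length backends.
typedef struct {
    GI_FLOAT32_FIXLEN_t weight;  //! plain, sizeof()-able storage
} GiFixLenUsageExample;

GI_FORCEINLINE GI_FLOAT32_t gi_fixlen_usage_load(GiFixLenUsageExample* p) {
    return GiFixLenType2GiFloat32Type(p->weight);
}

GI_FORCEINLINE void gi_fixlen_usage_store(GiFixLenUsageExample* p, GI_FLOAT32_t v) {
    p->weight = GiFloat32Type2FixLenType(v);
}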
} | |||||
GI_FORCEINLINE | |||||
GI_INT32_t GiOrInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) { | |||||
#define Max(a, b) ((a) > (b) ? (a) : (b))
#define Min(a, b) ((a) < (b) ? (a) : (b))
#if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
return vorrq_s32(Vector1, Vector2); | |||||
#elif defined(GI_SSE2_INTRINSICS) | |||||
return _mm_or_si128(Vector1, Vector2); | |||||
#if defined(__ARM_FEATURE_FMA) && defined(GI_NEON64_INTRINSICS) | |||||
#define v_fma_ps_f32(c, b, a) vfmaq_f32((c), (b), (a)) | |||||
#define v_fma_n_f32(c, b, a) vfmaq_n_f32((c), (b), (a)) | |||||
#define v_fma_lane_f32(c, b, a, lane) vfmaq_lane_f32((c), (b), (a), (lane)) | |||||
#else | #else | ||||
return Vector1 | Vector2; | |||||
#define v_fma_ps_f32(c, b, a) vmlaq_f32((c), (b), (a)) | |||||
#define v_fma_n_f32(c, b, a) vmlaq_n_f32((c), (b), (a)) | |||||
#define v_fma_lane_f32(c, b, a, lane) vmlaq_lane_f32((c), (b), (a), (lane)) | |||||
#endif | |||||
#endif | #endif | ||||
} | |||||
GI_FORCEINLINE | GI_FORCEINLINE | ||||
GI_INT32_t GiAndNotInt32(GI_INT32_t VectorNot, GI_INT32_t Vector) { | |||||
#if defined(GI_NEON_INTRINSICS) | |||||
return vandq_s32(vmvnq_s32(VectorNot), Vector); | |||||
#elif defined(GI_SSE2_INTRINSICS) | |||||
return _mm_andnot_si128(VectorNot, Vector); | |||||
#else | |||||
return (~VectorNot) & Vector; | |||||
enum GiSimdType GiGetSimdType() { | |||||
//! overridden by special macros to ensure CI also tests the naive and SSE2 paths
//! GI_AVX is not implemented yet, so the x64 CI device tests GI_SSE42
//! and the ARM CI device tests GI_NEON
//! ensure GI_SSE2 is tested with:
//! --copt -march=core2 --copt -mno-sse4.2
//! --copt -mno-sse3 --copt -DGI_TEST_SSE2
//! ensure GI_NAIVE is tested with:
//! --copt -DGI_TEST_NAIVE
//! the DNN code needs at least SSE2 on x86,
//! so we can not test GI_NAIVE with
//! --copt -march=core2 --copt -mno-sse4.2
//! --copt -mno-sse3 --copt -mno-sse2
//! --copt -DGI_TEST_NAIVE
//! with CMake, the build flags can be overridden via CMAKE_CXX_FLAGS/CMAKE_C_FLAGS
//! by passing EXTRA_CMAKE_ARGS to scripts/cmake-build/*.sh
#if defined(GI_TEST_NAIVE) | |||||
#undef __gi_simd_type | |||||
#define __gi_simd_type GI_NAIVE | |||||
#elif defined(GI_TEST_SSE2) | |||||
#undef __gi_simd_type | |||||
#define __gi_simd_type GI_SSE2 | |||||
#endif | #endif | ||||
return __gi_simd_type; | |||||
} | } | ||||
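//! Usage sketch (illustrative, not part of this patch; the function name is an
//! assumption): map the backend reported by GiGetSimdType() to a printable
//! name, e.g. for logging which path a test actually exercised.
GI_FORCEINLINE const char* gi_simd_type_name_example(void) {
    switch (GiGetSimdType()) {
        case GI_NEON:
            return "neon";
        case GI_SSE42:
            return "sse4.2";
        case GI_SSE2:
            return "sse2";
        case GI_RVV:
            return "rvv";
        default:
            return "naive/other";
    }
}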
GI_FORCEINLINE | GI_FORCEINLINE | ||||
GI_INT32_t GiXorInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) { | |||||
GI_FLOAT32_t GiBroadcastFloat32(float Value) { | |||||
#if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
return veorq_s32(Vector1, Vector2); | |||||
return vdupq_n_f32(Value); | |||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return _mm_xor_si128(Vector1, Vector2); | |||||
return _mm_set1_ps(Value); | |||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vfmv_v_f_f32m1(Value, GI_SIMD_LEN_BYTE / sizeof(float)); | |||||
#else | #else | ||||
return Vector1 ^ Vector2; | |||||
GI_FLOAT32_t ret; | |||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | |||||
ret[i] = Value; | |||||
} | |||||
return ret; | |||||
#endif | #endif | ||||
} | } | ||||
GI_FORCEINLINE | GI_FORCEINLINE | ||||
GI_FLOAT32_t GiBroadcastFloat32(float Value) { | |||||
GI_INT8_t GiBroadcastInt8(int8_t Value) { | |||||
#if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
return vdupq_n_f32(Value); | |||||
return vdupq_n_s8(Value); | |||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return _mm_set1_ps(Value); | |||||
return _mm_set1_epi8(Value); | |||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vmv_v_x_i8m1(Value, GI_SIMD_LEN_BYTE / sizeof(int8_t)); | |||||
#else | #else | ||||
GI_FLOAT32_t ret; | |||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | |||||
GI_INT8_t ret; | |||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) { | |||||
ret[i] = Value; | ret[i] = Value; | ||||
} | } | ||||
return ret; | return ret; | ||||
@@ -411,6 +644,8 @@ GI_INT32_t GiBroadcastInt32(int32_t Value) { | |||||
return vdupq_n_s32(Value); | return vdupq_n_s32(Value); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return _mm_set1_epi32(Value); | return _mm_set1_epi32(Value); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vmv_v_x_i32m1(Value, GI_SIMD_LEN_BYTE / sizeof(int32_t)); | |||||
#else | #else | ||||
GI_INT32_t ret; | GI_INT32_t ret; | ||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { | for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { | ||||
@@ -421,54 +656,55 @@ GI_INT32_t GiBroadcastInt32(int32_t Value) { | |||||
} | } | ||||
GI_FORCEINLINE | GI_FORCEINLINE | ||||
GI_INT8_t GiBroadcastInt8(int8_t Value) { | |||||
GI_INT32_t GiAndInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) { | |||||
#if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
return vdupq_n_s8(Value); | |||||
return vandq_s32(Vector1, Vector2); | |||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return _mm_set1_epi8(Value); | |||||
return _mm_and_si128(Vector1, Vector2); | |||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vand_vv_i32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(int32_t)); | |||||
#else | #else | ||||
GI_INT8_t ret; | |||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) { | |||||
ret[i] = Value; | |||||
} | |||||
return ret; | |||||
return Vector1 & Vector2; | |||||
#endif | #endif | ||||
} | } | ||||
GI_FORCEINLINE | GI_FORCEINLINE | ||||
GiSimdType GiGetSimdType() { | |||||
//! override by special macro to insure ci have test naive and sse2 | |||||
//! now we do not imp GI_AVX to now and x64 ci device will test GI_SSE42 | |||||
//! now arm ci device will test GI_NEON | |||||
//! insure test GI_SSE2 by command: | |||||
//! --copt -march=core2 --copt -mno-sse4.2 | |||||
//! --copt -mno-sse3 --copt -DGI_TEST_SSE2 | |||||
//! insure test GI_NAIVE by command: | |||||
//! --copt -DGI_TEST_SSE2 | |||||
//! DNN code at least need sse2 at x86 | |||||
//! so we can not test GI_NAIVE by | |||||
//! --copt -march=core2 --copt -mno-sse4.2 | |||||
//! --copt -mno-sse3 --copt -mno-sse2 | |||||
//! --copt -DGI_TEST_NAIVE | |||||
//! about CMake, can override build flags to CMAKE_CXX_FLAGS/CMAKE_C_FLAGS by | |||||
//! EXTRA_CMAKE_ARGS when use scripts/cmake-build/*.sh | |||||
#if defined(GI_TEST_NAIVE) | |||||
#undef __gi_simd_type | |||||
#define __gi_simd_type GI_NAIVE | |||||
#elif defined(GI_TEST_SSE2) | |||||
#undef __gi_simd_type | |||||
#define __gi_simd_type GI_SSE2 | |||||
GI_INT32_t GiOrInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) { | |||||
#if defined(GI_NEON_INTRINSICS) | |||||
return vorrq_s32(Vector1, Vector2); | |||||
#elif defined(GI_SSE2_INTRINSICS) | |||||
return _mm_or_si128(Vector1, Vector2); | |||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vor_vv_i32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(int32_t)); | |||||
#else | |||||
return Vector1 | Vector2; | |||||
#endif | #endif | ||||
return __gi_simd_type; | |||||
} | } | ||||
__attribute__((unused)) const GI_INT8_t vzero_int8 = GiBroadcastInt8(0); | |||||
__attribute__((unused)) const GI_INT32_t vzero = GiBroadcastInt32(0); | |||||
__attribute__((unused)) const GI_FLOAT32_t vfzero = GiBroadcastFloat32(0.0f); | |||||
__attribute__((unused)) const GI_FLOAT32_t vfhalf = GiBroadcastFloat32(0.5f); | |||||
__attribute__((unused)) const GI_FLOAT32_t vfneg_half = GiBroadcastFloat32(-0.5f); | |||||
__attribute__((unused)) const GI_FLOAT32_t vfmin_int8 = GiBroadcastFloat32(-128.0f); | |||||
__attribute__((unused)) const GI_FLOAT32_t vfmax_int8 = GiBroadcastFloat32(127.0f); | |||||
GI_FORCEINLINE | |||||
GI_INT32_t GiAndNotInt32(GI_INT32_t VectorNot, GI_INT32_t Vector) { | |||||
#if defined(GI_NEON_INTRINSICS) | |||||
return vandq_s32(vmvnq_s32(VectorNot), Vector); | |||||
#elif defined(GI_SSE2_INTRINSICS) | |||||
return _mm_andnot_si128(VectorNot, Vector); | |||||
#elif defined(GI_RVV_INTRINSICS) | |||||
GI_INT32_t not_v = vnot_v_i32m1(VectorNot, GI_SIMD_LEN_BYTE / sizeof(int32_t)); | |||||
return vand_vv_i32m1(not_v, Vector, GI_SIMD_LEN_BYTE / sizeof(int32_t)); | |||||
#else | |||||
return (~VectorNot) & Vector; | |||||
#endif | |||||
} | |||||
GI_FORCEINLINE | |||||
GI_INT32_t GiXorInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) { | |||||
#if defined(GI_NEON_INTRINSICS) | |||||
return veorq_s32(Vector1, Vector2); | |||||
#elif defined(GI_SSE2_INTRINSICS) | |||||
return _mm_xor_si128(Vector1, Vector2); | |||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vxor_vv_i32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(int32_t)); | |||||
#else | |||||
return Vector1 ^ Vector2; | |||||
#endif | |||||
} | |||||
// vim: syntax=cpp.doxygen | // vim: syntax=cpp.doxygen |
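//! Usage sketch (illustrative, not part of this patch; the function name is an
//! assumption): a branch-free select built only from the bitwise helpers above,
//! returning a where the mask bits are set and b elsewhere.
GI_FORCEINLINE GI_INT32_t gi_select_int32_example(
        GI_INT32_t mask, GI_INT32_t a, GI_INT32_t b) {
    //! (mask & a) | (~mask & b)
    return GiOrInt32(GiAndInt32(mask, a), GiAndNotInt32(mask, b));
}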
@@ -8,6 +8,8 @@ GI_INT32_t GiReinterpretAsInt32(GI_FLOAT32_t In) { | |||||
return vreinterpretq_s32_f32(In); | return vreinterpretq_s32_f32(In); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return _mm_castps_si128(In); | return _mm_castps_si128(In); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vreinterpret_v_f32m1_i32m1(In); | |||||
#else | #else | ||||
return (GI_INT32_t)In; | return (GI_INT32_t)In; | ||||
#endif | #endif | ||||
@@ -19,6 +21,8 @@ GI_UINT32_t GiReinterpretAsUint32(GI_FLOAT32_t In) { | |||||
return vreinterpretq_u32_f32(In); | return vreinterpretq_u32_f32(In); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return _mm_castps_si128(In); | return _mm_castps_si128(In); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vreinterpret_v_f32m1_u32m1(In); | |||||
#else | #else | ||||
return (GI_UINT32_t)In; | return (GI_UINT32_t)In; | ||||
#endif | #endif | ||||
@@ -30,6 +34,8 @@ GI_FLOAT32_t GiReintInt32ToFloat32(GI_INT32_t Vector) { | |||||
return vreinterpretq_f32_s32(Vector); | return vreinterpretq_f32_s32(Vector); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return _mm_castsi128_ps(Vector); | return _mm_castsi128_ps(Vector); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vreinterpret_v_i32m1_f32m1(Vector); | |||||
#else | #else | ||||
return (GI_FLOAT32_t)Vector; | return (GI_FLOAT32_t)Vector; | ||||
#endif | #endif | ||||
@@ -41,6 +47,8 @@ GI_FLOAT32_t GiReintUint32ToFloat32(GI_UINT32_t Vector) { | |||||
return vreinterpretq_f32_u32(Vector); | return vreinterpretq_f32_u32(Vector); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return _mm_castsi128_ps(Vector); | return _mm_castsi128_ps(Vector); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vreinterpret_v_u32m1_f32m1(Vector); | |||||
#else | #else | ||||
return (GI_FLOAT32_t)Vector; | return (GI_FLOAT32_t)Vector; | ||||
#endif | #endif | ||||
@@ -52,12 +60,18 @@ GI_INT32_t GiRoundAsInt32(GI_FLOAT32_t Vector) { | |||||
#if __ARM_ARCH >= 8 | #if __ARM_ARCH >= 8 | ||||
return vcvtaq_s32_f32(Vector); | return vcvtaq_s32_f32(Vector); | ||||
#else | #else | ||||
float32x4_t vinc0 = vbslq_f32(vcgeq_f32(Vector, vfzero), vfhalf, vfneg_half); | |||||
float32x4_t vinc0 = vbslq_f32( | |||||
vcgeq_f32(Vector, GiBroadcastFloat32(0.0f)), GiBroadcastFloat32(0.5f), | |||||
GiBroadcastFloat32(-0.5f)); | |||||
return vcvtq_s32_f32(vaddq_f32(Vector, vinc0)); | return vcvtq_s32_f32(vaddq_f32(Vector, vinc0)); | ||||
#endif | #endif | ||||
#elif defined(GI_SSE42_INTRINSICS) | #elif defined(GI_SSE42_INTRINSICS) | ||||
__m128 vinc0 = _mm_blendv_ps(vfneg_half, vfhalf, _mm_cmpge_ps(Vector, vfzero)); | |||||
__m128 vinc0 = _mm_blendv_ps( | |||||
GiBroadcastFloat32(-0.5f), GiBroadcastFloat32(0.5f), | |||||
_mm_cmpge_ps(Vector, GiBroadcastFloat32(0.0f))); | |||||
return _mm_cvttps_epi32(_mm_add_ps(Vector, vinc0)); | return _mm_cvttps_epi32(_mm_add_ps(Vector, vinc0)); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vfcvt_x_f_v_i32m1(Vector, GI_SIMD_LEN_BYTE / sizeof(float)); | |||||
#else | #else | ||||
GI_INT32_t ret; | GI_INT32_t ret; | ||||
GI_INT32_NAIVE_t tmp_ret; | GI_INT32_NAIVE_t tmp_ret; | ||||
@@ -77,6 +91,16 @@ GI_INT32_t GiCastToInt32(GI_FLOAT32_t Vector) { | |||||
return vcvtq_s32_f32(Vector); | return vcvtq_s32_f32(Vector); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return _mm_cvttps_epi32(Vector); | return _mm_cvttps_epi32(Vector); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
//! TODO: vfcvt_rtz_x_f_v_i32m1 is an RVV 1.0 API and the XuanTie D1 only
//! supports RVV 0.7, so as a workaround we implement this API naively
//! return vfcvt_rtz_x_f_v_i32m1(Vector, GI_SIMD_LEN_BYTE / sizeof(float)); | |||||
GI_FLOAT32_FIXLEN_t src = GiFloat32Type2FixLenType(Vector); | |||||
GI_INT32_FIXLEN_t ret; | |||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | |||||
ret[i] = (int32_t)(src[i]); | |||||
} | |||||
return GiFixLenType2GiInt32Type(ret); | |||||
#else | #else | ||||
GI_INT32_t ret; | GI_INT32_t ret; | ||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | ||||
@@ -92,6 +116,8 @@ GI_FLOAT32_t GiCastToFloat32(GI_INT32_t Vector) { | |||||
return vcvtq_f32_s32(Vector); | return vcvtq_f32_s32(Vector); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return _mm_cvtepi32_ps(Vector); | return _mm_cvtepi32_ps(Vector); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vfcvt_f_x_v_f32m1(Vector, GI_SIMD_LEN_BYTE / sizeof(float)); | |||||
#else | #else | ||||
GI_FLOAT32_t ret; | GI_FLOAT32_t ret; | ||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { | for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { | ||||
@@ -107,6 +133,8 @@ GI_FLOAT32_t GiLoadBroadcastFloat32(const float* Value) { | |||||
return vld1q_dup_f32(Value); | return vld1q_dup_f32(Value); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return _mm_load_ps1(Value); | return _mm_load_ps1(Value); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return GiBroadcastFloat32(*Value); | |||||
#else | #else | ||||
GI_FLOAT32_t ret; | GI_FLOAT32_t ret; | ||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | ||||
@@ -136,6 +164,8 @@ GI_FLOAT32_t GiLoadFloat32(const float* Buffer) { | |||||
return _mm_load_ps(Buffer); | return _mm_load_ps(Buffer); | ||||
else | else | ||||
return _mm_loadu_ps(Buffer); | return _mm_loadu_ps(Buffer); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vle32_v_f32m1(Buffer, GI_SIMD_LEN_BYTE / sizeof(float)); | |||||
#else | #else | ||||
GI_FLOAT32_t ret; | GI_FLOAT32_t ret; | ||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | ||||
@@ -151,8 +181,9 @@ GI_FLOAT32_V2_t GiLoadFloat32V2(const float* Buffer) { | |||||
return vld1q_f32_x2(Buffer); | return vld1q_f32_x2(Buffer); | ||||
#else | #else | ||||
GI_FLOAT32_V2_t v; | GI_FLOAT32_V2_t v; | ||||
v.val[0] = GiLoadFloat32(Buffer); | |||||
v.val[1] = GiLoadFloat32(Buffer + GI_SIMD_LEN_BYTE / sizeof(float)); | |||||
GiSetSubVectorFloat32V2(v, 0, GiLoadFloat32(Buffer)); | |||||
GiSetSubVectorFloat32V2( | |||||
v, 1, GiLoadFloat32(Buffer + GI_SIMD_LEN_BYTE / sizeof(float))); | |||||
return v; | return v; | ||||
#endif | #endif | ||||
@@ -171,6 +202,8 @@ GI_FLOAT32_t GiLoadFloat32LowHalf(const float* Buffer) { | |||||
high.m64_f32[1] = 0; | high.m64_f32[1] = 0; | ||||
__m128i res = _mm_unpacklo_epi64(_pM128i(low), _pM128i(high)); | __m128i res = _mm_unpacklo_epi64(_pM128i(low), _pM128i(high)); | ||||
return _M128(res); | return _M128(res); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vle32_v_f32m1(Buffer, GI_SIMD_LEN_BYTE / sizeof(float) / 2); | |||||
#else | #else | ||||
GI_FLOAT32_t ret; | GI_FLOAT32_t ret; | ||||
memset(&ret, 0, sizeof(GI_FLOAT32_t)); | memset(&ret, 0, sizeof(GI_FLOAT32_t)); | ||||
@@ -194,6 +227,8 @@ GI_FLOAT32_t GiMlaqFloat32(GI_FLOAT32_t a, GI_FLOAT32_t b, GI_FLOAT32_t c) { | |||||
__m128 res; | __m128 res; | ||||
res = _mm_mul_ps(c, b); | res = _mm_mul_ps(c, b); | ||||
return _mm_add_ps(a, res); | return _mm_add_ps(a, res); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vfmadd_vv_f32m1(b, c, a, GI_SIMD_LEN_BYTE / sizeof(float)); | |||||
#else | #else | ||||
GI_FLOAT32_t ret; | GI_FLOAT32_t ret; | ||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | ||||
@@ -211,6 +246,14 @@ GI_FORCEINLINE GI_FLOAT32_V2_t GiUzpqFloat32(GI_FLOAT32_t a, GI_FLOAT32_t b) { | |||||
v32x4.val[0] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0)); | v32x4.val[0] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0)); | ||||
v32x4.val[1] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1)); | v32x4.val[1] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1)); | ||||
return v32x4; | return v32x4; | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
//! may need optimization: de-interleave by spilling a and b to a contiguous
//! buffer and reloading them with a two-field segment load (vlseg2e32)
float tmp[GI_SIMD_LEN_BYTE / sizeof(float) * 2] = {0}; | |||||
vse32_v_f32m1(tmp, a, GI_SIMD_LEN_BYTE / sizeof(float)); | |||||
vse32_v_f32m1( | |||||
tmp + GI_SIMD_LEN_BYTE / sizeof(float), b, | |||||
GI_SIMD_LEN_BYTE / sizeof(float)); | |||||
return vlseg2e32_v_f32m1x2(tmp, GI_SIMD_LEN_BYTE / sizeof(float)); | |||||
#else | #else | ||||
GI_FLOAT32_V2_t ret; | GI_FLOAT32_V2_t ret; | ||||
ret.val[0][0] = a[0]; | ret.val[0][0] = a[0]; | ||||
@@ -233,6 +276,8 @@ GI_FORCEINLINE float32x2_t GiDupFloat32(float a) { | |||||
res.m64_f32[0] = a; | res.m64_f32[0] = a; | ||||
res.m64_f32[1] = a; | res.m64_f32[1] = a; | ||||
return res; | return res; | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return GiBroadcastFloat32(a); | |||||
#else | #else | ||||
float32x2_t res; | float32x2_t res; | ||||
res[0] = a; | res[0] = a; | ||||
@@ -249,6 +294,8 @@ GI_FORCEINLINE float32x2_t GiLdFloat32(float const* ptr) { | |||||
res.m64_f32[0] = *(ptr); | res.m64_f32[0] = *(ptr); | ||||
res.m64_f32[1] = *(ptr + 1); | res.m64_f32[1] = *(ptr + 1); | ||||
return res; | return res; | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vle32_v_f32m1(ptr, 2); | |||||
#else | #else | ||||
float32x2_t res; | float32x2_t res; | ||||
res[0] = *(ptr); | res[0] = *(ptr); | ||||
@@ -266,6 +313,8 @@ GI_FORCEINLINE float32x2_t GiAddDFloat32(float32x2_t a, float32x2_t b) { | |||||
res = _mm_add_ps(_pM128(a), _pM128(b)); // SSE, use only low 64 bits | res = _mm_add_ps(_pM128(a), _pM128(b)); // SSE, use only low 64 bits | ||||
_M64f(res64, res); | _M64f(res64, res); | ||||
return res64; | return res64; | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vfadd_vv_f32m1(a, b, 2); | |||||
#else | #else | ||||
float32x2_t res; | float32x2_t res; | ||||
res[0] = a[0] + b[0]; | res[0] = a[0] + b[0]; | ||||
@@ -280,6 +329,10 @@ GI_FORCEINLINE float32x2_t GiAddDFloat32(float32x2_t a, float32x2_t b) { | |||||
GI_FORCEINLINE float __gi_vget_lane_f32(float32x2_t v, const int lane) { | GI_FORCEINLINE float __gi_vget_lane_f32(float32x2_t v, const int lane) { | ||||
#if defined(GI_SSE2_INTRINSICS) | #if defined(GI_SSE2_INTRINSICS) | ||||
return _sse_vget_lane_f32(v, lane); | return _sse_vget_lane_f32(v, lane); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
float ret[2]; | |||||
vse32_v_f32m1(ret, v, 2); | |||||
return ret[lane]; | |||||
#else | #else | ||||
return v[lane]; | return v[lane]; | ||||
#endif | #endif | ||||
@@ -297,6 +350,11 @@ __gi_vset_lane_f32(float32_t value, float32x2_t vec, int lane) { | |||||
res = vec; | res = vec; | ||||
res.m64_f32[lane] = value; | res.m64_f32[lane] = value; | ||||
return res; | return res; | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
float tmp[2]; | |||||
vse32_v_f32m1(tmp, vec, 2); | |||||
tmp[lane] = value; | |||||
return vle32_v_f32m1(tmp, 2); | |||||
#else | #else | ||||
float32x2_t res; | float32x2_t res; | ||||
res = vec; | res = vec; | ||||
@@ -314,6 +372,8 @@ GI_FORCEINLINE void GiSt1Float32(float* ptr, float32x2_t val) { | |||||
*(ptr) = val.m64_f32[0]; | *(ptr) = val.m64_f32[0]; | ||||
*(ptr + 1) = val.m64_f32[1]; | *(ptr + 1) = val.m64_f32[1]; | ||||
return; | return; | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vse32_v_f32m1(ptr, val, 2); | |||||
#else | #else | ||||
*(ptr) = val[0]; | *(ptr) = val[0]; | ||||
*(ptr + 1) = val[1]; | *(ptr + 1) = val[1]; | ||||
@@ -330,6 +390,8 @@ GI_FORCEINLINE GI_FLOAT32_V2_t GiLd2qFloat32(const float* Buffer) { | |||||
v.val[1] = GiLoadFloat32((Buffer + 4)); | v.val[1] = GiLoadFloat32((Buffer + 4)); | ||||
v = GiUzpqFloat32(v.val[0], v.val[1]); | v = GiUzpqFloat32(v.val[0], v.val[1]); | ||||
return v; | return v; | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vlseg2e32_v_f32m1x2(Buffer, GI_SIMD_LEN_BYTE / sizeof(float)); | |||||
#else | #else | ||||
GI_FLOAT32_V2_t ret; | GI_FLOAT32_V2_t ret; | ||||
ret.val[0][0] = Buffer[0]; | ret.val[0][0] = Buffer[0]; | ||||
@@ -351,6 +413,16 @@ GI_FORCEINLINE GI_FLOAT32_V2_t GiLd2qFloat32(const float* Buffer) { | |||||
#else | #else | ||||
GI_FORCEINLINE GI_FLOAT32_t | GI_FORCEINLINE GI_FLOAT32_t | ||||
__naive_gi_vextq_f32(GI_FLOAT32_t a, GI_FLOAT32_t b, const int n) { | __naive_gi_vextq_f32(GI_FLOAT32_t a, GI_FLOAT32_t b, const int n) { | ||||
#if defined(GI_RVV_INTRINSICS) | |||||
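//! spill to a temporary buffer: keep the top (vl - n) lanes of a followed by
//! the low n lanes of b, then reload the combined result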
int t_count = GI_SIMD_LEN_BYTE / sizeof(float); | |||||
int a_count = t_count - n; | |||||
float tmp[GI_SIMD_LEN_BYTE / sizeof(float)]; | |||||
float tmp_a[GI_SIMD_LEN_BYTE / sizeof(float)]; | |||||
vse32_v_f32m1(tmp_a, a, GI_SIMD_LEN_BYTE / sizeof(float)); | |||||
memcpy(tmp, tmp_a + n, a_count * sizeof(float)); | |||||
vse32_v_f32m1(tmp + a_count, b, n); | |||||
return vle32_v_f32m1(tmp, GI_SIMD_LEN_BYTE / sizeof(float)); | |||||
#else | |||||
GI_FLOAT32_t ret; | GI_FLOAT32_t ret; | ||||
int t_count = GI_SIMD_LEN_BYTE / sizeof(float); | int t_count = GI_SIMD_LEN_BYTE / sizeof(float); | ||||
int a_count = t_count - n; | int a_count = t_count - n; | ||||
@@ -361,6 +433,7 @@ __naive_gi_vextq_f32(GI_FLOAT32_t a, GI_FLOAT32_t b, const int n) { | |||||
ret[i + a_count] = b[i]; | ret[i + a_count] = b[i]; | ||||
} | } | ||||
return ret; | return ret; | ||||
#endif | |||||
} | } | ||||
#define GiExtqFloat32(a, b, n) __naive_gi_vextq_f32(a, b, n) | #define GiExtqFloat32(a, b, n) __naive_gi_vextq_f32(a, b, n) | ||||
#endif | #endif | ||||
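//! Usage sketch (illustrative, not part of this patch; the function name is an
//! assumption): one register worth of fused multiply-accumulate written only in
//! terms of the helpers above, so it lowers to NEON, SSE2 or RVV automatically.
GI_FORCEINLINE GI_FLOAT32_t gi_fma_block_example(
        GI_FLOAT32_t acc, const float* x, const float* y) {
    //! returns acc[i] + x[i] * y[i] for each lane
    return GiMlaqFloat32(acc, GiLoadFloat32(x), GiLoadFloat32(y));
}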
@@ -372,6 +445,9 @@ GI_FLOAT32_t GiMultiplySubFloat32( | |||||
return vmlsq_f32(VectorSum, Vector1, Vector2); | return vmlsq_f32(VectorSum, Vector1, Vector2); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return _mm_sub_ps(VectorSum, _mm_mul_ps(Vector1, Vector2)); | return _mm_sub_ps(VectorSum, _mm_mul_ps(Vector1, Vector2)); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vfnmsub_vv_f32m1( | |||||
Vector1, Vector2, VectorSum, GI_SIMD_LEN_BYTE / sizeof(float)); | |||||
#else | #else | ||||
GI_FLOAT32_t ret; | GI_FLOAT32_t ret; | ||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | ||||
@@ -449,6 +525,12 @@ __gi_vld1q_lane_f32(const float* Buffer, GI_FLOAT32_t src, const int n) { | |||||
GI_FLOAT32_t p; | GI_FLOAT32_t p; | ||||
p = _mm_set1_ps(*(Buffer)); | p = _mm_set1_ps(*(Buffer)); | ||||
return _MM_INSERT_PS(src, p, _INSERTPS_NDX(0, n)); | return _MM_INSERT_PS(src, p, _INSERTPS_NDX(0, n)); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
//! a masked implementation would need more instructions, so spill to a
//! temporary buffer and patch the requested lane instead
float tmp[GI_SIMD_LEN_BYTE / sizeof(float)]; | |||||
vse32_v_f32m1(tmp, src, GI_SIMD_LEN_BYTE / sizeof(float)); | |||||
tmp[n] = *Buffer; | |||||
return vle32_v_f32m1(tmp, GI_SIMD_LEN_BYTE / sizeof(float)); | |||||
#else | #else | ||||
GI_FLOAT32_t ret; | GI_FLOAT32_t ret; | ||||
memcpy(&ret, &src, sizeof(GI_FLOAT32_t)); | memcpy(&ret, &src, sizeof(GI_FLOAT32_t)); | ||||
@@ -479,11 +561,20 @@ __gi_vsetq_lane_f32(float value, GI_FLOAT32_t vec, const int lane) { | |||||
#else | #else | ||||
GI_FORCEINLINE GI_FLOAT32_t __naive_gi_vmlaq_lane_f32_high_half( | GI_FORCEINLINE GI_FLOAT32_t __naive_gi_vmlaq_lane_f32_high_half( | ||||
GI_FLOAT32_t a, GI_FLOAT32_t b, GI_FLOAT32_t v, const int lane) { | GI_FLOAT32_t a, GI_FLOAT32_t b, GI_FLOAT32_t v, const int lane) { | ||||
#if defined(GI_RVV_INTRINSICS) | |||||
float tmp[GI_SIMD_LEN_BYTE / sizeof(float)]; | |||||
vse32_v_f32m1(tmp, v, GI_SIMD_LEN_BYTE / sizeof(float)); | |||||
return vfmadd_vf_f32m1( | |||||
b, tmp[lane + GI_SIMD_LEN_BYTE / sizeof(float) / 2], a, | |||||
GI_SIMD_LEN_BYTE / sizeof(float)); | |||||
#else | |||||
GI_FLOAT32_t ret; | GI_FLOAT32_t ret; | ||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | ||||
ret[i] = a[i] + (b[i] * v[lane + 2]); | |||||
ret[i] = a[i] + (b[i] * v[lane + GI_SIMD_LEN_BYTE / sizeof(float) / 2]); | |||||
} | } | ||||
return ret; | return ret; | ||||
#endif | |||||
} | } | ||||
#define GiMlaqLaneFloat32HighHalf(a, b, v, lane) \ | #define GiMlaqLaneFloat32HighHalf(a, b, v, lane) \ | ||||
__naive_gi_vmlaq_lane_f32_high_half(a, b, v, lane) | __naive_gi_vmlaq_lane_f32_high_half(a, b, v, lane) | ||||
@@ -498,11 +589,18 @@ GI_FORCEINLINE GI_FLOAT32_t __naive_gi_vmlaq_lane_f32_high_half( | |||||
#else | #else | ||||
GI_FORCEINLINE GI_FLOAT32_t __naive_gi_vmlaq_lane_f32_low_half( | GI_FORCEINLINE GI_FLOAT32_t __naive_gi_vmlaq_lane_f32_low_half( | ||||
GI_FLOAT32_t a, GI_FLOAT32_t b, GI_FLOAT32_t v, const int lane) { | GI_FLOAT32_t a, GI_FLOAT32_t b, GI_FLOAT32_t v, const int lane) { | ||||
#if defined(GI_RVV_INTRINSICS) | |||||
float tmp[GI_SIMD_LEN_BYTE / sizeof(float) / 2]; | |||||
vse32_v_f32m1(tmp, v, GI_SIMD_LEN_BYTE / sizeof(float) / 2); | |||||
return vfmadd_vf_f32m1(b, tmp[lane], a, GI_SIMD_LEN_BYTE / sizeof(float)); | |||||
#else | |||||
GI_FLOAT32_t ret; | GI_FLOAT32_t ret; | ||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | ||||
ret[i] = a[i] + (b[i] * v[lane]); | ret[i] = a[i] + (b[i] * v[lane]); | ||||
} | } | ||||
return ret; | return ret; | ||||
#endif | |||||
} | } | ||||
#define GiVmlaqLaneFloat32LowHalf(a, b, v, lane) \ | #define GiVmlaqLaneFloat32LowHalf(a, b, v, lane) \ | ||||
__naive_gi_vmlaq_lane_f32_low_half(a, b, v, lane) | __naive_gi_vmlaq_lane_f32_low_half(a, b, v, lane) | ||||
@@ -514,6 +612,8 @@ void GiStoreFloat32(float* Buffer, GI_FLOAT32_t Vector) { | |||||
vst1q_f32(Buffer, Vector); | vst1q_f32(Buffer, Vector); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
_mm_storeu_ps(Buffer, Vector); | _mm_storeu_ps(Buffer, Vector); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
vse32_v_f32m1(Buffer, Vector, GI_SIMD_LEN_BYTE / sizeof(float)); | |||||
#else | #else | ||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | ||||
Buffer[i] = Vector[i]; | Buffer[i] = Vector[i]; | ||||
@@ -526,8 +626,10 @@ void GiStoreFloat32V2(float* Buffer, GI_FLOAT32_V2_t Vector) { | |||||
#if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
vst1q_f32_x2(Buffer, Vector); | vst1q_f32_x2(Buffer, Vector); | ||||
#else | #else | ||||
GiStoreFloat32(Buffer, Vector.val[0]); | |||||
GiStoreFloat32(Buffer + GI_SIMD_LEN_BYTE / sizeof(float), Vector.val[1]); | |||||
GiStoreFloat32(Buffer, GiGetSubVectorFloat32V2(Vector, 0)); | |||||
GiStoreFloat32( | |||||
Buffer + GI_SIMD_LEN_BYTE / sizeof(float), | |||||
GiGetSubVectorFloat32V2(Vector, 1)); | |||||
#endif | #endif | ||||
} | } | ||||
@@ -543,6 +645,14 @@ void GiStoreFloat32V2(float* Buffer, GI_FLOAT32_V2_t Vector) { | |||||
GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32_t Vector) { \ | GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32_t Vector) { \ | ||||
_mm_store_ss(Buffer, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(i, i, i, i))); \ | _mm_store_ss(Buffer, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(i, i, i, i))); \ | ||||
} | } | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
#define GISTORELANEFLOAT32(i) \ | |||||
GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32_t Vector) { \ | |||||
float tmp[GI_SIMD_LEN_BYTE / sizeof(float)]; \ | |||||
vse32_v_f32m1(tmp, Vector, GI_SIMD_LEN_BYTE / sizeof(float)); \ | |||||
*Buffer = tmp[i]; \ | |||||
} | |||||
#else | #else | ||||
#define GISTORELANEFLOAT32(i) \ | #define GISTORELANEFLOAT32(i) \ | ||||
GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32_t Vector) { \ | GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32_t Vector) { \ | ||||
@@ -568,6 +678,14 @@ GISTORELANEFLOAT32(3) | |||||
GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32_t Vector) { \ | GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32_t Vector) { \ | ||||
return _mm_cvtss_f32(_mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(i, i, i, i))); \ | return _mm_cvtss_f32(_mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(i, i, i, i))); \ | ||||
} | } | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
#define GIEXTRACTLANEFLOAT32(i) \ | |||||
GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32_t Vector) { \ | |||||
float tmp[GI_SIMD_LEN_BYTE / sizeof(float)]; \ | |||||
vse32_v_f32m1(tmp, Vector, GI_SIMD_LEN_BYTE / sizeof(float)); \ | |||||
return tmp[i]; \ | |||||
} | |||||
#else | #else | ||||
#define GIEXTRACTLANEFLOAT32(i) \ | #define GIEXTRACTLANEFLOAT32(i) \ | ||||
GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32_t Vector) { \ | GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32_t Vector) { \ | ||||
@@ -590,6 +708,26 @@ GI_FLOAT32_V2_t GiZipqFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { | |||||
f32x4.val[0] = _mm_unpacklo_ps(Vector1, Vector2); | f32x4.val[0] = _mm_unpacklo_ps(Vector1, Vector2); | ||||
f32x4.val[1] = _mm_unpackhi_ps(Vector1, Vector2); | f32x4.val[1] = _mm_unpackhi_ps(Vector1, Vector2); | ||||
return f32x4; | return f32x4; | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
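//! interleave by concatenating Vector1/Vector2 into one m2 register and
//! gathering with the index pattern {0, 4, 1, 5, ...}, which yields
//! {a0, b0, a1, b1, ...} split back into two m1 halves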
vfloat32m2_t d = vundefined_f32m2(); | |||||
d = vset_v_f32m1_f32m2(d, 0, Vector1); | |||||
d = vset_v_f32m1_f32m2(d, 1, Vector2); | |||||
vuint32m2_t index; | |||||
#if GI_SIMD_LEN_BYTE == 16 | |||||
uint32_t index_128[8] = {0, 4, 1, 5, 2, 6, 3, 7}; | |||||
index = vle32_v_u32m2(index_128, 8); | |||||
#else | |||||
uint32_t* index_p = (uint32_t*)&index; | |||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | |||||
index_p[2 * i] = i; | |||||
index_p[2 * i + 1] = i + GI_SIMD_LEN_BYTE / sizeof(float); | |||||
} | |||||
#endif | |||||
vfloat32m2_t g_d = | |||||
vrgather_vv_f32m2(d, index, GI_SIMD_LEN_BYTE / sizeof(float) * 2); | |||||
vfloat32m1_t v0 = vget_v_f32m2_f32m1(g_d, 0); | |||||
vfloat32m1_t v1 = vget_v_f32m2_f32m1(g_d, 1); | |||||
return vcreate_f32m1x2(v0, v1); | |||||
#else | #else | ||||
GI_FLOAT32_V2_t ret; | GI_FLOAT32_V2_t ret; | ||||
ret.val[0][0] = Vector1[0]; | ret.val[0][0] = Vector1[0]; | ||||
@@ -610,9 +748,11 @@ void GiStoreZipFloat32V2(float* Buffer, GI_FLOAT32_V2_t Vector) { | |||||
vst2q_f32(Buffer, Vector); | vst2q_f32(Buffer, Vector); | ||||
#else | #else | ||||
GI_FLOAT32_V2_t tmp; | GI_FLOAT32_V2_t tmp; | ||||
tmp = GiZipqFloat32(Vector.val[0], Vector.val[1]); | |||||
GiStoreFloat32(Buffer, tmp.val[0]); | |||||
GiStoreFloat32(Buffer + GI_SIMD_LEN_BYTE / sizeof(float), tmp.val[1]); | |||||
tmp = GiZipqFloat32( | |||||
GiGetSubVectorFloat32V2(Vector, 0), GiGetSubVectorFloat32V2(Vector, 1)); | |||||
GiStoreFloat32(Buffer, GiGetSubVectorFloat32V2(tmp, 0)); | |||||
GiStoreFloat32( | |||||
Buffer + GI_SIMD_LEN_BYTE / sizeof(float), GiGetSubVectorFloat32V2(tmp, 1)); | |||||
#endif | #endif | ||||
} | } | ||||
@@ -625,6 +765,24 @@ GI_FLOAT32_t GiInterleaveLowFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) | |||||
return zipped.val[0]; | return zipped.val[0]; | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return _mm_unpacklo_ps(Vector1, Vector2); | return _mm_unpacklo_ps(Vector1, Vector2); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
vfloat32m2_t d = vundefined_f32m2(); | |||||
d = vset_v_f32m1_f32m2(d, 0, Vector1); | |||||
d = vset_v_f32m1_f32m2(d, 1, Vector2); | |||||
vuint32m2_t index; | |||||
#if GI_SIMD_LEN_BYTE == 16 | |||||
uint32_t index_128[4] = {0, 4, 1, 5}; | |||||
index = vle32_v_u32m2(index_128, 4); | |||||
#else | |||||
uint32_t* index_p = (uint32_t*)&index; | |||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float) / 2; i++) { | |||||
index_p[2 * i] = i; | |||||
index_p[2 * i + 1] = i + GI_SIMD_LEN_BYTE / sizeof(float); | |||||
} | |||||
#endif | |||||
vfloat32m2_t g_d = | |||||
vrgather_vv_f32m2(d, index, GI_SIMD_LEN_BYTE / sizeof(float) * 2); | |||||
return vget_v_f32m2_f32m1(g_d, 0); | |||||
#else | #else | ||||
GI_FLOAT32_t ret; | GI_FLOAT32_t ret; | ||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(float); i++) { | for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(float); i++) { | ||||
@@ -644,6 +802,24 @@ GI_FLOAT32_t GiInterleaveHighFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) | |||||
return zipped.val[1]; | return zipped.val[1]; | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return _mm_unpackhi_ps(Vector1, Vector2); | return _mm_unpackhi_ps(Vector1, Vector2); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
vfloat32m2_t d = vundefined_f32m2(); | |||||
d = vset_v_f32m1_f32m2(d, 0, Vector1); | |||||
d = vset_v_f32m1_f32m2(d, 1, Vector2); | |||||
vuint32m2_t index; | |||||
#if GI_SIMD_LEN_BYTE == 16 | |||||
uint32_t index_128[8] = {0, 4, 1, 5, 2, 6, 3, 7}; | |||||
index = vle32_v_u32m2(index_128, 8); | |||||
#else | |||||
uint32_t* index_p = (uint32_t*)&index; | |||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | |||||
index_p[2 * i] = i; | |||||
index_p[2 * i + 1] = i + GI_SIMD_LEN_BYTE / sizeof(float); | |||||
} | |||||
#endif | |||||
vfloat32m2_t g_d = | |||||
vrgather_vv_f32m2(d, index, GI_SIMD_LEN_BYTE / sizeof(float) * 2); | |||||
return vget_v_f32m2_f32m1(g_d, 1); | |||||
#else | #else | ||||
GI_FLOAT32_t ret; | GI_FLOAT32_t ret; | ||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(float); i++) { | for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(float); i++) { | ||||
@@ -660,6 +836,8 @@ GI_FLOAT32_t GiAddFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { | |||||
return vaddq_f32(Vector1, Vector2); | return vaddq_f32(Vector1, Vector2); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return _mm_add_ps(Vector1, Vector2); | return _mm_add_ps(Vector1, Vector2); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vfadd_vv_f32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(float)); | |||||
#else | #else | ||||
return Vector1 + Vector2; | return Vector1 + Vector2; | ||||
#endif | #endif | ||||
@@ -671,6 +849,8 @@ GI_FLOAT32_t GiSubtractFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { | |||||
return vsubq_f32(Vector1, Vector2); | return vsubq_f32(Vector1, Vector2); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return _mm_sub_ps(Vector1, Vector2); | return _mm_sub_ps(Vector1, Vector2); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vfsub_vv_f32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(float)); | |||||
#else | #else | ||||
return Vector1 - Vector2; | return Vector1 - Vector2; | ||||
#endif | #endif | ||||
@@ -682,6 +862,8 @@ GI_FLOAT32_t GiMultiplyFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { | |||||
return vmulq_f32(Vector1, Vector2); | return vmulq_f32(Vector1, Vector2); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return _mm_mul_ps(Vector1, Vector2); | return _mm_mul_ps(Vector1, Vector2); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vfmul_vv_f32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(float)); | |||||
#else | #else | ||||
return Vector1 * Vector2; | return Vector1 * Vector2; | ||||
#endif | #endif | ||||
@@ -694,6 +876,8 @@ GI_FLOAT32_t GiMultiplyScalerFloat32(GI_FLOAT32_t Vector1, float Scaler) { | |||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
GI_FLOAT32_t Vector2 = _mm_set1_ps(Scaler); | GI_FLOAT32_t Vector2 = _mm_set1_ps(Scaler); | ||||
return _mm_mul_ps(Vector1, Vector2); | return _mm_mul_ps(Vector1, Vector2); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vfmul_vf_f32m1(Vector1, Scaler, GI_SIMD_LEN_BYTE / sizeof(float)); | |||||
#else | #else | ||||
return Vector1 * Scaler; | return Vector1 * Scaler; | ||||
#endif | #endif | ||||
@@ -708,6 +892,9 @@ GI_FLOAT32_t GiMultiplyAddFloat32( | |||||
return _mm_fmadd_ps(Vector1, Vector2, VectorSum); | return _mm_fmadd_ps(Vector1, Vector2, VectorSum); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return _mm_add_ps(_mm_mul_ps(Vector1, Vector2), VectorSum); | return _mm_add_ps(_mm_mul_ps(Vector1, Vector2), VectorSum); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vfmadd_vv_f32m1( | |||||
Vector1, Vector2, VectorSum, GI_SIMD_LEN_BYTE / sizeof(float)); | |||||
#else | #else | ||||
return Vector1 * Vector2 + VectorSum; | return Vector1 * Vector2 + VectorSum; | ||||
#endif | #endif | ||||
@@ -720,6 +907,8 @@ GI_FLOAT32_t GiMultiplyAddScalarFloat32( | |||||
return v_fma_n_f32(VectorSum, Vector, Scalar); | return v_fma_n_f32(VectorSum, Vector, Scalar); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return GiMultiplyAddFloat32(VectorSum, GiBroadcastFloat32(Scalar), Vector); | return GiMultiplyAddFloat32(VectorSum, GiBroadcastFloat32(Scalar), Vector); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vfmadd_vf_f32m1(Vector, Scalar, VectorSum, GI_SIMD_LEN_BYTE / sizeof(float)); | |||||
#else | #else | ||||
return VectorSum + Vector * Scalar; | return VectorSum + Vector * Scalar; | ||||
#endif | #endif | ||||
@@ -767,6 +956,8 @@ GI_FLOAT32_t GiDivideFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { | |||||
return vmulq_f32(Vector1, recp); | return vmulq_f32(Vector1, recp); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return _mm_div_ps(Vector1, Vector2); | return _mm_div_ps(Vector1, Vector2); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vfdiv_vv_f32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(float)); | |||||
#else | #else | ||||
return Vector1 / Vector2; | return Vector1 / Vector2; | ||||
#endif | #endif | ||||
@@ -779,6 +970,9 @@ GI_FLOAT32_t GiRecpeSFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { | |||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
GI_FLOAT32_t two = _mm_set1_ps(2.0f); | GI_FLOAT32_t two = _mm_set1_ps(2.0f); | ||||
return _mm_sub_ps(two, _mm_mul_ps(Vector1, Vector2)); | return _mm_sub_ps(two, _mm_mul_ps(Vector1, Vector2)); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
GI_FLOAT32_t two = GiBroadcastFloat32(2.0f); | |||||
return vfnmsub_vv_f32m1(Vector1, Vector2, two, GI_SIMD_LEN_BYTE / sizeof(float)); | |||||
#else | #else | ||||
return (2.0f - Vector1 * Vector2); | return (2.0f - Vector1 * Vector2); | ||||
#endif | #endif | ||||
@@ -791,6 +985,9 @@ GI_FLOAT32_t GiRecpeFloat32(GI_FLOAT32_t Vector) { | |||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
GI_FLOAT32_t ones = _mm_set1_ps(1.0f); | GI_FLOAT32_t ones = _mm_set1_ps(1.0f); | ||||
return _mm_div_ps(ones, Vector); | return _mm_div_ps(ones, Vector); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
GI_FLOAT32_t ones = GiBroadcastFloat32(1.0f); | |||||
return vfdiv_vv_f32m1(ones, Vector, GI_SIMD_LEN_BYTE / sizeof(float)); | |||||
#else | #else | ||||
//! FIXME: the NEON and SSE reciprocal estimates always have lower accuracy than 1/x
return 1 / Vector; | return 1 / Vector; | ||||
@@ -804,6 +1001,8 @@ GI_FLOAT32_t GiNegFloat32(GI_FLOAT32_t Vector) { | |||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
GI_FLOAT32_t zero = _mm_set1_ps(0.0f); | GI_FLOAT32_t zero = _mm_set1_ps(0.0f); | ||||
return _mm_sub_ps(zero, Vector); | return _mm_sub_ps(zero, Vector); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vfneg_v_f32m1(Vector, GI_SIMD_LEN_BYTE / sizeof(float)); | |||||
#else | #else | ||||
return -Vector; | return -Vector; | ||||
#endif | #endif | ||||
@@ -815,6 +1014,12 @@ GI_UINT32_t GiGreaterThanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { | |||||
return vcgtq_f32(Vector1, Vector2); | return vcgtq_f32(Vector1, Vector2); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return _mm_castps_si128(_mm_cmpgt_ps(Vector1, Vector2)); | return _mm_castps_si128(_mm_cmpgt_ps(Vector1, Vector2)); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
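//! note: this relies on the RVV 0.7 mask layout, where each element's mask bit | |||||
//! occupies the low bit of its own 32-bit field, so the raw copy yields 0/1 per | |||||
//! lane and vneg turns 1 into 0xFFFFFFFF | |||||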
vbool32_t b = | |||||
vmfgt_vv_f32m1_b32(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(float)); | |||||
GI_UINT32_t ret; | |||||
memcpy(&ret, &b, GI_SIMD_LEN_BYTE); | |||||
return vneg_v_u32m1(ret, GI_SIMD_LEN_BYTE / sizeof(float)); | |||||
#else | #else | ||||
GI_UINT32_t ret; | GI_UINT32_t ret; | ||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | ||||
@@ -830,6 +1035,12 @@ GI_UINT32_t GiLessThanEqFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { | |||||
return vcleq_f32(Vector1, Vector2); | return vcleq_f32(Vector1, Vector2); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return _mm_castps_si128(_mm_cmple_ps(Vector1, Vector2)); | return _mm_castps_si128(_mm_cmple_ps(Vector1, Vector2)); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
vbool32_t b = | |||||
vmfle_vv_f32m1_b32(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(float)); | |||||
GI_UINT32_t ret; | |||||
memcpy(&ret, &b, GI_SIMD_LEN_BYTE); | |||||
return vneg_v_u32m1(ret, GI_SIMD_LEN_BYTE / sizeof(float)); | |||||
#else | #else | ||||
GI_UINT32_t ret; | GI_UINT32_t ret; | ||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | ||||
@@ -845,6 +1056,12 @@ GI_UINT32_t GiLessThanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { | |||||
return vcltq_f32(Vector1, Vector2); | return vcltq_f32(Vector1, Vector2); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return _mm_castps_si128(_mm_cmplt_ps(Vector1, Vector2)); | return _mm_castps_si128(_mm_cmplt_ps(Vector1, Vector2)); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
vbool32_t b = | |||||
vmflt_vv_f32m1_b32(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(float)); | |||||
GI_UINT32_t ret; | |||||
memcpy(&ret, &b, GI_SIMD_LEN_BYTE); | |||||
return vneg_v_u32m1(ret, GI_SIMD_LEN_BYTE / sizeof(float)); | |||||
#else | #else | ||||
GI_UINT32_t ret; | GI_UINT32_t ret; | ||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | ||||
@@ -920,6 +1137,8 @@ GI_FLOAT32_t GiMaximumFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { | |||||
return vmaxq_f32(Vector1, Vector2); | return vmaxq_f32(Vector1, Vector2); | ||||
#elif defined(GI_NEON32_INTRINSICS) | #elif defined(GI_NEON32_INTRINSICS) | ||||
return _mm_max_ps(Vector1, Vector2); | return _mm_max_ps(Vector1, Vector2); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vfmax_vv_f32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(float)); | |||||
#else | #else | ||||
GI_FLOAT32_t max; | GI_FLOAT32_t max; | ||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | ||||
@@ -935,6 +1154,8 @@ GI_FLOAT32_t GiMinimumFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { | |||||
return vminq_f32(Vector1, Vector2); | return vminq_f32(Vector1, Vector2); | ||||
#elif defined(GI_NEON32_INTRINSICS) | #elif defined(GI_NEON32_INTRINSICS) | ||||
return _mm_min_ps(Vector1, Vector2); | return _mm_min_ps(Vector1, Vector2); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vfmin_vv_f32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(float)); | |||||
#else | #else | ||||
GI_FLOAT32_t min; | GI_FLOAT32_t min; | ||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | ||||
@@ -948,6 +1169,15 @@ GI_FORCEINLINE | |||||
GI_FLOAT32_t GiMaxNanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { | GI_FLOAT32_t GiMaxNanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { | ||||
#if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
return vmaxq_f32(Vector1, Vector2); | return vmaxq_f32(Vector1, Vector2); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
//! vfmax_vv_f32m1 handles NaN differently from NEON, so implement with naive code | |||||
GI_FLOAT32_FIXLEN_t a, b, ret; | |||||
a = GiFloat32Type2FixLenType(Vector1); | |||||
b = GiFloat32Type2FixLenType(Vector2); | |||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | |||||
ret[i] = MAX_NAN(a[i], b[i]); | |||||
} | |||||
return GiFixLenType2GiFloat32Type(ret); | |||||
#else | #else | ||||
//! _mm_max_ps does not follow the IEEE standard when input is NAN, so | //! _mm_max_ps does not follow the IEEE standard when input is NAN, so | ||||
//! implement by C code | //! implement by C code | ||||
@@ -963,6 +1193,15 @@ GI_FORCEINLINE | |||||
GI_FLOAT32_t GiMinNanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { | GI_FLOAT32_t GiMinNanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { | ||||
#if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
return vminq_f32(Vector1, Vector2); | return vminq_f32(Vector1, Vector2); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
//! vfmin_vv_f32m1 handles NaN differently from NEON, so implement with naive code | |||||
GI_FLOAT32_FIXLEN_t a, b, ret; | |||||
a = GiFloat32Type2FixLenType(Vector1); | |||||
b = GiFloat32Type2FixLenType(Vector2); | |||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | |||||
ret[i] = MIN_NAN(a[i], b[i]); | |||||
} | |||||
return GiFixLenType2GiFloat32Type(ret); | |||||
#else | #else | ||||
//! _mm_min_ps does not follow the IEEE standard when input is NAN, so | //! _mm_min_ps does not follow the IEEE standard when input is NAN, so | ||||
//! implement by C code | //! implement by C code | ||||
@@ -999,6 +1238,12 @@ float GiReduceAddFloat32(GI_FLOAT32_t Vector) { | |||||
Vector = GiAddFloat32( | Vector = GiAddFloat32( | ||||
Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1))); | Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1))); | ||||
return GiExtractLane0Float32(Vector); | return GiExtractLane0Float32(Vector); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
vfloat32m1_t redsum = vundefined_f32m1(); | |||||
//! use the ordered sum; the unordered vfredusum_vs_f32m1_f32m1 may be faster | |||||
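//! the third operand seeds the accumulator, so lane 0 of redsum receives | |||||
//! 0.0f plus the sum of every element of Vector | |||||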
redsum = vfredosum_vs_f32m1_f32m1( | |||||
redsum, Vector, GiBroadcastFloat32(0.0f), GI_SIMD_LEN_BYTE / sizeof(float)); | |||||
return GiExtractLane0Float32(redsum); | |||||
#else | #else | ||||
float ret = 0; | float ret = 0; | ||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | ||||
@@ -1021,6 +1266,14 @@ float GiReduceMultiplyFloat32(GI_FLOAT32_t Vector) { | |||||
Vector = GiMultiplyFloat32( | Vector = GiMultiplyFloat32( | ||||
Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1))); | Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1))); | ||||
return GiExtractLane0Float32(Vector); | return GiExtractLane0Float32(Vector); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
//! RVV has no multiply reduction, so implement with naive code | |||||
float ret = 1; | |||||
GI_FLOAT32_FIXLEN_t v = GiFloat32Type2FixLenType(Vector); | |||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | |||||
ret *= v[i]; | |||||
} | |||||
return ret; | |||||
#else | #else | ||||
float ret = 1; | float ret = 1; | ||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | ||||
@@ -1049,6 +1302,14 @@ float GiReduceMaxNanFloat32(GI_FLOAT32_t Vector) { | |||||
Vector = GiMaxNanFloat32( | Vector = GiMaxNanFloat32( | ||||
Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1))); | Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1))); | ||||
return GiExtractLane0Float32(Vector); | return GiExtractLane0Float32(Vector); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
//! vfredmax_vs_f32m1_f32m1 cannot handle the NaN case, so implement with naive code | |||||
GI_FLOAT32_FIXLEN_t v = GiFloat32Type2FixLenType(Vector); | |||||
float ret = v[0]; | |||||
for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | |||||
ret = MAX_NAN(ret, v[i]); | |||||
} | |||||
return ret; | |||||
#else | #else | ||||
float ret = Vector[0]; | float ret = Vector[0]; | ||||
for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | ||||
@@ -1074,6 +1335,14 @@ float GiReduceMinNanFloat32(GI_FLOAT32_t Vector) { | |||||
Vector = GiMinNanFloat32( | Vector = GiMinNanFloat32( | ||||
Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1))); | Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1))); | ||||
return GiExtractLane0Float32(Vector); | return GiExtractLane0Float32(Vector); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
//! vfredmin_vs_f32m1_f32m1 cannot handle the NaN case, so implement with naive code | |||||
GI_FLOAT32_FIXLEN_t v = GiFloat32Type2FixLenType(Vector); | |||||
float ret = v[0]; | |||||
for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | |||||
ret = MIN_NAN(ret, v[i]); | |||||
} | |||||
return ret; | |||||
#else | #else | ||||
float ret = Vector[0]; | float ret = Vector[0]; | ||||
for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | ||||
@@ -1094,6 +1363,8 @@ GI_FLOAT32_t GiAbsFloat32(GI_FLOAT32_t Vector1) { | |||||
} value; | } value; | ||||
value.int_val = 0x7fffffff; | value.int_val = 0x7fffffff; | ||||
return _mm_and_ps(Vector1, _mm_set_ps1(value.float_val)); | return _mm_and_ps(Vector1, _mm_set_ps1(value.float_val)); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vfabs_v_f32m1(Vector1, GI_SIMD_LEN_BYTE / sizeof(float)); | |||||
#else | #else | ||||
GI_FLOAT32_t ret; | GI_FLOAT32_t ret; | ||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | ||||
@@ -1156,6 +1427,8 @@ GI_FORCEINLINE GI_FLOAT32_t GiReinterpretqS64ToFloat32(GI_INT64_t a) { | |||||
return vreinterpretq_f32_s64(a); | return vreinterpretq_f32_s64(a); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return _M128(a); | return _M128(a); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vle32_v_f32m1((float*)&a, GI_SIMD_LEN_BYTE / sizeof(float)); | |||||
#else | #else | ||||
GI_FLOAT32_t ret; | GI_FLOAT32_t ret; | ||||
memcpy(&ret, &a, sizeof(GI_FLOAT32_t)); | memcpy(&ret, &a, sizeof(GI_FLOAT32_t)); | ||||
@@ -1168,6 +1441,10 @@ GI_FORCEINLINE GI_INT64_t GiReinterpretqFloat32ToS64(GI_FLOAT32_t a) { | |||||
return vreinterpretq_s64_f32(a); | return vreinterpretq_s64_f32(a); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return _M128i(a); | return _M128i(a); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
GI_INT64_t ret; | |||||
vse32_v_f32m1((float*)&ret, a, GI_SIMD_LEN_BYTE / sizeof(float)); | |||||
return ret; | |||||
#else | #else | ||||
GI_INT64_t ret; | GI_INT64_t ret; | ||||
memcpy(&ret, &a, sizeof(GI_INT64_t)); | memcpy(&ret, &a, sizeof(GI_INT64_t)); | ||||
@@ -1177,6 +1454,16 @@ GI_FORCEINLINE GI_INT64_t GiReinterpretqFloat32ToS64(GI_FLOAT32_t a) { | |||||
#if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
#define GiSimdFmaLane(a, b, c, d) vfmaq_laneq_f32(a, b, c, d) | #define GiSimdFmaLane(a, b, c, d) vfmaq_laneq_f32(a, b, c, d) | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
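//! RVV has no lane-indexed FMA, so spill the lane vector to a stack array and | |||||
//! use the vector-scalar vfmadd_vf, giving a + b * c[lane] | |||||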
#define __rvv_fmaq_laneq_f32(__a, __b, __c, __lane) \ | |||||
__extension__({ \ | |||||
float t[GI_SIMD_LEN_BYTE / sizeof(float)]; \ | |||||
vse32_v_f32m1(t, __c, GI_SIMD_LEN_BYTE / sizeof(float)); \ | |||||
GI_FLOAT32_t __ret = vfmadd_vf_f32m1( \ | |||||
__b, t[__lane], __a, GI_SIMD_LEN_BYTE / sizeof(float)); \ | |||||
__ret; \ | |||||
}) | |||||
#define GiSimdFmaLane(a, b, c, d) __rvv_fmaq_laneq_f32(a, b, c, d) | |||||
#else | #else | ||||
GI_FORCEINLINE GI_FLOAT32_t | GI_FORCEINLINE GI_FLOAT32_t | ||||
___gi_vmlaq_lane_f32(GI_FLOAT32_t a, GI_FLOAT32_t b, float32x2_t v, int l) { | ___gi_vmlaq_lane_f32(GI_FLOAT32_t a, GI_FLOAT32_t b, float32x2_t v, int l) { | ||||
@@ -1262,6 +1549,9 @@ ___gi_vfmaq_laneq_f32(GI_FLOAT32_t a, GI_FLOAT32_t b, GI_FLOAT32_t v, int l) { | |||||
__ret; \ | __ret; \ | ||||
}) | }) | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
#define GiMlaqLowLaneFloat32(a, b, c, d) __rvv_fmaq_laneq_f32(a, b, c, d) | |||||
#define GiMlaqHighLaneFloat32(a, b, c, d) __rvv_fmaq_laneq_f32(a, b, c, d) | |||||
#else | #else | ||||
//! naive | //! naive | ||||
#define GiMlaqLowLaneFloat32(__a, __b, __v, __lane) \ | #define GiMlaqLowLaneFloat32(__a, __b, __v, __lane) \ | ||||
@@ -1303,6 +1593,16 @@ SSE_VFMSQ_LANEQ_F32(2) | |||||
SSE_VFMSQ_LANEQ_F32(3) | SSE_VFMSQ_LANEQ_F32(3) | ||||
#undef SSE_VFMSQ_LANEQ_F32 | #undef SSE_VFMSQ_LANEQ_F32 | ||||
#define GiFmsqLaneQFloat32(a, b, v, lane) sse_vfmsq_lane_##lane##_q_f32(a, b, v) | #define GiFmsqLaneQFloat32(a, b, v, lane) sse_vfmsq_lane_##lane##_q_f32(a, b, v) | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
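//! same spill trick; vfnmsub_vf yields a - b * c[lane], matching GiFmsqLaneQFloat32 | |||||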
#define __rvv_fmsq_lane_float32(__a, __b, __c, __lane) \ | |||||
__extension__({ \ | |||||
float t[GI_SIMD_LEN_BYTE / sizeof(float)]; \ | |||||
vse32_v_f32m1(t, __c, GI_SIMD_LEN_BYTE / sizeof(float)); \ | |||||
GI_FLOAT32_t __ret = vfnmsub_vf_f32m1( \ | |||||
__b, t[__lane], __a, GI_SIMD_LEN_BYTE / sizeof(float)); \ | |||||
__ret; \ | |||||
}) | |||||
#define GiFmsqLaneQFloat32(a, b, c, d) __rvv_fmsq_lane_float32(a, b, c, d) | |||||
#else | #else | ||||
//! naive | //! naive | ||||
GI_FORCEINLINE GI_FLOAT32_t __naive_GiFmsqLaneQFloat32( | GI_FORCEINLINE GI_FLOAT32_t __naive_GiFmsqLaneQFloat32( | ||||
@@ -1324,6 +1624,11 @@ GI_FORCEINLINE GI_FLOAT32_t GiCombineFloat32(float32x2_t a, float32x2_t b) { | |||||
__m128i res; | __m128i res; | ||||
res = _mm_unpacklo_epi64(_pM128i(a), _pM128i(b)); | res = _mm_unpacklo_epi64(_pM128i(a), _pM128i(b)); | ||||
return _M128(res); | return _M128(res); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
float t[GI_SIMD_LEN_BYTE / sizeof(float)]; | |||||
vse32_v_f32m1(t, a, 2); | |||||
vse32_v_f32m1(t + 2, b, 2); | |||||
return vle32_v_f32m1(t, GI_SIMD_LEN_BYTE / sizeof(float)); | |||||
#else | #else | ||||
GI_FLOAT32_t res; | GI_FLOAT32_t res; | ||||
res[0] = a[0]; | res[0] = a[0]; | ||||
@@ -1337,6 +1642,8 @@ GI_FORCEINLINE GI_FLOAT32_t GiCombineFloat32(float32x2_t a, float32x2_t b) { | |||||
GI_FORCEINLINE float32x2_t GiGetLowFloat32(GI_FLOAT32_t a) { | GI_FORCEINLINE float32x2_t GiGetLowFloat32(GI_FLOAT32_t a) { | ||||
#if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
return vget_low_f32(a); | return vget_low_f32(a); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vmv_v_v_f32m1(a, 2); | |||||
#else | #else | ||||
return ___gi_vget_low_f32(a); | return ___gi_vget_low_f32(a); | ||||
#endif | #endif | ||||
@@ -1345,6 +1652,12 @@ GI_FORCEINLINE float32x2_t GiGetLowFloat32(GI_FLOAT32_t a) { | |||||
GI_FORCEINLINE float32x2_t GiGetHighFloat32(GI_FLOAT32_t a) { | GI_FORCEINLINE float32x2_t GiGetHighFloat32(GI_FLOAT32_t a) { | ||||
#if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
return vget_high_f32(a); | return vget_high_f32(a); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
float t[GI_SIMD_LEN_BYTE / sizeof(float)]; | |||||
vse32_v_f32m1(t, a, GI_SIMD_LEN_BYTE / sizeof(float)); | |||||
return vle32_v_f32m1( | |||||
t + GI_SIMD_LEN_BYTE / sizeof(float) / 2, | |||||
GI_SIMD_LEN_BYTE / sizeof(float) / 2); | |||||
#else | #else | ||||
return ___gi_vget_high_f32(a); | return ___gi_vget_high_f32(a); | ||||
#endif | #endif | ||||
@@ -1358,6 +1671,13 @@ GI_FORCEINLINE float32x2_t GiPaddFloat32(float32x2_t a, float32x2_t b) { | |||||
res.m64_f32[0] = a.m64_f32[0] + a.m64_f32[1]; | res.m64_f32[0] = a.m64_f32[0] + a.m64_f32[1]; | ||||
res.m64_f32[1] = b.m64_f32[0] + b.m64_f32[1]; | res.m64_f32[1] = b.m64_f32[0] + b.m64_f32[1]; | ||||
return res; | return res; | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
float t[GI_SIMD_LEN_BYTE / sizeof(float)]; | |||||
vse32_v_f32m1(t, a, 2); | |||||
vse32_v_f32m1(t + 2, b, 2); | |||||
t[0] = t[0] + t[1]; | |||||
t[1] = t[2] + t[3]; | |||||
return vle32_v_f32m1(t, 2); | |||||
#else | #else | ||||
float32x2_t res; | float32x2_t res; | ||||
res[0] = a[0] + a[1]; | res[0] = a[0] + a[1]; | ||||
@@ -1374,6 +1694,13 @@ GI_FORCEINLINE float32x2_t GiPmaxFloat32(float32x2_t a, float32x2_t b) { | |||||
res.m64_f32[0] = MAX_NAN(a.m64_f32[0], a.m64_f32[1]); | res.m64_f32[0] = MAX_NAN(a.m64_f32[0], a.m64_f32[1]); | ||||
res.m64_f32[1] = MAX_NAN(b.m64_f32[0], b.m64_f32[1]); | res.m64_f32[1] = MAX_NAN(b.m64_f32[0], b.m64_f32[1]); | ||||
return res; | return res; | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
float t[GI_SIMD_LEN_BYTE / sizeof(float)]; | |||||
vse32_v_f32m1(t, a, 2); | |||||
vse32_v_f32m1(t + 2, b, 2); | |||||
t[0] = MAX_NAN(t[0], t[1]); | |||||
t[1] = MAX_NAN(t[2], t[3]); | |||||
return vle32_v_f32m1(t, 2); | |||||
#else | #else | ||||
float32x2_t res; | float32x2_t res; | ||||
res[0] = MAX_NAN(a[0], a[1]); | res[0] = MAX_NAN(a[0], a[1]); | ||||
@@ -1408,6 +1735,8 @@ GI_FLOAT32_V3_t GiLoadUzipFloat32V3(const float* ptr) { | |||||
v.val[1] = _mm_movehl_ps(tmp3, v.val[1]); | v.val[1] = _mm_movehl_ps(tmp3, v.val[1]); | ||||
v.val[2] = _mm_movehl_ps(tmp2, tmp0); | v.val[2] = _mm_movehl_ps(tmp2, tmp0); | ||||
return v; | return v; | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vlseg3e32_v_f32m1x3(ptr, GI_SIMD_LEN_BYTE / sizeof(float)); | |||||
#else | #else | ||||
GI_FLOAT32_V3_t ret; | GI_FLOAT32_V3_t ret; | ||||
for (size_t i = 0; i < 3; i++) { | for (size_t i = 0; i < 3; i++) { | ||||
@@ -1440,6 +1769,35 @@ void GiStoreZipFloat32V3(float* ptr, GI_FLOAT32_V3_t val) { | |||||
GiStoreFloat32(ptr, v.val[0]); | GiStoreFloat32(ptr, v.val[0]); | ||||
GiStoreFloat32((ptr + 4), v.val[1]); | GiStoreFloat32((ptr + 4), v.val[1]); | ||||
GiStoreFloat32((ptr + 8), v.val[2]); | GiStoreFloat32((ptr + 8), v.val[2]); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
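//! gather-based zip: pack the three m1 parts into an m4 register, permute with an | |||||
//! interleaving index (0,4,8, 1,5,9, ...) so elements come out as x0 y0 z0 x1 ..., | |||||
//! then store the three resulting m1 parts back to memory | |||||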
vfloat32m4_t d = vundefined_f32m4(); | |||||
d = vset_v_f32m1_f32m4(d, 0, GiGetSubVectorFloat32V3(val, 0)); | |||||
d = vset_v_f32m1_f32m4(d, 1, GiGetSubVectorFloat32V3(val, 1)); | |||||
d = vset_v_f32m1_f32m4(d, 2, GiGetSubVectorFloat32V3(val, 2)); | |||||
vuint32m4_t index; | |||||
#if GI_SIMD_LEN_BYTE == 16 | |||||
uint32_t index_128[16] = {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11, 0, 0, 0, 0}; | |||||
index = vle32_v_u32m4(index_128, 16); | |||||
#else | |||||
uint32_t* index_p = (uint32_t*)&index; | |||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | |||||
index_p[3 * i] = i; | |||||
index_p[3 * i + 1] = i + GI_SIMD_LEN_BYTE / sizeof(float); | |||||
index_p[3 * i + 2] = i + GI_SIMD_LEN_BYTE / sizeof(float) * 2; | |||||
} | |||||
#endif | |||||
vfloat32m4_t g_d = | |||||
vrgather_vv_f32m4(d, index, GI_SIMD_LEN_BYTE / sizeof(float) * 3); | |||||
vfloat32m1_t v0 = vget_v_f32m4_f32m1(g_d, 0); | |||||
vfloat32m1_t v1 = vget_v_f32m4_f32m1(g_d, 1); | |||||
vfloat32m1_t v2 = vget_v_f32m4_f32m1(g_d, 2); | |||||
GI_FLOAT32_V3_t tmp = vcreate_f32m1x3(v0, v1, v2); | |||||
GiStoreFloat32(ptr, GiGetSubVectorFloat32V3(tmp, 0)); | |||||
GiStoreFloat32( | |||||
ptr + GI_SIMD_LEN_BYTE / sizeof(float), GiGetSubVectorFloat32V3(tmp, 1)); | |||||
GiStoreFloat32( | |||||
ptr + GI_SIMD_LEN_BYTE / sizeof(float) * 2, | |||||
GiGetSubVectorFloat32V3(tmp, 2)); | |||||
#else | #else | ||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | ||||
*ptr++ = val.val[0][i]; | *ptr++ = val.val[0][i]; | ||||
@@ -1448,3 +1806,13 @@ void GiStoreZipFloat32V3(float* ptr, GI_FLOAT32_V3_t val) { | |||||
} | } | ||||
#endif | #endif | ||||
} | } | ||||
GI_FORCEINLINE | |||||
GI_FLOAT32_t GiDivFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { | |||||
#if defined(GI_RVV_INTRINSICS) | |||||
return vfdiv_vv_f32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(float)); | |||||
#else | |||||
//! NEON, SSE and the naive build can rely on the builtin operator | |||||
return Vector1 / Vector2; | |||||
#endif | |||||
} |
@@ -3,11 +3,26 @@ | |||||
#include "gi_common.h" | #include "gi_common.h" | ||||
GI_FORCEINLINE | GI_FORCEINLINE | ||||
GI_INT32_t GiReinterpretInt8AsInt32(GI_INT8_t In) { | |||||
#if defined(GI_NEON_INTRINSICS) | |||||
return vreinterpretq_s32_s8(In); | |||||
#elif defined(GI_SSE2_INTRINSICS) | |||||
return (GI_INT32_t)In; | |||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vreinterpret_v_i8m1_i32m1(In); | |||||
#else | |||||
return (GI_INT32_t)In; | |||||
#endif | |||||
} | |||||
GI_FORCEINLINE | |||||
GI_UINT32_t GiBroadcastUint32(int32_t Value) { | GI_UINT32_t GiBroadcastUint32(int32_t Value) { | ||||
#if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
return vdupq_n_u32(Value); | return vdupq_n_u32(Value); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return _mm_set1_epi32(Value); | return _mm_set1_epi32(Value); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vmv_v_x_u32m1(Value, GI_SIMD_LEN_BYTE / sizeof(int32_t)); | |||||
#else | #else | ||||
GI_UINT32_t ret; | GI_UINT32_t ret; | ||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { | for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { | ||||
@@ -23,6 +38,8 @@ GI_INT32_t GiLoadInt32(const void* Buffer) { | |||||
return vld1q_s32((int32_t*)Buffer); | return vld1q_s32((int32_t*)Buffer); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return _mm_loadu_si128((const __m128i*)Buffer); | return _mm_loadu_si128((const __m128i*)Buffer); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vle32_v_i32m1((int32_t*)Buffer, GI_SIMD_LEN_BYTE / sizeof(int32_t)); | |||||
#else | #else | ||||
GI_INT32_t ret; | GI_INT32_t ret; | ||||
const int32_t* ptr = (int32_t*)Buffer; | const int32_t* ptr = (int32_t*)Buffer; | ||||
@@ -39,6 +56,8 @@ GI_INT16_t GiLoadInt16(const void* Buffer) { | |||||
return vld1q_s16((int16_t*)Buffer); | return vld1q_s16((int16_t*)Buffer); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return _mm_loadu_si128((const __m128i*)Buffer); | return _mm_loadu_si128((const __m128i*)Buffer); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vle16_v_i16m1((int16_t*)Buffer, GI_SIMD_LEN_BYTE / sizeof(int16_t)); | |||||
#else | #else | ||||
GI_INT16_t ret; | GI_INT16_t ret; | ||||
const int16_t* ptr = (int16_t*)Buffer; | const int16_t* ptr = (int16_t*)Buffer; | ||||
@@ -55,6 +74,8 @@ GI_INT8_t GiLoadInt8(const void* Buffer) { | |||||
return vld1q_s8((int8_t*)Buffer); | return vld1q_s8((int8_t*)Buffer); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return _mm_loadu_si128((const __m128i*)Buffer); | return _mm_loadu_si128((const __m128i*)Buffer); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vle8_v_i8m1((int8_t*)Buffer, GI_SIMD_LEN_BYTE / sizeof(int8_t)); | |||||
#else | #else | ||||
GI_INT8_t ret; | GI_INT8_t ret; | ||||
const int8_t* ptr = (int8_t*)Buffer; | const int8_t* ptr = (int8_t*)Buffer; | ||||
@@ -71,6 +92,8 @@ void GiStoreInt32(void* Buffer, GI_INT32_t Vector) { | |||||
vst1q_s32((int32_t*)Buffer, Vector); | vst1q_s32((int32_t*)Buffer, Vector); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
_mm_storeu_si128((__m128i*)Buffer, Vector); | _mm_storeu_si128((__m128i*)Buffer, Vector); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
vse32_v_i32m1((int32_t*)Buffer, Vector, GI_SIMD_LEN_BYTE / sizeof(int32_t)); | |||||
#else | #else | ||||
int32_t* ptr = (int32_t*)Buffer; | int32_t* ptr = (int32_t*)Buffer; | ||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { | for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { | ||||
@@ -93,6 +116,14 @@ void GiStoreInt32(void* Buffer, GI_INT32_t Vector) { | |||||
_mm_store_ss( \ | _mm_store_ss( \ | ||||
(float*)Buffer, _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(i, i, i, i))); \ | (float*)Buffer, _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(i, i, i, i))); \ | ||||
} | } | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
#define GISTORELANEINT32(i) \ | |||||
GI_FORCEINLINE void GiStoreLane##i##Int32(void* Buffer, GI_INT32_t Vector) { \ | |||||
int32_t t[GI_SIMD_LEN_BYTE / sizeof(int32_t)]; \ | |||||
vse32_v_i32m1(t, Vector, GI_SIMD_LEN_BYTE / sizeof(int32_t)); \ | |||||
*((int32_t*)Buffer) = t[i]; \ | |||||
} | |||||
#else | #else | ||||
#define GISTORELANEINT32(i) \ | #define GISTORELANEINT32(i) \ | ||||
GI_FORCEINLINE void GiStoreLane##i##Int32(void* Buffer, GI_INT32_t Vector) { \ | GI_FORCEINLINE void GiStoreLane##i##Int32(void* Buffer, GI_INT32_t Vector) { \ | ||||
@@ -113,6 +144,8 @@ GI_INT8_t GiReinterInt32ToInt8(GI_INT32_t Vector) { | |||||
return vreinterpretq_s8_s32(Vector); | return vreinterpretq_s8_s32(Vector); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return Vector; | return Vector; | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vreinterpret_v_i32m1_i8m1(Vector); | |||||
#else | #else | ||||
return *(GI_INT8_t*)&Vector; | return *(GI_INT8_t*)&Vector; | ||||
#endif | #endif | ||||
@@ -124,6 +157,8 @@ void GiStoreInt16(void* Buffer, GI_INT16_t Vector) { | |||||
vst1q_s16((int16_t*)Buffer, Vector); | vst1q_s16((int16_t*)Buffer, Vector); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
_mm_storeu_si128((__m128i*)Buffer, Vector); | _mm_storeu_si128((__m128i*)Buffer, Vector); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
vse16_v_i16m1((int16_t*)Buffer, Vector, GI_SIMD_LEN_BYTE / sizeof(int16_t)); | |||||
#else | #else | ||||
int16_t* ptr = (int16_t*)Buffer; | int16_t* ptr = (int16_t*)Buffer; | ||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int16_t); i++) { | for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int16_t); i++) { | ||||
@@ -138,6 +173,8 @@ void GiStoreInt8(void* Buffer, GI_INT8_t Vector) { | |||||
vst1q_s8((int8_t*)Buffer, Vector); | vst1q_s8((int8_t*)Buffer, Vector); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
_mm_storeu_si128((__m128i*)Buffer, Vector); | _mm_storeu_si128((__m128i*)Buffer, Vector); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
vse8_v_i8m1((int8_t*)Buffer, Vector, GI_SIMD_LEN_BYTE / sizeof(int8_t)); | |||||
#else | #else | ||||
int8_t* ptr = (int8_t*)Buffer; | int8_t* ptr = (int8_t*)Buffer; | ||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) { | for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) { | ||||
@@ -152,6 +189,8 @@ void GiStoreLowInt8(void* Buffer, GI_INT8_t Vector) { | |||||
vst1_s8((int8_t*)Buffer, vget_low_s8(Vector)); | vst1_s8((int8_t*)Buffer, vget_low_s8(Vector)); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
_mm_storel_epi64((__m128i*)Buffer, Vector); | _mm_storel_epi64((__m128i*)Buffer, Vector); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
vse8_v_i8m1((int8_t*)Buffer, Vector, GI_SIMD_LEN_BYTE / sizeof(int8_t) / 2); | |||||
#else | #else | ||||
int8_t* ptr = (int8_t*)Buffer; | int8_t* ptr = (int8_t*)Buffer; | ||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t); i++) { | for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t); i++) { | ||||
@@ -166,6 +205,21 @@ void GiStoreHihgInt8(void* Buffer, GI_INT8_t Vector) { | |||||
vst1_s8((int8_t*)Buffer, vget_high_s8(Vector)); | vst1_s8((int8_t*)Buffer, vget_high_s8(Vector)); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
_mm_storel_epi64((__m128i*)Buffer, _mm_unpackhi_epi64(Vector, Vector)); | _mm_storel_epi64((__m128i*)Buffer, _mm_unpackhi_epi64(Vector, Vector)); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
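//! gather with an index that swaps the two halves so the high half moves to | |||||
//! the front, then store only the first half of the result | |||||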
vuint8m1_t index; | |||||
#if GI_SIMD_LEN_BYTE == 16 | |||||
uint8_t index_128[16] = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}; | |||||
index = vle8_v_u8m1(index_128, 16); | |||||
#else | |||||
uint8_t* index_p = (uint8_t*)&index; | |||||
int32_t offset = GI_SIMD_LEN_BYTE / sizeof(int8_t) / 2; | |||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t) / 2; i++) { | |||||
index_p[i] = offset + i; | |||||
index_p[offset + i] = i; | |||||
} | |||||
#endif | |||||
vint8m1_t g_d = vrgather_vv_i8m1(Vector, index, GI_SIMD_LEN_BYTE / sizeof(int8_t)); | |||||
vse8_v_i8m1((int8_t*)Buffer, g_d, GI_SIMD_LEN_BYTE / sizeof(int8_t) / 2); | |||||
#else | #else | ||||
int8_t* ptr = (int8_t*)Buffer; | int8_t* ptr = (int8_t*)Buffer; | ||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t); i++) { | for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t); i++) { | ||||
@@ -181,6 +235,8 @@ GI_INT32_t GiNegInt32(GI_INT32_t Vector) { | |||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
GI_INT32_t zero = _mm_set1_epi32(0); | GI_INT32_t zero = _mm_set1_epi32(0); | ||||
return _mm_sub_epi32(zero, Vector); | return _mm_sub_epi32(zero, Vector); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vneg_v_i32m1(Vector, GI_SIMD_LEN_BYTE / sizeof(int32_t)); | |||||
#else | #else | ||||
return -Vector; | return -Vector; | ||||
#endif | #endif | ||||
@@ -193,6 +249,8 @@ GI_INT8_t GiNegInt8(GI_INT8_t Vector) { | |||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
GI_INT32_t zero = _mm_set1_epi8(0); | GI_INT32_t zero = _mm_set1_epi8(0); | ||||
return _mm_sub_epi8(zero, Vector); | return _mm_sub_epi8(zero, Vector); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vneg_v_i8m1(Vector, GI_SIMD_LEN_BYTE / sizeof(int8_t)); | |||||
#else | #else | ||||
return -Vector; | return -Vector; | ||||
#endif | #endif | ||||
@@ -209,6 +267,15 @@ GI_UINT32_t GiTestAndSetUint32(GI_UINT32_t Vector1, GI_UINT32_t Vector2) { | |||||
res = _mm_and_si128(Vector1, Vector2); | res = _mm_and_si128(Vector1, Vector2); | ||||
res = _mm_cmpeq_epi32(res, zero); | res = _mm_cmpeq_epi32(res, zero); | ||||
return _mm_xor_si128(res, one); | return _mm_xor_si128(res, one); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
//! RVV mask registers hold only a single 0/1 bit per element, not a full 0xFFFFFFFF lane, so implement with naive code | |||||
GI_UINT32_FIXLEN_t a = GiUint32Type2FixLenType(Vector1); | |||||
GI_UINT32_FIXLEN_t b = GiUint32Type2FixLenType(Vector2); | |||||
GI_UINT32_FIXLEN_t ret; | |||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { | |||||
ret[i] = a[i] & b[i] ? 0xFFFFFFFF : 0; | |||||
} | |||||
return GiFixLenType2GiUint32Type(ret); | |||||
#else | #else | ||||
GI_UINT32_t ret; | GI_UINT32_t ret; | ||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { | for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { | ||||
@@ -224,6 +291,8 @@ GI_INT32_t GiAddInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) { | |||||
return vaddq_s32(Vector1, Vector2); | return vaddq_s32(Vector1, Vector2); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return _mm_add_epi32(Vector1, Vector2); | return _mm_add_epi32(Vector1, Vector2); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vadd_vv_i32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(int32_t)); | |||||
#else | #else | ||||
return Vector1 + Vector2; | return Vector1 + Vector2; | ||||
#endif | #endif | ||||
@@ -235,6 +304,8 @@ GI_UINT32_t GiAddUint32(GI_UINT32_t Vector1, GI_UINT32_t Vector2) { | |||||
return vaddq_u32(Vector1, Vector2); | return vaddq_u32(Vector1, Vector2); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return _mm_add_epi32(Vector1, Vector2); | return _mm_add_epi32(Vector1, Vector2); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vadd_vv_u32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(uint32_t)); | |||||
#else | #else | ||||
return Vector1 + Vector2; | return Vector1 + Vector2; | ||||
#endif | #endif | ||||
@@ -246,6 +317,8 @@ GI_INT16_t GiAddInt16(GI_INT16_t Vector1, GI_INT16_t Vector2) { | |||||
return vaddq_s16(Vector1, Vector2); | return vaddq_s16(Vector1, Vector2); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return _mm_add_epi16(Vector1, Vector2); | return _mm_add_epi16(Vector1, Vector2); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vadd_vv_i16m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(int16_t)); | |||||
#else | #else | ||||
return Vector1 + Vector2; | return Vector1 + Vector2; | ||||
#endif | #endif | ||||
@@ -257,6 +330,8 @@ GI_INT8_t GiAddInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) { | |||||
return vaddq_s8(Vector1, Vector2); | return vaddq_s8(Vector1, Vector2); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return _mm_add_epi8(Vector1, Vector2); | return _mm_add_epi8(Vector1, Vector2); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vadd_vv_i8m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(int8_t)); | |||||
#else | #else | ||||
return Vector1 + Vector2; | return Vector1 + Vector2; | ||||
#endif | #endif | ||||
@@ -268,6 +343,8 @@ GI_INT32_t GiSubtractInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) { | |||||
return vsubq_s32(Vector1, Vector2); | return vsubq_s32(Vector1, Vector2); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return _mm_sub_epi32(Vector1, Vector2); | return _mm_sub_epi32(Vector1, Vector2); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vsub_vv_i32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(int32_t)); | |||||
#else | #else | ||||
return Vector1 - Vector2; | return Vector1 - Vector2; | ||||
#endif | #endif | ||||
@@ -279,6 +356,8 @@ GI_UINT32_t GiSubtractUint32(GI_UINT32_t Vector1, GI_UINT32_t Vector2) { | |||||
return vsubq_u32(Vector1, Vector2); | return vsubq_u32(Vector1, Vector2); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return _mm_sub_epi32(Vector1, Vector2); | return _mm_sub_epi32(Vector1, Vector2); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vsub_vv_u32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(uint32_t)); | |||||
#else | #else | ||||
return Vector1 - Vector2; | return Vector1 - Vector2; | ||||
#endif | #endif | ||||
@@ -290,6 +369,8 @@ GI_INT8_t GiSubtractInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) { | |||||
return vsubq_s8(Vector1, Vector2); | return vsubq_s8(Vector1, Vector2); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return _mm_sub_epi8(Vector1, Vector2); | return _mm_sub_epi8(Vector1, Vector2); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vsub_vv_i8m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(int8_t)); | |||||
#else | #else | ||||
return Vector1 - Vector2; | return Vector1 - Vector2; | ||||
#endif | #endif | ||||
@@ -303,6 +384,8 @@ GI_INT32_t GiMultiplyInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) { | |||||
GI_FLOAT32_t v0 = _mm_cvtepi32_ps(Vector1); | GI_FLOAT32_t v0 = _mm_cvtepi32_ps(Vector1); | ||||
GI_FLOAT32_t v1 = _mm_cvtepi32_ps(Vector2); | GI_FLOAT32_t v1 = _mm_cvtepi32_ps(Vector2); | ||||
return _mm_cvttps_epi32(_mm_mul_ps(v0, v1)); | return _mm_cvttps_epi32(_mm_mul_ps(v0, v1)); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vmul_vv_i32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(int32_t)); | |||||
#else | #else | ||||
return Vector1 * Vector2; | return Vector1 * Vector2; | ||||
#endif | #endif | ||||
@@ -320,6 +403,8 @@ GI_INT8_t GiMultiplyInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) { | |||||
res[id] = v1[id] * v2[id]; | res[id] = v1[id] * v2[id]; | ||||
} | } | ||||
return _mm_loadu_si128((__m128i*)res); | return _mm_loadu_si128((__m128i*)res); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vmul_vv_i8m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(int8_t)); | |||||
#else | #else | ||||
return Vector1 * Vector2; | return Vector1 * Vector2; | ||||
#endif | #endif | ||||
@@ -332,6 +417,9 @@ GI_INT32_t GiMultiplyAddInt32( | |||||
return vmlaq_s32(Vector1, Vector2, Vector3); | return vmlaq_s32(Vector1, Vector2, Vector3); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return _mm_add_epi32(Vector1, GiMultiplyInt32(Vector2, Vector3)); | return _mm_add_epi32(Vector1, GiMultiplyInt32(Vector2, Vector3)); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vmadd_vv_i32m1( | |||||
Vector2, Vector3, Vector1, GI_SIMD_LEN_BYTE / sizeof(int32_t)); | |||||
#else | #else | ||||
return Vector1 + Vector2 * Vector3; | return Vector1 + Vector2 * Vector3; | ||||
#endif | #endif | ||||
@@ -343,6 +431,8 @@ GI_INT8_t GiMultiplyAddInt8(GI_INT8_t Vector1, GI_INT8_t Vector2, GI_INT8_t Vect | |||||
return vmlaq_s8(Vector1, Vector2, Vector3); | return vmlaq_s8(Vector1, Vector2, Vector3); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return _mm_add_epi8(Vector1, GiMultiplyInt8(Vector2, Vector3)); | return _mm_add_epi8(Vector1, GiMultiplyInt8(Vector2, Vector3)); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vmadd_vv_i8m1(Vector2, Vector3, Vector1, GI_SIMD_LEN_BYTE / sizeof(int8_t)); | |||||
#else | #else | ||||
return Vector1 + Vector2 * Vector3; | return Vector1 + Vector2 * Vector3; | ||||
#endif | #endif | ||||
@@ -354,6 +444,8 @@ GI_INT8_t GiAndInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) { | |||||
return vandq_s8(Vector1, Vector2); | return vandq_s8(Vector1, Vector2); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return _mm_and_si128(Vector1, Vector2); | return _mm_and_si128(Vector1, Vector2); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vand_vv_i8m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(int8_t)); | |||||
#else | #else | ||||
return Vector1 & Vector2; | return Vector1 & Vector2; | ||||
#endif | #endif | ||||
@@ -365,6 +457,8 @@ GI_UINT32_t GiEOrUint32(GI_UINT32_t Vector1, GI_UINT32_t Vector2) { | |||||
return veorq_u32(Vector1, Vector2); | return veorq_u32(Vector1, Vector2); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return _mm_xor_si128(Vector1, Vector2); | return _mm_xor_si128(Vector1, Vector2); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vxor_vv_u32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(uint32_t)); | |||||
#else | #else | ||||
return Vector1 ^ Vector2; | return Vector1 ^ Vector2; | ||||
#endif | #endif | ||||
@@ -376,6 +470,8 @@ GI_INT8_t GiOrInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) { | |||||
return vorrq_s8(Vector1, Vector2); | return vorrq_s8(Vector1, Vector2); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return _mm_or_si128(Vector1, Vector2); | return _mm_or_si128(Vector1, Vector2); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vor_vv_i8m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(int8_t)); | |||||
#else | #else | ||||
return Vector1 | Vector2; | return Vector1 | Vector2; | ||||
#endif | #endif | ||||
@@ -387,6 +483,9 @@ GI_INT8_t GiAndNotInt8(GI_INT8_t VectorNot, GI_INT8_t Vector) { | |||||
return vandq_s8(vmvnq_s8(VectorNot), Vector); | return vandq_s8(vmvnq_s8(VectorNot), Vector); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return _mm_andnot_si128(VectorNot, Vector); | return _mm_andnot_si128(VectorNot, Vector); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
GI_INT8_t not_v = vnot_v_i8m1(VectorNot, GI_SIMD_LEN_BYTE / sizeof(int8_t)); | |||||
return vand_vv_i8m1(not_v, Vector, GI_SIMD_LEN_BYTE / sizeof(int8_t)); | |||||
#else | #else | ||||
GI_INT8_t Not = ~VectorNot; | GI_INT8_t Not = ~VectorNot; | ||||
return (Not & Vector); | return (Not & Vector); | ||||
@@ -399,6 +498,8 @@ GI_INT8_t GiXorInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) { | |||||
return veorq_s8(Vector1, Vector2); | return veorq_s8(Vector1, Vector2); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return _mm_xor_si128(Vector1, Vector2); | return _mm_xor_si128(Vector1, Vector2); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vxor_vv_i8m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(int8_t)); | |||||
#else | #else | ||||
return Vector1 ^ Vector2; | return Vector1 ^ Vector2; | ||||
#endif | #endif | ||||
@@ -410,6 +511,8 @@ GI_INT32_t GiShiftLeft23Int32(GI_INT32_t Vector) { | |||||
return vshlq_n_s32(Vector, 23); | return vshlq_n_s32(Vector, 23); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return _mm_slli_epi32(Vector, 23); | return _mm_slli_epi32(Vector, 23); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vsll_vx_i32m1(Vector, 23, GI_SIMD_LEN_BYTE / sizeof(int32_t)); | |||||
#else | #else | ||||
return Vector << 23; | return Vector << 23; | ||||
#endif | #endif | ||||
@@ -421,6 +524,8 @@ GI_INT32_t GiShiftRight23Int32(GI_INT32_t Vector) { | |||||
return vshrq_n_s32(Vector, 23); | return vshrq_n_s32(Vector, 23); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return _mm_srai_epi32(Vector, 23); | return _mm_srai_epi32(Vector, 23); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vsra_vx_i32m1(Vector, 23, GI_SIMD_LEN_BYTE / sizeof(int32_t)); | |||||
#else | #else | ||||
return Vector >> 23; | return Vector >> 23; | ||||
#endif | #endif | ||||
@@ -442,6 +547,11 @@ GI_INT32_t GiAbsInt32(GI_INT32_t Vector) { | |||||
return vabsq_s32(Vector); | return vabsq_s32(Vector); | ||||
#elif defined(GI_SSE42_INTRINSICS) | #elif defined(GI_SSE42_INTRINSICS) | ||||
return _mm_abs_epi32(Vector); | return _mm_abs_epi32(Vector); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
//! RVV has no integer abs instruction yet | |||||
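//! branchless abs: with s = x >> 31 (all ones when x is negative), |x| = (x + s) ^ s | |||||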
GI_INT32_t shift = vsra_vx_i32m1(Vector, 31, GI_SIMD_LEN_BYTE / sizeof(int32_t)); | |||||
GI_INT32_t t_add = vadd_vv_i32m1(Vector, shift, GI_SIMD_LEN_BYTE / sizeof(int32_t)); | |||||
return vxor_vv_i32m1(t_add, shift, GI_SIMD_LEN_BYTE / sizeof(int32_t)); | |||||
#else | #else | ||||
GI_INT32_t ret; | GI_INT32_t ret; | ||||
GI_INT32_NAIVE_t tmp_ret; | GI_INT32_NAIVE_t tmp_ret; | ||||
@@ -463,6 +573,11 @@ GI_INT16_t GiAbsInt16(GI_INT16_t Vector) { | |||||
return vabsq_s16(Vector); | return vabsq_s16(Vector); | ||||
#elif defined(GI_SSE42_INTRINSICS) | #elif defined(GI_SSE42_INTRINSICS) | ||||
return _mm_abs_epi16(Vector); | return _mm_abs_epi16(Vector); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
//! RVV has no integer abs instruction yet | |||||
GI_INT16_t shift = vsra_vx_i16m1(Vector, 15, GI_SIMD_LEN_BYTE / sizeof(int16_t)); | |||||
GI_INT16_t t_add = vadd_vv_i16m1(Vector, shift, GI_SIMD_LEN_BYTE / sizeof(int16_t)); | |||||
return vxor_vv_i16m1(t_add, shift, GI_SIMD_LEN_BYTE / sizeof(int16_t)); | |||||
#else | #else | ||||
GI_INT16_t ret; | GI_INT16_t ret; | ||||
GI_INT16_NAIVE_t tmp_ret; | GI_INT16_NAIVE_t tmp_ret; | ||||
@@ -483,6 +598,11 @@ GI_INT8_t GiAbsInt8(GI_INT8_t Vector) { | |||||
return vabsq_s8(Vector); | return vabsq_s8(Vector); | ||||
#elif defined(GI_SSE42_INTRINSICS) | #elif defined(GI_SSE42_INTRINSICS) | ||||
return _mm_abs_epi8(Vector); | return _mm_abs_epi8(Vector); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
//! RVV has no integer abs instruction yet | |||||
GI_INT8_t shift = vsra_vx_i8m1(Vector, 7, GI_SIMD_LEN_BYTE / sizeof(int8_t)); | |||||
GI_INT8_t t_add = vadd_vv_i8m1(Vector, shift, GI_SIMD_LEN_BYTE / sizeof(int8_t)); | |||||
return vxor_vv_i8m1(t_add, shift, GI_SIMD_LEN_BYTE / sizeof(int8_t)); | |||||
#else | #else | ||||
GI_INT8_t ret; | GI_INT8_t ret; | ||||
GI_INT8_NAIVE_t tmp_ret; | GI_INT8_NAIVE_t tmp_ret; | ||||
@@ -505,6 +625,8 @@ GI_INT32_t GiMaximumInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) { | |||||
return _mm_max_epi32(Vector1, Vector2); | return _mm_max_epi32(Vector1, Vector2); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return GiBlendInt32(Vector2, Vector1, _mm_cmpgt_epi32(Vector1, Vector2)); | return GiBlendInt32(Vector2, Vector1, _mm_cmpgt_epi32(Vector1, Vector2)); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vmax_vv_i32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(int32_t)); | |||||
#else | #else | ||||
GI_INT32_t tmp; | GI_INT32_t tmp; | ||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { | for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { | ||||
@@ -522,6 +644,8 @@ GI_INT32_t GiMinimumInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) { | |||||
return _mm_min_epi32(Vector1, Vector2); | return _mm_min_epi32(Vector1, Vector2); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return GiBlendInt32(Vector2, Vector1, _mm_cmpgt_epi32(Vector2, Vector1)); | return GiBlendInt32(Vector2, Vector1, _mm_cmpgt_epi32(Vector2, Vector1)); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vmin_vv_i32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(int32_t)); | |||||
#else | #else | ||||
GI_INT32_t tmp; | GI_INT32_t tmp; | ||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { | for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { | ||||
@@ -544,6 +668,8 @@ GI_INT8_t GiMaximumInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) { | |||||
return _mm_max_epi8(Vector1, Vector2); | return _mm_max_epi8(Vector1, Vector2); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return GiBlendInt8(Vector2, Vector1, _mm_cmpgt_epi8(Vector1, Vector2)); | return GiBlendInt8(Vector2, Vector1, _mm_cmpgt_epi8(Vector1, Vector2)); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vmax_vv_i8m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(int8_t)); | |||||
#else | #else | ||||
GI_INT8_t tmp; | GI_INT8_t tmp; | ||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) { | for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) { | ||||
@@ -561,6 +687,8 @@ GI_INT8_t GiMinimumInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) { | |||||
return _mm_min_epi8(Vector1, Vector2); | return _mm_min_epi8(Vector1, Vector2); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return GiBlendInt8(Vector2, Vector1, _mm_cmpgt_epi8(Vector2, Vector1)); | return GiBlendInt8(Vector2, Vector1, _mm_cmpgt_epi8(Vector2, Vector1)); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
return vmin_vv_i8m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(int8_t)); | |||||
#else | #else | ||||
GI_INT8_t tmp; | GI_INT8_t tmp; | ||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) { | for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) { | ||||
@@ -584,6 +712,9 @@ GI_INT16_t GiMoveHighLongInt8(GI_INT8_t Vector) { | |||||
data[i] = o_data[8 + i]; | data[i] = o_data[8 + i]; | ||||
} | } | ||||
return _mm_loadu_si128((__m128i*)data); | return _mm_loadu_si128((__m128i*)data); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
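//! widen the whole i8 vector to i16 in an m2 group, then take the second m1 part, | |||||
//! which holds the widened high-half elements | |||||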
vint16m2_t two = vwcvt_x_x_v_i16m2(Vector, GI_SIMD_LEN_BYTE / sizeof(int8_t)); | |||||
return vget_v_i16m2_i16m1(two, 1); | |||||
#else | #else | ||||
GI_INT16_t ret; | GI_INT16_t ret; | ||||
int8_t* data = (int8_t*)&Vector; | int8_t* data = (int8_t*)&Vector; | ||||
@@ -609,6 +740,9 @@ GI_INT16_t GiMoveLowLongInt8(GI_INT8_t Vector) { | |||||
data[i] = o_data[i]; | data[i] = o_data[i]; | ||||
} | } | ||||
return _mm_loadu_si128((__m128i*)data); | return _mm_loadu_si128((__m128i*)data); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
vint16m2_t two = vwcvt_x_x_v_i16m2(Vector, GI_SIMD_LEN_BYTE / sizeof(int8_t)); | |||||
return vget_v_i16m2_i16m1(two, 0); | |||||
#else | #else | ||||
GI_INT16_t ret; | GI_INT16_t ret; | ||||
size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t); | size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t); | ||||
@@ -633,6 +767,9 @@ GI_INT32_t GiMoveHighLongInt16(GI_INT16_t Vector) { | |||||
data[i] = o_data[4 + i]; | data[i] = o_data[4 + i]; | ||||
} | } | ||||
return _mm_loadu_si128((__m128i*)data); | return _mm_loadu_si128((__m128i*)data); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
vint32m2_t two = vwcvt_x_x_v_i32m2(Vector, GI_SIMD_LEN_BYTE / sizeof(int16_t)); | |||||
return vget_v_i32m2_i32m1(two, 1); | |||||
#else | #else | ||||
GI_INT32_t ret; | GI_INT32_t ret; | ||||
size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int16_t); | size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int16_t); | ||||
@@ -657,6 +794,9 @@ GI_INT32_t GiMoveLowLongInt16(GI_INT16_t Vector) { | |||||
data[i] = o_data[i]; | data[i] = o_data[i]; | ||||
} | } | ||||
return _mm_loadu_si128((__m128i*)data); | return _mm_loadu_si128((__m128i*)data); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
vint32m2_t two = vwcvt_x_x_v_i32m2(Vector, GI_SIMD_LEN_BYTE / sizeof(int16_t)); | |||||
return vget_v_i32m2_i32m1(two, 0); | |||||
#else | #else | ||||
GI_INT32_t ret; | GI_INT32_t ret; | ||||
size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int16_t); | size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int16_t); | ||||
@@ -703,8 +843,16 @@ int32_t GiReduceAddInt8(GI_INT8_t Vector) { | |||||
float ret2 = _mm_cvtss_f32(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(2, 2, 2, 2))); | float ret2 = _mm_cvtss_f32(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(2, 2, 2, 2))); | ||||
float ret3 = _mm_cvtss_f32(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 3, 3, 3))); | float ret3 = _mm_cvtss_f32(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 3, 3, 3))); | ||||
return (int16_t)(ret0 + ret1 + ret2 + ret3); | return (int16_t)(ret0 + ret1 + ret2 + ret3); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
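//! vwredsum widens each i8 element to i16 and accumulates it, together with the | |||||
//! zero seed, into lane 0 of redsum | |||||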
vint16m1_t redsum = vundefined_i16m1(); | |||||
vint16m1_t zero = vmv_v_x_i16m1(0, GI_SIMD_LEN_BYTE / sizeof(int16_t)); | |||||
redsum = vwredsum_vs_i8m1_i16m1( | |||||
redsum, Vector, zero, GI_SIMD_LEN_BYTE / sizeof(int8_t)); | |||||
int16_t ret = 0; | |||||
vse16_v_i16m1(&ret, redsum, 1); | |||||
return ret; | |||||
#else | #else | ||||
int32_t sum = 0; | |||||
int16_t sum = 0; | |||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) { | for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) { | ||||
sum += Vector[i]; | sum += Vector[i]; | ||||
} | } | ||||
@@ -751,6 +899,13 @@ int8_t GiReduceMaxInt8(GI_INT8_t Vector) { | |||||
float ret2 = _mm_cvtss_f32(_mm_shuffle_ps(max, max, _MM_SHUFFLE(2, 2, 2, 2))); | float ret2 = _mm_cvtss_f32(_mm_shuffle_ps(max, max, _MM_SHUFFLE(2, 2, 2, 2))); | ||||
float ret3 = _mm_cvtss_f32(_mm_shuffle_ps(max, max, _MM_SHUFFLE(3, 3, 3, 3))); | float ret3 = _mm_cvtss_f32(_mm_shuffle_ps(max, max, _MM_SHUFFLE(3, 3, 3, 3))); | ||||
return (int8_t)(Max(Max(ret0, ret1), Max(ret2, ret3))); | return (int8_t)(Max(Max(ret0, ret1), Max(ret2, ret3))); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
vint8m1_t max = vundefined_i8m1(); | |||||
//! the scalar operand seeds the reduction, so use INT8_MIN rather than 0 | |||||
vint8m1_t init = vmv_v_x_i8m1(INT8_MIN, GI_SIMD_LEN_BYTE / sizeof(int8_t)); | |||||
max = vredmax_vs_i8m1_i8m1(max, Vector, init, GI_SIMD_LEN_BYTE / sizeof(int8_t)); | |||||
int8_t ret = 0; | |||||
vse8_v_i8m1(&ret, max, 1); | |||||
return ret; | |||||
#else | #else | ||||
int8_t max = Vector[0]; | int8_t max = Vector[0]; | ||||
for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) { | for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) { | ||||
@@ -799,6 +954,13 @@ int8_t GiReduceMinInt8(GI_INT8_t Vector) { | |||||
float ret2 = _mm_cvtss_f32(_mm_shuffle_ps(min, min, _MM_SHUFFLE(2, 2, 2, 2))); | float ret2 = _mm_cvtss_f32(_mm_shuffle_ps(min, min, _MM_SHUFFLE(2, 2, 2, 2))); | ||||
float ret3 = _mm_cvtss_f32(_mm_shuffle_ps(min, min, _MM_SHUFFLE(3, 3, 3, 3))); | float ret3 = _mm_cvtss_f32(_mm_shuffle_ps(min, min, _MM_SHUFFLE(3, 3, 3, 3))); | ||||
return (int8_t)(Min(Min(ret0, ret1), Min(ret2, ret3))); | return (int8_t)(Min(Min(ret0, ret1), Min(ret2, ret3))); | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
vint8m1_t min = vundefined_i8m1(); | |||||
//! the scalar operand seeds the reduction, so use INT8_MAX rather than 0 | |||||
vint8m1_t init = vmv_v_x_i8m1(INT8_MAX, GI_SIMD_LEN_BYTE / sizeof(int8_t)); | |||||
min = vredmin_vs_i8m1_i8m1(min, Vector, init, GI_SIMD_LEN_BYTE / sizeof(int8_t)); | |||||
int8_t ret = 0; | |||||
vse8_v_i8m1(&ret, min, 1); | |||||
return ret; | |||||
#else | #else | ||||
int8_t min = Vector[0]; | int8_t min = Vector[0]; | ||||
for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) { | for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) { | ||||
@@ -821,21 +983,40 @@ GI_INT8_t GiCvtFromFloat32ToInt8(GI_FLOAT32_t src) { | |||||
int16x8_t mid_s16 = vcombine_s16(vqmovn_s32(vres0), vqmovn_s32(vres0)); | int16x8_t mid_s16 = vcombine_s16(vqmovn_s32(vres0), vqmovn_s32(vres0)); | ||||
return vcombine_s8(vqmovn_s16(mid_s16), vqmovn_s16(mid_s16)); | return vcombine_s8(vqmovn_s16(mid_s16), vqmovn_s16(mid_s16)); | ||||
#else | #else | ||||
float32x4_t vinc0 = vbslq_f32(vcgeq_f32(src, vfzero), vfhalf, vfneg_half); | |||||
float32x4_t vinc0 = vbslq_f32( | |||||
vcgeq_f32(src, GiBroadcastFloat32(0.0f)), GiBroadcastFloat32(0.5f), | |||||
GiBroadcastFloat32(-0.5f)); | |||||
int32x4_t vres0 = vcvtq_s32_f32(vaddq_f32(src, vinc0)); | int32x4_t vres0 = vcvtq_s32_f32(vaddq_f32(src, vinc0)); | ||||
int16x8_t mid_s16 = vcombine_s16(vqmovn_s32(vres0), vqmovn_s32(vres0)); | int16x8_t mid_s16 = vcombine_s16(vqmovn_s32(vres0), vqmovn_s32(vres0)); | ||||
return vcombine_s8(vqmovn_s16(mid_s16), vqmovn_s16(mid_s16)); | return vcombine_s8(vqmovn_s16(mid_s16), vqmovn_s16(mid_s16)); | ||||
#endif | #endif | ||||
#elif defined(GI_SSE42_INTRINSICS) | #elif defined(GI_SSE42_INTRINSICS) | ||||
__m128 vinc0 = _mm_blendv_ps(vfneg_half, vfhalf, _mm_cmpge_ps(src, vfzero)); | |||||
__m128 vinc0 = _mm_blendv_ps( | |||||
GiBroadcastFloat32(-0.5f), GiBroadcastFloat32(0.5f), | |||||
_mm_cmpge_ps(src, GiBroadcastFloat32(0.0f))); | |||||
__m128 vres0 = _mm_add_ps(src, vinc0); | __m128 vres0 = _mm_add_ps(src, vinc0); | ||||
vres0 = _mm_round_ps(vres0, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); | vres0 = _mm_round_ps(vres0, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); | ||||
vres0 = _mm_min_ps(_mm_max_ps(vres0, vfmin_int8), vfmax_int8); | |||||
vres0 = _mm_min_ps( | |||||
_mm_max_ps(vres0, GiBroadcastFloat32(-128.0f)), GiBroadcastFloat32(127.0f)); | |||||
__m128i vepi32_0 = _mm_cvtps_epi32(vres0); | __m128i vepi32_0 = _mm_cvtps_epi32(vres0); | ||||
__m128i vepi16 = _mm_packs_epi32(vepi32_0, vepi32_0); | __m128i vepi16 = _mm_packs_epi32(vepi32_0, vepi32_0); | ||||
__m128i vepi8 = _mm_packs_epi16(vepi16, vepi16); | __m128i vepi8 = _mm_packs_epi16(vepi16, vepi16); | ||||
return vepi8; | return vepi8; | ||||
#elif defined(GI_RVV_INTRINSICS) | |||||
//! TODO: vfcvt_rtz_x_f_v_i32m1 is an RVV 1.0 API, and the XuanTie D1 only supports RVV 0.7 | |||||
//! as a workaround, implement this API with naive code | |||||
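//! the rounded, saturated value is replicated four times so every int8 lane is | |||||
//! filled, mirroring the NEON vcombine duplication above | |||||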
GI_INT8_NAIVE_t tmp_ret; | |||||
GI_FLOAT32_FIXLEN_t s0 = GiFloat32Type2FixLenType(src); | |||||
int length = GI_SIMD_LEN_BYTE / sizeof(float); | |||||
for (int i = 0; i < length; i++) { | |||||
int8_t data = Saturate(round(s0[i]), -128, 127); | |||||
tmp_ret[i] = data; | |||||
tmp_ret[length + i] = data; | |||||
tmp_ret[2 * length + i] = data; | |||||
tmp_ret[3 * length + i] = data; | |||||
} | |||||
return vle8_v_i8m1((const signed char*)&tmp_ret, GI_SIMD_LEN_BYTE / sizeof(int8_t)); | |||||
#else | #else | ||||
GI_INT8_t ret; | GI_INT8_t ret; | ||||
GI_INT8_NAIVE_t tmp_ret; | GI_INT8_NAIVE_t tmp_ret; | ||||
@@ -863,16 +1044,25 @@ GI_INT8_t GiCvtFromFloat32V2ToInt8(GI_FLOAT32_V2_t vsrc) { | |||||
int8x8_t mid1 = vqmovn_s16(vcombine_s16(vqmovn_s32(vres0), vqmovn_s32(vres1))); | int8x8_t mid1 = vqmovn_s16(vcombine_s16(vqmovn_s32(vres0), vqmovn_s32(vres1))); | ||||
return vcombine_s8(mid1, mid1); | return vcombine_s8(mid1, mid1); | ||||
#else | #else | ||||
float32x4_t vinc0 = vbslq_f32(vcgeq_f32(vsrc.val[0], vfzero), vfhalf, vfneg_half); | |||||
float32x4_t vinc1 = vbslq_f32(vcgeq_f32(vsrc.val[1], vfzero), vfhalf, vfneg_half); | |||||
GI_FLOAT32_t vfhalf = GiBroadcastFloat32(0.5f); | |||||
GI_FLOAT32_t vfneg_half = GiBroadcastFloat32(-0.5f); | |||||
float32x4_t vinc0 = vbslq_f32( | |||||
vcgeq_f32(vsrc.val[0], GiBroadcastFloat32(0.0f)), vfhalf, vfneg_half); | |||||
float32x4_t vinc1 = vbslq_f32( | |||||
vcgeq_f32(vsrc.val[1], GiBroadcastFloat32(0.0f)), vfhalf, vfneg_half); | |||||
int32x4_t vres0 = vcvtq_s32_f32(vaddq_f32(vsrc.val[0], vinc0)); | int32x4_t vres0 = vcvtq_s32_f32(vaddq_f32(vsrc.val[0], vinc0)); | ||||
int32x4_t vres1 = vcvtq_s32_f32(vaddq_f32(vsrc.val[1], vinc1)); | int32x4_t vres1 = vcvtq_s32_f32(vaddq_f32(vsrc.val[1], vinc1)); | ||||
int8x8_t mid1 = vqmovn_s16(vcombine_s16(vqmovn_s32(vres0), vqmovn_s32(vres1))); | int8x8_t mid1 = vqmovn_s16(vcombine_s16(vqmovn_s32(vres0), vqmovn_s32(vres1))); | ||||
return vcombine_s8(mid1, mid1); | return vcombine_s8(mid1, mid1); | ||||
#endif | #endif | ||||
#elif defined(GI_SSE42_INTRINSICS) | #elif defined(GI_SSE42_INTRINSICS) | ||||
__m128 vinc0 = _mm_blendv_ps(vfneg_half, vfhalf, _mm_cmpge_ps(vsrc.val[0], vfzero)); | |||||
__m128 vinc1 = _mm_blendv_ps(vfneg_half, vfhalf, _mm_cmpge_ps(vsrc.val[1], vfzero)); | |||||
GI_FLOAT32_t vfhalf = GiBroadcastFloat32(0.5f); | |||||
GI_FLOAT32_t vfneg_half = GiBroadcastFloat32(-0.5f); | |||||
GI_FLOAT32_t vfmax_int8 = GiBroadcastFloat32(127.0f); | |||||
__m128 vinc0 = _mm_blendv_ps( | |||||
vfneg_half, vfhalf, _mm_cmpge_ps(vsrc.val[0], GiBroadcastFloat32(0.0f))); | |||||
__m128 vinc1 = _mm_blendv_ps( | |||||
vfneg_half, vfhalf, _mm_cmpge_ps(vsrc.val[1], GiBroadcastFloat32(0.0f))); | |||||
__m128 vres0 = _mm_add_ps(vsrc.val[0], vinc0);
__m128 vres1 = _mm_add_ps(vsrc.val[1], vinc1);
@@ -880,14 +1070,26 @@ GI_INT8_t GiCvtFromFloat32V2ToInt8(GI_FLOAT32_V2_t vsrc) {
vres0 = _mm_round_ps(vres0, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
vres1 = _mm_round_ps(vres1, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
vres0 = _mm_min_ps(_mm_max_ps(vres0, vfmin_int8), vfmax_int8);
vres1 = _mm_min_ps(_mm_max_ps(vres1, vfmin_int8), vfmax_int8);
vres0 = _mm_min_ps(_mm_max_ps(vres0, GiBroadcastFloat32(-128.0f)), vfmax_int8); | |||||
vres1 = _mm_min_ps(_mm_max_ps(vres1, GiBroadcastFloat32(-128.0f)), vfmax_int8); | |||||
__m128i vepi32_0 = _mm_cvtps_epi32(vres0);
__m128i vepi32_1 = _mm_cvtps_epi32(vres1);
__m128i vepi16_0 = _mm_packs_epi32(vepi32_0, vepi32_1);
__m128i vepi8 = _mm_packs_epi16(vepi16_0, vepi16_0);
return vepi8;
#elif defined(GI_RVV_INTRINSICS) | |||||
//! TODO: vfcvt_rtz_x_f_v_i32m1 is an RVV 1.0 intrinsic, but the xuantie D1 only
//! supports RVV 0.7; as a workaround, implement this API with a naive scalar loop
GI_INT8_NAIVE_t tmp_ret; | |||||
GI_FLOAT32_FIXLEN_V2_t s0 = GiFloat32Type2FixLenV2Type(vsrc); | |||||
int length = GI_SIMD_LEN_BYTE / sizeof(float); | |||||
for (int i = 0; i < 2 * length; i++) { | |||||
int8_t data = Saturate(round(s0.val[i / length][i % length]), -128, 127); | |||||
tmp_ret[i] = data; | |||||
tmp_ret[i + length * 2] = data; | |||||
} | |||||
return vle8_v_i8m1((const signed char*)&tmp_ret, GI_SIMD_LEN_BYTE / sizeof(int8_t)); | |||||
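//! Note: the V2 fallback above converts the eight floats of vsrc into the lower half
//! of tmp_ret and duplicates them into the upper half, mirroring the NEON path that
//! returns vcombine_s8(mid1, mid1).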
#else
GI_INT8_t ret;
GI_INT8_NAIVE_t tmp_ret;
@@ -932,6 +1134,11 @@ GI_INT8_t GiCvtFromFloat32V4ToInt8(GI_FLOAT32_V4_t vsrc) {
return vcombine_s8(mid1, mid2);
#endif
#elif defined(GI_SSE42_INTRINSICS)
GI_FLOAT32_t vfzero = GiBroadcastFloat32(0.0f); | |||||
GI_FLOAT32_t vfhalf = GiBroadcastFloat32(0.5f); | |||||
GI_FLOAT32_t vfneg_half = GiBroadcastFloat32(-0.5f); | |||||
GI_FLOAT32_t vfmin_int8 = GiBroadcastFloat32(-128.0f); | |||||
GI_FLOAT32_t vfmax_int8 = GiBroadcastFloat32(127.0f); | |||||
__m128 vinc0 = _mm_blendv_ps(vfneg_half, vfhalf, _mm_cmpge_ps(vsrc.val[0], vfzero));
__m128 vinc1 = _mm_blendv_ps(vfneg_half, vfhalf, _mm_cmpge_ps(vsrc.val[1], vfzero));
__m128 vinc2 = _mm_blendv_ps(vfneg_half, vfhalf, _mm_cmpge_ps(vsrc.val[2], vfzero));
@@ -960,6 +1167,20 @@ GI_INT8_t GiCvtFromFloat32V4ToInt8(GI_FLOAT32_V4_t vsrc) {
__m128i vepi16_1 = _mm_packs_epi32(vepi32_2, vepi32_3);
__m128i vepi8 = _mm_packs_epi16(vepi16_0, vepi16_1);
return vepi8;
#elif defined(GI_RVV_INTRINSICS) | |||||
//! TODO: vfcvt_rtz_x_f_v_i32m1 is an RVV 1.0 intrinsic, but the xuantie D1 only
//! supports RVV 0.7; as a workaround, implement this API with a naive scalar loop
GI_INT8_NAIVE_t tmp_ret; | |||||
GI_FLOAT32_V4_NAIVE_t s0; | |||||
s0.val[0] = GiFloat32Type2FixLenType(GiGetSubVectorFloat32V4(vsrc, 0)); | |||||
s0.val[1] = GiFloat32Type2FixLenType(GiGetSubVectorFloat32V4(vsrc, 1)); | |||||
s0.val[2] = GiFloat32Type2FixLenType(GiGetSubVectorFloat32V4(vsrc, 2)); | |||||
s0.val[3] = GiFloat32Type2FixLenType(GiGetSubVectorFloat32V4(vsrc, 3)); | |||||
int length = GI_SIMD_LEN_BYTE / sizeof(float); | |||||
for (int i = 0; i < 4 * length; i++) { | |||||
tmp_ret[i] = Saturate(round(s0.val[i / length][i % length]), -128, 127); | |||||
} | |||||
return vle8_v_i8m1((const signed char*)&tmp_ret, GI_SIMD_LEN_BYTE / sizeof(int8_t)); | |||||
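//! Note: unlike the single- and V2-variants, the V4 fallback above converts all
//! floats of vsrc, so every byte of the returned vector holds a distinct saturated
//! value and no duplication is needed.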
#else
GI_INT8_t ret;
GI_INT8_NAIVE_t tmp_ret;
@@ -1,5 +1,19 @@
#if defined(ONLY_BUILD_GI_API) | |||||
#include <gtest/gtest.h> | |||||
int gtest_main(int argc, char** argv) { | |||||
::testing::InitGoogleTest(&argc, argv); | |||||
auto ret = RUN_ALL_TESTS(); | |||||
return ret; | |||||
} | |||||
int main(int argc, char** argv) { | |||||
return gtest_main(argc, argv); | |||||
} | |||||
#else | |||||
extern "C" int gtest_main(int argc, char** argv); | extern "C" int gtest_main(int argc, char** argv); | ||||
int main(int argc, char** argv) { | int main(int argc, char** argv) { | ||||
return gtest_main(argc, argv); | return gtest_main(argc, argv); | ||||
} | } | ||||
#endif |
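// Illustration (hedged: the suite/case names and the exact GI calls are assumptions,
// not part of this patch): any gtest case registered in the GI test sources is picked
// up by RUN_ALL_TESTS(), whether main() comes from the standalone wrapper above or
// from an external gtest_main, e.g.
//
//     TEST(GI, BroadcastFloat32RoundTrip) {
//         GI_FLOAT32_t v = GiBroadcastFloat32(1.5f);
//         GI_FLOAT32_FIXLEN_t f = GiFloat32Type2FixLenType(v);
//         for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); ++i) {
//             EXPECT_FLOAT_EQ(f[i], 1.5f);
//         }
//     }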