GitOrigin-RevId: 37409bae9a
release-1.10
@@ -82,29 +82,33 @@
 #if defined(GI_AVX_INTRINSICS) || defined(GI_AVX2_INTRINSICS) || \
         defined(GI_FMA_INTRINSICS)
-typedef __m256 GI_FLOAT32;
-typedef __m256i GI_UINT8;
-typedef __m256i GI_INT8;
-typedef __m256i GI_INT16;
-typedef __m256i GI_INT32;
+typedef __m256 GI_FLOAT32_t;
+typedef __m256i GI_UINT8_t;
+typedef __m256i GI_INT8_t;
+typedef __m256i GI_INT16_t;
+typedef __m256i GI_INT32_t;
+typedef __m256i GI_UINT32_t;
 #elif defined(GI_NEON_INTRINSICS)
-typedef float32x4_t GI_FLOAT32;
-typedef uint8x16_t GI_UINT8;
-typedef int8x16_t GI_INT8;
-typedef int16x8_t GI_INT16;
-typedef int32x4_t GI_INT32;
+typedef float32x4_t GI_FLOAT32_t;
+typedef uint8x16_t GI_UINT8_t;
+typedef int8x16_t GI_INT8_t;
+typedef int16x8_t GI_INT16_t;
+typedef int32x4_t GI_INT32_t;
+typedef uint32x4_t GI_UINT32_t;
 #elif defined(GI_SSE2_INTRINSICS) || defined(GI_SSE42_INTRINSICS)
-typedef __m128 GI_FLOAT32;
-typedef __m128i GI_UINT8;
-typedef __m128i GI_INT8;
-typedef __m128i GI_INT16;
-typedef __m128i GI_INT32;
+typedef __m128 GI_FLOAT32_t;
+typedef __m128i GI_UINT8_t;
+typedef __m128i GI_INT8_t;
+typedef __m128i GI_INT16_t;
+typedef __m128i GI_INT32_t;
+typedef __m128i GI_UINT32_t;
 #else
-typedef float GI_FLOAT32 __attribute__((vector_size(16)));
-typedef uint8_t GI_UINT8 __attribute__((vector_size(16)));
-typedef int8_t GI_INT8 __attribute__((vector_size(16)));
-typedef int16_t GI_INT16 __attribute__((vector_size(16)));
-typedef int32_t GI_INT32 __attribute__((vector_size(16)));
+typedef float GI_FLOAT32_t __attribute__((vector_size(16)));
+typedef uint8_t GI_UINT8_t __attribute__((vector_size(16)));
+typedef int8_t GI_INT8_t __attribute__((vector_size(16)));
+typedef int16_t GI_INT16_t __attribute__((vector_size(16)));
+typedef int32_t GI_INT32_t __attribute__((vector_size(16)));
+typedef uint32_t GI_UINT32_t __attribute__((vector_size(16)));
 #endif
 //! general intrinsic support dynamic length simd, if avx or avx2 the simd
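Note: in the trailing #else branch the GI types are GCC/Clang vector extensions, which is what makes the scalar fallback paths in this patch legal C: a __attribute__((vector_size(16))) value can be subscripted per lane and combined with element-wise operators. A minimal standalone sketch of that behavior (illustrative, not part of the patch):

    /* same shape as the fallback GI_FLOAT32_t above */
    typedef float f32x4 __attribute__((vector_size(16)));

    f32x4 scale_add(f32x4 a, f32x4 b, float s) {
        return a + b * s; /* element-wise; the scalar s is broadcast */
    }
    /* lanes read like an array: a[0] .. a[3] */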
@@ -129,24 +133,31 @@ typedef int32_t GI_INT32 __attribute__((vector_size(16)));
 #define Min(a, b) (a) < (b) ? (a) : (b)
 
 typedef struct {
-    GI_INT32 val[2];
-} GI_INT32_V2;
+    GI_INT32_t val[2];
+} GI_INT32_V2_t;
 
 typedef struct {
-    GI_INT32 val[4];
-} GI_INT32_V4;
+    GI_INT32_t val[4];
+} GI_INT32_V4_t;
 
 typedef struct {
-    GI_FLOAT32 val[2];
-} GI_FLOAT32_V2;
+    GI_FLOAT32_t val[2];
+} GI_FLOAT32_V2_t;
 
 typedef struct {
-    GI_FLOAT32 val[4];
-} GI_FLOAT32_V4;
+    GI_FLOAT32_t val[4];
+} GI_FLOAT32_V4_t;
+
+typedef struct {
+    GI_INT16_t val[2];
+} GI_INT16_V2_t;
+
+typedef struct {
+    GI_INT8_t val[2];
+} GI_INT8_V2_t;
 
 GI_FORCEINLINE
-GI_INT32
-GiAndInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
+GI_INT32_t GiAndInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
     return vandq_s32(Vector1, Vector2);
 #elif defined(GI_SSE2_INTRINSICS)
@@ -157,8 +168,7 @@ GiAndInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
 }
 
 GI_FORCEINLINE
-GI_INT32
-GiOrInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
+GI_INT32_t GiOrInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
     return vorrq_s32(Vector1, Vector2);
 #elif defined(GI_SSE2_INTRINSICS)
@@ -169,8 +179,7 @@ GiOrInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
 }
 
 GI_FORCEINLINE
-GI_INT32
-GiAndNotInt32(GI_INT32 VectorNot, GI_INT32 Vector) {
+GI_INT32_t GiAndNotInt32(GI_INT32_t VectorNot, GI_INT32_t Vector) {
 #if defined(GI_NEON_INTRINSICS)
     return vandq_s32(vmvnq_s32(VectorNot), Vector);
 #elif defined(GI_SSE2_INTRINSICS)
@@ -181,8 +190,7 @@ GiAndNotInt32(GI_INT32 VectorNot, GI_INT32 Vector) {
 }
 
 GI_FORCEINLINE
-GI_INT32
-GiXorInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
+GI_INT32_t GiXorInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
     return veorq_s32(Vector1, Vector2);
 #elif defined(GI_SSE2_INTRINSICS)
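Taken together, these gi_common.h hunks rename every vector type with a _t suffix (GI_FLOAT32 becomes GI_FLOAT32_t), introduce GI_UINT32_t for the comparison-mask results used later in gi_float.h, and add NEON-style multi-register wrappers (GI_FLOAT32_V2_t and friends, mirroring float32x4x2_t). Callers migrate mechanically; a hypothetical caller, assuming the 16-byte (4-float) configuration:

    GI_FLOAT32_t acc = GiZeroFloat32();  /* was: GI_FLOAT32 acc = ... */

    GI_FLOAT32_V2_t pair;                /* two registers handled as one unit */
    pair.val[0] = GiLoadFloat32(src);
    pair.val[1] = GiLoadFloat32(src + 4);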
@@ -14,20 +14,51 @@
 #include "gi_common.h"
 
 GI_FORCEINLINE
-GI_INT32
-GiReinterpretAsInt32(GI_FLOAT32 In) {
+GI_INT32_t GiReinterpretAsInt32(GI_FLOAT32_t In) {
 #if defined(GI_NEON_INTRINSICS)
     return vreinterpretq_s32_f32(In);
 #elif defined(GI_SSE2_INTRINSICS)
     return _mm_castps_si128(In);
 #else
-    return GI_INT32(In);
+    return *(GI_INT32_t*)(&In);
 #endif
 }
 
 GI_FORCEINLINE
-GI_INT32
-GiRoundAsInt32(GI_FLOAT32 Vector) {
+GI_UINT32_t GiReinterpretAsUint32(GI_FLOAT32_t In) {
+#if defined(GI_NEON_INTRINSICS)
+    return vreinterpretq_u32_f32(In);
+#elif defined(GI_SSE2_INTRINSICS)
+    return _mm_castps_si128(In);
+#else
+    return *(GI_UINT32_t*)(&In);
+#endif
+}
+
+GI_FORCEINLINE
+GI_FLOAT32_t GiReintInt32ToFloat32(GI_INT32_t Vector) {
+#if defined(GI_NEON_INTRINSICS)
+    return vreinterpretq_f32_s32(Vector);
+#elif defined(GI_SSE2_INTRINSICS)
+    return _mm_castsi128_ps(Vector);
+#else
+    return *(GI_FLOAT32_t*)(&Vector);
+#endif
+}
+
+GI_FORCEINLINE
+GI_FLOAT32_t GiReintUint32ToFloat32(GI_UINT32_t Vector) {
+#if defined(GI_NEON_INTRINSICS)
+    return vreinterpretq_f32_u32(Vector);
+#elif defined(GI_SSE2_INTRINSICS)
+    return _mm_castsi128_ps(Vector);
+#else
+    return *(GI_FLOAT32_t*)(&Vector);
+#endif
+}
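One caveat on the new generic paths: casts of the form *(GI_INT32_t*)(&In) are type punning and formally violate strict aliasing; they are fine under -fno-strict-aliasing, but a memcpy round-trip is the strictly conforming equivalent and compiles to the same register move. A sketch (illustrative name, needs <string.h>):

    GI_FORCEINLINE GI_INT32_t reint_f32_s32(GI_FLOAT32_t in) {
        GI_INT32_t out;
        memcpy(&out, &in, sizeof(out)); /* bit-exact reinterpret, no UB */
        return out;
    }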
+
+GI_FORCEINLINE
+GI_INT32_t GiRoundAsInt32(GI_FLOAT32_t Vector) {
 #if defined(GI_NEON_INTRINSICS)
 #if __ARM_ARCH >= 8
     return vcvtaq_s32_f32(Vector);
@@ -47,7 +78,7 @@ GiRoundAsInt32(GI_FLOAT32 Vector) {
     return _mm_castps_si128(
             _mm_round_ps(vres0, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
 #else
-    GI_INT32 ret;
+    GI_INT32_t ret;
     for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
         ret[i] = (int32_t)round(Vector[i]);
     }
@@ -56,42 +87,43 @@ GiRoundAsInt32(GI_FLOAT32 Vector) {
 }
 
 GI_FORCEINLINE
-GI_FLOAT32
-GiCastToFloat32(GI_INT32 Vector) {
+GI_INT32_t GiCastToInt32(GI_FLOAT32_t Vector) {
 #if defined(GI_NEON_INTRINSICS)
-    return vcvtq_f32_s32(Vector);
+    return vcvtq_s32_f32(Vector);
 #elif defined(GI_SSE2_INTRINSICS)
-    return _mm_cvtepi32_ps(Vector);
+    return _mm_cvttps_epi32(Vector);
 #else
-    GI_FLOAT32 ret;
-    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
-        ret[i] = float(Vector[i]);
+    GI_INT32_t ret;
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
+        ret[i] = (int32_t)(Vector[i]);
     }
     return ret;
 #endif
 }
 
 GI_FORCEINLINE
-GI_FLOAT32
-GiReinterpretAsFloat32(GI_INT32 Vector) {
+GI_FLOAT32_t GiCastToFloat32(GI_INT32_t Vector) {
 #if defined(GI_NEON_INTRINSICS)
-    return vreinterpretq_f32_s32(Vector);
+    return vcvtq_f32_s32(Vector);
 #elif defined(GI_SSE2_INTRINSICS)
-    return _mm_castsi128_ps(Vector);
+    return _mm_cvtepi32_ps(Vector);
 #else
-    return GI_FLOAT32(Vector);
+    GI_FLOAT32_t ret;
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
+        ret[i] = float(Vector[i]);
+    }
+    return ret;
 #endif
 }
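GiRoundAsInt32 and GiCastToInt32 intentionally differ: the former rounds to nearest (vcvtaq_s32_f32, round()), the latter truncates toward zero (vcvtq_s32_f32, _mm_cvttps_epi32), matching a plain C cast. On a lane holding -1.5 they return -2 and -1 respectively, so quantization code usually wants the rounding flavor (illustrative usage):

    /* hypothetical quantize step: scale, then round to nearest */
    GI_INT32_t q = GiRoundAsInt32(GiMultiplyScalerFloat32(v, inv_scale));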
 GI_FORCEINLINE
-GI_FLOAT32
-GiBroadcastFloat32(float Value) {
+GI_FLOAT32_t GiBroadcastFloat32(float Value) {
 #if defined(GI_NEON_INTRINSICS)
     return vdupq_n_f32(Value);
 #elif defined(GI_SSE2_INTRINSICS)
     return _mm_set1_ps(Value);
 #else
-    GI_FLOAT32 ret;
+    GI_FLOAT32_t ret;
     for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
         ret[i] = Value;
     }
@@ -100,14 +132,13 @@ GiBroadcastFloat32(float Value) {
 }
 
 GI_FORCEINLINE
-GI_FLOAT32
-GiBroadcastFloat32(const float* Value) {
+GI_FLOAT32_t GiLoadBroadcastFloat32(const float* Value) {
 #if defined(GI_NEON_INTRINSICS)
     return vld1q_dup_f32(Value);
 #elif defined(GI_SSE2_INTRINSICS)
     return _mm_load_ps1(Value);
 #else
-    GI_FLOAT32 ret;
+    GI_FLOAT32_t ret;
     for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
         ret[i] = *Value;
     }
@@ -116,8 +147,7 @@ GiBroadcastFloat32(const float* Value) {
 }
 
 GI_FORCEINLINE
-GI_FLOAT32
-GiZeroFloat32(void) {
+GI_FLOAT32_t GiZeroFloat32(void) {
 #if defined(GI_NEON_INTRINSICS)
     return vdupq_n_f32(0.0f);
 #elif defined(GI_SSE2_INTRINSICS)
@@ -128,14 +158,13 @@ GiZeroFloat32(void) {
 }
 
 GI_FORCEINLINE
-GI_FLOAT32
-GiLoadFloat32(const float* Buffer) {
+GI_FLOAT32_t GiLoadFloat32(const float* Buffer) {
 #if defined(GI_NEON_INTRINSICS)
     return vld1q_f32(Buffer);
 #elif defined(GI_SSE2_INTRINSICS)
     return _mm_loadu_ps(Buffer);
 #else
-    GI_FLOAT32 ret;
+    GI_FLOAT32_t ret;
     for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
         ret[i] = Buffer[i];
     }
@@ -144,7 +173,7 @@ GiLoadFloat32(const float* Buffer) {
 }
 
 GI_FORCEINLINE
-void GiStoreFloat32(float* Buffer, GI_FLOAT32 Vector) {
+void GiStoreFloat32(float* Buffer, GI_FLOAT32_t Vector) {
 #if defined(GI_NEON_INTRINSICS)
     vst1q_f32(Buffer, Vector);
 #elif defined(GI_SSE2_INTRINSICS)
@@ -156,33 +185,22 @@ void GiStoreFloat32(float* Buffer, GI_FLOAT32 Vector) {
 #endif
 }
 
-GI_FORCEINLINE
-void GiStoreAlignedFloat32(float* Buffer, GI_FLOAT32 Vector) {
-#if defined(GI_NEON_INTRINSICS)
-    vst1q_f32(Buffer, Vector);
-#elif defined(GI_SSE2_INTRINSICS)
-    _mm_store_ps(Buffer, Vector);
-#else
-    GiStoreFloat32(Buffer, Vector);
-#endif
-}
-
 #if defined(GI_NEON_INTRINSICS)
-#define GISTORELANEFLOAT32(i)                                                       \
-    GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32 Vector) { \
-        vst1q_lane_f32(Buffer, Vector, i);                                          \
+#define GISTORELANEFLOAT32(i)                                                         \
+    GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32_t Vector) { \
+        vst1q_lane_f32(Buffer, Vector, i);                                            \
     }
 #elif defined(GI_SSE2_INTRINSICS)
 #define GISTORELANEFLOAT32(i)                                                          \
-    GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32 Vector) {    \
+    GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32_t Vector) {  \
         _mm_store_ss(Buffer, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(i, i, i, i))); \
     }
 #else
-#define GISTORELANEFLOAT32(i)                                                       \
-    GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32 Vector) { \
-        *Buffer = Vector[i];                                                        \
+#define GISTORELANEFLOAT32(i)                                                         \
+    GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32_t Vector) { \
+        *Buffer = Vector[i];                                                          \
     }
 #endif
@@ -194,20 +212,20 @@ GISTORELANEFLOAT32(3)
 #undef GISTORELANEFLOAT32
 
 #if defined(GI_NEON_INTRINSICS)
-#define GIEXTRACTLANEFLOAT32(i)                                         \
-    GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32 Vector) { \
-        return vgetq_lane_f32(Vector, i);                               \
+#define GIEXTRACTLANEFLOAT32(i)                                           \
+    GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32_t Vector) { \
+        return vgetq_lane_f32(Vector, i);                                 \
     }
 #elif defined(GI_SSE2_INTRINSICS)
 #define GIEXTRACTLANEFLOAT32(i)                                                        \
-    GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32 Vector) {                \
+    GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32_t Vector) {              \
         return _mm_cvtss_f32(_mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(i, i, i, i))); \
     }
 #else
-#define GIEXTRACTLANEFLOAT32(i)                                         \
-    GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32 Vector) { \
-        return Vector[i];                                               \
+#define GIEXTRACTLANEFLOAT32(i)                                           \
+    GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32_t Vector) { \
+        return Vector[i];                                                 \
     }
 #endif
@@ -218,8 +236,7 @@ GIEXTRACTLANEFLOAT32(3)
 #undef GIEXTRACTLANEFLOAT32
 
 GI_FORCEINLINE
-GI_FLOAT32
-GiInterleaveLowFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
+GI_FLOAT32_t GiInterleaveLowFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
 #if defined(GI_NEON64_INTRINSICS)
     return vzip1q_f32(Vector1, Vector2);
 #elif defined(GI_NEON32_INTRINSICS)
@@ -228,7 +245,7 @@ GiInterleaveLowFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
 #elif defined(GI_SSE2_INTRINSICS)
     return _mm_unpacklo_ps(Vector1, Vector2);
 #else
-    GI_FLOAT32 ret;
+    GI_FLOAT32_t ret;
     for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(float); i++) {
         ret[2 * i] = Vector1[i];
         ret[2 * i + 1] = Vector2[i];
@@ -238,8 +255,7 @@ GiInterleaveLowFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
 }
 
 GI_FORCEINLINE
-GI_FLOAT32
-GiInterleaveHighFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
+GI_FLOAT32_t GiInterleaveHighFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
 #if defined(GI_NEON64_INTRINSICS)
     return vzip2q_f32(Vector1, Vector2);
 #elif defined(GI_NEON32_INTRINSICS)
@@ -248,7 +264,7 @@ GiInterleaveHighFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
 #elif defined(GI_SSE2_INTRINSICS)
     return _mm_unpackhi_ps(Vector1, Vector2);
 #else
-    GI_FLOAT32 ret;
+    GI_FLOAT32_t ret;
     for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(float); i++) {
         ret[2 * i] = Vector1[GI_SIMD_LEN_BYTE / 2 + i];
         ret[2 * i + 1] = Vector2[GI_SIMD_LEN_BYTE / 2 + i];
@@ -258,8 +274,7 @@ GiInterleaveHighFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
 }
 
 GI_FORCEINLINE
-GI_FLOAT32
-GiAddFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
+GI_FLOAT32_t GiAddFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
     return vaddq_f32(Vector1, Vector2);
 #elif defined(GI_SSE2_INTRINSICS)
@@ -270,8 +285,7 @@ GiAddFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
 }
 
 GI_FORCEINLINE
-GI_FLOAT32
-GiSubtractFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
+GI_FLOAT32_t GiSubtractFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
     return vsubq_f32(Vector1, Vector2);
 #elif defined(GI_SSE2_INTRINSICS)
@@ -282,8 +296,7 @@ GiSubtractFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
 }
 
 GI_FORCEINLINE
-GI_FLOAT32
-GiMultiplyFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
+GI_FLOAT32_t GiMultiplyFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
     return vmulq_f32(Vector1, Vector2);
 #elif defined(GI_SSE2_INTRINSICS)
@@ -294,12 +307,11 @@ GiMultiplyFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
 }
 
 GI_FORCEINLINE
-GI_FLOAT32
-GiMultiplyScalerFloat32(GI_FLOAT32 Vector1, float Scaler) {
+GI_FLOAT32_t GiMultiplyScalerFloat32(GI_FLOAT32_t Vector1, float Scaler) {
 #if defined(GI_NEON_INTRINSICS)
     return vmulq_n_f32(Vector1, Scaler);
 #elif defined(GI_SSE2_INTRINSICS)
-    GI_FLOAT32 Vector2 = _mm_set1_ps(Scaler);
+    GI_FLOAT32_t Vector2 = _mm_set1_ps(Scaler);
     return _mm_mul_ps(Vector1, Vector2);
 #else
     return Vector1 * Scaler;
@@ -307,10 +319,14 @@ GiMultiplyScalerFloat32(GI_FLOAT32 Vector1, float Scaler) {
 }
 
 GI_FORCEINLINE
-GI_FLOAT32
-GiMultiplyAddVecFloat32(GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
+GI_FLOAT32_t GiMultiplyAddFloat32(
+        GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
+#if defined(__ARM_FEATURE_FMA)
+    return vfmaq_f32(VectorSum, Vector1, Vector2);
+#else
     return vmlaq_f32(VectorSum, Vector1, Vector2);
+#endif
 #elif defined(GI_FMA3_INTRINSICS)
     return _mm_fmadd_ps(Vector1, Vector2, VectorSum);
 #elif defined(GI_SSE2_INTRINSICS)
@@ -321,41 +337,75 @@ GiMultiplyAddVecFloat32(GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vec
 }
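With __ARM_FEATURE_FMA the accumulate maps to vfmaq_f32, a fused multiply-add with a single rounding, whereas vmlaq_f32 rounds the product and the sum separately; last-bit differences between the two builds are expected. Per lane the contract is the same as C's fmaf() on the fused paths (illustrative usage):

    GI_FLOAT32_t sum = GiZeroFloat32();
    sum = GiMultiplyAddFloat32(sum, x, y); /* per lane: sum += x * y */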
 GI_FORCEINLINE
-GI_FLOAT32
-GiMultiplyAddScalarFloat32(GI_FLOAT32 VectorSum, GI_FLOAT32 Vector, float Scalar) {
-#if defined(GI_NEON_INTRINSICS)
-    return vmlaq_n_f32(VectorSum, Vector, Scalar);
-#elif defined(GI_SSE2_INTRINSICS)
-    return GiMultiplyAddVecFloat32(VectorSum, GiBroadcastFloat32(Scalar), Vector);
-#else
-    return VectorSum + Vector * Scalar;
-#endif
-}
+GI_FLOAT32_t GiMultiplySubFloat32(
+        GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
+#if defined(GI_NEON_INTRINSICS)
+    return vmlsq_f32(VectorSum, Vector1, Vector2);
+#elif defined(GI_SSE2_INTRINSICS)
+    return _mm_sub_ps(VectorSum, _mm_mul_ps(Vector1, Vector2));
+#else
+    return VectorSum - Vector1 * Vector2;
+#endif
+}
+
+GI_FORCEINLINE
+GI_FLOAT32_t GiMultiplyAddScalarFloat32(
+        GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector, float Scalar) {
+#if defined(GI_NEON_INTRINSICS)
+#if defined(__ARM_FEATURE_FMA)
+    return vfmaq_n_f32(VectorSum, Vector, Scalar);
+#else
+    return vmlaq_n_f32(VectorSum, Vector, Scalar);
+#endif
+#elif defined(GI_SSE2_INTRINSICS)
+    return GiMultiplyAddFloat32(VectorSum, GiBroadcastFloat32(Scalar), Vector);
+#else
+    return VectorSum + Vector * Scalar;
+#endif
+}
 
 #if defined(GI_NEON_INTRINSICS)
-#define GIMULTIPLYADDLANFLOAT32(i)                                           \
-    GI_FORCEINLINE GI_FLOAT32 GiMultiplyAddLan##i##Float32(                  \
-            GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {  \
-        return vmlaq_lane_f32(VectorSum, Vector1, vget_low_f32(Vector2), i); \
-    }
-GIMULTIPLYADDLANFLOAT32(0)
-GIMULTIPLYADDLANFLOAT32(1)
-#undef GIMULTIPLYADDLANFLOAT32
-#define GIMULTIPLYADDLANFLOAT32(i)                                                \
-    GI_FORCEINLINE GI_FLOAT32 GiMultiplyAddLan##i##Float32(                       \
-            GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {       \
-        return vmlaq_lane_f32(VectorSum, Vector1, vget_high_f32(Vector2), i - 2); \
-    }
-GIMULTIPLYADDLANFLOAT32(2)
-GIMULTIPLYADDLANFLOAT32(3)
+#if defined(__ARM_FEATURE_FMA)
+#define GIMULTIPLYADDLANFLOAT32(i)                                                \
+    GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32(                     \
+            GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \
+        return vfmaq_lane_f32(VectorSum, Vector1, vget_low_f32(Vector2), i);      \
+    }
+GIMULTIPLYADDLANFLOAT32(0)
+GIMULTIPLYADDLANFLOAT32(1)
+#undef GIMULTIPLYADDLANFLOAT32
+#define GIMULTIPLYADDLANFLOAT32(i)                                                \
+    GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32(                     \
+            GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \
+        return vfmaq_lane_f32(VectorSum, Vector1, vget_high_f32(Vector2), i - 2); \
+    }
+GIMULTIPLYADDLANFLOAT32(2)
+GIMULTIPLYADDLANFLOAT32(3)
+#else
+#define GIMULTIPLYADDLANFLOAT32(i)                                                \
+    GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32(                     \
+            GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \
+        return vmlaq_lane_f32(VectorSum, Vector1, vget_low_f32(Vector2), i);      \
+    }
+GIMULTIPLYADDLANFLOAT32(0)
+GIMULTIPLYADDLANFLOAT32(1)
+#undef GIMULTIPLYADDLANFLOAT32
+#define GIMULTIPLYADDLANFLOAT32(i)                                                \
+    GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32(                     \
+            GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \
+        return vmlaq_lane_f32(VectorSum, Vector1, vget_high_f32(Vector2), i - 2); \
+    }
+GIMULTIPLYADDLANFLOAT32(2)
GIMULTIPLYADDLANFLOAT32(3)
+#endif
 #undef GIMULTIPLYADDLANFLOAT32
 #elif defined(GI_SSE2_INTRINSICS)
-#define GIMULTIPLYADDLANFLOAT32(i)                                          \
-    GI_FORCEINLINE GI_FLOAT32 GiMultiplyAddLan##i##Float32(                 \
-            GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { \
-        return GiMultiplyAddScalarFloat32(                                  \
-                VectorSum, Vector1, GiExtractLane##i##Float32(Vector2));    \
+#define GIMULTIPLYADDLANFLOAT32(i)                                                \
+    GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32(                     \
+            GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \
+        return GiMultiplyAddScalarFloat32(                                        \
+                VectorSum, Vector1, GiExtractLane##i##Float32(Vector2));          \
     }
 GIMULTIPLYADDLANFLOAT32(0)
 GIMULTIPLYADDLANFLOAT32(1)
@@ -363,10 +413,10 @@ GIMULTIPLYADDLANFLOAT32(2)
 GIMULTIPLYADDLANFLOAT32(3)
 #undef GIMULTIPLYADDLANFLOAT32
 #else
-#define GIMULTIPLYADDLANFLOAT32(i)                                          \
-    GI_FORCEINLINE GI_FLOAT32 GiMultiplyAddLan##i##Float32(                 \
-            GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { \
-        return VectorSum + Vector1 * Vector2[i];                            \
+#define GIMULTIPLYADDLANFLOAT32(i)                                                \
+    GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32(                     \
+            GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \
+        return VectorSum + Vector1 * Vector2[i];                                  \
     }
 GIMULTIPLYADDLANFLOAT32(0)
 GIMULTIPLYADDLANFLOAT32(1)
@@ -376,8 +426,7 @@ GIMULTIPLYADDLANFLOAT32(3)
 #endif
 
 GI_FORCEINLINE
-GI_FLOAT32
-GiDivideFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
+GI_FLOAT32_t GiDivideFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
 #if defined(GI_NEON64_INTRINSICS)
     return vdivq_f32(Vector1, Vector2);
 #elif defined(GI_NEON32_INTRINSICS)
@@ -392,64 +441,129 @@ GiDivideFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
 }
 
 GI_FORCEINLINE
-GI_FLOAT32
-GiGreaterThanFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
-#if defined(GI_NEON_INTRINSICS)
-    return vreinterpretq_f32_u32(vcgtq_f32(Vector1, Vector2));
-#elif defined(GI_SSE2_INTRINSICS)
-    return _mm_cmpgt_ps(Vector1, Vector2);
-#else
-    return Vector1 > Vector2;
-#endif
-}
+GI_FLOAT32_t GiRecpeSFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
+#if defined(GI_NEON64_INTRINSICS)
+    return vrecpsq_f32(Vector1, Vector2);
+#elif defined(GI_SSE2_INTRINSICS)
+    GI_FLOAT32_t two = _mm_set1_ps(2.0f);
+    return _mm_sub_ps(two, _mm_mul_ps(Vector1, Vector2));
+#else
+    return (2.0f - Vector1 * Vector2);
+#endif
+}
+
+GI_FORCEINLINE
+GI_FLOAT32_t GiRecpeFloat32(GI_FLOAT32_t Vector) {
+#if defined(GI_NEON32_INTRINSICS)
+    return vrecpeq_f32(Vector);
+#elif defined(GI_SSE2_INTRINSICS)
+    GI_FLOAT32_t ones = _mm_set1_ps(1.0f);
+    return _mm_div_ps(ones, Vector);
+#else
+    return 1 / Vector;
+#endif
+}
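GiRecpeFloat32 is exact on the SSE2 and fallback paths but only a coarse estimate on NEON (vrecpeq_f32 delivers roughly 8 bits); GiRecpeSFloat32 supplies the Newton-Raphson correction factor 2 - d*x used to refine it. The usual idiom, one iteration shown (a sketch):

    /* approximate 1/d, then refine: x1 = x0 * (2 - d * x0) */
    GI_FLOAT32_t x = GiRecpeFloat32(d);
    x = GiMultiplyFloat32(GiRecpeSFloat32(d, x), x);
    /* each extra iteration roughly doubles the number of correct bits */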
+
+GI_FORCEINLINE
+GI_FLOAT32_t GiNegFloat32(GI_FLOAT32_t Vector) {
+#if defined(GI_NEON32_INTRINSICS)
+    return vnegq_f32(Vector);
+#elif defined(GI_SSE2_INTRINSICS)
+    GI_FLOAT32_t zero = _mm_set1_ps(0.0f);
+    return _mm_sub_ps(zero, Vector);
+#else
+    return -Vector;
+#endif
+}
+
+GI_FORCEINLINE
+GI_UINT32_t GiGreaterThanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
+#if defined(GI_NEON_INTRINSICS)
+    return vcgtq_f32(Vector1, Vector2);
+#elif defined(GI_SSE2_INTRINSICS)
+    return _mm_castps_si128(_mm_cmpgt_ps(Vector1, Vector2));
+#else
+    GI_UINT32_t ret;
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
+        ret[i] = Vector1[i] > Vector2[i] ? 0xFFFFFFFF : 0;
+    }
+    return ret;
+#endif
+}
+
+GI_FORCEINLINE
+GI_UINT32_t GiLessThanEqFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
+#if defined(GI_NEON_INTRINSICS)
+    return vcleq_f32(Vector1, Vector2);
+#elif defined(GI_SSE2_INTRINSICS)
+    return _mm_castps_si128(_mm_cmple_ps(Vector1, Vector2));
+#else
+    GI_UINT32_t ret;
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
+        ret[i] = Vector1[i] <= Vector2[i] ? 0xFFFFFFFF : 0;
+    }
+    return ret;
+#endif
+}
+
+GI_FORCEINLINE
+GI_UINT32_t GiLessThanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
+#if defined(GI_NEON_INTRINSICS)
+    return vcltq_f32(Vector1, Vector2);
+#elif defined(GI_SSE2_INTRINSICS)
+    return _mm_castps_si128(_mm_cmplt_ps(Vector1, Vector2));
+#else
+    GI_UINT32_t ret;
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
+        ret[i] = Vector1[i] < Vector2[i] ? 0xFFFFFFFF : 0;
+    }
+    return ret;
+#endif
+}
 
 GI_FORCEINLINE
-GI_FLOAT32
-GiAndFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
+GI_FLOAT32_t GiAndFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
 #if defined(GI_SSE2_INTRINSICS)
     return _mm_and_ps(Vector1, Vector2);
 #else
-    return GiReinterpretAsFloat32(
+    return GiReintInt32ToFloat32(
             GiAndInt32(GiReinterpretAsInt32(Vector1), GiReinterpretAsInt32(Vector2)));
 #endif
 }
 
 GI_FORCEINLINE
-GI_FLOAT32
-GiOrFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
+GI_FLOAT32_t GiOrFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
 #if defined(GI_SSE2_INTRINSICS)
     return _mm_or_ps(Vector1, Vector2);
 #else
-    return GiReinterpretAsFloat32(
+    return GiReintInt32ToFloat32(
            GiOrInt32(GiReinterpretAsInt32(Vector1), GiReinterpretAsInt32(Vector2)));
 #endif
 }
 
 GI_FORCEINLINE
-GI_FLOAT32
-GiAndNotFloat32(GI_FLOAT32 VectorNot, GI_FLOAT32 Vector) {
+GI_FLOAT32_t GiAndNotFloat32(GI_FLOAT32_t VectorNot, GI_FLOAT32_t Vector) {
 #if defined(GI_SSE2_INTRINSICS)
     return _mm_andnot_ps(VectorNot, Vector);
 #else
-    return GiReinterpretAsFloat32(GiAndNotInt32(
+    return GiReintInt32ToFloat32(GiAndNotInt32(
             GiReinterpretAsInt32(VectorNot), GiReinterpretAsInt32(Vector)));
 #endif
 }
 
 GI_FORCEINLINE
-GI_FLOAT32
-GiXorFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
+GI_FLOAT32_t GiXorFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
 #if defined(GI_SSE2_INTRINSICS)
     return _mm_xor_ps(Vector1, Vector2);
 #else
-    return GiReinterpretAsFloat32(
+    return GiReintInt32ToFloat32(
             GiXorInt32(GiReinterpretAsInt32(Vector1), GiReinterpretAsInt32(Vector2)));
 #endif
 }
 
 GI_FORCEINLINE
-GI_FLOAT32
-GiBlendFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2, GI_FLOAT32 Selection) {
+GI_FLOAT32_t GiBlendFloat32(
+        GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2, GI_FLOAT32_t Selection) {
     return GiOrFloat32(
             GiAndFloat32(Vector2, Selection), GiAndNotFloat32(Selection, Vector1));
 }
@@ -458,14 +572,54 @@ GiBlendFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2, GI_FLOAT32 Selection) {
 
 #define MAX_NAN(a, b) (isnan(a) || (a) > (b)) ? (a) : (b);
 
 GI_FORCEINLINE
-GI_FLOAT32
-GiMaximumFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
+GI_FLOAT32_t GiBSLFloat32(
+        GI_UINT32_t Selection, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
+#if defined(GI_NEON_INTRINSICS)
+    return vbslq_f32(Selection, Vector1, Vector2);
+#else
+    //! bits of Vector1 where Selection is set, bits of Vector2 elsewhere
+    return GiBlendFloat32(Vector2, Vector1, GiReintUint32ToFloat32(Selection));
+#endif
+}
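The comparison helpers return GI_UINT32_t lane masks (all ones or all zeros), which feed straight into GiBSLFloat32's bitwise select: bits of Vector1 are taken where Selection is set, bits of Vector2 elsewhere. That composes into branchless per-lane selection, e.g. a vector ReLU (illustrative):

    GI_FLOAT32_t zero = GiZeroFloat32();
    GI_UINT32_t mask = GiGreaterThanFloat32(x, zero); /* lane: x > 0 ? ~0u : 0 */
    GI_FLOAT32_t relu = GiBSLFloat32(mask, x, zero);  /* x where positive, else 0 */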
+
+GI_FORCEINLINE
+GI_FLOAT32_t GiMaximumFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
+#if defined(GI_NEON_INTRINSICS)
+    return vmaxq_f32(Vector1, Vector2);
+#elif defined(GI_SSE2_INTRINSICS)
+    return _mm_max_ps(Vector1, Vector2);
+#else
+    GI_FLOAT32_t max;
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
+        max[i] = Max(Vector1[i], Vector2[i]);
+    }
+    return max;
+#endif
+}
+
+GI_FORCEINLINE
+GI_FLOAT32_t GiMinimumFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
+#if defined(GI_NEON_INTRINSICS)
+    return vminq_f32(Vector1, Vector2);
+#elif defined(GI_SSE2_INTRINSICS)
+    return _mm_min_ps(Vector1, Vector2);
+#else
+    GI_FLOAT32_t min;
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
+        min[i] = Min(Vector1[i], Vector2[i]);
+    }
+    return min;
+#endif
+}
+
+GI_FORCEINLINE
+GI_FLOAT32_t GiMaxNanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
     return vmaxq_f32(Vector1, Vector2);
 #else
     //! _mm_max_ps does not follow the IEEE standard when input is NAN, so
     //! implement by C code
-    GI_FLOAT32 max;
+#define MAX_NAN(a, b) (isnan(a) || (a) > (b)) ? (a) : (b);
+    GI_FLOAT32_t max;
     for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
         max[i] = MAX_NAN(Vector1[i], Vector2[i]);
     }
@@ -474,14 +628,14 @@ GiMaximumFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
 }
 
 GI_FORCEINLINE
-GI_FLOAT32
-GiMinimumFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
+GI_FLOAT32_t GiMinNanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
     return vminq_f32(Vector1, Vector2);
 #else
     //! _mm_min_ps does not follow the IEEE standard when input is NAN, so
     //! implement by C code
-    GI_FLOAT32 min;
+#define MIN_NAN(a, b) (isnan(a) || (a) < (b)) ? (a) : (b);
+    GI_FLOAT32_t min;
     for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
         min[i] = MIN_NAN(Vector1[i], Vector2[i]);
     }
@@ -490,15 +644,14 @@ GiMinimumFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
 }
 
 GI_FORCEINLINE
-GI_FLOAT32
-GiClampFloat32(GI_FLOAT32 Value, float LowerRange, float UpperRange) {
+GI_FLOAT32_t GiClampFloat32(GI_FLOAT32_t Value, float LowerRange, float UpperRange) {
     Value = GiMaximumFloat32(GiBroadcastFloat32(LowerRange), Value);
     Value = GiMinimumFloat32(GiBroadcastFloat32(UpperRange), Value);
     return Value;
 }
 
 GI_FORCEINLINE
-float GiReduceAddFloat32(GI_FLOAT32 Vector) {
+float GiReduceAddFloat32(GI_FLOAT32_t Vector) {
 #if defined(GI_NEON64_INTRINSICS)
     Vector = vpaddq_f32(Vector, Vector);
     Vector = vpaddq_f32(Vector, Vector);
@@ -525,7 +678,7 @@ float GiReduceAddFloat32(GI_FLOAT32 Vector) {
 }
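GiReduceAddFloat32 collapses a vector into the scalar sum of its lanes; combined with the multiply-add helper it gives the usual reduction skeleton, e.g. a dot product (a sketch assuming the 4-float configuration and n a multiple of 4):

    float gi_dot(const float* a, const float* b, size_t n) {
        GI_FLOAT32_t acc = GiZeroFloat32();
        for (size_t i = 0; i < n; i += 4) {
            acc = GiMultiplyAddFloat32(acc, GiLoadFloat32(a + i), GiLoadFloat32(b + i));
        }
        return GiReduceAddFloat32(acc);
    }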
 GI_FORCEINLINE
-float GiReduceMultiplyFloat32(GI_FLOAT32 Vector) {
+float GiReduceMultiplyFloat32(GI_FLOAT32_t Vector) {
 #if defined(GI_NEON64_INTRINSICS)
     float32x2_t low = vget_low_f32(Vector);
     float32x2_t high = vget_high_f32(Vector);
@@ -550,7 +703,7 @@ float GiReduceMultiplyFloat32(GI_FLOAT32 Vector) {
 #define Min(a, b) (a) < (b) ? (a) : (b)
 
 GI_FORCEINLINE
-float GiReduceMaximumFloat32(GI_FLOAT32 Vector) {
+float GiReduceMaxNanFloat32(GI_FLOAT32_t Vector) {
 #if defined(GI_NEON64_INTRINSICS)
     return vmaxvq_f32(Vector);
 #elif defined(GI_NEON32_INTRINSICS)
@@ -560,9 +713,9 @@ float GiReduceMaximumFloat32(GI_FLOAT32 Vector) {
     VectorLow = vpmax_f32(VectorLow, VectorHigh);
     return vget_lane_f32(VectorLow, 0);
 #elif defined(GI_SSE2_INTRINSICS)
-    Vector = GiMaximumFloat32(
+    Vector = GiMaxNanFloat32(
             Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(2, 3, 2, 3)));
-    Vector = GiMaximumFloat32(
+    Vector = GiMaxNanFloat32(
             Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1)));
     return GiExtractLane0Float32(Vector);
 #else
@@ -575,7 +728,7 @@ float GiReduceMaximumFloat32(GI_FLOAT32 Vector) {
 }
 
 GI_FORCEINLINE
-float GiReduceMinimumFloat32(GI_FLOAT32 Vector) {
+float GiReduceMinNanFloat32(GI_FLOAT32_t Vector) {
 #if defined(GI_NEON64_INTRINSICS)
     return vminvq_f32(Vector);
 #elif defined(GI_NEON32_INTRINSICS)
@@ -585,9 +738,9 @@ float GiReduceMinimumFloat32(GI_FLOAT32 Vector) {
     VectorLow = vpmin_f32(VectorLow, VectorHigh);
     return vget_lane_f32(VectorLow, 0);
 #elif defined(GI_SSE2_INTRINSICS)
-    Vector = GiMinimumFloat32(
+    Vector = GiMinNanFloat32(
             Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(2, 3, 2, 3)));
-    Vector = GiMinimumFloat32(
+    Vector = GiMinNanFloat32(
             Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1)));
     return GiExtractLane0Float32(Vector);
 #else
@@ -599,4 +752,24 @@ float GiReduceMinimumFloat32(GI_FLOAT32 Vector) {
 #endif
 }
 
+GI_FORCEINLINE
+GI_FLOAT32_t GiAbsFloat32(GI_FLOAT32_t Vector1) {
+#if defined(GI_NEON64_INTRINSICS)
+    return vabsq_f32(Vector1);
+#elif defined(GI_SSE2_INTRINSICS)
+    union {
+        unsigned int int_val;
+        float float_val;
+    } value;
+    value.int_val = 0x7fffffff;
+    return _mm_and_ps(Vector1, _mm_set_ps1(value.float_val));
+#else
+    GI_FLOAT32_t ret;
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
+        ret[i] = Vector1[i] > 0 ? Vector1[i] : -Vector1[i];
+    }
+    return ret;
+#endif
+}
+
 // vim: syntax=cpp.doxygen

@@ -14,14 +14,13 @@
 #include "gi_common.h"
 
 GI_FORCEINLINE
-GI_INT32
-GiBroadcastInt32(int32_t Value) {
+GI_INT32_t GiBroadcastInt32(int32_t Value) {
 #if defined(GI_NEON_INTRINSICS)
     return vdupq_n_s32(Value);
 #elif defined(GI_SSE2_INTRINSICS)
     return _mm_set1_epi32(Value);
 #else
-    GI_INT32 ret;
+    GI_INT32_t ret;
     for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
         ret[i] = Value;
     }
@@ -30,14 +29,28 @@ GiBroadcastInt32(int32_t Value) {
 }
 
 GI_FORCEINLINE
-GI_INT8
-GiBroadcastInt8(int8_t Value) {
+GI_UINT32_t GiBroadcastUint32(int32_t Value) {
+#if defined(GI_NEON_INTRINSICS)
+    return vdupq_n_u32(Value);
+#elif defined(GI_SSE2_INTRINSICS)
+    return _mm_set1_epi32(Value);
+#else
+    GI_UINT32_t ret;
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
+        ret[i] = Value;
+    }
+    return ret;
+#endif
+}
+
+GI_FORCEINLINE
+GI_INT8_t GiBroadcastInt8(int8_t Value) {
 #if defined(GI_NEON_INTRINSICS)
     return vdupq_n_s8(Value);
 #elif defined(GI_SSE2_INTRINSICS)
     return _mm_set1_epi8(Value);
 #else
-    GI_INT8 ret;
+    GI_INT8_t ret;
     for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
         ret[i] = Value;
     }
@@ -46,14 +59,13 @@ GiBroadcastInt8(int8_t Value) {
 }
 
 GI_FORCEINLINE
-GI_INT32
-GiLoadInt32(const int32_t* Buffer) {
+GI_INT32_t GiLoadInt32(const int32_t* Buffer) {
 #if defined(GI_NEON_INTRINSICS)
     return vld1q_s32(Buffer);
 #elif defined(GI_SSE2_INTRINSICS)
     return _mm_loadu_si128((const __m128i*)Buffer);
 #else
-    GI_INT32 ret;
+    GI_INT32_t ret;
     for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
         ret[i] = Buffer[i];
     }
@@ -62,14 +74,13 @@ GiLoadInt32(const int32_t* Buffer) {
 }
 
 GI_FORCEINLINE
-GI_INT8
-GiLoadInt8(const int8_t* Buffer) {
+GI_INT8_t GiLoadInt8(const int8_t* Buffer) {
 #if defined(GI_NEON_INTRINSICS)
     return vld1q_s8(Buffer);
 #elif defined(GI_SSE2_INTRINSICS)
     return _mm_loadu_si128((const __m128i*)Buffer);
 #else
-    GI_INT8 ret;
+    GI_INT8_t ret;
     for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
         ret[i] = Buffer[i];
     }
@@ -78,7 +89,7 @@ GiLoadInt8(const int8_t* Buffer) {
 }
 
 GI_FORCEINLINE
-void GiStoreInt32(int32_t* Buffer, GI_INT32 Vector) {
+void GiStoreInt32(int32_t* Buffer, GI_INT32_t Vector) {
 #if defined(GI_NEON_INTRINSICS)
     vst1q_s32(Buffer, Vector);
 #elif defined(GI_SSE2_INTRINSICS)
@@ -90,8 +101,60 @@ void GiStoreInt32(int32_t* Buffer, GI_INT32 Vector) {
 #endif
 }
 
+#if defined(GI_NEON_INTRINSICS)
+#define GISTORELANEINT32(i)                                                         \
+    GI_FORCEINLINE void GiStoreLane##i##Int32(int32_t* Buffer, GI_INT32_t Vector) { \
+        vst1q_lane_s32(Buffer, Vector, i);                                          \
+    }
+#elif defined(GI_SSE2_INTRINSICS)
+#define GISTORELANEINT32(i)                                                         \
+    GI_FORCEINLINE void GiStoreLane##i##Int32(int32_t* Buffer, GI_INT32_t Vector) { \
+        GI_FLOAT32_t tmp = _mm_castsi128_ps(Vector);                                \
+        _mm_store_ss(                                                               \
+                (float*)Buffer, _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(i, i, i, i))); \
+    }
+#else
+#define GISTORELANEINT32(i)                                                         \
+    GI_FORCEINLINE void GiStoreLane##i##Int32(int32_t* Buffer, GI_INT32_t Vector) { \
+        *Buffer = Vector[i];                                                        \
+    }
+#endif
+
+GISTORELANEINT32(0)
+GISTORELANEINT32(1)
+GISTORELANEINT32(2)
+GISTORELANEINT32(3)
+#undef GISTORELANEINT32
+
+GI_FORCEINLINE
+GI_INT8_t GiReinterInt32ToInt8(GI_INT32_t Vector) {
+#if defined(GI_NEON_INTRINSICS)
+    return vreinterpretq_s8_s32(Vector);
+#elif defined(GI_SSE2_INTRINSICS)
+    return Vector;
+#else
+    return *(GI_INT8_t*)&Vector;
+#endif
+}
+
+GI_FORCEINLINE
+void GiStoreInt16(int16_t* Buffer, GI_INT16_t Vector) {
+#if defined(GI_NEON_INTRINSICS)
+    vst1q_s16(Buffer, Vector);
+#elif defined(GI_SSE2_INTRINSICS)
+    _mm_storeu_si128((__m128i*)Buffer, Vector);
+#else
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int16_t); i++) {
+        Buffer[i] = Vector[i];
    }
+#endif
+}
+
 GI_FORCEINLINE
-void GiStoreInt8(int8_t* Buffer, GI_INT8 Vector) {
+void GiStoreInt8(int8_t* Buffer, GI_INT8_t Vector) {
 #if defined(GI_NEON_INTRINSICS)
     vst1q_s8(Buffer, Vector);
 #elif defined(GI_SSE2_INTRINSICS)
@@ -104,7 +167,7 @@ void GiStoreInt8(int8_t* Buffer, GI_INT8 Vector) {
 }
 
 GI_FORCEINLINE
-void GiStoreLowInt8(int8_t* Buffer, GI_INT8 Vector) {
+void GiStoreLowInt8(int8_t* Buffer, GI_INT8_t Vector) {
 #if defined(GI_NEON_INTRINSICS)
     vst1_s8(Buffer, vget_low_s8(Vector));
 #elif defined(GI_SSE2_INTRINSICS)
@@ -117,7 +180,7 @@ void GiStoreLowInt8(int8_t* Buffer, GI_INT8 Vector) {
 }
 
 GI_FORCEINLINE
-void GiStoreHihgInt8(int8_t* Buffer, GI_INT8 Vector) {
+void GiStoreHihgInt8(int8_t* Buffer, GI_INT8_t Vector) {
 #if defined(GI_NEON_INTRINSICS)
     vst1_s8(Buffer, vget_high_s8(Vector));
 #elif defined(GI_SSE2_INTRINSICS)
@@ -130,8 +193,47 @@ void GiStoreHihgInt8(int8_t* Buffer, GI_INT8 Vector) {
 }
 
 GI_FORCEINLINE
-GI_INT32
-GiAddInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
+GI_INT32_t GiNegInt32(GI_INT32_t Vector) {
+#if defined(GI_NEON32_INTRINSICS)
+    return vnegq_s32(Vector);
+#elif defined(GI_SSE2_INTRINSICS)
+    GI_INT32_t zero = _mm_set1_epi32(0);
+    return _mm_sub_epi32(zero, Vector);
+#else
+    return -Vector;
+#endif
+}
+
+GI_FORCEINLINE
+GI_INT8_t GiNegInt8(GI_INT8_t Vector) {
+#if defined(GI_NEON32_INTRINSICS)
+    return vnegq_s8(Vector);
+#elif defined(GI_SSE2_INTRINSICS)
+    GI_INT32_t zero = _mm_set1_epi8(0);
+    return _mm_sub_epi8(zero, Vector);
+#else
+    return -Vector;
+#endif
+}
+
+GI_FORCEINLINE
+GI_UINT32_t GiTestAndSetUint32(GI_UINT32_t Vector1, GI_UINT32_t Vector2) {
+#if defined(GI_NEON_INTRINSICS)
+    return vtstq_u32(Vector1, Vector2);
+#elif defined(GI_SSE2_INTRINSICS)
+    //! emulate vtst: all ones where (Vector1 & Vector2) != 0
+    GI_UINT32_t tmp = _mm_and_si128(Vector1, Vector2);
+    GI_UINT32_t eq_zero = _mm_cmpeq_epi32(tmp, _mm_setzero_si128());
+    return _mm_xor_si128(eq_zero, _mm_set1_epi32(-1));
+#else
+    GI_UINT32_t ret;
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
+        ret[i] = Vector1[i] & Vector2[i] ? 0xFFFFFFFF : 0;
    }
+    return ret;
+#endif
+}
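GiTestAndSetUint32 carries NEON vtst semantics across all three paths: a result lane is all ones exactly when the two inputs share at least one set bit, i.e. (v1 & v2) != 0. A convenient corollary is a per-lane nonzero test (illustrative):

    GI_UINT32_t nonzero = GiTestAndSetUint32(v, v); /* lane: v != 0 ? ~0u : 0 */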
+
+GI_FORCEINLINE
+GI_INT32_t GiAddInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
     return vaddq_s32(Vector1, Vector2);
 #elif defined(GI_SSE2_INTRINSICS)
@@ -142,8 +244,40 @@ GiAddInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
 }
 
 GI_FORCEINLINE
-GI_INT32
-GiSubtractInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
+GI_UINT32_t GiAddUint32(GI_UINT32_t Vector1, GI_UINT32_t Vector2) {
+#if defined(GI_NEON_INTRINSICS)
+    return vaddq_u32(Vector1, Vector2);
+#elif defined(GI_SSE2_INTRINSICS)
+    return _mm_add_epi32(Vector1, Vector2);
+#else
+    return Vector1 + Vector2;
+#endif
+}
+
+GI_FORCEINLINE
+GI_INT16_t GiAddInt16(GI_INT16_t Vector1, GI_INT16_t Vector2) {
+#if defined(GI_NEON_INTRINSICS)
+    return vaddq_s16(Vector1, Vector2);
+#elif defined(GI_SSE2_INTRINSICS)
+    return _mm_add_epi16(Vector1, Vector2);
+#else
+    return Vector1 + Vector2;
+#endif
+}
+
+GI_FORCEINLINE
+GI_INT8_t GiAddInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) {
+#if defined(GI_NEON_INTRINSICS)
+    return vaddq_s8(Vector1, Vector2);
+#elif defined(GI_SSE2_INTRINSICS)
+    return _mm_add_epi8(Vector1, Vector2);
+#else
+    return Vector1 + Vector2;
+#endif
+}
+
+GI_FORCEINLINE
+GI_INT32_t GiSubtractInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
     return vsubq_s32(Vector1, Vector2);
 #elif defined(GI_SSE2_INTRINSICS)
@@ -154,20 +288,82 @@ GiSubtractInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
 }
 
 GI_FORCEINLINE
-GI_INT32
-GiMultiplyInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
+GI_UINT32_t GiSubtractUint32(GI_UINT32_t Vector1, GI_UINT32_t Vector2) {
+#if defined(GI_NEON_INTRINSICS)
+    return vsubq_u32(Vector1, Vector2);
+#elif defined(GI_SSE2_INTRINSICS)
+    return _mm_sub_epi32(Vector1, Vector2);
+#else
+    return Vector1 - Vector2;
+#endif
+}
+
+GI_FORCEINLINE
+GI_INT8_t GiSubtractInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) {
+#if defined(GI_NEON_INTRINSICS)
+    return vsubq_s8(Vector1, Vector2);
+#elif defined(GI_SSE2_INTRINSICS)
+    return _mm_sub_epi8(Vector1, Vector2);
+#else
+    return Vector1 - Vector2;
+#endif
+}
+
+GI_FORCEINLINE
+GI_INT32_t GiMultiplyInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
     return vmulq_s32(Vector1, Vector2);
 #elif defined(GI_SSE2_INTRINSICS)
-    return _mm_mul_epi32(Vector1, Vector2);
+    GI_FLOAT32_t v0 = _mm_cvtepi32_ps(Vector1);
+    GI_FLOAT32_t v1 = _mm_cvtepi32_ps(Vector2);
+    return _mm_cvttps_epi32(_mm_mul_ps(v0, v1));
 #else
     return Vector1 * Vector2;
 #endif
 }
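The float round-trip keeps GiMultiplyInt32 branch-free on plain SSE2 (_mm_mullo_epi32 only exists from SSE4.1 onward), but it is exact only while operands and products fit in float's 24-bit mantissa. Where full 32-bit wraparound semantics are required, the classic SSE2 emulation with two widening _mm_mul_epu32 calls is the alternative (hypothetical helper, not part of the patch):

    __m128i mullo_epi32_sse2(__m128i a, __m128i b) {
        __m128i even = _mm_mul_epu32(a, b); /* 64-bit products of lanes 0 and 2 */
        __m128i odd = _mm_mul_epu32(
                _mm_srli_si128(a, 4), _mm_srli_si128(b, 4)); /* lanes 1 and 3 */
        return _mm_unpacklo_epi32(            /* gather the four low 32-bit halves */
                _mm_shuffle_epi32(even, _MM_SHUFFLE(0, 0, 2, 0)),
                _mm_shuffle_epi32(odd, _MM_SHUFFLE(0, 0, 2, 0)));
    }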
//! in x86, there is no int multiply, so implement it naive | |||||
GI_FORCEINLINE | |||||
GI_INT8_t GiMultiplyInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) { | |||||
#if defined(GI_NEON_INTRINSICS) | |||||
return vmulq_s8(Vector1, Vector2); | |||||
#elif defined(GI_SSE2_INTRINSICS) | |||||
int8_t v1[16], v2[16], res[16]; | |||||
_mm_storeu_si128((__m128i*)v1, Vector1); | |||||
_mm_storeu_si128((__m128i*)v2, Vector2); | |||||
for (size_t id = 0; id < 16; id++) { | |||||
res[id] = v1[id] * v2[id]; | |||||
} | |||||
return _mm_loadu_si128((__m128i*)res); | |||||
#else | #else | ||||
return Vector1 * Vector2; | return Vector1 * Vector2; | ||||
#endif | #endif | ||||
} | } | ||||
GI_FORCEINLINE | GI_FORCEINLINE | ||||
GI_INT8 | |||||
GiAndInt8(GI_INT8 Vector1, GI_INT8 Vector2) { | |||||
GI_INT32_t GiMultiplyAddInt32( | |||||
GI_INT32_t Vector1, GI_INT32_t Vector2, GI_INT32_t Vector3) { | |||||
#if defined(GI_NEON_INTRINSICS) | |||||
return vmlaq_s32(Vector1, Vector2, Vector3); | |||||
#elif defined(GI_SSE2_INTRINSICS) | |||||
return _mm_add_epi32(Vector1, GiMultiplyInt32(Vector2, Vector3)); | |||||
#else | |||||
return Vector1 + Vector2 * Vector3; | |||||
#endif | |||||
} | |||||
GI_FORCEINLINE | |||||
GI_INT8_t GiMultiplyAddInt8(GI_INT8_t Vector1, GI_INT8_t Vector2, GI_INT8_t Vector3) { | |||||
#if defined(GI_NEON_INTRINSICS) | |||||
return vmlaq_s8(Vector1, Vector2, Vector3); | |||||
#elif defined(GI_SSE2_INTRINSICS) | |||||
return _mm_add_epi8(Vector1, GiMultiplyInt8(Vector2, Vector3)); | |||||
#else | |||||
return Vector1 + Vector2 * Vector3; | |||||
#endif | |||||
} | |||||
GI_FORCEINLINE | |||||
GI_INT8_t GiAndInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) { | |||||
#if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
return vandq_s8(Vector1, Vector2); | return vandq_s8(Vector1, Vector2); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
@@ -178,8 +374,18 @@ GiAndInt8(GI_INT8 Vector1, GI_INT8 Vector2) { | |||||
} | } | ||||
GI_FORCEINLINE | GI_FORCEINLINE | ||||
GI_INT8 | |||||
GiOrInt8(GI_INT8 Vector1, GI_INT8 Vector2) { | |||||
GI_UINT32_t GiEOrUint32(GI_UINT32_t Vector1, GI_UINT32_t Vector2) { | |||||
#if defined(GI_NEON_INTRINSICS) | |||||
return veorq_u32(Vector1, Vector2); | |||||
#elif defined(GI_SSE2_INTRINSICS) | |||||
return _mm_xor_si128(Vector1, Vector2); | |||||
#else | |||||
return Vector1 ^ Vector2; | |||||
#endif | |||||
} | |||||
GI_FORCEINLINE | |||||
GI_INT8_t GiOrInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) { | |||||
#if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
return vorrq_s8(Vector1, Vector2); | return vorrq_s8(Vector1, Vector2); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
@@ -190,21 +396,19 @@ GiOrInt8(GI_INT8 Vector1, GI_INT8 Vector2) { | |||||
} | } | ||||
GI_FORCEINLINE | GI_FORCEINLINE | ||||
GI_INT8 | |||||
GiAndNotInt8(GI_INT8 VectorNot, GI_INT8 Vector) { | |||||
GI_INT8_t GiAndNotInt8(GI_INT8_t VectorNot, GI_INT8_t Vector) { | |||||
#if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
return vandq_s8(vmvnq_s8(VectorNot), Vector); | return vandq_s8(vmvnq_s8(VectorNot), Vector); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
return _mm_andnot_si128(VectorNot, Vector); | return _mm_andnot_si128(VectorNot, Vector); | ||||
#else | #else | ||||
GI_INT8 Not = ~VectorNot; | |||||
GI_INT8_t Not = ~VectorNot; | |||||
return (Not & Vector); | return (Not & Vector); | ||||
#endif | #endif | ||||
} | } | ||||
GI_FORCEINLINE | GI_FORCEINLINE | ||||
GI_INT8 | |||||
GiXorInt8(GI_INT8 Vector1, GI_INT8 Vector2) { | |||||
GI_INT8_t GiXorInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) { | |||||
#if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
return veorq_s8(Vector1, Vector2); | return veorq_s8(Vector1, Vector2); | ||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
@@ -214,47 +418,85 @@ GiXorInt8(GI_INT8 Vector1, GI_INT8 Vector2) { | |||||
#endif | #endif | ||||
} | } | ||||
GI_FORCEINLINE | |||||
GI_INT32_t GiShiftLeft23Int32(GI_INT32_t Vector) { | |||||
#if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
#define GISHIFTLEFTINT32(i) \ | |||||
GI_FORCEINLINE GI_INT32 GiShiftLeft##i##Int32(GI_INT32 Vector) { \ | |||||
return vshlq_n_s32(Vector, i); \ | |||||
} | |||||
return vshlq_n_s32(Vector, 23); | |||||
#elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
#define GISHIFTLEFTINT32(i) \ | |||||
GI_FORCEINLINE GI_INT32 GiShiftLeft##i##Int32(GI_INT32 Vector) { \ | |||||
return _mm_slli_epi32(Vector, i); \ | |||||
} | |||||
return _mm_slli_epi32(Vector, 23); | |||||
#else | #else | ||||
#define GISHIFTLEFTINT32(i) \ | |||||
GI_FORCEINLINE GI_INT32 GiShiftLeft##i##Int32(GI_INT32 Vector) { \ | |||||
return Vector << i; \ | |||||
} | |||||
return Vector << 23; | |||||
#endif | #endif | ||||
} | |||||
GISHIFTLEFTINT32(0) | |||||
GISHIFTLEFTINT32(1) | |||||
GISHIFTLEFTINT32(2) | |||||
GISHIFTLEFTINT32(3) | |||||
#undef GISHIFTLEFTINT32 | |||||
GI_FORCEINLINE | |||||
GI_INT32_t GiShiftRight23Int32(GI_INT32_t Vector) { | |||||
#if defined(GI_NEON_INTRINSICS) | |||||
return vshrq_n_s32(Vector, 23); | |||||
#elif defined(GI_SSE2_INTRINSICS) | |||||
return _mm_srai_epi32(Vector, 23); | |||||
#else | |||||
return Vector >> 23; | |||||
#endif | |||||
} | |||||
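The fixed shift amount of 23 matches the mantissa width of an IEEE-754 float, so these two helpers are presumably used to move values into and out of the float exponent field. A standalone sketch of that trick (the exp2_int name is illustrative, not part of the header):

#include <cassert>
#include <cstdint>
#include <cstring>

// build 2^n for small integer n by writing the biased exponent field
static float exp2_int(int32_t n) {
    int32_t bits = (n + 127) << 23;  // bias 127, zero mantissa
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}

int main() {
    assert(exp2_int(0) == 1.0f);
    assert(exp2_int(3) == 8.0f);
    assert(exp2_int(-1) == 0.5f);
    return 0;
}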
GI_FORCEINLINE | GI_FORCEINLINE | ||||
GI_INT32 | |||||
GiBlendInt32(GI_INT32 Vector1, GI_INT32 Vector2, GI_INT32 Selection) { | |||||
GI_INT32_t GiBlendInt32(GI_INT32_t Vector1, GI_INT32_t Vector2, GI_INT32_t Selection) { | |||||
return GiOrInt32(GiAndInt32(Vector2, Selection), GiAndNotInt32(Selection, Vector1)); | return GiOrInt32(GiAndInt32(Vector2, Selection), GiAndNotInt32(Selection, Vector1)); | ||||
} | } | ||||
GI_FORCEINLINE | GI_FORCEINLINE | ||||
GI_INT8 | |||||
GiBlendInt8(GI_INT8 Vector1, GI_INT8 Vector2, GI_INT8 Selection) { | |||||
GI_INT8_t GiBlendInt8(GI_INT8_t Vector1, GI_INT8_t Vector2, GI_INT8_t Selection) { | |||||
return GiOrInt8(GiAndInt8(Vector2, Selection), GiAndNotInt8(Selection, Vector1)); | return GiOrInt8(GiAndInt8(Vector2, Selection), GiAndNotInt8(Selection, Vector1)); | ||||
} | } | ||||
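Both blend helpers implement the usual bitwise select, (Vector2 & Selection) | (Vector1 & ~Selection), so Selection is expected to hold all-ones or all-zeros lanes (e.g. a comparison result). A scalar model of one lane, not part of the header:

#include <cassert>
#include <cstdint>

int main() {
    int8_t v1 = 11, v2 = 22;
    int8_t take_v2 = (int8_t)0xFF;  // all-ones mask lane selects Vector2
    int8_t take_v1 = 0x00;          // all-zeros mask lane keeps Vector1
    assert(((v2 & take_v2) | (v1 & ~take_v2)) == 22);
    assert(((v2 & take_v1) | (v1 & ~take_v1)) == 11);
    return 0;
}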
GI_FORCEINLINE | GI_FORCEINLINE | ||||
GI_INT32 | |||||
GiMaximumInt32(GI_INT32 Vector1, GI_INT32 Vector2) { | |||||
GI_INT32_t GiAbsInt32(GI_INT32_t Vector) { | |||||
#if defined(GI_NEON_INTRINSICS) | |||||
return vabsq_s32(Vector); | |||||
#elif defined(GI_SSE42_INTRINSICS) | |||||
return _mm_abs_epi32(Vector); | |||||
#else | |||||
GI_INT32_t ret; | |||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { | |||||
ret[i] = Vector[i] > 0 ? Vector[i] : -Vector[i]; | |||||
} | |||||
return ret; | |||||
#endif | |||||
} | |||||
GI_FORCEINLINE | |||||
GI_INT16_t GiAbsInt16(GI_INT16_t Vector) { | |||||
#if defined(GI_NEON_INTRINSICS) | |||||
return vabsq_s16(Vector); | |||||
#elif defined(GI_SSE42_INTRINSICS) | |||||
return _mm_abs_epi16(Vector); | |||||
#else | |||||
GI_INT16_t ret; | |||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int16_t); i++) { | |||||
ret[i] = Vector[i] > 0 ? Vector[i] : -Vector[i]; | |||||
} | |||||
return ret; | |||||
#endif | |||||
} | |||||
GI_FORCEINLINE | |||||
GI_INT8_t GiAbsInt8(GI_INT8_t Vector) { | |||||
#if defined(GI_NEON_INTRINSICS) | |||||
return vabsq_s8(Vector); | |||||
#elif defined(GI_SSE42_INTRINSICS) | |||||
return _mm_abs_epi8(Vector); | |||||
#else | |||||
GI_INT8_t ret; | |||||
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) { | |||||
ret[i] = Vector[i] > 0 ? Vector[i] : -Vector[i]; | |||||
} | |||||
return ret; | |||||
#endif | |||||
} | |||||
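One caveat shared by all three Abs variants above: the most negative lane wraps, since vabsq_*, _mm_abs_epi*, and the naive negation all leave e.g. int8 -128 unchanged (only the saturating vqabsq family would clamp it to 127). A scalar model:

#include <cassert>
#include <cstdint>

int main() {
    int8_t v = INT8_MIN;                  // -128
    int8_t a = (int8_t)(v > 0 ? v : -v);  // -(-128) wraps back to -128
    assert(a == INT8_MIN);
    return 0;
}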
GI_FORCEINLINE | |||||
GI_INT32_t GiMaximumInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) { | |||||
#if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
return vmaxq_s32(Vector1, Vector2); | return vmaxq_s32(Vector1, Vector2); | ||||
#elif defined(GI_SSE42_INTRINSICS) | #elif defined(GI_SSE42_INTRINSICS) | ||||
@@ -267,8 +509,7 @@ GiMaximumInt32(GI_INT32 Vector1, GI_INT32 Vector2) { | |||||
} | } | ||||
GI_FORCEINLINE | GI_FORCEINLINE | ||||
GI_INT32 | |||||
GiMinimumInt32(GI_INT32 Vector1, GI_INT32 Vector2) { | |||||
GI_INT32_t GiMinimumInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) { | |||||
#if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
return vminq_s32(Vector1, Vector2); | return vminq_s32(Vector1, Vector2); | ||||
#elif defined(GI_SSE42_INTRINSICS) | #elif defined(GI_SSE42_INTRINSICS) | ||||
@@ -281,14 +522,12 @@ GiMinimumInt32(GI_INT32 Vector1, GI_INT32 Vector2) { | |||||
} | } | ||||
GI_FORCEINLINE | GI_FORCEINLINE | ||||
GI_INT8 | |||||
GiBlendInt8x16(GI_INT8 Vector1, GI_INT8 Vector2, GI_INT8 Selection) { | |||||
GI_INT8_t GiBlendInt8x16(GI_INT8_t Vector1, GI_INT8_t Vector2, GI_INT8_t Selection) { | |||||
return GiOrInt8(GiAndInt8(Vector2, Selection), GiAndNotInt8(Selection, Vector1)); | return GiOrInt8(GiAndInt8(Vector2, Selection), GiAndNotInt8(Selection, Vector1)); | ||||
} | } | ||||
GI_FORCEINLINE | GI_FORCEINLINE | ||||
GI_INT8 | |||||
GiMaximumInt8(GI_INT8 Vector1, GI_INT8 Vector2) { | |||||
GI_INT8_t GiMaximumInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) { | |||||
#if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
return vmaxq_s8(Vector1, Vector2); | return vmaxq_s8(Vector1, Vector2); | ||||
#elif defined(GI_SSE42_INTRINSICS) | #elif defined(GI_SSE42_INTRINSICS) | ||||
@@ -301,8 +540,7 @@ GiMaximumInt8(GI_INT8 Vector1, GI_INT8 Vector2) { | |||||
} | } | ||||
GI_FORCEINLINE | GI_FORCEINLINE | ||||
GI_INT8 | |||||
GiMinimumInt8(GI_INT8 Vector1, GI_INT8 Vector2) { | |||||
GI_INT8_t GiMinimumInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) { | |||||
#if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
return vminq_s8(Vector1, Vector2); | return vminq_s8(Vector1, Vector2); | ||||
#elif defined(GI_SSE42_INTRINSICS) | #elif defined(GI_SSE42_INTRINSICS) | ||||
@@ -315,8 +553,7 @@ GiMinimumInt8(GI_INT8 Vector1, GI_INT8 Vector2) { | |||||
} | } | ||||
GI_FORCEINLINE | GI_FORCEINLINE | ||||
GI_INT16 | |||||
GiMoveHighLongInt8(GI_INT8 Vector) { | |||||
GI_INT16_t GiMoveHighLongInt8(GI_INT8_t Vector) { | |||||
#if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
return vmovl_s8(vget_high_s8(Vector)); | return vmovl_s8(vget_high_s8(Vector)); | ||||
#elif defined(GI_SSE42_INTRINSICS) | #elif defined(GI_SSE42_INTRINSICS) | ||||
@@ -330,7 +567,7 @@ GiMoveHighLongInt8(GI_INT8 Vector) { | |||||
} | } | ||||
return _mm_loadu_si128((__m128i*)data); | return _mm_loadu_si128((__m128i*)data); | ||||
#else | #else | ||||
GI_INT16 ret; | |||||
GI_INT16_t ret; | |||||
int8_t* data = (int8_t*)&Vector; | int8_t* data = (int8_t*)&Vector; | ||||
size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t); | size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t); | ||||
for (size_t i = 0; i < half_length; i++) { | for (size_t i = 0; i < half_length; i++) { | ||||
@@ -341,8 +578,7 @@ GiMoveHighLongInt8(GI_INT8 Vector) { | |||||
} | } | ||||
GI_FORCEINLINE | GI_FORCEINLINE | ||||
GI_INT16 | |||||
GiMoveLowLongInt8(GI_INT8 Vector) { | |||||
GI_INT16_t GiMoveLowLongInt8(GI_INT8_t Vector) { | |||||
#if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
return vmovl_s8(vget_low_s8(Vector)); | return vmovl_s8(vget_low_s8(Vector)); | ||||
#elif defined(GI_SSE42_INTRINSICS) | #elif defined(GI_SSE42_INTRINSICS) | ||||
@@ -356,7 +592,7 @@ GiMoveLowLongInt8(GI_INT8 Vector) { | |||||
} | } | ||||
return _mm_loadu_si128((__m128i*)data); | return _mm_loadu_si128((__m128i*)data); | ||||
#else | #else | ||||
GI_INT16 ret; | |||||
GI_INT16_t ret; | |||||
size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t); | size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t); | ||||
for (size_t i = 0; i < half_length; i++) { | for (size_t i = 0; i < half_length; i++) { | ||||
ret[i] = Vector[i]; | ret[i] = Vector[i]; | ||||
@@ -366,8 +602,7 @@ GiMoveLowLongInt8(GI_INT8 Vector) { | |||||
} | } | ||||
GI_FORCEINLINE | GI_FORCEINLINE | ||||
GI_INT32 | |||||
GiMoveHighLongInt16(GI_INT16 Vector) { | |||||
GI_INT32_t GiMoveHighLongInt16(GI_INT16_t Vector) { | |||||
#if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
return vmovl_s16(vget_high_s16(Vector)); | return vmovl_s16(vget_high_s16(Vector)); | ||||
#elif defined(GI_SSE42_INTRINSICS) | #elif defined(GI_SSE42_INTRINSICS) | ||||
@@ -381,7 +616,7 @@ GiMoveHighLongInt16(GI_INT16 Vector) { | |||||
} | } | ||||
return _mm_loadu_si128((__m128i*)data); | return _mm_loadu_si128((__m128i*)data); | ||||
#else | #else | ||||
GI_INT32 ret; | |||||
GI_INT32_t ret; | |||||
size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int16_t); | size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int16_t); | ||||
for (size_t i = 0; i < half_length; i++) { | for (size_t i = 0; i < half_length; i++) { | ||||
ret[i] = Vector[half_length + i]; | ret[i] = Vector[half_length + i]; | ||||
@@ -391,8 +626,7 @@ GiMoveHighLongInt16(GI_INT16 Vector) { | |||||
} | } | ||||
GI_FORCEINLINE | GI_FORCEINLINE | ||||
GI_INT32 | |||||
GiMoveLowLongInt16(GI_INT16 Vector) { | |||||
GI_INT32_t GiMoveLowLongInt16(GI_INT16_t Vector) { | |||||
#if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
return vmovl_s16(vget_low_s16(Vector)); | return vmovl_s16(vget_low_s16(Vector)); | ||||
#elif defined(GI_SSE42_INTRINSICS) | #elif defined(GI_SSE42_INTRINSICS) | ||||
@@ -406,7 +640,7 @@ GiMoveLowLongInt16(GI_INT16 Vector) { | |||||
} | } | ||||
return _mm_loadu_si128((__m128i*)data); | return _mm_loadu_si128((__m128i*)data); | ||||
#else | #else | ||||
GI_INT32 ret; | |||||
GI_INT32_t ret; | |||||
size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int16_t); | size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int16_t); | ||||
for (size_t i = 0; i < half_length; i++) { | for (size_t i = 0; i < half_length; i++) { | ||||
ret[i] = Vector[i]; | ret[i] = Vector[i]; | ||||
@@ -416,7 +650,7 @@ GiMoveLowLongInt16(GI_INT16 Vector) { | |||||
} | } | ||||
GI_FORCEINLINE | GI_FORCEINLINE | ||||
int32_t GiReduceAddInt8(GI_INT8 Vector) { | |||||
int32_t GiReduceAddInt8(GI_INT8_t Vector) { | |||||
#if defined(GI_NEON64_INTRINSICS) | #if defined(GI_NEON64_INTRINSICS) | ||||
return vaddlvq_s8(Vector); | return vaddlvq_s8(Vector); | ||||
#elif defined(GI_NEON32_INTRINSICS) | #elif defined(GI_NEON32_INTRINSICS) | ||||
@@ -461,7 +695,7 @@ int32_t GiReduceAddInt8(GI_INT8 Vector) { | |||||
} | } | ||||
GI_FORCEINLINE | GI_FORCEINLINE | ||||
int8_t GiReduceMaxInt8(GI_INT8 Vector) { | |||||
int8_t GiReduceMaxInt8(GI_INT8_t Vector) { | |||||
#if defined(GI_NEON64_INTRINSICS) | #if defined(GI_NEON64_INTRINSICS) | ||||
return vmaxvq_s8(Vector); | return vmaxvq_s8(Vector); | ||||
#elif defined(GI_NEON32_INTRINSICS) | #elif defined(GI_NEON32_INTRINSICS) | ||||
@@ -509,7 +743,7 @@ int8_t GiReduceMaxInt8(GI_INT8 Vector) { | |||||
} | } | ||||
GI_FORCEINLINE | GI_FORCEINLINE | ||||
int8_t GiReduceMinInt8(GI_INT8 Vector) { | |||||
int8_t GiReduceMinInt8(GI_INT8_t Vector) { | |||||
#if defined(GI_NEON64_INTRINSICS) | #if defined(GI_NEON64_INTRINSICS) | ||||
return vminvq_s8(Vector); | return vminvq_s8(Vector); | ||||
#elif defined(GI_NEON32_INTRINSICS) | #elif defined(GI_NEON32_INTRINSICS) | ||||
@@ -562,8 +796,7 @@ int8_t GiReduceMinInt8(GI_INT8 Vector) { | |||||
//! convert to the narrower type with the lower bytes holding the real data; | //! convert to the narrower type with the lower bytes holding the real data; | ||||
//! the upper bytes repeat the lower bytes | //! the upper bytes repeat the lower bytes | ||||
GI_FORCEINLINE | GI_FORCEINLINE | ||||
GI_INT8 | |||||
GiCvtFromFloat32ToInt8(GI_FLOAT32 src) { | |||||
GI_INT8_t GiCvtFromFloat32ToInt8(GI_FLOAT32_t src) { | |||||
#if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
#if __ARM_ARCH >= 8 | #if __ARM_ARCH >= 8 | ||||
int32x4_t vres0 = vcvtaq_s32_f32(src); | int32x4_t vres0 = vcvtaq_s32_f32(src); | ||||
@@ -595,7 +828,7 @@ GiCvtFromFloat32ToInt8(GI_FLOAT32 src) { | |||||
__m128i vepi8 = _mm_packs_epi16(vepi16, vepi16); | __m128i vepi8 = _mm_packs_epi16(vepi16, vepi16); | ||||
return vepi8; | return vepi8; | ||||
#else | #else | ||||
GI_INT8 ret; | |||||
GI_INT8_t ret; | |||||
int length = GI_SIMD_LEN_BYTE / sizeof(float); | int length = GI_SIMD_LEN_BYTE / sizeof(float); | ||||
for (int i = 0; i < length; i++) { | for (int i = 0; i < length; i++) { | ||||
int8_t data = Saturate(round(src[i]), -128, 127); | int8_t data = Saturate(round(src[i]), -128, 127); | ||||
@@ -609,8 +842,7 @@ GiCvtFromFloat32ToInt8(GI_FLOAT32 src) { | |||||
} | } | ||||
GI_FORCEINLINE | GI_FORCEINLINE | ||||
GI_INT8 | |||||
GiCvtFromFloat32V2ToInt8(GI_FLOAT32_V2 vsrc) { | |||||
GI_INT8_t GiCvtFromFloat32V2ToInt8(GI_FLOAT32_V2_t vsrc) { | |||||
#if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
#if __ARM_ARCH >= 8 | #if __ARM_ARCH >= 8 | ||||
int32x4_t vres0 = vcvtaq_s32_f32(vsrc.val[0]); | int32x4_t vres0 = vcvtaq_s32_f32(vsrc.val[0]); | ||||
@@ -653,7 +885,7 @@ GiCvtFromFloat32V2ToInt8(GI_FLOAT32_V2 vsrc) { | |||||
__m128i vepi8 = _mm_packs_epi16(vepi16_0, vepi16_0); | __m128i vepi8 = _mm_packs_epi16(vepi16_0, vepi16_0); | ||||
return vepi8; | return vepi8; | ||||
#else | #else | ||||
GI_INT8 ret; | |||||
GI_INT8_t ret; | |||||
int length = GI_SIMD_LEN_BYTE / sizeof(float); | int length = GI_SIMD_LEN_BYTE / sizeof(float); | ||||
for (int i = 0; i < 2 * length; i++) { | for (int i = 0; i < 2 * length; i++) { | ||||
ret[i] = Saturate(round(vsrc.val[i / length][i % length]), -128, 127); | ret[i] = Saturate(round(vsrc.val[i / length][i % length]), -128, 127); | ||||
@@ -663,8 +895,7 @@ GiCvtFromFloat32V2ToInt8(GI_FLOAT32_V2 vsrc) { | |||||
} | } | ||||
GI_FORCEINLINE | GI_FORCEINLINE | ||||
GI_INT8 | |||||
GiCvtFromFloat32V4ToInt8(GI_FLOAT32_V4 vsrc) { | |||||
GI_INT8_t GiCvtFromFloat32V4ToInt8(GI_FLOAT32_V4_t vsrc) { | |||||
#if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
#if __ARM_ARCH >= 8 | #if __ARM_ARCH >= 8 | ||||
int32x4_t vres0 = vcvtaq_s32_f32(vsrc.val[0]); | int32x4_t vres0 = vcvtaq_s32_f32(vsrc.val[0]); | ||||
@@ -726,7 +957,7 @@ GiCvtFromFloat32V4ToInt8(GI_FLOAT32_V4 vsrc) { | |||||
__m128i vepi8 = _mm_packs_epi16(vepi16_0, vepi16_1); | __m128i vepi8 = _mm_packs_epi16(vepi16_0, vepi16_1); | ||||
return vepi8; | return vepi8; | ||||
#else | #else | ||||
GI_INT8 ret; | |||||
GI_INT8_t ret; | |||||
int length = GI_SIMD_LEN_BYTE / sizeof(float); | int length = GI_SIMD_LEN_BYTE / sizeof(float); | ||||
for (int i = 0; i < 4 * length; i++) { | for (int i = 0; i < 4 * length; i++) { | ||||
ret[i] = Saturate(round(vsrc.val[i / length][i % length]), -128, 127); | ret[i] = Saturate(round(vsrc.val[i / length][i % length]), -128, 127); | ||||
@@ -46,25 +46,25 @@ struct MeanReducer<dt_qint8, int8_t, int32_t, false> { | |||||
using ctype = int8_t; | using ctype = int8_t; | ||||
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); | static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); | ||||
GI_INT32 res[4]; | |||||
GI_INT32_t res[4]; | |||||
int32_t remain; | int32_t remain; | ||||
int32_t cnt; | int32_t cnt; | ||||
float coef; | float coef; | ||||
GI_FLOAT32 vcoef; | |||||
GI_FLOAT32_t vcoef; | |||||
MeanReducer(DType, size_t cnt) : remain(0), cnt(cnt), coef(1.0 / cnt) { | MeanReducer(DType, size_t cnt) : remain(0), cnt(cnt), coef(1.0 / cnt) { | ||||
memset(res, 0, sizeof(res)); | memset(res, 0, sizeof(res)); | ||||
vcoef = GiBroadcastFloat32(coef); | vcoef = GiBroadcastFloat32(coef); | ||||
} | } | ||||
MeanReducer() = default; | MeanReducer() = default; | ||||
void feed(const int8_t* val) { | void feed(const int8_t* val) { | ||||
const GI_INT8 vval = GiLoadInt8(val); | |||||
const GI_INT16 vval_low = GiMoveLowLongInt8(vval); | |||||
const GI_INT16 vval_high = GiMoveHighLongInt8(vval); | |||||
const GI_INT8_t vval = GiLoadInt8(val); | |||||
const GI_INT16_t vval_low = GiMoveLowLongInt8(vval); | |||||
const GI_INT16_t vval_high = GiMoveHighLongInt8(vval); | |||||
const GI_INT32 vval_low_low = GiMoveLowLongInt16(vval_low); | |||||
const GI_INT32 vval_low_high = GiMoveHighLongInt16(vval_low); | |||||
const GI_INT32 vval_high_low = GiMoveLowLongInt16(vval_high); | |||||
const GI_INT32 vval_high_high = GiMoveHighLongInt16(vval_high); | |||||
const GI_INT32_t vval_low_low = GiMoveLowLongInt16(vval_low); | |||||
const GI_INT32_t vval_low_high = GiMoveHighLongInt16(vval_low); | |||||
const GI_INT32_t vval_high_low = GiMoveLowLongInt16(vval_high); | |||||
const GI_INT32_t vval_high_high = GiMoveHighLongInt16(vval_high); | |||||
res[0] = GiAddInt32(res[0], vval_low_low); | res[0] = GiAddInt32(res[0], vval_low_low); | ||||
res[1] = GiAddInt32(res[1], vval_low_high); | res[1] = GiAddInt32(res[1], vval_low_high); | ||||
@@ -74,11 +74,11 @@ struct MeanReducer<dt_qint8, int8_t, int32_t, false> { | |||||
void feed_remain(const int8_t* val) { remain += *val; } | void feed_remain(const int8_t* val) { remain += *val; } | ||||
void post(int8_t* dst) { | void post(int8_t* dst) { | ||||
for (int i = 0; i < 4; i += 2) { | for (int i = 0; i < 4; i += 2) { | ||||
GI_FLOAT32 vitem0 = GiMultiplyFloat32(GiCastToFloat32(res[i]), vcoef); | |||||
GI_FLOAT32 vitem1 = GiMultiplyFloat32(GiCastToFloat32(res[i + 1]), vcoef); | |||||
GI_FLOAT32_t vitem0 = GiMultiplyFloat32(GiCastToFloat32(res[i]), vcoef); | |||||
GI_FLOAT32_t vitem1 = GiMultiplyFloat32(GiCastToFloat32(res[i + 1]), vcoef); | |||||
GiStoreLowInt8( | GiStoreLowInt8( | ||||
dst, | |||||
(QConverter::convert<GI_INT8, GI_FLOAT32_V2>({{vitem0, vitem1}}))); | |||||
dst, (QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>( | |||||
{{vitem0, vitem1}}))); | |||||
dst += 8; | dst += 8; | ||||
} | } | ||||
} | } | ||||
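Per element, this reducer widens int8 lanes into four int32 accumulators and only converts once at the end. A standalone scalar model of the same mean computation, assuming QConverter::convert rounds to nearest (ties away from zero) and saturates like the GiCvtFromFloat32ToInt8 fallback above:

#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdint>

static int8_t mean_int8(const int8_t* val, size_t cnt) {
    int32_t acc = 0;  // widened accumulator, like res[0..3] in feed()
    for (size_t i = 0; i < cnt; i++)
        acc += val[i];
    float scaled = (float)acc * (1.0f / (float)cnt);  // post(): acc * coef
    long r = std::lroundf(scaled);  // round to nearest, ties away from zero
    if (r < -128) r = -128;
    if (r > 127) r = 127;
    return (int8_t)r;
}

int main() {
    int8_t data[4] = {1, 2, 3, 4};
    assert(mean_int8(data, 4) == 3);  // 10 / 4 = 2.5 rounds to 3
    return 0;
}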
@@ -93,7 +93,7 @@ struct MeanReducer<dt_float32, float, float, true> { | |||||
using ctype = float; | using ctype = float; | ||||
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); | static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); | ||||
GI_FLOAT32 res; | |||||
GI_FLOAT32_t res; | |||||
float result; | float result; | ||||
float coef; | float coef; | ||||
MeanReducer(DType, size_t cnt) : result(0.0f), coef(1.0 / cnt) { | MeanReducer(DType, size_t cnt) : result(0.0f), coef(1.0 / cnt) { | ||||
@@ -113,7 +113,7 @@ struct MeanReducer<dt_float32, float, float, false> { | |||||
using ctype = float; | using ctype = float; | ||||
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); | static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); | ||||
GI_FLOAT32 res; | |||||
GI_FLOAT32_t res; | |||||
float remain; | float remain; | ||||
float coef; | float coef; | ||||
MeanReducer(DType, size_t cnt) : remain(0.0f), coef(1.0 / cnt) { | MeanReducer(DType, size_t cnt) : remain(0.0f), coef(1.0 / cnt) { | ||||
@@ -140,30 +140,33 @@ struct minReducer; | |||||
struct _mode##Reducer<dt_float32, float, float, true> { \ | struct _mode##Reducer<dt_float32, float, float, true> { \ | ||||
using ctype = float; \ | using ctype = float; \ | ||||
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ | static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ | ||||
GI_FLOAT32 res; \ | |||||
GI_FLOAT32_t res; \ | |||||
_mode##Reducer(DType, size_t) { res = GiBroadcastFloat32(_init); } \ | _mode##Reducer(DType, size_t) { res = GiBroadcastFloat32(_init); } \ | ||||
_mode##Reducer() = default; \ | _mode##Reducer() = default; \ | ||||
void feed(const float* val) { \ | void feed(const float* val) { \ | ||||
auto vval = GiLoadFloat32(val); \ | auto vval = GiLoadFloat32(val); \ | ||||
res = Gi##_Mode##imumFloat32(res, vval); \ | |||||
res = Gi##_Mode##NanFloat32(res, vval); \ | |||||
} \ | } \ | ||||
void feed_remain(const float* val) { \ | void feed_remain(const float* val) { \ | ||||
auto vval = GiBroadcastFloat32(*val); \ | auto vval = GiBroadcastFloat32(*val); \ | ||||
res = Gi##_Mode##imumFloat32(vval, res); \ | |||||
res = Gi##_Mode##NanFloat32(vval, res); \ | |||||
} \ | } \ | ||||
void post(float* dst) { *dst = GiReduce##_Mode##imumFloat32(res); } \ | |||||
void post(float* dst) { *dst = GiReduce##_Mode##NanFloat32(res); } \ | |||||
} | } | ||||
REDUCER_MAX_MIN_C1(max, Max, std::numeric_limits<dt_float32>::lowest()); | REDUCER_MAX_MIN_C1(max, Max, std::numeric_limits<dt_float32>::lowest()); | ||||
REDUCER_MAX_MIN_C1(min, Min, std::numeric_limits<dt_float32>::max()); | REDUCER_MAX_MIN_C1(min, Min, std::numeric_limits<dt_float32>::max()); | ||||
#undef REDUCER_MAX_MIN_C1 | #undef REDUCER_MAX_MIN_C1 | ||||
#define Max_NAN(a, b) ((isnan(a) || (a) > (b)) ? (a) : (b)) | |||||
#define Min_NAN(a, b) ((isnan(a) || (a) < (b)) ? (a) : (b)) | |||||
#define REDUCER_MAX_MIN_C(_mode, _Mode, _init) \ | #define REDUCER_MAX_MIN_C(_mode, _Mode, _init) \ | ||||
template <> \ | template <> \ | ||||
struct _mode##Reducer<dt_float32, float, float, false> { \ | struct _mode##Reducer<dt_float32, float, float, false> { \ | ||||
using ctype = float; \ | using ctype = float; \ | ||||
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ | static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ | ||||
GI_FLOAT32 res; \ | |||||
GI_FLOAT32_t res; \ | |||||
float remain; \ | float remain; \ | ||||
_mode##Reducer(DType, size_t) { \ | _mode##Reducer(DType, size_t) { \ | ||||
res = GiBroadcastFloat32(_init); \ | res = GiBroadcastFloat32(_init); \ | ||||
@@ -171,12 +174,12 @@ REDUCER_MAX_MIN_C1(min, Min, std::numeric_limits<dt_float32>::max()); | |||||
} \ | } \ | ||||
_mode##Reducer() = default; \ | _mode##Reducer() = default; \ | ||||
void feed(const float* val) { \ | void feed(const float* val) { \ | ||||
GI_FLOAT32 vval = GiLoadFloat32(val); \ | |||||
res = Gi##_Mode##imumFloat32(res, vval); \ | |||||
GI_FLOAT32_t vval = GiLoadFloat32(val); \ | |||||
res = Gi##_Mode##NanFloat32(res, vval); \ | |||||
} \ | } \ | ||||
void feed_remain(const float* val) { \ | void feed_remain(const float* val) { \ | ||||
using namespace std; \ | using namespace std; \ | ||||
remain = _mode(*val, remain); \ | |||||
remain = _Mode##_NAN(*val, remain); \ | |||||
} \ | } \ | ||||
void post(float* dst) { GiStoreFloat32(dst, res); } \ | void post(float* dst) { GiStoreFloat32(dst, res); } \ | ||||
void post_remain(float* dst) { *dst = remain; } \ | void post_remain(float* dst) { *dst = remain; } \ | ||||
@@ -185,21 +188,23 @@ REDUCER_MAX_MIN_C1(min, Min, std::numeric_limits<dt_float32>::max()); | |||||
REDUCER_MAX_MIN_C(max, Max, std::numeric_limits<dt_float32>::lowest()); | REDUCER_MAX_MIN_C(max, Max, std::numeric_limits<dt_float32>::lowest()); | ||||
REDUCER_MAX_MIN_C(min, Min, std::numeric_limits<dt_float32>::max()); | REDUCER_MAX_MIN_C(min, Min, std::numeric_limits<dt_float32>::max()); | ||||
#undef REDUCER_MAX_MIN_C | #undef REDUCER_MAX_MIN_C | ||||
#undef Max_NAN | |||||
#undef Min_NAN | |||||
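The point of the two *_NAN macros (and of switching the vector path from Gi{Max,Min}imumFloat32 to Gi{Max,Min}NanFloat32) is apparently NaN propagation: a NaN on either side wins the selection, where a plain max/min would silently drop it. A scalar model of Max_NAN:

#include <cassert>
#include <cmath>

static float max_nan(float a, float b) {
    return (std::isnan(a) || a > b) ? a : b;  // same shape as Max_NAN
}

int main() {
    assert(max_nan(1.0f, 2.0f) == 2.0f);
    assert(std::isnan(max_nan(NAN, 2.0f)));  // NaN on the left is returned
    assert(std::isnan(max_nan(1.0f, NAN)));  // a > NaN is false, NaN returned
    return 0;
}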
#define REDUCER_MAX_MIN_C1(_mode, _Mode, _init) \ | #define REDUCER_MAX_MIN_C1(_mode, _Mode, _init) \ | ||||
template <> \ | template <> \ | ||||
struct _mode##Reducer<dt_qint8, int8_t, int8_t, true> { \ | struct _mode##Reducer<dt_qint8, int8_t, int8_t, true> { \ | ||||
using ctype = int8_t; \ | using ctype = int8_t; \ | ||||
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); \ | static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); \ | ||||
GI_INT8 res; \ | |||||
GI_INT8_t res; \ | |||||
_mode##Reducer(DType, size_t) { res = GiBroadcastInt8(_init); } \ | _mode##Reducer(DType, size_t) { res = GiBroadcastInt8(_init); } \ | ||||
_mode##Reducer() = default; \ | _mode##Reducer() = default; \ | ||||
void feed(const int8_t* val) { \ | void feed(const int8_t* val) { \ | ||||
GI_INT8 vval = GiLoadInt8(val); \ | |||||
GI_INT8_t vval = GiLoadInt8(val); \ | |||||
res = Gi##_Mode##imumInt8(vval, res); \ | res = Gi##_Mode##imumInt8(vval, res); \ | ||||
} \ | } \ | ||||
void feed_remain(const int8_t* val) { \ | void feed_remain(const int8_t* val) { \ | ||||
GI_INT8 vval = GiBroadcastInt8(*val); \ | |||||
GI_INT8_t vval = GiBroadcastInt8(*val); \ | |||||
res = Gi##_Mode##imumInt8(res, vval); \ | res = Gi##_Mode##imumInt8(res, vval); \ | ||||
} \ | } \ | ||||
void post(int8_t* dst) { *dst = GiReduce##_Mode##Int8(res); } \ | void post(int8_t* dst) { *dst = GiReduce##_Mode##Int8(res); } \ | ||||
@@ -214,7 +219,7 @@ REDUCER_MAX_MIN_C1(min, Min, 127); | |||||
struct _mode##Reducer<dt_qint8, int8_t, int8_t, false> { \ | struct _mode##Reducer<dt_qint8, int8_t, int8_t, false> { \ | ||||
using ctype = int8_t; \ | using ctype = int8_t; \ | ||||
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); \ | static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); \ | ||||
GI_INT8 res; \ | |||||
GI_INT8_t res; \ | |||||
int8_t remain; \ | int8_t remain; \ | ||||
_mode##Reducer(DType, size_t) { \ | _mode##Reducer(DType, size_t) { \ | ||||
res = GiBroadcastInt8(_init); \ | res = GiBroadcastInt8(_init); \ | ||||
@@ -222,7 +227,7 @@ REDUCER_MAX_MIN_C1(min, Min, 127); | |||||
} \ | } \ | ||||
_mode##Reducer() = default; \ | _mode##Reducer() = default; \ | ||||
void feed(const int8_t* val) { \ | void feed(const int8_t* val) { \ | ||||
GI_INT8 vval = GiLoadInt8(val); \ | |||||
GI_INT8_t vval = GiLoadInt8(val); \ | |||||
res = Gi##_Mode##imumInt8(res, vval); \ | res = Gi##_Mode##imumInt8(res, vval); \ | ||||
} \ | } \ | ||||
void feed_remain(const int8_t* val) { \ | void feed_remain(const int8_t* val) { \ | ||||
@@ -248,7 +253,7 @@ struct ProductReducer; | |||||
struct _mode##Reducer<dt_float32, float, float, true> { \ | struct _mode##Reducer<dt_float32, float, float, true> { \ | ||||
using ctype = float; \ | using ctype = float; \ | ||||
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ | static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ | ||||
GI_FLOAT32 res; \ | |||||
GI_FLOAT32_t res; \ | |||||
float remain; \ | float remain; \ | ||||
_mode##Reducer(DType, size_t) { \ | _mode##Reducer(DType, size_t) { \ | ||||
res = GiBroadcastFloat32(_init); \ | res = GiBroadcastFloat32(_init); \ | ||||
@@ -256,7 +261,7 @@ struct ProductReducer; | |||||
} \ | } \ | ||||
_mode##Reducer() = default; \ | _mode##Reducer() = default; \ | ||||
void feed(const float* val) { \ | void feed(const float* val) { \ | ||||
GI_FLOAT32 vval = GiLoadFloat32(val); \ | |||||
GI_FLOAT32_t vval = GiLoadFloat32(val); \ | |||||
res = Gi##_Mode##Float32(vval, res); \ | res = Gi##_Mode##Float32(vval, res); \ | ||||
} \ | } \ | ||||
void feed_remain(const float* val) { \ | void feed_remain(const float* val) { \ | ||||
@@ -280,7 +285,7 @@ REDUCER_SUM_PRODUCT_C1(Product, Multiply, multiplies, 1.0f); | |||||
struct _mode##Reducer<dt_float32, float, float, false> { \ | struct _mode##Reducer<dt_float32, float, float, false> { \ | ||||
using ctype = float; \ | using ctype = float; \ | ||||
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ | static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ | ||||
GI_FLOAT32 res; \ | |||||
GI_FLOAT32_t res; \ | |||||
float remain; \ | float remain; \ | ||||
_mode##Reducer(DType, size_t) { \ | _mode##Reducer(DType, size_t) { \ | ||||
res = GiBroadcastFloat32(_init); \ | res = GiBroadcastFloat32(_init); \ | ||||
@@ -288,7 +293,7 @@ REDUCER_SUM_PRODUCT_C1(Product, Multiply, multiplies, 1.0f); | |||||
} \ | } \ | ||||
_mode##Reducer() = default; \ | _mode##Reducer() = default; \ | ||||
void feed(const float* val) { \ | void feed(const float* val) { \ | ||||
GI_FLOAT32 vval = GiLoadFloat32(val); \ | |||||
GI_FLOAT32_t vval = GiLoadFloat32(val); \ | |||||
res = Gi##_Mode##Float32(vval, res); \ | res = Gi##_Mode##Float32(vval, res); \ | ||||
} \ | } \ | ||||
void feed_remain(const float* val) { \ | void feed_remain(const float* val) { \ | ||||
@@ -313,7 +318,7 @@ struct SumSqrReducer<dt_float32, float, float, true> { | |||||
using ctype = float; | using ctype = float; | ||||
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); | static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); | ||||
GI_FLOAT32 res; | |||||
GI_FLOAT32_t res; | |||||
float result; | float result; | ||||
SumSqrReducer(DType, size_t cnt) : result(0.0f) { | SumSqrReducer(DType, size_t cnt) : result(0.0f) { | ||||
MEGDNN_MARK_USED_VAR(cnt); | MEGDNN_MARK_USED_VAR(cnt); | ||||
@@ -321,7 +326,7 @@ struct SumSqrReducer<dt_float32, float, float, true> { | |||||
} | } | ||||
SumSqrReducer() = default; | SumSqrReducer() = default; | ||||
void feed(const float* val) { | void feed(const float* val) { | ||||
GI_FLOAT32 vval = GiLoadFloat32(val); | |||||
GI_FLOAT32_t vval = GiLoadFloat32(val); | |||||
res = GiAddFloat32(GiMultiplyFloat32(vval, vval), res); | res = GiAddFloat32(GiMultiplyFloat32(vval, vval), res); | ||||
} | } | ||||
void feed_remain(const float* val) { | void feed_remain(const float* val) { | ||||
@@ -338,7 +343,7 @@ struct SumSqrReducer<dt_float32, float, float, false> { | |||||
using ctype = float; | using ctype = float; | ||||
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); | static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); | ||||
GI_FLOAT32 res; | |||||
GI_FLOAT32_t res; | |||||
float remain; | float remain; | ||||
SumSqrReducer(DType, size_t cnt) : remain(0.0f) { | SumSqrReducer(DType, size_t cnt) : remain(0.0f) { | ||||
MEGDNN_MARK_USED_VAR(cnt); | MEGDNN_MARK_USED_VAR(cnt); | ||||
@@ -346,7 +351,7 @@ struct SumSqrReducer<dt_float32, float, float, false> { | |||||
} | } | ||||
SumSqrReducer() = default; | SumSqrReducer() = default; | ||||
void feed(const float* val) { | void feed(const float* val) { | ||||
GI_FLOAT32 vval = GiLoadFloat32(val); | |||||
GI_FLOAT32_t vval = GiLoadFloat32(val); | |||||
res = GiAddFloat32(GiMultiplyFloat32(vval, vval), res); | res = GiAddFloat32(GiMultiplyFloat32(vval, vval), res); | ||||
} | } | ||||
void feed_remain(const float* val) { remain += (*val) * (*val); } | void feed_remain(const float* val) { remain += (*val) * (*val); } | ||||