GitOrigin-RevId: 37409bae9a
release-1.10
@@ -82,29 +82,33 @@
#if defined(GI_AVX_INTRINSICS) || defined(GI_AVX2_INTRINSICS) || \
        defined(GI_FMA_INTRINSICS)
typedef __m256 GI_FLOAT32;
typedef __m256i GI_UINT8;
typedef __m256i GI_INT8;
typedef __m256i GI_INT16;
typedef __m256i GI_INT32;
typedef __m256 GI_FLOAT32_t;
typedef __m256i GI_UINT8_t;
typedef __m256i GI_INT8_t;
typedef __m256i GI_INT16_t;
typedef __m256i GI_INT32_t;
typedef __m256i GI_UINT32_t;
#elif defined(GI_NEON_INTRINSICS)
typedef float32x4_t GI_FLOAT32;
typedef uint8x16_t GI_UINT8;
typedef int8x16_t GI_INT8;
typedef int16x8_t GI_INT16;
typedef int32x4_t GI_INT32;
typedef float32x4_t GI_FLOAT32_t;
typedef uint8x16_t GI_UINT8_t;
typedef int8x16_t GI_INT8_t;
typedef int16x8_t GI_INT16_t;
typedef int32x4_t GI_INT32_t;
typedef uint32x4_t GI_UINT32_t;
#elif defined(GI_SSE2_INTRINSICS) || defined(GI_SSE42_INTRINSICS)
typedef __m128 GI_FLOAT32;
typedef __m128i GI_UINT8;
typedef __m128i GI_INT8;
typedef __m128i GI_INT16;
typedef __m128i GI_INT32;
typedef __m128 GI_FLOAT32_t;
typedef __m128i GI_UINT8_t;
typedef __m128i GI_INT8_t;
typedef __m128i GI_INT16_t;
typedef __m128i GI_INT32_t;
typedef __m128i GI_UINT32_t;
#else
typedef float GI_FLOAT32 __attribute__((vector_size(16)));
typedef uint8_t GI_UINT8 __attribute__((vector_size(16)));
typedef int8_t GI_INT8 __attribute__((vector_size(16)));
typedef int16_t GI_INT16 __attribute__((vector_size(16)));
typedef int32_t GI_INT32 __attribute__((vector_size(16)));
typedef float GI_FLOAT32_t __attribute__((vector_size(16)));
typedef uint8_t GI_UINT8_t __attribute__((vector_size(16)));
typedef int8_t GI_INT8_t __attribute__((vector_size(16)));
typedef int16_t GI_INT16_t __attribute__((vector_size(16)));
typedef int32_t GI_INT32_t __attribute__((vector_size(16)));
typedef uint32_t GI_UINT32_t __attribute__((vector_size(16)));
#endif
//! the general intrinsics support dynamic-length SIMD; if AVX or AVX2, the SIMD
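A minimal usage sketch (hypothetical helper, assuming a 128-bit configuration where GI_SIMD_LEN_BYTE is 16): a kernel written only against the GI types and the load/store/add helpers defined in gi_float.h below compiles unchanged for NEON, SSE2, and the plain-C fallback.

GI_FORCEINLINE void add_arrays(float* dst, const float* a, const float* b, size_t n) {
    const size_t step = GI_SIMD_LEN_BYTE / sizeof(float);
    size_t i = 0;
    for (; i + step <= n; i += step) {
        GI_FLOAT32_t va = GiLoadFloat32(a + i); /* unaligned vector load */
        GI_FLOAT32_t vb = GiLoadFloat32(b + i);
        GiStoreFloat32(dst + i, GiAddFloat32(va, vb));
    }
    for (; i < n; i++) { /* scalar tail */
        dst[i] = a[i] + b[i];
    }
}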
@@ -129,24 +133,31 @@ typedef int32_t GI_INT32 __attribute__((vector_size(16)));
#define Min(a, b) (a) < (b) ? (a) : (b)
typedef struct {
    GI_INT32 val[2];
} GI_INT32_V2;
    GI_INT32_t val[2];
} GI_INT32_V2_t;
typedef struct {
    GI_INT32 val[4];
} GI_INT32_V4;
    GI_INT32_t val[4];
} GI_INT32_V4_t;
typedef struct {
    GI_FLOAT32 val[2];
} GI_FLOAT32_V2;
    GI_FLOAT32_t val[2];
} GI_FLOAT32_V2_t;
typedef struct {
    GI_FLOAT32 val[4];
} GI_FLOAT32_V4;
    GI_FLOAT32_t val[4];
} GI_FLOAT32_V4_t;
typedef struct {
    GI_INT16_t val[2];
} GI_INT16_V2_t;
typedef struct {
    GI_INT8_t val[2];
} GI_INT8_V2_t;
GI_FORCEINLINE
GI_INT32
GiAndInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
GI_INT32_t GiAndInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vandq_s32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
@@ -157,8 +168,7 @@ GiAndInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
}
GI_FORCEINLINE
GI_INT32
GiOrInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
GI_INT32_t GiOrInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vorrq_s32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
@@ -169,8 +179,7 @@ GiOrInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
}
GI_FORCEINLINE
GI_INT32
GiAndNotInt32(GI_INT32 VectorNot, GI_INT32 Vector) {
GI_INT32_t GiAndNotInt32(GI_INT32_t VectorNot, GI_INT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vandq_s32(vmvnq_s32(VectorNot), Vector);
#elif defined(GI_SSE2_INTRINSICS)
@@ -181,8 +190,7 @@ GiAndNotInt32(GI_INT32 VectorNot, GI_INT32 Vector) {
}
GI_FORCEINLINE
GI_INT32
GiXorInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
GI_INT32_t GiXorInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return veorq_s32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
@@ -14,20 +14,51 @@
#include "gi_common.h"
GI_FORCEINLINE
GI_INT32
GiReinterpretAsInt32(GI_FLOAT32 In) {
GI_INT32_t GiReinterpretAsInt32(GI_FLOAT32_t In) {
#if defined(GI_NEON_INTRINSICS)
    return vreinterpretq_s32_f32(In);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_castps_si128(In);
#else
    return GI_INT32(In);
    return *(GI_INT32_t*)(&In);
#endif
}
GI_FORCEINLINE
GI_INT32
GiRoundAsInt32(GI_FLOAT32 Vector) {
GI_UINT32_t GiReinterpretAsUint32(GI_FLOAT32_t In) {
#if defined(GI_NEON_INTRINSICS)
    return vreinterpretq_u32_f32(In);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_castps_si128(In);
#else
    return *(GI_UINT32_t*)(&In);
#endif
}
GI_FORCEINLINE
GI_FLOAT32_t GiReintInt32ToFloat32(GI_INT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vreinterpretq_f32_s32(Vector);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_castsi128_ps(Vector);
#else
    return *(GI_FLOAT32_t*)(&Vector);
#endif
}
GI_FORCEINLINE
GI_FLOAT32_t GiReintUint32ToFloat32(GI_UINT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vreinterpretq_f32_u32(Vector);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_castsi128_ps(Vector);
#else
    return *(GI_FLOAT32_t*)(&Vector);
#endif
}
GI_FORCEINLINE
GI_INT32_t GiRoundAsInt32(GI_FLOAT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
#if __ARM_ARCH >= 8
    return vcvtaq_s32_f32(Vector);
@@ -47,7 +78,7 @@ GiRoundAsInt32(GI_FLOAT32 Vector) {
    return _mm_castps_si128(
            _mm_round_ps(vres0, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
#else
    GI_INT32 ret;
    GI_INT32_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret[i] = (int32_t)round(Vector[i]);
    }
@@ -56,42 +87,43 @@ GiRoundAsInt32(GI_FLOAT32 Vector) {
}
GI_FORCEINLINE
GI_FLOAT32
GiCastToFloat32(GI_INT32 Vector) {
GI_INT32_t GiCastToInt32(GI_FLOAT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vcvtq_f32_s32(Vector);
    return vcvtq_s32_f32(Vector);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_cvtepi32_ps(Vector);
    return _mm_cvttps_epi32(Vector);
#else
    GI_FLOAT32 ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
        ret[i] = float(Vector[i]);
    GI_INT32_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret[i] = (int32_t)(Vector[i]);
    }
    return ret;
#endif
}
GI_FORCEINLINE
GI_FLOAT32
GiReinterpretAsFloat32(GI_INT32 Vector) {
GI_FLOAT32_t GiCastToFloat32(GI_INT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vreinterpretq_f32_s32(Vector);
    return vcvtq_f32_s32(Vector);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_castsi128_ps(Vector);
    return _mm_cvtepi32_ps(Vector);
#else
    return GI_FLOAT32(Vector);
    GI_FLOAT32_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
        ret[i] = float(Vector[i]);
    }
    return ret;
#endif
}
GI_FORCEINLINE
GI_FLOAT32
GiBroadcastFloat32(float Value) {
GI_FLOAT32_t GiBroadcastFloat32(float Value) {
#if defined(GI_NEON_INTRINSICS)
    return vdupq_n_f32(Value);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_set1_ps(Value);
#else
    GI_FLOAT32 ret;
    GI_FLOAT32_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret[i] = Value;
    }
@@ -100,14 +132,13 @@ GiBroadcastFloat32(float Value) {
}
GI_FORCEINLINE
GI_FLOAT32
GiBroadcastFloat32(const float* Value) {
GI_FLOAT32_t GiLoadBroadcastFloat32(const float* Value) {
#if defined(GI_NEON_INTRINSICS)
    return vld1q_dup_f32(Value);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_load_ps1(Value);
#else
    GI_FLOAT32 ret;
    GI_FLOAT32_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret[i] = *Value;
    }
@@ -116,8 +147,7 @@ GiBroadcastFloat32(const float* Value) {
}
GI_FORCEINLINE
GI_FLOAT32
GiZeroFloat32(void) {
GI_FLOAT32_t GiZeroFloat32(void) {
#if defined(GI_NEON_INTRINSICS)
    return vdupq_n_f32(0.0f);
#elif defined(GI_SSE2_INTRINSICS)
@@ -128,14 +158,13 @@ GiZeroFloat32(void) {
}
GI_FORCEINLINE
GI_FLOAT32
GiLoadFloat32(const float* Buffer) {
GI_FLOAT32_t GiLoadFloat32(const float* Buffer) {
#if defined(GI_NEON_INTRINSICS)
    return vld1q_f32(Buffer);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_loadu_ps(Buffer);
#else
    GI_FLOAT32 ret;
    GI_FLOAT32_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret[i] = Buffer[i];
    }
@@ -144,7 +173,7 @@ GiLoadFloat32(const float* Buffer) {
}
GI_FORCEINLINE
void GiStoreFloat32(float* Buffer, GI_FLOAT32 Vector) {
void GiStoreFloat32(float* Buffer, GI_FLOAT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    vst1q_f32(Buffer, Vector);
#elif defined(GI_SSE2_INTRINSICS)
@@ -156,33 +185,22 @@ void GiStoreFloat32(float* Buffer, GI_FLOAT32 Vector) {
#endif
}
GI_FORCEINLINE
void GiStoreAlignedFloat32(float* Buffer, GI_FLOAT32 Vector) {
#if defined(GI_NEON_INTRINSICS)
    vst1q_f32(Buffer, Vector);
#elif defined(GI_SSE2_INTRINSICS)
    _mm_store_ps(Buffer, Vector);
#else
    GiStoreFloat32(Buffer, Vector);
#endif
}
#if defined(GI_NEON_INTRINSICS)
#define GISTORELANEFLOAT32(i) \
    GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32 Vector) { \
        vst1q_lane_f32(Buffer, Vector, i); \
#define GISTORELANEFLOAT32(i) \
    GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32_t Vector) { \
        vst1q_lane_f32(Buffer, Vector, i); \
    }
#elif defined(GI_SSE2_INTRINSICS)
#define GISTORELANEFLOAT32(i) \
    GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32 Vector) { \
    GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32_t Vector) { \
        _mm_store_ss(Buffer, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(i, i, i, i))); \
    }
#else
#define GISTORELANEFLOAT32(i) \
    GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32 Vector) { \
        *Buffer = Vector[i]; \
#define GISTORELANEFLOAT32(i) \
    GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32_t Vector) { \
        *Buffer = Vector[i]; \
    }
#endif
@@ -194,20 +212,20 @@ GISTORELANEFLOAT32(3)
#undef GISTORELANEFLOAT32
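/* The macro instantiations above define GiStoreLane0Float32 ..
 * GiStoreLane3Float32, each storing a single lane to memory, e.g. for
 * scattering one vector across four separate row pointers. */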
#if defined(GI_NEON_INTRINSICS)
#define GIEXTRACTLANEFLOAT32(i) \
    GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32 Vector) { \
        return vgetq_lane_f32(Vector, i); \
#define GIEXTRACTLANEFLOAT32(i) \
    GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32_t Vector) { \
        return vgetq_lane_f32(Vector, i); \
    }
#elif defined(GI_SSE2_INTRINSICS)
#define GIEXTRACTLANEFLOAT32(i) \
    GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32 Vector) { \
    GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32_t Vector) { \
        return _mm_cvtss_f32(_mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(i, i, i, i))); \
    }
#else
#define GIEXTRACTLANEFLOAT32(i) \
    GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32 Vector) { \
        return Vector[i]; \
#define GIEXTRACTLANEFLOAT32(i) \
    GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32_t Vector) { \
        return Vector[i]; \
    }
#endif
@@ -218,8 +236,7 @@ GIEXTRACTLANEFLOAT32(3)
#undef GIEXTRACTLANEFLOAT32
GI_FORCEINLINE
GI_FLOAT32
GiInterleaveLowFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
GI_FLOAT32_t GiInterleaveLowFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON64_INTRINSICS)
    return vzip1q_f32(Vector1, Vector2);
#elif defined(GI_NEON32_INTRINSICS)
@@ -228,7 +245,7 @@ GiInterleaveLowFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_unpacklo_ps(Vector1, Vector2);
#else
    GI_FLOAT32 ret;
    GI_FLOAT32_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(float); i++) {
        ret[2 * i] = Vector1[i];
        ret[2 * i + 1] = Vector2[i];
@@ -238,8 +255,7 @@ GiInterleaveLowFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
}
GI_FORCEINLINE
GI_FLOAT32
GiInterleaveHighFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
GI_FLOAT32_t GiInterleaveHighFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON64_INTRINSICS)
    return vzip2q_f32(Vector1, Vector2);
#elif defined(GI_NEON32_INTRINSICS)
@@ -248,7 +264,7 @@ GiInterleaveHighFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_unpackhi_ps(Vector1, Vector2);
#else
    GI_FLOAT32 ret;
    GI_FLOAT32_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(float); i++) {
        ret[2 * i] = Vector1[GI_SIMD_LEN_BYTE / 2 / sizeof(float) + i];
        ret[2 * i + 1] = Vector2[GI_SIMD_LEN_BYTE / 2 / sizeof(float) + i];
@@ -258,8 +274,7 @@ GiInterleaveHighFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
}
GI_FORCEINLINE
GI_FLOAT32
GiAddFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
GI_FLOAT32_t GiAddFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vaddq_f32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
@@ -270,8 +285,7 @@ GiAddFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
}
GI_FORCEINLINE
GI_FLOAT32
GiSubtractFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
GI_FLOAT32_t GiSubtractFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vsubq_f32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
@@ -282,8 +296,7 @@ GiSubtractFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
}
GI_FORCEINLINE
GI_FLOAT32
GiMultiplyFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
GI_FLOAT32_t GiMultiplyFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vmulq_f32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
@@ -294,12 +307,11 @@ GiMultiplyFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
}
GI_FORCEINLINE
GI_FLOAT32
GiMultiplyScalerFloat32(GI_FLOAT32 Vector1, float Scaler) {
GI_FLOAT32_t GiMultiplyScalerFloat32(GI_FLOAT32_t Vector1, float Scaler) {
#if defined(GI_NEON_INTRINSICS)
    return vmulq_n_f32(Vector1, Scaler);
#elif defined(GI_SSE2_INTRINSICS)
    GI_FLOAT32 Vector2 = _mm_set1_ps(Scaler);
    GI_FLOAT32_t Vector2 = _mm_set1_ps(Scaler);
    return _mm_mul_ps(Vector1, Vector2);
#else
    return Vector1 * Scaler;
@@ -307,10 +319,14 @@ GiMultiplyScalerFloat32(GI_FLOAT32 Vector1, float Scaler) {
}
GI_FORCEINLINE
GI_FLOAT32
GiMultiplyAddVecFloat32(GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
GI_FLOAT32_t GiMultiplyAddFloat32(
        GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
#if defined(__ARM_FEATURE_FMA)
    return vfmaq_f32(VectorSum, Vector1, Vector2);
#else
    return vmlaq_f32(VectorSum, Vector1, Vector2);
#endif
#elif defined(GI_FMA3_INTRINSICS)
    return _mm_fmadd_ps(Vector1, Vector2, VectorSum);
#elif defined(GI_SSE2_INTRINSICS)
@@ -321,41 +337,75 @@ GiMultiplyAddVecFloat32(GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vec
}
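A sketch of the fused helper in use, as a hypothetical dot-product kernel (assumes four float lanes, i.e. GI_SIMD_LEN_BYTE == 16):

GI_FORCEINLINE float dot_product(const float* a, const float* b, size_t n) {
    GI_FLOAT32_t acc = GiZeroFloat32();
    size_t i = 0;
    for (; i + 4 <= n; i += 4) {
        acc = GiMultiplyAddFloat32(acc, GiLoadFloat32(a + i), GiLoadFloat32(b + i));
    }
    float sum = GiReduceAddFloat32(acc); /* horizontal add of the four lanes */
    for (; i < n; i++) {
        sum += a[i] * b[i];
    }
    return sum;
}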
GI_FORCEINLINE
GI_FLOAT32
GiMultiplyAddScalarFloat32(GI_FLOAT32 VectorSum, GI_FLOAT32 Vector, float Scalar) {
GI_FLOAT32_t GiMultiplySubFloat32(
        GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vmlaq_n_f32(VectorSum, Vector, Scalar);
    return vmlsq_f32(VectorSum, Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_sub_ps(VectorSum, _mm_mul_ps(Vector1, Vector2));
#else
    return VectorSum - Vector1 * Vector2;
#endif
}
GI_FORCEINLINE
GI_FLOAT32_t GiMultiplyAddScalarFloat32(
        GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector, float Scalar) {
#if defined(GI_NEON_INTRINSICS)
#if defined(__ARM_FEATURE_FMA)
    return vfmaq_n_f32(VectorSum, Vector, Scalar);
#else
    return vmlaq_n_f32(VectorSum, Vector, Scalar);
#endif
#elif defined(GI_SSE2_INTRINSICS)
    return GiMultiplyAddVecFloat32(VectorSum, GiBroadcastFloat32(Scalar), Vector);
    return GiMultiplyAddFloat32(VectorSum, GiBroadcastFloat32(Scalar), Vector);
#else
    return VectorSum + Vector * Scalar;
#endif
}
#if defined(GI_NEON_INTRINSICS)
#define GIMULTIPLYADDLANFLOAT32(i) \
    GI_FORCEINLINE GI_FLOAT32 GiMultiplyAddLan##i##Float32( \
            GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { \
        return vmlaq_lane_f32(VectorSum, Vector1, vget_low_f32(Vector2), i); \
#if defined(__ARM_FEATURE_FMA)
#define GIMULTIPLYADDLANFLOAT32(i) \
    GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32( \
            GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \
        return vfmaq_lane_f32(VectorSum, Vector1, vget_low_f32(Vector2), i); \
    }
GIMULTIPLYADDLANFLOAT32(0)
GIMULTIPLYADDLANFLOAT32(1)
#undef GIMULTIPLYADDLANFLOAT32
#define GIMULTIPLYADDLANFLOAT32(i) \
    GI_FORCEINLINE GI_FLOAT32 GiMultiplyAddLan##i##Float32( \
            GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { \
    GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32( \
            GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \
        return vfmaq_lane_f32(VectorSum, Vector1, vget_high_f32(Vector2), i - 2); \
    }
GIMULTIPLYADDLANFLOAT32(2)
GIMULTIPLYADDLANFLOAT32(3)
#else
#define GIMULTIPLYADDLANFLOAT32(i) \
    GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32( \
            GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \
        return vmlaq_lane_f32(VectorSum, Vector1, vget_low_f32(Vector2), i); \
    }
GIMULTIPLYADDLANFLOAT32(0)
GIMULTIPLYADDLANFLOAT32(1)
#undef GIMULTIPLYADDLANFLOAT32
#define GIMULTIPLYADDLANFLOAT32(i) \
    GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32( \
            GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \
        return vmlaq_lane_f32(VectorSum, Vector1, vget_high_f32(Vector2), i - 2); \
    }
GIMULTIPLYADDLANFLOAT32(2)
GIMULTIPLYADDLANFLOAT32(3)
#endif
#undef GIMULTIPLYADDLANFLOAT32
#elif defined(GI_SSE2_INTRINSICS)
#define GIMULTIPLYADDLANFLOAT32(i) \
    GI_FORCEINLINE GI_FLOAT32 GiMultiplyAddLan##i##Float32( \
            GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { \
        return GiMultiplyAddScalarFloat32( \
                VectorSum, Vector1, GiExtractLane##i##Float32(Vector2)); \
#define GIMULTIPLYADDLANFLOAT32(i) \
    GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32( \
            GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \
        return GiMultiplyAddScalarFloat32( \
                VectorSum, Vector1, GiExtractLane##i##Float32(Vector2)); \
    }
GIMULTIPLYADDLANFLOAT32(0)
GIMULTIPLYADDLANFLOAT32(1)
@@ -363,10 +413,10 @@ GIMULTIPLYADDLANFLOAT32(2)
GIMULTIPLYADDLANFLOAT32(3)
#undef GIMULTIPLYADDLANFLOAT32
#else
#define GIMULTIPLYADDLANFLOAT32(i) \
    GI_FORCEINLINE GI_FLOAT32 GiMultiplyAddLan##i##Float32( \
            GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { \
        return VectorSum + Vector1 * Vector2[i]; \
#define GIMULTIPLYADDLANFLOAT32(i) \
    GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32( \
            GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \
        return VectorSum + Vector1 * Vector2[i]; \
    }
GIMULTIPLYADDLANFLOAT32(0)
GIMULTIPLYADDLANFLOAT32(1)
@@ -376,8 +426,7 @@ GIMULTIPLYADDLANFLOAT32(3)
#endif
GI_FORCEINLINE
GI_FLOAT32
GiDivideFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
GI_FLOAT32_t GiDivideFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON64_INTRINSICS)
    return vdivq_f32(Vector1, Vector2);
#elif defined(GI_NEON32_INTRINSICS)
@@ -392,64 +441,129 @@ GiDivideFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
}
GI_FORCEINLINE
GI_FLOAT32
GiGreaterThanFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
GI_FLOAT32_t GiRecpeSFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vrecpsq_f32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    GI_FLOAT32_t two = _mm_set1_ps(2.0f);
    return _mm_sub_ps(two, _mm_mul_ps(Vector1, Vector2));
#else
    return (2.0f - Vector1 * Vector2);
#endif
}
GI_FORCEINLINE
GI_FLOAT32_t GiRecpeFloat32(GI_FLOAT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vrecpeq_f32(Vector);
#elif defined(GI_SSE2_INTRINSICS)
    GI_FLOAT32_t ones = _mm_set1_ps(1.0f);
    return _mm_div_ps(ones, Vector);
#else
    return 1 / Vector;
#endif
}
GI_FORCEINLINE
GI_FLOAT32_t GiNegFloat32(GI_FLOAT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vnegq_f32(Vector);
#elif defined(GI_SSE2_INTRINSICS)
    GI_FLOAT32_t zero = _mm_set1_ps(0.0f);
    return _mm_sub_ps(zero, Vector);
#else
    return -Vector;
#endif
}
GI_FORCEINLINE
GI_UINT32_t GiGreaterThanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vcgtq_f32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_castps_si128(_mm_cmpgt_ps(Vector1, Vector2));
#else
    GI_UINT32_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret[i] = Vector1[i] > Vector2[i] ? 0xFFFFFFFF : 0;
    }
    return ret;
#endif
}
GI_FORCEINLINE
GI_UINT32_t GiLessThanEqFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vreinterpretq_f32_u32(vcgtq_f32(Vector1, Vector2));
    return vcleq_f32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_cmpgt_ps(Vector1, Vector2);
    return _mm_castps_si128(_mm_cmple_ps(Vector1, Vector2));
#else
    return Vector1 > Vector2;
    GI_UINT32_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret[i] = Vector1[i] <= Vector2[i] ? 0xFFFFFFFF : 0;
    }
    return ret;
#endif
}
GI_FORCEINLINE
GI_FLOAT32
GiAndFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
GI_UINT32_t GiLessThanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vcltq_f32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_castps_si128(_mm_cmplt_ps(Vector1, Vector2));
#else
    GI_UINT32_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret[i] = Vector1[i] < Vector2[i] ? 0xFFFFFFFF : 0;
    }
    return ret;
#endif
}
GI_FORCEINLINE
GI_FLOAT32_t GiAndFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_SSE2_INTRINSICS)
    return _mm_and_ps(Vector1, Vector2);
#else
    return GiReinterpretAsFloat32(
    return GiReintInt32ToFloat32(
            GiAndInt32(GiReinterpretAsInt32(Vector1), GiReinterpretAsInt32(Vector2)));
#endif
}
GI_FORCEINLINE
GI_FLOAT32
GiOrFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
GI_FLOAT32_t GiOrFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_SSE2_INTRINSICS)
    return _mm_or_ps(Vector1, Vector2);
#else
    return GiReinterpretAsFloat32(
    return GiReintInt32ToFloat32(
            GiOrInt32(GiReinterpretAsInt32(Vector1), GiReinterpretAsInt32(Vector2)));
#endif
}
GI_FORCEINLINE
GI_FLOAT32
GiAndNotFloat32(GI_FLOAT32 VectorNot, GI_FLOAT32 Vector) {
GI_FLOAT32_t GiAndNotFloat32(GI_FLOAT32_t VectorNot, GI_FLOAT32_t Vector) {
#if defined(GI_SSE2_INTRINSICS)
    return _mm_andnot_ps(VectorNot, Vector);
#else
    return GiReinterpretAsFloat32(GiAndNotInt32(
    return GiReintInt32ToFloat32(GiAndNotInt32(
            GiReinterpretAsInt32(VectorNot), GiReinterpretAsInt32(Vector)));
#endif
}
GI_FORCEINLINE
GI_FLOAT32
GiXorFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
GI_FLOAT32_t GiXorFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_SSE2_INTRINSICS)
    return _mm_xor_ps(Vector1, Vector2);
#else
    return GiReinterpretAsFloat32(
    return GiReintInt32ToFloat32(
            GiXorInt32(GiReinterpretAsInt32(Vector1), GiReinterpretAsInt32(Vector2)));
#endif
}
GI_FORCEINLINE
GI_FLOAT32
GiBlendFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2, GI_FLOAT32 Selection) {
GI_FLOAT32_t GiBlendFloat32(
        GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2, GI_FLOAT32_t Selection) {
    return GiOrFloat32(
            GiAndFloat32(Vector2, Selection), GiAndNotFloat32(Selection, Vector1));
}
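/* Selection semantics: per lane, result = (Vector2 & Selection) | (Vector1 & ~Selection),
 * so all-ones lanes in Selection pick Vector2 and all-zero lanes keep Vector1. */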
@@ -458,14 +572,54 @@ GiBlendFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2, GI_FLOAT32 Selection) {
#define MAX_NAN(a, b) (isnan(a) || (a) > (b)) ? (a) : (b);
GI_FORCEINLINE
GI_FLOAT32
GiMaximumFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
GI_FLOAT32_t GiBSLFloat32(
        GI_UINT32_t Selection, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vbslq_f32(Selection, Vector1, Vector2);
#else
    return GiBlendFloat32(Vector1, Vector2, GiReintUint32ToFloat32(Selection));
#endif
}
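Compare and bit-select compose naturally; a hypothetical sketch of a branchless per-lane select that behaves like GiMaximumFloat32 for non-NaN inputs:

GI_FORCEINLINE GI_FLOAT32_t select_greater(GI_FLOAT32_t a, GI_FLOAT32_t b) {
    GI_UINT32_t mask = GiGreaterThanFloat32(a, b); /* all-ones where a > b */
    return GiBSLFloat32(mask, a, b);               /* take a where the mask is set */
}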
GI_FORCEINLINE
GI_FLOAT32_t GiMaximumFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vmaxq_f32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_max_ps(Vector1, Vector2);
#else
    GI_FLOAT32_t max;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        max[i] = Max(Vector1[i], Vector2[i]);
    }
    return max;
#endif
}
GI_FORCEINLINE
GI_FLOAT32_t GiMinimumFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vminq_f32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_min_ps(Vector1, Vector2);
#else
    GI_FLOAT32_t min;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        min[i] = Min(Vector1[i], Vector2[i]);
    }
    return min;
#endif
}
GI_FORCEINLINE
GI_FLOAT32_t GiMaxNanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vmaxq_f32(Vector1, Vector2);
#else
    //! _mm_max_ps does not follow the IEEE standard when an input is NaN, so
    //! implement it in C code
    GI_FLOAT32 max;
#define MAX_NAN(a, b) (isnan(a) || (a) > (b)) ? (a) : (b);
    GI_FLOAT32_t max;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        max[i] = MAX_NAN(Vector1[i], Vector2[i]);
    }
@@ -474,14 +628,14 @@ GiMaximumFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
}
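/* Why the C fallback exists: _mm_max_ps returns its second operand whenever
 * either input is NaN, while NEON's vmaxq_f32 propagates the NaN; MAX_NAN
 * reproduces the NEON behavior, e.g. MAX_NAN(NAN, 1.0f) yields NaN. */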
GI_FORCEINLINE
GI_FLOAT32
GiMinimumFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
GI_FLOAT32_t GiMinNanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vminq_f32(Vector1, Vector2);
#else
    //! _mm_min_ps does not follow the IEEE standard when an input is NaN, so
    //! implement it in C code
    GI_FLOAT32 min;
#define MIN_NAN(a, b) (isnan(a) || (a) < (b)) ? (a) : (b);
    GI_FLOAT32_t min;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        min[i] = MIN_NAN(Vector1[i], Vector2[i]);
    }
@@ -490,15 +644,14 @@ GiMinimumFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
}
GI_FORCEINLINE
GI_FLOAT32
GiClampFloat32(GI_FLOAT32 Value, float LowerRange, float UpperRange) {
GI_FLOAT32_t GiClampFloat32(GI_FLOAT32_t Value, float LowerRange, float UpperRange) {
    Value = GiMaximumFloat32(GiBroadcastFloat32(LowerRange), Value);
    Value = GiMinimumFloat32(GiBroadcastFloat32(UpperRange), Value);
    return Value;
}
GI_FORCEINLINE
float GiReduceAddFloat32(GI_FLOAT32 Vector) {
float GiReduceAddFloat32(GI_FLOAT32_t Vector) {
#if defined(GI_NEON64_INTRINSICS)
    Vector = vpaddq_f32(Vector, Vector);
    Vector = vpaddq_f32(Vector, Vector);
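    /* vpaddq_f32 sums adjacent lane pairs, so two passes fold four lanes into
     * one: [a b c d] -> [a+b c+d a+b c+d] -> [a+b+c+d ...]; lane 0 then holds
     * the total. */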
@@ -525,7 +678,7 @@ float GiReduceAddFloat32(GI_FLOAT32 Vector) {
}
GI_FORCEINLINE
float GiReduceMultiplyFloat32(GI_FLOAT32 Vector) {
float GiReduceMultiplyFloat32(GI_FLOAT32_t Vector) {
#if defined(GI_NEON64_INTRINSICS)
    float32x2_t low = vget_low_f32(Vector);
    float32x2_t high = vget_high_f32(Vector);
@@ -550,7 +703,7 @@ float GiReduceMultiplyFloat32(GI_FLOAT32 Vector) {
#define Min(a, b) (a) < (b) ? (a) : (b)
GI_FORCEINLINE
float GiReduceMaximumFloat32(GI_FLOAT32 Vector) {
float GiReduceMaxNanFloat32(GI_FLOAT32_t Vector) {
#if defined(GI_NEON64_INTRINSICS)
    return vmaxvq_f32(Vector);
#elif defined(GI_NEON32_INTRINSICS)
@@ -560,9 +713,9 @@ float GiReduceMaximumFloat32(GI_FLOAT32 Vector) {
    VectorLow = vpmax_f32(VectorLow, VectorHigh);
    return vget_lane_f32(VectorLow, 0);
#elif defined(GI_SSE2_INTRINSICS)
    Vector = GiMaximumFloat32(
    Vector = GiMaxNanFloat32(
            Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(2, 3, 2, 3)));
    Vector = GiMaximumFloat32(
    Vector = GiMaxNanFloat32(
            Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1)));
    return GiExtractLane0Float32(Vector);
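    /* The SSE path reduces with shuffles: the first GiMaxNanFloat32 folds the
     * upper two lanes onto the lower two, the second folds lane 1 onto lane 0,
     * so lane 0 holds the result. */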
#else
@@ -575,7 +728,7 @@ float GiReduceMaximumFloat32(GI_FLOAT32 Vector) {
}
GI_FORCEINLINE
float GiReduceMinimumFloat32(GI_FLOAT32 Vector) {
float GiReduceMinNanFloat32(GI_FLOAT32_t Vector) {
#if defined(GI_NEON64_INTRINSICS)
    return vminvq_f32(Vector);
#elif defined(GI_NEON32_INTRINSICS)
@@ -585,9 +738,9 @@ float GiReduceMinimumFloat32(GI_FLOAT32 Vector) {
    VectorLow = vpmin_f32(VectorLow, VectorHigh);
    return vget_lane_f32(VectorLow, 0);
#elif defined(GI_SSE2_INTRINSICS)
    Vector = GiMinimumFloat32(
    Vector = GiMinNanFloat32(
            Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(2, 3, 2, 3)));
    Vector = GiMinimumFloat32(
    Vector = GiMinNanFloat32(
            Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1)));
    return GiExtractLane0Float32(Vector);
#else
@@ -599,4 +752,24 @@ float GiReduceMinimumFloat32(GI_FLOAT32 Vector) {
#endif
}
GI_FORCEINLINE
GI_FLOAT32_t GiAbsFloat32(GI_FLOAT32_t Vector1) {
#if defined(GI_NEON_INTRINSICS)
    return vabsq_f32(Vector1);
#elif defined(GI_SSE2_INTRINSICS)
    union {
        unsigned int int_val;
        float float_val;
    } value;
    value.int_val = 0x7fffffff;
    return _mm_and_ps(Vector1, _mm_set_ps1(value.float_val));
#else
    GI_FLOAT32_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret[i] = Vector1[i] > 0 ? Vector1[i] : -Vector1[i];
    }
    return ret;
#endif
}
// vim: syntax=cpp.doxygen
@@ -14,14 +14,13 @@
#include "gi_common.h"
GI_FORCEINLINE
GI_INT32
GiBroadcastInt32(int32_t Value) {
GI_INT32_t GiBroadcastInt32(int32_t Value) {
#if defined(GI_NEON_INTRINSICS)
    return vdupq_n_s32(Value);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_set1_epi32(Value);
#else
    GI_INT32 ret;
    GI_INT32_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
        ret[i] = Value;
    }
@@ -30,14 +29,28 @@ GiBroadcastInt32(int32_t Value) {
}
GI_FORCEINLINE
GI_INT8
GiBroadcastInt8(int8_t Value) {
GI_UINT32_t GiBroadcastUint32(int32_t Value) {
#if defined(GI_NEON_INTRINSICS)
    return vdupq_n_u32(Value);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_set1_epi32(Value);
#else
    GI_UINT32_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
        ret[i] = Value;
    }
    return ret;
#endif
}
GI_FORCEINLINE
GI_INT8_t GiBroadcastInt8(int8_t Value) {
#if defined(GI_NEON_INTRINSICS)
    return vdupq_n_s8(Value);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_set1_epi8(Value);
#else
    GI_INT8 ret;
    GI_INT8_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
        ret[i] = Value;
    }
@@ -46,14 +59,13 @@ GiBroadcastInt8(int8_t Value) {
}
GI_FORCEINLINE
GI_INT32
GiLoadInt32(const int32_t* Buffer) {
GI_INT32_t GiLoadInt32(const int32_t* Buffer) {
#if defined(GI_NEON_INTRINSICS)
    return vld1q_s32(Buffer);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_loadu_si128((const __m128i*)Buffer);
#else
    GI_INT32 ret;
    GI_INT32_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
        ret[i] = Buffer[i];
    }
@@ -62,14 +74,13 @@ GiLoadInt32(const int32_t* Buffer) {
}
GI_FORCEINLINE
GI_INT8
GiLoadInt8(const int8_t* Buffer) {
GI_INT8_t GiLoadInt8(const int8_t* Buffer) {
#if defined(GI_NEON_INTRINSICS)
    return vld1q_s8(Buffer);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_loadu_si128((const __m128i*)Buffer);
#else
    GI_INT8 ret;
    GI_INT8_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
        ret[i] = Buffer[i];
    }
@@ -78,7 +89,7 @@ GiLoadInt8(const int8_t* Buffer) {
}
GI_FORCEINLINE
void GiStoreInt32(int32_t* Buffer, GI_INT32 Vector) {
void GiStoreInt32(int32_t* Buffer, GI_INT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    vst1q_s32(Buffer, Vector);
#elif defined(GI_SSE2_INTRINSICS)
@@ -90,8 +101,60 @@ void GiStoreInt32(int32_t* Buffer, GI_INT32 Vector) {
#endif
}
#if defined(GI_NEON_INTRINSICS)
#define GISTORELANEINT32(i) \
    GI_FORCEINLINE void GiStoreLane##i##Int32(int32_t* Buffer, GI_INT32_t Vector) { \
        vst1q_lane_s32(Buffer, Vector, i); \
    }
#elif defined(GI_SSE2_INTRINSICS)
#define GISTORELANEINT32(i) \
    GI_FORCEINLINE void GiStoreLane##i##Int32(int32_t* Buffer, GI_INT32_t Vector) { \
        GI_FLOAT32_t tmp = _mm_castsi128_ps(Vector); \
        _mm_store_ss( \
                (float*)Buffer, _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(i, i, i, i))); \
    }
#else
#define GISTORELANEINT32(i) \
    GI_FORCEINLINE void GiStoreLane##i##Int32(int32_t* Buffer, GI_INT32_t Vector) { \
        *Buffer = Vector[i]; \
    }
#endif
GISTORELANEINT32(0)
GISTORELANEINT32(1)
GISTORELANEINT32(2)
GISTORELANEINT32(3)
#undef GISTORELANEINT32
GI_FORCEINLINE
GI_INT8_t GiReinterInt32ToInt8(GI_INT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vreinterpretq_s8_s32(Vector);
#elif defined(GI_SSE2_INTRINSICS)
    return Vector;
#else
    return *(GI_INT8_t*)&Vector;
#endif
}
GI_FORCEINLINE
void GiStoreInt16(int16_t* Buffer, GI_INT16_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    vst1q_s16(Buffer, Vector);
#elif defined(GI_SSE2_INTRINSICS)
    _mm_storeu_si128((__m128i*)Buffer, Vector);
#else
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int16_t); i++) {
        Buffer[i] = Vector[i];
    }
#endif
}
GI_FORCEINLINE
void GiStoreInt8(int8_t* Buffer, GI_INT8 Vector) {
void GiStoreInt8(int8_t* Buffer, GI_INT8_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    vst1q_s8(Buffer, Vector);
#elif defined(GI_SSE2_INTRINSICS)
@@ -104,7 +167,7 @@ void GiStoreInt8(int8_t* Buffer, GI_INT8 Vector) {
}
GI_FORCEINLINE
void GiStoreLowInt8(int8_t* Buffer, GI_INT8 Vector) {
void GiStoreLowInt8(int8_t* Buffer, GI_INT8_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    vst1_s8(Buffer, vget_low_s8(Vector));
#elif defined(GI_SSE2_INTRINSICS)
@@ -117,7 +180,7 @@ void GiStoreLowInt8(int8_t* Buffer, GI_INT8 Vector) {
}
GI_FORCEINLINE
void GiStoreHihgInt8(int8_t* Buffer, GI_INT8 Vector) {
void GiStoreHihgInt8(int8_t* Buffer, GI_INT8_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    vst1_s8(Buffer, vget_high_s8(Vector));
#elif defined(GI_SSE2_INTRINSICS)
@@ -130,8 +193,47 @@ void GiStoreHihgInt8(int8_t* Buffer, GI_INT8 Vector) {
}
GI_FORCEINLINE
GI_INT32
GiAddInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
GI_INT32_t GiNegInt32(GI_INT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vnegq_s32(Vector);
#elif defined(GI_SSE2_INTRINSICS)
    GI_INT32_t zero = _mm_set1_epi32(0);
    return _mm_sub_epi32(zero, Vector);
#else
    return -Vector;
#endif
}
GI_FORCEINLINE
GI_INT8_t GiNegInt8(GI_INT8_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vnegq_s8(Vector);
#elif defined(GI_SSE2_INTRINSICS)
    GI_INT8_t zero = _mm_set1_epi8(0);
    return _mm_sub_epi8(zero, Vector);
#else
    return -Vector;
#endif
}
GI_FORCEINLINE
GI_UINT32_t GiTestAndSetUint32(GI_UINT32_t Vector1, GI_UINT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vtstq_u32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    GI_UINT32_t tmp = _mm_and_si128(Vector1, Vector2);
    return _mm_xor_si128(
            _mm_cmpeq_epi32(tmp, _mm_setzero_si128()), _mm_set1_epi32(-1));
#else
    GI_UINT32_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
        ret[i] = Vector1[i] & Vector2[i] ? 0xFFFFFFFF : 0;
    }
    return ret;
#endif
}
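/* Semantics mirror NEON's vtstq_u32: a lane becomes all-ones iff
 * (Vector1 & Vector2) != 0 in that lane. A hypothetical per-lane flag test:
 *   GI_UINT32_t hit = GiTestAndSetUint32(flags, GiBroadcastUint32(1 << 2)); */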
GI_FORCEINLINE
GI_INT32_t GiAddInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vaddq_s32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
@@ -142,8 +244,40 @@ GiAddInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
}
GI_FORCEINLINE
GI_INT32
GiSubtractInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
GI_UINT32_t GiAddUint32(GI_UINT32_t Vector1, GI_UINT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vaddq_u32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_add_epi32(Vector1, Vector2);
#else
    return Vector1 + Vector2;
#endif
}
GI_FORCEINLINE
GI_INT16_t GiAddInt16(GI_INT16_t Vector1, GI_INT16_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vaddq_s16(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_add_epi16(Vector1, Vector2);
#else
    return Vector1 + Vector2;
#endif
}
GI_FORCEINLINE
GI_INT8_t GiAddInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vaddq_s8(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_add_epi8(Vector1, Vector2);
#else
    return Vector1 + Vector2;
#endif
}
GI_FORCEINLINE
GI_INT32_t GiSubtractInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vsubq_s32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
@@ -154,20 +288,82 @@ GiSubtractInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
}
GI_FORCEINLINE
GI_INT32
GiMultiplyInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
GI_UINT32_t GiSubtractUint32(GI_UINT32_t Vector1, GI_UINT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vsubq_u32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_sub_epi32(Vector1, Vector2);
#else
    return Vector1 - Vector2;
#endif
}
GI_FORCEINLINE
GI_INT8_t GiSubtractInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vsubq_s8(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_sub_epi8(Vector1, Vector2);
#else
    return Vector1 - Vector2;
#endif
}
GI_FORCEINLINE
GI_INT32_t GiMultiplyInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vmulq_s32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_mul_epi32(Vector1, Vector2);
    GI_FLOAT32_t v0 = _mm_cvtepi32_ps(Vector1);
    GI_FLOAT32_t v1 = _mm_cvtepi32_ps(Vector2);
    return _mm_cvttps_epi32(_mm_mul_ps(v0, v1));
#else
    return Vector1 * Vector2;
#endif
}
//! x86 SSE2 has no packed 8-bit integer multiply (and _mm_mullo_epi32 only
//! arrives with SSE4.1), so implement it naively
GI_FORCEINLINE
GI_INT8_t GiMultiplyInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vmulq_s8(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    int8_t v1[16], v2[16], res[16];
    _mm_storeu_si128((__m128i*)v1, Vector1);
    _mm_storeu_si128((__m128i*)v2, Vector2);
    for (size_t id = 0; id < 16; id++) {
        res[id] = v1[id] * v2[id];
    }
    return _mm_loadu_si128((__m128i*)res);
#else
    return Vector1 * Vector2;
#endif
}
GI_FORCEINLINE
GI_INT8
GiAndInt8(GI_INT8 Vector1, GI_INT8 Vector2) {
GI_INT32_t GiMultiplyAddInt32(
        GI_INT32_t Vector1, GI_INT32_t Vector2, GI_INT32_t Vector3) {
#if defined(GI_NEON_INTRINSICS)
    return vmlaq_s32(Vector1, Vector2, Vector3);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_add_epi32(Vector1, GiMultiplyInt32(Vector2, Vector3));
#else
    return Vector1 + Vector2 * Vector3;
#endif
}
GI_FORCEINLINE
GI_INT8_t GiMultiplyAddInt8(GI_INT8_t Vector1, GI_INT8_t Vector2, GI_INT8_t Vector3) {
#if defined(GI_NEON_INTRINSICS)
    return vmlaq_s8(Vector1, Vector2, Vector3);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_add_epi8(Vector1, GiMultiplyInt8(Vector2, Vector3));
#else
    return Vector1 + Vector2 * Vector3;
#endif
}
GI_FORCEINLINE
GI_INT8_t GiAndInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vandq_s8(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
@@ -178,8 +374,18 @@ GiAndInt8(GI_INT8 Vector1, GI_INT8 Vector2) {
}
GI_FORCEINLINE
GI_INT8
GiOrInt8(GI_INT8 Vector1, GI_INT8 Vector2) {
GI_UINT32_t GiEOrUint32(GI_UINT32_t Vector1, GI_UINT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return veorq_u32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_xor_si128(Vector1, Vector2);
#else
    return Vector1 ^ Vector2;
#endif
}
GI_FORCEINLINE
GI_INT8_t GiOrInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vorrq_s8(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
@@ -190,21 +396,19 @@ GiOrInt8(GI_INT8 Vector1, GI_INT8 Vector2) {
}
GI_FORCEINLINE
GI_INT8
GiAndNotInt8(GI_INT8 VectorNot, GI_INT8 Vector) {
GI_INT8_t GiAndNotInt8(GI_INT8_t VectorNot, GI_INT8_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vandq_s8(vmvnq_s8(VectorNot), Vector);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_andnot_si128(VectorNot, Vector);
#else
    GI_INT8 Not = ~VectorNot;
    GI_INT8_t Not = ~VectorNot;
    return (Not & Vector);
#endif
}
GI_FORCEINLINE
GI_INT8
GiXorInt8(GI_INT8 Vector1, GI_INT8 Vector2) {
GI_INT8_t GiXorInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return veorq_s8(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
@@ -214,47 +418,85 @@ GiXorInt8(GI_INT8 Vector1, GI_INT8 Vector2) {
#endif
}
GI_FORCEINLINE
GI_INT32_t GiShiftLeft23Int32(GI_INT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
#define GISHIFTLEFTINT32(i) \
    GI_FORCEINLINE GI_INT32 GiShiftLeft##i##Int32(GI_INT32 Vector) { \
        return vshlq_n_s32(Vector, i); \
    }
    return vshlq_n_s32(Vector, 23);
#elif defined(GI_SSE2_INTRINSICS)
#define GISHIFTLEFTINT32(i) \
    GI_FORCEINLINE GI_INT32 GiShiftLeft##i##Int32(GI_INT32 Vector) { \
        return _mm_slli_epi32(Vector, i); \
    }
    return _mm_slli_epi32(Vector, 23);
#else
#define GISHIFTLEFTINT32(i) \
    GI_FORCEINLINE GI_INT32 GiShiftLeft##i##Int32(GI_INT32 Vector) { \
        return Vector << i; \
    }
    return Vector << 23;
#endif
}
GISHIFTLEFTINT32(0)
GISHIFTLEFTINT32(1)
GISHIFTLEFTINT32(2)
GISHIFTLEFTINT32(3)
#undef GISHIFTLEFTINT32
GI_FORCEINLINE
GI_INT32_t GiShiftRight23Int32(GI_INT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vshrq_n_s32(Vector, 23);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_srai_epi32(Vector, 23);
#else
    return Vector >> 23;
#endif
}
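The fixed shift count of 23 is the float32 mantissa width, so these two helpers address the exponent field of a reinterpreted float. A sketch of the presumed intended use (hypothetical helper), building 2^n per lane by writing a biased exponent:

GI_FORCEINLINE GI_FLOAT32_t pow2_int32(GI_INT32_t n) {
    /* assumes -126 <= n <= 127 so the biased exponent stays in range */
    GI_INT32_t biased = GiAddInt32(n, GiBroadcastInt32(127));
    return GiReintInt32ToFloat32(GiShiftLeft23Int32(biased));
}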
GI_FORCEINLINE
GI_INT32
GiBlendInt32(GI_INT32 Vector1, GI_INT32 Vector2, GI_INT32 Selection) {
GI_INT32_t GiBlendInt32(GI_INT32_t Vector1, GI_INT32_t Vector2, GI_INT32_t Selection) {
    return GiOrInt32(GiAndInt32(Vector2, Selection), GiAndNotInt32(Selection, Vector1));
}
GI_FORCEINLINE
GI_INT8
GiBlendInt8(GI_INT8 Vector1, GI_INT8 Vector2, GI_INT8 Selection) {
GI_INT8_t GiBlendInt8(GI_INT8_t Vector1, GI_INT8_t Vector2, GI_INT8_t Selection) {
    return GiOrInt8(GiAndInt8(Vector2, Selection), GiAndNotInt8(Selection, Vector1));
}
GI_FORCEINLINE
GI_INT32
GiMaximumInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
GI_INT32_t GiAbsInt32(GI_INT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vabsq_s32(Vector);
#elif defined(GI_SSE42_INTRINSICS)
    return _mm_abs_epi32(Vector);
#else
    GI_INT32_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
        ret[i] = Vector[i] > 0 ? Vector[i] : -Vector[i];
    }
    return ret;
#endif
}
GI_FORCEINLINE
GI_INT16_t GiAbsInt16(GI_INT16_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vabsq_s16(Vector);
#elif defined(GI_SSE42_INTRINSICS)
    return _mm_abs_epi16(Vector);
#else
    GI_INT16_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int16_t); i++) {
        ret[i] = Vector[i] > 0 ? Vector[i] : -Vector[i];
    }
    return ret;
#endif
}
GI_FORCEINLINE
GI_INT8_t GiAbsInt8(GI_INT8_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vabsq_s8(Vector);
#elif defined(GI_SSE42_INTRINSICS)
    return _mm_abs_epi8(Vector);
#else
    GI_INT8_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
        ret[i] = Vector[i] > 0 ? Vector[i] : -Vector[i];
    }
    return ret;
#endif
}
GI_FORCEINLINE
GI_INT32_t GiMaximumInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vmaxq_s32(Vector1, Vector2);
#elif defined(GI_SSE42_INTRINSICS)
@@ -267,8 +509,7 @@ GiMaximumInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
}
GI_FORCEINLINE
GI_INT32
GiMinimumInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
GI_INT32_t GiMinimumInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vminq_s32(Vector1, Vector2);
#elif defined(GI_SSE42_INTRINSICS)
@@ -281,14 +522,12 @@ GiMinimumInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
}
GI_FORCEINLINE
GI_INT8
GiBlendInt8x16(GI_INT8 Vector1, GI_INT8 Vector2, GI_INT8 Selection) {
GI_INT8_t GiBlendInt8x16(GI_INT8_t Vector1, GI_INT8_t Vector2, GI_INT8_t Selection) {
    return GiOrInt8(GiAndInt8(Vector2, Selection), GiAndNotInt8(Selection, Vector1));
}
GI_FORCEINLINE
GI_INT8
GiMaximumInt8(GI_INT8 Vector1, GI_INT8 Vector2) {
GI_INT8_t GiMaximumInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vmaxq_s8(Vector1, Vector2);
#elif defined(GI_SSE42_INTRINSICS)
@@ -301,8 +540,7 @@ GiMaximumInt8(GI_INT8 Vector1, GI_INT8 Vector2) {
}
GI_FORCEINLINE
GI_INT8
GiMinimumInt8(GI_INT8 Vector1, GI_INT8 Vector2) {
GI_INT8_t GiMinimumInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vminq_s8(Vector1, Vector2);
#elif defined(GI_SSE42_INTRINSICS)
@@ -315,8 +553,7 @@ GiMinimumInt8(GI_INT8 Vector1, GI_INT8 Vector2) {
}
GI_FORCEINLINE
GI_INT16
GiMoveHighLongInt8(GI_INT8 Vector) {
GI_INT16_t GiMoveHighLongInt8(GI_INT8_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vmovl_s8(vget_high_s8(Vector));
#elif defined(GI_SSE42_INTRINSICS)
@@ -330,7 +567,7 @@ GiMoveHighLongInt8(GI_INT8 Vector) {
    }
    return _mm_loadu_si128((__m128i*)data);
#else
    GI_INT16 ret;
    GI_INT16_t ret;
    int8_t* data = (int8_t*)&Vector;
    size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t);
    for (size_t i = 0; i < half_length; i++) {
@@ -341,8 +578,7 @@ GiMoveHighLongInt8(GI_INT8 Vector) {
}
GI_FORCEINLINE
GI_INT16
GiMoveLowLongInt8(GI_INT8 Vector) {
GI_INT16_t GiMoveLowLongInt8(GI_INT8_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vmovl_s8(vget_low_s8(Vector));
#elif defined(GI_SSE42_INTRINSICS)
@@ -356,7 +592,7 @@ GiMoveLowLongInt8(GI_INT8 Vector) {
    }
    return _mm_loadu_si128((__m128i*)data);
#else
    GI_INT16 ret;
    GI_INT16_t ret;
    size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t);
    for (size_t i = 0; i < half_length; i++) {
        ret[i] = Vector[i];
@@ -366,8 +602,7 @@ GiMoveLowLongInt8(GI_INT8 Vector) {
}
GI_FORCEINLINE
GI_INT32
GiMoveHighLongInt16(GI_INT16 Vector) {
GI_INT32_t GiMoveHighLongInt16(GI_INT16_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vmovl_s16(vget_high_s16(Vector));
#elif defined(GI_SSE42_INTRINSICS)
@@ -381,7 +616,7 @@ GiMoveHighLongInt16(GI_INT16 Vector) {
    }
    return _mm_loadu_si128((__m128i*)data);
#else
    GI_INT32 ret;
    GI_INT32_t ret;
    size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int16_t);
    for (size_t i = 0; i < half_length; i++) {
        ret[i] = Vector[half_length + i];
@@ -391,8 +626,7 @@ GiMoveHighLongInt16(GI_INT16 Vector) {
}
GI_FORCEINLINE
GI_INT32
GiMoveLowLongInt16(GI_INT16 Vector) {
GI_INT32_t GiMoveLowLongInt16(GI_INT16_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vmovl_s16(vget_low_s16(Vector));
#elif defined(GI_SSE42_INTRINSICS)
@@ -406,7 +640,7 @@ GiMoveLowLongInt16(GI_INT16 Vector) {
    }
    return _mm_loadu_si128((__m128i*)data);
#else
    GI_INT32 ret;
    GI_INT32_t ret;
    size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int16_t);
    for (size_t i = 0; i < half_length; i++) {
        ret[i] = Vector[i];
@@ -416,7 +650,7 @@ GiMoveLowLongInt16(GI_INT16 Vector) {
}
GI_FORCEINLINE
int32_t GiReduceAddInt8(GI_INT8 Vector) {
int32_t GiReduceAddInt8(GI_INT8_t Vector) {
#if defined(GI_NEON64_INTRINSICS)
    return vaddlvq_s8(Vector);
#elif defined(GI_NEON32_INTRINSICS)
@@ -461,7 +695,7 @@ int32_t GiReduceAddInt8(GI_INT8 Vector) {
}
GI_FORCEINLINE
int8_t GiReduceMaxInt8(GI_INT8 Vector) {
int8_t GiReduceMaxInt8(GI_INT8_t Vector) {
#if defined(GI_NEON64_INTRINSICS)
    return vmaxvq_s8(Vector);
#elif defined(GI_NEON32_INTRINSICS)
@@ -509,7 +743,7 @@ int8_t GiReduceMaxInt8(GI_INT8 Vector) {
}
GI_FORCEINLINE
int8_t GiReduceMinInt8(GI_INT8 Vector) {
int8_t GiReduceMinInt8(GI_INT8_t Vector) {
#if defined(GI_NEON64_INTRINSICS)
    return vminvq_s8(Vector);
#elif defined(GI_NEON32_INTRINSICS)
@@ -562,8 +796,7 @@ int8_t GiReduceMinInt8(GI_INT8 Vector) {
//! convert to the shorter type, with the lower bits holding the real data;
//! the higher bits repeat the lower bits
GI_FORCEINLINE
GI_INT8
GiCvtFromFloat32ToInt8(GI_FLOAT32 src) {
GI_INT8_t GiCvtFromFloat32ToInt8(GI_FLOAT32_t src) {
#if defined(GI_NEON_INTRINSICS)
#if __ARM_ARCH >= 8
    int32x4_t vres0 = vcvtaq_s32_f32(src);
@@ -595,7 +828,7 @@ GiCvtFromFloat32ToInt8(GI_FLOAT32 src) {
    __m128i vepi8 = _mm_packs_epi16(vepi16, vepi16);
    return vepi8;
#else
    GI_INT8 ret;
    GI_INT8_t ret;
    int length = GI_SIMD_LEN_BYTE / sizeof(float);
    for (int i = 0; i < length; i++) {
        int8_t data = Saturate(round(src[i]), -128, 127);
@@ -609,8 +842,7 @@ GiCvtFromFloat32ToInt8(GI_FLOAT32 src) {
}
GI_FORCEINLINE
GI_INT8
GiCvtFromFloat32V2ToInt8(GI_FLOAT32_V2 vsrc) {
GI_INT8_t GiCvtFromFloat32V2ToInt8(GI_FLOAT32_V2_t vsrc) {
#if defined(GI_NEON_INTRINSICS)
#if __ARM_ARCH >= 8
    int32x4_t vres0 = vcvtaq_s32_f32(vsrc.val[0]);
@@ -653,7 +885,7 @@ GiCvtFromFloat32V2ToInt8(GI_FLOAT32_V2 vsrc) {
    __m128i vepi8 = _mm_packs_epi16(vepi16_0, vepi16_0);
    return vepi8;
#else
    GI_INT8 ret;
    GI_INT8_t ret;
    int length = GI_SIMD_LEN_BYTE / sizeof(float);
    for (int i = 0; i < 2 * length; i++) {
        ret[i] = Saturate(round(vsrc.val[i / length][i % length]), -128, 127);
@@ -663,8 +895,7 @@ GiCvtFromFloat32V2ToInt8(GI_FLOAT32_V2 vsrc) {
}
GI_FORCEINLINE
GI_INT8
GiCvtFromFloat32V4ToInt8(GI_FLOAT32_V4 vsrc) {
GI_INT8_t GiCvtFromFloat32V4ToInt8(GI_FLOAT32_V4_t vsrc) {
#if defined(GI_NEON_INTRINSICS)
#if __ARM_ARCH >= 8
    int32x4_t vres0 = vcvtaq_s32_f32(vsrc.val[0]);
@@ -726,7 +957,7 @@ GiCvtFromFloat32V4ToInt8(GI_FLOAT32_V4 vsrc) {
    __m128i vepi8 = _mm_packs_epi16(vepi16_0, vepi16_1);
    return vepi8;
#else
    GI_INT8 ret;
    GI_INT8_t ret;
    int length = GI_SIMD_LEN_BYTE / sizeof(float);
    for (int i = 0; i < 4 * length; i++) {
        ret[i] = Saturate(round(vsrc.val[i / length][i % length]), -128, 127);
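        /* 4 x 4 floats -> 16 saturated int8 lanes: round to nearest, then
         * clamp to [-128, 127], matching the vcvtaq_s32_f32 rounding used on
         * the NEON path above. */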
@@ -46,25 +46,25 @@ struct MeanReducer<dt_qint8, int8_t, int32_t, false> { | |||
using ctype = int8_t; | |||
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); | |||
GI_INT32 res[4]; | |||
GI_INT32_t res[4]; | |||
int32_t remain; | |||
int32_t cnt; | |||
float coef; | |||
GI_FLOAT32 vcoef; | |||
GI_FLOAT32_t vcoef; | |||
MeanReducer(DType, size_t cnt) : remain(0), cnt(cnt), coef(1.0 / cnt) { | |||
memset(res, 0, sizeof(res)); | |||
vcoef = GiBroadcastFloat32(coef); | |||
} | |||
MeanReducer() = default; | |||
void feed(const int8_t* val) { | |||
const GI_INT8 vval = GiLoadInt8(val); | |||
const GI_INT16 vval_low = GiMoveLowLongInt8(vval); | |||
const GI_INT16 vval_high = GiMoveHighLongInt8(vval); | |||
const GI_INT8_t vval = GiLoadInt8(val); | |||
const GI_INT16_t vval_low = GiMoveLowLongInt8(vval); | |||
const GI_INT16_t vval_high = GiMoveHighLongInt8(vval); | |||
const GI_INT32 vval_low_low = GiMoveLowLongInt16(vval_low); | |||
const GI_INT32 vval_low_high = GiMoveHighLongInt16(vval_low); | |||
const GI_INT32 vval_high_low = GiMoveLowLongInt16(vval_high); | |||
const GI_INT32 vval_high_high = GiMoveHighLongInt16(vval_high); | |||
const GI_INT32_t vval_low_low = GiMoveLowLongInt16(vval_low); | |||
const GI_INT32_t vval_low_high = GiMoveHighLongInt16(vval_low); | |||
const GI_INT32_t vval_high_low = GiMoveLowLongInt16(vval_high); | |||
const GI_INT32_t vval_high_high = GiMoveHighLongInt16(vval_high); | |||
res[0] = GiAddInt32(res[0], vval_low_low); | |||
res[1] = GiAddInt32(res[1], vval_low_high); | |||
@@ -74,11 +74,11 @@ struct MeanReducer<dt_qint8, int8_t, int32_t, false> { | |||
void feed_remain(const int8_t* val) { remain += *val; } | |||
void post(int8_t* dst) { | |||
for (int i = 0; i < 4; i += 2) { | |||
GI_FLOAT32 vitem0 = GiMultiplyFloat32(GiCastToFloat32(res[i]), vcoef); | |||
GI_FLOAT32 vitem1 = GiMultiplyFloat32(GiCastToFloat32(res[i + 1]), vcoef); | |||
GI_FLOAT32_t vitem0 = GiMultiplyFloat32(GiCastToFloat32(res[i]), vcoef); | |||
GI_FLOAT32_t vitem1 = GiMultiplyFloat32(GiCastToFloat32(res[i + 1]), vcoef); | |||
GiStoreLowInt8( | |||
dst, | |||
(QConverter::convert<GI_INT8, GI_FLOAT32_V2>({{vitem0, vitem1}}))); | |||
dst, (QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>( | |||
{{vitem0, vitem1}}))); | |||
dst += 8; | |||
} | |||
} | |||
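feed() widens the 16 int8 lanes to four int32 vectors before adding, so per-lane sums cannot overflow at any realistic reduction length (each element contributes at most 128 in magnitude). A scalar model of one feed() step, with acc as an illustrative stand-in for the four GI_INT32_t accumulators:

// res[k] accumulates input lanes 4k..4k+3, widened to int32
int32_t acc[4][4] = {{0}};
for (int i = 0; i < 16; i++)
    acc[i / 4][i % 4] += (int32_t)val[i];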
@@ -93,7 +93,7 @@ struct MeanReducer<dt_float32, float, float, true> {
    using ctype = float;
    static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float);
    GI_FLOAT32 res;
    GI_FLOAT32_t res;
    float result;
    float coef;
    MeanReducer(DType, size_t cnt) : result(0.0f), coef(1.0 / cnt) {
@@ -113,7 +113,7 @@ struct MeanReducer<dt_float32, float, float, false> {
    using ctype = float;
    static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float);
    GI_FLOAT32 res;
    GI_FLOAT32_t res;
    float remain;
    float coef;
    MeanReducer(DType, size_t cnt) : remain(0.0f), coef(1.0 / cnt) {
@@ -140,30 +140,33 @@ struct minReducer;
    struct _mode##Reducer<dt_float32, float, float, true> { \
        using ctype = float; \
        static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \
        GI_FLOAT32 res; \
        GI_FLOAT32_t res; \
        _mode##Reducer(DType, size_t) { res = GiBroadcastFloat32(_init); } \
        _mode##Reducer() = default; \
        void feed(const float* val) { \
            auto vval = GiLoadFloat32(val); \
            res = Gi##_Mode##imumFloat32(res, vval); \
            res = Gi##_Mode##NanFloat32(res, vval); \
        } \
        void feed_remain(const float* val) { \
            auto vval = GiBroadcastFloat32(*val); \
            res = Gi##_Mode##imumFloat32(vval, res); \
            res = Gi##_Mode##NanFloat32(vval, res); \
        } \
        void post(float* dst) { *dst = GiReduce##_Mode##imumFloat32(res); } \
        void post(float* dst) { *dst = GiReduce##_Mode##NanFloat32(res); } \
    }
REDUCER_MAX_MIN_C1(max, Max, std::numeric_limits<dt_float32>::lowest());
REDUCER_MAX_MIN_C1(min, Min, std::numeric_limits<dt_float32>::max());
#undef REDUCER_MAX_MIN_C1
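The rename from Gi{Max,Min}imumFloat32 to Gi{Max,Min}NanFloat32 looks like a NaN-policy change: the imum variants inherit each backend's native behavior (SSE's _mm_max_ps, for example, simply returns its second operand when a lane involves NaN), while the Nan variants evidently propagate NaN deterministically, matching the scalar Max_NAN/Min_NAN macros below. Stated per lane (a sketch of my reading; the vector implementations are elided from this diff):

#include <math.h>

// NaN-propagating maximum: NaN in either operand yields NaN
static inline float max_nan_f32(float a, float b) {
    return (isnan(a) || a > b) ? a : b;
}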
#define Max_NAN(a, b) ((isnan(a) || (a) > (b)) ? (a) : (b))
#define Min_NAN(a, b) ((isnan(a) || (a) < (b)) ? (a) : (b))
#define REDUCER_MAX_MIN_C(_mode, _Mode, _init) \
    template <> \
    struct _mode##Reducer<dt_float32, float, float, false> { \
        using ctype = float; \
        static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \
        GI_FLOAT32 res; \
        GI_FLOAT32_t res; \
        float remain; \
        _mode##Reducer(DType, size_t) { \
            res = GiBroadcastFloat32(_init); \
@@ -171,12 +174,12 @@ REDUCER_MAX_MIN_C1(min, Min, std::numeric_limits<dt_float32>::max());
        } \
        _mode##Reducer() = default; \
        void feed(const float* val) { \
            GI_FLOAT32 vval = GiLoadFloat32(val); \
            res = Gi##_Mode##imumFloat32(res, vval); \
            GI_FLOAT32_t vval = GiLoadFloat32(val); \
            res = Gi##_Mode##NanFloat32(res, vval); \
        } \
        void feed_remain(const float* val) { \
            using namespace std; \
            remain = _mode(*val, remain); \
            remain = _Mode##_NAN(*val, remain); \
        } \
        void post(float* dst) { GiStoreFloat32(dst, res); } \
        void post_remain(float* dst) { *dst = remain; } \
@@ -185,21 +188,23 @@ REDUCER_MAX_MIN_C1(min, Min, std::numeric_limits<dt_float32>::max());
REDUCER_MAX_MIN_C(max, Max, std::numeric_limits<dt_float32>::lowest());
REDUCER_MAX_MIN_C(min, Min, std::numeric_limits<dt_float32>::max());
#undef REDUCER_MAX_MIN_C
#undef Max_NAN
#undef Min_NAN
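feed_remain above routes the scalar tail through these macros, so a NaN past the last full vector still reaches the final result. A usage sketch of that pattern (src, tail_begin and n are illustrative; Max_NAN as defined before the #undef):

#include <float.h>

// tail elements go through the scalar NaN-aware path
float remain = -FLT_MAX; // the max reducer's _init
for (size_t i = tail_begin; i < n; i++)
    remain = Max_NAN(src[i], remain);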
#define REDUCER_MAX_MIN_C1(_mode, _Mode, _init) \
    template <> \
    struct _mode##Reducer<dt_qint8, int8_t, int8_t, true> { \
        using ctype = int8_t; \
        static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); \
        GI_INT8 res; \
        GI_INT8_t res; \
        _mode##Reducer(DType, size_t) { res = GiBroadcastInt8(_init); } \
        _mode##Reducer() = default; \
        void feed(const int8_t* val) { \
            GI_INT8 vval = GiLoadInt8(val); \
            GI_INT8_t vval = GiLoadInt8(val); \
            res = Gi##_Mode##imumInt8(vval, res); \
        } \
        void feed_remain(const int8_t* val) { \
            GI_INT8 vval = GiBroadcastInt8(*val); \
            GI_INT8_t vval = GiBroadcastInt8(*val); \
            res = Gi##_Mode##imumInt8(res, vval); \
        } \
        void post(int8_t* dst) { *dst = GiReduce##_Mode##Int8(res); } \
@@ -214,7 +219,7 @@ REDUCER_MAX_MIN_C1(min, Min, 127);
    struct _mode##Reducer<dt_qint8, int8_t, int8_t, false> { \
        using ctype = int8_t; \
        static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); \
        GI_INT8 res; \
        GI_INT8_t res; \
        int8_t remain; \
        _mode##Reducer(DType, size_t) { \
            res = GiBroadcastInt8(_init); \
@@ -222,7 +227,7 @@ REDUCER_MAX_MIN_C1(min, Min, 127);
        } \
        _mode##Reducer() = default; \
        void feed(const int8_t* val) { \
            GI_INT8 vval = GiLoadInt8(val); \
            GI_INT8_t vval = GiLoadInt8(val); \
            res = Gi##_Mode##imumInt8(res, vval); \
        } \
        void feed_remain(const int8_t* val) { \
@@ -248,7 +253,7 @@ struct ProductReducer;
    struct _mode##Reducer<dt_float32, float, float, true> { \
        using ctype = float; \
        static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \
        GI_FLOAT32 res; \
        GI_FLOAT32_t res; \
        float remain; \
        _mode##Reducer(DType, size_t) { \
            res = GiBroadcastFloat32(_init); \
@@ -256,7 +261,7 @@ struct ProductReducer;
        } \
        _mode##Reducer() = default; \
        void feed(const float* val) { \
            GI_FLOAT32 vval = GiLoadFloat32(val); \
            GI_FLOAT32_t vval = GiLoadFloat32(val); \
            res = Gi##_Mode##Float32(vval, res); \
        } \
        void feed_remain(const float* val) { \
@@ -280,7 +285,7 @@ REDUCER_SUM_PRODUCT_C1(Product, Multiply, multiplies, 1.0f);
    struct _mode##Reducer<dt_float32, float, float, false> { \
        using ctype = float; \
        static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \
        GI_FLOAT32 res; \
        GI_FLOAT32_t res; \
        float remain; \
        _mode##Reducer(DType, size_t) { \
            res = GiBroadcastFloat32(_init); \
@@ -288,7 +293,7 @@ REDUCER_SUM_PRODUCT_C1(Product, Multiply, multiplies, 1.0f);
        } \
        _mode##Reducer() = default; \
        void feed(const float* val) { \
            GI_FLOAT32 vval = GiLoadFloat32(val); \
            GI_FLOAT32_t vval = GiLoadFloat32(val); \
            res = Gi##_Mode##Float32(vval, res); \
        } \
        void feed_remain(const float* val) { \
@@ -313,7 +318,7 @@ struct SumSqrReducer<dt_float32, float, float, true> {
    using ctype = float;
    static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float);
    GI_FLOAT32 res;
    GI_FLOAT32_t res;
    float result;
    SumSqrReducer(DType, size_t cnt) : result(0.0f) {
        MEGDNN_MARK_USED_VAR(cnt);
@@ -321,7 +326,7 @@ struct SumSqrReducer<dt_float32, float, float, true> {
    }
    SumSqrReducer() = default;
    void feed(const float* val) {
        GI_FLOAT32 vval = GiLoadFloat32(val);
        GI_FLOAT32_t vval = GiLoadFloat32(val);
        res = GiAddFloat32(GiMultiplyFloat32(vval, vval), res);
    }
    void feed_remain(const float* val) {
@@ -338,7 +343,7 @@ struct SumSqrReducer<dt_float32, float, float, false> {
    using ctype = float;
    static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float);
    GI_FLOAT32 res;
    GI_FLOAT32_t res;
    float remain;
    SumSqrReducer(DType, size_t cnt) : remain(0.0f) {
        MEGDNN_MARK_USED_VAR(cnt);
@@ -346,7 +351,7 @@ struct SumSqrReducer<dt_float32, float, float, false> {
    }
    SumSqrReducer() = default;
    void feed(const float* val) {
        GI_FLOAT32 vval = GiLoadFloat32(val);
        GI_FLOAT32_t vval = GiLoadFloat32(val);
        res = GiAddFloat32(GiMultiplyFloat32(vval, vval), res);
    }
    void feed_remain(const float* val) { remain += (*val) * (*val); }
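Both SumSqrReducer specializations accumulate x * x lane-wise in feed() and scalar-wise in feed_remain(); the elided post()/post_remain() fold lanes and remainder into the output. End to end the reducer computes a plain sum of squares, as in this scalar model (helper name illustrative):

// what the reducer computes overall: sum of squares of n floats
static inline float sum_sqr_scalar(const float* x, size_t n) {
    float acc = 0.0f; // stands in for the lanes of res plus remain
    for (size_t i = 0; i < n; i++)
        acc += x[i] * x[i];
    return acc;
}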