GitOrigin-RevId: f250aa7b2a
tags/v1.9.0
@@ -95,8 +95,8 @@ typedef __m128i GI_INT16;
 typedef __m128i GI_INT32;
 #else
 typedef float GI_FLOAT32 __attribute__((vector_size(16)));
-typedef uint16_t GI_UINT8 __attribute__((vector_size(16)));
-typedef int16_t GI_INT8 __attribute__((vector_size(16)));
+typedef uint8_t GI_UINT8 __attribute__((vector_size(16)));
+typedef int8_t GI_INT8 __attribute__((vector_size(16)));
 typedef int16_t GI_INT16 __attribute__((vector_size(16)));
 typedef int32_t GI_INT32 __attribute__((vector_size(16)));
 #endif
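Note on the GI_UINT8/GI_INT8 change above: with the GCC/Clang `vector_size(16)` extension the lane count is 16 bytes divided by the element size, so basing the 8-bit vector types on `uint16_t`/`int16_t` silently produced 8-lane 16-bit vectors instead of 16-lane 8-bit ones. A minimal stand-alone sketch (not part of the patch) showing the difference:

    #include <cstdint>
    #include <cstdio>

    typedef uint16_t OLD_GI_UINT8 __attribute__((vector_size(16)));  // 8 lanes of uint16_t
    typedef uint8_t NEW_GI_UINT8 __attribute__((vector_size(16)));   // 16 lanes of uint8_t

    int main() {
        printf("%zu lanes vs %zu lanes\n",
               sizeof(OLD_GI_UINT8) / sizeof(uint16_t),   // prints 8
               sizeof(NEW_GI_UINT8) / sizeof(uint8_t));   // prints 16
        return 0;
    }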
@@ -119,6 +119,9 @@ typedef int32_t GI_INT32 __attribute__((vector_size(16)));
 #define GI_SIMD_LEN_BYTE 16
 #endif
+#define Max(a, b) (a) > (b) ? (a) : (b)
+#define Min(a, b) (a) < (b) ? (a) : (b)
 typedef struct {
     GI_INT32 val[2];
 } GI_INT32_V2;
@@ -223,7 +223,7 @@ GiInterleaveLowFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
 #if defined(GI_NEON64_INTRINSICS)
     return vzip1q_f32(Vector1, Vector2);
 #elif defined(GI_NEON32_INTRINSICS)
-    float32x2_t zipped = vzipq_f32(Vector1, Vector2);
+    float32x4x2_t zipped = vzipq_f32(Vector1, Vector2);
     return zipped.val[0];
 #elif defined(GI_SSE2_INTRINSICS)
     return _mm_unpacklo_ps(Vector1, Vector2);
@@ -243,7 +243,7 @@ GiInterleaveHighFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
 #if defined(GI_NEON64_INTRINSICS)
     return vzip2q_f32(Vector1, Vector2);
 #elif defined(GI_NEON32_INTRINSICS)
-    float32x2_t zipped = vzipq_f32(Vector1, Vector2);
+    float32x4x2_t zipped = vzipq_f32(Vector1, Vector2);
     return zipped.val[1];
 #elif defined(GI_SSE2_INTRINSICS)
     return _mm_unpackhi_ps(Vector1, Vector2);
@@ -460,7 +460,14 @@ GiMaximumFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
 #if defined(GI_NEON_INTRINSICS)
     return vmaxq_f32(Vector1, Vector2);
 #elif defined(GI_SSE2_INTRINSICS)
-    return _mm_max_ps(Vector1, Vector2);
+    //! _mm_max_ps does not follow the IEEE standard when an input is NaN, so
+    //! implement it in C code
+#define MAX_NAN(a, b) (std::isnan(a) || (a) > (b)) ? (a) : (b);
+    GI_FLOAT32 max;
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
+        max[i] = MAX_NAN(Vector1[i], Vector2[i]);
+    }
+    return max;
 #else
     return GiBlendFloat32(Vector2, Vector1, Vector1 > Vector2);
 #endif
@@ -473,6 +480,14 @@ GiMinimumFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
     return vminq_f32(Vector1, Vector2);
 #elif defined(GI_SSE2_INTRINSICS)
     return _mm_min_ps(Vector1, Vector2);
+    //! _mm_min_ps does not follow the IEEE standard when an input is NaN, so
+    //! implement it in C code
+#define MIN_NAN(a, b) (std::isnan(a) || (a) < (b)) ? (a) : (b);
+    GI_FLOAT32 min;
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
+        min[i] = MIN_NAN(Vector1[i], Vector2[i]);
+    }
+    return min;
 #else
     return GiBlendFloat32(Vector2, Vector1, Vector2 > Vector1);
 #endif
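On the two hunks above: `_mm_max_ps`/`_mm_min_ps` return the second (source) operand whenever a comparison involves NaN, so the result depends on operand order rather than propagating the NaN as the surrounding GI code expects. A scalar sketch (illustrative only, mirroring the MAX_NAN macro added by the patch) of the difference:

    #include <cmath>

    // Behaviour the patch implements: a NaN in the first operand wins.
    static inline float gi_max(float a, float b) {
        return (std::isnan(a) || a > b) ? a : b;
    }

    // SSE-style max: any comparison with NaN is false, so the second operand
    // is returned and a NaN in `a` is silently dropped.
    static inline float sse_like_max(float a, float b) {
        return a > b ? a : b;
    }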
@@ -97,7 +97,7 @@ void GiStoreInt8(int8_t* Buffer, GI_INT8 Vector) {
 #elif defined(GI_SSE2_INTRINSICS)
     _mm_storeu_si128((__m128i*)Buffer, Vector);
 #else
-    for (int i = 0; i < 16; i++) {
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
         Buffer[i] = Vector[i];
     }
 #endif
@@ -197,7 +197,8 @@ GiAndNotInt8(GI_INT8 VectorNot, GI_INT8 Vector) {
 #elif defined(GI_SSE2_INTRINSICS)
     return _mm_andnot_si128(VectorNot, Vector);
 #else
-    return (~VectorNot) & Vector;
+    GI_INT8 Not = ~VectorNot;
+    return (Not & Vector);
 #endif
 }
@@ -327,11 +328,13 @@ GiMoveHighLongInt8(GI_INT8 Vector) {
     for (int i = 0; i < 8; i++) {
         data[i] = o_data[8 + i];
     }
-    return _mm_loadu_si16(data);
+    return _mm_loadu_si128((__m128i*)data);
 #else
     GI_INT16 ret;
-    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t); i++) {
-        ret[i] = Vector[GI_SIMD_LEN_BYTE / 2 + i];
+    int8_t* data = (int8_t*)&Vector;
+    size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t);
+    for (size_t i = 0; i < half_length; i++) {
+        ret[i] = data[i + half_length];
     }
     return ret;
 #endif
@@ -351,10 +354,11 @@ GiMoveLowLongInt8(GI_INT8 Vector) {
     for (int i = 0; i < 8; i++) {
         data[i] = o_data[i];
     }
-    return _mm_loadu_si16(data);
+    return _mm_loadu_si128((__m128i*)data);
 #else
     GI_INT16 ret;
-    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t); i++) {
+    size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t);
+    for (size_t i = 0; i < half_length; i++) {
         ret[i] = Vector[i];
     }
     return ret;
@@ -375,11 +379,12 @@ GiMoveHighLongInt16(GI_INT16 Vector) {
     for (int i = 0; i < 4; i++) {
         data[i] = o_data[4 + i];
     }
-    return _mm_loadu_si32(data);
+    return _mm_loadu_si128((__m128i*)data);
 #else
     GI_INT32 ret;
-    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(int16_t); i++) {
-        ret[i] = Vector[GI_SIMD_LEN_BYTE / 2 + i];
+    size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int16_t);
+    for (size_t i = 0; i < half_length; i++) {
+        ret[i] = Vector[half_length + i];
     }
     return ret;
 #endif
@@ -399,10 +404,11 @@ GiMoveLowLongInt16(GI_INT16 Vector) {
     for (int i = 0; i < 4; i++) {
         data[i] = o_data[i];
     }
-    return _mm_loadu_si32(data);
+    return _mm_loadu_si128((__m128i*)data);
 #else
     GI_INT32 ret;
-    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(int16_t); i++) {
+    size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int16_t);
+    for (size_t i = 0; i < half_length; i++) {
         ret[i] = Vector[i];
     }
     return ret;
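The `_mm_loadu_si16` / `_mm_loadu_si32` calls replaced in the four hunks above load only 2 or 4 bytes, while these widening helpers must return a full 16-byte vector, hence the switch to `_mm_loadu_si128` on the buffer filled by the preceding loops. A plain-C sketch (assuming 16-byte vectors, as GI_SIMD_LEN_BYTE implies) of what GiMoveHighLongInt8 computes per lane:

    #include <cstddef>
    #include <cstdint>

    // Widen the upper 8 int8 lanes of a 16-lane vector into 8 int16 lanes.
    void move_high_long_int8(const int8_t src[16], int16_t dst[8]) {
        const size_t half = 16 / 2;  // GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t)
        for (size_t i = 0; i < half; i++) {
            dst[i] = (int16_t)src[half + i];  // sign-extending copy of the high half
        }
    }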
@@ -414,7 +420,7 @@ int16_t GiReduceAddInt8(GI_INT8 Vector) {
 #if defined(GI_NEON64_INTRINSICS)
     return vaddlvq_s8(Vector);
 #elif defined(GI_NEON32_INTRINSICS)
-    int32_t sum = vpaddlq_s16(vpaddlq_s8(Vector));
+    int32x4_t sum = vpaddlq_s16(vpaddlq_s8(Vector));
     return (vgetq_lane_s32(sum, 0) + vgetq_lane_s32(sum, 1) + vgetq_lane_s32(sum, 2) +
             vgetq_lane_s32(sum, 3));
 #elif defined(GI_SSE42_INTRINSICS)
@@ -431,8 +437,8 @@ int16_t GiReduceAddInt8(GI_INT8 Vector) {
     return (int16_t)(ret);
 #elif defined(GI_SSE2_INTRINSICS)
-    __m64 low = GiGetLowInt8x16(Vector);
-    __m64 high = GiGetHighInt8x16(Vector);
+    __m64 low = _mm_movepi64_pi64(Vector);
+    __m64 high = _mm_movepi64_pi64(_mm_unpackhi_epi64(Vector, Vector));
     __m128 v0 = _mm_cvtpi8_ps(low);
     __m128 v1 = _mm_cvtpi8_ps(_mm_unpackhi_pi32(low, low));
     __m128 v2 = _mm_cvtpi8_ps(high);
@@ -447,16 +453,13 @@ int16_t GiReduceAddInt8(GI_INT8 Vector) {
     return (int16_t)(ret0 + ret1 + ret2 + ret3);
 #else
     int32_t sum = 0;
-    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
         sum += Vector[i];
     }
     return sum;
 #endif
 }
-#define Max(a, b) (a) > (b) ? (a) : (b)
-#define Min(a, b) (a) < (b) ? (a) : (b)
 GI_FORCEINLINE
 int8_t GiReduceMaxInt8(GI_INT8 Vector) {
 #if defined(GI_NEON64_INTRINSICS)
@@ -480,23 +483,23 @@ int8_t GiReduceMaxInt8(GI_INT8 Vector) {
     ret = Max(_mm_extract_epi32(sum, 3), ret);
     return (int8_t)ret;
 #elif defined(GI_SSE2_INTRINSICS)
-    __m64 low = GiGetLowInt8x16(Vector);
-    __m64 high = GiGetHighInt8x16(Vector);
+    __m64 low = _mm_movepi64_pi64(Vector);
+    __m64 high = _mm_movepi64_pi64(_mm_unpackhi_epi64(Vector, Vector));
     __m128 v0 = _mm_cvtpi8_ps(low);
     __m128 v1 = _mm_cvtpi8_ps(_mm_unpackhi_pi32(low, low));
     __m128 v2 = _mm_cvtpi8_ps(high);
     __m128 v3 = _mm_cvtpi8_ps(_mm_unpackhi_pi32(high, high));
-    __m128 sum0 = _mm_add_ps(v0, v1);
-    __m128 sum1 = _mm_add_ps(v2, v3);
-    __m128 sum = _mm_add_ps(sum0, sum1);
-    float ret0 = _mm_cvtss_f32(sum);
-    float ret1 = _mm_cvtss_f32(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 1, 1, 1)));
-    float ret2 = _mm_cvtss_f32(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(2, 2, 2, 2)));
-    float ret3 = _mm_cvtss_f32(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 3, 3, 3)));
+    __m128 max0 = _mm_max_ps(v0, v1);
+    __m128 max1 = _mm_max_ps(v2, v3);
+    __m128 max = _mm_max_ps(max0, max1);
+    float ret0 = _mm_cvtss_f32(max);
+    float ret1 = _mm_cvtss_f32(_mm_shuffle_ps(max, max, _MM_SHUFFLE(1, 1, 1, 1)));
+    float ret2 = _mm_cvtss_f32(_mm_shuffle_ps(max, max, _MM_SHUFFLE(2, 2, 2, 2)));
+    float ret3 = _mm_cvtss_f32(_mm_shuffle_ps(max, max, _MM_SHUFFLE(3, 3, 3, 3)));
     return (int8_t)(Max(Max(ret0, ret1), Max(ret2, ret3)));
 #else
     int8_t max = Vector[0];
-    for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
+    for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
         max = Max(max, Vector[i]);
     }
     return max;
@@ -526,23 +529,23 @@ int8_t GiReduceMinInt8(GI_INT8 Vector) {
     ret = Min(_mm_extract_epi32(sum, 3), ret);
     return (int8_t)ret;
 #elif defined(GI_SSE2_INTRINSICS)
-    __m64 low = GiGetLowInt8x16(Vector);
-    __m64 high = GiGetHighInt8x16(Vector);
+    __m64 low = _mm_movepi64_pi64(Vector);
+    __m64 high = _mm_movepi64_pi64(_mm_unpackhi_epi64(Vector, Vector));
     __m128 v0 = _mm_cvtpi8_ps(low);
     __m128 v1 = _mm_cvtpi8_ps(_mm_unpackhi_pi32(low, low));
     __m128 v2 = _mm_cvtpi8_ps(high);
     __m128 v3 = _mm_cvtpi8_ps(_mm_unpackhi_pi32(high, high));
-    __m128 sum0 = _mm_add_ps(v0, v1);
-    __m128 sum1 = _mm_add_ps(v2, v3);
-    __m128 sum = _mm_add_ps(sum0, sum1);
-    float ret0 = _mm_cvtss_f32(sum);
-    float ret1 = _mm_cvtss_f32(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 1, 1, 1)));
-    float ret2 = _mm_cvtss_f32(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(2, 2, 2, 2)));
-    float ret3 = _mm_cvtss_f32(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 3, 3, 3)));
+    __m128 min0 = _mm_min_ps(v0, v1);
+    __m128 min1 = _mm_min_ps(v2, v3);
+    __m128 min = _mm_min_ps(min0, min1);
+    float ret0 = _mm_cvtss_f32(min);
+    float ret1 = _mm_cvtss_f32(_mm_shuffle_ps(min, min, _MM_SHUFFLE(1, 1, 1, 1)));
+    float ret2 = _mm_cvtss_f32(_mm_shuffle_ps(min, min, _MM_SHUFFLE(2, 2, 2, 2)));
+    float ret3 = _mm_cvtss_f32(_mm_shuffle_ps(min, min, _MM_SHUFFLE(3, 3, 3, 3)));
     return (int8_t)(Min(Min(ret0, ret1), Min(ret2, ret3)));
 #else
     int8_t min = Vector[0];
-    for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
+    for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
         min = Min(min, Vector[i]);
     }
     return min;
@@ -561,8 +564,7 @@ GiCvtFromFloat32ToInt8(GI_FLOAT32 src) {
 #if __ARM_ARCH >= 8
     int32x4_t vres0 = vcvtaq_s32_f32(src);
     int16x8_t mid_s16 = vcombine_s16(vqmovn_s32(vres0), vqmovn_s32(vres0));
-    int8x8_t ret = vqmovn_s16(vcombine_s16(vqmovn_s32(mid_s16), vqmovn_s32(mid_s16)));
-    return vcombine_s16(ret, ret);
+    return vcombine_s8(vqmovn_s16(mid_s16), vqmovn_s16(mid_s16));
 #else
     float32x4_t vzero = vdupq_n_f32(0.f);
     float32x4_t vfhalf = vdupq_n_f32(0.5f);
@@ -570,8 +572,7 @@ GiCvtFromFloat32ToInt8(GI_FLOAT32 src) {
     float32x4_t vinc0 = vbslq_f32(vcgeq_f32(src, vzero), vfhalf, vfneg_half);
     int32x4_t vres0 = vcvtq_s32_f32(vaddq_f32(src, vinc0));
     int16x8_t mid_s16 = vcombine_s16(vqmovn_s32(vres0), vqmovn_s32(vres0));
-    int8x8_t ret = vqmovn_s16(vcombine_s16(vqmovn_s32(mid_s16), vqmovn_s32(mid_s16)));
-    return vcombine_s16(ret, ret);
+    return vcombine_s8(vqmovn_s16(mid_s16), vqmovn_s16(mid_s16));
 #endif
 #elif defined(GI_SSE42_INTRINSICS)
     __m128 vfzero = _mm_set1_ps(0.f);
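The two hunks above fix the NEON narrowing chain in GiCvtFromFloat32ToInt8: `mid_s16` is already an `int16x8_t`, so it has to be narrowed with `vqmovn_s16` and recombined with `vcombine_s8` rather than run through `vqmovn_s32`/`vcombine_s16` again. A scalar sketch (illustrative only) of the per-lane conversion both branches implement, i.e. round half away from zero and saturate to the int8 range:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // One lane of the float32 -> int8 conversion used by the quantized paths.
    static inline int8_t cvt_f32_to_s8(float x) {
        float r = x >= 0.f ? std::floor(x + 0.5f) : std::ceil(x - 0.5f);
        r = std::min(127.f, std::max(-128.f, r));  // saturating narrow
        return (int8_t)r;
    }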
@@ -0,0 +1,81 @@
+/**
+ * \file dnn/src/arm_common/quantized_converter.h
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ */
+#pragma once
+#include "megdnn/dtype.h"
+#include "megdnn/oprs.h"
+#include "src/common/utils.h"
+#include "src/fallback/general_intrinsic/gi_float.h"
+#include "src/fallback/general_intrinsic/gi_int.h"
+namespace megdnn {
+namespace fallback {
+struct QConverterBase {
+    inline static GI_INT32 vzero() { return GiBroadcastInt32(0); }
+    inline static GI_FLOAT32 vfzero() { return GiBroadcastFloat32(0.f); }
+    inline static GI_FLOAT32 vfhalf() { return GiBroadcastFloat32(0.5f); }
+    inline static GI_FLOAT32 vfneg_half() { return GiBroadcastFloat32(-0.5f); }
+};
+struct QConverter {
+    template <typename dst_type, typename... src_type>
+    static inline dst_type convert(const src_type&... src);
+    template <typename dst_type, typename... src_type>
+    static inline dst_type round(const src_type&... src);
+};
+template <>
+inline dt_qint8 QConverter::convert(const float& src) {
+    return dt_qint8(saturate<int8_t, float>(std::round(src), -128, 127));
+}
+template <>
+inline dt_quint8 QConverter::convert(const float& src, const uint8_t& zp) {
+    return dt_quint8(saturate<uint8_t, float>(std::round(src) + zp, 0, 255));
+}
+template <>
+inline dt_qint32 QConverter::convert(const float& src) {
+    return dt_qint32(saturate<int32_t, float>(
+            std::round(src), static_cast<float>(std::numeric_limits<int32_t>::min()),
+            static_cast<float>(std::numeric_limits<int32_t>::max())));
+}
+template <>
+inline GI_FLOAT32_V2 QConverter::convert(const GI_INT16& vsrc) {
+    GI_INT32 vhi = GiMoveHighLongInt16(vsrc);
+    GI_INT32 vlo = GiMoveLowLongInt16(vsrc);
+    return {{GiCastToFloat32(vlo), GiCastToFloat32(vhi)}};
+}
+template <>
+inline GI_INT8 QConverter::convert(const GI_FLOAT32_V2& vsrc) {
+    return GiCvtFromFloat32V2ToInt8(vsrc);
+}
+template <>
+inline GI_INT8 QConverter::convert(const GI_FLOAT32& src) {
+    return GiCvtFromFloat32ToInt8(src);
+}
+template <>
+inline GI_INT32 QConverter::round(const GI_FLOAT32& vsrc) {
+    return GiRoundAsInt32(vsrc);
+}
+} // namespace fallback
+} // namespace megdnn
+// vim: syntax=cpp.doxygen
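A short usage sketch of the new QConverter helper (hypothetical call site, assuming a translation unit that includes this header and the GI intrinsics above):

    #include "src/fallback/quantized_converter.h"

    void quantize_example() {
        using megdnn::fallback::QConverter;
        // Scalar: round and saturate a float into a qint8 value.
        megdnn::dt_qint8 q = QConverter::convert<megdnn::dt_qint8, float>(3.6f);  // -> 4
        // Vector: convert a whole GI_FLOAT32 register to GI_INT8 lanes.
        GI_FLOAT32 v = GiBroadcastFloat32(1.4f);
        GI_INT8 qv = QConverter::convert<GI_INT8, GI_FLOAT32>(v);  // each lane -> 1
    }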
@@ -14,11 +14,13 @@
 #include "src/naive/handle.h"
 #include "midout.h"
+#include "reducer.h"
 #include "src/common/reduce_helper.h"
 MIDOUT_DECL(megdnn_fb_reduce_op)
 MIDOUT_DECL(megdnn_fb_reduce_c)
 MIDOUT_DECL(megdnn_fb_reduce_dtype)
+MIDOUT_DECL(megdnn_fallback_reduce_optimized)
 namespace {
@@ -77,11 +79,20 @@ namespace fallback {
 void ReduceImpl::exec(
         _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) {
+    check_exec(src.layout, dst.layout, workspace.size);
+    if (!exec_optimized(src, dst, workspace)) {
+        return exec_fallback(src, dst, workspace);
+    }
+}
+void ReduceImpl::exec_fallback(
+        _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) {
     using namespace reduce;
     using Mode = Param::Mode;
     check_exec(src.layout, dst.layout, workspace.size);
     size_t A, B, C;
     get_ABC(src.layout, A, B, C, param().axis);
 #define cb_by_op(src_type, dst_type, _wtype, mode_, Op_, kern_func) \
     if (param().mode == mode_) {                                    \
         typedef DTypeTrait<src_type>::ctype src_ctype;              \
@@ -176,6 +187,101 @@ void ReduceImpl::exec(
     naive::ReduceForwardImpl::exec(src, dst, workspace);
 }
+bool ReduceImpl::exec_optimized(
+        _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace) {
+    size_t A, B, C;
+    reduce::get_ABC(src.layout, A, B, C, param().axis);
+    bool execed = false;
+    using Mode = param::Reduce::Mode;
+#define DISPATCH_FUNC(Reducer, dtype, ctype, comp_type)                           \
+    if (C == 1) {                                                                 \
+        using _Reducer = Reducer<dtype, ctype, comp_type, true>;                  \
+        std::function<void(const ctype*, ctype*, DType, size_t, size_t, size_t)>  \
+                do_reduce = Exec<_Reducer, true>::do_reduce;                      \
+        MIDOUT_BEGIN(                                                             \
+                megdnn_fallback_reduce_optimized, ctype, dtype, comp_type,        \
+                midout_iv(0)) {                                                   \
+            MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce(                               \
+                    reinterpret_cast<ctype*>(src.raw_ptr()),                      \
+                    reinterpret_cast<ctype*>(dst.raw_ptr()), src_type, A, B, C)); \
+            execed = true;                                                        \
+        }                                                                         \
+        MIDOUT_END();                                                             \
+    } else {                                                                      \
+        using _Reducer = Reducer<dtype, ctype, comp_type, false>;                 \
+        std::function<void(const ctype*, ctype*, DType, size_t, size_t, size_t)>  \
+                do_reduce = Exec<_Reducer, false>::do_reduce;                     \
+        MIDOUT_BEGIN(                                                             \
+                megdnn_fallback_reduce_optimized, ctype, dtype, comp_type,        \
+                midout_iv(1)) {                                                   \
+            MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce(                               \
+                    reinterpret_cast<ctype*>(src.raw_ptr()),                      \
+                    reinterpret_cast<ctype*>(dst.raw_ptr()), src_type, A, B, C)); \
+            execed = true;                                                        \
+        }                                                                         \
+        MIDOUT_END();                                                             \
+    }
+#define DISPATCH_MODE_QUANTIZED(dtype, ctype, comp_type)         \
+    switch (param().mode) {                                      \
+        case Mode::MEAN:                                         \
+            DISPATCH_FUNC(MeanReducer, dtype, ctype, comp_type); \
+            break;                                               \
+        case Mode::MAX:                                          \
+            DISPATCH_FUNC(maxReducer, dtype, ctype, ctype);      \
+            break;                                               \
+        case Mode::MIN:                                          \
+            DISPATCH_FUNC(minReducer, dtype, ctype, ctype);      \
+            break;                                               \
+        default:                                                 \
+            break;                                               \
+    }
+#define DISPATCH_MODE_FLOAT(dtype, ctype, comp_type)             \
+    switch (param().mode) {                                      \
+        case Mode::MEAN:                                         \
+            DISPATCH_FUNC(MeanReducer, dtype, ctype, comp_type); \
+            break;                                               \
+        case Mode::MAX:                                          \
+            DISPATCH_FUNC(maxReducer, dtype, ctype, ctype);      \
+            break;                                               \
+        case Mode::MIN:                                          \
+            DISPATCH_FUNC(minReducer, dtype, ctype, ctype);      \
+            break;                                               \
+        case Mode::SUM:                                          \
+            DISPATCH_FUNC(SumReducer, dtype, ctype, ctype);      \
+            break;                                               \
+        case Mode::SUM_SQR:                                      \
+            DISPATCH_FUNC(SumSqrReducer, dtype, ctype, ctype);   \
+            break;                                               \
+        case Mode::PRODUCT:                                      \
+            DISPATCH_FUNC(ProductReducer, dtype, ctype, ctype);  \
+            break;                                               \
+        default:                                                 \
+            break;                                               \
+    }
+    if (src.layout.is_contiguous() &&
+        src.layout.dtype.category() == DTypeCategory::QUANTIZED &&
+        param().data_type == param::Reduce::DataType::DEFAULT) {
+        DType src_type = src.layout.dtype;
+        if (src.layout.dtype.enumv() == DTypeEnum::QuantizedS8) {
+            DISPATCH_MODE_QUANTIZED(dt_qint8, int8_t, int32_t)
+        }
+    } else if (
+            src.layout.is_contiguous() &&
+            src.layout.dtype.category() == DTypeCategory::FLOAT &&
+            param().data_type == param::Reduce::DataType::DEFAULT) {
+        DType src_type = src.layout.dtype;
+        if (src.layout.dtype.enumv() == DTypeEnum::Float32) {
+            DISPATCH_MODE_FLOAT(dt_float32, float, float)
+        }
+    }
+    return execed;
+#undef DISPATCH_FUNC
+#undef DISPATCH_MODE_QUANTIZED
+#undef DISPATCH_MODE_FLOAT
+}
 } // namespace fallback
 } // namespace megdnn
 // vim: syntax=cpp.doxygen
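The dispatch above hinges on the A/B/C decomposition: reduce::get_ABC collapses the input layout around the reduce axis, with A the product of the leading dimensions, B the reduced dimension itself, and C the product of the trailing ones; C == 1 selects the contiguous reducer specialization. A sketch of that decomposition (an assumption about get_ABC, which lives in src/common/reduce_helper.h and is not shown in this diff):

    #include <cstddef>
    #include <vector>

    void get_abc(const std::vector<size_t>& shape, size_t axis,
                 size_t& A, size_t& B, size_t& C) {
        A = 1;
        for (size_t i = 0; i < axis; ++i)
            A *= shape[i];
        B = shape[axis];  // the dimension being reduced
        C = 1;
        for (size_t i = axis + 1; i < shape.size(); ++i)
            C *= shape[i];  // C == 1 when the reduce axis is innermost
    }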
@@ -19,6 +19,10 @@ public:
     using ReduceForwardImpl::ReduceForwardImpl;
     void exec(
             _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace) override;
+    bool exec_optimized(
+            _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace);
+    void exec_fallback(
+            _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace);
 };
 } // namespace fallback
@@ -0,0 +1,417 @@
+/**
+ * \file dnn/src/fallback/reduce/reducer.h
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+#pragma once
+#include "src/common/utils.h"
+#include "src/fallback/general_intrinsic/gi_float.h"
+#include "src/fallback/general_intrinsic/gi_int.h"
+#include "src/fallback/quantized_converter.h"
+using namespace megdnn;
+using namespace fallback;
+namespace {
+/*****************************Mean Reducer***********************/
+template <typename dtype, typename ctype, typename comp_type, bool C1>
+struct MeanReducer;
+template <>
+struct MeanReducer<dt_qint8, int8_t, int32_t, true> {
+    using ctype = int8_t;
+    static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t);
+    int32_t res;
+    float coef;
+    MeanReducer(DType, size_t cnt) : res(0), coef(1.0 / cnt) {}
+    MeanReducer() = default;
+    void feed(const int8_t* val) { res += GiReduceAddInt8(GiLoadInt8(val)); }
+    void feed_remain(const int8_t* val) { res += *val; }
+    void post(int8_t* dst) {
+        float sum = res * coef;
+        *dst = std::round(sum);
+    }
+};
+template <>
+struct MeanReducer<dt_qint8, int8_t, int32_t, false> {
+    using ctype = int8_t;
+    static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t);
+    GI_INT32 res[4];
+    int32_t remain;
+    int32_t cnt;
+    float coef;
+    GI_FLOAT32 vcoef;
+    MeanReducer(DType, size_t cnt) : remain(0), cnt(cnt), coef(1.0 / cnt) {
+        memset(res, 0, sizeof(res));
+        vcoef = GiBroadcastFloat32(coef);
+    }
+    MeanReducer() = default;
+    void feed(const int8_t* val) {
+        const GI_INT8 vval = GiLoadInt8(val);
+        const GI_INT16 vval_low = GiMoveLowLongInt8(vval);
+        const GI_INT16 vval_high = GiMoveHighLongInt8(vval);
+        const GI_INT32 vval_low_low = GiMoveLowLongInt16(vval_low);
+        const GI_INT32 vval_low_high = GiMoveHighLongInt16(vval_low);
+        const GI_INT32 vval_high_low = GiMoveLowLongInt16(vval_high);
+        const GI_INT32 vval_high_high = GiMoveHighLongInt16(vval_high);
+        res[0] = GiAddInt32(res[0], vval_low_low);
+        res[1] = GiAddInt32(res[1], vval_low_high);
+        res[2] = GiAddInt32(res[2], vval_high_low);
+        res[3] = GiAddInt32(res[3], vval_high_high);
+    }
+    void feed_remain(const int8_t* val) { remain += *val; }
+    void post(int8_t* dst) {
+        for (int i = 0; i < 4; i += 2) {
+            GI_FLOAT32 vitem0 = GiMultiplyFloat32(GiCastToFloat32(res[i]), vcoef);
+            GI_FLOAT32 vitem1 = GiMultiplyFloat32(GiCastToFloat32(res[i + 1]), vcoef);
+            GiStoreLowInt8(
+                    dst,
+                    (QConverter::convert<GI_INT8, GI_FLOAT32_V2>({{vitem0, vitem1}})));
+            dst += 8;
+        }
+    }
+    void post_remain(int8_t* dst) {
+        float sum = remain * coef;
+        *dst = std::round(sum);
+    }
+};
+template <>
+struct MeanReducer<dt_float32, float, float, true> {
+    using ctype = float;
+    static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float);
+    GI_FLOAT32 res;
+    float result;
+    float coef;
+    MeanReducer(DType, size_t cnt) : result(0.0f), coef(1.0 / cnt) {
+        res = GiBroadcastFloat32(0.0f);
+    }
+    MeanReducer() = default;
+    void feed(const float* val) { res = GiAddFloat32(GiLoadFloat32(val), res); }
+    void feed_remain(const float* val) { result += *val; }
+    void post(float* dst) {
+        result += GiReduceAddFloat32(res);
+        *dst = result * coef;
+    }
+};
+template <>
+struct MeanReducer<dt_float32, float, float, false> {
+    using ctype = float;
+    static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float);
+    GI_FLOAT32 res;
+    float remain;
+    float coef;
+    MeanReducer(DType, size_t cnt) : remain(0.0f), coef(1.0 / cnt) {
+        res = GiBroadcastFloat32(0.0f);
+    }
+    MeanReducer() = default;
+    void feed(const float* val) { res = GiAddFloat32(GiLoadFloat32(val), res); }
+    void feed_remain(const float* val) { remain += *val; }
+    void post(float* dst) {
+        res = GiMultiplyScalerFloat32(res, coef);
+        GiStoreFloat32(dst, res);
+    }
+    void post_remain(float* dst) { *dst = remain * coef; }
+};
+/******************************max min Reducer****************************/
+template <typename dtype, typename ctype, typename comp_type, bool C1>
+struct maxReducer;
+template <typename dtype, typename ctype, typename comp_type, bool C1>
+struct minReducer;
+#define REDUCER_MAX_MIN_C1(_mode, _Mode, _init)                              \
+    template <>                                                              \
+    struct _mode##Reducer<dt_float32, float, float, true> {                  \
+        using ctype = float;                                                 \
+        static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float);  \
+        GI_FLOAT32 res;                                                      \
+        _mode##Reducer(DType, size_t) { res = GiBroadcastFloat32(_init); }   \
+        _mode##Reducer() = default;                                          \
+        void feed(const float* val) {                                        \
+            auto vval = GiLoadFloat32(val);                                  \
+            res = Gi##_Mode##imumFloat32(vval, res);                         \
+        }                                                                    \
+        void feed_remain(const float* val) {                                 \
+            auto vval = GiBroadcastFloat32(*val);                            \
+            res = Gi##_Mode##imumFloat32(vval, res);                         \
+        }                                                                    \
+        void post(float* dst) { *dst = GiReduce##_Mode##imumFloat32(res); }  \
+    }
+REDUCER_MAX_MIN_C1(max, Max, std::numeric_limits<dt_float32>::lowest());
+REDUCER_MAX_MIN_C1(min, Min, std::numeric_limits<dt_float32>::max());
+#undef REDUCER_MAX_MIN_C1
+#define REDUCER_MAX_MIN_C(_mode, _Mode, _init)                               \
+    template <>                                                              \
+    struct _mode##Reducer<dt_float32, float, float, false> {                 \
+        using ctype = float;                                                 \
+        static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float);  \
+        GI_FLOAT32 res;                                                      \
+        float remain;                                                        \
+        _mode##Reducer(DType, size_t) {                                      \
+            res = GiBroadcastFloat32(_init);                                 \
+            remain = _init;                                                  \
+        }                                                                    \
+        _mode##Reducer() = default;                                          \
+        void feed(const float* val) {                                        \
+            GI_FLOAT32 vval = GiLoadFloat32(val);                            \
+            res = Gi##_Mode##imumFloat32(vval, res);                         \
+        }                                                                    \
+        void feed_remain(const float* val) {                                 \
+            using namespace std;                                             \
+            remain = _mode(*val, remain);                                    \
+        }                                                                    \
+        void post(float* dst) { GiStoreFloat32(dst, res); }                  \
+        void post_remain(float* dst) { *dst = remain; }                      \
+    }
+REDUCER_MAX_MIN_C(max, Max, std::numeric_limits<dt_float32>::lowest());
+REDUCER_MAX_MIN_C(min, Min, std::numeric_limits<dt_float32>::max());
+#undef REDUCER_MAX_MIN_C
+#define REDUCER_MAX_MIN_C1(_mode, _Mode, _init)                               \
+    template <>                                                               \
+    struct _mode##Reducer<dt_qint8, int8_t, int8_t, true> {                   \
+        using ctype = int8_t;                                                 \
+        static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t);  \
+        GI_INT8 res;                                                          \
+        _mode##Reducer(DType, size_t) { res = GiBroadcastInt8(_init); }       \
+        _mode##Reducer() = default;                                           \
+        void feed(const int8_t* val) {                                        \
+            GI_INT8 vval = GiLoadInt8(val);                                   \
+            res = Gi##_Mode##imumInt8(vval, res);                             \
+        }                                                                     \
+        void feed_remain(const int8_t* val) {                                 \
+            GI_INT8 vval = GiBroadcastInt8(*val);                             \
+            res = Gi##_Mode##imumInt8(vval, res);                             \
+        }                                                                     \
+        void post(int8_t* dst) { *dst = GiReduce##_Mode##Int8(res); }         \
+    }
+REDUCER_MAX_MIN_C1(max, Max, -128);
+REDUCER_MAX_MIN_C1(min, Min, 127);
+#undef REDUCER_MAX_MIN_C1
+#define REDUCER_MAX_MIN_C(_mode, _Mode, _init)                                \
+    template <>                                                               \
+    struct _mode##Reducer<dt_qint8, int8_t, int8_t, false> {                  \
+        using ctype = int8_t;                                                 \
+        static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t);  \
+        GI_INT8 res;                                                          \
+        int8_t remain;                                                        \
+        _mode##Reducer(DType, size_t) {                                       \
+            res = GiBroadcastInt8(_init);                                     \
+            remain = _init;                                                   \
+        }                                                                     \
+        _mode##Reducer() = default;                                           \
+        void feed(const int8_t* val) {                                        \
+            GI_INT8 vval = GiLoadInt8(val);                                   \
+            res = Gi##_Mode##imumInt8(vval, res);                             \
+        }                                                                     \
+        void feed_remain(const int8_t* val) {                                 \
+            using namespace std;                                              \
+            remain = _mode(*val, remain);                                     \
+        }                                                                     \
+        void post(int8_t* dst) { GiStoreInt8(dst, res); }                     \
+        void post_remain(int8_t* dst) { *dst = remain; }                      \
+    }
+REDUCER_MAX_MIN_C(max, Max, -128);
+REDUCER_MAX_MIN_C(min, Min, 127);
+#undef REDUCER_MAX_MIN_C
+/***************************Sum Product Reducer***************************/
+template <typename dtype, typename ctype, typename comp_type, bool C1>
+struct SumReducer;
+template <typename dtype, typename ctype, typename comp_type, bool C1>
+struct ProductReducer;
+#define REDUCER_SUM_PRODUCT_C1(_mode, _Mode, _op, _init)                     \
+    template <>                                                              \
+    struct _mode##Reducer<dt_float32, float, float, true> {                  \
+        using ctype = float;                                                 \
+        static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float);  \
+        GI_FLOAT32 res;                                                      \
+        float remain;                                                        \
+        _mode##Reducer(DType, size_t) {                                      \
+            res = GiBroadcastFloat32(_init);                                 \
+            remain = _init;                                                  \
+        }                                                                    \
+        _mode##Reducer() = default;                                          \
+        void feed(const float* val) {                                        \
+            GI_FLOAT32 vval = GiLoadFloat32(val);                            \
+            res = Gi##_Mode##Float32(vval, res);                             \
+        }                                                                    \
+        void feed_remain(const float* val) {                                 \
+            using namespace std;                                             \
+            auto op = _op<float>();                                          \
+            remain = op(remain, *val);                                       \
+        }                                                                    \
+        void post(float* dst) {                                              \
+            using namespace std;                                             \
+            auto op = _op<float>();                                          \
+            *dst = op(remain, GiReduce##_Mode##Float32(res));                \
+        }                                                                    \
+    }
+REDUCER_SUM_PRODUCT_C1(Sum, Add, plus, 0.0f);
+REDUCER_SUM_PRODUCT_C1(Product, Multiply, multiplies, 1.0f);
+#undef REDUCER_SUM_PRODUCT_C1
+#define REDUCER_SUM_PRODUCT_C(_mode, _Mode, _op, _init)                      \
+    template <>                                                              \
+    struct _mode##Reducer<dt_float32, float, float, false> {                 \
+        using ctype = float;                                                 \
+        static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float);  \
+        GI_FLOAT32 res;                                                      \
+        float remain;                                                        \
+        _mode##Reducer(DType, size_t) {                                      \
+            res = GiBroadcastFloat32(_init);                                 \
+            remain = _init;                                                  \
+        }                                                                    \
+        _mode##Reducer() = default;                                          \
+        void feed(const float* val) {                                        \
+            GI_FLOAT32 vval = GiLoadFloat32(val);                            \
+            res = Gi##_Mode##Float32(vval, res);                             \
+        }                                                                    \
+        void feed_remain(const float* val) {                                 \
+            using namespace std;                                             \
+            auto op = _op<float>();                                          \
+            remain = op(remain, (*val));                                     \
+        }                                                                    \
+        void post(float* dst) { GiStoreFloat32(dst, res); }                  \
+        void post_remain(float* dst) { *dst = remain; }                      \
+    }
+REDUCER_SUM_PRODUCT_C(Sum, Add, plus, 0.0f);
+REDUCER_SUM_PRODUCT_C(Product, Multiply, multiplies, 1.0f);
+#undef REDUCER_SUM_PRODUCT_C
+/***************************SumSqr Reducer***************************/
+template <typename dtype, typename ctype, typename comp_type, bool C1>
+struct SumSqrReducer;
+template <>
+struct SumSqrReducer<dt_float32, float, float, true> {
+    using ctype = float;
+    static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float);
+    GI_FLOAT32 res;
+    float result;
+    SumSqrReducer(DType, size_t cnt) : result(0.0f) {
+        MEGDNN_MARK_USED_VAR(cnt);
+        res = GiBroadcastFloat32(0.0f);
+    }
+    SumSqrReducer() = default;
+    void feed(const float* val) {
+        GI_FLOAT32 vval = GiLoadFloat32(val);
+        res = GiAddFloat32(GiMultiplyFloat32(vval, vval), res);
+    }
+    void feed_remain(const float* val) {
+        float vval = *val;
+        result += vval * vval;
+    }
+    void post(float* dst) {
+        result += GiReduceAddFloat32(res);
+        *dst = result;
+    }
+};
+template <>
+struct SumSqrReducer<dt_float32, float, float, false> {
+    using ctype = float;
+    static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float);
+    GI_FLOAT32 res;
+    float remain;
+    SumSqrReducer(DType, size_t cnt) : remain(0.0f) {
+        MEGDNN_MARK_USED_VAR(cnt);
+        res = GiBroadcastFloat32(0.0f);
+    }
+    SumSqrReducer() = default;
+    void feed(const float* val) {
+        GI_FLOAT32 vval = GiLoadFloat32(val);
+        res = GiAddFloat32(GiMultiplyFloat32(vval, vval), res);
+    }
+    void feed_remain(const float* val) { remain += (*val) * (*val); }
+    void post(float* dst) { GiStoreFloat32(dst, res); }
+    void post_remain(float* dst) { *dst = remain; }
+};
+/**************************************do reduce*************************/
+template <typename Reducer, bool C1>
+struct Exec {
+    static void do_reduce(
+            const typename Reducer::ctype* src, typename Reducer::ctype* dst,
+            DType src_dtype, size_t A, size_t B, size_t C);
+};
+template <typename Reducer>
+struct Exec<Reducer, true> {
+    static void do_reduce(
+            const typename Reducer::ctype* src, typename Reducer::ctype* dst,
+            DType src_dtype, size_t A, size_t B, size_t) {
+        size_t a = 0;
+        for (; a < A; a++) {
+            Reducer reducer0(src_dtype, B);
+            auto temp_src0 = src + a * B;
+            size_t b = 0;
+            for (; b + Reducer::SIMD_WIDTH <= B; b += Reducer::SIMD_WIDTH) {
+                reducer0.feed(temp_src0);
+                temp_src0 += Reducer::SIMD_WIDTH;
+            }
+            for (; b < B; b++) {
+                reducer0.feed_remain(temp_src0);
+                temp_src0++;
+            }
+            reducer0.post(dst);
+            dst++;
+        }
+    }
+};
+template <typename Reducer>
+struct Exec<Reducer, false> {
+    static void do_reduce(
+            const typename Reducer::ctype* src, typename Reducer::ctype* dst,
+            DType src_dtype, size_t A, size_t B, size_t C) {
+        for (size_t a = 0; a < A; a++) {
+            size_t c = 0;
+            for (; c + Reducer::SIMD_WIDTH <= C; c += Reducer::SIMD_WIDTH) {
+                Reducer reducer(src_dtype, B);
+                for (size_t b = 0; b < B; b++)
+                    reducer.feed(src + c + C * b);
+                reducer.post(dst);
+                dst += Reducer::SIMD_WIDTH;
+            }
+            for (; c < C; c++) {
+                Reducer reducer(src_dtype, B);
+                for (size_t b = 0; b < B; b++)
+                    reducer.feed_remain(src + c + C * b);
+                reducer.post_remain(dst);
+                dst++;
+            }
+            src += B * C;
+        }
+    }
+};
+} // namespace
+// vim: syntax=cpp.doxygen
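A minimal usage sketch of the Reducer/Exec pattern above (hypothetical buffers; this is essentially what exec_optimized reaches through its do_reduce std::function, assuming a translation unit that includes reducer.h):

    // Mean over the last axis of a contiguous (A, B) float tensor: the C == 1 path.
    void mean_over_last_axis(const float* src, float* dst, size_t A, size_t B) {
        using Reducer = MeanReducer<megdnn::dt_float32, float, float, true>;
        Exec<Reducer, true>::do_reduce(src, dst, megdnn::dtype::Float32(), A, B, 1);
    }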
@@ -181,7 +181,6 @@ TEST_F(ARM_COMMON, LSTM_FORWARD_RECORD) {
 TEST_F(ARM_COMMON, BENCHMARK_LSTM_FORWARD) {
     Benchmarker<LSTM> optimized_bench(handle());
-    constexpr size_t RUNS = 20;
     auto run = [&](size_t hidden_size, size_t input_size) {
         optimized_bench.set_times(20).set_display(true);
         size_t gate_hidden_size = 4 * hidden_size;
@@ -18,6 +18,75 @@
 using namespace megdnn;
 using namespace test;
+TEST_F(FALLBACK, REDUCE_FULL) {
+    using Param = Reduce::Param;
+    using Mode = Param::Mode;
+    Checker<Reduce> checker(handle());
+    UniformIntRNG rng{INT8_MIN >> 1, INT8_MAX >> 1};
+    checker.set_rng(0, &rng);
+    struct Config {
+        Param param;
+        DType dtype;
+        TensorShape shape;
+        Config(Param param, DType dtype, TensorShape shape)
+                : param(param), dtype(dtype), shape(shape) {}
+    };
+    std::vector<Config> configs;
+    for (auto mode : {Mode::MEAN, Mode::MAX, Mode::MIN})
+        for (auto dtype : std::vector<DType>{
+                     dtype::Float32(), dtype::Float16(), dtype::QuantizedS8(1.3f),
+                     dtype::Quantized8Asymm(1.3f, static_cast<uint8_t>(3))})
+            for (int32_t axis : {0, 1, 2}) {
+                for (size_t A : {1, 3, 5}) {
+                    for (size_t B : {4, 6, 9, 16, 33, 45}) {
+                        for (size_t C : {4, 6, 9, 16, 33, 45}) {
+                            TensorShape shape{A, B, C};
+                            Param param(mode, axis);
+                            Config config(param, dtype, shape);
+                            configs.push_back(config);
+                        }
+                    }
+                }
+            }
+    for (auto&& config : configs) {
+        auto&& dtype = config.dtype;
+        auto&& param = config.param;
+        auto&& shape = config.shape;
+        checker.set_dtype(0, dtype).set_param(param).execs({shape, {}});
+    }
+    configs.clear();
+    for (auto mode : {Mode::SUM, Mode::PRODUCT, Mode::SUM_SQR})
+        for (auto dtype : std::vector<DType>{dtype::Float32(), dtype::Float16()})
+            for (int32_t axis : {0, 1, 2}) {
+                for (size_t A : {1, 3, 5}) {
+                    for (size_t B : {4, 6, 9, 16, 33, 45}) {
+                        for (size_t C : {4, 6, 9, 16, 33, 45}) {
+                            TensorShape shape{A, B, C};
+                            Param param(mode, axis);
+                            Config config(param, dtype, shape);
+                            configs.push_back(config);
+                        }
+                    }
+                }
+            }
+    UniformFloatRNG rng_float(-2, 2);
+    checker.set_rng(0, &rng_float);
+    checker.set_epsilon(1e-1);
+    for (auto&& config : configs) {
+        auto&& dtype = config.dtype;
+        auto&& param = config.param;
+        auto&& shape = config.shape;
+        if (dtype == dtype::Float16())
+            checker.set_epsilon(1e-1);
+        else
+            checker.set_epsilon(1e-3);
+        checker.set_dtype(0, dtype).set_param(param).execs({shape, {}});
+    }
+}
 TEST_F(FALLBACK, REDUCE) {
     using Param = Reduce::Param;
     using Mode = Param::Mode;