
gi_float.h 40 kB

  1. /**
  2. * \file dnn/src/fallback/general_intrinsic/gi_float.h
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2022 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. */
  11. #pragma once
  12. #include "gi_common.h"
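//! float32 portion of the general intrinsic (GI) wrappers: each helper below
//! selects a NEON, SSE2, or plain-C fallback implementation at compile time
//! via the GI_*_INTRINSICS macros provided by gi_common.h.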
  13. GI_FORCEINLINE
  14. GI_INT32_t GiReinterpretAsInt32(GI_FLOAT32_t In) {
  15. #if defined(GI_NEON_INTRINSICS)
  16. return vreinterpretq_s32_f32(In);
  17. #elif defined(GI_SSE2_INTRINSICS)
  18. return _mm_castps_si128(In);
  19. #else
  20. return (GI_INT32_t)In;
  21. #endif
  22. }
  23. GI_FORCEINLINE
  24. GI_UINT32_t GiReinterpretAsUint32(GI_FLOAT32_t In) {
  25. #if defined(GI_NEON_INTRINSICS)
  26. return vreinterpretq_u32_f32(In);
  27. #elif defined(GI_SSE2_INTRINSICS)
  28. return _mm_castps_si128(In);
  29. #else
  30. return (GI_UINT32_t)In;
  31. #endif
  32. }
  33. GI_FORCEINLINE
  34. GI_FLOAT32_t GiReintInt32ToFloat32(GI_INT32_t Vector) {
  35. #if defined(GI_NEON_INTRINSICS)
  36. return vreinterpretq_f32_s32(Vector);
  37. #elif defined(GI_SSE2_INTRINSICS)
  38. return _mm_castsi128_ps(Vector);
  39. #else
  40. return (GI_FLOAT32_t)Vector;
  41. #endif
  42. }
  43. GI_FORCEINLINE
  44. GI_FLOAT32_t GiReintUint32ToFloat32(GI_UINT32_t Vector) {
  45. #if defined(GI_NEON_INTRINSICS)
  46. return vreinterpretq_f32_u32(Vector);
  47. #elif defined(GI_SSE2_INTRINSICS)
  48. return _mm_castsi128_ps(Vector);
  49. #else
  50. return (GI_FLOAT32_t)Vector;
  51. #endif
  52. }
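//! round each float lane to the nearest int32, ties away from zero
//! (vcvtaq_s32_f32 on ARMv8; add +/-0.5 and truncate, or round(), elsewhere).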
  53. GI_FORCEINLINE
  54. GI_INT32_t GiRoundAsInt32(GI_FLOAT32_t Vector) {
  55. #if defined(GI_NEON_INTRINSICS)
  56. #if __ARM_ARCH >= 8
  57. return vcvtaq_s32_f32(Vector);
  58. #else
  59. float32x4_t vinc0 = vbslq_f32(vcgeq_f32(Vector, vfzero), vfhalf, vfneg_half);
  60. return vcvtq_s32_f32(vaddq_f32(Vector, vinc0));
  61. #endif
  62. #elif defined(GI_SSE42_INTRINSICS)
  63. __m128 vinc0 = _mm_blendv_ps(vfneg_half, vfhalf, _mm_cmpge_ps(Vector, vfzero));
  64. return _mm_cvttps_epi32(_mm_add_ps(Vector, vinc0));
  65. #else
  66. GI_INT32_t ret;
  67. GI_INT32_NAIVE_t tmp_ret;
  68. GI_FLOAT32_NAIVE_t s0;
  69. memcpy(&s0, &Vector, sizeof(GI_FLOAT32_NAIVE_t));
  70. for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
  71. tmp_ret[i] = (int32_t)round(s0[i]);
  72. }
  73. memcpy(&ret, &tmp_ret, sizeof(GI_INT32_t));
  74. return ret;
  75. #endif
  76. }
  77. GI_FORCEINLINE
  78. GI_INT32_t GiCastToInt32(GI_FLOAT32_t Vector) {
  79. #if defined(GI_NEON_INTRINSICS)
  80. return vcvtq_s32_f32(Vector);
  81. #elif defined(GI_SSE2_INTRINSICS)
  82. return _mm_cvttps_epi32(Vector);
  83. #else
  84. GI_INT32_t ret;
  85. for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
  86. ret[i] = (int32_t)(Vector[i]);
  87. }
  88. return ret;
  89. #endif
  90. }
  91. GI_FORCEINLINE
  92. GI_FLOAT32_t GiCastToFloat32(GI_INT32_t Vector) {
  93. #if defined(GI_NEON_INTRINSICS)
  94. return vcvtq_f32_s32(Vector);
  95. #elif defined(GI_SSE2_INTRINSICS)
  96. return _mm_cvtepi32_ps(Vector);
  97. #else
  98. GI_FLOAT32_t ret;
  99. for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
  100. ret[i] = (float)Vector[i];
  101. }
  102. return ret;
  103. #endif
  104. }
  105. GI_FORCEINLINE
  106. GI_FLOAT32_t GiLoadBroadcastFloat32(const float* Value) {
  107. #if defined(GI_NEON_INTRINSICS)
  108. return vld1q_dup_f32(Value);
  109. #elif defined(GI_SSE2_INTRINSICS)
  110. return _mm_load_ps1(Value);
  111. #else
  112. GI_FLOAT32_t ret;
  113. for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
  114. ret[i] = *Value;
  115. }
  116. return ret;
  117. #endif
  118. }
  119. GI_FORCEINLINE
  120. GI_FLOAT32_t GiZeroFloat32(void) {
  121. #if defined(GI_NEON_INTRINSICS)
  122. return vdupq_n_f32(0.0f);
  123. #elif defined(GI_SSE2_INTRINSICS)
  124. return _mm_setzero_ps();
  125. #else
  126. return GiBroadcastFloat32(0.0f);
  127. #endif
  128. }
  129. GI_FORCEINLINE
  130. GI_FLOAT32_t GiLoadFloat32(const float* Buffer) {
  131. #if defined(GI_NEON_INTRINSICS)
  132. return vld1q_f32(Buffer);
  133. #elif defined(GI_SSE2_INTRINSICS)
  134. if ((((uintptr_t)(Buffer)) & 15) == 0)
  135. return _mm_load_ps(Buffer);
  136. else
  137. return _mm_loadu_ps(Buffer);
  138. #else
  139. GI_FLOAT32_t ret;
  140. for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
  141. ret[i] = Buffer[i];
  142. }
  143. return ret;
  144. #endif
  145. }
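//! load two floats into the low half of the result vector; the high half is zeroed.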
  146. GI_FORCEINLINE
  147. GI_FLOAT32_t GiLoadFloat32LowHalf(const float* Buffer) {
  148. #if defined(GI_NEON_INTRINSICS)
  149. return vcombine_f32(vld1_f32(Buffer), vdup_n_f32(0.f));
  150. #elif defined(GI_SSE2_INTRINSICS)
  151. typedef __m64_128 float32x2_t;
  152. float32x2_t low, high;
  153. low.m64_f32[0] = Buffer[0];
  154. low.m64_f32[1] = Buffer[1];
  155. high.m64_f32[0] = 0;
  156. high.m64_f32[1] = 0;
  157. __m128i res = _mm_unpacklo_epi64(_pM128i(low), _pM128i(high));
  158. return _M128(res);
  159. #else
  160. GI_FLOAT32_t ret;
  161. memset(&ret, 0, sizeof(GI_FLOAT32_t));
  162. for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float) / 2; i++) {
  163. ret[i] = Buffer[i];
  164. }
  165. return ret;
  166. #endif
  167. }
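//! multiply-accumulate: a + b * c per lane (fused on targets that provide FMA).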
  168. GI_FORCEINLINE
  169. GI_FLOAT32_t GiMlaqFloat32(GI_FLOAT32_t a, GI_FLOAT32_t b, GI_FLOAT32_t c) {
  170. #if defined(GI_NEON_INTRINSICS)
  171. #if defined(__ARM_FEATURE_FMA)
  172. return vfmaq_f32(a, b, c);
  173. #else
  174. return vmlaq_f32(a, b, c);
  175. #endif
  176. #elif defined(GI_SSE2_INTRINSICS)
  177. // plain SSE2 has no FMA instruction, so fall back to a separate multiply and add:
  178. __m128 res;
  179. res = _mm_mul_ps(c, b);
  180. return _mm_add_ps(a, res);
  181. #else
  182. GI_FLOAT32_t ret;
  183. for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
  184. ret[i] = a[i] + (b[i] * c[i]);
  185. }
  186. return ret;
  187. #endif
  188. }
  189. GI_FORCEINLINE GI_FLOAT32_V2_t GiUzpqFloat32(GI_FLOAT32_t a, GI_FLOAT32_t b) {
  190. #if defined(GI_NEON_INTRINSICS)
  191. return vuzpq_f32(a, b);
  192. #elif defined(GI_SSE2_INTRINSICS)
  193. GI_FLOAT32_V2_t v32x4;
  194. v32x4.val[0] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
  195. v32x4.val[1] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
  196. return v32x4;
  197. #else
  198. GI_FLOAT32_V2_t ret;
  199. ret.val[0][0] = a[0];
  200. ret.val[0][1] = a[2];
  201. ret.val[0][2] = b[0];
  202. ret.val[0][3] = b[2];
  203. ret.val[1][0] = a[1];
  204. ret.val[1][1] = a[3];
  205. ret.val[1][2] = b[1];
  206. ret.val[1][3] = b[3];
  207. return ret;
  208. #endif
  209. }
  210. GI_FORCEINLINE float32x2_t GiDupFloat32(float a) {
  211. #if defined(GI_NEON_INTRINSICS)
  212. return vdup_n_f32(a);
  213. #elif defined(GI_SSE2_INTRINSICS)
  214. float32x2_t res;
  215. res.m64_f32[0] = a;
  216. res.m64_f32[1] = a;
  217. return res;
  218. #else
  219. float32x2_t res;
  220. res[0] = a;
  221. res[1] = a;
  222. return res;
  223. #endif
  224. }
  225. GI_FORCEINLINE float32x2_t GiLdFloat32(float const* ptr) {
  226. #if defined(GI_NEON_INTRINSICS)
  227. return vld1_f32(ptr);
  228. #elif defined(GI_SSE2_INTRINSICS)
  229. float32x2_t res;
  230. res.m64_f32[0] = *(ptr);
  231. res.m64_f32[1] = *(ptr + 1);
  232. return res;
  233. #else
  234. float32x2_t res;
  235. res[0] = *(ptr);
  236. res[1] = *(ptr + 1);
  237. return res;
  238. #endif
  239. }
  240. GI_FORCEINLINE float32x2_t GiAddDFloat32(float32x2_t a, float32x2_t b) {
  241. #if defined(GI_NEON_INTRINSICS)
  242. return vadd_f32(a, b);
  243. #elif defined(GI_SSE2_INTRINSICS)
  244. __m128 res;
  245. __m64_128 res64;
  246. res = _mm_add_ps(_pM128(a), _pM128(b)); // SSE, use only low 64 bits
  247. _M64f(res64, res);
  248. return res64;
  249. #else
  250. float32x2_t res;
  251. res[0] = a[0] + b[0];
  252. res[1] = a[1] + b[1];
  253. return res;
  254. #endif
  255. }
  256. #if defined(GI_NEON_INTRINSICS)
  257. #define GiGetLaneFloat32(v, lane) vget_lane_f32(v, lane)
  258. #else
  259. GI_FORCEINLINE float __gi_vget_lane_f32(float32x2_t v, const int lane) {
  260. #if defined(GI_SSE2_INTRINSICS)
  261. return _sse_vget_lane_f32(v, lane);
  262. #else
  263. return v[lane];
  264. #endif
  265. }
  266. #define GiGetLaneFloat32(v, lane) __gi_vget_lane_f32(v, lane)
  267. #endif
  268. #if defined(GI_NEON_INTRINSICS)
  269. #define GiSetLaneFloat32(value, vec, lane) vset_lane_f32(value, vec, lane)
  270. #else
  271. GI_FORCEINLINE float32x2_t
  272. __gi_vset_lane_f32(float32_t value, float32x2_t vec, int lane) {
  273. #if defined(GI_SSE2_INTRINSICS)
  274. float32x2_t res;
  275. res = vec;
  276. res.m64_f32[lane] = value;
  277. return res;
  278. #else
  279. float32x2_t res;
  280. res = vec;
  281. res[lane] = value;
  282. return res;
  283. #endif
  284. }
  285. #define GiSetLaneFloat32(value, vec, lane) __gi_vset_lane_f32(value, vec, lane)
  286. #endif
  287. GI_FORCEINLINE void GiSt1Float32(float* ptr, float32x2_t val) {
  288. #if defined(GI_NEON_INTRINSICS)
  289. return vst1_f32(ptr, val);
  290. #elif defined(GI_SSE2_INTRINSICS)
  291. *(ptr) = val.m64_f32[0];
  292. *(ptr + 1) = val.m64_f32[1];
  293. return;
  294. #else
  295. *(ptr) = val[0];
  296. *(ptr + 1) = val[1];
  297. return;
  298. #endif
  299. }
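//! de-interleaving load of 8 floats: val[0] receives the even-indexed elements,
//! val[1] the odd-indexed ones (vld2q_f32 semantics).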
  300. GI_FORCEINLINE GI_FLOAT32_V2_t GiLd2qFloat32(const float* Buffer) {
  301. #if defined(GI_NEON_INTRINSICS)
  302. return vld2q_f32(Buffer);
  303. #elif defined(GI_SSE2_INTRINSICS)
  304. GI_FLOAT32_V2_t v;
  305. v.val[0] = GiLoadFloat32(Buffer);
  306. v.val[1] = GiLoadFloat32((Buffer + 4));
  307. v = GiUzpqFloat32(v.val[0], v.val[1]);
  308. return v;
  309. #else
  310. GI_FLOAT32_V2_t ret;
  311. ret.val[0][0] = Buffer[0];
  312. ret.val[0][1] = Buffer[2];
  313. ret.val[0][2] = Buffer[4];
  314. ret.val[0][3] = Buffer[6];
  315. ret.val[1][0] = Buffer[1];
  316. ret.val[1][1] = Buffer[3];
  317. ret.val[1][2] = Buffer[5];
  318. ret.val[1][3] = Buffer[7];
  319. return ret;
  320. #endif
  321. }
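//! GiExtqFloat32(a, b, n): the top (4 - n) lanes of a followed by the low n lanes
//! of b, matching NEON vextq_f32.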
  322. #if defined(GI_NEON_INTRINSICS)
  323. #define GiExtqFloat32(a, b, n) vextq_f32(a, b, n)
  324. #elif defined(GI_SSE2_INTRINSICS)
  325. #define GiExtqFloat32(a, b, n) _M128(_sse_vextq_s32(_M128i(a), _M128i(b), n))
  326. #else
  327. GI_FORCEINLINE GI_FLOAT32_t
  328. __naive_gi_vextq_f32(GI_FLOAT32_t a, GI_FLOAT32_t b, const int n) {
  329. GI_FLOAT32_t ret;
  330. int t_count = GI_SIMD_LEN_BYTE / sizeof(float);
  331. int a_count = t_count - n;
  332. for (int i = 0; i < a_count; i++) {
  333. ret[i] = a[i + n];
  334. }
  335. for (int i = 0; i < n; i++) {
  336. ret[i + a_count] = b[i];
  337. }
  338. return ret;
  339. }
  340. #define GiExtqFloat32(a, b, n) __naive_gi_vextq_f32(a, b, n)
  341. #endif
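//! multiply-subtract: VectorSum - Vector1 * Vector2 per lane.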
  342. GI_FORCEINLINE
  343. GI_FLOAT32_t GiMultiplySubFloat32(
  344. GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
  345. #if defined(GI_NEON_INTRINSICS)
  346. return vmlsq_f32(VectorSum, Vector1, Vector2);
  347. #elif defined(GI_SSE2_INTRINSICS)
  348. return _mm_sub_ps(VectorSum, _mm_mul_ps(Vector1, Vector2));
  349. #else
  350. GI_FLOAT32_t ret;
  351. for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
  352. ret[i] = VectorSum[i] - Vector1[i] * Vector2[i];
  353. }
  354. return ret;
  355. #endif
  356. }
  357. #if defined(GI_SSE2_INTRINSICS)
  358. GI_FORCEINLINE GI_FLOAT32_t
  359. _MM_INSERT_PS(GI_FLOAT32_t vec, GI_FLOAT32_t p, const int LANE) {
  360. _GI_ALIGN_16 uint32_t mask[4] = {0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff};
  361. __m128 tmp, vec_masked, p_masked;
  362. mask[LANE >> 4] = 0x0;
  363. vec_masked = _mm_and_ps(*(__m128*)mask, vec);
  364. p_masked = _mm_andnot_ps(*(__m128*)mask, p);
  365. tmp = _mm_or_ps(vec_masked, p_masked);
  366. return tmp;
  367. }
  368. GI_FORCEINLINE float32x2_t sse_vget_high_f32(GI_FLOAT32_t a) {
  369. __m128i res;
  370. __m64_128 res64;
  371. res = _mm_unpackhi_epi64(_M128i(a), _M128i(a));
  372. return64(res);
  373. }
  374. GI_FORCEINLINE float32x2_t sse_vget_low_f32(GI_FLOAT32_t a) {
  375. float32x2_t res64;
  376. _M64f(res64, a);
  377. return res64;
  378. }
  379. GI_FORCEINLINE GI_FLOAT32_t
  380. sse_vmlaq_lane_f32(GI_FLOAT32_t a, GI_FLOAT32_t b, float32x2_t v, int l) {
  381. float32_t vlane;
  382. GI_FLOAT32_t c;
  383. vlane = _sse_vget_lane_f32(v, l);
  384. c = _mm_set1_ps(vlane);
  385. return GiMlaqFloat32(a, b, c);
  386. }
  387. GI_FORCEINLINE int _MM_EXTRACT_PS(__m128 vec, const int LANE) {
  388. _GI_ALIGN_16 int32_t tmp[4];
  389. _mm_store_si128((__m128i*)tmp, _M128i(vec));
  390. return tmp[LANE];
  391. }
  392. GI_FORCEINLINE float32_t sse_vgetq_lane_f32(GI_FLOAT32_t vec, int lane) {
  393. float32_t floatVal;
  394. char* const floatVal_c = (char*)&floatVal;
  395. *((int32_t*)floatVal_c) = _MM_EXTRACT_PS(vec, lane);
  396. return floatVal;
  397. }
  398. GI_FORCEINLINE GI_FLOAT32_t
  399. sse_vmlsq_lane_f32(GI_FLOAT32_t a, GI_FLOAT32_t b, float32x2_t v, int l) {
  400. float32_t vlane;
  401. GI_FLOAT32_t c;
  402. vlane = (float)GiGetLaneFloat32(v, l);
  403. c = GiBroadcastFloat32(vlane);
  404. return GiMultiplySubFloat32(a, b, c);
  405. }
  406. #endif
  407. #if defined(GI_NEON_INTRINSICS)
  408. #define GiLd1qLaneFloat32(Buffer, src, n) vld1q_lane_f32(Buffer, src, n)
  409. #else
  410. GI_FORCEINLINE GI_FLOAT32_t
  411. __gi_vld1q_lane_f32(const float* Buffer, GI_FLOAT32_t src, const int n) {
  412. #if defined(GI_SSE2_INTRINSICS)
  413. GI_FLOAT32_t p;
  414. p = _mm_set1_ps(*(Buffer));
  415. return _MM_INSERT_PS(src, p, _INSERTPS_NDX(0, n));
  416. #else
  417. GI_FLOAT32_t ret;
  418. memcpy(&ret, &src, sizeof(GI_FLOAT32_t));
  419. ret[n] = *Buffer;
  420. return ret;
  421. #endif
  422. }
  423. #define GiLd1qLaneFloat32(Buffer, src, n) __gi_vld1q_lane_f32(Buffer, src, n)
  424. #endif
  425. #if defined(GI_NEON_INTRINSICS)
  426. #define GiSetqLaneFloat32(value, vec, lane) vsetq_lane_f32(value, vec, lane)
  427. #else
  428. GI_FORCEINLINE GI_FLOAT32_t
  429. __gi_vsetq_lane_f32(float value, GI_FLOAT32_t vec, const int lane) {
  430. float val = value;
  431. return GiLd1qLaneFloat32(&val, vec, lane);
  432. }
  433. #define GiSetqLaneFloat32(value, vec, lane) __gi_vsetq_lane_f32(value, vec, lane)
  434. #endif
  435. #if defined(GI_NEON_INTRINSICS)
  436. #define GiMlaqLaneFloat32HighHalf(a, b, v, lane) \
  437. vmlaq_lane_f32(a, b, vget_high_f32(v), lane)
  438. #elif defined(GI_SSE2_INTRINSICS)
  439. #define GiMlaqLaneFloat32HighHalf(a, b, v, lane) \
  440. sse_vmlaq_lane_f32(a, b, sse_vget_high_f32(v), lane)
  441. #else
  442. GI_FORCEINLINE GI_FLOAT32_t __naive_gi_vmlaq_lane_f32_high_half(
  443. GI_FLOAT32_t a, GI_FLOAT32_t b, GI_FLOAT32_t v, const int lane) {
  444. GI_FLOAT32_t ret;
  445. for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
  446. ret[i] = a[i] + (b[i] * v[lane + 2]);
  447. }
  448. return ret;
  449. }
  450. #define GiMlaqLaneFloat32HighHalf(a, b, v, lane) \
  451. __naive_gi_vmlaq_lane_f32_high_half(a, b, v, lane)
  452. #endif
  453. #if defined(GI_NEON_INTRINSICS)
  454. #define GiVmlaqLaneFloat32LowHalf(a, b, v, lane) \
  455. vmlaq_lane_f32(a, b, vget_low_f32(v), lane)
  456. #elif defined(GI_SSE2_INTRINSICS)
  457. #define GiVmlaqLaneFloat32LowHalf(a, b, v, lane) \
  458. sse_vmlaq_lane_f32(a, b, sse_vget_low_f32(v), lane)
  459. #else
  460. GI_FORCEINLINE GI_FLOAT32_t __naive_gi_vmlaq_lane_f32_low_half(
  461. GI_FLOAT32_t a, GI_FLOAT32_t b, GI_FLOAT32_t v, const int lane) {
  462. GI_FLOAT32_t ret;
  463. for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
  464. ret[i] = a[i] + (b[i] * v[lane]);
  465. }
  466. return ret;
  467. }
  468. #define GiVmlaqLaneFloat32LowHalf(a, b, v, lane) \
  469. __naive_gi_vmlaq_lane_f32_low_half(a, b, v, lane)
  470. #endif
  471. GI_FORCEINLINE
  472. void GiStoreFloat32(float* Buffer, GI_FLOAT32_t Vector) {
  473. #if defined(GI_NEON_INTRINSICS)
  474. vst1q_f32(Buffer, Vector);
  475. #elif defined(GI_SSE2_INTRINSICS)
  476. _mm_storeu_ps(Buffer, Vector);
  477. #else
  478. for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
  479. Buffer[i] = Vector[i];
  480. }
  481. #endif
  482. }
  483. #if defined(GI_NEON_INTRINSICS)
  484. #define GISTORELANEFLOAT32(i) \
  485. GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32_t Vector) { \
  486. vst1q_lane_f32(Buffer, Vector, i); \
  487. }
  488. #elif defined(GI_SSE2_INTRINSICS)
  489. #define GISTORELANEFLOAT32(i) \
  490. GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32_t Vector) { \
  491. _mm_store_ss(Buffer, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(i, i, i, i))); \
  492. }
  493. #else
  494. #define GISTORELANEFLOAT32(i) \
  495. GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32_t Vector) { \
  496. *Buffer = Vector[i]; \
  497. }
  498. #endif
  499. GISTORELANEFLOAT32(0)
  500. GISTORELANEFLOAT32(1)
  501. GISTORELANEFLOAT32(2)
  502. GISTORELANEFLOAT32(3)
  503. #undef GISTORELANEFLOAT32
  504. #if defined(GI_NEON_INTRINSICS)
  505. #define GIEXTRACTLANEFLOAT32(i) \
  506. GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32_t Vector) { \
  507. return vgetq_lane_f32(Vector, i); \
  508. }
  509. #elif defined(GI_SSE2_INTRINSICS)
  510. #define GIEXTRACTLANEFLOAT32(i) \
  511. GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32_t Vector) { \
  512. return _mm_cvtss_f32(_mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(i, i, i, i))); \
  513. }
  514. #else
  515. #define GIEXTRACTLANEFLOAT32(i) \
  516. GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32_t Vector) { \
  517. return Vector[i]; \
  518. }
  519. #endif
  520. GIEXTRACTLANEFLOAT32(0)
  521. GIEXTRACTLANEFLOAT32(1)
  522. GIEXTRACTLANEFLOAT32(2)
  523. GIEXTRACTLANEFLOAT32(3)
  524. #undef GIEXTRACTLANEFLOAT32
  525. GI_FORCEINLINE
  526. GI_FLOAT32_V2_t GiZipqFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
  527. #if defined(GI_NEON_INTRINSICS)
  528. return vzipq_f32(Vector1, Vector2);
  529. #elif defined(GI_SSE2_INTRINSICS)
  530. GI_FLOAT32_V2_t f32x4;
  531. f32x4.val[0] = _mm_unpacklo_ps(Vector1, Vector2);
  532. f32x4.val[1] = _mm_unpackhi_ps(Vector1, Vector2);
  533. return f32x4;
  534. #else
  535. GI_FLOAT32_V2_t ret;
  536. ret.val[0][0] = Vector1[0];
  537. ret.val[0][1] = Vector2[0];
  538. ret.val[0][2] = Vector1[1];
  539. ret.val[0][3] = Vector2[1];
  540. ret.val[1][0] = Vector1[2];
  541. ret.val[1][1] = Vector2[2];
  542. ret.val[1][2] = Vector1[3];
  543. ret.val[1][3] = Vector2[3];
  544. return ret;
  545. #endif
  546. }
  547. GI_FORCEINLINE
  548. GI_FLOAT32_t GiInterleaveLowFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
  549. #if defined(GI_NEON64_INTRINSICS)
  550. return vzip1q_f32(Vector1, Vector2);
  551. #elif defined(GI_NEON32_INTRINSICS)
  552. float32x4x2_t zipped = vzipq_f32(Vector1, Vector2);
  553. return zipped.val[0];
  554. #elif defined(GI_SSE2_INTRINSICS)
  555. return _mm_unpacklo_ps(Vector1, Vector2);
  556. #else
  557. GI_FLOAT32_t ret;
  558. for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(float); i++) {
  559. ret[2 * i] = Vector1[i];
  560. ret[2 * i + 1] = Vector2[i];
  561. }
  562. return ret;
  563. #endif
  564. }
  565. GI_FORCEINLINE
  566. GI_FLOAT32_t GiInterleaveHighFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
  567. #if defined(GI_NEON64_INTRINSICS)
  568. return vzip2q_f32(Vector1, Vector2);
  569. #elif defined(GI_NEON32_INTRINSICS)
  570. float32x4x2_t zipped = vzipq_f32(Vector1, Vector2);
  571. return zipped.val[1];
  572. #elif defined(GI_SSE2_INTRINSICS)
  573. return _mm_unpackhi_ps(Vector1, Vector2);
  574. #else
  575. GI_FLOAT32_t ret;
  576. for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(float); i++) {
  577. ret[2 * i] = Vector1[GI_SIMD_LEN_BYTE / 2 / sizeof(float) + i];
  578. ret[2 * i + 1] = Vector2[GI_SIMD_LEN_BYTE / 2 / sizeof(float) + i];
  579. }
  580. return ret;
  581. #endif
  582. }
  583. GI_FORCEINLINE
  584. GI_FLOAT32_t GiAddFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
  585. #if defined(GI_NEON_INTRINSICS)
  586. return vaddq_f32(Vector1, Vector2);
  587. #elif defined(GI_SSE2_INTRINSICS)
  588. return _mm_add_ps(Vector1, Vector2);
  589. #else
  590. return Vector1 + Vector2;
  591. #endif
  592. }
  593. GI_FORCEINLINE
  594. GI_FLOAT32_t GiSubtractFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
  595. #if defined(GI_NEON_INTRINSICS)
  596. return vsubq_f32(Vector1, Vector2);
  597. #elif defined(GI_SSE2_INTRINSICS)
  598. return _mm_sub_ps(Vector1, Vector2);
  599. #else
  600. return Vector1 - Vector2;
  601. #endif
  602. }
  603. GI_FORCEINLINE
  604. GI_FLOAT32_t GiMultiplyFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
  605. #if defined(GI_NEON_INTRINSICS)
  606. return vmulq_f32(Vector1, Vector2);
  607. #elif defined(GI_SSE2_INTRINSICS)
  608. return _mm_mul_ps(Vector1, Vector2);
  609. #else
  610. return Vector1 * Vector2;
  611. #endif
  612. }
  613. GI_FORCEINLINE
  614. GI_FLOAT32_t GiMultiplyScalerFloat32(GI_FLOAT32_t Vector1, float Scaler) {
  615. #if defined(GI_NEON_INTRINSICS)
  616. return vmulq_n_f32(Vector1, Scaler);
  617. #elif defined(GI_SSE2_INTRINSICS)
  618. GI_FLOAT32_t Vector2 = _mm_set1_ps(Scaler);
  619. return _mm_mul_ps(Vector1, Vector2);
  620. #else
  621. return Vector1 * Scaler;
  622. #endif
  623. }
  624. GI_FORCEINLINE
  625. GI_FLOAT32_t GiMultiplyAddFloat32(
  626. GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
  627. #if defined(GI_NEON_INTRINSICS)
  628. return v_fma_ps_f32(VectorSum, Vector1, Vector2);
  629. #elif defined(GI_FMA3_INTRINSICS)
  630. return _mm_fmadd_ps(Vector1, Vector2, VectorSum);
  631. #elif defined(GI_SSE2_INTRINSICS)
  632. return _mm_add_ps(_mm_mul_ps(Vector1, Vector2), VectorSum);
  633. #else
  634. return Vector1 * Vector2 + VectorSum;
  635. #endif
  636. }
  637. GI_FORCEINLINE
  638. GI_FLOAT32_t GiMultiplyAddScalarFloat32(
  639. GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector, float Scalar) {
  640. #if defined(GI_NEON_INTRINSICS)
  641. return v_fma_n_f32(VectorSum, Vector, Scalar);
  642. #elif defined(GI_SSE2_INTRINSICS)
  643. return GiMultiplyAddFloat32(VectorSum, GiBroadcastFloat32(Scalar), Vector);
  644. #else
  645. return VectorSum + Vector * Scalar;
  646. #endif
  647. }
  648. #if defined(GI_NEON_INTRINSICS)
  649. #define GIMULTIPLYADDLANFLOAT32(i) \
  650. GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32( \
  651. GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \
  652. return v_fma_lane_f32(VectorSum, Vector1, vget_low_f32(Vector2), i); \
  653. }
  654. GIMULTIPLYADDLANFLOAT32(0)
  655. GIMULTIPLYADDLANFLOAT32(1)
  656. #undef GIMULTIPLYADDLANFLOAT32
  657. #define GIMULTIPLYADDLANFLOAT32(i) \
  658. GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32( \
  659. GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \
  660. return v_fma_lane_f32(VectorSum, Vector1, vget_high_f32(Vector2), i - 2); \
  661. }
  662. GIMULTIPLYADDLANFLOAT32(2)
  663. GIMULTIPLYADDLANFLOAT32(3)
  664. #undef GIMULTIPLYADDLANFLOAT32
  665. #else
  666. #define GIMULTIPLYADDLANFLOAT32(i) \
  667. GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32( \
  668. GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \
  669. return GiMultiplyAddScalarFloat32( \
  670. VectorSum, Vector1, GiExtractLane##i##Float32(Vector2)); \
  671. }
  672. GIMULTIPLYADDLANFLOAT32(0)
  673. GIMULTIPLYADDLANFLOAT32(1)
  674. GIMULTIPLYADDLANFLOAT32(2)
  675. GIMULTIPLYADDLANFLOAT32(3)
  676. #undef GIMULTIPLYADDLANFLOAT32
  677. #endif
  678. GI_FORCEINLINE
  679. GI_FLOAT32_t GiDivideFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
  680. #if defined(GI_NEON64_INTRINSICS)
  681. return vdivq_f32(Vector1, Vector2);
  682. #elif defined(GI_NEON32_INTRINSICS)
  683. float32x4_t recp = vrecpeq_f32(Vector2);
  684. recp = vmulq_f32(vrecpsq_f32(Vector2, recp), recp);
  685. return vmulq_f32(Vector1, recp);
  686. #elif defined(GI_SSE2_INTRINSICS)
  687. return _mm_div_ps(Vector1, Vector2);
  688. #else
  689. return Vector1 / Vector2;
  690. #endif
  691. }
  692. GI_FORCEINLINE
  693. GI_FLOAT32_t GiRecpeSFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
  694. #if defined(GI_NEON64_INTRINSICS)
  695. return vrecpsq_f32(Vector1, Vector2);
  696. #elif defined(GI_SSE2_INTRINSICS)
  697. GI_FLOAT32_t two = _mm_set1_ps(2.0f);
  698. return _mm_sub_ps(two, _mm_mul_ps(Vector1, Vector2));
  699. #else
  700. return (2.0f - Vector1 * Vector2);
  701. #endif
  702. }
  703. GI_FORCEINLINE
  704. GI_FLOAT32_t GiRecpeFloat32(GI_FLOAT32_t Vector) {
  705. #if defined(GI_NEON32_INTRINSICS)
  706. return vrecpeq_f32(Vector);
  707. #elif defined(GI_SSE2_INTRINSICS)
  708. GI_FLOAT32_t ones = _mm_set1_ps(1.0f);
  709. return _mm_div_ps(ones, Vector);
  710. #else
  711. //! FIXME: the NEON/SSE reciprocal estimates are less accurate than a true 1/x
  712. return 1 / Vector;
  713. #endif
  714. }
  715. GI_FORCEINLINE
  716. GI_FLOAT32_t GiNegFloat32(GI_FLOAT32_t Vector) {
  717. #if defined(GI_NEON32_INTRINSICS)
  718. return vnegq_f32(Vector);
  719. #elif defined(GI_SSE2_INTRINSICS)
  720. GI_FLOAT32_t zero = _mm_set1_ps(0.0f);
  721. return _mm_sub_ps(zero, Vector);
  722. #else
  723. return -Vector;
  724. #endif
  725. }
  726. GI_FORCEINLINE
  727. GI_UINT32_t GiGreaterThanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
  728. #if defined(GI_NEON_INTRINSICS)
  729. return vcgtq_f32(Vector1, Vector2);
  730. #elif defined(GI_SSE2_INTRINSICS)
  731. return _mm_castps_si128(_mm_cmpgt_ps(Vector1, Vector2));
  732. #else
  733. GI_UINT32_t ret;
  734. for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
  735. ret[i] = Vector1[i] > Vector2[i] ? 0xFFFFFFFF : 0;
  736. }
  737. return ret;
  738. #endif
  739. }
  740. GI_FORCEINLINE
  741. GI_UINT32_t GiLessThanEqFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
  742. #if defined(GI_NEON_INTRINSICS)
  743. return vcleq_f32(Vector1, Vector2);
  744. #elif defined(GI_SSE2_INTRINSICS)
  745. return _mm_castps_si128(_mm_cmple_ps(Vector1, Vector2));
  746. #else
  747. GI_UINT32_t ret;
  748. for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
  749. ret[i] = Vector1[i] <= Vector2[i] ? 0xFFFFFFFF : 0;
  750. }
  751. return ret;
  752. #endif
  753. }
  754. GI_FORCEINLINE
  755. GI_UINT32_t GiLessThanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
  756. #if defined(GI_NEON_INTRINSICS)
  757. return vcltq_f32(Vector1, Vector2);
  758. #elif defined(GI_SSE2_INTRINSICS)
  759. return _mm_castps_si128(_mm_cmplt_ps(Vector1, Vector2));
  760. #else
  761. GI_UINT32_t ret;
  762. for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
  763. ret[i] = Vector1[i] < Vector2[i] ? 0xFFFFFFFF : 0;
  764. }
  765. return ret;
  766. #endif
  767. }
  768. GI_FORCEINLINE
  769. GI_FLOAT32_t GiAndFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
  770. #if defined(GI_SSE2_INTRINSICS)
  771. return _mm_and_ps(Vector1, Vector2);
  772. #else
  773. return GiReintInt32ToFloat32(
  774. GiAndInt32(GiReinterpretAsInt32(Vector1), GiReinterpretAsInt32(Vector2)));
  775. #endif
  776. }
  777. GI_FORCEINLINE
  778. GI_FLOAT32_t GiOrFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
  779. #if defined(GI_SSE2_INTRINSICS)
  780. return _mm_or_ps(Vector1, Vector2);
  781. #else
  782. return GiReintInt32ToFloat32(
  783. GiOrInt32(GiReinterpretAsInt32(Vector1), GiReinterpretAsInt32(Vector2)));
  784. #endif
  785. }
  786. GI_FORCEINLINE
  787. GI_FLOAT32_t GiAndNotFloat32(GI_FLOAT32_t VectorNot, GI_FLOAT32_t Vector) {
  788. #if defined(GI_SSE2_INTRINSICS)
  789. return _mm_andnot_ps(VectorNot, Vector);
  790. #else
  791. return GiReintInt32ToFloat32(GiAndNotInt32(
  792. GiReinterpretAsInt32(VectorNot), GiReinterpretAsInt32(Vector)));
  793. #endif
  794. }
  795. GI_FORCEINLINE
  796. GI_FLOAT32_t GiXorFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
  797. #if defined(GI_SSE2_INTRINSICS)
  798. return _mm_xor_ps(Vector1, Vector2);
  799. #else
  800. return GiReintInt32ToFloat32(
  801. GiXorInt32(GiReinterpretAsInt32(Vector1), GiReinterpretAsInt32(Vector2)));
  802. #endif
  803. }
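//! bitwise select: take the bits of Vector1 where Selection is set and the bits of
//! Vector2 where it is clear.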
  804. GI_FORCEINLINE
  805. GI_FLOAT32_t GiBlendFloat32(
  806. GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2, GI_FLOAT32_t Selection) {
  807. return GiOrFloat32(
  808. GiAndFloat32(Vector1, Selection), GiAndNotFloat32(Selection, Vector2));
  809. }
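//! NaN-aware min/max helpers: the first argument is returned whenever it is NaN.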
  810. #define MIN_NAN(a, b) ((isnan(a) || (a) < (b)) ? (a) : (b))
  811. #define MAX_NAN(a, b) ((isnan(a) || (a) > (b)) ? (a) : (b))
  812. GI_FORCEINLINE
  813. GI_FLOAT32_t GiBSLFloat32(
  814. GI_UINT32_t Selection, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
  815. #if defined(GI_NEON_INTRINSICS)
  816. return vbslq_f32(Selection, Vector1, Vector2);
  817. #else
  818. return GiBlendFloat32(Vector1, Vector2, GiReintUint32ToFloat32(Selection));
  819. #endif
  820. }
  821. GI_FORCEINLINE
  822. GI_FLOAT32_t GiMaximumFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
  823. #if defined(GI_NEON_INTRINSICS)
  824. return vmaxq_f32(Vector1, Vector2);
  826. #elif defined(GI_SSE2_INTRINSICS)
  826. return _mm_max_ps(Vector1, Vector2);
  827. #else
  828. GI_FLOAT32_t max;
  829. for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
  830. max[i] = Max(Vector1[i], Vector2[i]);
  831. }
  832. return max;
  833. #endif
  834. }
  835. GI_FORCEINLINE
  836. GI_FLOAT32_t GiMinimumFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
  837. #if defined(GI_NEON_INTRINSICS)
  838. return vminq_f32(Vector1, Vector2);
  840. #elif defined(GI_SSE2_INTRINSICS)
  840. return _mm_min_ps(Vector1, Vector2);
  841. #else
  842. GI_FLOAT32_t min;
  843. for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
  844. min[i] = Min(Vector1[i], Vector2[i]);
  845. }
  846. return min;
  847. #endif
  848. }
  849. GI_FORCEINLINE
  850. GI_FLOAT32_t GiMaxNanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
  851. #if defined(GI_NEON_INTRINSICS)
  852. return vmaxq_f32(Vector1, Vector2);
  853. #else
  854. //! _mm_max_ps does not follow the IEEE semantics when an input is NaN, so
  855. //! implement it in plain C
  856. GI_FLOAT32_t max;
  857. for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
  858. max[i] = MAX_NAN(Vector1[i], Vector2[i]);
  859. }
  860. return max;
  861. #endif
  862. }
  863. GI_FORCEINLINE
  864. GI_FLOAT32_t GiMinNanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
  865. #if defined(GI_NEON_INTRINSICS)
  866. return vminq_f32(Vector1, Vector2);
  867. #else
  868. //! _mm_min_ps does not follow the IEEE semantics when an input is NaN, so
  869. //! implement it in plain C
  870. GI_FLOAT32_t min;
  871. for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
  872. min[i] = MIN_NAN(Vector1[i], Vector2[i]);
  873. }
  874. return min;
  875. #endif
  876. }
  877. GI_FORCEINLINE
  878. GI_FLOAT32_t GiClampFloat32(GI_FLOAT32_t Value, float LowerRange, float UpperRange) {
  879. Value = GiMaximumFloat32(GiBroadcastFloat32(LowerRange), Value);
  880. Value = GiMinimumFloat32(GiBroadcastFloat32(UpperRange), Value);
  881. return Value;
  882. }
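//! horizontal sum of the four float lanes.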
  883. GI_FORCEINLINE
  884. float GiReduceAddFloat32(GI_FLOAT32_t Vector) {
  885. #if defined(GI_NEON64_INTRINSICS)
  886. Vector = vpaddq_f32(Vector, Vector);
  887. Vector = vpaddq_f32(Vector, Vector);
  888. return vgetq_lane_f32(Vector, 0);
  889. #elif defined(GI_NEON32_INTRINSICS)
  890. float32x2_t VectorLow = vget_low_f32(Vector);
  891. float32x2_t VectorHigh = vget_high_f32(Vector);
  892. VectorLow = vpadd_f32(VectorLow, VectorHigh);
  893. VectorLow = vpadd_f32(VectorLow, VectorHigh);
  894. return vget_lane_f32(VectorLow, 0);
  895. #elif defined(GI_SSE2_INTRINSICS)
  896. Vector = GiAddFloat32(
  897. Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(2, 3, 2, 3)));
  898. Vector = GiAddFloat32(
  899. Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1)));
  900. return GiExtractLane0Float32(Vector);
  901. #else
  902. float ret = 0;
  903. for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
  904. ret += Vector[i];
  905. }
  906. return ret;
  907. #endif
  908. }
  909. GI_FORCEINLINE
  910. float GiReduceMultiplyFloat32(GI_FLOAT32_t Vector) {
  911. #if defined(GI_NEON64_INTRINSICS)
  912. float32x2_t low = vget_low_f32(Vector);
  913. float32x2_t high = vget_high_f32(Vector);
  914. float32x2_t res = vmul_f32(low, high);
  915. return vget_lane_f32(res, 0) * vget_lane_f32(res, 1);
  916. #elif defined(GI_SSE2_INTRINSICS)
  917. Vector = GiMultiplyFloat32(
  918. Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(2, 3, 2, 3)));
  919. Vector = GiMultiplyFloat32(
  920. Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1)));
  921. return GiExtractLane0Float32(Vector);
  922. #else
  923. float ret = 1;
  924. for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
  925. ret *= Vector[i];
  926. }
  927. return ret;
  928. #endif
  929. }
  930. #define Max(a, b) (a) > (b) ? (a) : (b)
  931. #define Min(a, b) (a) < (b) ? (a) : (b)
  932. GI_FORCEINLINE
  933. float GiReduceMaxNanFloat32(GI_FLOAT32_t Vector) {
  934. #if defined(GI_NEON64_INTRINSICS)
  935. return vmaxvq_f32(Vector);
  936. #elif defined(GI_NEON32_INTRINSICS)
  937. float32x2_t VectorLow = vget_low_f32(Vector);
  938. float32x2_t VectorHigh = vget_high_f32(Vector);
  939. VectorLow = vpmax_f32(VectorLow, VectorHigh);
  940. VectorLow = vpmax_f32(VectorLow, VectorHigh);
  941. return vget_lane_f32(VectorLow, 0);
  942. #elif defined(GI_SSE2_INTRINSICS)
  943. Vector = GiMaxNanFloat32(
  944. Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(2, 3, 2, 3)));
  945. Vector = GiMaxNanFloat32(
  946. Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1)));
  947. return GiExtractLane0Float32(Vector);
  948. #else
  949. float ret = Vector[0];
  950. for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
  951. ret = MAX_NAN(ret, Vector[i]);
  952. }
  953. return ret;
  954. #endif
  955. }
  956. GI_FORCEINLINE
  957. float GiReduceMinNanFloat32(GI_FLOAT32_t Vector) {
  958. #if defined(GI_NEON64_INTRINSICS)
  959. return vminvq_f32(Vector);
  960. #elif defined(GI_NEON32_INTRINSICS)
  961. float32x2_t VectorLow = vget_low_f32(Vector);
  962. float32x2_t VectorHigh = vget_high_f32(Vector);
  963. VectorLow = vpmin_f32(VectorLow, VectorHigh);
  964. VectorLow = vpmin_f32(VectorLow, VectorHigh);
  965. return vget_lane_f32(VectorLow, 0);
  966. #elif defined(GI_SSE2_INTRINSICS)
  967. Vector = GiMinNanFloat32(
  968. Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(2, 3, 2, 3)));
  969. Vector = GiMinNanFloat32(
  970. Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1)));
  971. return GiExtractLane0Float32(Vector);
  972. #else
  973. float ret = Vector[0];
  974. for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
  975. ret = MIN_NAN(ret, Vector[i]);
  976. }
  977. return ret;
  978. #endif
  979. }
  980. GI_FORCEINLINE
  981. GI_FLOAT32_t GiAbsFloat32(GI_FLOAT32_t Vector1) {
  982. #if defined(GI_NEON64_INTRINSICS)
  983. return vabsq_f32(Vector1);
  984. #elif defined(GI_SSE2_INTRINSICS)
  985. union {
  986. unsigned int int_val;
  987. float float_val;
  988. } value;
  989. value.int_val = 0x7fffffff;
  990. return _mm_and_ps(Vector1, _mm_set_ps1(value.float_val));
  991. #else
  992. GI_FLOAT32_t ret;
  993. for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
  994. ret[i] = Vector1[i] > 0 ? Vector1[i] : -Vector1[i];
  995. }
  996. return ret;
  997. #endif
  998. }
  999. #if defined(GI_SSE2_INTRINSICS)
  1000. typedef __m128i int8x16_t;
  1001. typedef __m64_128 int8x8_t;
  1002. GI_FORCEINLINE int8x16_t vcombine_s8(int8x8_t low, int8x8_t high) {
  1003. return _mm_unpacklo_epi64(_pM128i(low), _pM128i(high));
  1004. }
  1005. typedef __m64_128 int64x1_t;
  1006. GI_FORCEINLINE int64x1_t vget_low_s64(GI_INT64_t a) {
  1007. int64x1_t res64;
  1008. return64(a);
  1009. }
  1010. GI_FORCEINLINE int64x1_t vget_high_s64(GI_INT64_t a) {
  1011. int64x1_t res64;
  1012. __m128i res;
  1013. res = _mm_unpackhi_epi64(a, a);
  1014. return64(res);
  1015. }
  1016. #endif
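//! GiZip1qS64 takes the low 64-bit lane of each input, GiZip2qS64 the high lane:
//! the result is {__p0[i], __p1[i]}.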
  1017. GI_FORCEINLINE GI_INT64_t GiZip1qS64(GI_INT64_t __p0, GI_INT64_t __p1) {
  1018. #if defined(GI_NEON_INTRINSICS)
  1019. return vzip1q_s64(__p0, __p1);
  1020. #elif defined(GI_SSE2_INTRINSICS)
  1021. #define vcombine_s64 vcombine_s8
  1022. return vcombine_s64(vget_low_s64(__p0), vget_low_s64(__p1));
  1023. #else
  1024. GI_INT64_t ret;
  1025. ret[0] = __p0[0];
  1026. ret[1] = __p1[0];
  1027. return ret;
  1028. #endif
  1029. }
  1030. GI_FORCEINLINE GI_INT64_t GiZip2qS64(GI_INT64_t __p0, GI_INT64_t __p1) {
  1031. #if defined(GI_NEON_INTRINSICS)
  1032. return vzip2q_s64(__p0, __p1);
  1033. #elif defined(GI_SSE2_INTRINSICS)
  1034. #define vcombine_s64 vcombine_s8
  1035. return vcombine_s64(vget_high_s64(__p0), vget_high_s64(__p1));
  1036. #else
  1037. GI_INT64_t ret;
  1038. ret[0] = __p0[1];
  1039. ret[1] = __p1[1];
  1040. return ret;
  1041. #endif
  1042. }
  1043. GI_FORCEINLINE GI_FLOAT32_t GiReinterpretqS64ToFloat32(GI_INT64_t a) {
  1044. #if defined(GI_NEON_INTRINSICS)
  1045. return vreinterpretq_f32_s64(a);
  1046. #elif defined(GI_SSE2_INTRINSICS)
  1047. return _M128(a);
  1048. #else
  1049. GI_FLOAT32_t ret;
  1050. memcpy(&ret, &a, sizeof(GI_FLOAT32_t));
  1051. return ret;
  1052. #endif
  1053. }
  1054. GI_FORCEINLINE GI_INT64_t GiReinterpretqFloat32ToS64(GI_FLOAT32_t a) {
  1055. #if defined(GI_NEON_INTRINSICS)
  1056. return vreinterpretq_s64_f32(a);
  1057. #elif defined(GI_SSE2_INTRINSICS)
  1058. return _M128i(a);
  1059. #else
  1060. GI_INT64_t ret;
  1061. memcpy(&ret, &a, sizeof(GI_INT64_t));
  1062. return ret;
  1063. #endif
  1064. }
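//! GiSimdFmaLane(a, b, c, d): a + b * c[d], where d selects a lane of the full
//! 4-lane vector c.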
  1065. #if defined(GI_NEON_INTRINSICS)
  1066. #define GiSimdFmaLane(a, b, c, d) vfmaq_laneq_f32(a, b, c, d)
  1067. #else
  1068. GI_FORCEINLINE GI_FLOAT32_t
  1069. ___gi_vmlaq_lane_f32(GI_FLOAT32_t a, GI_FLOAT32_t b, float32x2_t v, int l) {
  1070. float vlane;
  1071. GI_FLOAT32_t c;
  1072. vlane = (float)GiGetLaneFloat32(v, l);
  1073. c = GiBroadcastFloat32(vlane);
  1074. return GiMlaqFloat32(a, b, c);
  1075. }
  1076. GI_FORCEINLINE float32x2_t ___gi_vget_low_f32(GI_FLOAT32_t a) {
  1077. #if defined(GI_SSE2_INTRINSICS)
  1078. float32x2_t res64;
  1079. _M64f(res64, a);
  1080. return res64;
  1081. #else
  1082. float32x2_t ret;
  1083. ret[0] = a[0];
  1084. ret[1] = a[1];
  1085. return ret;
  1086. #endif
  1087. }
  1088. GI_FORCEINLINE float32x2_t ___gi_vget_high_f32(GI_FLOAT32_t a) {
  1089. #if defined(GI_SSE2_INTRINSICS)
  1090. __m128i res;
  1091. __m64_128 res64;
  1092. res = _mm_unpackhi_epi64(_M128i(a), _M128i(a));
  1093. return64(res);
  1094. #else
  1095. float32x2_t ret;
  1096. ret[0] = a[2];
  1097. ret[1] = a[3];
  1098. return ret;
  1099. #endif
  1100. }
  1101. GI_FORCEINLINE GI_FLOAT32_t
  1102. ___gi_vfmaq_laneq_f32(GI_FLOAT32_t a, GI_FLOAT32_t b, GI_FLOAT32_t v, int l) {
  1103. if (l < 2) {
  1104. return ___gi_vmlaq_lane_f32(a, b, ___gi_vget_low_f32(v), l);
  1105. } else {
  1106. return ___gi_vmlaq_lane_f32(a, b, ___gi_vget_high_f32(v), l - 2);
  1107. }
  1108. }
  1109. #define GiSimdFmaLane(a, b, c, d) ___gi_vfmaq_laneq_f32(a, b, c, d)
  1110. #endif
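//! GiMlaqLowLaneFloat32 / GiMlaqHighLaneFloat32: a + b * v[lane]; the Low form
//! expects lane 0-1, the High form lane 2-3.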
  1111. #if defined(GI_NEON_INTRINSICS)
  1112. #if MEGDNN_AARCH64
  1113. #define GiMlaqLowLaneFloat32(__a, __b, __v, __lane) \
  1114. vmlaq_laneq_f32(__a, __b, __v, __lane)
  1115. #define GiMlaqHighLaneFloat32(__a, __b, __v, __lane) \
  1116. vmlaq_laneq_f32(__a, __b, __v, __lane)
  1117. #else
  1118. #define GiMlaqLowLaneFloat32(__a, __b, __v, __lane) \
  1119. __extension__({ \
  1120. float32x2_t c = vget_low_f32(__v); \
  1121. GI_FLOAT32_t __ret = vmlaq_lane_f32(__a, __b, c, __lane); \
  1122. __ret; \
  1123. })
  1124. #define GiMlaqHighLaneFloat32(__a, __b, __v, __lane) \
  1125. __extension__({ \
  1126. float32x2_t c = vget_high_f32(__v); \
  1127. GI_FLOAT32_t __ret = vmlaq_lane_f32(__a, __b, c, (__lane - 2)); \
  1128. __ret; \
  1129. })
  1130. #endif
  1131. #elif defined(GI_SSE2_INTRINSICS)
  1132. #define GiMlaqLowLaneFloat32(__a, __b, __v, __lane) \
  1133. __extension__({ \
  1134. float32x2_t c = sse_vget_low_f32(__v); \
  1135. GI_FLOAT32_t __ret = sse_vmlaq_lane_f32(__a, __b, c, __lane); \
  1136. __ret; \
  1137. })
  1138. #define GiMlaqHighLaneFloat32(__a, __b, __v, __lane) \
  1139. __extension__({ \
  1140. float32x2_t c = sse_vget_high_f32(__v); \
  1141. GI_FLOAT32_t __ret = sse_vmlaq_lane_f32(__a, __b, c, (__lane - 2)); \
  1142. __ret; \
  1143. })
  1144. #else
  1145. //! naive
  1146. #define GiMlaqLowLaneFloat32(__a, __b, __v, __lane) \
  1147. __extension__({ \
  1148. GI_FLOAT32_t __ret; \
  1149. for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { \
  1150. __ret[i] = __a[i] + (__b[i] * __v[__lane]); \
  1151. } \
  1152. __ret; \
  1153. })
  1154. #define GiMlaqHighLaneFloat32(__a, __b, __v, __lane) \
  1155. __extension__({ \
  1156. GI_FLOAT32_t __ret; \
  1157. for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { \
  1158. __ret[i] = __a[i] + (__b[i] * __v[__lane]); \
  1159. } \
  1160. __ret; \
  1161. })
  1162. #endif
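//! GiFmsqLaneQFloat32(a, b, v, lane): a - b * v[lane].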
  1163. #if defined(GI_NEON_INTRINSICS)
  1164. #define GiFmsqLaneQFloat32(a, b, v, lane) vfmsq_laneq_f32(a, b, v, lane)
  1165. #elif defined(GI_SSE2_INTRINSICS)
  1166. #define SSE_VFMSQ_LANEQ_F32(lane) \
  1167. GI_FORCEINLINE GI_FLOAT32_t sse_vfmsq_lane_##lane##_q_f32( \
  1168. GI_FLOAT32_t a, GI_FLOAT32_t b, GI_FLOAT32_t v) { \
  1169. return sse_vmlsq_lane_f32(a, b, sse_vget_low_f32(v), lane); \
  1170. }
  1171. SSE_VFMSQ_LANEQ_F32(0)
  1172. SSE_VFMSQ_LANEQ_F32(1)
  1173. #undef SSE_VFMSQ_LANEQ_F32
  1174. #define SSE_VFMSQ_LANEQ_F32(lane) \
  1175. GI_FORCEINLINE GI_FLOAT32_t sse_vfmsq_lane_##lane##_q_f32( \
  1176. GI_FLOAT32_t a, GI_FLOAT32_t b, GI_FLOAT32_t v) { \
  1177. return sse_vmlsq_lane_f32(a, b, sse_vget_high_f32(v), lane - 2); \
  1178. }
  1179. SSE_VFMSQ_LANEQ_F32(2)
  1180. SSE_VFMSQ_LANEQ_F32(3)
  1181. #undef SSE_VFMSQ_LANEQ_F32
  1182. #define GiFmsqLaneQFloat32(a, b, v, lane) sse_vfmsq_lane_##lane##_q_f32(a, b, v)
  1183. #else
  1184. //! naive
  1185. GI_FORCEINLINE GI_FLOAT32_t __naive_GiFmsqLaneQFloat32(
  1186. GI_FLOAT32_t a, GI_FLOAT32_t b, GI_FLOAT32_t v, const int lane) {
  1187. GI_FLOAT32_t ret;
  1188. for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
  1189. ret[i] = a[i] - (b[i] * v[lane]);
  1190. }
  1191. return ret;
  1192. }
  1193. #define GiFmsqLaneQFloat32(a, b, v, lane) __naive_GiFmsqLaneQFloat32(a, b, v, lane)
  1194. #endif
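//! concatenate two 2-lane halves into a 4-lane vector: {a[0], a[1], b[0], b[1]}.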
  1195. GI_FORCEINLINE GI_FLOAT32_t GiCombineFloat32(float32x2_t a, float32x2_t b) {
  1196. #if defined(GI_NEON_INTRINSICS)
  1197. return vcombine_f32(a, b);
  1198. #elif defined(GI_SSE2_INTRINSICS)
  1199. __m128i res;
  1200. res = _mm_unpacklo_epi64(_pM128i(a), _pM128i(b));
  1201. return _M128(res);
  1202. #else
  1203. GI_FLOAT32_t res;
  1204. res[0] = a[0];
  1205. res[1] = a[1];
  1206. res[2] = b[0];
  1207. res[3] = b[1];
  1208. return res;
  1209. #endif
  1210. }
  1211. GI_FORCEINLINE float32x2_t GiGetLowFloat32(GI_FLOAT32_t a) {
  1212. #if defined(GI_NEON_INTRINSICS)
  1213. return vget_low_f32(a);
  1214. #else
  1215. return ___gi_vget_low_f32(a);
  1216. #endif
  1217. }
  1218. GI_FORCEINLINE float32x2_t GiGetHighFloat32(GI_FLOAT32_t a) {
  1219. #if defined(GI_NEON_INTRINSICS)
  1220. return vget_high_f32(a);
  1221. #else
  1222. return ___gi_vget_high_f32(a);
  1223. #endif
  1224. }