
fp16_t.cc

/**
 * Copyright 2019-2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "common/fp16_t.h"
#include "external/register/register_types.h"
namespace {
constexpr uint16_t kManBitLength = 11;
}
namespace ge {
/// @ingroup fp16_t global field
/// @brief rounding mode of the last valid digit
enum TagFp16RoundMode g_round_mode = kRoundToNearest;
void ExtractFp16(const uint16_t &val, uint16_t &s, int16_t &e, uint16_t &m) {
  // 1.Extract
  s = static_cast<uint16_t>(FP16_EXTRAC_SIGN(val));
  e = static_cast<int16_t>(FP16_EXTRAC_EXP(val));
  m = static_cast<uint16_t>(FP16_EXTRAC_MAN(val));
  // Denormal
  if (e == 0) {
    e = 1;
  }
}
/// @ingroup fp16_t static method
/// @param [in] man truncated mantissa
/// @param [in] trunc_len number of mantissa bits to be truncated
/// @brief judge whether to add one to the result while converting fp16_t to another data type
/// @return Return true if one should be added, otherwise false
static bool IsRoundOne(uint64_t man, uint16_t trunc_len) {
  uint64_t mask0 = 0x4;
  uint64_t mask1 = 0x2;
  uint64_t mask2;
  uint16_t shift_out = static_cast<uint16_t>(trunc_len - kDim2);
  mask0 = mask0 << shift_out;
  mask1 = mask1 << shift_out;
  mask2 = mask1 - 1;
  bool last_bit = ((man & mask0) > 0);
  bool trunc_high = false;
  bool trunc_left = false;
  if (g_round_mode == kRoundToNearest) {
    trunc_high = ((man & mask1) > 0);
    trunc_left = ((man & mask2) > 0);
  }
  return (trunc_high && (trunc_left || last_bit));
}
/// @ingroup fp16_t public method
/// @param [in] exp exponent of fp16_t value
/// @param [in] man mantissa of fp16_t value
/// @brief normalize fp16_t value
/// @return
static void Fp16Normalize(int16_t &exp, uint16_t &man) {
  // set to invalid data
  if (exp >= kFp16MaxExp) {
    exp = static_cast<int16_t>(kFp16MaxExp);
    man = static_cast<uint16_t>(kFp16MaxMan);
  } else if (exp == 0 && man == kFp16ManHideBit) {
    exp++;
    man = 0;
  }
}
/// @ingroup fp16_t math conversion static method
/// @param [in] fp_val uint16_t value of fp16_t object
/// @brief Convert fp16_t to float/fp32
/// @return Return float/fp32 value of fp_val which is the value of fp16_t object
static float Fp16ToFloat(const uint16_t &fp_val) {
  uint16_t hf_sign;
  uint16_t hf_man;
  int16_t hf_exp;
  ExtractFp16(fp_val, hf_sign, hf_exp, hf_man);
  while (hf_man && !(hf_man & kFp16ManHideBit)) {
    hf_man <<= 1;
    hf_exp--;
  }
  uint32_t e_ret, m_ret;
  uint32_t s_ret = hf_sign;
  if (hf_man == 0) {
    e_ret = 0;
    m_ret = 0;
  } else {
    e_ret = hf_exp - kFp16ExpBias + kFp32ExpBias;
    m_ret = hf_man & kFp16ManMask;
    m_ret = m_ret << (kFp32ManLen - kFp16ManLen);
  }
  uint32_t f_val = FP32_CONSTRUCTOR(s_ret, e_ret, m_ret);
  auto p_ret_v = reinterpret_cast<float *>(&f_val);
  return *p_ret_v;
}
/// @ingroup fp16_t math conversion static method
/// @param [in] fp_val uint16_t value of fp16_t object
/// @brief Convert fp16_t to double/fp64
/// @return Return double/fp64 value of fp_val which is the value of fp16_t object
static double Fp16ToDouble(const uint16_t &fp_val) {
  uint16_t hf_sign;
  uint16_t hf_man;
  int16_t hf_exp;
  ExtractFp16(fp_val, hf_sign, hf_exp, hf_man);
  while (hf_man && !(hf_man & kFp16ManHideBit)) {
    hf_man <<= 1;
    hf_exp--;
  }
  uint64_t e_ret;
  uint64_t m_ret;
  uint64_t s_ret = hf_sign;
  if (!hf_man) {
    e_ret = 0;
    m_ret = 0;
  } else {
    e_ret = hf_exp - kFp16ExpBias + kFp64ExpBias;
    m_ret = hf_man & kFp16ManMask;
    m_ret = m_ret << (kFp64ManLen - kFp16ManLen);
  }
  uint64_t f_val = (s_ret << kFp64SignIndex) | (e_ret << kFp64ManLen) | (m_ret);
  auto p_ret_v = reinterpret_cast<double *>(&f_val);
  return *p_ret_v;
}
/// @ingroup fp16_t static method
/// @param [in] s_ret sign of fp16_t value
/// @param [in] long_int_m mantissa of fp16_t object as uint64_t
/// @param [in] shift_out shift offset
/// @brief calculate uint8 value by sign, mantissa and shift offset
/// @return Return uint8 value of fp16_t object
static uint8_t GetUint8ValByMan(uint8_t s_ret, const uint64_t &long_int_m, const uint16_t &shift_out) {
  bool need_round = IsRoundOne(long_int_m, shift_out + kFp16ManLen);
  auto m_ret = static_cast<uint8_t>((long_int_m >> (kFp16ManLen + shift_out)) & kBitLen8Max);
  need_round = need_round && ((s_ret == 0 && m_ret < kInt8Max) || (s_ret == 1 && m_ret <= kInt8Max));
  if (need_round) {
    m_ret++;
  }
  if (s_ret) {
    m_ret = (~m_ret) + 1;
  }
  if (m_ret == 0) {
    s_ret = 0;
  }
  return static_cast<uint8_t>((s_ret << kBitShift7) | (m_ret));
}
/// @ingroup fp16_t math conversion static method
/// @param [in] fp_val uint16_t value of fp16_t object
/// @brief Convert fp16_t to int8_t
/// @return Return int8_t value of fp_val which is the value of fp16_t object
static int8_t Fp16ToInt8(const uint16_t &fp_val) {
  int8_t ret;
  uint8_t ret_v;
  // 1.get s_ret and shift it to bit0.
  uint8_t s_ret = FP16_EXTRAC_SIGN(fp_val);
  // 2.get hf_e and hf_m
  uint16_t hf_e = FP16_EXTRAC_EXP(fp_val);
  uint16_t hf_m = FP16_EXTRAC_MAN(fp_val);
  if (FP16_IS_DENORM(fp_val)) {  // Denormalized number
    ret_v = 0;
    ret = *(reinterpret_cast<int8_t *>(&ret_v));
    return ret;
  }
  uint64_t long_int_m = hf_m;
  uint8_t overflow_flag = 0;
  uint16_t shift_out = 0;
  if (FP16_IS_INVALID(fp_val)) {  // Inf or NaN
    overflow_flag = 1;
  } else {
    while (hf_e != kFp16ExpBias) {
      if (hf_e > kFp16ExpBias) {
        hf_e--;
        long_int_m = long_int_m << 1;
        if (s_ret == 1 && long_int_m >= 0x20000u) {  // sign=1, negative number(<0)
          long_int_m = 0x20000u;  // 10 0000 0000 0000 0000  10(fp16_t-man)+7(int8)=17bit
          overflow_flag = 1;
          break;
        } else if (s_ret != 1 && long_int_m >= 0x1FFFFu) {  // sign=0, positive number(>0)
          long_int_m = 0x1FFFFu;  // 01 1111 1111 1111 1111  10(fp16_t-man)+7(int8)
          overflow_flag = 1;
          break;
        }
      } else {
        hf_e++;
        shift_out++;
      }
    }
  }
  if (overflow_flag) {
    ret_v = kInt8Max + s_ret;
  } else {
    // Generate final result
    ret_v = GetUint8ValByMan(s_ret, long_int_m, shift_out);
  }
  ret = *(reinterpret_cast<int8_t *>(&ret_v));
  return ret;
}
/// @ingroup fp16_t math conversion static method
/// @param [in] fp_val uint16_t value of fp16_t object
/// @brief Convert fp16_t to uint8_t
/// @return Return uint8_t value of fp_val which is the value of fp16_t object
static uint8_t Fp16ToUInt8(const uint16_t &fp_val) {
  uint8_t m_ret = 0;
  // 1.get s_ret and shift it to bit0.
  uint8_t s_ret = FP16_EXTRAC_SIGN(fp_val);
  // 2.get hf_e and hf_m
  uint16_t hf_e = FP16_EXTRAC_EXP(fp_val);
  uint16_t hf_m = FP16_EXTRAC_MAN(fp_val);
  if (FP16_IS_DENORM(fp_val)) {  // Denormalized number
    return 0;
  }
  if (FP16_IS_INVALID(fp_val)) {  // Inf or NaN
    m_ret = ~0;
  } else {
    uint64_t long_int_m = hf_m;
    uint8_t overflow_flag = 0;
    uint16_t shift_out = 0;
    while (hf_e != kFp16ExpBias) {
      if (hf_e > kFp16ExpBias) {
        hf_e--;
        long_int_m = long_int_m << 1;
        if (long_int_m >= 0x40000Lu) {  // overflow 0100 0000 0000 0000 0000
          long_int_m = 0x3FFFFLu;  // 11 1111 1111 1111 1111  10(fp16_t-man)+8(uint8)=18bit
          overflow_flag = 1;
          m_ret = ~0;
          break;
        }
      } else {
        hf_e++;
        shift_out++;
      }
    }
    if (!overflow_flag) {
      bool need_round = IsRoundOne(long_int_m, shift_out + kFp16ManLen);
      m_ret = static_cast<uint8_t>((long_int_m >> (kFp16ManLen + shift_out)) & kBitLen8Max);
      if (need_round && m_ret != kBitLen8Max) {
        m_ret++;
      }
    }
  }
  if (s_ret == 1) {  // Negative number
    m_ret = 0;
  }
  // m_ret equals the final result
  return m_ret;
}
/// @ingroup fp16_t static method
/// @param [in] s_ret sign of fp16_t value
/// @param [in] long_int_m mantissa of fp16_t object as uint64_t
/// @param [in] shift_out shift offset
/// @brief calculate uint16 value by sign, mantissa and shift offset
/// @return Return uint16 value of fp16_t object
static uint16_t GetUint16ValByMan(uint16_t s_ret, const uint64_t &long_int_m, const uint16_t &shift_out) {
  bool need_round = IsRoundOne(long_int_m, shift_out + kFp16ManLen);
  auto m_ret = static_cast<uint16_t>((long_int_m >> (kFp16ManLen + shift_out)) & kBitLen16Max);
  if (need_round && m_ret < kInt16Max) {
    m_ret++;
  }
  if (s_ret) {
    m_ret = (~m_ret) + 1;
  }
  if (m_ret == 0) {
    s_ret = 0;
  }
  return static_cast<uint16_t>((s_ret << kBitShift15) | (m_ret));
}
/// @ingroup fp16_t math conversion static method
/// @param [in] fp_val uint16_t value of fp16_t object
/// @brief Convert fp16_t to int16_t
/// @return Return int16_t value of fp_val which is the value of fp16_t object
static int16_t Fp16ToInt16(const uint16_t &fp_val) {
  int16_t ret;
  uint16_t ret_v;
  // 1.get s_ret and shift it to bit0.
  uint16_t s_ret = FP16_EXTRAC_SIGN(fp_val);
  // 2.get hf_e and hf_m
  uint16_t hf_e = FP16_EXTRAC_EXP(fp_val);
  uint16_t hf_m = FP16_EXTRAC_MAN(fp_val);
  if (FP16_IS_DENORM(fp_val)) {  // Denormalized number
    ret_v = 0;
    ret = *(reinterpret_cast<int16_t *>(&ret_v));
    return ret;
  }
  uint64_t long_int_m = hf_m;
  uint8_t overflow_flag = 0;
  uint16_t shift_out = 0;
  if (FP16_IS_INVALID(fp_val)) {  // Inf or NaN
    overflow_flag = 1;
  } else {
    while (hf_e != kFp16ExpBias) {
      if (hf_e > kFp16ExpBias) {
        hf_e--;
        long_int_m = long_int_m << 1;
        if (s_ret == 1 && long_int_m > 0x2000000Lu) {  // sign=1, negative number(<0)
          long_int_m = 0x2000000Lu;  // 10(fp16_t-man)+15(int16)=25bit
          overflow_flag = 1;
          break;
        } else if (s_ret != 1 && long_int_m >= 0x1FFFFFFLu) {  // sign=0, positive number(>0) Overflow
          long_int_m = 0x1FFFFFFLu;  // 10(fp16_t-man)+15(int16)=25bit
          overflow_flag = 1;
          break;
        }
      } else {
        hf_e++;
        shift_out++;
      }
    }
  }
  if (overflow_flag) {
    ret_v = kInt16Max + s_ret;
  } else {
    // Generate final result
    ret_v = GetUint16ValByMan(s_ret, long_int_m, shift_out);
  }
  ret = *(reinterpret_cast<int16_t *>(&ret_v));
  return ret;
}
/// @ingroup fp16_t math conversion static method
/// @param [in] fp_val uint16_t value of fp16_t object
/// @brief Convert fp16_t to uint16_t
/// @return Return uint16_t value of fp_val which is the value of fp16_t object
static uint16_t Fp16ToUInt16(const uint16_t &fp_val) {
  uint16_t m_ret = 0;
  // 1.get s_ret and shift it to bit0.
  uint16_t s_ret = FP16_EXTRAC_SIGN(fp_val);
  // 2.get hf_e and hf_m
  uint16_t hf_e = FP16_EXTRAC_EXP(fp_val);
  uint16_t hf_m = FP16_EXTRAC_MAN(fp_val);
  if (FP16_IS_DENORM(fp_val)) {  // Denormalized number
    return 0;
  }
  if (FP16_IS_INVALID(fp_val)) {  // Inf or NaN
    m_ret = ~0;
  } else {
    uint64_t long_int_m = hf_m;
    uint16_t shift_out = 0;
    while (hf_e != kFp16ExpBias) {
      if (hf_e > kFp16ExpBias) {
        hf_e--;
        long_int_m = long_int_m << 1;
      } else {
        hf_e++;
        shift_out++;
      }
    }
    bool need_round = IsRoundOne(long_int_m, shift_out + kFp16ManLen);
    m_ret = static_cast<uint16_t>((long_int_m >> (kFp16ManLen + shift_out)) & kBitLen16Max);
    if (need_round && m_ret != kBitLen16Max) {
      m_ret++;
    }
  }
  if (s_ret == 1) {  // Negative number
    m_ret = 0;
  }
  // m_ret equals the final result
  return m_ret;
}
/// @ingroup fp16_t math conversion static method
/// @param [in] fp_val uint16_t value of fp16_t object
/// @brief Convert fp16_t to int32_t
/// @return Return int32_t value of fp_val which is the value of fp16_t object
static int32_t Fp16ToInt32(const uint16_t &fp_val) {
  uint32_t ret_v;
  // 1.get s_ret and shift it to bit0.
  uint32_t s_ret = FP16_EXTRAC_SIGN(fp_val);
  // 2.get hf_e and hf_m
  uint16_t hf_e = FP16_EXTRAC_EXP(fp_val);
  uint16_t hf_m = FP16_EXTRAC_MAN(fp_val);
  if (FP16_IS_INVALID(fp_val)) {  // Inf or NaN
    ret_v = kInt32Max + s_ret;
  } else {
    uint64_t long_int_m = hf_m;
    uint16_t shift_out = 0;
    while (hf_e != kFp16ExpBias) {
      if (hf_e > kFp16ExpBias) {
        hf_e--;
        long_int_m = long_int_m << 1;
      } else {
        hf_e++;
        shift_out++;
      }
    }
    bool need_round = IsRoundOne(long_int_m, shift_out + kFp16ManLen);
    auto m_ret = static_cast<uint32_t>((long_int_m >> (kFp16ManLen + shift_out)) & kBitLen32Max);
    if (need_round && m_ret < kInt32Max) {
      m_ret++;
    }
    if (s_ret == 1) {
      m_ret = (~m_ret) + 1;
    }
    if (m_ret == 0) {
      s_ret = 0;
    }
    // Generate final result
    ret_v = (s_ret << kBitShift31) | (m_ret);
  }
  return *(reinterpret_cast<int32_t *>(&ret_v));
}
/// @ingroup fp16_t math conversion static method
/// @param [in] fp_val uint16_t value of fp16_t object
/// @brief Convert fp16_t to uint32_t
/// @return Return uint32_t value of fp_val which is the value of fp16_t object
static uint32_t Fp16ToUInt32(const uint16_t &fp_val) {
  uint32_t m_ret;
  // 1.get s_ret and shift it to bit0.
  uint32_t s_ret = FP16_EXTRAC_SIGN(fp_val);
  // 2.get hf_e and hf_m
  uint16_t hf_e = FP16_EXTRAC_EXP(fp_val);
  uint16_t hf_m = FP16_EXTRAC_MAN(fp_val);
  if (FP16_IS_DENORM(fp_val)) {  // Denormalized number
    return 0u;
  }
  if (FP16_IS_INVALID(fp_val)) {  // Inf or NaN
    m_ret = ~0u;
  } else {
    uint64_t long_int_m = hf_m;
    uint16_t shift_out = 0;
    while (hf_e != kFp16ExpBias) {
      if (hf_e > kFp16ExpBias) {
        hf_e--;
        long_int_m = long_int_m << 1;
      } else {
        hf_e++;
        shift_out++;
      }
    }
    bool need_round = IsRoundOne(long_int_m, shift_out + kFp16ManLen);
    m_ret = static_cast<uint32_t>(long_int_m >> (kFp16ManLen + shift_out)) & kBitLen32Max;
    if (need_round && m_ret != kBitLen32Max) {
      m_ret++;
    }
  }
  if (s_ret == 1) {  // Negative number
    m_ret = 0;
  }
  // m_ret equals the final result
  return m_ret;
}
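/// @ingroup fp16_t static method
/// @param [in] s_ret sign of the result
/// @param [in] e_ret exponent of the intermediate sum
/// @param [in] m_ret mantissa sum of the two operands
/// @param [in] m_trunc truncated mantissa bits, aligned to the high end of 32 bits
/// @param [in] shift_out extra left shift applied to the mantissas by Fp16Add
/// @brief normalize and round the intermediate sum and assemble the final fp16_t value
/// @return Return uint16_t binary value of the fp16_t result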
static uint16_t Fp16AddCalVal(uint16_t &s_ret, int16_t e_ret, uint16_t m_ret, uint32_t m_trunc, uint16_t shift_out) {
  uint16_t m_min = kFp16ManHideBit << shift_out;
  uint16_t m_max = m_min << 1;
  // Denormal
  while (m_ret < m_min && e_ret > 0) {  // m_ret should not be smaller than m_min (kFp16ManHideBit << shift_out)
    m_ret = m_ret << 1;
    m_ret += (kFp32SignMask & m_trunc) >> kFp32SignIndex;
    m_trunc = m_trunc << 1;
    e_ret = e_ret - 1;
  }
  while (m_ret >= m_max) {  // m_ret should be smaller than m_max (m_min << 1)
    m_trunc = m_trunc >> 1;
    m_trunc = m_trunc | (kFp32SignMask * (m_ret & 1));
    m_ret = m_ret >> 1;
    e_ret = e_ret + 1;
  }
  bool b_last_bit = ((m_ret & 1) > 0);
  bool b_trunc_high = false;
  bool b_trunc_left = false;
  b_trunc_high = (kRoundToNearest == g_round_mode) && ((m_trunc & kFp32SignMask) > 0);
  b_trunc_left = (kRoundToNearest == g_round_mode) && ((m_trunc & kFp32AbsMax) > 0);
  m_ret = ManRoundToNearest(b_last_bit, b_trunc_high, b_trunc_left, m_ret, shift_out);
  while (m_ret >= m_max) {
    m_ret = m_ret >> 1;
    e_ret = e_ret + 1;
  }
  if (e_ret == 0 && m_ret <= m_max) {
    m_ret = m_ret >> 1;
  }
  Fp16Normalize(e_ret, m_ret);
  uint16_t ret = FP16_CONSTRUCTOR(s_ret, static_cast<uint16_t>(e_ret), m_ret);
  return ret;
}
/// @ingroup fp16_t math operator
/// @param [in] v_1 left operand value of fp16_t object
/// @param [in] v_2 right operand value of fp16_t object
/// @brief Performing fp16_t addition
/// @return Return fp16_t result of adding v_1 and v_2
static uint16_t Fp16Add(uint16_t v_1, uint16_t v_2) {
  uint16_t s_a;
  uint16_t s_b;
  int16_t e_a;
  int16_t e_b;
  uint32_t m_a;
  uint32_t m_b;
  uint16_t m_a_tmp;
  uint16_t m_b_tmp;
  uint16_t shift_out = 0;
  // 1.Extract
  ExtractFp16(v_1, s_a, e_a, m_a_tmp);
  ExtractFp16(v_2, s_b, e_b, m_b_tmp);
  m_a = m_a_tmp;
  m_b = m_b_tmp;
  uint16_t sum;
  uint16_t s_ret;
  if (s_a != s_b) {
    ReverseMan(s_a > 0, m_a);
    ReverseMan(s_b > 0, m_b);
    sum = static_cast<uint16_t>(GetManSum(e_a, m_a, e_b, m_b));
    s_ret = (sum & kFp16SignMask) >> kFp16SignIndex;
    ReverseMan(s_ret > 0, m_a);
    ReverseMan(s_ret > 0, m_b);
  } else {
    sum = static_cast<uint16_t>(GetManSum(e_a, m_a, e_b, m_b));
    s_ret = s_a;
  }
  if (sum == 0) {
    shift_out = 3;  // shift to left 3 bits
    m_a = m_a << shift_out;
    m_b = m_b << shift_out;
  }
  uint32_t m_trunc = 0;
  int16_t e_ret = std::max(e_a, e_b);
  int16_t e_tmp = std::abs(e_a - e_b);
  if (e_a > e_b) {
    m_trunc = (m_b << (kBitShift32 - static_cast<uint16_t>(e_tmp)));
    m_b = RightShift(m_b, e_tmp);
  } else if (e_a < e_b) {
    m_trunc = (m_a << (kBitShift32 - static_cast<uint16_t>(e_tmp)));
    m_a = RightShift(m_a, e_tmp);
  }
  // calculate mantissa
  auto m_ret = static_cast<uint16_t>(m_a + m_b);
  return Fp16AddCalVal(s_ret, e_ret, m_ret, m_trunc, shift_out);
}
/// @ingroup fp16_t math operator
/// @param [in] v_1 left operand value of fp16_t object
/// @param [in] v_2 right operand value of fp16_t object
/// @brief Performing fp16_t subtraction
/// @return Return fp16_t result of subtracting v_2 from v_1
static uint16_t Fp16Sub(uint16_t v_1, uint16_t v_2) {
  // Reverse the sign of v_2
  uint16_t tmp = ((~(v_2)) & kFp16SignMask) | (v_2 & kFp16AbsMax);
  return Fp16Add(v_1, tmp);
}
/// @ingroup fp16_t math operator
/// @param [in] v_1 left operand value of fp16_t object
/// @param [in] v_2 right operand value of fp16_t object
/// @brief Performing fp16_t multiplication
/// @return Return fp16_t result of multiplying v_1 and v_2
static uint16_t Fp16Mul(uint16_t v_1, uint16_t v_2) {
  uint16_t s_a, s_b;
  int16_t e_a, e_b;
  uint32_t m_a, m_b;
  uint16_t s_ret, m_ret;
  int16_t e_ret;
  uint32_t mul_m;
  uint16_t m_a_tmp, m_b_tmp;
  // 1.Extract
  ExtractFp16(v_1, s_a, e_a, m_a_tmp);
  ExtractFp16(v_2, s_b, e_b, m_b_tmp);
  m_a = m_a_tmp;
  m_b = m_b_tmp;
  e_ret = e_a + e_b - kFp16ExpBias - kDim10;
  mul_m = m_a * m_b;
  s_ret = s_a ^ s_b;
  uint32_t m_min = kFp16ManHideBit;
  uint32_t m_max = m_min << 1;
  uint32_t m_trunc = 0;
  // mul_m should not be smaller than m_min (kFp16ManHideBit)
  while (mul_m < m_min && e_ret > 1) {
    mul_m = mul_m << 1;
    e_ret = e_ret - 1;
  }
  while (mul_m >= m_max || e_ret < 1) {
    m_trunc = m_trunc >> 1;
    m_trunc = m_trunc | (kFp32SignMask * (mul_m & 1));
    mul_m = mul_m >> 1;
    e_ret = e_ret + 1;
  }
  bool b_last_bit = ((mul_m & 1) > 0);
  bool b_trunc_high = false;
  bool b_trunc_left = false;
  b_trunc_high = (kRoundToNearest == g_round_mode) && ((m_trunc & kFp32SignMask) > 0);
  b_trunc_left = (kRoundToNearest == g_round_mode) && ((m_trunc & kFp32AbsMax) > 0);
  mul_m = ManRoundToNearest(b_last_bit, b_trunc_high, b_trunc_left, mul_m);
  while (mul_m >= m_max || e_ret < 0) {
    mul_m = mul_m >> 1;
    e_ret = e_ret + 1;
  }
  if (e_ret == 1 && mul_m < kFp16ManHideBit) {
    e_ret = 0;
  }
  m_ret = static_cast<uint16_t>(mul_m);
  Fp16Normalize(e_ret, m_ret);
  uint16_t ret = FP16_CONSTRUCTOR(s_ret, static_cast<uint16_t>(e_ret), m_ret);
  return ret;
}
/// @ingroup fp16_t math operator
/// @param [in] v_1 left operand value of fp16_t object
/// @param [in] v_2 right operand value of fp16_t object
/// @brief Performing fp16_t division
/// @return Return fp16_t result of dividing v_1 by v_2
static uint16_t Fp16Div(uint16_t v_1, uint16_t v_2) {
  uint16_t ret;
  if (FP16_IS_ZERO(v_2)) {  // result is inf
    // throw "fp16_t division by zero.";
    uint16_t s_a, s_b;
    uint16_t s_ret;
    s_a = FP16_EXTRAC_SIGN(v_1);
    s_b = FP16_EXTRAC_SIGN(v_2);
    s_ret = s_a ^ s_b;
    ret = FP16_CONSTRUCTOR(s_ret, kFp16MaxExp, 0u);
  } else if (FP16_IS_ZERO(v_1)) {
    ret = 0u;
  } else {
    uint16_t s_a, s_b;
    int16_t e_a, e_b;
    uint64_t m_a, m_b;
    float m_div;
    uint16_t m_a_tmp, m_b_tmp;
    // 1.Extract
    ExtractFp16(v_1, s_a, e_a, m_a_tmp);
    ExtractFp16(v_2, s_b, e_b, m_b_tmp);
    m_a = m_a_tmp;
    m_b = m_b_tmp;
    uint64_t m_tmp;
    if (e_a > e_b) {
      m_tmp = m_a;
      uint16_t tmp;
      tmp = e_a - e_b;
      for (int i = 0; i < tmp; i++) {
        m_tmp = m_tmp << 1;
      }
      m_a = m_tmp;
    } else if (e_a < e_b) {
      m_tmp = m_b;
      uint16_t tmp = e_b - e_a;
      for (int i = 0; i < tmp; i++) {
        m_tmp = m_tmp << 1;
      }
      m_b = m_tmp;
    }
    m_div = static_cast<float>(m_a * 1.0f / m_b);
    fp16_t fp_div;
    fp_div = m_div;
    ret = fp_div.val;
    if (s_a != s_b) {
      ret |= kFp16SignMask;
    }
  }
  return ret;
}
// operate
fp16_t fp16_t::operator+(const fp16_t fp) {
  uint16_t ret_val = Fp16Add(val, fp.val);
  fp16_t ret(ret_val);
  return ret;
}
fp16_t fp16_t::operator-(const fp16_t fp) {
  uint16_t ret_val = Fp16Sub(val, fp.val);
  fp16_t ret(ret_val);
  return ret;
}
fp16_t fp16_t::operator*(const fp16_t fp) {
  uint16_t ret_val = Fp16Mul(val, fp.val);
  fp16_t ret(ret_val);
  return ret;
}
fp16_t fp16_t::operator/(const fp16_t fp) {
  uint16_t ret_val = Fp16Div(val, fp.val);
  fp16_t ret(ret_val);
  return ret;
}
fp16_t fp16_t::operator+=(const fp16_t fp) {
  val = Fp16Add(val, fp.val);
  return *this;
}
fp16_t fp16_t::operator-=(const fp16_t fp) {
  val = Fp16Sub(val, fp.val);
  return *this;
}
fp16_t fp16_t::operator*=(const fp16_t fp) {
  val = Fp16Mul(val, fp.val);
  return *this;
}
fp16_t fp16_t::operator/=(const fp16_t fp) {
  val = Fp16Div(val, fp.val);
  return *this;
}
// compare
bool fp16_t::operator==(const fp16_t &fp) const {
  bool result = true;
  if (FP16_IS_ZERO(val) && FP16_IS_ZERO(fp.val)) {
    result = true;
  } else {
    result = ((val & kBitLen16Max) == (fp.val & kBitLen16Max));  // bit compare
  }
  return result;
}
bool fp16_t::operator!=(const fp16_t &fp) const {
  bool result = true;
  if (FP16_IS_ZERO(val) && FP16_IS_ZERO(fp.val)) {
    result = false;
  } else {
    result = ((val & kBitLen16Max) != (fp.val & kBitLen16Max));  // bit compare
  }
  return result;
}
bool fp16_t::operator>(const fp16_t &fp) const {
  uint16_t s_a, s_b;
  uint16_t e_a, e_b;
  uint16_t m_a, m_b;
  bool result = true;
  // 1.Extract
  s_a = FP16_EXTRAC_SIGN(val);
  s_b = FP16_EXTRAC_SIGN(fp.val);
  e_a = FP16_EXTRAC_EXP(val);
  e_b = FP16_EXTRAC_EXP(fp.val);
  m_a = FP16_EXTRAC_MAN(val);
  m_b = FP16_EXTRAC_MAN(fp.val);
  // Compare
  if ((s_a == 0) && (s_b > 0)) {  // + -
    // -0=0
    result = !(FP16_IS_ZERO(val) && FP16_IS_ZERO(fp.val));
  } else if ((s_a == 0) && (s_b == 0)) {  // + +
    if (e_a > e_b) {  // e_a - e_b >= 1; Va always larger than Vb
      result = true;
    } else if (e_a == e_b) {
      result = m_a > m_b;
    } else {
      result = false;
    }
  } else if ((s_a > 0) && (s_b > 0)) {  // - -  opposite to + +
    if (e_a < e_b) {
      result = true;
    } else if (e_a == e_b) {
      result = m_a < m_b;
    } else {
      result = false;
    }
  } else {  // - +
    result = false;
  }
  return result;
}
bool fp16_t::operator>=(const fp16_t &fp) const {
  bool result = true;
  if ((*this) > fp) {
    result = true;
  } else if ((*this) == fp) {
    result = true;
  } else {
    result = false;
  }
  return result;
}
bool fp16_t::operator<(const fp16_t &fp) const {
  bool result = true;
  if ((*this) >= fp) {
    result = false;
  } else {
    result = true;
  }
  return result;
}
bool fp16_t::operator<=(const fp16_t &fp) const {
  bool result = true;
  if ((*this) > fp) {
    result = false;
  } else {
    result = true;
  }
  return result;
}
// evaluation
fp16_t &fp16_t::operator=(const fp16_t &fp) {
  if (&fp == this) {
    return *this;
  }
  val = fp.val;
  return *this;
}
fp16_t &fp16_t::operator=(const float &f_val) {
  uint16_t s_ret, m_ret;
  int16_t e_ret;
  uint32_t e_f, m_f;
  const uint32_t ui32_v = *(reinterpret_cast<const uint32_t *>(&f_val));  // 1:8:23bit sign:exp:man
  uint32_t m_len_delta;
  s_ret = static_cast<uint16_t>((ui32_v & kFp32SignMask) >> kFp32SignIndex);  // 4Byte->2Byte
  e_f = (ui32_v & kFp32ExpMask) >> kFp32ManLen;  // 8 bit exponent
  m_f = (ui32_v & kFp32ManMask);  // 23 bit mantissa, no need to care about denormals
  m_len_delta = kFp32ManLen - kFp16ManLen;
  bool need_round = false;
  // Exponent overflow/NaN converts to signed inf/NaN
  if (e_f > 0x8Fu) {  // 0x8Fu:143=127+16
    e_ret = kFp16MaxExp - 1;
    m_ret = kFp16MaxMan;
  } else if (e_f <= 0x70u) {  // 0x70u:112=127-15 Exponent underflow converts to denormalized half or signed zero
    e_ret = 0;
    if (e_f >= 0x67) {  // 0x67:103=127-24 Denormal
      m_f = (m_f | kFp32ManHideBit);
      uint16_t shift_out = kFp32ManLen;
      uint64_t m_tmp = (static_cast<uint64_t>(m_f)) << (e_f - 0x67);
      need_round = IsRoundOne(m_tmp, shift_out);
      m_ret = static_cast<uint16_t>(m_tmp >> shift_out);
      if (need_round) {
        m_ret++;
      }
    } else if (e_f == 0x66 && m_f > 0) {  // 0x66:102 Denormal 0<f_v<min(Denormal)
      m_ret = 1;
    } else {
      m_ret = 0;
    }
  } else {  // Regular case with no overflow or underflow
    e_ret = static_cast<int16_t>(e_f - 0x70u);
    need_round = IsRoundOne(m_f, static_cast<uint16_t>(m_len_delta));
    m_ret = static_cast<uint16_t>(m_f >> m_len_delta);
    if (need_round) {
      m_ret++;
    }
    if (m_ret & kFp16ManHideBit) {
      e_ret++;
    }
  }
  Fp16Normalize(e_ret, m_ret);
  val = FP16_CONSTRUCTOR(s_ret, static_cast<uint16_t>(e_ret), m_ret);
  return *this;
}
fp16_t &fp16_t::operator=(const int8_t &i_val) {
  uint16_t s_ret, e_ret, m_ret;
  s_ret = static_cast<uint16_t>(((static_cast<uint8_t>(i_val)) & 0x80) >> kDim7);
  m_ret = static_cast<uint16_t>(((static_cast<uint8_t>(i_val)) & kInt8Max));
  if (m_ret == 0 && s_ret == 0) {  // only +0 has both a zero mantissa and a zero sign; -128 has m_ret == 0 but s_ret == 1
    e_ret = 0;
  } else {
    if (s_ret) {  // negative number(<0)
      m_ret = static_cast<uint16_t>(std::abs(i_val));  // magnitude
    }
    e_ret = kFp16ManLen;
    while ((m_ret & kFp16ManHideBit) == 0) {
      m_ret = m_ret << 1;
      e_ret = e_ret - 1;
    }
    e_ret = e_ret + kFp16ExpBias;
  }
  val = FP16_CONSTRUCTOR(s_ret, e_ret, m_ret);
  return *this;
}
fp16_t &fp16_t::operator=(const uint8_t &ui_val) {
  uint16_t s_ret, e_ret, m_ret;
  s_ret = 0;
  e_ret = 0;
  m_ret = ui_val;
  if (m_ret) {
    e_ret = kFp16ManLen;
    while ((m_ret & kFp16ManHideBit) == 0) {
      m_ret = m_ret << 1;
      e_ret = e_ret - 1;
    }
    e_ret = e_ret + kFp16ExpBias;
  }
  val = FP16_CONSTRUCTOR(s_ret, e_ret, m_ret);
  return *this;
}
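/// @ingroup fp16_t static method
/// @param [in] input_val uint16_t magnitude to be converted
/// @param [in] sign sign bit of the result
/// @param [out] ret_val uint16_t binary value of the fp16_t result
/// @brief convert an unsigned 16 bit magnitude with a separate sign bit to fp16_t, rounding to nearest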
static void SetValByUint16Val(const uint16_t &input_val, const uint16_t &sign, uint16_t &ret_val) {
  uint32_t m_tmp = (input_val & kFp32AbsMax);
  uint16_t m_min = kFp16ManHideBit;
  uint16_t m_max = m_min << 1;
  uint16_t len = static_cast<uint16_t>(GetManBitLength(m_tmp));
  if (m_tmp) {
    int16_t e_ret;
    if (len > kDim11) {
      e_ret = kFp16ExpBias + kFp16ManLen;
      uint16_t e_tmp = len - kDim11;
      uint32_t trunc_mask = 1;
      for (int i = 1; i < e_tmp; i++) {
        trunc_mask = (trunc_mask << 1) + 1;
      }
      uint32_t m_trunc = (m_tmp & trunc_mask) << (kBitShift32 - e_tmp);
      for (int i = 0; i < e_tmp; i++) {
        m_tmp = (m_tmp >> 1);
        e_ret = e_ret + 1;
      }
      bool b_last_bit = ((m_tmp & 1) > 0);
      bool b_trunc_high = false;
      bool b_trunc_left = false;
      if (kRoundToNearest == g_round_mode) {  // trunc
        b_trunc_high = ((m_trunc & kFp32SignMask) > 0);
        b_trunc_left = ((m_trunc & kFp32AbsMax) > 0);
      }
      m_tmp = ManRoundToNearest(b_last_bit, b_trunc_high, b_trunc_left, m_tmp);
      while (m_tmp >= m_max || e_ret < 0) {
        m_tmp = m_tmp >> 1;
        e_ret = e_ret + 1;
      }
    } else {
      e_ret = kFp16ExpBias;
      m_tmp = m_tmp << (kManBitLength - len);
      e_ret = e_ret + (len - 1);
    }
    auto m_ret = static_cast<uint16_t>(m_tmp);
    ret_val = FP16_CONSTRUCTOR(sign, static_cast<uint16_t>(e_ret), m_ret);
  }
}
fp16_t &fp16_t::operator=(const int16_t &i_val) {
  if (i_val == 0) {
    val = 0;
  } else {
    uint16_t ui_val = *(reinterpret_cast<const uint16_t *>(&i_val));
    auto s_ret = static_cast<uint16_t>(ui_val >> kBitShift15);
    if (s_ret) {
      int16_t iValM = -i_val;
      ui_val = *(reinterpret_cast<uint16_t *>(&iValM));
    }
    SetValByUint16Val(ui_val, s_ret, val);
  }
  return *this;
}
fp16_t &fp16_t::operator=(const uint16_t &ui_val) {
  if (ui_val == 0) {
    val = 0;
  } else {
    int16_t e_ret;
    uint16_t m_ret = ui_val;
    uint16_t m_min = kFp16ManHideBit;
    uint16_t m_max = m_min << 1;
    uint16_t len = static_cast<uint16_t>(GetManBitLength(m_ret));
    if (len > kManBitLength) {
      e_ret = kFp16ExpBias + kFp16ManLen;
      uint32_t m_trunc;
      uint32_t trunc_mask = 1;
      uint16_t e_tmp = len - kManBitLength;
      for (int i = 1; i < e_tmp; i++) {
        trunc_mask = (trunc_mask << 1) + 1;
      }
      m_trunc = (m_ret & trunc_mask) << (kBitShift32 - e_tmp);
      for (int i = 0; i < e_tmp; i++) {
        m_ret = (m_ret >> 1);
        e_ret = e_ret + 1;
      }
      bool b_last_bit = ((m_ret & 1) > 0);
      bool b_trunc_high = false;
      bool b_trunc_left = false;
      if (kRoundToNearest == g_round_mode) {  // trunc
        b_trunc_high = ((m_trunc & kFp32SignMask) > 0);
        b_trunc_left = ((m_trunc & kFp32AbsMax) > 0);
      }
      m_ret = ManRoundToNearest(b_last_bit, b_trunc_high, b_trunc_left, m_ret);
      while (m_ret >= m_max || e_ret < 0) {
        m_ret = m_ret >> 1;
        e_ret = e_ret + 1;
      }
      if (FP16_IS_INVALID(val)) {
        val = kFp16Max;
      }
    } else {
      e_ret = kFp16ExpBias;
      m_ret = m_ret << (kDim11 - len);
      e_ret = e_ret + (len - 1);
    }
    val = FP16_CONSTRUCTOR(0u, static_cast<uint16_t>(e_ret), m_ret);
  }
  return *this;
}
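/// @ingroup fp16_t static method
/// @param [in] input_val uint32_t magnitude to be converted
/// @param [in] sign sign bit of the result
/// @param [out] ret_val uint16_t binary value of the fp16_t result
/// @brief convert an unsigned 32 bit magnitude with a separate sign bit to fp16_t, rounding to nearest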
static void SetValByUint32Val(const uint32_t &input_val, const uint16_t &sign, uint16_t &ret_val) {
  int16_t e_ret;
  uint32_t m_tmp = (input_val & kFp32AbsMax);
  uint32_t m_min = kFp16ManHideBit;
  uint32_t m_max = m_min << 1;
  uint16_t len = static_cast<uint16_t>(GetManBitLength(m_tmp));
  if (len > kDim11) {
    e_ret = kFp16ExpBias + kFp16ManLen;
    uint32_t m_trunc = 0;
    uint32_t trunc_mask = 1;
    uint16_t e_tmp = len - kDim11;
    for (int i = 1; i < e_tmp; i++) {
      trunc_mask = (trunc_mask << 1) + 1;
    }
    m_trunc = (m_tmp & trunc_mask) << (kBitShift32 - e_tmp);
    for (int i = 0; i < e_tmp; i++) {
      m_tmp = (m_tmp >> 1);
      e_ret = e_ret + 1;
    }
    bool b_last_bit = ((m_tmp & 1) > 0);
    bool b_trunc_high = false;
    bool b_trunc_left = false;
    if (kRoundToNearest == g_round_mode) {  // trunc
      b_trunc_high = ((m_trunc & kFp32SignMask) > 0);
      b_trunc_left = ((m_trunc & kFp32AbsMax) > 0);
    }
    m_tmp = ManRoundToNearest(b_last_bit, b_trunc_high, b_trunc_left, m_tmp);
    while (m_tmp >= m_max || e_ret < 0) {
      m_tmp = m_tmp >> 1;
      e_ret = e_ret + 1;
    }
    if (e_ret >= kFp16MaxExp) {
      e_ret = kFp16MaxExp - 1;
      m_tmp = kFp16MaxMan;
    }
  } else {
    e_ret = kFp16ExpBias;
    m_tmp = m_tmp << (kDim11 - len);
    e_ret = e_ret + (len - 1);
  }
  auto m_ret = static_cast<uint16_t>(m_tmp);
  ret_val = FP16_CONSTRUCTOR(sign, static_cast<uint16_t>(e_ret), m_ret);
}
fp16_t &fp16_t::operator=(const int32_t &i_val) {
  if (i_val == 0) {
    val = 0;
  } else {
    uint32_t ui_val = *(reinterpret_cast<const uint32_t *>(&i_val));
    auto s_ret = static_cast<uint16_t>(ui_val >> kBitShift31);
    if (s_ret) {
      int32_t iValM = -i_val;
      ui_val = *(reinterpret_cast<uint32_t *>(&iValM));
    }
    SetValByUint32Val(ui_val, s_ret, val);
  }
  return *this;
}
fp16_t &fp16_t::operator=(const uint32_t &ui_val) {
  if (ui_val == 0) {
    val = 0;
  } else {
    int16_t e_ret;
    uint32_t m_tmp = ui_val;
    uint32_t m_min = kFp16ManHideBit;
    uint32_t m_max = m_min << 1;
    uint16_t len = static_cast<uint16_t>(GetManBitLength(m_tmp));
    if (len > kDim11) {
      e_ret = kFp16ExpBias + kFp16ManLen;
      uint32_t m_trunc = 0;
      uint32_t trunc_mask = 1;
      uint16_t e_tmp = len - kDim11;
      for (int i = 1; i < e_tmp; i++) {
        trunc_mask = (trunc_mask << 1) + 1;
      }
      m_trunc = (m_tmp & trunc_mask) << static_cast<uint32_t>(kBitShift32 - e_tmp);
      for (uint16_t i = 0; i < e_tmp; i++) {
        m_tmp = (m_tmp >> 1);
        e_ret = e_ret + 1;
      }
      bool b_last_bit = ((m_tmp & 1) > 0);
      bool b_trunc_high = false;
      bool b_trunc_left = false;
      if (g_round_mode == kRoundToNearest) {  // trunc
        b_trunc_high = ((m_trunc & kFp32SignMask) > 0);
        b_trunc_left = ((m_trunc & kFp32AbsMax) > 0);
      }
      m_tmp = ManRoundToNearest(b_last_bit, b_trunc_high, b_trunc_left, m_tmp);
      while (m_tmp >= m_max || e_ret < 0) {
        m_tmp = m_tmp >> 1;
        e_ret = e_ret + 1;
      }
      if (e_ret >= kFp16MaxExp) {
        e_ret = kFp16MaxExp - 1;
        m_tmp = kFp16MaxMan;
      }
    } else {
      e_ret = kFp16ExpBias;
      m_tmp = m_tmp << (kDim11 - len);
      e_ret = e_ret + (len - 1);
    }
    auto m_ret = static_cast<uint16_t>(m_tmp);
    val = FP16_CONSTRUCTOR(0u, static_cast<uint16_t>(e_ret), m_ret);
  }
  return *this;
}
fp16_t &fp16_t::operator=(const double &d_val) {
  uint16_t s_ret;
  uint16_t m_ret;
  int16_t e_ret;
  uint64_t e_d;
  uint64_t m_d;
  uint64_t ui64_v = *(reinterpret_cast<const uint64_t *>(&d_val));  // 1:11:52bit sign:exp:man
  uint32_t m_len_delta;
  s_ret = static_cast<uint16_t>((ui64_v & kFp64SignMask) >> kFp64SignIndex);  // 8Byte->2Byte
  e_d = (ui64_v & kFp64ExpMask) >> kFp64ManLen;  // 11 bit exponent
  m_d = (ui64_v & kFp64ManMask);  // 52 bit mantissa
  m_len_delta = kFp64ManLen - kFp16ManLen;
  bool need_round = false;
  // Exponent overflow/NaN converts to signed inf/NaN
  if (e_d >= 0x410u) {  // 0x410u:1040=1023+17
    e_ret = kFp16MaxExp - 1;
    m_ret = kFp16MaxMan;
    val = FP16_CONSTRUCTOR(s_ret, static_cast<uint16_t>(e_ret), m_ret);
  } else if (e_d <= 0x3F0u) {  // Exponent underflow converts to denormalized half or signed zero
    // 0x3F0u:1008=1023-15
    // Signed zeros, denormalized floats, and floats with small
    // exponents all convert to signed zero half precision.
    e_ret = 0;
    if (e_d >= 0x3E7u) {  // 0x3E7u:999=1023-24 Denormal
      // Underflows to a denormalized value
      m_d = (kFp64ManHideBit | m_d);
      uint16_t shift_out = kFp64ManLen;
      uint64_t m_tmp = (static_cast<uint64_t>(m_d)) << (e_d - 0x3E7u);
      need_round = IsRoundOne(m_tmp, shift_out);
      m_ret = static_cast<uint16_t>(m_tmp >> shift_out);
      if (need_round) {
        m_ret++;
      }
    } else if (e_d == 0x3E6u && m_d > 0) {
      m_ret = 1;
    } else {
      m_ret = 0;
    }
  } else {  // Regular case with no overflow or underflow
    e_ret = static_cast<int16_t>(e_d - 0x3F0u);
    need_round = IsRoundOne(m_d, m_len_delta);
    m_ret = static_cast<uint16_t>(m_d >> m_len_delta);
    if (need_round) {
      m_ret++;
    }
    if (m_ret & kFp16ManHideBit) {
      e_ret++;
    }
  }
  Fp16Normalize(e_ret, m_ret);
  val = FP16_CONSTRUCTOR(s_ret, static_cast<uint16_t>(e_ret), m_ret);
  return *this;
}
// convert
fp16_t::operator float() const { return Fp16ToFloat(val); }
fp16_t::operator double() const { return Fp16ToDouble(val); }
fp16_t::operator int8_t() const { return Fp16ToInt8(val); }
fp16_t::operator uint8_t() const { return Fp16ToUInt8(val); }
fp16_t::operator int16_t() const { return Fp16ToInt16(val); }
fp16_t::operator uint16_t() const { return Fp16ToUInt16(val); }
fp16_t::operator int32_t() const { return Fp16ToInt32(val); }
fp16_t::operator uint32_t() const { return Fp16ToUInt32(val); }
// Not usable; defined only to avoid a compile error
fp16_t::operator int64_t() const { return 0; }
// Not usable; defined only to avoid a compile error
fp16_t::operator uint64_t() const { return 0; }
FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY int fp16_t::IsInf() {
  if ((val & kFp16AbsMax) == kFp16ExpMask) {
    if (val & kFp16SignMask) {
      return -1;
    } else {
      return 1;
    }
  } else {
    return 0;
  }
}
FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY float fp16_t::ToFloat() const { return Fp16ToFloat(val); }
FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY double fp16_t::ToDouble() const { return Fp16ToDouble(val); }
FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY int8_t fp16_t::ToInt8() const { return Fp16ToInt8(val); }
FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY uint8_t fp16_t::ToUInt8() const { return Fp16ToUInt8(val); }
FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY int16_t fp16_t::ToInt16() const { return Fp16ToInt16(val); }
FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY uint16_t fp16_t::ToUInt16() const { return Fp16ToUInt16(val); }
FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY int32_t fp16_t::ToInt32() const { return Fp16ToInt32(val); }
FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY uint32_t fp16_t::ToUInt32() const { return Fp16ToUInt32(val); }
}  // namespace ge

The Graph Engine (GE) module is a submodule of MindSpore implemented in C++. It sits between the front-end module ME and the underlying hardware and serves as the bridge between them. GE takes the graph delivered by ME as input, applies a series of deep graph optimizations, and finally outputs a graph that can run efficiently on the underlying hardware. GE performs optimizations tailored to the hardware architecture of the Ascend AI processor in order to fully exploit its computing power. During model training and inference, GE is invoked automatically and is transparent to the user. GE mainly consists of two parts: GE API and GE Core.
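For reference, here is a minimal usage sketch of the fp16_t interface implemented in the file above. It assumes only the operators and To* methods defined there plus a build that provides common/fp16_t.h; the values in the comments are the round-to-nearest results these routines should produce, not output taken from the project itself.

#include <iostream>
#include "common/fp16_t.h"

int main() {
  ge::fp16_t a;
  ge::fp16_t b;
  a = 1.5f;   // operator=(const float &); 1.5 is exactly representable in fp16
  b = 2.25f;  // so is 2.25, hence no rounding on assignment
  ge::fp16_t sum = a + b;              // Fp16Add under the default round-to-nearest mode
  std::cout << sum.ToFloat() << '\n';  // expected: 3.75
  std::cout << sum.ToInt32() << '\n';  // Fp16ToInt32 rounds to nearest: expected 4
  ge::fp16_t q = a / b;                // Fp16Div computes the mantissa quotient in float
  std::cout << q.ToFloat() << '\n';    // expected: ~0.666504, the fp16 value nearest to 2/3
  return 0;
}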