You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

StringUtils.cs 9.5 kB

10 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266
  1. /*
  2. * Copyright (C) 2010 ZXing authors
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. using System;
  17. using System.Collections.Generic;
  18. using System.Text;
  19. namespace ZXing.Common
  20. {
  /// <summary>
  /// Common string-related functions.
  /// </summary>
  /// <author>Sean Owen</author>
  /// <author>Alex Dupre</author>
  public static class StringUtils
  {
#if (WINDOWS_PHONE70 || WINDOWS_PHONE71 || WINDOWS_PHONE80 || SILVERLIGHT4 || SILVERLIGHT5 || NETFX_CORE || PORTABLE)
      private const String PLATFORM_DEFAULT_ENCODING = "UTF-8";
#else
      private static readonly String PLATFORM_DEFAULT_ENCODING = Encoding.Default.WebName;
#endif
      /// <summary>Name returned by <see cref="guessEncoding"/> when the bytes look like Shift_JIS.</summary>
      public static String SHIFT_JIS = "SJIS";
      /// <summary>Name of the GB2312 character set.</summary>
      public static String GB2312 = "GB2312";
      private const String EUC_JP = "EUC-JP";
      private const String UTF8 = "UTF-8";
      private const String ISO88591 = "ISO-8859-1";

      // Encoding.WebName reports Shift_JIS under its IANA name, never as the "SJIS" alias,
      // so the platform default has to be compared against this spelling as well.
      private const String SHIFT_JIS_WEB_NAME = "shift_jis";

      // True when the platform default encoding is a Japanese one; in that case anything
      // that can be Shift_JIS (and is not clearly UTF-8) is reported as Shift_JIS.
      private static readonly bool ASSUME_SHIFT_JIS =
          String.Equals(SHIFT_JIS, PLATFORM_DEFAULT_ENCODING, StringComparison.OrdinalIgnoreCase) ||
          String.Equals(SHIFT_JIS_WEB_NAME, PLATFORM_DEFAULT_ENCODING, StringComparison.OrdinalIgnoreCase) ||
          String.Equals(EUC_JP, PLATFORM_DEFAULT_ENCODING, StringComparison.OrdinalIgnoreCase);

      /// <summary>
      /// Guesses the encoding of a byte sequence that represents text.
      /// </summary>
      /// <param name="bytes">bytes encoding a string, whose encoding should be guessed</param>
      /// <param name="hints">decode hints if applicable; a non-null
      /// <c>DecodeHintType.CHARACTER_SET</c> entry is returned as-is without inspecting
      /// <paramref name="bytes"/></param>
      /// <returns>name of guessed encoding; at the moment will only guess one of:
      /// <see cref="SHIFT_JIS"/>, <c>UTF-8</c>, <c>ISO-8859-1</c>, or the platform
      /// default encoding if none of these can possibly be correct</returns>
      /// <exception cref="ArgumentNullException"><paramref name="bytes"/> is null and no
      /// character-set hint was supplied</exception>
      /// <exception cref="InvalidCastException">the character-set hint is not a string</exception>
      public static String guessEncoding(byte[] bytes, IDictionary<DecodeHintType, object> hints)
      {
          // An explicit character-set hint always wins over guessing.
          object hintedCharacterSet;
          if (hints != null && hints.TryGetValue(DecodeHintType.CHARACTER_SET, out hintedCharacterSet))
          {
              String characterSet = (String)hintedCharacterSet;
              if (characterSet != null)
              {
                  return characterSet;
              }
          }

          if (bytes == null)
          {
              throw new ArgumentNullException("bytes");
          }

          // For now, merely tries to distinguish ISO-8859-1, UTF-8 and Shift_JIS,
          // which should be by far the most common encodings.
          int length = bytes.Length;
          bool canBeISO88591 = true;
          bool canBeShiftJIS = true;
          bool canBeUTF8 = true;
          int utf8BytesLeft = 0;
          int utf8MultiByteChars = 0;
          int sjisBytesLeft = 0;
          int sjisKatakanaChars = 0;
          int sjisCurKatakanaWordLength = 0;
          int sjisCurDoubleBytesWordLength = 0;
          int sjisMaxKatakanaWordLength = 0;
          int sjisMaxDoubleBytesWordLength = 0;
          int isoHighOther = 0;

          // The BOM is exactly three bytes long, so a three-byte input can already carry it.
          bool utf8bom = length >= 3 &&
                         bytes[0] == 0xEF &&
                         bytes[1] == 0xBB &&
                         bytes[2] == 0xBF;

          // Stop scanning as soon as every candidate encoding has been ruled out.
          for (int i = 0;
               i < length && (canBeISO88591 || canBeShiftJIS || canBeUTF8);
               i++)
          {
              int value = bytes[i];

              // UTF-8 stuff
              if (canBeUTF8)
              {
                  if (utf8BytesLeft > 0)
                  {
                      // A continuation byte must have the form 10xxxxxx; testing only the
                      // high bit would also accept lead bytes (11xxxxxx) here.
                      if ((value & 0xC0) != 0x80)
                      {
                          canBeUTF8 = false;
                      }
                      else
                      {
                          utf8BytesLeft--;
                      }
                  }
                  else if ((value & 0x80) != 0)
                  {
                      // Lead byte: the number of leading one bits gives the sequence length.
                      if ((value & 0xE0) == 0xC0)
                      {
                          utf8BytesLeft = 1;
                          utf8MultiByteChars++;
                      }
                      else if ((value & 0xF0) == 0xE0)
                      {
                          utf8BytesLeft = 2;
                          utf8MultiByteChars++;
                      }
                      else if ((value & 0xF8) == 0xF0)
                      {
                          utf8BytesLeft = 3;
                          utf8MultiByteChars++;
                      }
                      else
                      {
                          // Either a stray continuation byte (10xxxxxx) or 11111xxx.
                          canBeUTF8 = false;
                      }
                  }
              }

              // ISO-8859-1 stuff
              if (canBeISO88591)
              {
                  if (value > 0x7F && value < 0xA0)
                  {
                      // 0x80-0x9F rules ISO-8859-1 out.
                      canBeISO88591 = false;
                  }
                  else if (value > 0x9F && (value < 0xC0 || value == 0xD7 || value == 0xF7))
                  {
                      // High byte that is not a Latin-1 letter (symbols, multiplication/division sign).
                      isoHighOther++;
                  }
              }

              // Shift_JIS stuff
              if (canBeShiftJIS)
              {
                  if (sjisBytesLeft > 0)
                  {
                      // Second byte of a double-byte character.
                      if (value < 0x40 || value == 0x7F || value > 0xFC)
                      {
                          canBeShiftJIS = false;
                      }
                      else
                      {
                          sjisBytesLeft--;
                      }
                  }
                  else if (value == 0x80 || value == 0xA0 || value > 0xEF)
                  {
                      canBeShiftJIS = false;
                  }
                  else if (value > 0xA0 && value < 0xE0)
                  {
                      // Single-byte (half-width) katakana.
                      sjisKatakanaChars++;
                      sjisCurDoubleBytesWordLength = 0;
                      sjisCurKatakanaWordLength++;
                      sjisMaxKatakanaWordLength = Math.Max(sjisMaxKatakanaWordLength, sjisCurKatakanaWordLength);
                  }
                  else if (value > 0x7F)
                  {
                      // First byte of a double-byte character.
                      sjisBytesLeft++;
                      sjisCurKatakanaWordLength = 0;
                      sjisCurDoubleBytesWordLength++;
                      sjisMaxDoubleBytesWordLength = Math.Max(sjisMaxDoubleBytesWordLength, sjisCurDoubleBytesWordLength);
                  }
                  else
                  {
                      // Plain ASCII ends any run of katakana or double-byte characters.
                      sjisCurKatakanaWordLength = 0;
                      sjisCurDoubleBytesWordLength = 0;
                  }
              }
          }

          // A multi-byte character truncated at the end of input invalidates the candidate.
          if (canBeUTF8 && utf8BytesLeft > 0)
          {
              canBeUTF8 = false;
          }
          if (canBeShiftJIS && sjisBytesLeft > 0)
          {
              canBeShiftJIS = false;
          }

          // Easy -- if there is BOM or at least 1 valid not-single byte character (and no evidence it can't be UTF-8), done
          if (canBeUTF8 && (utf8bom || utf8MultiByteChars > 0))
          {
              return UTF8;
          }

          // Easy -- if assuming Shift_JIS or at least 3 valid consecutive not-ascii characters (and no evidence it can't be), done
          if (canBeShiftJIS && (ASSUME_SHIFT_JIS || sjisMaxKatakanaWordLength >= 3 || sjisMaxDoubleBytesWordLength >= 3))
          {
              return SHIFT_JIS;
          }

          // Distinguishing Shift_JIS and ISO-8859-1 can be a little tough for short words. The crude heuristic is:
          // - if we saw
          //   - exactly two katakana chars in the whole text and they were consecutive, or
          //   - at least 10% of bytes that could be "upper" not-alphanumeric Latin1,
          // - then we conclude Shift_JIS, else ISO-8859-1
          if (canBeISO88591 && canBeShiftJIS)
          {
              bool looksLikeShiftJIS =
                  (sjisMaxKatakanaWordLength == 2 && sjisKatakanaChars == 2) ||
                  isoHighOther * 10 >= length;
              return looksLikeShiftJIS ? SHIFT_JIS : ISO88591;
          }

          // Otherwise, try in order ISO-8859-1, Shift JIS, UTF-8 and fall back to default platform encoding
          if (canBeISO88591)
          {
              return ISO88591;
          }
          if (canBeShiftJIS)
          {
              return SHIFT_JIS;
          }
          if (canBeUTF8)
          {
              return UTF8;
          }

          // Otherwise, we take a wild guess with platform encoding
          return PLATFORM_DEFAULT_ENCODING;
      }
  }
  253. }