You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

charset_test.go 8.8 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205
  1. // Copyright 2019 The Gitea Authors. All rights reserved.
  2. // Use of this source code is governed by a MIT-style
  3. // license that can be found in the LICENSE file.
  4. package charset
  5. import (
  6. "testing"
  7. "code.gitea.io/gitea/modules/setting"
  8. "github.com/stretchr/testify/assert"
  9. )
  10. func TestRemoveBOMIfPresent(t *testing.T) {
  11. res := RemoveBOMIfPresent([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
  12. assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
  13. res = RemoveBOMIfPresent([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
  14. assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
  15. }
  16. func TestToUTF8WithErr(t *testing.T) {
  17. var res string
  18. var err error
  19. res, err = ToUTF8WithErr([]byte{0x41, 0x42, 0x43})
  20. assert.Equal(t, "ABC", res)
  21. assert.NoError(t, err)
  22. res, err = ToUTF8WithErr([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
  23. assert.Equal(t, "áéíóú", res)
  24. assert.NoError(t, err)
  25. res, err = ToUTF8WithErr([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
  26. assert.Equal(t, "áéíóú", res)
  27. assert.NoError(t, err)
  28. // This test FAILS
  29. res, err = ToUTF8WithErr([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73})
  30. assert.Equal(t, "Hola, así cómo ños", res)
  31. assert.NoError(t, err)
  32. res, err = ToUTF8WithErr([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73})
  33. // Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
  34. assert.Regexp(t, "^Hola, así cómo", res)
  35. assert.NoError(t, err)
  36. res, err = ToUTF8WithErr([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x81, 0xA4, 0x6F, 0x73})
  37. // Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
  38. assert.Regexp(t, "^Hola, así cómo", res)
  39. assert.NoError(t, err)
  40. // Japanese (Shift-JIS)
  41. res, err = ToUTF8WithErr([]byte{0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82, 0xBF, 0x82, 0xE3, 0x81, 0x42})
  42. assert.Equal(t, "日属秘ぞしちゅ。", res)
  43. assert.NoError(t, err)
  44. res, err = ToUTF8WithErr([]byte{0x00, 0x00, 0x00, 0x00})
  45. assert.Equal(t, "\x00\x00\x00\x00", res)
  46. assert.NoError(t, err)
  47. }
  48. func TestToUTF8WithFallback(t *testing.T) {
  49. // "ABC"
  50. res := ToUTF8WithFallback([]byte{0x41, 0x42, 0x43})
  51. assert.Equal(t, []byte{0x41, 0x42, 0x43}, res)
  52. // "áéíóú"
  53. res = ToUTF8WithFallback([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
  54. assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
  55. // UTF8 BOM + "áéíóú"
  56. res = ToUTF8WithFallback([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
  57. assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
  58. // "Hola, así cómo ños"
  59. res = ToUTF8WithFallback([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73})
  60. assert.Equal(t, []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xC3, 0xAD, 0x20, 0x63, 0xC3, 0xB3, 0x6D, 0x6F, 0x20, 0xC3, 0xB1, 0x6F, 0x73}, res)
  61. // "Hola, así cómo "
  62. minmatch := []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xC3, 0xAD, 0x20, 0x63, 0xC3, 0xB3, 0x6D, 0x6F, 0x20}
  63. res = ToUTF8WithFallback([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73})
  64. // Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
  65. assert.Equal(t, minmatch, res[0:len(minmatch)])
  66. res = ToUTF8WithFallback([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x81, 0xA4, 0x6F, 0x73})
  67. // Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
  68. assert.Equal(t, minmatch, res[0:len(minmatch)])
  69. // Japanese (Shift-JIS)
  70. // "日属秘ぞしちゅ。"
  71. res = ToUTF8WithFallback([]byte{0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82, 0xBF, 0x82, 0xE3, 0x81, 0x42})
  72. assert.Equal(t, []byte{0xE6, 0x97, 0xA5, 0xE5, 0xB1, 0x9E, 0xE7, 0xA7, 0x98, 0xE3,
  73. 0x81, 0x9E, 0xE3, 0x81, 0x97, 0xE3, 0x81, 0xA1, 0xE3, 0x82, 0x85, 0xE3, 0x80, 0x82}, res)
  74. res = ToUTF8WithFallback([]byte{0x00, 0x00, 0x00, 0x00})
  75. assert.Equal(t, []byte{0x00, 0x00, 0x00, 0x00}, res)
  76. }
  77. func TestToUTF8(t *testing.T) {
  78. res := ToUTF8("ABC")
  79. assert.Equal(t, "ABC", res)
  80. res = ToUTF8("áéíóú")
  81. assert.Equal(t, "áéíóú", res)
  82. // With utf-8 BOM
  83. res = ToUTF8("\ufeffáéíóú")
  84. assert.Equal(t, "áéíóú", res)
  85. res = ToUTF8("Hola, así cómo ños")
  86. assert.Equal(t, "Hola, así cómo ños", res)
  87. res = ToUTF8("Hola, así cómo \x07ños")
  88. // Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
  89. assert.Regexp(t, "^Hola, así cómo", res)
  90. // This test FAILS
  91. // res = ToUTF8("Hola, así cómo \x81ños")
  92. // Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
  93. // assert.Regexp(t, "^Hola, así cómo", res)
  94. // Japanese (Shift-JIS)
  95. res = ToUTF8("\x93\xFA\x91\xAE\x94\xE9\x82\xBC\x82\xB5\x82\xBF\x82\xE3\x81\x42")
  96. assert.Equal(t, "日属秘ぞしちゅ。", res)
  97. res = ToUTF8("\x00\x00\x00\x00")
  98. assert.Equal(t, "\x00\x00\x00\x00", res)
  99. }
  100. func TestToUTF8DropErrors(t *testing.T) {
  101. // "ABC"
  102. res := ToUTF8DropErrors([]byte{0x41, 0x42, 0x43})
  103. assert.Equal(t, []byte{0x41, 0x42, 0x43}, res)
  104. // "áéíóú"
  105. res = ToUTF8DropErrors([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
  106. assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
  107. // UTF8 BOM + "áéíóú"
  108. res = ToUTF8DropErrors([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
  109. assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
  110. // "Hola, así cómo ños"
  111. res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73})
  112. assert.Equal(t, []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xC3, 0xAD, 0x20, 0x63, 0xC3, 0xB3, 0x6D, 0x6F, 0x20, 0xC3, 0xB1, 0x6F, 0x73}, res)
  113. // "Hola, así cómo "
  114. minmatch := []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xC3, 0xAD, 0x20, 0x63, 0xC3, 0xB3, 0x6D, 0x6F, 0x20}
  115. res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73})
  116. // Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
  117. assert.Equal(t, minmatch, res[0:len(minmatch)])
  118. res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x81, 0xA4, 0x6F, 0x73})
  119. // Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
  120. assert.Equal(t, minmatch, res[0:len(minmatch)])
  121. // Japanese (Shift-JIS)
  122. // "日属秘ぞしちゅ。"
  123. res = ToUTF8DropErrors([]byte{0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82, 0xBF, 0x82, 0xE3, 0x81, 0x42})
  124. assert.Equal(t, []byte{0xE6, 0x97, 0xA5, 0xE5, 0xB1, 0x9E, 0xE7, 0xA7, 0x98, 0xE3,
  125. 0x81, 0x9E, 0xE3, 0x81, 0x97, 0xE3, 0x81, 0xA1, 0xE3, 0x82, 0x85, 0xE3, 0x80, 0x82}, res)
  126. res = ToUTF8DropErrors([]byte{0x00, 0x00, 0x00, 0x00})
  127. assert.Equal(t, []byte{0x00, 0x00, 0x00, 0x00}, res)
  128. }
  129. func TestDetectEncoding(t *testing.T) {
  130. testSuccess := func(b []byte, expected string) {
  131. encoding, err := DetectEncoding(b)
  132. assert.NoError(t, err)
  133. assert.Equal(t, expected, encoding)
  134. }
  135. // utf-8
  136. b := []byte("just some ascii")
  137. testSuccess(b, "UTF-8")
  138. // utf-8-sig: "hey" (with BOM)
  139. b = []byte{0xef, 0xbb, 0xbf, 0x68, 0x65, 0x79}
  140. testSuccess(b, "UTF-8")
  141. // utf-16: "hey<accented G>"
  142. b = []byte{0xff, 0xfe, 0x68, 0x00, 0x65, 0x00, 0x79, 0x00, 0xf4, 0x01}
  143. testSuccess(b, "UTF-16LE")
  144. // iso-8859-1: d<accented e>cor<newline>
  145. b = []byte{0x44, 0xe9, 0x63, 0x6f, 0x72, 0x0a}
  146. encoding, err := DetectEncoding(b)
  147. assert.NoError(t, err)
  148. // due to a race condition in `chardet` library, it could either detect
  149. // "ISO-8859-1" or "IS0-8859-2" here. Technically either is correct, so
  150. // we accept either.
  151. assert.Contains(t, encoding, "ISO-8859")
  152. setting.Repository.AnsiCharset = "placeholder"
  153. testSuccess(b, "placeholder")
  154. // invalid bytes
  155. b = []byte{0xfa}
  156. _, err = DetectEncoding(b)
  157. assert.Error(t, err)
  158. }