@@ -33,32 +33,32 @@ var (
     ISO8859_8I encoding.Encoding = &iso8859_8I
     iso8859_6E = internal.Encoding{
-        ISO8859_6,
-        "ISO-8859-6E",
-        identifier.ISO88596E,
+        Encoding: ISO8859_6,
+        Name: "ISO-8859-6E",
+        MIB: identifier.ISO88596E,
     }
     iso8859_6I = internal.Encoding{
-        ISO8859_6,
-        "ISO-8859-6I",
-        identifier.ISO88596I,
+        Encoding: ISO8859_6,
+        Name: "ISO-8859-6I",
+        MIB: identifier.ISO88596I,
     }
     iso8859_8E = internal.Encoding{
-        ISO8859_8,
-        "ISO-8859-8E",
-        identifier.ISO88598E,
+        Encoding: ISO8859_8,
+        Name: "ISO-8859-8E",
+        MIB: identifier.ISO88598E,
     }
     iso8859_8I = internal.Encoding{
-        ISO8859_8,
-        "ISO-8859-8I",
-        identifier.ISO88598I,
+        Encoding: ISO8859_8,
+        Name: "ISO-8859-8I",
+        MIB: identifier.ISO88598I,
     }
 )
 // All is a list of all defined encodings in this package.
-var All = listAll
+var All []encoding.Encoding = listAll
 // TODO: implement these encodings, in order of importance.
 // ASCII, ISO8859_1: Rather common. Close to Windows 1252.

@@ -70,8 +70,8 @@ type utf8Enc struct {
     data [3]byte
 }
-// charmap describes an 8-bit character set encoding.
-type charmap struct {
+// Charmap is an 8-bit character set encoding.
+type Charmap struct {
     // name is the encoding's name.
     name string
     // mib is the encoding type of this encoder.

@@ -79,7 +79,7 @@ type charmap struct {
     // asciiSuperset states whether the encoding is a superset of ASCII.
     asciiSuperset bool
     // low is the lower bound of the encoded byte for a non-ASCII rune. If
-    // charmap.asciiSuperset is true then this will be 0x80, otherwise 0x00.
+    // Charmap.asciiSuperset is true then this will be 0x80, otherwise 0x00.
     low uint8
     // replacement is the encoded replacement character.
     replacement byte

@@ -91,26 +91,30 @@ type charmap struct {
     encode [256]uint32
 }
-func (m *charmap) NewDecoder() *encoding.Decoder {
+// NewDecoder implements the encoding.Encoding interface.
+func (m *Charmap) NewDecoder() *encoding.Decoder {
     return &encoding.Decoder{Transformer: charmapDecoder{charmap: m}}
 }
-func (m *charmap) NewEncoder() *encoding.Encoder {
+// NewEncoder implements the encoding.Encoding interface.
+func (m *Charmap) NewEncoder() *encoding.Encoder {
     return &encoding.Encoder{Transformer: charmapEncoder{charmap: m}}
 }
-func (m *charmap) String() string {
+// String returns the Charmap's name.
+func (m *Charmap) String() string {
     return m.name
 }
-func (m *charmap) ID() (mib identifier.MIB, other string) {
+// ID implements an internal interface.
+func (m *Charmap) ID() (mib identifier.MIB, other string) {
     return m.mib, ""
 }
 // charmapDecoder implements transform.Transformer by decoding to UTF-8.
 type charmapDecoder struct {
     transform.NopResetter
-    charmap *charmap
+    charmap *Charmap
 }
 func (m charmapDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {

@@ -142,10 +146,22 @@ func (m charmapDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int,
     return nDst, nSrc, err
 }
+// DecodeByte returns the Charmap's rune decoding of the byte b.
+func (m *Charmap) DecodeByte(b byte) rune {
+    switch x := &m.decode[b]; x.len {
+    case 1:
+        return rune(x.data[0])
+    case 2:
+        return rune(x.data[0]&0x1f)<<6 | rune(x.data[1]&0x3f)
+    default:
+        return rune(x.data[0]&0x0f)<<12 | rune(x.data[1]&0x3f)<<6 | rune(x.data[2]&0x3f)
+    }
+}
 // charmapEncoder implements transform.Transformer by encoding from UTF-8.
 type charmapEncoder struct {
     transform.NopResetter
-    charmap *charmap
+    charmap *Charmap
 }
 func (m charmapEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
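The DecodeByte method added above gives callers direct, allocation-free access to the decode table without going through a Decoder. As a rough illustration of how it might be used from another package (the package name charmaputil and the helper DecodeAll are made up for this sketch and are not part of the change):

    package charmaputil

    import "golang.org/x/text/encoding/charmap"

    // DecodeAll maps every byte of src to a rune with one DecodeByte lookup
    // per byte, mirroring what the charmapDecoder transformer does internally.
    func DecodeAll(cm *charmap.Charmap, src []byte) string {
        rs := make([]rune, 0, len(src))
        for _, b := range src {
            rs = append(rs, cm.DecodeByte(b))
        }
        return string(rs)
    }

The sketch takes a *charmap.Charmap parameter because how a caller obtains one depends on how the package's exported variables are declared in a given release.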
@@ -207,3 +223,27 @@ loop:
     }
     return nDst, nSrc, err
 }
+// EncodeRune returns the Charmap's byte encoding of the rune r. ok is whether
+// r is in the Charmap's repertoire. If not, b is set to the Charmap's
+// replacement byte. This is often the ASCII substitute character '\x1a'.
+func (m *Charmap) EncodeRune(r rune) (b byte, ok bool) {
+    if r < utf8.RuneSelf && m.asciiSuperset {
+        return byte(r), true
+    }
+    for low, high := int(m.low), 0x100; ; {
+        if low >= high {
+            return m.replacement, false
+        }
+        mid := (low + high) / 2
+        got := m.encode[mid]
+        gotRune := rune(got & (1<<24 - 1))
+        if gotRune < r {
+            low = mid + 1
+        } else if gotRune > r {
+            high = mid
+        } else {
+            return byte(got >> 24), true
+        }
+    }
+}
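EncodeRune is the inverse lookup: an ASCII fast path when asciiSuperset is set, then a binary search over encode, whose entries pack the encoded byte into the top 8 bits and the rune into the low 24 bits. A hedged sketch of using it to encode a whole string (again, the helper name and package are illustrative only):

    package charmaputil

    import "golang.org/x/text/encoding/charmap"

    // EncodeString converts s rune-by-rune with EncodeRune. Runes outside the
    // charmap's repertoire come back as the replacement byte with ok == false;
    // this sketch keeps the replacement byte and just counts the misses.
    func EncodeString(cm *charmap.Charmap, s string) (out []byte, missing int) {
        out = make([]byte, 0, len(s))
        for _, r := range s {
            b, ok := cm.EncodeRune(r)
            if !ok {
                missing++
            }
            out = append(out, b)
        }
        return out, missing
    }

Starting the search at m.low means that for ASCII supersets the first 128 table entries, which hold the ASCII range already handled by the fast path, are never consulted.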
@@ -1,524 +0,0 @@ | |||||
// Copyright 2013 The Go Authors. All rights reserved. | |||||
// Use of this source code is governed by a BSD-style | |||||
// license that can be found in the LICENSE file. | |||||
// +build ignore | |||||
package main | |||||
import ( | |||||
"bufio" | |||||
"fmt" | |||||
"log" | |||||
"net/http" | |||||
"sort" | |||||
"strings" | |||||
"unicode/utf8" | |||||
"golang.org/x/text/encoding" | |||||
"golang.org/x/text/internal/gen" | |||||
) | |||||
const ascii = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" + | |||||
"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" + | |||||
` !"#$%&'()*+,-./0123456789:;<=>?` + | |||||
`@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_` + | |||||
"`abcdefghijklmnopqrstuvwxyz{|}~\u007f" | |||||
var encodings = []struct { | |||||
name string | |||||
mib string | |||||
comment string | |||||
varName string | |||||
replacement byte | |||||
mapping string | |||||
}{ | |||||
{ | |||||
"IBM Code Page 437", | |||||
"PC8CodePage437", | |||||
"", | |||||
"CodePage437", | |||||
encoding.ASCIISub, | |||||
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM437-2.1.2.ucm", | |||||
}, | |||||
{ | |||||
"IBM Code Page 850", | |||||
"PC850Multilingual", | |||||
"", | |||||
"CodePage850", | |||||
encoding.ASCIISub, | |||||
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM850-2.1.2.ucm", | |||||
}, | |||||
{ | |||||
"IBM Code Page 852", | |||||
"PCp852", | |||||
"", | |||||
"CodePage852", | |||||
encoding.ASCIISub, | |||||
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM852-2.1.2.ucm", | |||||
}, | |||||
{ | |||||
"IBM Code Page 855", | |||||
"IBM855", | |||||
"", | |||||
"CodePage855", | |||||
encoding.ASCIISub, | |||||
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM855-2.1.2.ucm", | |||||
}, | |||||
{ | |||||
"Windows Code Page 858", // PC latin1 with Euro | |||||
"IBM00858", | |||||
"", | |||||
"CodePage858", | |||||
encoding.ASCIISub, | |||||
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/windows-858-2000.ucm", | |||||
}, | |||||
{ | |||||
"IBM Code Page 860", | |||||
"IBM860", | |||||
"", | |||||
"CodePage860", | |||||
encoding.ASCIISub, | |||||
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM860-2.1.2.ucm", | |||||
}, | |||||
{ | |||||
"IBM Code Page 862", | |||||
"PC862LatinHebrew", | |||||
"", | |||||
"CodePage862", | |||||
encoding.ASCIISub, | |||||
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM862-2.1.2.ucm", | |||||
}, | |||||
{ | |||||
"IBM Code Page 863", | |||||
"IBM863", | |||||
"", | |||||
"CodePage863", | |||||
encoding.ASCIISub, | |||||
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM863-2.1.2.ucm", | |||||
}, | |||||
{ | |||||
"IBM Code Page 865", | |||||
"IBM865", | |||||
"", | |||||
"CodePage865", | |||||
encoding.ASCIISub, | |||||
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM865-2.1.2.ucm", | |||||
}, | |||||
{ | |||||
"IBM Code Page 866", | |||||
"IBM866", | |||||
"", | |||||
"CodePage866", | |||||
encoding.ASCIISub, | |||||
"http://encoding.spec.whatwg.org/index-ibm866.txt", | |||||
}, | |||||
{ | |||||
"ISO 8859-1", | |||||
"ISOLatin1", | |||||
"", | |||||
"ISO8859_1", | |||||
encoding.ASCIISub, | |||||
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/iso-8859_1-1998.ucm", | |||||
}, | |||||
{ | |||||
"ISO 8859-2", | |||||
"ISOLatin2", | |||||
"", | |||||
"ISO8859_2", | |||||
encoding.ASCIISub, | |||||
"http://encoding.spec.whatwg.org/index-iso-8859-2.txt", | |||||
}, | |||||
{ | |||||
"ISO 8859-3", | |||||
"ISOLatin3", | |||||
"", | |||||
"ISO8859_3", | |||||
encoding.ASCIISub, | |||||
"http://encoding.spec.whatwg.org/index-iso-8859-3.txt", | |||||
}, | |||||
{ | |||||
"ISO 8859-4", | |||||
"ISOLatin4", | |||||
"", | |||||
"ISO8859_4", | |||||
encoding.ASCIISub, | |||||
"http://encoding.spec.whatwg.org/index-iso-8859-4.txt", | |||||
}, | |||||
{ | |||||
"ISO 8859-5", | |||||
"ISOLatinCyrillic", | |||||
"", | |||||
"ISO8859_5", | |||||
encoding.ASCIISub, | |||||
"http://encoding.spec.whatwg.org/index-iso-8859-5.txt", | |||||
}, | |||||
{ | |||||
"ISO 8859-6", | |||||
"ISOLatinArabic", | |||||
"", | |||||
"ISO8859_6,ISO8859_6E,ISO8859_6I", | |||||
encoding.ASCIISub, | |||||
"http://encoding.spec.whatwg.org/index-iso-8859-6.txt", | |||||
}, | |||||
{ | |||||
"ISO 8859-7", | |||||
"ISOLatinGreek", | |||||
"", | |||||
"ISO8859_7", | |||||
encoding.ASCIISub, | |||||
"http://encoding.spec.whatwg.org/index-iso-8859-7.txt", | |||||
}, | |||||
{ | |||||
"ISO 8859-8", | |||||
"ISOLatinHebrew", | |||||
"", | |||||
"ISO8859_8,ISO8859_8E,ISO8859_8I", | |||||
encoding.ASCIISub, | |||||
"http://encoding.spec.whatwg.org/index-iso-8859-8.txt", | |||||
}, | |||||
{ | |||||
"ISO 8859-10", | |||||
"ISOLatin6", | |||||
"", | |||||
"ISO8859_10", | |||||
encoding.ASCIISub, | |||||
"http://encoding.spec.whatwg.org/index-iso-8859-10.txt", | |||||
}, | |||||
{ | |||||
"ISO 8859-13", | |||||
"ISO885913", | |||||
"", | |||||
"ISO8859_13", | |||||
encoding.ASCIISub, | |||||
"http://encoding.spec.whatwg.org/index-iso-8859-13.txt", | |||||
}, | |||||
{ | |||||
"ISO 8859-14", | |||||
"ISO885914", | |||||
"", | |||||
"ISO8859_14", | |||||
encoding.ASCIISub, | |||||
"http://encoding.spec.whatwg.org/index-iso-8859-14.txt", | |||||
}, | |||||
{ | |||||
"ISO 8859-15", | |||||
"ISO885915", | |||||
"", | |||||
"ISO8859_15", | |||||
encoding.ASCIISub, | |||||
"http://encoding.spec.whatwg.org/index-iso-8859-15.txt", | |||||
}, | |||||
{ | |||||
"ISO 8859-16", | |||||
"ISO885916", | |||||
"", | |||||
"ISO8859_16", | |||||
encoding.ASCIISub, | |||||
"http://encoding.spec.whatwg.org/index-iso-8859-16.txt", | |||||
}, | |||||
{ | |||||
"KOI8-R", | |||||
"KOI8R", | |||||
"", | |||||
"KOI8R", | |||||
encoding.ASCIISub, | |||||
"http://encoding.spec.whatwg.org/index-koi8-r.txt", | |||||
}, | |||||
{ | |||||
"KOI8-U", | |||||
"KOI8U", | |||||
"", | |||||
"KOI8U", | |||||
encoding.ASCIISub, | |||||
"http://encoding.spec.whatwg.org/index-koi8-u.txt", | |||||
}, | |||||
{ | |||||
"Macintosh", | |||||
"Macintosh", | |||||
"", | |||||
"Macintosh", | |||||
encoding.ASCIISub, | |||||
"http://encoding.spec.whatwg.org/index-macintosh.txt", | |||||
}, | |||||
{ | |||||
"Macintosh Cyrillic", | |||||
"MacintoshCyrillic", | |||||
"", | |||||
"MacintoshCyrillic", | |||||
encoding.ASCIISub, | |||||
"http://encoding.spec.whatwg.org/index-x-mac-cyrillic.txt", | |||||
}, | |||||
{ | |||||
"Windows 874", | |||||
"Windows874", | |||||
"", | |||||
"Windows874", | |||||
encoding.ASCIISub, | |||||
"http://encoding.spec.whatwg.org/index-windows-874.txt", | |||||
}, | |||||
{ | |||||
"Windows 1250", | |||||
"Windows1250", | |||||
"", | |||||
"Windows1250", | |||||
encoding.ASCIISub, | |||||
"http://encoding.spec.whatwg.org/index-windows-1250.txt", | |||||
}, | |||||
{ | |||||
"Windows 1251", | |||||
"Windows1251", | |||||
"", | |||||
"Windows1251", | |||||
encoding.ASCIISub, | |||||
"http://encoding.spec.whatwg.org/index-windows-1251.txt", | |||||
}, | |||||
{ | |||||
"Windows 1252", | |||||
"Windows1252", | |||||
"", | |||||
"Windows1252", | |||||
encoding.ASCIISub, | |||||
"http://encoding.spec.whatwg.org/index-windows-1252.txt", | |||||
}, | |||||
{ | |||||
"Windows 1253", | |||||
"Windows1253", | |||||
"", | |||||
"Windows1253", | |||||
encoding.ASCIISub, | |||||
"http://encoding.spec.whatwg.org/index-windows-1253.txt", | |||||
}, | |||||
{ | |||||
"Windows 1254", | |||||
"Windows1254", | |||||
"", | |||||
"Windows1254", | |||||
encoding.ASCIISub, | |||||
"http://encoding.spec.whatwg.org/index-windows-1254.txt", | |||||
}, | |||||
{ | |||||
"Windows 1255", | |||||
"Windows1255", | |||||
"", | |||||
"Windows1255", | |||||
encoding.ASCIISub, | |||||
"http://encoding.spec.whatwg.org/index-windows-1255.txt", | |||||
}, | |||||
{ | |||||
"Windows 1256", | |||||
"Windows1256", | |||||
"", | |||||
"Windows1256", | |||||
encoding.ASCIISub, | |||||
"http://encoding.spec.whatwg.org/index-windows-1256.txt", | |||||
}, | |||||
{ | |||||
"Windows 1257", | |||||
"Windows1257", | |||||
"", | |||||
"Windows1257", | |||||
encoding.ASCIISub, | |||||
"http://encoding.spec.whatwg.org/index-windows-1257.txt", | |||||
}, | |||||
{ | |||||
"Windows 1258", | |||||
"Windows1258", | |||||
"", | |||||
"Windows1258", | |||||
encoding.ASCIISub, | |||||
"http://encoding.spec.whatwg.org/index-windows-1258.txt", | |||||
}, | |||||
{ | |||||
"X-User-Defined", | |||||
"XUserDefined", | |||||
"It is defined at http://encoding.spec.whatwg.org/#x-user-defined", | |||||
"XUserDefined", | |||||
encoding.ASCIISub, | |||||
ascii + | |||||
"\uf780\uf781\uf782\uf783\uf784\uf785\uf786\uf787" + | |||||
"\uf788\uf789\uf78a\uf78b\uf78c\uf78d\uf78e\uf78f" + | |||||
"\uf790\uf791\uf792\uf793\uf794\uf795\uf796\uf797" + | |||||
"\uf798\uf799\uf79a\uf79b\uf79c\uf79d\uf79e\uf79f" + | |||||
"\uf7a0\uf7a1\uf7a2\uf7a3\uf7a4\uf7a5\uf7a6\uf7a7" + | |||||
"\uf7a8\uf7a9\uf7aa\uf7ab\uf7ac\uf7ad\uf7ae\uf7af" + | |||||
"\uf7b0\uf7b1\uf7b2\uf7b3\uf7b4\uf7b5\uf7b6\uf7b7" + | |||||
"\uf7b8\uf7b9\uf7ba\uf7bb\uf7bc\uf7bd\uf7be\uf7bf" + | |||||
"\uf7c0\uf7c1\uf7c2\uf7c3\uf7c4\uf7c5\uf7c6\uf7c7" + | |||||
"\uf7c8\uf7c9\uf7ca\uf7cb\uf7cc\uf7cd\uf7ce\uf7cf" + | |||||
"\uf7d0\uf7d1\uf7d2\uf7d3\uf7d4\uf7d5\uf7d6\uf7d7" + | |||||
"\uf7d8\uf7d9\uf7da\uf7db\uf7dc\uf7dd\uf7de\uf7df" + | |||||
"\uf7e0\uf7e1\uf7e2\uf7e3\uf7e4\uf7e5\uf7e6\uf7e7" + | |||||
"\uf7e8\uf7e9\uf7ea\uf7eb\uf7ec\uf7ed\uf7ee\uf7ef" + | |||||
"\uf7f0\uf7f1\uf7f2\uf7f3\uf7f4\uf7f5\uf7f6\uf7f7" + | |||||
"\uf7f8\uf7f9\uf7fa\uf7fb\uf7fc\uf7fd\uf7fe\uf7ff", | |||||
}, | |||||
} | |||||
func getWHATWG(url string) string { | |||||
res, err := http.Get(url) | |||||
if err != nil { | |||||
log.Fatalf("%q: Get: %v", url, err) | |||||
} | |||||
defer res.Body.Close() | |||||
mapping := make([]rune, 128) | |||||
for i := range mapping { | |||||
mapping[i] = '\ufffd' | |||||
} | |||||
scanner := bufio.NewScanner(res.Body) | |||||
for scanner.Scan() { | |||||
s := strings.TrimSpace(scanner.Text()) | |||||
if s == "" || s[0] == '#' { | |||||
continue | |||||
} | |||||
x, y := 0, 0 | |||||
if _, err := fmt.Sscanf(s, "%d\t0x%x", &x, &y); err != nil { | |||||
log.Fatalf("could not parse %q", s) | |||||
} | |||||
if x < 0 || 128 <= x { | |||||
log.Fatalf("code %d is out of range", x) | |||||
} | |||||
if 0x80 <= y && y < 0xa0 { | |||||
// We diverge from the WHATWG spec by mapping control characters | |||||
// in the range [0x80, 0xa0) to U+FFFD. | |||||
continue | |||||
} | |||||
mapping[x] = rune(y) | |||||
} | |||||
return ascii + string(mapping) | |||||
} | |||||
func getUCM(url string) string { | |||||
res, err := http.Get(url) | |||||
if err != nil { | |||||
log.Fatalf("%q: Get: %v", url, err) | |||||
} | |||||
defer res.Body.Close() | |||||
mapping := make([]rune, 256) | |||||
for i := range mapping { | |||||
mapping[i] = '\ufffd' | |||||
} | |||||
charsFound := 0 | |||||
scanner := bufio.NewScanner(res.Body) | |||||
for scanner.Scan() { | |||||
s := strings.TrimSpace(scanner.Text()) | |||||
if s == "" || s[0] == '#' { | |||||
continue | |||||
} | |||||
var c byte | |||||
var r rune | |||||
if _, err := fmt.Sscanf(s, `<U%x> \x%x |0`, &r, &c); err != nil { | |||||
continue | |||||
} | |||||
mapping[c] = r | |||||
charsFound++ | |||||
} | |||||
if charsFound < 200 { | |||||
log.Fatalf("%q: only %d characters found (wrong page format?)", url, charsFound) | |||||
} | |||||
return string(mapping) | |||||
} | |||||
func main() { | |||||
mibs := map[string]bool{} | |||||
all := []string{} | |||||
w := gen.NewCodeWriter() | |||||
defer w.WriteGoFile("tables.go", "charmap") | |||||
printf := func(s string, a ...interface{}) { fmt.Fprintf(w, s, a...) } | |||||
printf("import (\n") | |||||
printf("\t\"golang.org/x/text/encoding\"\n") | |||||
printf("\t\"golang.org/x/text/encoding/internal/identifier\"\n") | |||||
printf(")\n\n") | |||||
for _, e := range encodings { | |||||
varNames := strings.Split(e.varName, ",") | |||||
all = append(all, varNames...) | |||||
varName := varNames[0] | |||||
switch { | |||||
case strings.HasPrefix(e.mapping, "http://encoding.spec.whatwg.org/"): | |||||
e.mapping = getWHATWG(e.mapping) | |||||
case strings.HasPrefix(e.mapping, "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/"): | |||||
e.mapping = getUCM(e.mapping) | |||||
} | |||||
asciiSuperset, low := strings.HasPrefix(e.mapping, ascii), 0x00 | |||||
if asciiSuperset { | |||||
low = 0x80 | |||||
} | |||||
lvn := 1 | |||||
if strings.HasPrefix(varName, "ISO") || strings.HasPrefix(varName, "KOI") { | |||||
lvn = 3 | |||||
} | |||||
lowerVarName := strings.ToLower(varName[:lvn]) + varName[lvn:] | |||||
printf("// %s is the %s encoding.\n", varName, e.name) | |||||
if e.comment != "" { | |||||
printf("//\n// %s\n", e.comment) | |||||
} | |||||
printf("var %s encoding.Encoding = &%s\n\nvar %s = charmap{\nname: %q,\n", | |||||
varName, lowerVarName, lowerVarName, e.name) | |||||
if mibs[e.mib] { | |||||
log.Fatalf("MIB type %q declared multiple times.", e.mib) | |||||
} | |||||
printf("mib: identifier.%s,\n", e.mib) | |||||
printf("asciiSuperset: %t,\n", asciiSuperset) | |||||
printf("low: 0x%02x,\n", low) | |||||
printf("replacement: 0x%02x,\n", e.replacement) | |||||
printf("decode: [256]utf8Enc{\n") | |||||
i, backMapping := 0, map[rune]byte{} | |||||
for _, c := range e.mapping { | |||||
if _, ok := backMapping[c]; !ok && c != utf8.RuneError { | |||||
backMapping[c] = byte(i) | |||||
} | |||||
var buf [8]byte | |||||
n := utf8.EncodeRune(buf[:], c) | |||||
if n > 3 { | |||||
panic(fmt.Sprintf("rune %q (%U) is too long", c, c)) | |||||
} | |||||
printf("{%d,[3]byte{0x%02x,0x%02x,0x%02x}},", n, buf[0], buf[1], buf[2]) | |||||
if i%2 == 1 { | |||||
printf("\n") | |||||
} | |||||
i++ | |||||
} | |||||
printf("},\n") | |||||
printf("encode: [256]uint32{\n") | |||||
encode := make([]uint32, 0, 256) | |||||
for c, i := range backMapping { | |||||
encode = append(encode, uint32(i)<<24|uint32(c)) | |||||
} | |||||
sort.Sort(byRune(encode)) | |||||
for len(encode) < cap(encode) { | |||||
encode = append(encode, encode[len(encode)-1]) | |||||
} | |||||
for i, enc := range encode { | |||||
printf("0x%08x,", enc) | |||||
if i%8 == 7 { | |||||
printf("\n") | |||||
} | |||||
} | |||||
printf("},\n}\n") | |||||
// Add an estimate of the size of a single charmap{} struct value, which | |||||
// includes two 256 elem arrays of 4 bytes and some extra fields, which | |||||
// align to 3 uint64s on 64-bit architectures. | |||||
w.Size += 2*4*256 + 3*8 | |||||
} | |||||
// TODO: add proper line breaking. | |||||
printf("var listAll = []encoding.Encoding{\n%s,\n}\n\n", strings.Join(all, ",\n")) | |||||
} | |||||
type byRune []uint32 | |||||
func (b byRune) Len() int { return len(b) } | |||||
func (b byRune) Less(i, j int) bool { return b[i]&0xffffff < b[j]&0xffffff } | |||||
func (b byRune) Swap(i, j int) { b[i], b[j] = b[j], b[i] } |
@@ -1,167 +0,0 @@ | |||||
// Copyright 2015 The Go Authors. All rights reserved. | |||||
// Use of this source code is governed by a BSD-style | |||||
// license that can be found in the LICENSE file. | |||||
// +build ignore | |||||
package main | |||||
import ( | |||||
"bytes" | |||||
"encoding/json" | |||||
"fmt" | |||||
"log" | |||||
"strings" | |||||
"golang.org/x/text/internal/gen" | |||||
) | |||||
type group struct { | |||||
Encodings []struct { | |||||
Labels []string | |||||
Name string | |||||
} | |||||
} | |||||
func main() { | |||||
gen.Init() | |||||
r := gen.Open("http://www.w3.org/TR", "w3", "encoding/indexes/encodings.json") | |||||
var groups []group | |||||
if err := json.NewDecoder(r).Decode(&groups); err != nil { | |||||
log.Fatalf("Error reading encodings.json: %v", err) | |||||
} | |||||
w := &bytes.Buffer{} | |||||
fmt.Fprintln(w, "type htmlEncoding byte") | |||||
fmt.Fprintln(w, "const (") | |||||
for i, g := range groups { | |||||
for _, e := range g.Encodings { | |||||
name := consts[e.Name] | |||||
if name == "" { | |||||
log.Fatalf("No const defined for %s.", e.Name) | |||||
} | |||||
if i == 0 { | |||||
fmt.Fprintf(w, "%s htmlEncoding = iota\n", name) | |||||
} else { | |||||
fmt.Fprintf(w, "%s\n", name) | |||||
} | |||||
} | |||||
} | |||||
fmt.Fprintln(w, "numEncodings") | |||||
fmt.Fprint(w, ")\n\n") | |||||
fmt.Fprintln(w, "var canonical = [numEncodings]string{") | |||||
for _, g := range groups { | |||||
for _, e := range g.Encodings { | |||||
fmt.Fprintf(w, "%q,\n", e.Name) | |||||
} | |||||
} | |||||
fmt.Fprint(w, "}\n\n") | |||||
fmt.Fprintln(w, "var nameMap = map[string]htmlEncoding{") | |||||
for _, g := range groups { | |||||
for _, e := range g.Encodings { | |||||
for _, l := range e.Labels { | |||||
fmt.Fprintf(w, "%q: %s,\n", l, consts[e.Name]) | |||||
} | |||||
} | |||||
} | |||||
fmt.Fprint(w, "}\n\n") | |||||
var tags []string | |||||
fmt.Fprintln(w, "var localeMap = []htmlEncoding{") | |||||
for _, loc := range locales { | |||||
tags = append(tags, loc.tag) | |||||
fmt.Fprintf(w, "%s, // %s \n", consts[loc.name], loc.tag) | |||||
} | |||||
fmt.Fprint(w, "}\n\n") | |||||
fmt.Fprintf(w, "const locales = %q\n", strings.Join(tags, " ")) | |||||
gen.WriteGoFile("tables.go", "htmlindex", w.Bytes()) | |||||
} | |||||
// consts maps canonical encoding name to internal constant. | |||||
var consts = map[string]string{ | |||||
"utf-8": "utf8", | |||||
"ibm866": "ibm866", | |||||
"iso-8859-2": "iso8859_2", | |||||
"iso-8859-3": "iso8859_3", | |||||
"iso-8859-4": "iso8859_4", | |||||
"iso-8859-5": "iso8859_5", | |||||
"iso-8859-6": "iso8859_6", | |||||
"iso-8859-7": "iso8859_7", | |||||
"iso-8859-8": "iso8859_8", | |||||
"iso-8859-8-i": "iso8859_8I", | |||||
"iso-8859-10": "iso8859_10", | |||||
"iso-8859-13": "iso8859_13", | |||||
"iso-8859-14": "iso8859_14", | |||||
"iso-8859-15": "iso8859_15", | |||||
"iso-8859-16": "iso8859_16", | |||||
"koi8-r": "koi8r", | |||||
"koi8-u": "koi8u", | |||||
"macintosh": "macintosh", | |||||
"windows-874": "windows874", | |||||
"windows-1250": "windows1250", | |||||
"windows-1251": "windows1251", | |||||
"windows-1252": "windows1252", | |||||
"windows-1253": "windows1253", | |||||
"windows-1254": "windows1254", | |||||
"windows-1255": "windows1255", | |||||
"windows-1256": "windows1256", | |||||
"windows-1257": "windows1257", | |||||
"windows-1258": "windows1258", | |||||
"x-mac-cyrillic": "macintoshCyrillic", | |||||
"gbk": "gbk", | |||||
"gb18030": "gb18030", | |||||
// "hz-gb-2312": "hzgb2312", // Was removed from WhatWG | |||||
"big5": "big5", | |||||
"euc-jp": "eucjp", | |||||
"iso-2022-jp": "iso2022jp", | |||||
"shift_jis": "shiftJIS", | |||||
"euc-kr": "euckr", | |||||
"replacement": "replacement", | |||||
"utf-16be": "utf16be", | |||||
"utf-16le": "utf16le", | |||||
"x-user-defined": "xUserDefined", | |||||
} | |||||
// locales is taken from | |||||
// https://html.spec.whatwg.org/multipage/syntax.html#encoding-sniffing-algorithm. | |||||
var locales = []struct{ tag, name string }{ | |||||
{"und", "windows-1252"}, // The default value. | |||||
{"ar", "windows-1256"}, | |||||
{"ba", "windows-1251"}, | |||||
{"be", "windows-1251"}, | |||||
{"bg", "windows-1251"}, | |||||
{"cs", "windows-1250"}, | |||||
{"el", "iso-8859-7"}, | |||||
{"et", "windows-1257"}, | |||||
{"fa", "windows-1256"}, | |||||
{"he", "windows-1255"}, | |||||
{"hr", "windows-1250"}, | |||||
{"hu", "iso-8859-2"}, | |||||
{"ja", "shift_jis"}, | |||||
{"kk", "windows-1251"}, | |||||
{"ko", "euc-kr"}, | |||||
{"ku", "windows-1254"}, | |||||
{"ky", "windows-1251"}, | |||||
{"lt", "windows-1257"}, | |||||
{"lv", "windows-1257"}, | |||||
{"mk", "windows-1251"}, | |||||
{"pl", "iso-8859-2"}, | |||||
{"ru", "windows-1251"}, | |||||
{"sah", "windows-1251"}, | |||||
{"sk", "windows-1250"}, | |||||
{"sl", "iso-8859-2"}, | |||||
{"sr", "windows-1251"}, | |||||
{"tg", "windows-1251"}, | |||||
{"th", "windows-874"}, | |||||
{"tr", "windows-1254"}, | |||||
{"tt", "windows-1251"}, | |||||
{"uk", "windows-1251"}, | |||||
{"vi", "windows-1258"}, | |||||
{"zh-hans", "gb18030"}, | |||||
{"zh-hant", "big5"}, | |||||
} |
@@ -1,4 +1,4 @@
-// This file was generated by go generate; DO NOT EDIT
+// Code generated by running "go generate" in golang.org/x/text. DO NOT EDIT.
 package htmlindex
@@ -1,137 +0,0 @@ | |||||
// Copyright 2015 The Go Authors. All rights reserved. | |||||
// Use of this source code is governed by a BSD-style | |||||
// license that can be found in the LICENSE file. | |||||
// +build ignore | |||||
package main | |||||
import ( | |||||
"bytes" | |||||
"encoding/xml" | |||||
"fmt" | |||||
"io" | |||||
"log" | |||||
"strings" | |||||
"golang.org/x/text/internal/gen" | |||||
) | |||||
type registry struct { | |||||
XMLName xml.Name `xml:"registry"` | |||||
Updated string `xml:"updated"` | |||||
Registry []struct { | |||||
ID string `xml:"id,attr"` | |||||
Record []struct { | |||||
Name string `xml:"name"` | |||||
Xref []struct { | |||||
Type string `xml:"type,attr"` | |||||
Data string `xml:"data,attr"` | |||||
} `xml:"xref"` | |||||
Desc struct { | |||||
Data string `xml:",innerxml"` | |||||
// Any []struct { | |||||
// Data string `xml:",chardata"` | |||||
// } `xml:",any"` | |||||
// Data string `xml:",chardata"` | |||||
} `xml:"description,"` | |||||
MIB string `xml:"value"` | |||||
Alias []string `xml:"alias"` | |||||
MIME string `xml:"preferred_alias"` | |||||
} `xml:"record"` | |||||
} `xml:"registry"` | |||||
} | |||||
func main() { | |||||
r := gen.OpenIANAFile("assignments/character-sets/character-sets.xml") | |||||
reg := ®istry{} | |||||
if err := xml.NewDecoder(r).Decode(®); err != nil && err != io.EOF { | |||||
log.Fatalf("Error decoding charset registry: %v", err) | |||||
} | |||||
if len(reg.Registry) == 0 || reg.Registry[0].ID != "character-sets-1" { | |||||
log.Fatalf("Unexpected ID %s", reg.Registry[0].ID) | |||||
} | |||||
w := &bytes.Buffer{} | |||||
fmt.Fprintf(w, "const (\n") | |||||
for _, rec := range reg.Registry[0].Record { | |||||
constName := "" | |||||
for _, a := range rec.Alias { | |||||
if strings.HasPrefix(a, "cs") && strings.IndexByte(a, '-') == -1 { | |||||
// Some of the constant definitions have comments in them. Strip those. | |||||
constName = strings.Title(strings.SplitN(a[2:], "\n", 2)[0]) | |||||
} | |||||
} | |||||
if constName == "" { | |||||
switch rec.MIB { | |||||
case "2085": | |||||
constName = "HZGB2312" // Not listed as alias for some reason. | |||||
default: | |||||
log.Fatalf("No cs alias defined for %s.", rec.MIB) | |||||
} | |||||
} | |||||
if rec.MIME != "" { | |||||
rec.MIME = fmt.Sprintf(" (MIME: %s)", rec.MIME) | |||||
} | |||||
fmt.Fprintf(w, "// %s is the MIB identifier with IANA name %s%s.\n//\n", constName, rec.Name, rec.MIME) | |||||
if len(rec.Desc.Data) > 0 { | |||||
fmt.Fprint(w, "// ") | |||||
d := xml.NewDecoder(strings.NewReader(rec.Desc.Data)) | |||||
inElem := true | |||||
attr := "" | |||||
for { | |||||
t, err := d.Token() | |||||
if err != nil { | |||||
if err != io.EOF { | |||||
log.Fatal(err) | |||||
} | |||||
break | |||||
} | |||||
switch x := t.(type) { | |||||
case xml.CharData: | |||||
attr = "" // Don't need attribute info. | |||||
a := bytes.Split([]byte(x), []byte("\n")) | |||||
for i, b := range a { | |||||
if b = bytes.TrimSpace(b); len(b) != 0 { | |||||
if !inElem && i > 0 { | |||||
fmt.Fprint(w, "\n// ") | |||||
} | |||||
inElem = false | |||||
fmt.Fprintf(w, "%s ", string(b)) | |||||
} | |||||
} | |||||
case xml.StartElement: | |||||
if x.Name.Local == "xref" { | |||||
inElem = true | |||||
use := false | |||||
for _, a := range x.Attr { | |||||
if a.Name.Local == "type" { | |||||
use = use || a.Value != "person" | |||||
} | |||||
if a.Name.Local == "data" && use { | |||||
attr = a.Value + " " | |||||
} | |||||
} | |||||
} | |||||
case xml.EndElement: | |||||
inElem = false | |||||
fmt.Fprint(w, attr) | |||||
} | |||||
} | |||||
fmt.Fprint(w, "\n") | |||||
} | |||||
for _, x := range rec.Xref { | |||||
switch x.Type { | |||||
case "rfc": | |||||
fmt.Fprintf(w, "// Reference: %s\n", strings.ToUpper(x.Data)) | |||||
case "uri": | |||||
fmt.Fprintf(w, "// Reference: %s\n", x.Data) | |||||
} | |||||
} | |||||
fmt.Fprintf(w, "%s MIB = %s\n", constName, rec.MIB) | |||||
fmt.Fprintln(w) | |||||
} | |||||
fmt.Fprintln(w, ")") | |||||
gen.WriteGoFile("mib.go", "identifier", w.Bytes()) | |||||
} |
@@ -36,8 +36,8 @@ package identifier
 // - http://www.ietf.org/rfc/rfc2978.txt
 // - http://www.unicode.org/reports/tr22/
 // - http://www.w3.org/TR/encoding/
-// - http://www.w3.org/TR/encoding/indexes/encodings.json
 // - https://encoding.spec.whatwg.org/
+// - https://encoding.spec.whatwg.org/encodings.json
 // - https://tools.ietf.org/html/rfc6657#section-5
 // Interface can be implemented by Encodings to define the CCS or CES for which

@@ -1,4 +1,4 @@
-// This file was generated by go generate; DO NOT EDIT
+// Code generated by running "go generate" in golang.org/x/text. DO NOT EDIT.
 package identifier
@@ -5,7 +5,6 @@ | |||||
package japanese | package japanese | ||||
import ( | import ( | ||||
"errors" | |||||
"unicode/utf8" | "unicode/utf8" | ||||
"golang.org/x/text/encoding" | "golang.org/x/text/encoding" | ||||
@@ -23,10 +22,9 @@ var eucJP = internal.Encoding{ | |||||
identifier.EUCPkdFmtJapanese, | identifier.EUCPkdFmtJapanese, | ||||
} | } | ||||
var errInvalidEUCJP = errors.New("japanese: invalid EUC-JP encoding") | |||||
type eucJPDecoder struct{ transform.NopResetter } | type eucJPDecoder struct{ transform.NopResetter } | ||||
// See https://encoding.spec.whatwg.org/#euc-jp-decoder. | |||||
func (eucJPDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { | func (eucJPDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { | ||||
r, size := rune(0), 0 | r, size := rune(0), 0 | ||||
loop: | loop: | ||||
@@ -37,60 +35,79 @@ loop: | |||||
case c0 == 0x8e: | case c0 == 0x8e: | ||||
if nSrc+1 >= len(src) { | if nSrc+1 >= len(src) { | ||||
err = transform.ErrShortSrc | |||||
break loop | |||||
if !atEOF { | |||||
err = transform.ErrShortSrc | |||||
break loop | |||||
} | |||||
r, size = utf8.RuneError, 1 | |||||
break | |||||
} | } | ||||
c1 := src[nSrc+1] | c1 := src[nSrc+1] | ||||
if c1 < 0xa1 || 0xdf < c1 { | |||||
err = errInvalidEUCJP | |||||
break loop | |||||
switch { | |||||
case c1 < 0xa1: | |||||
r, size = utf8.RuneError, 1 | |||||
case c1 > 0xdf: | |||||
r, size = utf8.RuneError, 2 | |||||
if c1 == 0xff { | |||||
size = 1 | |||||
} | |||||
default: | |||||
r, size = rune(c1)+(0xff61-0xa1), 2 | |||||
} | } | ||||
r, size = rune(c1)+(0xff61-0xa1), 2 | |||||
case c0 == 0x8f: | case c0 == 0x8f: | ||||
if nSrc+2 >= len(src) { | if nSrc+2 >= len(src) { | ||||
err = transform.ErrShortSrc | |||||
break loop | |||||
if !atEOF { | |||||
err = transform.ErrShortSrc | |||||
break loop | |||||
} | |||||
r, size = utf8.RuneError, 1 | |||||
if p := nSrc + 1; p < len(src) && 0xa1 <= src[p] && src[p] < 0xfe { | |||||
size = 2 | |||||
} | |||||
break | |||||
} | } | ||||
c1 := src[nSrc+1] | c1 := src[nSrc+1] | ||||
if c1 < 0xa1 || 0xfe < c1 { | if c1 < 0xa1 || 0xfe < c1 { | ||||
err = errInvalidEUCJP | |||||
break loop | |||||
r, size = utf8.RuneError, 1 | |||||
break | |||||
} | } | ||||
c2 := src[nSrc+2] | c2 := src[nSrc+2] | ||||
if c2 < 0xa1 || 0xfe < c2 { | if c2 < 0xa1 || 0xfe < c2 { | ||||
err = errInvalidEUCJP | |||||
break loop | |||||
r, size = utf8.RuneError, 2 | |||||
break | |||||
} | } | ||||
r, size = '\ufffd', 3 | |||||
r, size = utf8.RuneError, 3 | |||||
if i := int(c1-0xa1)*94 + int(c2-0xa1); i < len(jis0212Decode) { | if i := int(c1-0xa1)*94 + int(c2-0xa1); i < len(jis0212Decode) { | ||||
r = rune(jis0212Decode[i]) | r = rune(jis0212Decode[i]) | ||||
if r == 0 { | if r == 0 { | ||||
r = '\ufffd' | |||||
r = utf8.RuneError | |||||
} | } | ||||
} | } | ||||
case 0xa1 <= c0 && c0 <= 0xfe: | case 0xa1 <= c0 && c0 <= 0xfe: | ||||
if nSrc+1 >= len(src) { | if nSrc+1 >= len(src) { | ||||
err = transform.ErrShortSrc | |||||
break loop | |||||
if !atEOF { | |||||
err = transform.ErrShortSrc | |||||
break loop | |||||
} | |||||
r, size = utf8.RuneError, 1 | |||||
break | |||||
} | } | ||||
c1 := src[nSrc+1] | c1 := src[nSrc+1] | ||||
if c1 < 0xa1 || 0xfe < c1 { | if c1 < 0xa1 || 0xfe < c1 { | ||||
err = errInvalidEUCJP | |||||
break loop | |||||
r, size = utf8.RuneError, 1 | |||||
break | |||||
} | } | ||||
r, size = '\ufffd', 2 | |||||
r, size = utf8.RuneError, 2 | |||||
if i := int(c0-0xa1)*94 + int(c1-0xa1); i < len(jis0208Decode) { | if i := int(c0-0xa1)*94 + int(c1-0xa1); i < len(jis0208Decode) { | ||||
r = rune(jis0208Decode[i]) | r = rune(jis0208Decode[i]) | ||||
if r == 0 { | if r == 0 { | ||||
r = '\ufffd' | |||||
r = utf8.RuneError | |||||
} | } | ||||
} | } | ||||
default: | default: | ||||
err = errInvalidEUCJP | |||||
break loop | |||||
r, size = utf8.RuneError, 1 | |||||
} | } | ||||
if nDst+utf8.RuneLen(r) > len(dst) { | if nDst+utf8.RuneLen(r) > len(dst) { | ||||
@@ -99,9 +116,6 @@ loop: | |||||
} | } | ||||
nDst += utf8.EncodeRune(dst[nDst:], r) | nDst += utf8.EncodeRune(dst[nDst:], r) | ||||
} | } | ||||
if atEOF && err == transform.ErrShortSrc { | |||||
err = errInvalidEUCJP | |||||
} | |||||
return nDst, nSrc, err | return nDst, nSrc, err | ||||
} | } | ||||
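The reworked EUC-JP decoder follows the WHATWG rule referenced in the added comment: malformed input is no longer reported as errInvalidEUCJP. Instead the offending bytes are consumed (one or two at a time) and U+FFFD is emitted, and ErrShortSrc is returned only when more input may still arrive. A small sketch of the resulting behavior; the output in the comment is the expected result under this change, not something taken from a test in the diff:

    package main

    import (
        "fmt"

        "golang.org/x/text/encoding/japanese"
        "golang.org/x/text/transform"
    )

    func main() {
        // 0x8e must be followed by a byte in [0xa1, 0xdf]; 0x20 is not, so the
        // decoder now emits U+FFFD for the 0x8e and re-decodes 0x20 as a space.
        in := []byte{0x8e, 0x20, 'o', 'k'}
        out, _, err := transform.Bytes(japanese.EUCJP.NewDecoder(), in)
        fmt.Printf("%q %v\n", out, err) // expected: "\ufffd ok" <nil>
    }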
@@ -5,7 +5,6 @@ | |||||
package japanese | package japanese | ||||
import ( | import ( | ||||
"errors" | |||||
"unicode/utf8" | "unicode/utf8" | ||||
"golang.org/x/text/encoding" | "golang.org/x/text/encoding" | ||||
@@ -31,8 +30,6 @@ func iso2022JPNewEncoder() transform.Transformer { | |||||
return new(iso2022JPEncoder) | return new(iso2022JPEncoder) | ||||
} | } | ||||
var errInvalidISO2022JP = errors.New("japanese: invalid ISO-2022-JP encoding") | |||||
const ( | const ( | ||||
asciiState = iota | asciiState = iota | ||||
katakanaState | katakanaState | ||||
@@ -50,45 +47,51 @@ func (d *iso2022JPDecoder) Reset() { | |||||
func (d *iso2022JPDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { | func (d *iso2022JPDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { | ||||
r, size := rune(0), 0 | r, size := rune(0), 0 | ||||
loop: | |||||
for ; nSrc < len(src); nSrc += size { | for ; nSrc < len(src); nSrc += size { | ||||
c0 := src[nSrc] | c0 := src[nSrc] | ||||
if c0 >= utf8.RuneSelf { | if c0 >= utf8.RuneSelf { | ||||
err = errInvalidISO2022JP | |||||
break loop | |||||
r, size = '\ufffd', 1 | |||||
goto write | |||||
} | } | ||||
if c0 == asciiEsc { | if c0 == asciiEsc { | ||||
if nSrc+2 >= len(src) { | if nSrc+2 >= len(src) { | ||||
err = transform.ErrShortSrc | |||||
break loop | |||||
if !atEOF { | |||||
return nDst, nSrc, transform.ErrShortSrc | |||||
} | |||||
// TODO: is it correct to only skip 1?? | |||||
r, size = '\ufffd', 1 | |||||
goto write | |||||
} | } | ||||
size = 3 | size = 3 | ||||
c1 := src[nSrc+1] | c1 := src[nSrc+1] | ||||
c2 := src[nSrc+2] | c2 := src[nSrc+2] | ||||
switch { | switch { | ||||
case c1 == '$' && (c2 == '@' || c2 == 'B'): | |||||
case c1 == '$' && (c2 == '@' || c2 == 'B'): // 0x24 {0x40, 0x42} | |||||
*d = jis0208State | *d = jis0208State | ||||
continue | continue | ||||
case c1 == '$' && c2 == '(': | |||||
case c1 == '$' && c2 == '(': // 0x24 0x28 | |||||
if nSrc+3 >= len(src) { | if nSrc+3 >= len(src) { | ||||
err = transform.ErrShortSrc | |||||
break loop | |||||
if !atEOF { | |||||
return nDst, nSrc, transform.ErrShortSrc | |||||
} | |||||
r, size = '\ufffd', 1 | |||||
goto write | |||||
} | } | ||||
size = 4 | size = 4 | ||||
if src[nSrc]+3 == 'D' { | |||||
if src[nSrc+3] == 'D' { | |||||
*d = jis0212State | *d = jis0212State | ||||
continue | continue | ||||
} | } | ||||
case c1 == '(' && (c2 == 'B' || c2 == 'J'): | |||||
case c1 == '(' && (c2 == 'B' || c2 == 'J'): // 0x28 {0x42, 0x4A} | |||||
*d = asciiState | *d = asciiState | ||||
continue | continue | ||||
case c1 == '(' && c2 == 'I': | |||||
case c1 == '(' && c2 == 'I': // 0x28 0x49 | |||||
*d = katakanaState | *d = katakanaState | ||||
continue | continue | ||||
} | } | ||||
err = errInvalidISO2022JP | |||||
break loop | |||||
r, size = '\ufffd', 1 | |||||
goto write | |||||
} | } | ||||
switch *d { | switch *d { | ||||
@@ -97,8 +100,8 @@ loop: | |||||
case katakanaState: | case katakanaState: | ||||
if c0 < 0x21 || 0x60 <= c0 { | if c0 < 0x21 || 0x60 <= c0 { | ||||
err = errInvalidISO2022JP | |||||
break loop | |||||
r, size = '\ufffd', 1 | |||||
goto write | |||||
} | } | ||||
r, size = rune(c0)+(0xff61-0x21), 1 | r, size = rune(c0)+(0xff61-0x21), 1 | ||||
@@ -106,11 +109,14 @@ loop: | |||||
if c0 == 0x0a { | if c0 == 0x0a { | ||||
*d = asciiState | *d = asciiState | ||||
r, size = rune(c0), 1 | r, size = rune(c0), 1 | ||||
break | |||||
goto write | |||||
} | } | ||||
if nSrc+1 >= len(src) { | if nSrc+1 >= len(src) { | ||||
err = transform.ErrShortSrc | |||||
break loop | |||||
if !atEOF { | |||||
return nDst, nSrc, transform.ErrShortSrc | |||||
} | |||||
r, size = '\ufffd', 1 | |||||
goto write | |||||
} | } | ||||
size = 2 | size = 2 | ||||
c1 := src[nSrc+1] | c1 := src[nSrc+1] | ||||
@@ -121,22 +127,19 @@ loop: | |||||
r = rune(jis0212Decode[i]) | r = rune(jis0212Decode[i]) | ||||
} else { | } else { | ||||
r = '\ufffd' | r = '\ufffd' | ||||
break | |||||
goto write | |||||
} | } | ||||
if r == 0 { | if r == 0 { | ||||
r = '\ufffd' | r = '\ufffd' | ||||
} | } | ||||
} | } | ||||
write: | |||||
if nDst+utf8.RuneLen(r) > len(dst) { | if nDst+utf8.RuneLen(r) > len(dst) { | ||||
err = transform.ErrShortDst | |||||
break loop | |||||
return nDst, nSrc, transform.ErrShortDst | |||||
} | } | ||||
nDst += utf8.EncodeRune(dst[nDst:], r) | nDst += utf8.EncodeRune(dst[nDst:], r) | ||||
} | } | ||||
if atEOF && err == transform.ErrShortSrc { | |||||
err = errInvalidISO2022JP | |||||
} | |||||
return nDst, nSrc, err | return nDst, nSrc, err | ||||
} | } | ||||
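The ISO-2022-JP decoder gets the same treatment: unknown or truncated escape sequences and stray high bytes now decode to U+FFFD (with the open question about how many bytes to skip noted in the TODO), and ErrShortSrc is returned only when atEOF is false. A hedged sketch of the new end-of-input behavior; the commented output is what this change should produce, not a quoted test result:

    package main

    import (
        "fmt"

        "golang.org/x/text/encoding/japanese"
        "golang.org/x/text/transform"
    )

    func main() {
        // A dangling ESC at the end of the input used to surface as an
        // invalid-encoding error once atEOF was reached; it now becomes U+FFFD.
        in := []byte("abc\x1b")
        out, _, err := transform.Bytes(japanese.ISO2022JP.NewDecoder(), in)
        fmt.Printf("%q %v\n", out, err) // expected: "abc\ufffd" <nil>
    }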
@@ -1,161 +0,0 @@ | |||||
// Copyright 2013 The Go Authors. All rights reserved. | |||||
// Use of this source code is governed by a BSD-style | |||||
// license that can be found in the LICENSE file. | |||||
// +build ignore | |||||
package main | |||||
// This program generates tables.go: | |||||
// go run maketables.go | gofmt > tables.go | |||||
// TODO: Emoji extensions? | |||||
// http://www.unicode.org/faq/emoji_dingbats.html | |||||
// http://www.unicode.org/Public/UNIDATA/EmojiSources.txt | |||||
import ( | |||||
"bufio" | |||||
"fmt" | |||||
"log" | |||||
"net/http" | |||||
"sort" | |||||
"strings" | |||||
) | |||||
type entry struct { | |||||
jisCode, table int | |||||
} | |||||
func main() { | |||||
fmt.Printf("// generated by go run maketables.go; DO NOT EDIT\n\n") | |||||
fmt.Printf("// Package japanese provides Japanese encodings such as EUC-JP and Shift JIS.\n") | |||||
fmt.Printf(`package japanese // import "golang.org/x/text/encoding/japanese"` + "\n\n") | |||||
reverse := [65536]entry{} | |||||
for i := range reverse { | |||||
reverse[i].table = -1 | |||||
} | |||||
tables := []struct { | |||||
url string | |||||
name string | |||||
}{ | |||||
{"http://encoding.spec.whatwg.org/index-jis0208.txt", "0208"}, | |||||
{"http://encoding.spec.whatwg.org/index-jis0212.txt", "0212"}, | |||||
} | |||||
for i, table := range tables { | |||||
res, err := http.Get(table.url) | |||||
if err != nil { | |||||
log.Fatalf("%q: Get: %v", table.url, err) | |||||
} | |||||
defer res.Body.Close() | |||||
mapping := [65536]uint16{} | |||||
scanner := bufio.NewScanner(res.Body) | |||||
for scanner.Scan() { | |||||
s := strings.TrimSpace(scanner.Text()) | |||||
if s == "" || s[0] == '#' { | |||||
continue | |||||
} | |||||
x, y := 0, uint16(0) | |||||
if _, err := fmt.Sscanf(s, "%d 0x%x", &x, &y); err != nil { | |||||
log.Fatalf("%q: could not parse %q", table.url, s) | |||||
} | |||||
if x < 0 || 120*94 <= x { | |||||
log.Fatalf("%q: JIS code %d is out of range", table.url, x) | |||||
} | |||||
mapping[x] = y | |||||
if reverse[y].table == -1 { | |||||
reverse[y] = entry{jisCode: x, table: i} | |||||
} | |||||
} | |||||
if err := scanner.Err(); err != nil { | |||||
log.Fatalf("%q: scanner error: %v", table.url, err) | |||||
} | |||||
fmt.Printf("// jis%sDecode is the decoding table from JIS %s code to Unicode.\n// It is defined at %s\n", | |||||
table.name, table.name, table.url) | |||||
fmt.Printf("var jis%sDecode = [...]uint16{\n", table.name) | |||||
for i, m := range mapping { | |||||
if m != 0 { | |||||
fmt.Printf("\t%d: 0x%04X,\n", i, m) | |||||
} | |||||
} | |||||
fmt.Printf("}\n\n") | |||||
} | |||||
// Any run of at least separation continuous zero entries in the reverse map will | |||||
// be a separate encode table. | |||||
const separation = 1024 | |||||
intervals := []interval(nil) | |||||
low, high := -1, -1 | |||||
for i, v := range reverse { | |||||
if v.table == -1 { | |||||
continue | |||||
} | |||||
if low < 0 { | |||||
low = i | |||||
} else if i-high >= separation { | |||||
if high >= 0 { | |||||
intervals = append(intervals, interval{low, high}) | |||||
} | |||||
low = i | |||||
} | |||||
high = i + 1 | |||||
} | |||||
if high >= 0 { | |||||
intervals = append(intervals, interval{low, high}) | |||||
} | |||||
sort.Sort(byDecreasingLength(intervals)) | |||||
fmt.Printf("const (\n") | |||||
fmt.Printf("\tjis0208 = 1\n") | |||||
fmt.Printf("\tjis0212 = 2\n") | |||||
fmt.Printf("\tcodeMask = 0x7f\n") | |||||
fmt.Printf("\tcodeShift = 7\n") | |||||
fmt.Printf("\ttableShift = 14\n") | |||||
fmt.Printf(")\n\n") | |||||
fmt.Printf("const numEncodeTables = %d\n\n", len(intervals)) | |||||
fmt.Printf("// encodeX are the encoding tables from Unicode to JIS code,\n") | |||||
fmt.Printf("// sorted by decreasing length.\n") | |||||
for i, v := range intervals { | |||||
fmt.Printf("// encode%d: %5d entries for runes in [%5d, %5d).\n", i, v.len(), v.low, v.high) | |||||
} | |||||
fmt.Printf("//\n") | |||||
fmt.Printf("// The high two bits of the value record whether the JIS code comes from the\n") | |||||
fmt.Printf("// JIS0208 table (high bits == 1) or the JIS0212 table (high bits == 2).\n") | |||||
fmt.Printf("// The low 14 bits are two 7-bit unsigned integers j1 and j2 that form the\n") | |||||
fmt.Printf("// JIS code (94*j1 + j2) within that table.\n") | |||||
fmt.Printf("\n") | |||||
for i, v := range intervals { | |||||
fmt.Printf("const encode%dLow, encode%dHigh = %d, %d\n\n", i, i, v.low, v.high) | |||||
fmt.Printf("var encode%d = [...]uint16{\n", i) | |||||
for j := v.low; j < v.high; j++ { | |||||
x := reverse[j] | |||||
if x.table == -1 { | |||||
continue | |||||
} | |||||
fmt.Printf("\t%d - %d: jis%s<<14 | 0x%02X<<7 | 0x%02X,\n", | |||||
j, v.low, tables[x.table].name, x.jisCode/94, x.jisCode%94) | |||||
} | |||||
fmt.Printf("}\n\n") | |||||
} | |||||
} | |||||
// interval is a half-open interval [low, high). | |||||
type interval struct { | |||||
low, high int | |||||
} | |||||
func (i interval) len() int { return i.high - i.low } | |||||
// byDecreasingLength sorts intervals by decreasing length. | |||||
type byDecreasingLength []interval | |||||
func (b byDecreasingLength) Len() int { return len(b) } | |||||
func (b byDecreasingLength) Less(i, j int) bool { return b[i].len() > b[j].len() } | |||||
func (b byDecreasingLength) Swap(i, j int) { b[i], b[j] = b[j], b[i] } |
@@ -5,7 +5,6 @@ | |||||
package japanese | package japanese | ||||
import ( | import ( | ||||
"errors" | |||||
"unicode/utf8" | "unicode/utf8" | ||||
"golang.org/x/text/encoding" | "golang.org/x/text/encoding" | ||||
@@ -24,8 +23,6 @@ var shiftJIS = internal.Encoding{ | |||||
identifier.ShiftJIS, | identifier.ShiftJIS, | ||||
} | } | ||||
var errInvalidShiftJIS = errors.New("japanese: invalid Shift JIS encoding") | |||||
type shiftJISDecoder struct{ transform.NopResetter } | type shiftJISDecoder struct{ transform.NopResetter } | ||||
func (shiftJISDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { | func (shiftJISDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { | ||||
@@ -48,28 +45,32 @@ loop: | |||||
c0 = 2*c0 - 0x21 | c0 = 2*c0 - 0x21 | ||||
if nSrc+1 >= len(src) { | if nSrc+1 >= len(src) { | ||||
err = transform.ErrShortSrc | |||||
break loop | |||||
if !atEOF { | |||||
err = transform.ErrShortSrc | |||||
break loop | |||||
} | |||||
r, size = '\ufffd', 1 | |||||
goto write | |||||
} | } | ||||
c1 := src[nSrc+1] | c1 := src[nSrc+1] | ||||
switch { | switch { | ||||
case c1 < 0x40: | case c1 < 0x40: | ||||
err = errInvalidShiftJIS | |||||
break loop | |||||
r, size = '\ufffd', 1 // c1 is ASCII so output on next round | |||||
goto write | |||||
case c1 < 0x7f: | case c1 < 0x7f: | ||||
c0-- | c0-- | ||||
c1 -= 0x40 | c1 -= 0x40 | ||||
case c1 == 0x7f: | case c1 == 0x7f: | ||||
err = errInvalidShiftJIS | |||||
break loop | |||||
r, size = '\ufffd', 1 // c1 is ASCII so output on next round | |||||
goto write | |||||
case c1 < 0x9f: | case c1 < 0x9f: | ||||
c0-- | c0-- | ||||
c1 -= 0x41 | c1 -= 0x41 | ||||
case c1 < 0xfd: | case c1 < 0xfd: | ||||
c1 -= 0x9f | c1 -= 0x9f | ||||
default: | default: | ||||
err = errInvalidShiftJIS | |||||
break loop | |||||
r, size = '\ufffd', 2 | |||||
goto write | |||||
} | } | ||||
r, size = '\ufffd', 2 | r, size = '\ufffd', 2 | ||||
if i := int(c0)*94 + int(c1); i < len(jis0208Decode) { | if i := int(c0)*94 + int(c1); i < len(jis0208Decode) { | ||||
@@ -79,20 +80,19 @@ loop: | |||||
} | } | ||||
} | } | ||||
case c0 == 0x80: | |||||
r, size = 0x80, 1 | |||||
default: | default: | ||||
err = errInvalidShiftJIS | |||||
break loop | |||||
r, size = '\ufffd', 1 | |||||
} | } | ||||
write: | |||||
if nDst+utf8.RuneLen(r) > len(dst) { | if nDst+utf8.RuneLen(r) > len(dst) { | ||||
err = transform.ErrShortDst | err = transform.ErrShortDst | ||||
break loop | break loop | ||||
} | } | ||||
nDst += utf8.EncodeRune(dst[nDst:], r) | nDst += utf8.EncodeRune(dst[nDst:], r) | ||||
} | } | ||||
if atEOF && err == transform.ErrShortSrc { | |||||
err = errInvalidShiftJIS | |||||
} | |||||
return nDst, nSrc, err | return nDst, nSrc, err | ||||
} | } | ||||
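For Shift JIS the decoder likewise substitutes instead of failing: an invalid trailing byte costs one byte when it is ASCII (so it is re-decoded on the next round, as the new comments say) and two otherwise, and the added c0 == 0x80 case passes that byte through as U+0080. A sketch of both behaviors; the commented output is the expected result under this change:

    package main

    import (
        "fmt"

        "golang.org/x/text/encoding/japanese"
        "golang.org/x/text/transform"
    )

    func main() {
        // 0x80 now decodes to U+0080; 0x81 starts a two-byte sequence but 0x20
        // is not a valid trailing byte, so the lead byte becomes U+FFFD and the
        // space is decoded normally on the next iteration.
        in := []byte{'a', 0x80, 0x81, 0x20, 'b'}
        out, _, err := transform.Bytes(japanese.ShiftJIS.NewDecoder(), in)
        fmt.Printf("%q %v\n", out, err) // expected: "a\u0080\ufffd b" <nil>
    }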
@@ -5,7 +5,6 @@ | |||||
package korean | package korean | ||||
import ( | import ( | ||||
"errors" | |||||
"unicode/utf8" | "unicode/utf8" | ||||
"golang.org/x/text/encoding" | "golang.org/x/text/encoding" | ||||
@@ -26,8 +25,6 @@ var eucKR = internal.Encoding{ | |||||
identifier.EUCKR, | identifier.EUCKR, | ||||
} | } | ||||
var errInvalidEUCKR = errors.New("korean: invalid EUC-KR encoding") | |||||
type eucKRDecoder struct{ transform.NopResetter } | type eucKRDecoder struct{ transform.NopResetter } | ||||
func (eucKRDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { | func (eucKRDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { | ||||
@@ -40,10 +37,15 @@ loop: | |||||
case 0x81 <= c0 && c0 < 0xff: | case 0x81 <= c0 && c0 < 0xff: | ||||
if nSrc+1 >= len(src) { | if nSrc+1 >= len(src) { | ||||
err = transform.ErrShortSrc | |||||
break loop | |||||
if !atEOF { | |||||
err = transform.ErrShortSrc | |||||
break loop | |||||
} | |||||
r, size = utf8.RuneError, 1 | |||||
break | |||||
} | } | ||||
c1 := src[nSrc+1] | c1 := src[nSrc+1] | ||||
size = 2 | |||||
if c0 < 0xc7 { | if c0 < 0xc7 { | ||||
r = 178 * rune(c0-0x81) | r = 178 * rune(c0-0x81) | ||||
switch { | switch { | ||||
@@ -54,39 +56,36 @@ loop: | |||||
case 0x81 <= c1 && c1 < 0xff: | case 0x81 <= c1 && c1 < 0xff: | ||||
r += rune(c1) - (0x81 - 2*26) | r += rune(c1) - (0x81 - 2*26) | ||||
default: | default: | ||||
err = errInvalidEUCKR | |||||
break loop | |||||
goto decError | |||||
} | } | ||||
} else if 0xa1 <= c1 && c1 < 0xff { | } else if 0xa1 <= c1 && c1 < 0xff { | ||||
r = 178*(0xc7-0x81) + rune(c0-0xc7)*94 + rune(c1-0xa1) | r = 178*(0xc7-0x81) + rune(c0-0xc7)*94 + rune(c1-0xa1) | ||||
} else { | } else { | ||||
err = errInvalidEUCKR | |||||
break loop | |||||
goto decError | |||||
} | } | ||||
if int(r) < len(decode) { | if int(r) < len(decode) { | ||||
r = rune(decode[r]) | r = rune(decode[r]) | ||||
if r == 0 { | |||||
r = '\ufffd' | |||||
if r != 0 { | |||||
break | |||||
} | } | ||||
} else { | |||||
r = '\ufffd' | |||||
} | } | ||||
size = 2 | |||||
decError: | |||||
r = utf8.RuneError | |||||
if c1 < utf8.RuneSelf { | |||||
size = 1 | |||||
} | |||||
default: | default: | ||||
err = errInvalidEUCKR | |||||
break loop | |||||
r, size = utf8.RuneError, 1 | |||||
break | |||||
} | } | ||||
if nDst+utf8.RuneLen(r) > len(dst) { | if nDst+utf8.RuneLen(r) > len(dst) { | ||||
err = transform.ErrShortDst | err = transform.ErrShortDst | ||||
break loop | |||||
break | |||||
} | } | ||||
nDst += utf8.EncodeRune(dst[nDst:], r) | nDst += utf8.EncodeRune(dst[nDst:], r) | ||||
} | } | ||||
if atEOF && err == transform.ErrShortSrc { | |||||
err = errInvalidEUCKR | |||||
} | |||||
return nDst, nSrc, err | return nDst, nSrc, err | ||||
} | } | ||||
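The EUC-KR decoder follows the same pattern: a bad trailing byte jumps to decError, which emits U+FFFD and backs size down to one when that byte is ASCII so it can be decoded on the next iteration. A hedged sketch of the expected effect:

    package main

    import (
        "fmt"

        "golang.org/x/text/encoding/korean"
        "golang.org/x/text/transform"
    )

    func main() {
        // 0x81 opens a two-byte sequence but '!' (0x21) is not a valid trailing
        // byte, so the decoder substitutes U+FFFD and then decodes '!' as ASCII.
        in := []byte{0x81, '!', 'k', 'o'}
        out, _, err := transform.Bytes(korean.EUCKR.NewDecoder(), in)
        fmt.Printf("%q %v\n", out, err) // expected: "\ufffd!ko" <nil>
    }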
@@ -1,143 +0,0 @@ | |||||
// Copyright 2013 The Go Authors. All rights reserved. | |||||
// Use of this source code is governed by a BSD-style | |||||
// license that can be found in the LICENSE file. | |||||
// +build ignore | |||||
package main | |||||
// This program generates tables.go: | |||||
// go run maketables.go | gofmt > tables.go | |||||
import ( | |||||
"bufio" | |||||
"fmt" | |||||
"log" | |||||
"net/http" | |||||
"sort" | |||||
"strings" | |||||
) | |||||
func main() { | |||||
fmt.Printf("// generated by go run maketables.go; DO NOT EDIT\n\n") | |||||
fmt.Printf("// Package korean provides Korean encodings such as EUC-KR.\n") | |||||
fmt.Printf(`package korean // import "golang.org/x/text/encoding/korean"` + "\n\n") | |||||
res, err := http.Get("http://encoding.spec.whatwg.org/index-euc-kr.txt") | |||||
if err != nil { | |||||
log.Fatalf("Get: %v", err) | |||||
} | |||||
defer res.Body.Close() | |||||
mapping := [65536]uint16{} | |||||
reverse := [65536]uint16{} | |||||
scanner := bufio.NewScanner(res.Body) | |||||
for scanner.Scan() { | |||||
s := strings.TrimSpace(scanner.Text()) | |||||
if s == "" || s[0] == '#' { | |||||
continue | |||||
} | |||||
x, y := uint16(0), uint16(0) | |||||
if _, err := fmt.Sscanf(s, "%d 0x%x", &x, &y); err != nil { | |||||
log.Fatalf("could not parse %q", s) | |||||
} | |||||
if x < 0 || 178*(0xc7-0x81)+(0xfe-0xc7)*94+(0xff-0xa1) <= x { | |||||
log.Fatalf("EUC-KR code %d is out of range", x) | |||||
} | |||||
mapping[x] = y | |||||
if reverse[y] == 0 { | |||||
c0, c1 := uint16(0), uint16(0) | |||||
if x < 178*(0xc7-0x81) { | |||||
c0 = uint16(x/178) + 0x81 | |||||
c1 = uint16(x % 178) | |||||
switch { | |||||
case c1 < 1*26: | |||||
c1 += 0x41 | |||||
case c1 < 2*26: | |||||
c1 += 0x47 | |||||
default: | |||||
c1 += 0x4d | |||||
} | |||||
} else { | |||||
x -= 178 * (0xc7 - 0x81) | |||||
c0 = uint16(x/94) + 0xc7 | |||||
c1 = uint16(x%94) + 0xa1 | |||||
} | |||||
reverse[y] = c0<<8 | c1 | |||||
} | |||||
} | |||||
if err := scanner.Err(); err != nil { | |||||
log.Fatalf("scanner error: %v", err) | |||||
} | |||||
fmt.Printf("// decode is the decoding table from EUC-KR code to Unicode.\n") | |||||
fmt.Printf("// It is defined at http://encoding.spec.whatwg.org/index-euc-kr.txt\n") | |||||
fmt.Printf("var decode = [...]uint16{\n") | |||||
for i, v := range mapping { | |||||
if v != 0 { | |||||
fmt.Printf("\t%d: 0x%04X,\n", i, v) | |||||
} | |||||
} | |||||
fmt.Printf("}\n\n") | |||||
// Any run of at least separation continuous zero entries in the reverse map will | |||||
// be a separate encode table. | |||||
const separation = 1024 | |||||
intervals := []interval(nil) | |||||
low, high := -1, -1 | |||||
for i, v := range reverse { | |||||
if v == 0 { | |||||
continue | |||||
} | |||||
if low < 0 { | |||||
low = i | |||||
} else if i-high >= separation { | |||||
if high >= 0 { | |||||
intervals = append(intervals, interval{low, high}) | |||||
} | |||||
low = i | |||||
} | |||||
high = i + 1 | |||||
} | |||||
if high >= 0 { | |||||
intervals = append(intervals, interval{low, high}) | |||||
} | |||||
sort.Sort(byDecreasingLength(intervals)) | |||||
fmt.Printf("const numEncodeTables = %d\n\n", len(intervals)) | |||||
fmt.Printf("// encodeX are the encoding tables from Unicode to EUC-KR code,\n") | |||||
fmt.Printf("// sorted by decreasing length.\n") | |||||
for i, v := range intervals { | |||||
fmt.Printf("// encode%d: %5d entries for runes in [%5d, %5d).\n", i, v.len(), v.low, v.high) | |||||
} | |||||
fmt.Printf("\n") | |||||
for i, v := range intervals { | |||||
fmt.Printf("const encode%dLow, encode%dHigh = %d, %d\n\n", i, i, v.low, v.high) | |||||
fmt.Printf("var encode%d = [...]uint16{\n", i) | |||||
for j := v.low; j < v.high; j++ { | |||||
x := reverse[j] | |||||
if x == 0 { | |||||
continue | |||||
} | |||||
fmt.Printf("\t%d-%d: 0x%04X,\n", j, v.low, x) | |||||
} | |||||
fmt.Printf("}\n\n") | |||||
} | |||||
} | |||||
// interval is a half-open interval [low, high). | |||||
type interval struct { | |||||
low, high int | |||||
} | |||||
func (i interval) len() int { return i.high - i.low } | |||||
// byDecreasingLength sorts intervals by decreasing length. | |||||
type byDecreasingLength []interval | |||||
func (b byDecreasingLength) Len() int { return len(b) } | |||||
func (b byDecreasingLength) Less(i, j int) bool { return b[i].len() > b[j].len() } | |||||
func (b byDecreasingLength) Swap(i, j int) { b[i], b[j] = b[j], b[i] } |
@@ -5,7 +5,6 @@ | |||||
package simplifiedchinese | package simplifiedchinese | ||||
import ( | import ( | ||||
"errors" | |||||
"unicode/utf8" | "unicode/utf8" | ||||
"golang.org/x/text/encoding" | "golang.org/x/text/encoding" | ||||
@@ -40,11 +39,6 @@ var gbk18030 = internal.Encoding{ | |||||
identifier.GB18030, | identifier.GB18030, | ||||
} | } | ||||
var ( | |||||
errInvalidGB18030 = errors.New("simplifiedchinese: invalid GB18030 encoding") | |||||
errInvalidGBK = errors.New("simplifiedchinese: invalid GBK encoding") | |||||
) | |||||
type gbkDecoder struct { | type gbkDecoder struct { | ||||
transform.NopResetter | transform.NopResetter | ||||
gb18030 bool | gb18030 bool | ||||
@@ -66,8 +60,12 @@ loop: | |||||
case c0 < 0xff: | case c0 < 0xff: | ||||
if nSrc+1 >= len(src) { | if nSrc+1 >= len(src) { | ||||
err = transform.ErrShortSrc | |||||
break loop | |||||
if !atEOF { | |||||
err = transform.ErrShortSrc | |||||
break loop | |||||
} | |||||
r, size = utf8.RuneError, 1 | |||||
goto write | |||||
} | } | ||||
c1 := src[nSrc+1] | c1 := src[nSrc+1] | ||||
switch { | switch { | ||||
@@ -77,18 +75,24 @@ loop: | |||||
c1 -= 0x41 | c1 -= 0x41 | ||||
case d.gb18030 && 0x30 <= c1 && c1 < 0x40: | case d.gb18030 && 0x30 <= c1 && c1 < 0x40: | ||||
if nSrc+3 >= len(src) { | if nSrc+3 >= len(src) { | ||||
err = transform.ErrShortSrc | |||||
break loop | |||||
if !atEOF { | |||||
err = transform.ErrShortSrc | |||||
break loop | |||||
} | |||||
// The second byte here is always ASCII, so we can set size | |||||
// to 1 in all cases. | |||||
r, size = utf8.RuneError, 1 | |||||
goto write | |||||
} | } | ||||
c2 := src[nSrc+2] | c2 := src[nSrc+2] | ||||
if c2 < 0x81 || 0xff <= c2 { | if c2 < 0x81 || 0xff <= c2 { | ||||
err = errInvalidGB18030 | |||||
break loop | |||||
r, size = utf8.RuneError, 1 | |||||
goto write | |||||
} | } | ||||
c3 := src[nSrc+3] | c3 := src[nSrc+3] | ||||
if c3 < 0x30 || 0x3a <= c3 { | if c3 < 0x30 || 0x3a <= c3 { | ||||
err = errInvalidGB18030 | |||||
break loop | |||||
r, size = utf8.RuneError, 1 | |||||
goto write | |||||
} | } | ||||
size = 4 | size = 4 | ||||
r = ((rune(c0-0x81)*10+rune(c1-0x30))*126+rune(c2-0x81))*10 + rune(c3-0x30) | r = ((rune(c0-0x81)*10+rune(c1-0x30))*126+rune(c2-0x81))*10 + rune(c3-0x30) | ||||
@@ -109,17 +113,13 @@ loop: | |||||
r -= 189000 | r -= 189000 | ||||
if 0 <= r && r < 0x100000 { | if 0 <= r && r < 0x100000 { | ||||
r += 0x10000 | r += 0x10000 | ||||
goto write | |||||
} | |||||
err = errInvalidGB18030 | |||||
break loop | |||||
default: | |||||
if d.gb18030 { | |||||
err = errInvalidGB18030 | |||||
} else { | } else { | ||||
err = errInvalidGBK | |||||
r, size = utf8.RuneError, 1 | |||||
} | } | ||||
break loop | |||||
goto write | |||||
default: | |||||
r, size = utf8.RuneError, 1 | |||||
goto write | |||||
} | } | ||||
r, size = '\ufffd', 2 | r, size = '\ufffd', 2 | ||||
if i := int(c0-0x81)*190 + int(c1); i < len(decode) { | if i := int(c0-0x81)*190 + int(c1); i < len(decode) { | ||||
@@ -130,12 +130,7 @@ loop: | |||||
} | } | ||||
default: | default: | ||||
if d.gb18030 { | |||||
err = errInvalidGB18030 | |||||
} else { | |||||
err = errInvalidGBK | |||||
} | |||||
break loop | |||||
r, size = utf8.RuneError, 1 | |||||
} | } | ||||
write: | write: | ||||
@@ -145,13 +140,6 @@ loop: | |||||
} | } | ||||
nDst += utf8.EncodeRune(dst[nDst:], r) | nDst += utf8.EncodeRune(dst[nDst:], r) | ||||
} | } | ||||
if atEOF && err == transform.ErrShortSrc { | |||||
if d.gb18030 { | |||||
err = errInvalidGB18030 | |||||
} else { | |||||
err = errInvalidGBK | |||||
} | |||||
} | |||||
return nDst, nSrc, err | return nDst, nSrc, err | ||||
} | } | ||||
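With this change the GBK and GB18030 decoders no longer abort on malformed input: invalid byte sequences decode to U+FFFD, and transform.ErrShortSrc is only reported for a truncated sequence while more input may still arrive (!atEOF). A rough usage sketch, assuming the package-level GBK encoding and the transform helpers from golang.org/x/text behave as at the time of this change; the exact output is not asserted here.

    package main

    import (
        "fmt"

        "golang.org/x/text/encoding/simplifiedchinese"
        "golang.org/x/text/transform"
    )

    func main() {
        // 0x81 0x30 is not a valid GBK double-byte sequence (the second byte
        // is outside the valid ranges), so the decoder now emits U+FFFD for
        // the lead byte and keeps going instead of returning an error.
        in := []byte("abc\x81\x30def")
        out, _, err := transform.Bytes(simplifiedchinese.GBK.NewDecoder(), in)
        fmt.Printf("%q %v\n", out, err)
    }
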
@@ -5,7 +5,6 @@ | |||||
package simplifiedchinese | package simplifiedchinese | ||||
import ( | import ( | ||||
"errors" | |||||
"unicode/utf8" | "unicode/utf8" | ||||
"golang.org/x/text/encoding" | "golang.org/x/text/encoding" | ||||
@@ -31,8 +30,6 @@ func hzGB2312NewEncoder() transform.Transformer { | |||||
return new(hzGB2312Encoder) | return new(hzGB2312Encoder) | ||||
} | } | ||||
var errInvalidHZGB2312 = errors.New("simplifiedchinese: invalid HZ-GB2312 encoding") | |||||
const ( | const ( | ||||
asciiState = iota | asciiState = iota | ||||
gbState | gbState | ||||
@@ -50,14 +47,18 @@ loop: | |||||
for ; nSrc < len(src); nSrc += size { | for ; nSrc < len(src); nSrc += size { | ||||
c0 := src[nSrc] | c0 := src[nSrc] | ||||
if c0 >= utf8.RuneSelf { | if c0 >= utf8.RuneSelf { | ||||
err = errInvalidHZGB2312 | |||||
break loop | |||||
r, size = utf8.RuneError, 1 | |||||
goto write | |||||
} | } | ||||
if c0 == '~' { | if c0 == '~' { | ||||
if nSrc+1 >= len(src) { | if nSrc+1 >= len(src) { | ||||
err = transform.ErrShortSrc | |||||
break loop | |||||
if !atEOF { | |||||
err = transform.ErrShortSrc | |||||
break loop | |||||
} | |||||
r = utf8.RuneError | |||||
goto write | |||||
} | } | ||||
size = 2 | size = 2 | ||||
switch src[nSrc+1] { | switch src[nSrc+1] { | ||||
@@ -78,8 +79,8 @@ loop: | |||||
case '\n': | case '\n': | ||||
continue | continue | ||||
default: | default: | ||||
err = errInvalidHZGB2312 | |||||
break loop | |||||
r = utf8.RuneError | |||||
goto write | |||||
} | } | ||||
} | } | ||||
@@ -87,33 +88,37 @@ loop: | |||||
r, size = rune(c0), 1 | r, size = rune(c0), 1 | ||||
} else { | } else { | ||||
if nSrc+1 >= len(src) { | if nSrc+1 >= len(src) { | ||||
err = transform.ErrShortSrc | |||||
break loop | |||||
if !atEOF { | |||||
err = transform.ErrShortSrc | |||||
break loop | |||||
} | |||||
r, size = utf8.RuneError, 1 | |||||
goto write | |||||
} | } | ||||
size = 2 | |||||
c1 := src[nSrc+1] | c1 := src[nSrc+1] | ||||
if c0 < 0x21 || 0x7e <= c0 || c1 < 0x21 || 0x7f <= c1 { | if c0 < 0x21 || 0x7e <= c0 || c1 < 0x21 || 0x7f <= c1 { | ||||
err = errInvalidHZGB2312 | |||||
break loop | |||||
} | |||||
r, size = '\ufffd', 2 | |||||
if i := int(c0-0x01)*190 + int(c1+0x3f); i < len(decode) { | |||||
// error | |||||
} else if i := int(c0-0x01)*190 + int(c1+0x3f); i < len(decode) { | |||||
r = rune(decode[i]) | r = rune(decode[i]) | ||||
if r == 0 { | |||||
r = '\ufffd' | |||||
if r != 0 { | |||||
goto write | |||||
} | } | ||||
} | } | ||||
if c1 > utf8.RuneSelf { | |||||
// Be consistent and always treat non-ASCII as a single error. | |||||
size = 1 | |||||
} | |||||
r = utf8.RuneError | |||||
} | } | ||||
write: | |||||
if nDst+utf8.RuneLen(r) > len(dst) { | if nDst+utf8.RuneLen(r) > len(dst) { | ||||
err = transform.ErrShortDst | err = transform.ErrShortDst | ||||
break loop | break loop | ||||
} | } | ||||
nDst += utf8.EncodeRune(dst[nDst:], r) | nDst += utf8.EncodeRune(dst[nDst:], r) | ||||
} | } | ||||
if atEOF && err == transform.ErrShortSrc { | |||||
err = errInvalidHZGB2312 | |||||
} | |||||
return nDst, nSrc, err | return nDst, nSrc, err | ||||
} | } | ||||
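Both decoders above now follow the same convention for a multi-byte sequence that is cut off at the end of src: report transform.ErrShortSrc only while more input may follow (!atEOF), and emit U+FFFD once atEOF is true. A stripped-down, purely hypothetical Transformer showing just that pattern (a 0xFE lead byte must be followed by one more byte; every other byte becomes the rune of its value):

    package main

    import (
        "fmt"
        "unicode/utf8"

        "golang.org/x/text/transform"
    )

    // pairDecoder is a toy decoder: 0xFE starts a two-byte unit whose second
    // byte is echoed as a rune; any other byte becomes the rune of its value.
    type pairDecoder struct{ transform.NopResetter }

    func (pairDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
        for nSrc < len(src) {
            r, size := rune(src[nSrc]), 1
            if src[nSrc] == 0xFE {
                if nSrc+1 >= len(src) {
                    if !atEOF {
                        // The pair may still be completed by the next chunk.
                        return nDst, nSrc, transform.ErrShortSrc
                    }
                    // Input ends mid-sequence: emit the replacement rune.
                    r = utf8.RuneError
                } else {
                    r, size = rune(src[nSrc+1]), 2
                }
            }
            if nDst+utf8.RuneLen(r) > len(dst) {
                return nDst, nSrc, transform.ErrShortDst
            }
            nDst += utf8.EncodeRune(dst[nDst:], r)
            nSrc += size
        }
        return nDst, nSrc, nil
    }

    func main() {
        out, _, err := transform.Bytes(pairDecoder{}, []byte{'a', 0xFE, 'b', 0xFE})
        fmt.Printf("%+q %v\n", out, err) // "ab\ufffd" <nil>
    }
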
@@ -1,161 +0,0 @@ | |||||
// Copyright 2013 The Go Authors. All rights reserved. | |||||
// Use of this source code is governed by a BSD-style | |||||
// license that can be found in the LICENSE file. | |||||
// +build ignore | |||||
package main | |||||
// This program generates tables.go: | |||||
// go run maketables.go | gofmt > tables.go | |||||
import ( | |||||
"bufio" | |||||
"fmt" | |||||
"log" | |||||
"net/http" | |||||
"sort" | |||||
"strings" | |||||
) | |||||
func main() { | |||||
fmt.Printf("// generated by go run maketables.go; DO NOT EDIT\n\n") | |||||
fmt.Printf("// Package simplifiedchinese provides Simplified Chinese encodings such as GBK.\n") | |||||
fmt.Printf(`package simplifiedchinese // import "golang.org/x/text/encoding/simplifiedchinese"` + "\n\n") | |||||
printGB18030() | |||||
printGBK() | |||||
} | |||||
func printGB18030() { | |||||
res, err := http.Get("http://encoding.spec.whatwg.org/index-gb18030.txt") | |||||
if err != nil { | |||||
log.Fatalf("Get: %v", err) | |||||
} | |||||
defer res.Body.Close() | |||||
fmt.Printf("// gb18030 is the table from http://encoding.spec.whatwg.org/index-gb18030.txt\n") | |||||
fmt.Printf("var gb18030 = [...][2]uint16{\n") | |||||
scanner := bufio.NewScanner(res.Body) | |||||
for scanner.Scan() { | |||||
s := strings.TrimSpace(scanner.Text()) | |||||
if s == "" || s[0] == '#' { | |||||
continue | |||||
} | |||||
x, y := uint32(0), uint32(0) | |||||
if _, err := fmt.Sscanf(s, "%d 0x%x", &x, &y); err != nil { | |||||
log.Fatalf("could not parse %q", s) | |||||
} | |||||
if x < 0x10000 && y < 0x10000 { | |||||
fmt.Printf("\t{0x%04x, 0x%04x},\n", x, y) | |||||
} | |||||
} | |||||
fmt.Printf("}\n\n") | |||||
} | |||||
func printGBK() { | |||||
res, err := http.Get("http://encoding.spec.whatwg.org/index-gbk.txt") | |||||
if err != nil { | |||||
log.Fatalf("Get: %v", err) | |||||
} | |||||
defer res.Body.Close() | |||||
mapping := [65536]uint16{} | |||||
reverse := [65536]uint16{} | |||||
scanner := bufio.NewScanner(res.Body) | |||||
for scanner.Scan() { | |||||
s := strings.TrimSpace(scanner.Text()) | |||||
if s == "" || s[0] == '#' { | |||||
continue | |||||
} | |||||
x, y := uint16(0), uint16(0) | |||||
if _, err := fmt.Sscanf(s, "%d 0x%x", &x, &y); err != nil { | |||||
log.Fatalf("could not parse %q", s) | |||||
} | |||||
if x < 0 || 126*190 <= x { | |||||
log.Fatalf("GBK code %d is out of range", x) | |||||
} | |||||
mapping[x] = y | |||||
if reverse[y] == 0 { | |||||
c0, c1 := x/190, x%190 | |||||
if c1 >= 0x3f { | |||||
c1++ | |||||
} | |||||
reverse[y] = (0x81+c0)<<8 | (0x40 + c1) | |||||
} | |||||
} | |||||
if err := scanner.Err(); err != nil { | |||||
log.Fatalf("scanner error: %v", err) | |||||
} | |||||
fmt.Printf("// decode is the decoding table from GBK code to Unicode.\n") | |||||
fmt.Printf("// It is defined at http://encoding.spec.whatwg.org/index-gbk.txt\n") | |||||
fmt.Printf("var decode = [...]uint16{\n") | |||||
for i, v := range mapping { | |||||
if v != 0 { | |||||
fmt.Printf("\t%d: 0x%04X,\n", i, v) | |||||
} | |||||
} | |||||
fmt.Printf("}\n\n") | |||||
// Any run of at least separation continuous zero entries in the reverse map will | |||||
// be a separate encode table. | |||||
const separation = 1024 | |||||
intervals := []interval(nil) | |||||
low, high := -1, -1 | |||||
for i, v := range reverse { | |||||
if v == 0 { | |||||
continue | |||||
} | |||||
if low < 0 { | |||||
low = i | |||||
} else if i-high >= separation { | |||||
if high >= 0 { | |||||
intervals = append(intervals, interval{low, high}) | |||||
} | |||||
low = i | |||||
} | |||||
high = i + 1 | |||||
} | |||||
if high >= 0 { | |||||
intervals = append(intervals, interval{low, high}) | |||||
} | |||||
sort.Sort(byDecreasingLength(intervals)) | |||||
fmt.Printf("const numEncodeTables = %d\n\n", len(intervals)) | |||||
fmt.Printf("// encodeX are the encoding tables from Unicode to GBK code,\n") | |||||
fmt.Printf("// sorted by decreasing length.\n") | |||||
for i, v := range intervals { | |||||
fmt.Printf("// encode%d: %5d entries for runes in [%5d, %5d).\n", i, v.len(), v.low, v.high) | |||||
} | |||||
fmt.Printf("\n") | |||||
for i, v := range intervals { | |||||
fmt.Printf("const encode%dLow, encode%dHigh = %d, %d\n\n", i, i, v.low, v.high) | |||||
fmt.Printf("var encode%d = [...]uint16{\n", i) | |||||
for j := v.low; j < v.high; j++ { | |||||
x := reverse[j] | |||||
if x == 0 { | |||||
continue | |||||
} | |||||
fmt.Printf("\t%d-%d: 0x%04X,\n", j, v.low, x) | |||||
} | |||||
fmt.Printf("}\n\n") | |||||
} | |||||
} | |||||
// interval is a half-open interval [low, high). | |||||
type interval struct { | |||||
low, high int | |||||
} | |||||
func (i interval) len() int { return i.high - i.low } | |||||
// byDecreasingLength sorts intervals by decreasing length. | |||||
type byDecreasingLength []interval | |||||
func (b byDecreasingLength) Len() int { return len(b) } | |||||
func (b byDecreasingLength) Less(i, j int) bool { return b[i].len() > b[j].len() } | |||||
func (b byDecreasingLength) Swap(i, j int) { b[i], b[j] = b[j], b[i] } |
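The removed generator builds the GBK reverse table by turning a WHATWG index pointer x into two bytes: c0 = 0x81 + x/190 and c1 = 0x40 + x%190, incrementing c1 first when x%190 >= 0x3F so that 0x7F is skipped as a second byte. A small sketch of that arithmetic in both directions; the pointer value 9026 (bytes 0xB0 0xA1) is used only as a worked example.

    package main

    import "fmt"

    // pointerToBytes mirrors the arithmetic used by the generator above.
    func pointerToBytes(x uint16) (byte, byte) {
        c0, c1 := x/190, x%190
        if c1 >= 0x3f {
            c1++ // skip 0x7F as a second byte
        }
        return byte(0x81 + c0), byte(0x40 + c1)
    }

    // bytesToPointer mirrors the adjustment made on the decoder side
    // (second bytes below 0x7F subtract 0x40, those above subtract 0x41).
    func bytesToPointer(c0, c1 byte) int {
        switch {
        case 0x40 <= c1 && c1 < 0x7f:
            c1 -= 0x40
        case 0x80 <= c1 && c1 < 0xff:
            c1 -= 0x41
        }
        return int(c0-0x81)*190 + int(c1)
    }

    func main() {
        b0, b1 := pointerToBytes(9026)
        fmt.Printf("0x%02X 0x%02X -> %d\n", b0, b1, bytesToPointer(b0, b1)) // 0xB0 0xA1 -> 9026
    }
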
@@ -5,7 +5,6 @@ | |||||
package traditionalchinese | package traditionalchinese | ||||
import ( | import ( | ||||
"errors" | |||||
"unicode/utf8" | "unicode/utf8" | ||||
"golang.org/x/text/encoding" | "golang.org/x/text/encoding" | ||||
@@ -26,8 +25,6 @@ var big5 = internal.Encoding{ | |||||
identifier.Big5, | identifier.Big5, | ||||
} | } | ||||
var errInvalidBig5 = errors.New("traditionalchinese: invalid Big5 encoding") | |||||
type big5Decoder struct{ transform.NopResetter } | type big5Decoder struct{ transform.NopResetter } | ||||
func (big5Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { | func (big5Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { | ||||
@@ -40,8 +37,12 @@ loop: | |||||
case 0x81 <= c0 && c0 < 0xff: | case 0x81 <= c0 && c0 < 0xff: | ||||
if nSrc+1 >= len(src) { | if nSrc+1 >= len(src) { | ||||
err = transform.ErrShortSrc | |||||
break loop | |||||
if !atEOF { | |||||
err = transform.ErrShortSrc | |||||
break loop | |||||
} | |||||
r, size = utf8.RuneError, 1 | |||||
goto write | |||||
} | } | ||||
c1 := src[nSrc+1] | c1 := src[nSrc+1] | ||||
switch { | switch { | ||||
@@ -49,9 +50,12 @@ loop: | |||||
c1 -= 0x40 | c1 -= 0x40 | ||||
case 0xa1 <= c1 && c1 < 0xff: | case 0xa1 <= c1 && c1 < 0xff: | ||||
c1 -= 0x62 | c1 -= 0x62 | ||||
case c1 < 0x40: | |||||
r, size = utf8.RuneError, 1 | |||||
goto write | |||||
default: | default: | ||||
err = errInvalidBig5 | |||||
break loop | |||||
r, size = utf8.RuneError, 2 | |||||
goto write | |||||
} | } | ||||
r, size = '\ufffd', 2 | r, size = '\ufffd', 2 | ||||
if i := int(c0-0x81)*157 + int(c1); i < len(decode) { | if i := int(c0-0x81)*157 + int(c1); i < len(decode) { | ||||
@@ -80,10 +84,10 @@ loop: | |||||
} | } | ||||
default: | default: | ||||
err = errInvalidBig5 | |||||
break loop | |||||
r, size = utf8.RuneError, 1 | |||||
} | } | ||||
write: | |||||
if nDst+utf8.RuneLen(r) > len(dst) { | if nDst+utf8.RuneLen(r) > len(dst) { | ||||
err = transform.ErrShortDst | err = transform.ErrShortDst | ||||
break loop | break loop | ||||
@@ -99,9 +103,6 @@ loop: | |||||
nDst += copy(dst[nDst:], s) | nDst += copy(dst[nDst:], s) | ||||
continue loop | continue loop | ||||
} | } | ||||
if atEOF && err == transform.ErrShortSrc { | |||||
err = errInvalidBig5 | |||||
} | |||||
return nDst, nSrc, err | return nDst, nSrc, err | ||||
} | } | ||||
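The Big5 decoder gets the same error-to-U+FFFD treatment, with one wrinkle visible in the hunk above: a second byte below 0x40 only consumes the lead byte (size 1, so that byte is re-examined on the next iteration), while other invalid second bytes are consumed together with the lead byte (size 2). A rough sketch that just prints the result rather than asserting it, assuming the package-level Big5 encoding:

    package main

    import (
        "fmt"

        "golang.org/x/text/encoding/traditionalchinese"
        "golang.org/x/text/transform"
    )

    func main() {
        // The second byte 0x30 is below 0x40, so only 0x81 becomes U+FFFD
        // and '0' is then decoded on its own.
        out, _, err := transform.Bytes(traditionalchinese.Big5.NewDecoder(), []byte("x\x81\x30y"))
        fmt.Printf("%q %v\n", out, err)
    }
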
@@ -1,140 +0,0 @@ | |||||
// Copyright 2013 The Go Authors. All rights reserved. | |||||
// Use of this source code is governed by a BSD-style | |||||
// license that can be found in the LICENSE file. | |||||
// +build ignore | |||||
package main | |||||
// This program generates tables.go: | |||||
// go run maketables.go | gofmt > tables.go | |||||
import ( | |||||
"bufio" | |||||
"fmt" | |||||
"log" | |||||
"net/http" | |||||
"sort" | |||||
"strings" | |||||
) | |||||
func main() { | |||||
fmt.Printf("// generated by go run maketables.go; DO NOT EDIT\n\n") | |||||
fmt.Printf("// Package traditionalchinese provides Traditional Chinese encodings such as Big5.\n") | |||||
fmt.Printf(`package traditionalchinese // import "golang.org/x/text/encoding/traditionalchinese"` + "\n\n") | |||||
res, err := http.Get("http://encoding.spec.whatwg.org/index-big5.txt") | |||||
if err != nil { | |||||
log.Fatalf("Get: %v", err) | |||||
} | |||||
defer res.Body.Close() | |||||
mapping := [65536]uint32{} | |||||
reverse := [65536 * 4]uint16{} | |||||
scanner := bufio.NewScanner(res.Body) | |||||
for scanner.Scan() { | |||||
s := strings.TrimSpace(scanner.Text()) | |||||
if s == "" || s[0] == '#' { | |||||
continue | |||||
} | |||||
x, y := uint16(0), uint32(0) | |||||
if _, err := fmt.Sscanf(s, "%d 0x%x", &x, &y); err != nil { | |||||
log.Fatalf("could not parse %q", s) | |||||
} | |||||
if x < 0 || 126*157 <= x { | |||||
log.Fatalf("Big5 code %d is out of range", x) | |||||
} | |||||
mapping[x] = y | |||||
// The WHATWG spec http://encoding.spec.whatwg.org/#indexes says that | |||||
// "The index pointer for code point in index is the first pointer | |||||
// corresponding to code point in index", which would normally mean | |||||
// that the code below should be guarded by "if reverse[y] == 0", but | |||||
// last instead of first seems to match the behavior of | |||||
// "iconv -f UTF-8 -t BIG5". For example, U+8005 者 occurs twice in | |||||
// http://encoding.spec.whatwg.org/index-big5.txt, as index 2148 | |||||
// (encoded as "\x8e\xcd") and index 6543 (encoded as "\xaa\xcc") | |||||
// and "echo 者 | iconv -f UTF-8 -t BIG5 | xxd" gives "\xaa\xcc". | |||||
c0, c1 := x/157, x%157 | |||||
if c1 < 0x3f { | |||||
c1 += 0x40 | |||||
} else { | |||||
c1 += 0x62 | |||||
} | |||||
reverse[y] = (0x81+c0)<<8 | c1 | |||||
} | |||||
if err := scanner.Err(); err != nil { | |||||
log.Fatalf("scanner error: %v", err) | |||||
} | |||||
fmt.Printf("// decode is the decoding table from Big5 code to Unicode.\n") | |||||
fmt.Printf("// It is defined at http://encoding.spec.whatwg.org/index-big5.txt\n") | |||||
fmt.Printf("var decode = [...]uint32{\n") | |||||
for i, v := range mapping { | |||||
if v != 0 { | |||||
fmt.Printf("\t%d: 0x%08X,\n", i, v) | |||||
} | |||||
} | |||||
fmt.Printf("}\n\n") | |||||
// Any run of at least separation continuous zero entries in the reverse map will | |||||
// be a separate encode table. | |||||
const separation = 1024 | |||||
intervals := []interval(nil) | |||||
low, high := -1, -1 | |||||
for i, v := range reverse { | |||||
if v == 0 { | |||||
continue | |||||
} | |||||
if low < 0 { | |||||
low = i | |||||
} else if i-high >= separation { | |||||
if high >= 0 { | |||||
intervals = append(intervals, interval{low, high}) | |||||
} | |||||
low = i | |||||
} | |||||
high = i + 1 | |||||
} | |||||
if high >= 0 { | |||||
intervals = append(intervals, interval{low, high}) | |||||
} | |||||
sort.Sort(byDecreasingLength(intervals)) | |||||
fmt.Printf("const numEncodeTables = %d\n\n", len(intervals)) | |||||
fmt.Printf("// encodeX are the encoding tables from Unicode to Big5 code,\n") | |||||
fmt.Printf("// sorted by decreasing length.\n") | |||||
for i, v := range intervals { | |||||
fmt.Printf("// encode%d: %5d entries for runes in [%6d, %6d).\n", i, v.len(), v.low, v.high) | |||||
} | |||||
fmt.Printf("\n") | |||||
for i, v := range intervals { | |||||
fmt.Printf("const encode%dLow, encode%dHigh = %d, %d\n\n", i, i, v.low, v.high) | |||||
fmt.Printf("var encode%d = [...]uint16{\n", i) | |||||
for j := v.low; j < v.high; j++ { | |||||
x := reverse[j] | |||||
if x == 0 { | |||||
continue | |||||
} | |||||
fmt.Printf("\t%d-%d: 0x%04X,\n", j, v.low, x) | |||||
} | |||||
fmt.Printf("}\n\n") | |||||
} | |||||
} | |||||
// interval is a half-open interval [low, high). | |||||
type interval struct { | |||||
low, high int | |||||
} | |||||
func (i interval) len() int { return i.high - i.low } | |||||
// byDecreasingLength sorts intervals by decreasing length. | |||||
type byDecreasingLength []interval | |||||
func (b byDecreasingLength) Len() int { return len(b) } | |||||
func (b byDecreasingLength) Less(i, j int) bool { return b[i].len() > b[j].len() } | |||||
func (b byDecreasingLength) Swap(i, j int) { b[i], b[j] = b[j], b[i] } |
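The long comment in the removed generator explains why the Big5 reverse map deliberately lets a later pointer for the same code point overwrite an earlier one (matching iconv), whereas the GBK generator keeps the first pointer by guarding with "if reverse[y] == 0". A tiny illustration of the two policies, using the two pointers for U+8005 quoted in that comment:

    package main

    import "fmt"

    func main() {
        // U+8005 appears twice in the Big5 index, at pointers 2148 and 6543.
        entries := []struct{ pointer, y uint16 }{{2148, 0x8005}, {6543, 0x8005}}

        firstWins := map[uint16]uint16{} // GBK-style: keep the first pointer
        lastWins := map[uint16]uint16{}  // Big5-style: keep the last pointer
        for _, e := range entries {
            if _, ok := firstWins[e.y]; !ok {
                firstWins[e.y] = e.pointer
            }
            lastWins[e.y] = e.pointer
        }
        fmt.Println(firstWins[0x8005], lastWins[0x8005]) // 2148 6543
    }
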
@@ -1,4 +1,4 @@ | |||||
// This file was generated by go generate; DO NOT EDIT | |||||
// Code generated by running "go generate" in golang.org/x/text. DO NOT EDIT. | |||||
package language | package language | ||||
@@ -1,20 +0,0 @@ | |||||
// Copyright 2014 The Go Authors. All rights reserved. | |||||
// Use of this source code is governed by a BSD-style | |||||
// license that can be found in the LICENSE file. | |||||
// +build ignore | |||||
package main | |||||
// This file contains code common to the maketables.go and the package code. | |||||
// langAliasType is the type of an alias in langAliasMap. | |||||
type langAliasType int8 | |||||
const ( | |||||
langDeprecated langAliasType = iota | |||||
langMacro | |||||
langLegacy | |||||
langAliasTypeUnknown langAliasType = -1 | |||||
) |
@@ -1,162 +0,0 @@ | |||||
// Copyright 2015 The Go Authors. All rights reserved. | |||||
// Use of this source code is governed by a BSD-style | |||||
// license that can be found in the LICENSE file. | |||||
// +build ignore | |||||
package main | |||||
// This file generates derivative tables based on the language package itself. | |||||
import ( | |||||
"bytes" | |||||
"flag" | |||||
"fmt" | |||||
"io/ioutil" | |||||
"log" | |||||
"reflect" | |||||
"sort" | |||||
"strings" | |||||
"golang.org/x/text/internal/gen" | |||||
"golang.org/x/text/language" | |||||
"golang.org/x/text/unicode/cldr" | |||||
) | |||||
var ( | |||||
test = flag.Bool("test", false, | |||||
"test existing tables; can be used to compare web data with package data.") | |||||
draft = flag.String("draft", | |||||
"contributed", | |||||
`Minimal draft requirements (approved, contributed, provisional, unconfirmed).`) | |||||
) | |||||
func main() { | |||||
gen.Init() | |||||
// Read the CLDR zip file. | |||||
r := gen.OpenCLDRCoreZip() | |||||
defer r.Close() | |||||
d := &cldr.Decoder{} | |||||
data, err := d.DecodeZip(r) | |||||
if err != nil { | |||||
log.Fatalf("DecodeZip: %v", err) | |||||
} | |||||
w := gen.NewCodeWriter() | |||||
defer func() { | |||||
buf := &bytes.Buffer{} | |||||
if _, err = w.WriteGo(buf, "language"); err != nil { | |||||
log.Fatalf("Error formatting file index.go: %v", err) | |||||
} | |||||
// Since we're generating a table for our own package we need to rewrite | |||||
// doing the equivalent of go fmt -r 'language.b -> b'. Using | |||||
// bytes.Replace will do. | |||||
out := bytes.Replace(buf.Bytes(), []byte("language."), nil, -1) | |||||
if err := ioutil.WriteFile("index.go", out, 0600); err != nil { | |||||
log.Fatalf("Could not create file index.go: %v", err) | |||||
} | |||||
}() | |||||
m := map[language.Tag]bool{} | |||||
for _, lang := range data.Locales() { | |||||
// We include all locales unconditionally to be consistent with en_US. | |||||
// We want en_US, even though it has no data associated with it. | |||||
// TODO: put any of the languages for which no data exists at the end | |||||
// of the index. This allows all components based on ICU to use that | |||||
// as the cutoff point. | |||||
// if x := data.RawLDML(lang); false || | |||||
// x.LocaleDisplayNames != nil || | |||||
// x.Characters != nil || | |||||
// x.Delimiters != nil || | |||||
// x.Measurement != nil || | |||||
// x.Dates != nil || | |||||
// x.Numbers != nil || | |||||
// x.Units != nil || | |||||
// x.ListPatterns != nil || | |||||
// x.Collations != nil || | |||||
// x.Segmentations != nil || | |||||
// x.Rbnf != nil || | |||||
// x.Annotations != nil || | |||||
// x.Metadata != nil { | |||||
// TODO: support POSIX natively, albeit non-standard. | |||||
tag := language.Make(strings.Replace(lang, "_POSIX", "-u-va-posix", 1)) | |||||
m[tag] = true | |||||
// } | |||||
} | |||||
// Include locales for plural rules, which uses a different structure. | |||||
for _, plurals := range data.Supplemental().Plurals { | |||||
for _, rules := range plurals.PluralRules { | |||||
for _, lang := range strings.Split(rules.Locales, " ") { | |||||
m[language.Make(lang)] = true | |||||
} | |||||
} | |||||
} | |||||
var core, special []language.Tag | |||||
for t := range m { | |||||
if x := t.Extensions(); len(x) != 0 && fmt.Sprint(x) != "[u-va-posix]" { | |||||
log.Fatalf("Unexpected extension %v in %v", x, t) | |||||
} | |||||
if len(t.Variants()) == 0 && len(t.Extensions()) == 0 { | |||||
core = append(core, t) | |||||
} else { | |||||
special = append(special, t) | |||||
} | |||||
} | |||||
w.WriteComment(` | |||||
NumCompactTags is the number of common tags. The maximum tag is | |||||
NumCompactTags-1.`) | |||||
w.WriteConst("NumCompactTags", len(core)+len(special)) | |||||
sort.Sort(byAlpha(special)) | |||||
w.WriteVar("specialTags", special) | |||||
// TODO: order by frequency? | |||||
sort.Sort(byAlpha(core)) | |||||
// Size computations are just an estimate. | |||||
w.Size += int(reflect.TypeOf(map[uint32]uint16{}).Size()) | |||||
w.Size += len(core) * 6 // size of uint32 and uint16 | |||||
fmt.Fprintln(w) | |||||
fmt.Fprintln(w, "var coreTags = map[uint32]uint16{") | |||||
fmt.Fprintln(w, "0x0: 0, // und") | |||||
i := len(special) + 1 // Und and special tags already written. | |||||
for _, t := range core { | |||||
if t == language.Und { | |||||
continue | |||||
} | |||||
fmt.Fprint(w.Hash, t, i) | |||||
b, s, r := t.Raw() | |||||
fmt.Fprintf(w, "0x%s%s%s: %d, // %s\n", | |||||
getIndex(b, 3), // 3 is enough as it is guaranteed to be a compact number | |||||
getIndex(s, 2), | |||||
getIndex(r, 3), | |||||
i, t) | |||||
i++ | |||||
} | |||||
fmt.Fprintln(w, "}") | |||||
} | |||||
// getIndex prints the subtag type and extracts its index of size nibble. | |||||
// If the index is less than n nibbles, the result is prefixed with 0s. | |||||
func getIndex(x interface{}, n int) string { | |||||
s := fmt.Sprintf("%#v", x) // s is of form Type{typeID: 0x00} | |||||
s = s[strings.Index(s, "0x")+2 : len(s)-1] | |||||
return strings.Repeat("0", n-len(s)) + s | |||||
} | |||||
type byAlpha []language.Tag | |||||
func (a byAlpha) Len() int { return len(a) } | |||||
func (a byAlpha) Swap(i, j int) { a[i], a[j] = a[j], a[i] } | |||||
func (a byAlpha) Less(i, j int) bool { return a[i].String() < a[j].String() } |
@@ -440,8 +440,10 @@ func makeHaveTag(tag Tag, index int) (haveTag, langID) { | |||||
// script to map to another and we rely on this to keep the code simple. | // script to map to another and we rely on this to keep the code simple. | ||||
func altScript(l langID, s scriptID) scriptID { | func altScript(l langID, s scriptID) scriptID { | ||||
for _, alt := range matchScript { | for _, alt := range matchScript { | ||||
if (alt.lang == 0 || langID(alt.lang) == l) && scriptID(alt.have) == s { | |||||
return scriptID(alt.want) | |||||
// TODO: also match cases where language is not the same. | |||||
if (langID(alt.wantLang) == l || langID(alt.haveLang) == l) && | |||||
scriptID(alt.haveScript) == s { | |||||
return scriptID(alt.wantScript) | |||||
} | } | ||||
} | } | ||||
return 0 | return 0 | ||||
@@ -486,6 +488,16 @@ func (m *matcher) header(l langID) *matchHeader { | |||||
return h | return h | ||||
} | } | ||||
func toConf(d uint8) Confidence { | |||||
if d <= 10 { | |||||
return High | |||||
} | |||||
if d < 30 { | |||||
return Low | |||||
} | |||||
return No | |||||
} | |||||
// newMatcher builds an index for the given supported tags and returns it as | // newMatcher builds an index for the given supported tags and returns it as | ||||
// a matcher. It also expands the index by considering various equivalence classes | // a matcher. It also expands the index by considering various equivalence classes | ||||
// for a given tag. | // for a given tag. | ||||
@@ -537,9 +549,9 @@ func newMatcher(supported []Tag) *matcher { | |||||
// Add entries for languages with mutual intelligibility as defined by CLDR's | // Add entries for languages with mutual intelligibility as defined by CLDR's | ||||
// languageMatch data. | // languageMatch data. | ||||
for _, ml := range matchLang { | for _, ml := range matchLang { | ||||
update(ml.want, ml.have, Confidence(ml.conf), false) | |||||
update(ml.want, ml.have, toConf(ml.distance), false) | |||||
if !ml.oneway { | if !ml.oneway { | ||||
update(ml.have, ml.want, Confidence(ml.conf), false) | |||||
update(ml.have, ml.want, toConf(ml.distance), false) | |||||
} | } | ||||
} | } | ||||
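In the match.go hunk the per-entry Confidence stored in matchLang is replaced by a CLDR-style distance, which toConf folds back into a Confidence (<= 10 gives High, < 30 gives Low, anything else No). A hedged sketch of observing the result through the public matcher API; which tags actually match, and with what confidence, depends on the generated CLDR tables, so the output is deliberately not asserted:

    package main

    import (
        "fmt"

        "golang.org/x/text/language"
    )

    func main() {
        supported := []language.Tag{
            language.MustParse("en"),
            language.MustParse("nn"), // Norwegian Nynorsk
        }
        m := language.NewMatcher(supported)

        // "no" (Norwegian) is not supported directly; any match it gets comes
        // from the languageMatch distances that toConf converts.
        tag, index, conf := m.Match(language.MustParse("no"))
        fmt.Println(tag, index, conf)
    }
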
@@ -1,976 +0,0 @@ | |||||
// Copyright 2011 The Go Authors. All rights reserved. | |||||
// Use of this source code is governed by a BSD-style | |||||
// license that can be found in the LICENSE file. | |||||
// +build ignore | |||||
// Normalization table generator. | |||||
// Data read from the web. | |||||
// See forminfo.go for a description of the trie values associated with each rune. | |||||
package main | |||||
import ( | |||||
"bytes" | |||||
"flag" | |||||
"fmt" | |||||
"io" | |||||
"log" | |||||
"sort" | |||||
"strconv" | |||||
"strings" | |||||
"golang.org/x/text/internal/gen" | |||||
"golang.org/x/text/internal/triegen" | |||||
"golang.org/x/text/internal/ucd" | |||||
) | |||||
func main() { | |||||
gen.Init() | |||||
loadUnicodeData() | |||||
compactCCC() | |||||
loadCompositionExclusions() | |||||
completeCharFields(FCanonical) | |||||
completeCharFields(FCompatibility) | |||||
computeNonStarterCounts() | |||||
verifyComputed() | |||||
printChars() | |||||
testDerived() | |||||
printTestdata() | |||||
makeTables() | |||||
} | |||||
var ( | |||||
tablelist = flag.String("tables", | |||||
"all", | |||||
"comma-separated list of which tables to generate; "+ | |||||
"can be 'decomp', 'recomp', 'info' and 'all'") | |||||
test = flag.Bool("test", | |||||
false, | |||||
"test existing tables against DerivedNormalizationProps and generate test data for regression testing") | |||||
verbose = flag.Bool("verbose", | |||||
false, | |||||
"write data to stdout as it is parsed") | |||||
) | |||||
const MaxChar = 0x10FFFF // anything above this shouldn't exist | |||||
// Quick Check properties of runes allow us to quickly | |||||
// determine whether a rune may occur in a normal form. | |||||
// For a given normal form, a rune may be guaranteed to occur | |||||
// verbatim (QC=Yes), may or may not combine with another | |||||
// rune (QC=Maybe), or may not occur (QC=No). | |||||
type QCResult int | |||||
const ( | |||||
QCUnknown QCResult = iota | |||||
QCYes | |||||
QCNo | |||||
QCMaybe | |||||
) | |||||
func (r QCResult) String() string { | |||||
switch r { | |||||
case QCYes: | |||||
return "Yes" | |||||
case QCNo: | |||||
return "No" | |||||
case QCMaybe: | |||||
return "Maybe" | |||||
} | |||||
return "***UNKNOWN***" | |||||
} | |||||
const ( | |||||
FCanonical = iota // NFC or NFD | |||||
FCompatibility // NFKC or NFKD | |||||
FNumberOfFormTypes | |||||
) | |||||
const ( | |||||
MComposed = iota // NFC or NFKC | |||||
MDecomposed // NFD or NFKD | |||||
MNumberOfModes | |||||
) | |||||
// This contains only the properties we're interested in. | |||||
type Char struct { | |||||
name string | |||||
codePoint rune // if zero, this index is not a valid code point. | |||||
ccc uint8 // canonical combining class | |||||
origCCC uint8 | |||||
excludeInComp bool // from CompositionExclusions.txt | |||||
compatDecomp bool // it has a compatibility expansion | |||||
nTrailingNonStarters uint8 | |||||
nLeadingNonStarters uint8 // must be equal to trailing if non-zero | |||||
forms [FNumberOfFormTypes]FormInfo // For FCanonical and FCompatibility | |||||
state State | |||||
} | |||||
var chars = make([]Char, MaxChar+1) | |||||
var cccMap = make(map[uint8]uint8) | |||||
func (c Char) String() string { | |||||
buf := new(bytes.Buffer) | |||||
fmt.Fprintf(buf, "%U [%s]:\n", c.codePoint, c.name) | |||||
fmt.Fprintf(buf, " ccc: %v\n", c.ccc) | |||||
fmt.Fprintf(buf, " excludeInComp: %v\n", c.excludeInComp) | |||||
fmt.Fprintf(buf, " compatDecomp: %v\n", c.compatDecomp) | |||||
fmt.Fprintf(buf, " state: %v\n", c.state) | |||||
fmt.Fprintf(buf, " NFC:\n") | |||||
fmt.Fprint(buf, c.forms[FCanonical]) | |||||
fmt.Fprintf(buf, " NFKC:\n") | |||||
fmt.Fprint(buf, c.forms[FCompatibility]) | |||||
return buf.String() | |||||
} | |||||
// In UnicodeData.txt, some ranges are marked like this: | |||||
// 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;; | |||||
// 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;; | |||||
// parseCharacter keeps a state variable indicating the weirdness. | |||||
type State int | |||||
const ( | |||||
SNormal State = iota // known to be zero for the type | |||||
SFirst | |||||
SLast | |||||
SMissing | |||||
) | |||||
var lastChar = rune('\u0000') | |||||
func (c Char) isValid() bool { | |||||
return c.codePoint != 0 && c.state != SMissing | |||||
} | |||||
type FormInfo struct { | |||||
quickCheck [MNumberOfModes]QCResult // index: MComposed or MDecomposed | |||||
verified [MNumberOfModes]bool // index: MComposed or MDecomposed | |||||
combinesForward bool // May combine with rune on the right | |||||
combinesBackward bool // May combine with rune on the left | |||||
isOneWay bool // Never appears in result | |||||
inDecomp bool // Some decompositions result in this char. | |||||
decomp Decomposition | |||||
expandedDecomp Decomposition | |||||
} | |||||
func (f FormInfo) String() string { | |||||
buf := bytes.NewBuffer(make([]byte, 0)) | |||||
fmt.Fprintf(buf, " quickCheck[C]: %v\n", f.quickCheck[MComposed]) | |||||
fmt.Fprintf(buf, " quickCheck[D]: %v\n", f.quickCheck[MDecomposed]) | |||||
fmt.Fprintf(buf, " cmbForward: %v\n", f.combinesForward) | |||||
fmt.Fprintf(buf, " cmbBackward: %v\n", f.combinesBackward) | |||||
fmt.Fprintf(buf, " isOneWay: %v\n", f.isOneWay) | |||||
fmt.Fprintf(buf, " inDecomp: %v\n", f.inDecomp) | |||||
fmt.Fprintf(buf, " decomposition: %X\n", f.decomp) | |||||
fmt.Fprintf(buf, " expandedDecomp: %X\n", f.expandedDecomp) | |||||
return buf.String() | |||||
} | |||||
type Decomposition []rune | |||||
func parseDecomposition(s string, skipfirst bool) (a []rune, err error) { | |||||
decomp := strings.Split(s, " ") | |||||
if len(decomp) > 0 && skipfirst { | |||||
decomp = decomp[1:] | |||||
} | |||||
for _, d := range decomp { | |||||
point, err := strconv.ParseUint(d, 16, 64) | |||||
if err != nil { | |||||
return a, err | |||||
} | |||||
a = append(a, rune(point)) | |||||
} | |||||
return a, nil | |||||
} | |||||
func loadUnicodeData() { | |||||
f := gen.OpenUCDFile("UnicodeData.txt") | |||||
defer f.Close() | |||||
p := ucd.New(f) | |||||
for p.Next() { | |||||
r := p.Rune(ucd.CodePoint) | |||||
char := &chars[r] | |||||
char.ccc = uint8(p.Uint(ucd.CanonicalCombiningClass)) | |||||
decmap := p.String(ucd.DecompMapping) | |||||
exp, err := parseDecomposition(decmap, false) | |||||
isCompat := false | |||||
if err != nil { | |||||
if len(decmap) > 0 { | |||||
exp, err = parseDecomposition(decmap, true) | |||||
if err != nil { | |||||
log.Fatalf(`%U: bad decomp |%v|: "%s"`, r, decmap, err) | |||||
} | |||||
isCompat = true | |||||
} | |||||
} | |||||
char.name = p.String(ucd.Name) | |||||
char.codePoint = r | |||||
char.forms[FCompatibility].decomp = exp | |||||
if !isCompat { | |||||
char.forms[FCanonical].decomp = exp | |||||
} else { | |||||
char.compatDecomp = true | |||||
} | |||||
if len(decmap) > 0 { | |||||
char.forms[FCompatibility].decomp = exp | |||||
} | |||||
} | |||||
if err := p.Err(); err != nil { | |||||
log.Fatal(err) | |||||
} | |||||
} | |||||
// compactCCC converts the sparse set of CCC values to a contiguous one, |
// reducing the number of bits needed from 8 to 6. | |||||
func compactCCC() { | |||||
m := make(map[uint8]uint8) | |||||
for i := range chars { | |||||
c := &chars[i] | |||||
m[c.ccc] = 0 | |||||
} | |||||
cccs := []int{} | |||||
for v, _ := range m { | |||||
cccs = append(cccs, int(v)) | |||||
} | |||||
sort.Ints(cccs) | |||||
for i, c := range cccs { | |||||
cccMap[uint8(i)] = uint8(c) | |||||
m[uint8(c)] = uint8(i) | |||||
} | |||||
for i := range chars { | |||||
c := &chars[i] | |||||
c.origCCC = c.ccc | |||||
c.ccc = m[c.ccc] | |||||
} | |||||
if len(m) >= 1<<6 { | |||||
log.Fatalf("too many distinct CCC values: %d >= 64", len(m)) |
} | |||||
} | |||||
// CompositionExclusions.txt has form: | |||||
// 0958 # ... | |||||
// See http://unicode.org/reports/tr44/ for full explanation | |||||
func loadCompositionExclusions() { | |||||
f := gen.OpenUCDFile("CompositionExclusions.txt") | |||||
defer f.Close() | |||||
p := ucd.New(f) | |||||
for p.Next() { | |||||
c := &chars[p.Rune(0)] | |||||
if c.excludeInComp { | |||||
log.Fatalf("%U: Duplicate entry in exclusions.", c.codePoint) | |||||
} | |||||
c.excludeInComp = true | |||||
} | |||||
if e := p.Err(); e != nil { | |||||
log.Fatal(e) | |||||
} | |||||
} | |||||
// hasCompatDecomp returns true if any of the recursive | |||||
// decompositions contains a compatibility expansion. | |||||
// In this case, the character may not occur in NFK*. | |||||
func hasCompatDecomp(r rune) bool { | |||||
c := &chars[r] | |||||
if c.compatDecomp { | |||||
return true | |||||
} | |||||
for _, d := range c.forms[FCompatibility].decomp { | |||||
if hasCompatDecomp(d) { | |||||
return true | |||||
} | |||||
} | |||||
return false | |||||
} | |||||
// Hangul related constants. | |||||
const ( | |||||
HangulBase = 0xAC00 | |||||
HangulEnd = 0xD7A4 // hangulBase + Jamo combinations (19 * 21 * 28) | |||||
JamoLBase = 0x1100 | |||||
JamoLEnd = 0x1113 | |||||
JamoVBase = 0x1161 | |||||
JamoVEnd = 0x1176 | |||||
JamoTBase = 0x11A8 | |||||
JamoTEnd = 0x11C3 | |||||
JamoLVTCount = 19 * 21 * 28 | |||||
JamoTCount = 28 | |||||
) | |||||
func isHangul(r rune) bool { | |||||
return HangulBase <= r && r < HangulEnd | |||||
} | |||||
func isHangulWithoutJamoT(r rune) bool { | |||||
if !isHangul(r) { | |||||
return false | |||||
} | |||||
r -= HangulBase | |||||
return r < JamoLVTCount && r%JamoTCount == 0 | |||||
} | |||||
func ccc(r rune) uint8 { | |||||
return chars[r].ccc | |||||
} | |||||
// Insert a rune in a buffer, ordered by Canonical Combining Class. | |||||
func insertOrdered(b Decomposition, r rune) Decomposition { | |||||
n := len(b) | |||||
b = append(b, 0) | |||||
cc := ccc(r) | |||||
if cc > 0 { | |||||
// Use bubble sort. | |||||
for ; n > 0; n-- { | |||||
if ccc(b[n-1]) <= cc { | |||||
break | |||||
} | |||||
b[n] = b[n-1] | |||||
} | |||||
} | |||||
b[n] = r | |||||
return b | |||||
} | |||||
// Recursively decompose. | |||||
func decomposeRecursive(form int, r rune, d Decomposition) Decomposition { | |||||
dcomp := chars[r].forms[form].decomp | |||||
if len(dcomp) == 0 { | |||||
return insertOrdered(d, r) | |||||
} | |||||
for _, c := range dcomp { | |||||
d = decomposeRecursive(form, c, d) | |||||
} | |||||
return d | |||||
} | |||||
func completeCharFields(form int) { | |||||
// Phase 0: pre-expand decomposition. | |||||
for i := range chars { | |||||
f := &chars[i].forms[form] | |||||
if len(f.decomp) == 0 { | |||||
continue | |||||
} | |||||
exp := make(Decomposition, 0) | |||||
for _, c := range f.decomp { | |||||
exp = decomposeRecursive(form, c, exp) | |||||
} | |||||
f.expandedDecomp = exp | |||||
} | |||||
// Phase 1: composition exclusion, mark decomposition. | |||||
for i := range chars { | |||||
c := &chars[i] | |||||
f := &c.forms[form] | |||||
// Marks script-specific exclusions and version restricted. | |||||
f.isOneWay = c.excludeInComp | |||||
// Singletons | |||||
f.isOneWay = f.isOneWay || len(f.decomp) == 1 | |||||
// Non-starter decompositions | |||||
if len(f.decomp) > 1 { | |||||
chk := c.ccc != 0 || chars[f.decomp[0]].ccc != 0 | |||||
f.isOneWay = f.isOneWay || chk | |||||
} | |||||
// Runes that decompose into more than two runes. | |||||
f.isOneWay = f.isOneWay || len(f.decomp) > 2 | |||||
if form == FCompatibility { | |||||
f.isOneWay = f.isOneWay || hasCompatDecomp(c.codePoint) | |||||
} | |||||
for _, r := range f.decomp { | |||||
chars[r].forms[form].inDecomp = true | |||||
} | |||||
} | |||||
// Phase 2: forward and backward combining. | |||||
for i := range chars { | |||||
c := &chars[i] | |||||
f := &c.forms[form] | |||||
if !f.isOneWay && len(f.decomp) == 2 { | |||||
f0 := &chars[f.decomp[0]].forms[form] | |||||
f1 := &chars[f.decomp[1]].forms[form] | |||||
if !f0.isOneWay { | |||||
f0.combinesForward = true | |||||
} | |||||
if !f1.isOneWay { | |||||
f1.combinesBackward = true | |||||
} | |||||
} | |||||
if isHangulWithoutJamoT(rune(i)) { | |||||
f.combinesForward = true | |||||
} | |||||
} | |||||
// Phase 3: quick check values. | |||||
for i := range chars { | |||||
c := &chars[i] | |||||
f := &c.forms[form] | |||||
switch { | |||||
case len(f.decomp) > 0: | |||||
f.quickCheck[MDecomposed] = QCNo | |||||
case isHangul(rune(i)): | |||||
f.quickCheck[MDecomposed] = QCNo | |||||
default: | |||||
f.quickCheck[MDecomposed] = QCYes | |||||
} | |||||
switch { | |||||
case f.isOneWay: | |||||
f.quickCheck[MComposed] = QCNo | |||||
case (i & 0xffff00) == JamoLBase: | |||||
f.quickCheck[MComposed] = QCYes | |||||
if JamoLBase <= i && i < JamoLEnd { | |||||
f.combinesForward = true | |||||
} | |||||
if JamoVBase <= i && i < JamoVEnd { | |||||
f.quickCheck[MComposed] = QCMaybe | |||||
f.combinesBackward = true | |||||
f.combinesForward = true | |||||
} | |||||
if JamoTBase <= i && i < JamoTEnd { | |||||
f.quickCheck[MComposed] = QCMaybe | |||||
f.combinesBackward = true | |||||
} | |||||
case !f.combinesBackward: | |||||
f.quickCheck[MComposed] = QCYes | |||||
default: | |||||
f.quickCheck[MComposed] = QCMaybe | |||||
} | |||||
} | |||||
} | |||||
func computeNonStarterCounts() { | |||||
// Phase 4: leading and trailing non-starter count | |||||
for i := range chars { | |||||
c := &chars[i] | |||||
runes := []rune{rune(i)} | |||||
// We always use FCompatibility so that the CGJ insertion points do not | |||||
// change for repeated normalizations with different forms. | |||||
if exp := c.forms[FCompatibility].expandedDecomp; len(exp) > 0 { | |||||
runes = exp | |||||
} | |||||
// We consider runes that combine backwards to be non-starters for the | |||||
// purpose of Stream-Safe Text Processing. | |||||
for _, r := range runes { | |||||
if cr := &chars[r]; cr.ccc == 0 && !cr.forms[FCompatibility].combinesBackward { | |||||
break | |||||
} | |||||
c.nLeadingNonStarters++ | |||||
} | |||||
for i := len(runes) - 1; i >= 0; i-- { | |||||
if cr := &chars[runes[i]]; cr.ccc == 0 && !cr.forms[FCompatibility].combinesBackward { | |||||
break | |||||
} | |||||
c.nTrailingNonStarters++ | |||||
} | |||||
if c.nTrailingNonStarters > 3 { | |||||
log.Fatalf("%U: Decomposition with more than 3 (%d) trailing modifiers (%U)", i, c.nTrailingNonStarters, runes) | |||||
} | |||||
if isHangul(rune(i)) { | |||||
c.nTrailingNonStarters = 2 | |||||
if isHangulWithoutJamoT(rune(i)) { | |||||
c.nTrailingNonStarters = 1 | |||||
} | |||||
} | |||||
if l, t := c.nLeadingNonStarters, c.nTrailingNonStarters; l > 0 && l != t { | |||||
log.Fatalf("%U: number of leading and trailing non-starters should be equal (%d vs %d)", i, l, t) | |||||
} | |||||
if t := c.nTrailingNonStarters; t > 3 { | |||||
log.Fatalf("%U: number of trailing non-starters is %d > 3", i, t) |
} | |||||
} | |||||
} | |||||
func printBytes(w io.Writer, b []byte, name string) { | |||||
fmt.Fprintf(w, "// %s: %d bytes\n", name, len(b)) | |||||
fmt.Fprintf(w, "var %s = [...]byte {", name) | |||||
for i, c := range b { | |||||
switch { | |||||
case i%64 == 0: | |||||
fmt.Fprintf(w, "\n// Bytes %x - %x\n", i, i+63) | |||||
case i%8 == 0: | |||||
fmt.Fprintf(w, "\n") | |||||
} | |||||
fmt.Fprintf(w, "0x%.2X, ", c) | |||||
} | |||||
fmt.Fprint(w, "\n}\n\n") | |||||
} | |||||
// See forminfo.go for format. | |||||
func makeEntry(f *FormInfo, c *Char) uint16 { | |||||
e := uint16(0) | |||||
if r := c.codePoint; HangulBase <= r && r < HangulEnd { | |||||
e |= 0x40 | |||||
} | |||||
if f.combinesForward { | |||||
e |= 0x20 | |||||
} | |||||
if f.quickCheck[MDecomposed] == QCNo { | |||||
e |= 0x4 | |||||
} | |||||
switch f.quickCheck[MComposed] { | |||||
case QCYes: | |||||
case QCNo: | |||||
e |= 0x10 | |||||
case QCMaybe: | |||||
e |= 0x18 | |||||
default: | |||||
log.Fatalf("Illegal quickcheck value %v.", f.quickCheck[MComposed]) | |||||
} | |||||
e |= uint16(c.nTrailingNonStarters) | |||||
return e | |||||
} | |||||
// decompSet keeps track of unique decompositions, grouped by whether | |||||
// the decomposition is followed by a trailing and/or leading CCC. | |||||
type decompSet [7]map[string]bool | |||||
const ( | |||||
normalDecomp = iota | |||||
firstMulti | |||||
firstCCC | |||||
endMulti | |||||
firstLeadingCCC | |||||
firstCCCZeroExcept | |||||
firstStarterWithNLead | |||||
lastDecomp | |||||
) | |||||
var cname = []string{"firstMulti", "firstCCC", "endMulti", "firstLeadingCCC", "firstCCCZeroExcept", "firstStarterWithNLead", "lastDecomp"} | |||||
func makeDecompSet() decompSet { | |||||
m := decompSet{} | |||||
for i := range m { | |||||
m[i] = make(map[string]bool) | |||||
} | |||||
return m | |||||
} | |||||
func (m *decompSet) insert(key int, s string) { | |||||
m[key][s] = true | |||||
} | |||||
func printCharInfoTables(w io.Writer) int { | |||||
mkstr := func(r rune, f *FormInfo) (int, string) { | |||||
d := f.expandedDecomp | |||||
s := string([]rune(d)) | |||||
if max := 1 << 6; len(s) >= max { | |||||
const msg = "%U: too many bytes in decomposition: %d >= %d" | |||||
log.Fatalf(msg, r, len(s), max) | |||||
} | |||||
head := uint8(len(s)) | |||||
if f.quickCheck[MComposed] != QCYes { | |||||
head |= 0x40 | |||||
} | |||||
if f.combinesForward { | |||||
head |= 0x80 | |||||
} | |||||
s = string([]byte{head}) + s | |||||
lccc := ccc(d[0]) | |||||
tccc := ccc(d[len(d)-1]) | |||||
cc := ccc(r) | |||||
if cc != 0 && lccc == 0 && tccc == 0 { | |||||
log.Fatalf("%U: trailing and leading ccc are 0 for non-zero ccc %d", r, cc) | |||||
} | |||||
if tccc < lccc && lccc != 0 { | |||||
const msg = "%U: lccc (%d) must be <= tcc (%d)" | |||||
log.Fatalf(msg, r, lccc, tccc) | |||||
} | |||||
index := normalDecomp | |||||
nTrail := chars[r].nTrailingNonStarters | |||||
nLead := chars[r].nLeadingNonStarters | |||||
if tccc > 0 || lccc > 0 || nTrail > 0 { | |||||
tccc <<= 2 | |||||
tccc |= nTrail | |||||
s += string([]byte{tccc}) | |||||
index = endMulti | |||||
for _, r := range d[1:] { | |||||
if ccc(r) == 0 { | |||||
index = firstCCC | |||||
} | |||||
} | |||||
if lccc > 0 || nLead > 0 { | |||||
s += string([]byte{lccc}) | |||||
if index == firstCCC { | |||||
log.Fatalf("%U: multi-segment decomposition not supported for decompositions with leading CCC != 0", r) | |||||
} | |||||
index = firstLeadingCCC | |||||
} | |||||
if cc != lccc { | |||||
if cc != 0 { | |||||
log.Fatalf("%U: for lccc != ccc, expected ccc to be 0; was %d", r, cc) | |||||
} | |||||
index = firstCCCZeroExcept | |||||
} | |||||
} else if len(d) > 1 { | |||||
index = firstMulti | |||||
} | |||||
return index, s | |||||
} | |||||
decompSet := makeDecompSet() | |||||
const nLeadStr = "\x00\x01" // 0-byte length and tccc with nTrail. | |||||
decompSet.insert(firstStarterWithNLead, nLeadStr) | |||||
// Store the uniqued decompositions in a byte buffer, | |||||
// preceded by their byte length. | |||||
for _, c := range chars { | |||||
for _, f := range c.forms { | |||||
if len(f.expandedDecomp) == 0 { | |||||
continue | |||||
} | |||||
if f.combinesBackward { | |||||
log.Fatalf("%U: combinesBackward and decompose", c.codePoint) | |||||
} | |||||
index, s := mkstr(c.codePoint, &f) | |||||
decompSet.insert(index, s) | |||||
} | |||||
} | |||||
decompositions := bytes.NewBuffer(make([]byte, 0, 10000)) | |||||
size := 0 | |||||
positionMap := make(map[string]uint16) | |||||
decompositions.WriteString("\000") | |||||
fmt.Fprintln(w, "const (") | |||||
for i, m := range decompSet { | |||||
sa := []string{} | |||||
for s := range m { | |||||
sa = append(sa, s) | |||||
} | |||||
sort.Strings(sa) | |||||
for _, s := range sa { | |||||
p := decompositions.Len() | |||||
decompositions.WriteString(s) | |||||
positionMap[s] = uint16(p) | |||||
} | |||||
if cname[i] != "" { | |||||
fmt.Fprintf(w, "%s = 0x%X\n", cname[i], decompositions.Len()) | |||||
} | |||||
} | |||||
fmt.Fprintln(w, "maxDecomp = 0x8000") | |||||
fmt.Fprintln(w, ")") | |||||
b := decompositions.Bytes() | |||||
printBytes(w, b, "decomps") | |||||
size += len(b) | |||||
varnames := []string{"nfc", "nfkc"} | |||||
for i := 0; i < FNumberOfFormTypes; i++ { | |||||
trie := triegen.NewTrie(varnames[i]) | |||||
for r, c := range chars { | |||||
f := c.forms[i] | |||||
d := f.expandedDecomp | |||||
if len(d) != 0 { | |||||
_, key := mkstr(c.codePoint, &f) | |||||
trie.Insert(rune(r), uint64(positionMap[key])) | |||||
if c.ccc != ccc(d[0]) { | |||||
// We assume the lead ccc of a decomposition !=0 in this case. | |||||
if ccc(d[0]) == 0 { | |||||
log.Fatalf("Expected leading CCC to be non-zero; ccc is %d", c.ccc) | |||||
} | |||||
} | |||||
} else if c.nLeadingNonStarters > 0 && len(f.expandedDecomp) == 0 && c.ccc == 0 && !f.combinesBackward { | |||||
// Handle cases where it can't be detected that the nLead should be equal | |||||
// to nTrail. | |||||
trie.Insert(c.codePoint, uint64(positionMap[nLeadStr])) | |||||
} else if v := makeEntry(&f, &c)<<8 | uint16(c.ccc); v != 0 { | |||||
trie.Insert(c.codePoint, uint64(0x8000|v)) | |||||
} | |||||
} | |||||
sz, err := trie.Gen(w, triegen.Compact(&normCompacter{name: varnames[i]})) | |||||
if err != nil { | |||||
log.Fatal(err) | |||||
} | |||||
size += sz | |||||
} | |||||
return size | |||||
} | |||||
func contains(sa []string, s string) bool { | |||||
for _, a := range sa { | |||||
if a == s { | |||||
return true | |||||
} | |||||
} | |||||
return false | |||||
} | |||||
func makeTables() { | |||||
w := &bytes.Buffer{} | |||||
size := 0 | |||||
if *tablelist == "" { | |||||
return | |||||
} | |||||
list := strings.Split(*tablelist, ",") | |||||
if *tablelist == "all" { | |||||
list = []string{"recomp", "info"} | |||||
} | |||||
// Compute maximum decomposition size. | |||||
max := 0 | |||||
for _, c := range chars { | |||||
if n := len(string(c.forms[FCompatibility].expandedDecomp)); n > max { | |||||
max = n | |||||
} | |||||
} | |||||
fmt.Fprintln(w, "const (") | |||||
fmt.Fprintln(w, "\t// Version is the Unicode edition from which the tables are derived.") | |||||
fmt.Fprintf(w, "\tVersion = %q\n", gen.UnicodeVersion()) | |||||
fmt.Fprintln(w) | |||||
fmt.Fprintln(w, "\t// MaxTransformChunkSize indicates the maximum number of bytes that Transform") | |||||
fmt.Fprintln(w, "\t// may need to write atomically for any Form. Making a destination buffer at") | |||||
fmt.Fprintln(w, "\t// least this size ensures that Transform can always make progress and that") | |||||
fmt.Fprintln(w, "\t// the user does not need to grow the buffer on an ErrShortDst.") | |||||
fmt.Fprintf(w, "\tMaxTransformChunkSize = %d+maxNonStarters*4\n", len(string(0x034F))+max) | |||||
fmt.Fprintln(w, ")\n") | |||||
// Print the CCC remap table. | |||||
size += len(cccMap) | |||||
fmt.Fprintf(w, "var ccc = [%d]uint8{", len(cccMap)) | |||||
for i := 0; i < len(cccMap); i++ { | |||||
if i%8 == 0 { | |||||
fmt.Fprintln(w) | |||||
} | |||||
fmt.Fprintf(w, "%3d, ", cccMap[uint8(i)]) | |||||
} | |||||
fmt.Fprintln(w, "\n}\n") | |||||
if contains(list, "info") { | |||||
size += printCharInfoTables(w) | |||||
} | |||||
if contains(list, "recomp") { | |||||
// Note that we use 32 bit keys, instead of 64 bit. | |||||
// This clips the bits of three entries, but we know | |||||
// this won't cause a collision. The compiler will catch | |||||
// any changes made to UnicodeData.txt that introduces | |||||
// a collision. | |||||
// Note that the recomposition map for NFC and NFKC | |||||
// are identical. | |||||
// Recomposition map | |||||
nrentries := 0 | |||||
for _, c := range chars { | |||||
f := c.forms[FCanonical] | |||||
if !f.isOneWay && len(f.decomp) > 0 { | |||||
nrentries++ | |||||
} | |||||
} | |||||
sz := nrentries * 8 | |||||
size += sz | |||||
fmt.Fprintf(w, "// recompMap: %d bytes (entries only)\n", sz) | |||||
fmt.Fprintln(w, "var recompMap = map[uint32]rune{") | |||||
for i, c := range chars { | |||||
f := c.forms[FCanonical] | |||||
d := f.decomp | |||||
if !f.isOneWay && len(d) > 0 { | |||||
key := uint32(uint16(d[0]))<<16 + uint32(uint16(d[1])) | |||||
fmt.Fprintf(w, "0x%.8X: 0x%.4X,\n", key, i) | |||||
} | |||||
} | |||||
fmt.Fprintf(w, "}\n\n") | |||||
} | |||||
fmt.Fprintf(w, "// Total size of tables: %dKB (%d bytes)\n", (size+512)/1024, size) | |||||
gen.WriteGoFile("tables.go", "norm", w.Bytes()) | |||||
} | |||||
func printChars() { | |||||
if *verbose { | |||||
for _, c := range chars { | |||||
if !c.isValid() || c.state == SMissing { | |||||
continue | |||||
} | |||||
fmt.Println(c) | |||||
} | |||||
} | |||||
} | |||||
// verifyComputed does various consistency tests. | |||||
func verifyComputed() { | |||||
for i, c := range chars { | |||||
for _, f := range c.forms { | |||||
isNo := (f.quickCheck[MDecomposed] == QCNo) | |||||
if (len(f.decomp) > 0) != isNo && !isHangul(rune(i)) { | |||||
log.Fatalf("%U: NF*D QC must be No if rune decomposes", i) | |||||
} | |||||
isMaybe := f.quickCheck[MComposed] == QCMaybe | |||||
if f.combinesBackward != isMaybe { | |||||
log.Fatalf("%U: NF*C QC must be Maybe if combinesBackward", i) | |||||
} | |||||
if len(f.decomp) > 0 && f.combinesForward && isMaybe { | |||||
log.Fatalf("%U: NF*C QC must be Yes or No if combinesForward and decomposes", i) | |||||
} | |||||
if len(f.expandedDecomp) != 0 { | |||||
continue | |||||
} | |||||
if a, b := c.nLeadingNonStarters > 0, (c.ccc > 0 || f.combinesBackward); a != b { | |||||
// We accept these runes to be treated differently (it only affects | |||||
// segment breaking in iteration, most likely on improper use), but | |||||
// reconsider if more characters are added. | |||||
// U+FF9E HALFWIDTH KATAKANA VOICED SOUND MARK;Lm;0;L;<narrow> 3099;;;;N;;;;; | |||||
// U+FF9F HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK;Lm;0;L;<narrow> 309A;;;;N;;;;; | |||||
// U+3133 HANGUL LETTER KIYEOK-SIOS;Lo;0;L;<compat> 11AA;;;;N;HANGUL LETTER GIYEOG SIOS;;;; | |||||
// U+318E HANGUL LETTER ARAEAE;Lo;0;L;<compat> 11A1;;;;N;HANGUL LETTER ALAE AE;;;; | |||||
// U+FFA3 HALFWIDTH HANGUL LETTER KIYEOK-SIOS;Lo;0;L;<narrow> 3133;;;;N;HALFWIDTH HANGUL LETTER GIYEOG SIOS;;;; | |||||
// U+FFDC HALFWIDTH HANGUL LETTER I;Lo;0;L;<narrow> 3163;;;;N;;;;; | |||||
if i != 0xFF9E && i != 0xFF9F && !(0x3133 <= i && i <= 0x318E) && !(0xFFA3 <= i && i <= 0xFFDC) { | |||||
log.Fatalf("%U: nLead was %v; want %v", i, a, b) | |||||
} | |||||
} | |||||
} | |||||
nfc := c.forms[FCanonical] | |||||
nfkc := c.forms[FCompatibility] | |||||
if nfc.combinesBackward != nfkc.combinesBackward { | |||||
log.Fatalf("%U: Cannot combine combinesBackward\n", c.codePoint) | |||||
} | |||||
} | |||||
} | |||||
// Use values in DerivedNormalizationProps.txt to compare against the | |||||
// values we computed. | |||||
// DerivedNormalizationProps.txt has form: | |||||
// 00C0..00C5 ; NFD_QC; N # ... | |||||
// 0374 ; NFD_QC; N # ... | |||||
// See http://unicode.org/reports/tr44/ for full explanation | |||||
func testDerived() { | |||||
f := gen.OpenUCDFile("DerivedNormalizationProps.txt") | |||||
defer f.Close() | |||||
p := ucd.New(f) | |||||
for p.Next() { | |||||
r := p.Rune(0) | |||||
c := &chars[r] | |||||
var ftype, mode int | |||||
qt := p.String(1) | |||||
switch qt { | |||||
case "NFC_QC": | |||||
ftype, mode = FCanonical, MComposed | |||||
case "NFD_QC": | |||||
ftype, mode = FCanonical, MDecomposed | |||||
case "NFKC_QC": | |||||
ftype, mode = FCompatibility, MComposed | |||||
case "NFKD_QC": | |||||
ftype, mode = FCompatibility, MDecomposed | |||||
default: | |||||
continue | |||||
} | |||||
var qr QCResult | |||||
switch p.String(2) { | |||||
case "Y": | |||||
qr = QCYes | |||||
case "N": | |||||
qr = QCNo | |||||
case "M": | |||||
qr = QCMaybe | |||||
default: | |||||
log.Fatalf(`Unexpected quick check value "%s"`, p.String(2)) | |||||
} | |||||
if got := c.forms[ftype].quickCheck[mode]; got != qr { | |||||
log.Printf("%U: FAILED %s (was %v need %v)\n", r, qt, got, qr) | |||||
} | |||||
c.forms[ftype].verified[mode] = true | |||||
} | |||||
if err := p.Err(); err != nil { | |||||
log.Fatal(err) | |||||
} | |||||
// Any unspecified value must be QCYes. Verify this. | |||||
for i, c := range chars { | |||||
for j, fd := range c.forms { | |||||
for k, qr := range fd.quickCheck { | |||||
if !fd.verified[k] && qr != QCYes { | |||||
m := "%U: FAIL F:%d M:%d (was %v need Yes) %s\n" | |||||
log.Printf(m, i, j, k, qr, c.name) | |||||
} | |||||
} | |||||
} | |||||
} | |||||
} | |||||
var testHeader = `const ( | |||||
Yes = iota | |||||
No | |||||
Maybe | |||||
) | |||||
type formData struct { | |||||
qc uint8 | |||||
combinesForward bool | |||||
decomposition string | |||||
} | |||||
type runeData struct { | |||||
r rune | |||||
ccc uint8 | |||||
nLead uint8 | |||||
nTrail uint8 | |||||
f [2]formData // 0: canonical; 1: compatibility | |||||
} | |||||
func f(qc uint8, cf bool, dec string) [2]formData { | |||||
return [2]formData{{qc, cf, dec}, {qc, cf, dec}} | |||||
} | |||||
func g(qc, qck uint8, cf, cfk bool, d, dk string) [2]formData { | |||||
return [2]formData{{qc, cf, d}, {qck, cfk, dk}} | |||||
} | |||||
var testData = []runeData{ | |||||
` | |||||
func printTestdata() { | |||||
type lastInfo struct { | |||||
ccc uint8 | |||||
nLead uint8 | |||||
nTrail uint8 | |||||
f string | |||||
} | |||||
last := lastInfo{} | |||||
w := &bytes.Buffer{} | |||||
fmt.Fprintf(w, testHeader) | |||||
for r, c := range chars { | |||||
f := c.forms[FCanonical] | |||||
qc, cf, d := f.quickCheck[MComposed], f.combinesForward, string(f.expandedDecomp) | |||||
f = c.forms[FCompatibility] | |||||
qck, cfk, dk := f.quickCheck[MComposed], f.combinesForward, string(f.expandedDecomp) | |||||
s := "" | |||||
if d == dk && qc == qck && cf == cfk { | |||||
s = fmt.Sprintf("f(%s, %v, %q)", qc, cf, d) | |||||
} else { | |||||
s = fmt.Sprintf("g(%s, %s, %v, %v, %q, %q)", qc, qck, cf, cfk, d, dk) | |||||
} | |||||
current := lastInfo{c.ccc, c.nLeadingNonStarters, c.nTrailingNonStarters, s} | |||||
if last != current { | |||||
fmt.Fprintf(w, "\t{0x%x, %d, %d, %d, %s},\n", r, c.origCCC, c.nLeadingNonStarters, c.nTrailingNonStarters, s) | |||||
last = current | |||||
} | |||||
} | |||||
fmt.Fprintln(w, "}") | |||||
gen.WriteGoFile("data_test.go", "norm", w.Bytes()) | |||||
} |
@@ -1,117 +0,0 @@ | |||||
// Copyright 2011 The Go Authors. All rights reserved. | |||||
// Use of this source code is governed by a BSD-style | |||||
// license that can be found in the LICENSE file. | |||||

// +build ignore

// Trie table generator. | |||||
// Used by make*tables tools to generate a Go file with trie data structures
// for mapping UTF-8 to a 16-bit value. All but the last byte in a UTF-8 byte
// sequence are used to look up offsets in the index table to be used for the
// next byte. The last byte is used to index into a table with 16-bit values. | |||||
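// For example (illustrative): for the three-byte sequence E2 82 AC (U+20AC),
// E2 and 82 are used to walk the index table, and the final byte AC selects
// the 16-bit value within the block reached for 82.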
package main | |||||
import ( | |||||
"fmt" | |||||
"io" | |||||
) | |||||
const maxSparseEntries = 16 | |||||
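
// normCompacter stores trie value blocks sparsely when they contain at most
// maxSparseEntries entries that break the block's dominant stride: each such
// block becomes a short list of valueRanges plus an entry in an offset table.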
type normCompacter struct { | |||||
sparseBlocks [][]uint64 | |||||
sparseOffset []uint16 | |||||
sparseCount int | |||||
name string | |||||
} | |||||
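
// mostFrequentStride returns the difference that occurs most often between
// successive values of a, ignoring negative differences and differences that
// follow a zero value. Ties are broken in favor of the smaller stride.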
func mostFrequentStride(a []uint64) int { | |||||
counts := make(map[int]int) | |||||
var v int | |||||
for _, x := range a { | |||||
if stride := int(x) - v; v != 0 && stride >= 0 { | |||||
counts[stride]++ | |||||
} | |||||
v = int(x) | |||||
} | |||||
var maxs, maxc int | |||||
for stride, cnt := range counts { | |||||
if cnt > maxc || (cnt == maxc && stride < maxs) { | |||||
maxs, maxc = stride, cnt | |||||
} | |||||
} | |||||
return maxs | |||||
} | |||||
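
// countSparseEntries returns the number of non-zero entries in a that do not
// continue the most frequent stride, which is the number of valueRanges needed
// to store a sparsely.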
func countSparseEntries(a []uint64) int { | |||||
stride := mostFrequentStride(a) | |||||
var v, count int | |||||
for _, tv := range a { | |||||
if int(tv)-v != stride { | |||||
if tv != 0 { | |||||
count++ | |||||
} | |||||
} | |||||
v = int(tv) | |||||
} | |||||
return count | |||||
} | |||||
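
// Size reports whether block v can be stored sparsely and, if so, its encoded
// size: n+1 four-byte valueRange entries (a header plus n ranges) and a
// two-byte offset table entry.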
func (c *normCompacter) Size(v []uint64) (sz int, ok bool) { | |||||
if n := countSparseEntries(v); n <= maxSparseEntries { | |||||
return (n+1)*4 + 2, true | |||||
} | |||||
return 0, false | |||||
} | |||||
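
// Store queues block v for output by Print and returns its handle, the index
// of the block's entry in the sparse offset table.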
func (c *normCompacter) Store(v []uint64) uint32 { | |||||
h := uint32(len(c.sparseOffset)) | |||||
c.sparseBlocks = append(c.sparseBlocks, v) | |||||
c.sparseOffset = append(c.sparseOffset, uint16(c.sparseCount)) | |||||
c.sparseCount += countSparseEntries(v) + 1 | |||||
return h | |||||
} | |||||
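
// Handler returns the name of the lookup function the generated trie uses for
// sparse blocks.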
func (c *normCompacter) Handler() string { | |||||
return c.name + "Sparse.lookup" | |||||
} | |||||
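
// Print writes the sparse offset and value tables as Go source. Each block is
// emitted as a header valueRange holding the stride and the number of ranges,
// followed by one valueRange per run of entries, where a run continues as long
// as successive values differ by the stride.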
func (c *normCompacter) Print(w io.Writer) (retErr error) { | |||||
p := func(f string, x ...interface{}) { | |||||
if _, err := fmt.Fprintf(w, f, x...); retErr == nil && err != nil { | |||||
retErr = err | |||||
} | |||||
} | |||||
ls := len(c.sparseBlocks) | |||||
p("// %sSparseOffset: %d entries, %d bytes\n", c.name, ls, ls*2) | |||||
p("var %sSparseOffset = %#v\n\n", c.name, c.sparseOffset) | |||||
ns := c.sparseCount | |||||
p("// %sSparseValues: %d entries, %d bytes\n", c.name, ns, ns*4) | |||||
p("var %sSparseValues = [%d]valueRange {", c.name, ns) | |||||
for i, b := range c.sparseBlocks { | |||||
p("\n// Block %#x, offset %#x", i, c.sparseOffset[i]) | |||||
var v int | |||||
stride := mostFrequentStride(b) | |||||
n := countSparseEntries(b) | |||||
p("\n{value:%#04x,lo:%#02x},", stride, uint8(n)) | |||||
for i, nv := range b { | |||||
if int(nv)-v != stride { | |||||
if v != 0 { | |||||
p(",hi:%#02x},", 0x80+i-1) | |||||
} | |||||
if nv != 0 { | |||||
p("\n{value:%#04x,lo:%#02x", nv, 0x80+i) | |||||
} | |||||
} | |||||
v = int(nv) | |||||
} | |||||
if v != 0 { | |||||
p(",hi:%#02x},", 0x80+len(b)-1) | |||||
} | |||||
} | |||||
p("\n}\n\n") | |||||
return | |||||
} |
@@ -1498,95 +1498,95 @@ | |||||
{ | { | ||||
"checksumSHA1": "Mr4ur60bgQJnQFfJY0dGtwWwMPE=", | "checksumSHA1": "Mr4ur60bgQJnQFfJY0dGtwWwMPE=", | ||||
"path": "golang.org/x/text/encoding", | "path": "golang.org/x/text/encoding", | ||||
"revision": "a8b38433e35b65ba247bb267317037dee1b70cea", | |||||
"revisionTime": "2016-10-19T13:35:53Z" | |||||
"revision": "2bf8f2a19ec09c670e931282edfe6567f6be21c9", | |||||
"revisionTime": "2017-06-27T21:03:49Z" | |||||
}, | }, | ||||
{ | { | ||||
"checksumSHA1": "gJG/5S8KrCA1QGkIkpa5a/wnmy4=", | |||||
"checksumSHA1": "DSdlK4MKI/a3U8Zaee2XKBe01Fo=", | |||||
"path": "golang.org/x/text/encoding/charmap", | "path": "golang.org/x/text/encoding/charmap", | ||||
"revision": "a8b38433e35b65ba247bb267317037dee1b70cea", | |||||
"revisionTime": "2016-10-19T13:35:53Z" | |||||
"revision": "2bf8f2a19ec09c670e931282edfe6567f6be21c9", | |||||
"revisionTime": "2017-06-27T21:03:49Z" | |||||
}, | }, | ||||
{ | { | ||||
"checksumSHA1": "mI8YM2LehMxYDcauq5loMZr1pP8=", | |||||
"checksumSHA1": "z7tgTCQT62mHxtNMi/AXui/FAfQ=", | |||||
"path": "golang.org/x/text/encoding/htmlindex", | "path": "golang.org/x/text/encoding/htmlindex", | ||||
"revision": "a8b38433e35b65ba247bb267317037dee1b70cea", | |||||
"revisionTime": "2016-10-19T13:35:53Z" | |||||
"revision": "2bf8f2a19ec09c670e931282edfe6567f6be21c9", | |||||
"revisionTime": "2017-06-27T21:03:49Z" | |||||
}, | }, | ||||
{ | { | ||||
"checksumSHA1": "zeHyHebIZl1tGuwGllIhjfci+wI=", | "checksumSHA1": "zeHyHebIZl1tGuwGllIhjfci+wI=", | ||||
"path": "golang.org/x/text/encoding/internal", | "path": "golang.org/x/text/encoding/internal", | ||||
"revision": "a8b38433e35b65ba247bb267317037dee1b70cea", | |||||
"revisionTime": "2016-10-19T13:35:53Z" | |||||
"revision": "2bf8f2a19ec09c670e931282edfe6567f6be21c9", | |||||
"revisionTime": "2017-06-27T21:03:49Z" | |||||
}, | }, | ||||
{ | { | ||||
"checksumSHA1": "TF4hoIqHVEAvOq67rfnSLSkcZ1Y=", | |||||
"checksumSHA1": "7kYqxy64WhMjFIFZgN7tJ3lbKxM=", | |||||
"path": "golang.org/x/text/encoding/internal/identifier", | "path": "golang.org/x/text/encoding/internal/identifier", | ||||
"revision": "a8b38433e35b65ba247bb267317037dee1b70cea", | |||||
"revisionTime": "2016-10-19T13:35:53Z" | |||||
"revision": "2bf8f2a19ec09c670e931282edfe6567f6be21c9", | |||||
"revisionTime": "2017-06-27T21:03:49Z" | |||||
}, | }, | ||||
{ | { | ||||
"checksumSHA1": "HeZV82ktrmgyAaYLtNFS0qYgspI=", | |||||
"checksumSHA1": "2YqVpmvjWGEBATyUphTP1MS34JE=", | |||||
"path": "golang.org/x/text/encoding/japanese", | "path": "golang.org/x/text/encoding/japanese", | ||||
"revision": "a8b38433e35b65ba247bb267317037dee1b70cea", | |||||
"revisionTime": "2016-10-19T13:35:53Z" | |||||
"revision": "2bf8f2a19ec09c670e931282edfe6567f6be21c9", | |||||
"revisionTime": "2017-06-27T21:03:49Z" | |||||
}, | }, | ||||
{ | { | ||||
"checksumSHA1": "8y87WJz3OkDWtPCIXxJcYpo+OY8=", | |||||
"checksumSHA1": "+ErWCAdaMwO4PLtrk9D/Hh+7oQM=", | |||||
"path": "golang.org/x/text/encoding/korean", | "path": "golang.org/x/text/encoding/korean", | ||||
"revision": "a8b38433e35b65ba247bb267317037dee1b70cea", | |||||
"revisionTime": "2016-10-19T13:35:53Z" | |||||
"revision": "2bf8f2a19ec09c670e931282edfe6567f6be21c9", | |||||
"revisionTime": "2017-06-27T21:03:49Z" | |||||
}, | }, | ||||
{ | { | ||||
"checksumSHA1": "WYfmebIyX5Zae8NUfu9PsQjQff0=", | |||||
"checksumSHA1": "mTuZi5urYwgDIO8+Gfql2pv8Vwg=", | |||||
"path": "golang.org/x/text/encoding/simplifiedchinese", | "path": "golang.org/x/text/encoding/simplifiedchinese", | ||||
"revision": "a8b38433e35b65ba247bb267317037dee1b70cea", | |||||
"revisionTime": "2016-10-19T13:35:53Z" | |||||
"revision": "2bf8f2a19ec09c670e931282edfe6567f6be21c9", | |||||
"revisionTime": "2017-06-27T21:03:49Z" | |||||
}, | }, | ||||
{ | { | ||||
"checksumSHA1": "KKqYmi6fxt3r3uo4lExss2yTMbs=", | |||||
"checksumSHA1": "D+VI4j0Wjzr8SeupWdOB5KBdFOw=", | |||||
"path": "golang.org/x/text/encoding/traditionalchinese", | "path": "golang.org/x/text/encoding/traditionalchinese", | ||||
"revision": "a8b38433e35b65ba247bb267317037dee1b70cea", | |||||
"revisionTime": "2016-10-19T13:35:53Z" | |||||
"revision": "2bf8f2a19ec09c670e931282edfe6567f6be21c9", | |||||
"revisionTime": "2017-06-27T21:03:49Z" | |||||
}, | }, | ||||
{ | { | ||||
"checksumSHA1": "G9LfJI9gySazd+MyyC6QbTHx4to=", | "checksumSHA1": "G9LfJI9gySazd+MyyC6QbTHx4to=", | ||||
"path": "golang.org/x/text/encoding/unicode", | "path": "golang.org/x/text/encoding/unicode", | ||||
"revision": "a8b38433e35b65ba247bb267317037dee1b70cea", | |||||
"revisionTime": "2016-10-19T13:35:53Z" | |||||
"revision": "2bf8f2a19ec09c670e931282edfe6567f6be21c9", | |||||
"revisionTime": "2017-06-27T21:03:49Z" | |||||
}, | }, | ||||
{ | { | ||||
"checksumSHA1": "hyNCcTwMQnV6/MK8uUW9E5H0J0M=", | "checksumSHA1": "hyNCcTwMQnV6/MK8uUW9E5H0J0M=", | ||||
"path": "golang.org/x/text/internal/tag", | "path": "golang.org/x/text/internal/tag", | ||||
"revision": "a8b38433e35b65ba247bb267317037dee1b70cea", | |||||
"revisionTime": "2016-10-19T13:35:53Z" | |||||
"revision": "2bf8f2a19ec09c670e931282edfe6567f6be21c9", | |||||
"revisionTime": "2017-06-27T21:03:49Z" | |||||
}, | }, | ||||
{ | { | ||||
"checksumSHA1": "Qk7dljcrEK1BJkAEZguxAbG9dSo=", | "checksumSHA1": "Qk7dljcrEK1BJkAEZguxAbG9dSo=", | ||||
"path": "golang.org/x/text/internal/utf8internal", | "path": "golang.org/x/text/internal/utf8internal", | ||||
"revision": "a8b38433e35b65ba247bb267317037dee1b70cea", | |||||
"revisionTime": "2016-10-19T13:35:53Z" | |||||
"revision": "2bf8f2a19ec09c670e931282edfe6567f6be21c9", | |||||
"revisionTime": "2017-06-27T21:03:49Z" | |||||
}, | }, | ||||
{ | { | ||||
"checksumSHA1": "bsNFI/kfmF0p43jLKiMYRqw9Dfs=", | |||||
"checksumSHA1": "SnP28TAvq7k08OmkHClZDYFEWww=", | |||||
"path": "golang.org/x/text/language", | "path": "golang.org/x/text/language", | ||||
"revision": "a8b38433e35b65ba247bb267317037dee1b70cea", | |||||
"revisionTime": "2016-10-19T13:35:53Z" | |||||
"revision": "2bf8f2a19ec09c670e931282edfe6567f6be21c9", | |||||
"revisionTime": "2017-06-27T21:03:49Z" | |||||
}, | }, | ||||
{ | { | ||||
"checksumSHA1": "IV4MN7KGBSocu/5NR3le3sxup4Y=", | "checksumSHA1": "IV4MN7KGBSocu/5NR3le3sxup4Y=", | ||||
"path": "golang.org/x/text/runes", | "path": "golang.org/x/text/runes", | ||||
"revision": "a8b38433e35b65ba247bb267317037dee1b70cea", | |||||
"revisionTime": "2016-10-19T13:35:53Z" | |||||
"revision": "2bf8f2a19ec09c670e931282edfe6567f6be21c9", | |||||
"revisionTime": "2017-06-27T21:03:49Z" | |||||
}, | }, | ||||
{ | { | ||||
"checksumSHA1": "ziMb9+ANGRJSSIuxYdRbA+cDRBQ=", | "checksumSHA1": "ziMb9+ANGRJSSIuxYdRbA+cDRBQ=", | ||||
"path": "golang.org/x/text/transform", | "path": "golang.org/x/text/transform", | ||||
"revision": "a8b38433e35b65ba247bb267317037dee1b70cea", | |||||
"revisionTime": "2016-10-19T13:35:53Z" | |||||
"revision": "2bf8f2a19ec09c670e931282edfe6567f6be21c9", | |||||
"revisionTime": "2017-06-27T21:03:49Z" | |||||
}, | }, | ||||
{ | { | ||||
"checksumSHA1": "Anof4bt0AU+Sa3R8Rq0KBnlpbaQ=", | |||||
"checksumSHA1": "kKylzIrLEnH8NKyeVAL0dq5gjVQ=", | |||||
"path": "golang.org/x/text/unicode/norm", | "path": "golang.org/x/text/unicode/norm", | ||||
"revision": "2bf8f2a19ec09c670e931282edfe6567f6be21c9", | "revision": "2bf8f2a19ec09c670e931282edfe6567f6be21c9", | ||||
"revisionTime": "2017-06-27T21:03:49Z" | "revisionTime": "2017-06-27T21:03:49Z" | ||||