308 lines
7.7 KiB
Go
308 lines
7.7 KiB
Go
|
// Copyright (c) 2014 Couchbase, Inc.
|
||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||
|
// either express or implied. See the License for the specific language governing permissions
|
||
|
// and limitations under the License.
|
||
|
|
||
|
// Modified by Martin Atkins to serve the needs of package textseg.
|
||
|
|
||
|
// +build ignore
|
||
|
|
||
|
package main
|
||
|
|
||
|
import (
|
||
|
"bufio"
|
||
|
"flag"
|
||
|
"fmt"
|
||
|
"io"
|
||
|
"log"
|
||
|
"net/http"
|
||
|
"os"
|
||
|
"os/exec"
|
||
|
"sort"
|
||
|
"strconv"
|
||
|
"strings"
|
||
|
"unicode"
|
||
|
)
|
||
|
|
||
|
var url = flag.String("url",
|
||
|
"http://www.unicode.org/Public/"+unicode.Version+"/ucd/auxiliary/",
|
||
|
"URL of Unicode database directory")
|
||
|
var verbose = flag.Bool("verbose",
|
||
|
false,
|
||
|
"write data to stdout as it is parsed")
|
||
|
var localFiles = flag.Bool("local",
|
||
|
false,
|
||
|
"data files have been copied to the current directory; for debugging only")
|
||
|
var outputFile = flag.String("output",
|
||
|
"",
|
||
|
"output file for generated tables; default stdout")
|
||
|
|
||
|
var output *bufio.Writer
|
||
|
|
||
|
func main() {
|
||
|
flag.Parse()
|
||
|
setupOutput()
|
||
|
|
||
|
graphemePropertyRanges := make(map[string]*unicode.RangeTable)
|
||
|
loadUnicodeData("GraphemeBreakProperty.txt", graphemePropertyRanges)
|
||
|
wordPropertyRanges := make(map[string]*unicode.RangeTable)
|
||
|
loadUnicodeData("WordBreakProperty.txt", wordPropertyRanges)
|
||
|
sentencePropertyRanges := make(map[string]*unicode.RangeTable)
|
||
|
loadUnicodeData("SentenceBreakProperty.txt", sentencePropertyRanges)
|
||
|
|
||
|
fmt.Fprintf(output, fileHeader, *url)
|
||
|
generateTables("Grapheme", graphemePropertyRanges)
|
||
|
generateTables("Word", wordPropertyRanges)
|
||
|
generateTables("Sentence", sentencePropertyRanges)
|
||
|
|
||
|
flushOutput()
|
||
|
}
|
||
|
|
||
|
// WordBreakProperty.txt has the form:
|
||
|
// 05F0..05F2 ; Hebrew_Letter # Lo [3] HEBREW LIGATURE YIDDISH DOUBLE VAV..HEBREW LIGATURE YIDDISH DOUBLE YOD
|
||
|
// FB1D ; Hebrew_Letter # Lo HEBREW LETTER YOD WITH HIRIQ
|
||
|
func openReader(file string) (input io.ReadCloser) {
|
||
|
if *localFiles {
|
||
|
f, err := os.Open(file)
|
||
|
if err != nil {
|
||
|
log.Fatal(err)
|
||
|
}
|
||
|
input = f
|
||
|
} else {
|
||
|
path := *url + file
|
||
|
resp, err := http.Get(path)
|
||
|
if err != nil {
|
||
|
log.Fatal(err)
|
||
|
}
|
||
|
if resp.StatusCode != 200 {
|
||
|
log.Fatal("bad GET status for "+file, resp.Status)
|
||
|
}
|
||
|
input = resp.Body
|
||
|
}
|
||
|
return
|
||
|
}
|
||
|
|
||
|
func loadUnicodeData(filename string, propertyRanges map[string]*unicode.RangeTable) {
|
||
|
f := openReader(filename)
|
||
|
defer f.Close()
|
||
|
bufioReader := bufio.NewReader(f)
|
||
|
line, err := bufioReader.ReadString('\n')
|
||
|
for err == nil {
|
||
|
parseLine(line, propertyRanges)
|
||
|
line, err = bufioReader.ReadString('\n')
|
||
|
}
|
||
|
// if the err was EOF still need to process last value
|
||
|
if err == io.EOF {
|
||
|
parseLine(line, propertyRanges)
|
||
|
}
|
||
|
}
|
||
|
|
||
|
const comment = "#"
|
||
|
const sep = ";"
|
||
|
const rnge = ".."
|
||
|
|
||
|
func parseLine(line string, propertyRanges map[string]*unicode.RangeTable) {
|
||
|
if strings.HasPrefix(line, comment) {
|
||
|
return
|
||
|
}
|
||
|
line = strings.TrimSpace(line)
|
||
|
if len(line) == 0 {
|
||
|
return
|
||
|
}
|
||
|
commentStart := strings.Index(line, comment)
|
||
|
if commentStart > 0 {
|
||
|
line = line[0:commentStart]
|
||
|
}
|
||
|
pieces := strings.Split(line, sep)
|
||
|
if len(pieces) != 2 {
|
||
|
log.Printf("unexpected %d pieces in %s", len(pieces), line)
|
||
|
return
|
||
|
}
|
||
|
|
||
|
propertyName := strings.TrimSpace(pieces[1])
|
||
|
|
||
|
rangeTable, ok := propertyRanges[propertyName]
|
||
|
if !ok {
|
||
|
rangeTable = &unicode.RangeTable{
|
||
|
LatinOffset: 0,
|
||
|
}
|
||
|
propertyRanges[propertyName] = rangeTable
|
||
|
}
|
||
|
|
||
|
codepointRange := strings.TrimSpace(pieces[0])
|
||
|
rngeIndex := strings.Index(codepointRange, rnge)
|
||
|
|
||
|
if rngeIndex < 0 {
|
||
|
// single codepoint, not range
|
||
|
codepointInt, err := strconv.ParseUint(codepointRange, 16, 64)
|
||
|
if err != nil {
|
||
|
log.Printf("error parsing int: %v", err)
|
||
|
return
|
||
|
}
|
||
|
if codepointInt < 0x10000 {
|
||
|
r16 := unicode.Range16{
|
||
|
Lo: uint16(codepointInt),
|
||
|
Hi: uint16(codepointInt),
|
||
|
Stride: 1,
|
||
|
}
|
||
|
addR16ToTable(rangeTable, r16)
|
||
|
} else {
|
||
|
r32 := unicode.Range32{
|
||
|
Lo: uint32(codepointInt),
|
||
|
Hi: uint32(codepointInt),
|
||
|
Stride: 1,
|
||
|
}
|
||
|
addR32ToTable(rangeTable, r32)
|
||
|
}
|
||
|
} else {
|
||
|
rngeStart := codepointRange[0:rngeIndex]
|
||
|
rngeEnd := codepointRange[rngeIndex+2:]
|
||
|
rngeStartInt, err := strconv.ParseUint(rngeStart, 16, 64)
|
||
|
if err != nil {
|
||
|
log.Printf("error parsing int: %v", err)
|
||
|
return
|
||
|
}
|
||
|
rngeEndInt, err := strconv.ParseUint(rngeEnd, 16, 64)
|
||
|
if err != nil {
|
||
|
log.Printf("error parsing int: %v", err)
|
||
|
return
|
||
|
}
|
||
|
if rngeStartInt < 0x10000 && rngeEndInt < 0x10000 {
|
||
|
r16 := unicode.Range16{
|
||
|
Lo: uint16(rngeStartInt),
|
||
|
Hi: uint16(rngeEndInt),
|
||
|
Stride: 1,
|
||
|
}
|
||
|
addR16ToTable(rangeTable, r16)
|
||
|
} else if rngeStartInt >= 0x10000 && rngeEndInt >= 0x10000 {
|
||
|
r32 := unicode.Range32{
|
||
|
Lo: uint32(rngeStartInt),
|
||
|
Hi: uint32(rngeEndInt),
|
||
|
Stride: 1,
|
||
|
}
|
||
|
addR32ToTable(rangeTable, r32)
|
||
|
} else {
|
||
|
log.Printf("unexpected range")
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
func addR16ToTable(r *unicode.RangeTable, r16 unicode.Range16) {
|
||
|
if r.R16 == nil {
|
||
|
r.R16 = make([]unicode.Range16, 0, 1)
|
||
|
}
|
||
|
r.R16 = append(r.R16, r16)
|
||
|
if r16.Hi <= unicode.MaxLatin1 {
|
||
|
r.LatinOffset++
|
||
|
}
|
||
|
}
|
||
|
|
||
|
func addR32ToTable(r *unicode.RangeTable, r32 unicode.Range32) {
|
||
|
if r.R32 == nil {
|
||
|
r.R32 = make([]unicode.Range32, 0, 1)
|
||
|
}
|
||
|
r.R32 = append(r.R32, r32)
|
||
|
}
|
||
|
|
||
|
func generateTables(prefix string, propertyRanges map[string]*unicode.RangeTable) {
|
||
|
prNames := make([]string, 0, len(propertyRanges))
|
||
|
for k := range propertyRanges {
|
||
|
prNames = append(prNames, k)
|
||
|
}
|
||
|
sort.Strings(prNames)
|
||
|
for _, key := range prNames {
|
||
|
rt := propertyRanges[key]
|
||
|
fmt.Fprintf(output, "var _%s%s = %s\n", prefix, key, generateRangeTable(rt))
|
||
|
}
|
||
|
fmt.Fprintf(output, "type _%sRuneRange unicode.RangeTable\n", prefix)
|
||
|
|
||
|
fmt.Fprintf(output, "func _%sRuneType(r rune) *_%sRuneRange {\n", prefix, prefix)
|
||
|
fmt.Fprintf(output, "\tswitch {\n")
|
||
|
for _, key := range prNames {
|
||
|
fmt.Fprintf(output, "\tcase unicode.Is(_%s%s, r):\n\t\treturn (*_%sRuneRange)(_%s%s)\n", prefix, key, prefix, prefix, key)
|
||
|
}
|
||
|
fmt.Fprintf(output, "\tdefault:\n\t\treturn nil\n")
|
||
|
fmt.Fprintf(output, "\t}\n")
|
||
|
fmt.Fprintf(output, "}\n")
|
||
|
|
||
|
fmt.Fprintf(output, "func (rng *_%sRuneRange) String() string {\n", prefix)
|
||
|
fmt.Fprintf(output, "\tswitch (*unicode.RangeTable)(rng) {\n")
|
||
|
for _, key := range prNames {
|
||
|
fmt.Fprintf(output, "\tcase _%s%s:\n\t\treturn %q\n", prefix, key, key)
|
||
|
}
|
||
|
fmt.Fprintf(output, "\tdefault:\n\t\treturn \"Other\"\n")
|
||
|
fmt.Fprintf(output, "\t}\n")
|
||
|
fmt.Fprintf(output, "}\n")
|
||
|
}
|
||
|
|
||
|
func generateRangeTable(rt *unicode.RangeTable) string {
|
||
|
rv := "&unicode.RangeTable{\n"
|
||
|
if rt.R16 != nil {
|
||
|
rv += "\tR16: []unicode.Range16{\n"
|
||
|
for _, r16 := range rt.R16 {
|
||
|
rv += fmt.Sprintf("\t\t%#v,\n", r16)
|
||
|
}
|
||
|
rv += "\t},\n"
|
||
|
}
|
||
|
if rt.R32 != nil {
|
||
|
rv += "\tR32: []unicode.Range32{\n"
|
||
|
for _, r32 := range rt.R32 {
|
||
|
rv += fmt.Sprintf("\t\t%#v,\n", r32)
|
||
|
}
|
||
|
rv += "\t},\n"
|
||
|
}
|
||
|
rv += fmt.Sprintf("\t\tLatinOffset: %d,\n", rt.LatinOffset)
|
||
|
rv += "}\n"
|
||
|
return rv
|
||
|
}
|
||
|
|
||
|
const fileHeader = `// Generated by running
|
||
|
// maketables --url=%s
|
||
|
// DO NOT EDIT
|
||
|
|
||
|
package textseg
|
||
|
|
||
|
import(
|
||
|
"unicode"
|
||
|
)
|
||
|
`
|
||
|
|
||
|
func setupOutput() {
|
||
|
output = bufio.NewWriter(startGofmt())
|
||
|
}
|
||
|
|
||
|
// startGofmt connects output to a gofmt process if -output is set.
|
||
|
func startGofmt() io.Writer {
|
||
|
if *outputFile == "" {
|
||
|
return os.Stdout
|
||
|
}
|
||
|
stdout, err := os.Create(*outputFile)
|
||
|
if err != nil {
|
||
|
log.Fatal(err)
|
||
|
}
|
||
|
// Pipe output to gofmt.
|
||
|
gofmt := exec.Command("gofmt")
|
||
|
fd, err := gofmt.StdinPipe()
|
||
|
if err != nil {
|
||
|
log.Fatal(err)
|
||
|
}
|
||
|
gofmt.Stdout = stdout
|
||
|
gofmt.Stderr = os.Stderr
|
||
|
err = gofmt.Start()
|
||
|
if err != nil {
|
||
|
log.Fatal(err)
|
||
|
}
|
||
|
return fd
|
||
|
}
|
||
|
|
||
|
func flushOutput() {
|
||
|
err := output.Flush()
|
||
|
if err != nil {
|
||
|
log.Fatal(err)
|
||
|
}
|
||
|
}
|