govendor fetch golang.org/x/net/idna
This commit is contained in:
parent
7d9152850a
commit
9bd54c24e7
|
@ -1,61 +1,673 @@
|
||||||
// Copyright 2012 The Go Authors. All rights reserved.
|
// Code generated by running "go generate" in golang.org/x/text. DO NOT EDIT.
|
||||||
|
|
||||||
|
// Copyright 2016 The Go Authors. All rights reserved.
|
||||||
// Use of this source code is governed by a BSD-style
|
// Use of this source code is governed by a BSD-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
// Package idna implements IDNA2008 (Internationalized Domain Names for
|
// Package idna implements IDNA2008 using the compatibility processing
|
||||||
// Applications), defined in RFC 5890, RFC 5891, RFC 5892, RFC 5893 and
|
// defined by UTS (Unicode Technical Standard) #46, which defines a standard to
|
||||||
// RFC 5894.
|
// deal with the transition from IDNA2003.
|
||||||
|
//
|
||||||
|
// IDNA2008 (Internationalized Domain Names for Applications), is defined in RFC
|
||||||
|
// 5890, RFC 5891, RFC 5892, RFC 5893 and RFC 5894.
|
||||||
|
// UTS #46 is defined in http://www.unicode.org/reports/tr46.
|
||||||
|
// See http://unicode.org/cldr/utility/idna.jsp for a visualization of the
|
||||||
|
// differences between these two standards.
|
||||||
package idna // import "golang.org/x/net/idna"
|
package idna // import "golang.org/x/net/idna"
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"fmt"
|
||||||
"strings"
|
"strings"
|
||||||
"unicode/utf8"
|
"unicode/utf8"
|
||||||
|
|
||||||
|
"golang.org/x/text/secure/bidirule"
|
||||||
|
"golang.org/x/text/unicode/norm"
|
||||||
)
|
)
|
||||||
|
|
||||||
// TODO(nigeltao): specify when errors occur. For example, is ToASCII(".") or
|
// NOTE: Unlike common practice in Go APIs, the functions will return a
|
||||||
// ToASCII("foo\x00") an error? See also http://www.unicode.org/faq/idn.html#11
|
// sanitized domain name in case of errors. Browsers sometimes use a partially
|
||||||
|
// evaluated string as lookup.
|
||||||
|
// TODO: the current error handling is, in my opinion, the least opinionated.
|
||||||
|
// Other strategies are also viable, though:
|
||||||
|
// Option 1) Return an empty string in case of error, but allow the user to
|
||||||
|
// specify explicitly which errors to ignore.
|
||||||
|
// Option 2) Return the partially evaluated string if it is itself a valid
|
||||||
|
// string, otherwise return the empty string in case of error.
|
||||||
|
// Option 3) Option 1 and 2.
|
||||||
|
// Option 4) Always return an empty string for now and implement Option 1 as
|
||||||
|
// needed, and document that the return string may not be empty in case of
|
||||||
|
// error in the future.
|
||||||
|
// I think Option 1 is best, but it is quite opinionated.
|
||||||
|
|
||||||
// acePrefix is the ASCII Compatible Encoding prefix.
|
// ToASCII is a wrapper for Punycode.ToASCII.
|
||||||
const acePrefix = "xn--"
|
func ToASCII(s string) (string, error) {
|
||||||
|
return Punycode.process(s, true)
|
||||||
|
}
|
||||||
|
|
||||||
|
// ToUnicode is a wrapper for Punycode.ToUnicode.
|
||||||
|
func ToUnicode(s string) (string, error) {
|
||||||
|
return Punycode.process(s, false)
|
||||||
|
}
|
||||||
|
|
||||||
|
// An Option configures a Profile at creation time.
|
||||||
|
type Option func(*options)
|
||||||
|
|
||||||
|
// Transitional sets a Profile to use the Transitional mapping as defined in UTS
|
||||||
|
// #46. This will cause, for example, "ß" to be mapped to "ss". Using the
|
||||||
|
// transitional mapping provides a compromise between IDNA2003 and IDNA2008
|
||||||
|
// compatibility. It is used by most browsers when resolving domain names. This
|
||||||
|
// option is only meaningful if combined with MapForLookup.
|
||||||
|
func Transitional(transitional bool) Option {
|
||||||
|
return func(o *options) { o.transitional = true }
|
||||||
|
}
|
||||||
|
|
||||||
|
// VerifyDNSLength sets whether a Profile should fail if any of the IDN parts
|
||||||
|
// are longer than allowed by the RFC.
|
||||||
|
func VerifyDNSLength(verify bool) Option {
|
||||||
|
return func(o *options) { o.verifyDNSLength = verify }
|
||||||
|
}
|
||||||
|
|
||||||
|
// RemoveLeadingDots removes leading label separators. Leading runes that map to
|
||||||
|
// dots, such as U+3002, are removed as well.
|
||||||
|
//
|
||||||
|
// This is the behavior suggested by the UTS #46 and is adopted by some
|
||||||
|
// browsers.
|
||||||
|
func RemoveLeadingDots(remove bool) Option {
|
||||||
|
return func(o *options) { o.removeLeadingDots = remove }
|
||||||
|
}
|
||||||
|
|
||||||
|
// ValidateLabels sets whether to check the mandatory label validation criteria
|
||||||
|
// as defined in Section 5.4 of RFC 5891. This includes testing for correct use
|
||||||
|
// of hyphens ('-'), normalization, validity of runes, and the context rules.
|
||||||
|
func ValidateLabels(enable bool) Option {
|
||||||
|
return func(o *options) {
|
||||||
|
// Don't override existing mappings, but set one that at least checks
|
||||||
|
// normalization if it is not set.
|
||||||
|
if o.mapping == nil && enable {
|
||||||
|
o.mapping = normalize
|
||||||
|
}
|
||||||
|
o.trie = trie
|
||||||
|
o.validateLabels = enable
|
||||||
|
o.fromPuny = validateFromPunycode
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// StrictDomainName limits the set of permissable ASCII characters to those
|
||||||
|
// allowed in domain names as defined in RFC 1034 (A-Z, a-z, 0-9 and the
|
||||||
|
// hyphen). This is set by default for MapForLookup and ValidateForRegistration.
|
||||||
|
//
|
||||||
|
// This option is useful, for instance, for browsers that allow characters
|
||||||
|
// outside this range, for example a '_' (U+005F LOW LINE). See
|
||||||
|
// http://www.rfc-editor.org/std/std3.txt for more details This option
|
||||||
|
// corresponds to the UseSTD3ASCIIRules option in UTS #46.
|
||||||
|
func StrictDomainName(use bool) Option {
|
||||||
|
return func(o *options) {
|
||||||
|
o.trie = trie
|
||||||
|
o.useSTD3Rules = use
|
||||||
|
o.fromPuny = validateFromPunycode
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// NOTE: the following options pull in tables. The tables should not be linked
|
||||||
|
// in as long as the options are not used.
|
||||||
|
|
||||||
|
// BidiRule enables the Bidi rule as defined in RFC 5893. Any application
|
||||||
|
// that relies on proper validation of labels should include this rule.
|
||||||
|
func BidiRule() Option {
|
||||||
|
return func(o *options) { o.bidirule = bidirule.ValidString }
|
||||||
|
}
|
||||||
|
|
||||||
|
// ValidateForRegistration sets validation options to verify that a given IDN is
|
||||||
|
// properly formatted for registration as defined by Section 4 of RFC 5891.
|
||||||
|
func ValidateForRegistration() Option {
|
||||||
|
return func(o *options) {
|
||||||
|
o.mapping = validateRegistration
|
||||||
|
StrictDomainName(true)(o)
|
||||||
|
ValidateLabels(true)(o)
|
||||||
|
VerifyDNSLength(true)(o)
|
||||||
|
BidiRule()(o)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// MapForLookup sets validation and mapping options such that a given IDN is
|
||||||
|
// transformed for domain name lookup according to the requirements set out in
|
||||||
|
// Section 5 of RFC 5891. The mappings follow the recommendations of RFC 5894,
|
||||||
|
// RFC 5895 and UTS 46. It does not add the Bidi Rule. Use the BidiRule option
|
||||||
|
// to add this check.
|
||||||
|
//
|
||||||
|
// The mappings include normalization and mapping case, width and other
|
||||||
|
// compatibility mappings.
|
||||||
|
func MapForLookup() Option {
|
||||||
|
return func(o *options) {
|
||||||
|
o.mapping = validateAndMap
|
||||||
|
StrictDomainName(true)(o)
|
||||||
|
ValidateLabels(true)(o)
|
||||||
|
RemoveLeadingDots(true)(o)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
type options struct {
|
||||||
|
transitional bool
|
||||||
|
useSTD3Rules bool
|
||||||
|
validateLabels bool
|
||||||
|
verifyDNSLength bool
|
||||||
|
removeLeadingDots bool
|
||||||
|
|
||||||
|
trie *idnaTrie
|
||||||
|
|
||||||
|
// fromPuny calls validation rules when converting A-labels to U-labels.
|
||||||
|
fromPuny func(p *Profile, s string) error
|
||||||
|
|
||||||
|
// mapping implements a validation and mapping step as defined in RFC 5895
|
||||||
|
// or UTS 46, tailored to, for example, domain registration or lookup.
|
||||||
|
mapping func(p *Profile, s string) (string, error)
|
||||||
|
|
||||||
|
// bidirule, if specified, checks whether s conforms to the Bidi Rule
|
||||||
|
// defined in RFC 5893.
|
||||||
|
bidirule func(s string) bool
|
||||||
|
}
|
||||||
|
|
||||||
|
// A Profile defines the configuration of an IDNA mapper.
|
||||||
|
type Profile struct {
|
||||||
|
options
|
||||||
|
}
|
||||||
|
|
||||||
|
func apply(o *options, opts []Option) {
|
||||||
|
for _, f := range opts {
|
||||||
|
f(o)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// New creates a new Profile.
|
||||||
|
//
|
||||||
|
// With no options, the returned Profile is the most permissive and equals the
|
||||||
|
// Punycode Profile. Options can be passed to further restrict the Profile. The
|
||||||
|
// MapForLookup and ValidateForRegistration options set a collection of options,
|
||||||
|
// for lookup and registration purposes respectively, which can be tailored by
|
||||||
|
// adding more fine-grained options, where later options override earlier
|
||||||
|
// options.
|
||||||
|
func New(o ...Option) *Profile {
|
||||||
|
p := &Profile{}
|
||||||
|
apply(&p.options, o)
|
||||||
|
return p
|
||||||
|
}
|
||||||
|
|
||||||
// ToASCII converts a domain or domain label to its ASCII form. For example,
|
// ToASCII converts a domain or domain label to its ASCII form. For example,
|
||||||
// ToASCII("bücher.example.com") is "xn--bcher-kva.example.com", and
|
// ToASCII("bücher.example.com") is "xn--bcher-kva.example.com", and
|
||||||
// ToASCII("golang") is "golang".
|
// ToASCII("golang") is "golang". If an error is encountered it will return
|
||||||
func ToASCII(s string) (string, error) {
|
// an error and a (partially) processed result.
|
||||||
if ascii(s) {
|
func (p *Profile) ToASCII(s string) (string, error) {
|
||||||
return s, nil
|
return p.process(s, true)
|
||||||
}
|
|
||||||
labels := strings.Split(s, ".")
|
|
||||||
for i, label := range labels {
|
|
||||||
if !ascii(label) {
|
|
||||||
a, err := encode(acePrefix, label)
|
|
||||||
if err != nil {
|
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
labels[i] = a
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return strings.Join(labels, "."), nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// ToUnicode converts a domain or domain label to its Unicode form. For example,
|
// ToUnicode converts a domain or domain label to its Unicode form. For example,
|
||||||
// ToUnicode("xn--bcher-kva.example.com") is "bücher.example.com", and
|
// ToUnicode("xn--bcher-kva.example.com") is "bücher.example.com", and
|
||||||
// ToUnicode("golang") is "golang".
|
// ToUnicode("golang") is "golang". If an error is encountered it will return
|
||||||
func ToUnicode(s string) (string, error) {
|
// an error and a (partially) processed result.
|
||||||
if !strings.Contains(s, acePrefix) {
|
func (p *Profile) ToUnicode(s string) (string, error) {
|
||||||
|
pp := *p
|
||||||
|
pp.transitional = false
|
||||||
|
return pp.process(s, false)
|
||||||
|
}
|
||||||
|
|
||||||
|
// String reports a string with a description of the profile for debugging
|
||||||
|
// purposes. The string format may change with different versions.
|
||||||
|
func (p *Profile) String() string {
|
||||||
|
s := ""
|
||||||
|
if p.transitional {
|
||||||
|
s = "Transitional"
|
||||||
|
} else {
|
||||||
|
s = "NonTransitional"
|
||||||
|
}
|
||||||
|
if p.useSTD3Rules {
|
||||||
|
s += ":UseSTD3Rules"
|
||||||
|
}
|
||||||
|
if p.validateLabels {
|
||||||
|
s += ":ValidateLabels"
|
||||||
|
}
|
||||||
|
if p.verifyDNSLength {
|
||||||
|
s += ":VerifyDNSLength"
|
||||||
|
}
|
||||||
|
return s
|
||||||
|
}
|
||||||
|
|
||||||
|
var (
|
||||||
|
// Punycode is a Profile that does raw punycode processing with a minimum
|
||||||
|
// of validation.
|
||||||
|
Punycode *Profile = punycode
|
||||||
|
|
||||||
|
// Lookup is the recommended profile for looking up domain names, according
|
||||||
|
// to Section 5 of RFC 5891. The exact configuration of this profile may
|
||||||
|
// change over time.
|
||||||
|
Lookup *Profile = lookup
|
||||||
|
|
||||||
|
// Display is the recommended profile for displaying domain names.
|
||||||
|
// The configuration of this profile may change over time.
|
||||||
|
Display *Profile = display
|
||||||
|
|
||||||
|
// Registration is the recommended profile for checking whether a given
|
||||||
|
// IDN is valid for registration, according to Section 4 of RFC 5891.
|
||||||
|
Registration *Profile = registration
|
||||||
|
|
||||||
|
punycode = &Profile{}
|
||||||
|
lookup = &Profile{options{
|
||||||
|
transitional: true,
|
||||||
|
useSTD3Rules: true,
|
||||||
|
validateLabels: true,
|
||||||
|
removeLeadingDots: true,
|
||||||
|
trie: trie,
|
||||||
|
fromPuny: validateFromPunycode,
|
||||||
|
mapping: validateAndMap,
|
||||||
|
bidirule: bidirule.ValidString,
|
||||||
|
}}
|
||||||
|
display = &Profile{options{
|
||||||
|
useSTD3Rules: true,
|
||||||
|
validateLabels: true,
|
||||||
|
removeLeadingDots: true,
|
||||||
|
trie: trie,
|
||||||
|
fromPuny: validateFromPunycode,
|
||||||
|
mapping: validateAndMap,
|
||||||
|
bidirule: bidirule.ValidString,
|
||||||
|
}}
|
||||||
|
registration = &Profile{options{
|
||||||
|
useSTD3Rules: true,
|
||||||
|
validateLabels: true,
|
||||||
|
verifyDNSLength: true,
|
||||||
|
trie: trie,
|
||||||
|
fromPuny: validateFromPunycode,
|
||||||
|
mapping: validateRegistration,
|
||||||
|
bidirule: bidirule.ValidString,
|
||||||
|
}}
|
||||||
|
|
||||||
|
// TODO: profiles
|
||||||
|
// Register: recommended for approving domain names: don't do any mappings
|
||||||
|
// but rather reject on invalid input. Bundle or block deviation characters.
|
||||||
|
)
|
||||||
|
|
||||||
|
type labelError struct{ label, code_ string }
|
||||||
|
|
||||||
|
func (e labelError) code() string { return e.code_ }
|
||||||
|
func (e labelError) Error() string {
|
||||||
|
return fmt.Sprintf("idna: invalid label %q", e.label)
|
||||||
|
}
|
||||||
|
|
||||||
|
type runeError rune
|
||||||
|
|
||||||
|
func (e runeError) code() string { return "P1" }
|
||||||
|
func (e runeError) Error() string {
|
||||||
|
return fmt.Sprintf("idna: disallowed rune %U", e)
|
||||||
|
}
|
||||||
|
|
||||||
|
// process implements the algorithm described in section 4 of UTS #46,
|
||||||
|
// see http://www.unicode.org/reports/tr46.
|
||||||
|
func (p *Profile) process(s string, toASCII bool) (string, error) {
|
||||||
|
var err error
|
||||||
|
if p.mapping != nil {
|
||||||
|
s, err = p.mapping(p, s)
|
||||||
|
}
|
||||||
|
// Remove leading empty labels.
|
||||||
|
if p.removeLeadingDots {
|
||||||
|
for ; len(s) > 0 && s[0] == '.'; s = s[1:] {
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// It seems like we should only create this error on ToASCII, but the
|
||||||
|
// UTS 46 conformance tests suggests we should always check this.
|
||||||
|
if err == nil && p.verifyDNSLength && s == "" {
|
||||||
|
err = &labelError{s, "A4"}
|
||||||
|
}
|
||||||
|
labels := labelIter{orig: s}
|
||||||
|
for ; !labels.done(); labels.next() {
|
||||||
|
label := labels.label()
|
||||||
|
if label == "" {
|
||||||
|
// Empty labels are not okay. The label iterator skips the last
|
||||||
|
// label if it is empty.
|
||||||
|
if err == nil && p.verifyDNSLength {
|
||||||
|
err = &labelError{s, "A4"}
|
||||||
|
}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if strings.HasPrefix(label, acePrefix) {
|
||||||
|
u, err2 := decode(label[len(acePrefix):])
|
||||||
|
if err2 != nil {
|
||||||
|
if err == nil {
|
||||||
|
err = err2
|
||||||
|
}
|
||||||
|
// Spec says keep the old label.
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
labels.set(u)
|
||||||
|
if err == nil && p.validateLabels {
|
||||||
|
err = p.fromPuny(p, u)
|
||||||
|
}
|
||||||
|
if err == nil {
|
||||||
|
// This should be called on NonTransitional, according to the
|
||||||
|
// spec, but that currently does not have any effect. Use the
|
||||||
|
// original profile to preserve options.
|
||||||
|
err = p.validateLabel(u)
|
||||||
|
}
|
||||||
|
} else if err == nil {
|
||||||
|
err = p.validateLabel(label)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if toASCII {
|
||||||
|
for labels.reset(); !labels.done(); labels.next() {
|
||||||
|
label := labels.label()
|
||||||
|
if !ascii(label) {
|
||||||
|
a, err2 := encode(acePrefix, label)
|
||||||
|
if err == nil {
|
||||||
|
err = err2
|
||||||
|
}
|
||||||
|
label = a
|
||||||
|
labels.set(a)
|
||||||
|
}
|
||||||
|
n := len(label)
|
||||||
|
if p.verifyDNSLength && err == nil && (n == 0 || n > 63) {
|
||||||
|
err = &labelError{label, "A4"}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
s = labels.result()
|
||||||
|
if toASCII && p.verifyDNSLength && err == nil {
|
||||||
|
// Compute the length of the domain name minus the root label and its dot.
|
||||||
|
n := len(s)
|
||||||
|
if n > 0 && s[n-1] == '.' {
|
||||||
|
n--
|
||||||
|
}
|
||||||
|
if len(s) < 1 || n > 253 {
|
||||||
|
err = &labelError{s, "A4"}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return s, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func normalize(p *Profile, s string) (string, error) {
|
||||||
|
return norm.NFC.String(s), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func validateRegistration(p *Profile, s string) (string, error) {
|
||||||
|
if !norm.NFC.IsNormalString(s) {
|
||||||
|
return s, &labelError{s, "V1"}
|
||||||
|
}
|
||||||
|
for i := 0; i < len(s); {
|
||||||
|
v, sz := trie.lookupString(s[i:])
|
||||||
|
// Copy bytes not copied so far.
|
||||||
|
switch p.simplify(info(v).category()) {
|
||||||
|
// TODO: handle the NV8 defined in the Unicode idna data set to allow
|
||||||
|
// for strict conformance to IDNA2008.
|
||||||
|
case valid, deviation:
|
||||||
|
case disallowed, mapped, unknown, ignored:
|
||||||
|
r, _ := utf8.DecodeRuneInString(s[i:])
|
||||||
|
return s, runeError(r)
|
||||||
|
}
|
||||||
|
i += sz
|
||||||
|
}
|
||||||
return s, nil
|
return s, nil
|
||||||
}
|
}
|
||||||
labels := strings.Split(s, ".")
|
|
||||||
for i, label := range labels {
|
func validateAndMap(p *Profile, s string) (string, error) {
|
||||||
if strings.HasPrefix(label, acePrefix) {
|
var (
|
||||||
u, err := decode(label[len(acePrefix):])
|
err error
|
||||||
if err != nil {
|
b []byte
|
||||||
return "", err
|
k int
|
||||||
|
)
|
||||||
|
for i := 0; i < len(s); {
|
||||||
|
v, sz := trie.lookupString(s[i:])
|
||||||
|
start := i
|
||||||
|
i += sz
|
||||||
|
// Copy bytes not copied so far.
|
||||||
|
switch p.simplify(info(v).category()) {
|
||||||
|
case valid:
|
||||||
|
continue
|
||||||
|
case disallowed:
|
||||||
|
if err == nil {
|
||||||
|
r, _ := utf8.DecodeRuneInString(s[start:])
|
||||||
|
err = runeError(r)
|
||||||
}
|
}
|
||||||
labels[i] = u
|
continue
|
||||||
|
case mapped, deviation:
|
||||||
|
b = append(b, s[k:start]...)
|
||||||
|
b = info(v).appendMapping(b, s[start:i])
|
||||||
|
case ignored:
|
||||||
|
b = append(b, s[k:start]...)
|
||||||
|
// drop the rune
|
||||||
|
case unknown:
|
||||||
|
b = append(b, s[k:start]...)
|
||||||
|
b = append(b, "\ufffd"...)
|
||||||
|
}
|
||||||
|
k = i
|
||||||
|
}
|
||||||
|
if k == 0 {
|
||||||
|
// No changes so far.
|
||||||
|
s = norm.NFC.String(s)
|
||||||
|
} else {
|
||||||
|
b = append(b, s[k:]...)
|
||||||
|
if norm.NFC.QuickSpan(b) != len(b) {
|
||||||
|
b = norm.NFC.Bytes(b)
|
||||||
|
}
|
||||||
|
// TODO: the punycode converters require strings as input.
|
||||||
|
s = string(b)
|
||||||
|
}
|
||||||
|
return s, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// A labelIter allows iterating over domain name labels.
|
||||||
|
type labelIter struct {
|
||||||
|
orig string
|
||||||
|
slice []string
|
||||||
|
curStart int
|
||||||
|
curEnd int
|
||||||
|
i int
|
||||||
|
}
|
||||||
|
|
||||||
|
func (l *labelIter) reset() {
|
||||||
|
l.curStart = 0
|
||||||
|
l.curEnd = 0
|
||||||
|
l.i = 0
|
||||||
|
}
|
||||||
|
|
||||||
|
func (l *labelIter) done() bool {
|
||||||
|
return l.curStart >= len(l.orig)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (l *labelIter) result() string {
|
||||||
|
if l.slice != nil {
|
||||||
|
return strings.Join(l.slice, ".")
|
||||||
|
}
|
||||||
|
return l.orig
|
||||||
|
}
|
||||||
|
|
||||||
|
func (l *labelIter) label() string {
|
||||||
|
if l.slice != nil {
|
||||||
|
return l.slice[l.i]
|
||||||
|
}
|
||||||
|
p := strings.IndexByte(l.orig[l.curStart:], '.')
|
||||||
|
l.curEnd = l.curStart + p
|
||||||
|
if p == -1 {
|
||||||
|
l.curEnd = len(l.orig)
|
||||||
|
}
|
||||||
|
return l.orig[l.curStart:l.curEnd]
|
||||||
|
}
|
||||||
|
|
||||||
|
// next sets the value to the next label. It skips the last label if it is empty.
|
||||||
|
func (l *labelIter) next() {
|
||||||
|
l.i++
|
||||||
|
if l.slice != nil {
|
||||||
|
if l.i >= len(l.slice) || l.i == len(l.slice)-1 && l.slice[l.i] == "" {
|
||||||
|
l.curStart = len(l.orig)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
l.curStart = l.curEnd + 1
|
||||||
|
if l.curStart == len(l.orig)-1 && l.orig[l.curStart] == '.' {
|
||||||
|
l.curStart = len(l.orig)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return strings.Join(labels, "."), nil
|
}
|
||||||
|
|
||||||
|
func (l *labelIter) set(s string) {
|
||||||
|
if l.slice == nil {
|
||||||
|
l.slice = strings.Split(l.orig, ".")
|
||||||
|
}
|
||||||
|
l.slice[l.i] = s
|
||||||
|
}
|
||||||
|
|
||||||
|
// acePrefix is the ASCII Compatible Encoding prefix.
|
||||||
|
const acePrefix = "xn--"
|
||||||
|
|
||||||
|
func (p *Profile) simplify(cat category) category {
|
||||||
|
switch cat {
|
||||||
|
case disallowedSTD3Mapped:
|
||||||
|
if p.useSTD3Rules {
|
||||||
|
cat = disallowed
|
||||||
|
} else {
|
||||||
|
cat = mapped
|
||||||
|
}
|
||||||
|
case disallowedSTD3Valid:
|
||||||
|
if p.useSTD3Rules {
|
||||||
|
cat = disallowed
|
||||||
|
} else {
|
||||||
|
cat = valid
|
||||||
|
}
|
||||||
|
case deviation:
|
||||||
|
if !p.transitional {
|
||||||
|
cat = valid
|
||||||
|
}
|
||||||
|
case validNV8, validXV8:
|
||||||
|
// TODO: handle V2008
|
||||||
|
cat = valid
|
||||||
|
}
|
||||||
|
return cat
|
||||||
|
}
|
||||||
|
|
||||||
|
func validateFromPunycode(p *Profile, s string) error {
|
||||||
|
if !norm.NFC.IsNormalString(s) {
|
||||||
|
return &labelError{s, "V1"}
|
||||||
|
}
|
||||||
|
for i := 0; i < len(s); {
|
||||||
|
v, sz := trie.lookupString(s[i:])
|
||||||
|
if c := p.simplify(info(v).category()); c != valid && c != deviation {
|
||||||
|
return &labelError{s, "V6"}
|
||||||
|
}
|
||||||
|
i += sz
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
const (
|
||||||
|
zwnj = "\u200c"
|
||||||
|
zwj = "\u200d"
|
||||||
|
)
|
||||||
|
|
||||||
|
type joinState int8
|
||||||
|
|
||||||
|
const (
|
||||||
|
stateStart joinState = iota
|
||||||
|
stateVirama
|
||||||
|
stateBefore
|
||||||
|
stateBeforeVirama
|
||||||
|
stateAfter
|
||||||
|
stateFAIL
|
||||||
|
)
|
||||||
|
|
||||||
|
var joinStates = [][numJoinTypes]joinState{
|
||||||
|
stateStart: {
|
||||||
|
joiningL: stateBefore,
|
||||||
|
joiningD: stateBefore,
|
||||||
|
joinZWNJ: stateFAIL,
|
||||||
|
joinZWJ: stateFAIL,
|
||||||
|
joinVirama: stateVirama,
|
||||||
|
},
|
||||||
|
stateVirama: {
|
||||||
|
joiningL: stateBefore,
|
||||||
|
joiningD: stateBefore,
|
||||||
|
},
|
||||||
|
stateBefore: {
|
||||||
|
joiningL: stateBefore,
|
||||||
|
joiningD: stateBefore,
|
||||||
|
joiningT: stateBefore,
|
||||||
|
joinZWNJ: stateAfter,
|
||||||
|
joinZWJ: stateFAIL,
|
||||||
|
joinVirama: stateBeforeVirama,
|
||||||
|
},
|
||||||
|
stateBeforeVirama: {
|
||||||
|
joiningL: stateBefore,
|
||||||
|
joiningD: stateBefore,
|
||||||
|
joiningT: stateBefore,
|
||||||
|
},
|
||||||
|
stateAfter: {
|
||||||
|
joiningL: stateFAIL,
|
||||||
|
joiningD: stateBefore,
|
||||||
|
joiningT: stateAfter,
|
||||||
|
joiningR: stateStart,
|
||||||
|
joinZWNJ: stateFAIL,
|
||||||
|
joinZWJ: stateFAIL,
|
||||||
|
joinVirama: stateAfter, // no-op as we can't accept joiners here
|
||||||
|
},
|
||||||
|
stateFAIL: {
|
||||||
|
0: stateFAIL,
|
||||||
|
joiningL: stateFAIL,
|
||||||
|
joiningD: stateFAIL,
|
||||||
|
joiningT: stateFAIL,
|
||||||
|
joiningR: stateFAIL,
|
||||||
|
joinZWNJ: stateFAIL,
|
||||||
|
joinZWJ: stateFAIL,
|
||||||
|
joinVirama: stateFAIL,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
// validateLabel validates the criteria from Section 4.1. Item 1, 4, and 6 are
|
||||||
|
// already implicitly satisfied by the overall implementation.
|
||||||
|
func (p *Profile) validateLabel(s string) error {
|
||||||
|
if s == "" {
|
||||||
|
if p.verifyDNSLength {
|
||||||
|
return &labelError{s, "A4"}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if p.bidirule != nil && !p.bidirule(s) {
|
||||||
|
return &labelError{s, "B"}
|
||||||
|
}
|
||||||
|
if !p.validateLabels {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
trie := p.trie // p.validateLabels is only set if trie is set.
|
||||||
|
if len(s) > 4 && s[2] == '-' && s[3] == '-' {
|
||||||
|
return &labelError{s, "V2"}
|
||||||
|
}
|
||||||
|
if s[0] == '-' || s[len(s)-1] == '-' {
|
||||||
|
return &labelError{s, "V3"}
|
||||||
|
}
|
||||||
|
// TODO: merge the use of this in the trie.
|
||||||
|
v, sz := trie.lookupString(s)
|
||||||
|
x := info(v)
|
||||||
|
if x.isModifier() {
|
||||||
|
return &labelError{s, "V5"}
|
||||||
|
}
|
||||||
|
// Quickly return in the absence of zero-width (non) joiners.
|
||||||
|
if strings.Index(s, zwj) == -1 && strings.Index(s, zwnj) == -1 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
st := stateStart
|
||||||
|
for i := 0; ; {
|
||||||
|
jt := x.joinType()
|
||||||
|
if s[i:i+sz] == zwj {
|
||||||
|
jt = joinZWJ
|
||||||
|
} else if s[i:i+sz] == zwnj {
|
||||||
|
jt = joinZWNJ
|
||||||
|
}
|
||||||
|
st = joinStates[st][jt]
|
||||||
|
if x.isViramaModifier() {
|
||||||
|
st = joinStates[st][joinVirama]
|
||||||
|
}
|
||||||
|
if i += sz; i == len(s) {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
v, sz = trie.lookupString(s[i:])
|
||||||
|
x = info(v)
|
||||||
|
}
|
||||||
|
if st == stateFAIL || st == stateAfter {
|
||||||
|
return &labelError{s, "C"}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func ascii(s string) bool {
|
func ascii(s string) bool {
|
||||||
|
|
|
@ -1,4 +1,6 @@
|
||||||
// Copyright 2012 The Go Authors. All rights reserved.
|
// Code generated by running "go generate" in golang.org/x/text. DO NOT EDIT.
|
||||||
|
|
||||||
|
// Copyright 2016 The Go Authors. All rights reserved.
|
||||||
// Use of this source code is governed by a BSD-style
|
// Use of this source code is governed by a BSD-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
@ -7,7 +9,6 @@ package idna
|
||||||
// This file implements the Punycode algorithm from RFC 3492.
|
// This file implements the Punycode algorithm from RFC 3492.
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
|
||||||
"math"
|
"math"
|
||||||
"strings"
|
"strings"
|
||||||
"unicode/utf8"
|
"unicode/utf8"
|
||||||
|
@ -27,6 +28,8 @@ const (
|
||||||
tmin int32 = 1
|
tmin int32 = 1
|
||||||
)
|
)
|
||||||
|
|
||||||
|
func punyError(s string) error { return &labelError{s, "A3"} }
|
||||||
|
|
||||||
// decode decodes a string as specified in section 6.2.
|
// decode decodes a string as specified in section 6.2.
|
||||||
func decode(encoded string) (string, error) {
|
func decode(encoded string) (string, error) {
|
||||||
if encoded == "" {
|
if encoded == "" {
|
||||||
|
@ -34,7 +37,7 @@ func decode(encoded string) (string, error) {
|
||||||
}
|
}
|
||||||
pos := 1 + strings.LastIndex(encoded, "-")
|
pos := 1 + strings.LastIndex(encoded, "-")
|
||||||
if pos == 1 {
|
if pos == 1 {
|
||||||
return "", fmt.Errorf("idna: invalid label %q", encoded)
|
return "", punyError(encoded)
|
||||||
}
|
}
|
||||||
if pos == len(encoded) {
|
if pos == len(encoded) {
|
||||||
return encoded[:len(encoded)-1], nil
|
return encoded[:len(encoded)-1], nil
|
||||||
|
@ -50,16 +53,16 @@ func decode(encoded string) (string, error) {
|
||||||
oldI, w := i, int32(1)
|
oldI, w := i, int32(1)
|
||||||
for k := base; ; k += base {
|
for k := base; ; k += base {
|
||||||
if pos == len(encoded) {
|
if pos == len(encoded) {
|
||||||
return "", fmt.Errorf("idna: invalid label %q", encoded)
|
return "", punyError(encoded)
|
||||||
}
|
}
|
||||||
digit, ok := decodeDigit(encoded[pos])
|
digit, ok := decodeDigit(encoded[pos])
|
||||||
if !ok {
|
if !ok {
|
||||||
return "", fmt.Errorf("idna: invalid label %q", encoded)
|
return "", punyError(encoded)
|
||||||
}
|
}
|
||||||
pos++
|
pos++
|
||||||
i += digit * w
|
i += digit * w
|
||||||
if i < 0 {
|
if i < 0 {
|
||||||
return "", fmt.Errorf("idna: invalid label %q", encoded)
|
return "", punyError(encoded)
|
||||||
}
|
}
|
||||||
t := k - bias
|
t := k - bias
|
||||||
if t < tmin {
|
if t < tmin {
|
||||||
|
@ -72,7 +75,7 @@ func decode(encoded string) (string, error) {
|
||||||
}
|
}
|
||||||
w *= base - t
|
w *= base - t
|
||||||
if w >= math.MaxInt32/base {
|
if w >= math.MaxInt32/base {
|
||||||
return "", fmt.Errorf("idna: invalid label %q", encoded)
|
return "", punyError(encoded)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
x := int32(len(output) + 1)
|
x := int32(len(output) + 1)
|
||||||
|
@ -80,7 +83,7 @@ func decode(encoded string) (string, error) {
|
||||||
n += i / x
|
n += i / x
|
||||||
i %= x
|
i %= x
|
||||||
if n > utf8.MaxRune || len(output) >= 1024 {
|
if n > utf8.MaxRune || len(output) >= 1024 {
|
||||||
return "", fmt.Errorf("idna: invalid label %q", encoded)
|
return "", punyError(encoded)
|
||||||
}
|
}
|
||||||
output = append(output, 0)
|
output = append(output, 0)
|
||||||
copy(output[i+1:], output[i:])
|
copy(output[i+1:], output[i:])
|
||||||
|
@ -121,14 +124,14 @@ func encode(prefix, s string) (string, error) {
|
||||||
}
|
}
|
||||||
delta += (m - n) * (h + 1)
|
delta += (m - n) * (h + 1)
|
||||||
if delta < 0 {
|
if delta < 0 {
|
||||||
return "", fmt.Errorf("idna: invalid label %q", s)
|
return "", punyError(s)
|
||||||
}
|
}
|
||||||
n = m
|
n = m
|
||||||
for _, r := range s {
|
for _, r := range s {
|
||||||
if r < n {
|
if r < n {
|
||||||
delta++
|
delta++
|
||||||
if delta < 0 {
|
if delta < 0 {
|
||||||
return "", fmt.Errorf("idna: invalid label %q", s)
|
return "", punyError(s)
|
||||||
}
|
}
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,72 @@
|
||||||
|
// Code generated by running "go generate" in golang.org/x/text. DO NOT EDIT.
|
||||||
|
|
||||||
|
// Copyright 2016 The Go Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package idna
|
||||||
|
|
||||||
|
// appendMapping appends the mapping for the respective rune. isMapped must be
|
||||||
|
// true. A mapping is a categorization of a rune as defined in UTS #46.
|
||||||
|
func (c info) appendMapping(b []byte, s string) []byte {
|
||||||
|
index := int(c >> indexShift)
|
||||||
|
if c&xorBit == 0 {
|
||||||
|
s := mappings[index:]
|
||||||
|
return append(b, s[1:s[0]+1]...)
|
||||||
|
}
|
||||||
|
b = append(b, s...)
|
||||||
|
if c&inlineXOR == inlineXOR {
|
||||||
|
// TODO: support and handle two-byte inline masks
|
||||||
|
b[len(b)-1] ^= byte(index)
|
||||||
|
} else {
|
||||||
|
for p := len(b) - int(xorData[index]); p < len(b); p++ {
|
||||||
|
index++
|
||||||
|
b[p] ^= xorData[index]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return b
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sparse block handling code.
|
||||||
|
|
||||||
|
type valueRange struct {
|
||||||
|
value uint16 // header: value:stride
|
||||||
|
lo, hi byte // header: lo:n
|
||||||
|
}
|
||||||
|
|
||||||
|
type sparseBlocks struct {
|
||||||
|
values []valueRange
|
||||||
|
offset []uint16
|
||||||
|
}
|
||||||
|
|
||||||
|
var idnaSparse = sparseBlocks{
|
||||||
|
values: idnaSparseValues[:],
|
||||||
|
offset: idnaSparseOffset[:],
|
||||||
|
}
|
||||||
|
|
||||||
|
// Don't use newIdnaTrie to avoid unconditional linking in of the table.
|
||||||
|
var trie = &idnaTrie{}
|
||||||
|
|
||||||
|
// lookup determines the type of block n and looks up the value for b.
|
||||||
|
// For n < t.cutoff, the block is a simple lookup table. Otherwise, the block
|
||||||
|
// is a list of ranges with an accompanying value. Given a matching range r,
|
||||||
|
// the value for b is by r.value + (b - r.lo) * stride.
|
||||||
|
func (t *sparseBlocks) lookup(n uint32, b byte) uint16 {
|
||||||
|
offset := t.offset[n]
|
||||||
|
header := t.values[offset]
|
||||||
|
lo := offset + 1
|
||||||
|
hi := lo + uint16(header.lo)
|
||||||
|
for lo < hi {
|
||||||
|
m := lo + (hi-lo)/2
|
||||||
|
r := t.values[m]
|
||||||
|
if r.lo <= b && b <= r.hi {
|
||||||
|
return r.value + uint16(b-r.lo)*header.value
|
||||||
|
}
|
||||||
|
if b < r.lo {
|
||||||
|
hi = m
|
||||||
|
} else {
|
||||||
|
lo = m + 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0
|
||||||
|
}
|
|
@ -0,0 +1,114 @@
|
||||||
|
// Code generated by running "go generate" in golang.org/x/text. DO NOT EDIT.
|
||||||
|
|
||||||
|
package idna
|
||||||
|
|
||||||
|
// This file contains definitions for interpreting the trie value of the idna
|
||||||
|
// trie generated by "go run gen*.go". It is shared by both the generator
|
||||||
|
// program and the resultant package. Sharing is achieved by the generator
|
||||||
|
// copying gen_trieval.go to trieval.go and changing what's above this comment.
|
||||||
|
|
||||||
|
// info holds information from the IDNA mapping table for a single rune. It is
|
||||||
|
// the value returned by a trie lookup. In most cases, all information fits in
|
||||||
|
// a 16-bit value. For mappings, this value may contain an index into a slice
|
||||||
|
// with the mapped string. Such mappings can consist of the actual mapped value
|
||||||
|
// or an XOR pattern to be applied to the bytes of the UTF8 encoding of the
|
||||||
|
// input rune. This technique is used by the cases packages and reduces the
|
||||||
|
// table size significantly.
|
||||||
|
//
|
||||||
|
// The per-rune values have the following format:
|
||||||
|
//
|
||||||
|
// if mapped {
|
||||||
|
// if inlinedXOR {
|
||||||
|
// 15..13 inline XOR marker
|
||||||
|
// 12..11 unused
|
||||||
|
// 10..3 inline XOR mask
|
||||||
|
// } else {
|
||||||
|
// 15..3 index into xor or mapping table
|
||||||
|
// }
|
||||||
|
// } else {
|
||||||
|
// 15..13 unused
|
||||||
|
// 12 modifier (including virama)
|
||||||
|
// 11 virama modifier
|
||||||
|
// 10..8 joining type
|
||||||
|
// 7..3 category type
|
||||||
|
// }
|
||||||
|
// 2 use xor pattern
|
||||||
|
// 1..0 mapped category
|
||||||
|
//
|
||||||
|
// See the definitions below for a more detailed description of the various
|
||||||
|
// bits.
|
||||||
|
type info uint16
|
||||||
|
|
||||||
|
const (
|
||||||
|
catSmallMask = 0x3
|
||||||
|
catBigMask = 0xF8
|
||||||
|
indexShift = 3
|
||||||
|
xorBit = 0x4 // interpret the index as an xor pattern
|
||||||
|
inlineXOR = 0xE000 // These bits are set if the XOR pattern is inlined.
|
||||||
|
|
||||||
|
joinShift = 8
|
||||||
|
joinMask = 0x07
|
||||||
|
|
||||||
|
viramaModifier = 0x0800
|
||||||
|
modifier = 0x1000
|
||||||
|
)
|
||||||
|
|
||||||
|
// A category corresponds to a category defined in the IDNA mapping table.
|
||||||
|
type category uint16
|
||||||
|
|
||||||
|
const (
|
||||||
|
unknown category = 0 // not defined currently in unicode.
|
||||||
|
mapped category = 1
|
||||||
|
disallowedSTD3Mapped category = 2
|
||||||
|
deviation category = 3
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
valid category = 0x08
|
||||||
|
validNV8 category = 0x18
|
||||||
|
validXV8 category = 0x28
|
||||||
|
disallowed category = 0x40
|
||||||
|
disallowedSTD3Valid category = 0x80
|
||||||
|
ignored category = 0xC0
|
||||||
|
)
|
||||||
|
|
||||||
|
// join types and additional rune information
|
||||||
|
const (
|
||||||
|
joiningL = (iota + 1)
|
||||||
|
joiningD
|
||||||
|
joiningT
|
||||||
|
joiningR
|
||||||
|
|
||||||
|
//the following types are derived during processing
|
||||||
|
joinZWJ
|
||||||
|
joinZWNJ
|
||||||
|
joinVirama
|
||||||
|
numJoinTypes
|
||||||
|
)
|
||||||
|
|
||||||
|
func (c info) isMapped() bool {
|
||||||
|
return c&0x3 != 0
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c info) category() category {
|
||||||
|
small := c & catSmallMask
|
||||||
|
if small != 0 {
|
||||||
|
return category(small)
|
||||||
|
}
|
||||||
|
return category(c & catBigMask)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c info) joinType() info {
|
||||||
|
if c.isMapped() {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return (c >> joinShift) & joinMask
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c info) isModifier() bool {
|
||||||
|
return c&(modifier|catSmallMask) == modifier
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c info) isViramaModifier() bool {
|
||||||
|
return c&(viramaModifier|catSmallMask) == viramaModifier
|
||||||
|
}
|
|
@ -0,0 +1,340 @@
|
||||||
|
// Copyright 2016 The Go Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
// Package bidirule implements the Bidi Rule defined by RFC 5893.
|
||||||
|
//
|
||||||
|
// This package is under development. The API may change without notice and
|
||||||
|
// without preserving backward compatibility.
|
||||||
|
package bidirule
|
||||||
|
|
||||||
|
import (
|
||||||
|
"errors"
|
||||||
|
"unicode/utf8"
|
||||||
|
|
||||||
|
"golang.org/x/text/transform"
|
||||||
|
"golang.org/x/text/unicode/bidi"
|
||||||
|
)
|
||||||
|
|
||||||
|
// This file contains an implementation of RFC 5893: Right-to-Left Scripts for
|
||||||
|
// Internationalized Domain Names for Applications (IDNA)
|
||||||
|
//
|
||||||
|
// A label is an individual component of a domain name. Labels are usually
|
||||||
|
// shown separated by dots; for example, the domain name "www.example.com" is
|
||||||
|
// composed of three labels: "www", "example", and "com".
|
||||||
|
//
|
||||||
|
// An RTL label is a label that contains at least one character of class R, AL,
|
||||||
|
// or AN. An LTR label is any label that is not an RTL label.
|
||||||
|
//
|
||||||
|
// A "Bidi domain name" is a domain name that contains at least one RTL label.
|
||||||
|
//
|
||||||
|
// The following guarantees can be made based on the above:
|
||||||
|
//
|
||||||
|
// o In a domain name consisting of only labels that satisfy the rule,
|
||||||
|
// the requirements of Section 3 are satisfied. Note that even LTR
|
||||||
|
// labels and pure ASCII labels have to be tested.
|
||||||
|
//
|
||||||
|
// o In a domain name consisting of only LDH labels (as defined in the
|
||||||
|
// Definitions document [RFC5890]) and labels that satisfy the rule,
|
||||||
|
// the requirements of Section 3 are satisfied as long as a label
|
||||||
|
// that starts with an ASCII digit does not come after a
|
||||||
|
// right-to-left label.
|
||||||
|
//
|
||||||
|
// No guarantee is given for other combinations.
|
||||||
|
|
||||||
|
// ErrInvalid indicates a label is invalid according to the Bidi Rule.
|
||||||
|
var ErrInvalid = errors.New("bidirule: failed Bidi Rule")
|
||||||
|
|
||||||
|
type ruleState uint8
|
||||||
|
|
||||||
|
const (
|
||||||
|
ruleInitial ruleState = iota
|
||||||
|
ruleLTR
|
||||||
|
ruleLTRFinal
|
||||||
|
ruleRTL
|
||||||
|
ruleRTLFinal
|
||||||
|
ruleInvalid
|
||||||
|
)
|
||||||
|
|
||||||
|
type ruleTransition struct {
|
||||||
|
next ruleState
|
||||||
|
mask uint16
|
||||||
|
}
|
||||||
|
|
||||||
|
var transitions = [...][2]ruleTransition{
|
||||||
|
// [2.1] The first character must be a character with Bidi property L, R, or
|
||||||
|
// AL. If it has the R or AL property, it is an RTL label; if it has the L
|
||||||
|
// property, it is an LTR label.
|
||||||
|
ruleInitial: {
|
||||||
|
{ruleLTRFinal, 1 << bidi.L},
|
||||||
|
{ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL},
|
||||||
|
},
|
||||||
|
ruleRTL: {
|
||||||
|
// [2.3] In an RTL label, the end of the label must be a character with
|
||||||
|
// Bidi property R, AL, EN, or AN, followed by zero or more characters
|
||||||
|
// with Bidi property NSM.
|
||||||
|
{ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL | 1<<bidi.EN | 1<<bidi.AN},
|
||||||
|
|
||||||
|
// [2.2] In an RTL label, only characters with the Bidi properties R,
|
||||||
|
// AL, AN, EN, ES, CS, ET, ON, BN, or NSM are allowed.
|
||||||
|
// We exclude the entries from [2.3]
|
||||||
|
{ruleRTL, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN | 1<<bidi.NSM},
|
||||||
|
},
|
||||||
|
ruleRTLFinal: {
|
||||||
|
// [2.3] In an RTL label, the end of the label must be a character with
|
||||||
|
// Bidi property R, AL, EN, or AN, followed by zero or more characters
|
||||||
|
// with Bidi property NSM.
|
||||||
|
{ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL | 1<<bidi.EN | 1<<bidi.AN | 1<<bidi.NSM},
|
||||||
|
|
||||||
|
// [2.2] In an RTL label, only characters with the Bidi properties R,
|
||||||
|
// AL, AN, EN, ES, CS, ET, ON, BN, or NSM are allowed.
|
||||||
|
// We exclude the entries from [2.3] and NSM.
|
||||||
|
{ruleRTL, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN},
|
||||||
|
},
|
||||||
|
ruleLTR: {
|
||||||
|
// [2.6] In an LTR label, the end of the label must be a character with
|
||||||
|
// Bidi property L or EN, followed by zero or more characters with Bidi
|
||||||
|
// property NSM.
|
||||||
|
{ruleLTRFinal, 1<<bidi.L | 1<<bidi.EN},
|
||||||
|
|
||||||
|
// [2.5] In an LTR label, only characters with the Bidi properties L,
|
||||||
|
// EN, ES, CS, ET, ON, BN, or NSM are allowed.
|
||||||
|
// We exclude the entries from [2.6].
|
||||||
|
{ruleLTR, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN | 1<<bidi.NSM},
|
||||||
|
},
|
||||||
|
ruleLTRFinal: {
|
||||||
|
// [2.6] In an LTR label, the end of the label must be a character with
|
||||||
|
// Bidi property L or EN, followed by zero or more characters with Bidi
|
||||||
|
// property NSM.
|
||||||
|
{ruleLTRFinal, 1<<bidi.L | 1<<bidi.EN | 1<<bidi.NSM},
|
||||||
|
|
||||||
|
// [2.5] In an LTR label, only characters with the Bidi properties L,
|
||||||
|
// EN, ES, CS, ET, ON, BN, or NSM are allowed.
|
||||||
|
// We exclude the entries from [2.6].
|
||||||
|
{ruleLTR, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN},
|
||||||
|
},
|
||||||
|
ruleInvalid: {
|
||||||
|
{ruleInvalid, 0},
|
||||||
|
{ruleInvalid, 0},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
// [2.4] In an RTL label, if an EN is present, no AN may be present, and
|
||||||
|
// vice versa.
|
||||||
|
const exclusiveRTL = uint16(1<<bidi.EN | 1<<bidi.AN)
|
||||||
|
|
||||||
|
// From RFC 5893
|
||||||
|
// An RTL label is a label that contains at least one character of type
|
||||||
|
// R, AL, or AN.
|
||||||
|
//
|
||||||
|
// An LTR label is any label that is not an RTL label.
|
||||||
|
|
||||||
|
// Direction reports the direction of the given label as defined by RFC 5893.
|
||||||
|
// The Bidi Rule does not have to be applied to labels of the category
|
||||||
|
// LeftToRight.
|
||||||
|
func Direction(b []byte) bidi.Direction {
|
||||||
|
for i := 0; i < len(b); {
|
||||||
|
e, sz := bidi.Lookup(b[i:])
|
||||||
|
if sz == 0 {
|
||||||
|
i++
|
||||||
|
}
|
||||||
|
c := e.Class()
|
||||||
|
if c == bidi.R || c == bidi.AL || c == bidi.AN {
|
||||||
|
return bidi.RightToLeft
|
||||||
|
}
|
||||||
|
i += sz
|
||||||
|
}
|
||||||
|
return bidi.LeftToRight
|
||||||
|
}
|
||||||
|
|
||||||
|
// DirectionString reports the direction of the given label as defined by RFC
|
||||||
|
// 5893. The Bidi Rule does not have to be applied to labels of the category
|
||||||
|
// LeftToRight.
|
||||||
|
func DirectionString(s string) bidi.Direction {
|
||||||
|
for i := 0; i < len(s); {
|
||||||
|
e, sz := bidi.LookupString(s[i:])
|
||||||
|
if sz == 0 {
|
||||||
|
i++
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
c := e.Class()
|
||||||
|
if c == bidi.R || c == bidi.AL || c == bidi.AN {
|
||||||
|
return bidi.RightToLeft
|
||||||
|
}
|
||||||
|
i += sz
|
||||||
|
}
|
||||||
|
return bidi.LeftToRight
|
||||||
|
}
|
||||||
|
|
||||||
|
// Valid reports whether b conforms to the BiDi rule.
|
||||||
|
func Valid(b []byte) bool {
|
||||||
|
var t Transformer
|
||||||
|
if n, ok := t.advance(b); !ok || n < len(b) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
return t.isFinal()
|
||||||
|
}
|
||||||
|
|
||||||
|
// ValidString reports whether s conforms to the BiDi rule.
|
||||||
|
func ValidString(s string) bool {
|
||||||
|
var t Transformer
|
||||||
|
if n, ok := t.advanceString(s); !ok || n < len(s) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
return t.isFinal()
|
||||||
|
}
|
||||||
|
|
||||||
|
// New returns a Transformer that verifies that input adheres to the Bidi Rule.
|
||||||
|
func New() *Transformer {
|
||||||
|
return &Transformer{}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Transformer implements transform.Transform.
|
||||||
|
type Transformer struct {
|
||||||
|
state ruleState
|
||||||
|
hasRTL bool
|
||||||
|
seen uint16
|
||||||
|
}
|
||||||
|
|
||||||
|
// A rule can only be violated for "Bidi Domain names", meaning if one of the
|
||||||
|
// following categories has been observed.
|
||||||
|
func (t *Transformer) isRTL() bool {
|
||||||
|
const isRTL = 1<<bidi.R | 1<<bidi.AL | 1<<bidi.AN
|
||||||
|
return t.seen&isRTL != 0
|
||||||
|
}
|
||||||
|
|
||||||
|
func (t *Transformer) isFinal() bool {
|
||||||
|
return t.state == ruleLTRFinal || t.state == ruleRTLFinal || t.state == ruleInitial
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reset implements transform.Transformer.
|
||||||
|
func (t *Transformer) Reset() { *t = Transformer{} }
|
||||||
|
|
||||||
|
// Transform implements transform.Transformer. This Transformer has state and
|
||||||
|
// needs to be reset between uses.
|
||||||
|
func (t *Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||||
|
if len(dst) < len(src) {
|
||||||
|
src = src[:len(dst)]
|
||||||
|
atEOF = false
|
||||||
|
err = transform.ErrShortDst
|
||||||
|
}
|
||||||
|
n, err1 := t.Span(src, atEOF)
|
||||||
|
copy(dst, src[:n])
|
||||||
|
if err == nil || err1 != nil && err1 != transform.ErrShortSrc {
|
||||||
|
err = err1
|
||||||
|
}
|
||||||
|
return n, n, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Span returns the first n bytes of src that conform to the Bidi rule.
|
||||||
|
func (t *Transformer) Span(src []byte, atEOF bool) (n int, err error) {
|
||||||
|
if t.state == ruleInvalid && t.isRTL() {
|
||||||
|
return 0, ErrInvalid
|
||||||
|
}
|
||||||
|
n, ok := t.advance(src)
|
||||||
|
switch {
|
||||||
|
case !ok:
|
||||||
|
err = ErrInvalid
|
||||||
|
case n < len(src):
|
||||||
|
if !atEOF {
|
||||||
|
err = transform.ErrShortSrc
|
||||||
|
break
|
||||||
|
}
|
||||||
|
err = ErrInvalid
|
||||||
|
case !t.isFinal():
|
||||||
|
err = ErrInvalid
|
||||||
|
}
|
||||||
|
return n, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Precomputing the ASCII values decreases running time for the ASCII fast path
|
||||||
|
// by about 30%.
|
||||||
|
var asciiTable [128]bidi.Properties
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
for i := range asciiTable {
|
||||||
|
p, _ := bidi.LookupRune(rune(i))
|
||||||
|
asciiTable[i] = p
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (t *Transformer) advance(s []byte) (n int, ok bool) {
|
||||||
|
var e bidi.Properties
|
||||||
|
var sz int
|
||||||
|
for n < len(s) {
|
||||||
|
if s[n] < utf8.RuneSelf {
|
||||||
|
e, sz = asciiTable[s[n]], 1
|
||||||
|
} else {
|
||||||
|
e, sz = bidi.Lookup(s[n:])
|
||||||
|
if sz <= 1 {
|
||||||
|
if sz == 1 {
|
||||||
|
// We always consider invalid UTF-8 to be invalid, even if
|
||||||
|
// the string has not yet been determined to be RTL.
|
||||||
|
// TODO: is this correct?
|
||||||
|
return n, false
|
||||||
|
}
|
||||||
|
return n, true // incomplete UTF-8 encoding
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// TODO: using CompactClass would result in noticeable speedup.
|
||||||
|
// See unicode/bidi/prop.go:Properties.CompactClass.
|
||||||
|
c := uint16(1 << e.Class())
|
||||||
|
t.seen |= c
|
||||||
|
if t.seen&exclusiveRTL == exclusiveRTL {
|
||||||
|
t.state = ruleInvalid
|
||||||
|
return n, false
|
||||||
|
}
|
||||||
|
switch tr := transitions[t.state]; {
|
||||||
|
case tr[0].mask&c != 0:
|
||||||
|
t.state = tr[0].next
|
||||||
|
case tr[1].mask&c != 0:
|
||||||
|
t.state = tr[1].next
|
||||||
|
default:
|
||||||
|
t.state = ruleInvalid
|
||||||
|
if t.isRTL() {
|
||||||
|
return n, false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
n += sz
|
||||||
|
}
|
||||||
|
return n, true
|
||||||
|
}
|
||||||
|
|
||||||
|
func (t *Transformer) advanceString(s string) (n int, ok bool) {
|
||||||
|
var e bidi.Properties
|
||||||
|
var sz int
|
||||||
|
for n < len(s) {
|
||||||
|
if s[n] < utf8.RuneSelf {
|
||||||
|
e, sz = asciiTable[s[n]], 1
|
||||||
|
} else {
|
||||||
|
e, sz = bidi.LookupString(s[n:])
|
||||||
|
if sz <= 1 {
|
||||||
|
if sz == 1 {
|
||||||
|
return n, false // invalid UTF-8
|
||||||
|
}
|
||||||
|
return n, true // incomplete UTF-8 encoding
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// TODO: using CompactClass results in noticeable speedup.
|
||||||
|
// See unicode/bidi/prop.go:Properties.CompactClass.
|
||||||
|
c := uint16(1 << e.Class())
|
||||||
|
t.seen |= c
|
||||||
|
if t.seen&exclusiveRTL == exclusiveRTL {
|
||||||
|
t.state = ruleInvalid
|
||||||
|
return n, false
|
||||||
|
}
|
||||||
|
switch tr := transitions[t.state]; {
|
||||||
|
case tr[0].mask&c != 0:
|
||||||
|
t.state = tr[0].next
|
||||||
|
case tr[1].mask&c != 0:
|
||||||
|
t.state = tr[1].next
|
||||||
|
default:
|
||||||
|
t.state = ruleInvalid
|
||||||
|
if t.isRTL() {
|
||||||
|
return n, false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
n += sz
|
||||||
|
}
|
||||||
|
return n, true
|
||||||
|
}
|
|
@ -0,0 +1,198 @@
|
||||||
|
// Copyright 2015 The Go Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
//go:generate go run gen.go gen_trieval.go gen_ranges.go
|
||||||
|
|
||||||
|
// Package bidi contains functionality for bidirectional text support.
|
||||||
|
//
|
||||||
|
// See http://www.unicode.org/reports/tr9.
|
||||||
|
//
|
||||||
|
// NOTE: UNDER CONSTRUCTION. This API may change in backwards incompatible ways
|
||||||
|
// and without notice.
|
||||||
|
package bidi // import "golang.org/x/text/unicode/bidi"
|
||||||
|
|
||||||
|
// TODO:
|
||||||
|
// The following functionality would not be hard to implement, but hinges on
|
||||||
|
// the definition of a Segmenter interface. For now this is up to the user.
|
||||||
|
// - Iterate over paragraphs
|
||||||
|
// - Segmenter to iterate over runs directly from a given text.
|
||||||
|
// Also:
|
||||||
|
// - Transformer for reordering?
|
||||||
|
// - Transformer (validator, really) for Bidi Rule.
|
||||||
|
|
||||||
|
// This API tries to avoid dealing with embedding levels for now. Under the hood
|
||||||
|
// these will be computed, but the question is to which extent the user should
|
||||||
|
// know they exist. We should at some point allow the user to specify an
|
||||||
|
// embedding hierarchy, though.
|
||||||
|
|
||||||
|
// A Direction indicates the overall flow of text.
|
||||||
|
type Direction int
|
||||||
|
|
||||||
|
const (
|
||||||
|
// LeftToRight indicates the text contains no right-to-left characters and
|
||||||
|
// that either there are some left-to-right characters or the option
|
||||||
|
// DefaultDirection(LeftToRight) was passed.
|
||||||
|
LeftToRight Direction = iota
|
||||||
|
|
||||||
|
// RightToLeft indicates the text contains no left-to-right characters and
|
||||||
|
// that either there are some right-to-left characters or the option
|
||||||
|
// DefaultDirection(RightToLeft) was passed.
|
||||||
|
RightToLeft
|
||||||
|
|
||||||
|
// Mixed indicates text contains both left-to-right and right-to-left
|
||||||
|
// characters.
|
||||||
|
Mixed
|
||||||
|
|
||||||
|
// Neutral means that text contains no left-to-right and right-to-left
|
||||||
|
// characters and that no default direction has been set.
|
||||||
|
Neutral
|
||||||
|
)
|
||||||
|
|
||||||
|
type options struct{}
|
||||||
|
|
||||||
|
// An Option is an option for Bidi processing.
|
||||||
|
type Option func(*options)
|
||||||
|
|
||||||
|
// ICU allows the user to define embedding levels. This may be used, for example,
|
||||||
|
// to use hierarchical structure of markup languages to define embeddings.
|
||||||
|
// The following option may be a way to expose this functionality in this API.
|
||||||
|
// // LevelFunc sets a function that associates nesting levels with the given text.
|
||||||
|
// // The levels function will be called with monotonically increasing values for p.
|
||||||
|
// func LevelFunc(levels func(p int) int) Option {
|
||||||
|
// panic("unimplemented")
|
||||||
|
// }
|
||||||
|
|
||||||
|
// DefaultDirection sets the default direction for a Paragraph. The direction is
|
||||||
|
// overridden if the text contains directional characters.
|
||||||
|
func DefaultDirection(d Direction) Option {
|
||||||
|
panic("unimplemented")
|
||||||
|
}
|
||||||
|
|
||||||
|
// A Paragraph holds a single Paragraph for Bidi processing.
|
||||||
|
type Paragraph struct {
|
||||||
|
// buffers
|
||||||
|
}
|
||||||
|
|
||||||
|
// SetBytes configures p for the given paragraph text. It replaces text
|
||||||
|
// previously set by SetBytes or SetString. If b contains a paragraph separator
|
||||||
|
// it will only process the first paragraph and report the number of bytes
|
||||||
|
// consumed from b including this separator. Error may be non-nil if options are
|
||||||
|
// given.
|
||||||
|
func (p *Paragraph) SetBytes(b []byte, opts ...Option) (n int, err error) {
|
||||||
|
panic("unimplemented")
|
||||||
|
}
|
||||||
|
|
||||||
|
// SetString configures p for the given paragraph text. It replaces text
|
||||||
|
// previously set by SetBytes or SetString. If b contains a paragraph separator
|
||||||
|
// it will only process the first paragraph and report the number of bytes
|
||||||
|
// consumed from b including this separator. Error may be non-nil if options are
|
||||||
|
// given.
|
||||||
|
func (p *Paragraph) SetString(s string, opts ...Option) (n int, err error) {
|
||||||
|
panic("unimplemented")
|
||||||
|
}
|
||||||
|
|
||||||
|
// IsLeftToRight reports whether the principle direction of rendering for this
|
||||||
|
// paragraphs is left-to-right. If this returns false, the principle direction
|
||||||
|
// of rendering is right-to-left.
|
||||||
|
func (p *Paragraph) IsLeftToRight() bool {
|
||||||
|
panic("unimplemented")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Direction returns the direction of the text of this paragraph.
|
||||||
|
//
|
||||||
|
// The direction may be LeftToRight, RightToLeft, Mixed, or Neutral.
|
||||||
|
func (p *Paragraph) Direction() Direction {
|
||||||
|
panic("unimplemented")
|
||||||
|
}
|
||||||
|
|
||||||
|
// RunAt reports the Run at the given position of the input text.
|
||||||
|
//
|
||||||
|
// This method can be used for computing line breaks on paragraphs.
|
||||||
|
func (p *Paragraph) RunAt(pos int) Run {
|
||||||
|
panic("unimplemented")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Order computes the visual ordering of all the runs in a Paragraph.
|
||||||
|
func (p *Paragraph) Order() (Ordering, error) {
|
||||||
|
panic("unimplemented")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Line computes the visual ordering of runs for a single line starting and
|
||||||
|
// ending at the given positions in the original text.
|
||||||
|
func (p *Paragraph) Line(start, end int) (Ordering, error) {
|
||||||
|
panic("unimplemented")
|
||||||
|
}
|
||||||
|
|
||||||
|
// An Ordering holds the computed visual order of runs of a Paragraph. Calling
|
||||||
|
// SetBytes or SetString on the originating Paragraph invalidates an Ordering.
|
||||||
|
// The methods of an Ordering should only be called by one goroutine at a time.
|
||||||
|
type Ordering struct{}
|
||||||
|
|
||||||
|
// Direction reports the directionality of the runs.
|
||||||
|
//
|
||||||
|
// The direction may be LeftToRight, RightToLeft, Mixed, or Neutral.
|
||||||
|
func (o *Ordering) Direction() Direction {
|
||||||
|
panic("unimplemented")
|
||||||
|
}
|
||||||
|
|
||||||
|
// NumRuns returns the number of runs.
|
||||||
|
func (o *Ordering) NumRuns() int {
|
||||||
|
panic("unimplemented")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Run returns the ith run within the ordering.
|
||||||
|
func (o *Ordering) Run(i int) Run {
|
||||||
|
panic("unimplemented")
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: perhaps with options.
|
||||||
|
// // Reorder creates a reader that reads the runes in visual order per character.
|
||||||
|
// // Modifiers remain after the runes they modify.
|
||||||
|
// func (l *Runs) Reorder() io.Reader {
|
||||||
|
// panic("unimplemented")
|
||||||
|
// }
|
||||||
|
|
||||||
|
// A Run is a continuous sequence of characters of a single direction.
|
||||||
|
type Run struct {
|
||||||
|
}
|
||||||
|
|
||||||
|
// String returns the text of the run in its original order.
|
||||||
|
func (r *Run) String() string {
|
||||||
|
panic("unimplemented")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Bytes returns the text of the run in its original order.
|
||||||
|
func (r *Run) Bytes() []byte {
|
||||||
|
panic("unimplemented")
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: methods for
|
||||||
|
// - Display order
|
||||||
|
// - headers and footers
|
||||||
|
// - bracket replacement.
|
||||||
|
|
||||||
|
// Direction reports the direction of the run.
|
||||||
|
func (r *Run) Direction() Direction {
|
||||||
|
panic("unimplemented")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Position of the Run within the text passed to SetBytes or SetString of the
|
||||||
|
// originating Paragraph value.
|
||||||
|
func (r *Run) Pos() (start, end int) {
|
||||||
|
panic("unimplemented")
|
||||||
|
}
|
||||||
|
|
||||||
|
// AppendReverse reverses the order of characters of in, appends them to out,
|
||||||
|
// and returns the result. Modifiers will still follow the runes they modify.
|
||||||
|
// Brackets are replaced with their counterparts.
|
||||||
|
func AppendReverse(out, in []byte) []byte {
|
||||||
|
panic("unimplemented")
|
||||||
|
}
|
||||||
|
|
||||||
|
// ReverseString reverses the order of characters in s and returns a new string.
|
||||||
|
// Modifiers will still follow the runes they modify. Brackets are replaced with
|
||||||
|
// their counterparts.
|
||||||
|
func ReverseString(s string) string {
|
||||||
|
panic("unimplemented")
|
||||||
|
}
|
|
@ -0,0 +1,335 @@
|
||||||
|
// Copyright 2015 The Go Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package bidi
|
||||||
|
|
||||||
|
import (
|
||||||
|
"container/list"
|
||||||
|
"fmt"
|
||||||
|
"sort"
|
||||||
|
)
|
||||||
|
|
||||||
|
// This file contains a port of the reference implementation of the
|
||||||
|
// Bidi Parentheses Algorithm:
|
||||||
|
// http://www.unicode.org/Public/PROGRAMS/BidiReferenceJava/BidiPBAReference.java
|
||||||
|
//
|
||||||
|
// The implementation in this file covers definitions BD14-BD16 and rule N0
|
||||||
|
// of UAX#9.
|
||||||
|
//
|
||||||
|
// Some preprocessing is done for each rune before data is passed to this
|
||||||
|
// algorithm:
|
||||||
|
// - opening and closing brackets are identified
|
||||||
|
// - a bracket pair type, like '(' and ')' is assigned a unique identifier that
|
||||||
|
// is identical for the opening and closing bracket. It is left to do these
|
||||||
|
// mappings.
|
||||||
|
// - The BPA algorithm requires that bracket characters that are canonical
|
||||||
|
// equivalents of each other be able to be substituted for each other.
|
||||||
|
// It is the responsibility of the caller to do this canonicalization.
|
||||||
|
//
|
||||||
|
// In implementing BD16, this implementation departs slightly from the "logical"
|
||||||
|
// algorithm defined in UAX#9. In particular, the stack referenced there
|
||||||
|
// supports operations that go beyond a "basic" stack. An equivalent
|
||||||
|
// implementation based on a linked list is used here.
|
||||||
|
|
||||||
|
// Bidi_Paired_Bracket_Type
|
||||||
|
// BD14. An opening paired bracket is a character whose
|
||||||
|
// Bidi_Paired_Bracket_Type property value is Open.
|
||||||
|
//
|
||||||
|
// BD15. A closing paired bracket is a character whose
|
||||||
|
// Bidi_Paired_Bracket_Type property value is Close.
|
||||||
|
type bracketType byte
|
||||||
|
|
||||||
|
const (
|
||||||
|
bpNone bracketType = iota
|
||||||
|
bpOpen
|
||||||
|
bpClose
|
||||||
|
)
|
||||||
|
|
||||||
|
// bracketPair holds a pair of index values for opening and closing bracket
|
||||||
|
// location of a bracket pair.
|
||||||
|
type bracketPair struct {
|
||||||
|
opener int
|
||||||
|
closer int
|
||||||
|
}
|
||||||
|
|
||||||
|
func (b *bracketPair) String() string {
|
||||||
|
return fmt.Sprintf("(%v, %v)", b.opener, b.closer)
|
||||||
|
}
|
||||||
|
|
||||||
|
// bracketPairs is a slice of bracketPairs with a sort.Interface implementation.
|
||||||
|
type bracketPairs []bracketPair
|
||||||
|
|
||||||
|
func (b bracketPairs) Len() int { return len(b) }
|
||||||
|
func (b bracketPairs) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
|
||||||
|
func (b bracketPairs) Less(i, j int) bool { return b[i].opener < b[j].opener }
|
||||||
|
|
||||||
|
// resolvePairedBrackets runs the paired bracket part of the UBA algorithm.
|
||||||
|
//
|
||||||
|
// For each rune, it takes the indexes into the original string, the class the
|
||||||
|
// bracket type (in pairTypes) and the bracket identifier (pairValues). It also
|
||||||
|
// takes the direction type for the start-of-sentence and the embedding level.
|
||||||
|
//
|
||||||
|
// The identifiers for bracket types are the rune of the canonicalized opening
|
||||||
|
// bracket for brackets (open or close) or 0 for runes that are not brackets.
|
||||||
|
func resolvePairedBrackets(s *isolatingRunSequence) {
|
||||||
|
p := bracketPairer{
|
||||||
|
sos: s.sos,
|
||||||
|
openers: list.New(),
|
||||||
|
codesIsolatedRun: s.types,
|
||||||
|
indexes: s.indexes,
|
||||||
|
}
|
||||||
|
dirEmbed := L
|
||||||
|
if s.level&1 != 0 {
|
||||||
|
dirEmbed = R
|
||||||
|
}
|
||||||
|
p.locateBrackets(s.p.pairTypes, s.p.pairValues)
|
||||||
|
p.resolveBrackets(dirEmbed, s.p.initialTypes)
|
||||||
|
}
|
||||||
|
|
||||||
|
type bracketPairer struct {
|
||||||
|
sos Class // direction corresponding to start of sequence
|
||||||
|
|
||||||
|
// The following is a restatement of BD 16 using non-algorithmic language.
|
||||||
|
//
|
||||||
|
// A bracket pair is a pair of characters consisting of an opening
|
||||||
|
// paired bracket and a closing paired bracket such that the
|
||||||
|
// Bidi_Paired_Bracket property value of the former equals the latter,
|
||||||
|
// subject to the following constraints.
|
||||||
|
// - both characters of a pair occur in the same isolating run sequence
|
||||||
|
// - the closing character of a pair follows the opening character
|
||||||
|
// - any bracket character can belong at most to one pair, the earliest possible one
|
||||||
|
// - any bracket character not part of a pair is treated like an ordinary character
|
||||||
|
// - pairs may nest properly, but their spans may not overlap otherwise
|
||||||
|
|
||||||
|
// Bracket characters with canonical decompositions are supposed to be
|
||||||
|
// treated as if they had been normalized, to allow normalized and non-
|
||||||
|
// normalized text to give the same result. In this implementation that step
|
||||||
|
// is pushed out to the caller. The caller has to ensure that the pairValue
|
||||||
|
// slices contain the rune of the opening bracket after normalization for
|
||||||
|
// any opening or closing bracket.
|
||||||
|
|
||||||
|
openers *list.List // list of positions for opening brackets
|
||||||
|
|
||||||
|
// bracket pair positions sorted by location of opening bracket
|
||||||
|
pairPositions bracketPairs
|
||||||
|
|
||||||
|
codesIsolatedRun []Class // directional bidi codes for an isolated run
|
||||||
|
indexes []int // array of index values into the original string
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// matchOpener reports whether characters at given positions form a matching
|
||||||
|
// bracket pair.
|
||||||
|
func (p *bracketPairer) matchOpener(pairValues []rune, opener, closer int) bool {
|
||||||
|
return pairValues[p.indexes[opener]] == pairValues[p.indexes[closer]]
|
||||||
|
}
|
||||||
|
|
||||||
|
const maxPairingDepth = 63
|
||||||
|
|
||||||
|
// locateBrackets locates matching bracket pairs according to BD16.
|
||||||
|
//
|
||||||
|
// This implementation uses a linked list instead of a stack, because, while
|
||||||
|
// elements are added at the front (like a push) they are not generally removed
|
||||||
|
// in atomic 'pop' operations, reducing the benefit of the stack archetype.
|
||||||
|
func (p *bracketPairer) locateBrackets(pairTypes []bracketType, pairValues []rune) {
|
||||||
|
// traverse the run
|
||||||
|
// do that explicitly (not in a for-each) so we can record position
|
||||||
|
for i, index := range p.indexes {
|
||||||
|
|
||||||
|
// look at the bracket type for each character
|
||||||
|
if pairTypes[index] == bpNone || p.codesIsolatedRun[i] != ON {
|
||||||
|
// continue scanning
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
switch pairTypes[index] {
|
||||||
|
case bpOpen:
|
||||||
|
// check if maximum pairing depth reached
|
||||||
|
if p.openers.Len() == maxPairingDepth {
|
||||||
|
p.openers.Init()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
// remember opener location, most recent first
|
||||||
|
p.openers.PushFront(i)
|
||||||
|
|
||||||
|
case bpClose:
|
||||||
|
// see if there is a match
|
||||||
|
count := 0
|
||||||
|
for elem := p.openers.Front(); elem != nil; elem = elem.Next() {
|
||||||
|
count++
|
||||||
|
opener := elem.Value.(int)
|
||||||
|
if p.matchOpener(pairValues, opener, i) {
|
||||||
|
// if the opener matches, add nested pair to the ordered list
|
||||||
|
p.pairPositions = append(p.pairPositions, bracketPair{opener, i})
|
||||||
|
// remove up to and including matched opener
|
||||||
|
for ; count > 0; count-- {
|
||||||
|
p.openers.Remove(p.openers.Front())
|
||||||
|
}
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
sort.Sort(p.pairPositions)
|
||||||
|
// if we get here, the closing bracket matched no openers
|
||||||
|
// and gets ignored
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Bracket pairs within an isolating run sequence are processed as units so
|
||||||
|
// that both the opening and the closing paired bracket in a pair resolve to
|
||||||
|
// the same direction.
|
||||||
|
//
|
||||||
|
// N0. Process bracket pairs in an isolating run sequence sequentially in
|
||||||
|
// the logical order of the text positions of the opening paired brackets
|
||||||
|
// using the logic given below. Within this scope, bidirectional types EN
|
||||||
|
// and AN are treated as R.
|
||||||
|
//
|
||||||
|
// Identify the bracket pairs in the current isolating run sequence
|
||||||
|
// according to BD16. For each bracket-pair element in the list of pairs of
|
||||||
|
// text positions:
|
||||||
|
//
|
||||||
|
// a Inspect the bidirectional types of the characters enclosed within the
|
||||||
|
// bracket pair.
|
||||||
|
//
|
||||||
|
// b If any strong type (either L or R) matching the embedding direction is
|
||||||
|
// found, set the type for both brackets in the pair to match the embedding
|
||||||
|
// direction.
|
||||||
|
//
|
||||||
|
// o [ e ] o -> o e e e o
|
||||||
|
//
|
||||||
|
// o [ o e ] -> o e o e e
|
||||||
|
//
|
||||||
|
// o [ NI e ] -> o e NI e e
|
||||||
|
//
|
||||||
|
// c Otherwise, if a strong type (opposite the embedding direction) is
|
||||||
|
// found, test for adjacent strong types as follows: 1 First, check
|
||||||
|
// backwards before the opening paired bracket until the first strong type
|
||||||
|
// (L, R, or sos) is found. If that first preceding strong type is opposite
|
||||||
|
// the embedding direction, then set the type for both brackets in the pair
|
||||||
|
// to that type. 2 Otherwise, set the type for both brackets in the pair to
|
||||||
|
// the embedding direction.
|
||||||
|
//
|
||||||
|
// o [ o ] e -> o o o o e
|
||||||
|
//
|
||||||
|
// o [ o NI ] o -> o o o NI o o
|
||||||
|
//
|
||||||
|
// e [ o ] o -> e e o e o
|
||||||
|
//
|
||||||
|
// e [ o ] e -> e e o e e
|
||||||
|
//
|
||||||
|
// e ( o [ o ] NI ) e -> e e o o o o NI e e
|
||||||
|
//
|
||||||
|
// d Otherwise, do not set the type for the current bracket pair. Note that
|
||||||
|
// if the enclosed text contains no strong types the paired brackets will
|
||||||
|
// both resolve to the same level when resolved individually using rules N1
|
||||||
|
// and N2.
|
||||||
|
//
|
||||||
|
// e ( NI ) o -> e ( NI ) o
|
||||||
|
|
||||||
|
// getStrongTypeN0 maps character's directional code to strong type as required
|
||||||
|
// by rule N0.
|
||||||
|
//
|
||||||
|
// TODO: have separate type for "strong" directionality.
|
||||||
|
func (p *bracketPairer) getStrongTypeN0(index int) Class {
|
||||||
|
switch p.codesIsolatedRun[index] {
|
||||||
|
// in the scope of N0, number types are treated as R
|
||||||
|
case EN, AN, AL, R:
|
||||||
|
return R
|
||||||
|
case L:
|
||||||
|
return L
|
||||||
|
default:
|
||||||
|
return ON
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// classifyPairContent reports the strong types contained inside a Bracket Pair,
|
||||||
|
// assuming the given embedding direction.
|
||||||
|
//
|
||||||
|
// It returns ON if no strong type is found. If a single strong type is found,
|
||||||
|
// it returns this this type. Otherwise it returns the embedding direction.
|
||||||
|
//
|
||||||
|
// TODO: use separate type for "strong" directionality.
|
||||||
|
func (p *bracketPairer) classifyPairContent(loc bracketPair, dirEmbed Class) Class {
|
||||||
|
dirOpposite := ON
|
||||||
|
for i := loc.opener + 1; i < loc.closer; i++ {
|
||||||
|
dir := p.getStrongTypeN0(i)
|
||||||
|
if dir == ON {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if dir == dirEmbed {
|
||||||
|
return dir // type matching embedding direction found
|
||||||
|
}
|
||||||
|
dirOpposite = dir
|
||||||
|
}
|
||||||
|
// return ON if no strong type found, or class opposite to dirEmbed
|
||||||
|
return dirOpposite
|
||||||
|
}
|
||||||
|
|
||||||
|
// classBeforePair determines which strong types are present before a Bracket
|
||||||
|
// Pair. Return R or L if strong type found, otherwise ON.
|
||||||
|
func (p *bracketPairer) classBeforePair(loc bracketPair) Class {
|
||||||
|
for i := loc.opener - 1; i >= 0; i-- {
|
||||||
|
if dir := p.getStrongTypeN0(i); dir != ON {
|
||||||
|
return dir
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// no strong types found, return sos
|
||||||
|
return p.sos
|
||||||
|
}
|
||||||
|
|
||||||
|
// assignBracketType implements rule N0 for a single bracket pair.
|
||||||
|
func (p *bracketPairer) assignBracketType(loc bracketPair, dirEmbed Class, initialTypes []Class) {
|
||||||
|
// rule "N0, a", inspect contents of pair
|
||||||
|
dirPair := p.classifyPairContent(loc, dirEmbed)
|
||||||
|
|
||||||
|
// dirPair is now L, R, or N (no strong type found)
|
||||||
|
|
||||||
|
// the following logical tests are performed out of order compared to
|
||||||
|
// the statement of the rules but yield the same results
|
||||||
|
if dirPair == ON {
|
||||||
|
return // case "d" - nothing to do
|
||||||
|
}
|
||||||
|
|
||||||
|
if dirPair != dirEmbed {
|
||||||
|
// case "c": strong type found, opposite - check before (c.1)
|
||||||
|
dirPair = p.classBeforePair(loc)
|
||||||
|
if dirPair == dirEmbed || dirPair == ON {
|
||||||
|
// no strong opposite type found before - use embedding (c.2)
|
||||||
|
dirPair = dirEmbed
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// else: case "b", strong type found matching embedding,
|
||||||
|
// no explicit action needed, as dirPair is already set to embedding
|
||||||
|
// direction
|
||||||
|
|
||||||
|
// set the bracket types to the type found
|
||||||
|
p.setBracketsToType(loc, dirPair, initialTypes)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (p *bracketPairer) setBracketsToType(loc bracketPair, dirPair Class, initialTypes []Class) {
|
||||||
|
p.codesIsolatedRun[loc.opener] = dirPair
|
||||||
|
p.codesIsolatedRun[loc.closer] = dirPair
|
||||||
|
|
||||||
|
for i := loc.opener + 1; i < loc.closer; i++ {
|
||||||
|
index := p.indexes[i]
|
||||||
|
if initialTypes[index] != NSM {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
p.codesIsolatedRun[i] = dirPair
|
||||||
|
}
|
||||||
|
|
||||||
|
for i := loc.closer + 1; i < len(p.indexes); i++ {
|
||||||
|
index := p.indexes[i]
|
||||||
|
if initialTypes[index] != NSM {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
p.codesIsolatedRun[i] = dirPair
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// resolveBrackets implements rule N0 for a list of pairs.
|
||||||
|
func (p *bracketPairer) resolveBrackets(dirEmbed Class, initialTypes []Class) {
|
||||||
|
for _, loc := range p.pairPositions {
|
||||||
|
p.assignBracketType(loc, dirEmbed, initialTypes)
|
||||||
|
}
|
||||||
|
}
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,206 @@
|
||||||
|
// Copyright 2016 The Go Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package bidi
|
||||||
|
|
||||||
|
import "unicode/utf8"
|
||||||
|
|
||||||
|
// Properties provides access to BiDi properties of runes.
|
||||||
|
type Properties struct {
|
||||||
|
entry uint8
|
||||||
|
last uint8
|
||||||
|
}
|
||||||
|
|
||||||
|
var trie = newBidiTrie(0)
|
||||||
|
|
||||||
|
// TODO: using this for bidirule reduces the running time by about 5%. Consider
|
||||||
|
// if this is worth exposing or if we can find a way to speed up the Class
|
||||||
|
// method.
|
||||||
|
//
|
||||||
|
// // CompactClass is like Class, but maps all of the BiDi control classes
|
||||||
|
// // (LRO, RLO, LRE, RLE, PDF, LRI, RLI, FSI, PDI) to the class Control.
|
||||||
|
// func (p Properties) CompactClass() Class {
|
||||||
|
// return Class(p.entry & 0x0F)
|
||||||
|
// }
|
||||||
|
|
||||||
|
// Class returns the Bidi class for p.
|
||||||
|
func (p Properties) Class() Class {
|
||||||
|
c := Class(p.entry & 0x0F)
|
||||||
|
if c == Control {
|
||||||
|
c = controlByteToClass[p.last&0xF]
|
||||||
|
}
|
||||||
|
return c
|
||||||
|
}
|
||||||
|
|
||||||
|
// IsBracket reports whether the rune is a bracket.
|
||||||
|
func (p Properties) IsBracket() bool { return p.entry&0xF0 != 0 }
|
||||||
|
|
||||||
|
// IsOpeningBracket reports whether the rune is an opening bracket.
|
||||||
|
// IsBracket must return true.
|
||||||
|
func (p Properties) IsOpeningBracket() bool { return p.entry&openMask != 0 }
|
||||||
|
|
||||||
|
// TODO: find a better API and expose.
|
||||||
|
func (p Properties) reverseBracket(r rune) rune {
|
||||||
|
return xorMasks[p.entry>>xorMaskShift] ^ r
|
||||||
|
}
|
||||||
|
|
||||||
|
var controlByteToClass = [16]Class{
|
||||||
|
0xD: LRO, // U+202D LeftToRightOverride,
|
||||||
|
0xE: RLO, // U+202E RightToLeftOverride,
|
||||||
|
0xA: LRE, // U+202A LeftToRightEmbedding,
|
||||||
|
0xB: RLE, // U+202B RightToLeftEmbedding,
|
||||||
|
0xC: PDF, // U+202C PopDirectionalFormat,
|
||||||
|
0x6: LRI, // U+2066 LeftToRightIsolate,
|
||||||
|
0x7: RLI, // U+2067 RightToLeftIsolate,
|
||||||
|
0x8: FSI, // U+2068 FirstStrongIsolate,
|
||||||
|
0x9: PDI, // U+2069 PopDirectionalIsolate,
|
||||||
|
}
|
||||||
|
|
||||||
|
// LookupRune returns properties for r.
|
||||||
|
func LookupRune(r rune) (p Properties, size int) {
|
||||||
|
var buf [4]byte
|
||||||
|
n := utf8.EncodeRune(buf[:], r)
|
||||||
|
return Lookup(buf[:n])
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: these lookup methods are based on the generated trie code. The returned
|
||||||
|
// sizes have slightly different semantics from the generated code, in that it
|
||||||
|
// always returns size==1 for an illegal UTF-8 byte (instead of the length
|
||||||
|
// of the maximum invalid subsequence). Most Transformers, like unicode/norm,
|
||||||
|
// leave invalid UTF-8 untouched, in which case it has performance benefits to
|
||||||
|
// do so (without changing the semantics). Bidi requires the semantics used here
|
||||||
|
// for the bidirule implementation to be compatible with the Go semantics.
|
||||||
|
// They ultimately should perhaps be adopted by all trie implementations, for
|
||||||
|
// convenience sake.
|
||||||
|
// This unrolled code also boosts performance of the secure/bidirule package by
|
||||||
|
// about 30%.
|
||||||
|
// So, to remove this code:
|
||||||
|
// - add option to trie generator to define return type.
|
||||||
|
// - always return 1 byte size for ill-formed UTF-8 runes.
|
||||||
|
|
||||||
|
// Lookup returns properties for the first rune in s and the width in bytes of
|
||||||
|
// its encoding. The size will be 0 if s does not hold enough bytes to complete
|
||||||
|
// the encoding.
|
||||||
|
func Lookup(s []byte) (p Properties, sz int) {
|
||||||
|
c0 := s[0]
|
||||||
|
switch {
|
||||||
|
case c0 < 0x80: // is ASCII
|
||||||
|
return Properties{entry: bidiValues[c0]}, 1
|
||||||
|
case c0 < 0xC2:
|
||||||
|
return Properties{}, 1
|
||||||
|
case c0 < 0xE0: // 2-byte UTF-8
|
||||||
|
if len(s) < 2 {
|
||||||
|
return Properties{}, 0
|
||||||
|
}
|
||||||
|
i := bidiIndex[c0]
|
||||||
|
c1 := s[1]
|
||||||
|
if c1 < 0x80 || 0xC0 <= c1 {
|
||||||
|
return Properties{}, 1
|
||||||
|
}
|
||||||
|
return Properties{entry: trie.lookupValue(uint32(i), c1)}, 2
|
||||||
|
case c0 < 0xF0: // 3-byte UTF-8
|
||||||
|
if len(s) < 3 {
|
||||||
|
return Properties{}, 0
|
||||||
|
}
|
||||||
|
i := bidiIndex[c0]
|
||||||
|
c1 := s[1]
|
||||||
|
if c1 < 0x80 || 0xC0 <= c1 {
|
||||||
|
return Properties{}, 1
|
||||||
|
}
|
||||||
|
o := uint32(i)<<6 + uint32(c1)
|
||||||
|
i = bidiIndex[o]
|
||||||
|
c2 := s[2]
|
||||||
|
if c2 < 0x80 || 0xC0 <= c2 {
|
||||||
|
return Properties{}, 1
|
||||||
|
}
|
||||||
|
return Properties{entry: trie.lookupValue(uint32(i), c2), last: c2}, 3
|
||||||
|
case c0 < 0xF8: // 4-byte UTF-8
|
||||||
|
if len(s) < 4 {
|
||||||
|
return Properties{}, 0
|
||||||
|
}
|
||||||
|
i := bidiIndex[c0]
|
||||||
|
c1 := s[1]
|
||||||
|
if c1 < 0x80 || 0xC0 <= c1 {
|
||||||
|
return Properties{}, 1
|
||||||
|
}
|
||||||
|
o := uint32(i)<<6 + uint32(c1)
|
||||||
|
i = bidiIndex[o]
|
||||||
|
c2 := s[2]
|
||||||
|
if c2 < 0x80 || 0xC0 <= c2 {
|
||||||
|
return Properties{}, 1
|
||||||
|
}
|
||||||
|
o = uint32(i)<<6 + uint32(c2)
|
||||||
|
i = bidiIndex[o]
|
||||||
|
c3 := s[3]
|
||||||
|
if c3 < 0x80 || 0xC0 <= c3 {
|
||||||
|
return Properties{}, 1
|
||||||
|
}
|
||||||
|
return Properties{entry: trie.lookupValue(uint32(i), c3)}, 4
|
||||||
|
}
|
||||||
|
// Illegal rune
|
||||||
|
return Properties{}, 1
|
||||||
|
}
|
||||||
|
|
||||||
|
// LookupString returns properties for the first rune in s and the width in
|
||||||
|
// bytes of its encoding. The size will be 0 if s does not hold enough bytes to
|
||||||
|
// complete the encoding.
|
||||||
|
func LookupString(s string) (p Properties, sz int) {
|
||||||
|
c0 := s[0]
|
||||||
|
switch {
|
||||||
|
case c0 < 0x80: // is ASCII
|
||||||
|
return Properties{entry: bidiValues[c0]}, 1
|
||||||
|
case c0 < 0xC2:
|
||||||
|
return Properties{}, 1
|
||||||
|
case c0 < 0xE0: // 2-byte UTF-8
|
||||||
|
if len(s) < 2 {
|
||||||
|
return Properties{}, 0
|
||||||
|
}
|
||||||
|
i := bidiIndex[c0]
|
||||||
|
c1 := s[1]
|
||||||
|
if c1 < 0x80 || 0xC0 <= c1 {
|
||||||
|
return Properties{}, 1
|
||||||
|
}
|
||||||
|
return Properties{entry: trie.lookupValue(uint32(i), c1)}, 2
|
||||||
|
case c0 < 0xF0: // 3-byte UTF-8
|
||||||
|
if len(s) < 3 {
|
||||||
|
return Properties{}, 0
|
||||||
|
}
|
||||||
|
i := bidiIndex[c0]
|
||||||
|
c1 := s[1]
|
||||||
|
if c1 < 0x80 || 0xC0 <= c1 {
|
||||||
|
return Properties{}, 1
|
||||||
|
}
|
||||||
|
o := uint32(i)<<6 + uint32(c1)
|
||||||
|
i = bidiIndex[o]
|
||||||
|
c2 := s[2]
|
||||||
|
if c2 < 0x80 || 0xC0 <= c2 {
|
||||||
|
return Properties{}, 1
|
||||||
|
}
|
||||||
|
return Properties{entry: trie.lookupValue(uint32(i), c2), last: c2}, 3
|
||||||
|
case c0 < 0xF8: // 4-byte UTF-8
|
||||||
|
if len(s) < 4 {
|
||||||
|
return Properties{}, 0
|
||||||
|
}
|
||||||
|
i := bidiIndex[c0]
|
||||||
|
c1 := s[1]
|
||||||
|
if c1 < 0x80 || 0xC0 <= c1 {
|
||||||
|
return Properties{}, 1
|
||||||
|
}
|
||||||
|
o := uint32(i)<<6 + uint32(c1)
|
||||||
|
i = bidiIndex[o]
|
||||||
|
c2 := s[2]
|
||||||
|
if c2 < 0x80 || 0xC0 <= c2 {
|
||||||
|
return Properties{}, 1
|
||||||
|
}
|
||||||
|
o = uint32(i)<<6 + uint32(c2)
|
||||||
|
i = bidiIndex[o]
|
||||||
|
c3 := s[3]
|
||||||
|
if c3 < 0x80 || 0xC0 <= c3 {
|
||||||
|
return Properties{}, 1
|
||||||
|
}
|
||||||
|
return Properties{entry: trie.lookupValue(uint32(i), c3)}, 4
|
||||||
|
}
|
||||||
|
// Illegal rune
|
||||||
|
return Properties{}, 1
|
||||||
|
}
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,60 @@
|
||||||
|
// Code generated by running "go generate" in golang.org/x/text. DO NOT EDIT.
|
||||||
|
|
||||||
|
package bidi
|
||||||
|
|
||||||
|
// Class is the Unicode BiDi class. Each rune has a single class.
|
||||||
|
type Class uint
|
||||||
|
|
||||||
|
const (
|
||||||
|
L Class = iota // LeftToRight
|
||||||
|
R // RightToLeft
|
||||||
|
EN // EuropeanNumber
|
||||||
|
ES // EuropeanSeparator
|
||||||
|
ET // EuropeanTerminator
|
||||||
|
AN // ArabicNumber
|
||||||
|
CS // CommonSeparator
|
||||||
|
B // ParagraphSeparator
|
||||||
|
S // SegmentSeparator
|
||||||
|
WS // WhiteSpace
|
||||||
|
ON // OtherNeutral
|
||||||
|
BN // BoundaryNeutral
|
||||||
|
NSM // NonspacingMark
|
||||||
|
AL // ArabicLetter
|
||||||
|
Control // Control LRO - PDI
|
||||||
|
|
||||||
|
numClass
|
||||||
|
|
||||||
|
LRO // LeftToRightOverride
|
||||||
|
RLO // RightToLeftOverride
|
||||||
|
LRE // LeftToRightEmbedding
|
||||||
|
RLE // RightToLeftEmbedding
|
||||||
|
PDF // PopDirectionalFormat
|
||||||
|
LRI // LeftToRightIsolate
|
||||||
|
RLI // RightToLeftIsolate
|
||||||
|
FSI // FirstStrongIsolate
|
||||||
|
PDI // PopDirectionalIsolate
|
||||||
|
|
||||||
|
unknownClass = ^Class(0)
|
||||||
|
)
|
||||||
|
|
||||||
|
var controlToClass = map[rune]Class{
|
||||||
|
0x202D: LRO, // LeftToRightOverride,
|
||||||
|
0x202E: RLO, // RightToLeftOverride,
|
||||||
|
0x202A: LRE, // LeftToRightEmbedding,
|
||||||
|
0x202B: RLE, // RightToLeftEmbedding,
|
||||||
|
0x202C: PDF, // PopDirectionalFormat,
|
||||||
|
0x2066: LRI, // LeftToRightIsolate,
|
||||||
|
0x2067: RLI, // RightToLeftIsolate,
|
||||||
|
0x2068: FSI, // FirstStrongIsolate,
|
||||||
|
0x2069: PDI, // PopDirectionalIsolate,
|
||||||
|
}
|
||||||
|
|
||||||
|
// A trie entry has the following bits:
|
||||||
|
// 7..5 XOR mask for brackets
|
||||||
|
// 4 1: Bracket open, 0: Bracket close
|
||||||
|
// 3..0 Class type
|
||||||
|
|
||||||
|
const (
|
||||||
|
openMask = 0x10
|
||||||
|
xorMaskShift = 5
|
||||||
|
)
|
|
@ -2208,10 +2208,10 @@
|
||||||
"revisionTime": "2017-08-04T00:04:37Z"
|
"revisionTime": "2017-08-04T00:04:37Z"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"checksumSHA1": "GIGmSrYACByf5JDIP9ByBZksY80=",
|
"checksumSHA1": "1osdKBIU5mNqyQqiGmnutoTzdJA=",
|
||||||
"path": "golang.org/x/net/idna",
|
"path": "golang.org/x/net/idna",
|
||||||
"revision": "4971afdc2f162e82d185353533d3cf16188a9f4e",
|
"revision": "a04bdaca5b32abe1c069418fb7088ae607de5bd0",
|
||||||
"revisionTime": "2016-11-15T21:05:04Z"
|
"revisionTime": "2017-10-03T05:09:24Z"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"checksumSHA1": "/k7k6eJDkxXx6K9Zpo/OwNm58XM=",
|
"checksumSHA1": "/k7k6eJDkxXx6K9Zpo/OwNm58XM=",
|
||||||
|
@ -2370,12 +2370,24 @@
|
||||||
"revision": "e56139fd9c5bc7244c76116c68e500765bb6db6b",
|
"revision": "e56139fd9c5bc7244c76116c68e500765bb6db6b",
|
||||||
"revisionTime": "2017-07-04T19:41:35Z"
|
"revisionTime": "2017-07-04T19:41:35Z"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"checksumSHA1": "tltivJ/uj/lqLk05IqGfCv2F/E8=",
|
||||||
|
"path": "golang.org/x/text/secure/bidirule",
|
||||||
|
"revision": "c01e4764d870b77f8abe5096ee19ad20d80e8075",
|
||||||
|
"revisionTime": "2017-10-09T19:53:40Z"
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"checksumSHA1": "ziMb9+ANGRJSSIuxYdRbA+cDRBQ=",
|
"checksumSHA1": "ziMb9+ANGRJSSIuxYdRbA+cDRBQ=",
|
||||||
"path": "golang.org/x/text/transform",
|
"path": "golang.org/x/text/transform",
|
||||||
"revision": "1cbadb444a806fd9430d14ad08967ed91da4fa0a",
|
"revision": "1cbadb444a806fd9430d14ad08967ed91da4fa0a",
|
||||||
"revisionTime": "2017-09-13T19:45:57Z"
|
"revisionTime": "2017-09-13T19:45:57Z"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"checksumSHA1": "iB6/RoQIzBaZxVi+t7tzbkwZTlo=",
|
||||||
|
"path": "golang.org/x/text/unicode/bidi",
|
||||||
|
"revision": "c01e4764d870b77f8abe5096ee19ad20d80e8075",
|
||||||
|
"revisionTime": "2017-10-09T19:53:40Z"
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"checksumSHA1": "giMB1yxQIKwOLVEjSibt2keNA3k=",
|
"checksumSHA1": "giMB1yxQIKwOLVEjSibt2keNA3k=",
|
||||||
"path": "golang.org/x/text/unicode/cldr",
|
"path": "golang.org/x/text/unicode/cldr",
|
||||||
|
|
Loading…
Reference in New Issue