1 // Copyright 2019 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
11 // RuneRole specifies the role of a rune in the context of an input.
15 // RNone specifies a rune without any role in the input (i.e., whitespace/non-ASCII).
17 // RSep specifies a rune with the role of segment separator.
19 // RTail specifies a rune which is a lower-case tail in a word in the input.
21 // RUCTail specifies a rune which is an upper-case tail in a word in the input.
23 // RHead specifies a rune which is the first character in a word in the input.
27 // RuneRoles detects the role of each byte in the input string and stores the roles in the output
28 // slice. The rune role depends on the input type. Stops when it has parsed the whole string or
29 // when it has filled the output. If reuse has less capacity than len(str), a new slice is created.
30 func RuneRoles(str string, reuse []RuneRole) []RuneRole {
32 if cap(reuse) < len(str) {
33 output = make([]RuneRole, 0, len(str))
38 prev, prev2 := rtNone, rtNone
39 for i := 0; i < len(str); i++ {
45 if str[i] <= unicode.MaxASCII {
46 curr = runeType(rt[str[i]] - '0')
50 if prev == rtNone || prev == rtPunct {
55 } else if curr == rtUpper {
59 // This and previous characters are both upper case.
62 // This is the last character and the previous one was also upper case -> this is a UCTail
63 // i.e., (current char is C): aBC / BC / ABC
67 } else if curr == rtPunct {
74 if i > 1 && output[i-1] == RHead && prev2 == rtUpper && (output[i-2] == RHead || output[i-2] == RUCTail) {
75 // The previous two characters were uppercase. The current one is not lower case, so the
76 // previous one can't be a HEAD. Make it a UCTail.
77 // i.e., (last char is current char - B must be a UCTail): ABC / ZABC / AB.
82 output = append(output, role)
92 rtNone runeType = iota
98 const rt = "00000000000000000000000000000000000000000000001122222222221000000333333333333333333333333330000002222222222222222222222222200000"
100 // LastSegment returns the substring representing the last segment from the input, where each
101 // byte has an associated RuneRole in the roles slice. This makes sense only for inputs of Symbol
103 func LastSegment(input string, roles []RuneRole) string {
104 // Exclude ending separators.
105 end := len(input) - 1
106 for end >= 0 && roles[end] == RSep {
114 for start >= 0 && roles[start] != RSep {
118 return input[start+1 : end+1]
121 // ToLower converts the input string to lower case and stores the result in the output byte slice.
122 // The lower casing considers only ASCII values; non-ASCII values are left unmodified.
123 // Stops when it has parsed all input or when it has filled the output slice. If output is nil, then it gets
125 func ToLower(input string, reuse []byte) []byte {
127 if cap(reuse) < len(input) {
128 output = make([]byte, len(input))
131 for i := 0; i < len(input); i++ {
133 if r <= unicode.MaxASCII {
134 if 'A' <= r && r <= 'Z' {
140 return output[:len(input)]
143 // WordConsumer defines a consumer for a word delimited by the [start,end) byte offsets in an input
144 // (start is inclusive, end is exclusive).
145 type WordConsumer func(start, end int)
147 // Words finds word delimiters in an input based on its bytes' mappings to rune roles. The offset
148 // delimiters for each word are fed to the provided consumer function.
149 func Words(roles []RuneRole, consume WordConsumer) {
151 for i, r := range roles {
154 case RHead, RNone, RSep:
156 consume(wordStart, i)
160 // Skip this character.
165 if wordStart != len(roles) {
166 consume(wordStart, len(roles))