// Copyright 2019 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package fuzzy import ( "unicode" ) // RuneRole specifies the role of a rune in the context of an input. type RuneRole byte const ( // RNone specifies a rune without any role in the input (i.e., whitespace/non-ASCII). RNone RuneRole = iota // RSep specifies a rune with the role of segment separator. RSep // RTail specifies a rune which is a lower-case tail in a word in the input. RTail // RUCTail specifies a rune which is an upper-case tail in a word in the input. RUCTail // RHead specifies a rune which is the first character in a word in the input. RHead ) // RuneRoles detects the roles of each byte rune in an input string and stores it in the output // slice. The rune role depends on the input type. Stops when it parsed all the runes in the string // or when it filled the output. If output is nil, then it gets created. func RuneRoles(str string, reuse []RuneRole) []RuneRole { var output []RuneRole if cap(reuse) < len(str) { output = make([]RuneRole, 0, len(str)) } else { output = reuse[:0] } prev, prev2 := rtNone, rtNone for i := 0; i < len(str); i++ { r := rune(str[i]) role := RNone curr := rtLower if str[i] <= unicode.MaxASCII { curr = runeType(rt[str[i]] - '0') } if curr == rtLower { if prev == rtNone || prev == rtPunct { role = RHead } else { role = RTail } } else if curr == rtUpper { role = RHead if prev == rtUpper { // This and previous characters are both upper case. if i+1 == len(str) { // This is last character, previous was also uppercase -> this is UCTail // i.e., (current char is C): aBC / BC / ABC role = RUCTail } } } else if curr == rtPunct { switch r { case '.', ':': role = RSep } } if curr != rtLower { if i > 1 && output[i-1] == RHead && prev2 == rtUpper && (output[i-2] == RHead || output[i-2] == RUCTail) { // The previous two characters were uppercase. The current one is not a lower case, so the // previous one can't be a HEAD. Make it a UCTail. // i.e., (last char is current char - B must be a UCTail): ABC / ZABC / AB. output[i-1] = RUCTail } } output = append(output, role) prev2 = prev prev = curr } return output } type runeType byte const ( rtNone runeType = iota rtPunct rtLower rtUpper ) const rt = "00000000000000000000000000000000000000000000001122222222221000000333333333333333333333333330000002222222222222222222222222200000" // LastSegment returns the substring representing the last segment from the input, where each // byte has an associated RuneRole in the roles slice. This makes sense only for inputs of Symbol // or Filename type. func LastSegment(input string, roles []RuneRole) string { // Exclude ending separators. end := len(input) - 1 for end >= 0 && roles[end] == RSep { end-- } if end < 0 { return "" } start := end - 1 for start >= 0 && roles[start] != RSep { start-- } return input[start+1 : end+1] } // ToLower transforms the input string to lower case, which is stored in the output byte slice. // The lower casing considers only ASCII values - non ASCII values are left unmodified. // Stops when parsed all input or when it filled the output slice. If output is nil, then it gets // created. func ToLower(input string, reuse []byte) []byte { output := reuse if cap(reuse) < len(input) { output = make([]byte, len(input)) } for i := 0; i < len(input); i++ { r := rune(input[i]) if r <= unicode.MaxASCII { if 'A' <= r && r <= 'Z' { r += 'a' - 'A' } } output[i] = byte(r) } return output[:len(input)] } // WordConsumer defines a consumer for a word delimited by the [start,end) byte offsets in an input // (start is inclusive, end is exclusive). type WordConsumer func(start, end int) // Words find word delimiters in an input based on its bytes' mappings to rune roles. The offset // delimiters for each word are fed to the provided consumer function. func Words(roles []RuneRole, consume WordConsumer) { var wordStart int for i, r := range roles { switch r { case RUCTail, RTail: case RHead, RNone, RSep: if i != wordStart { consume(wordStart, i) } wordStart = i if r != RHead { // Skip this character. wordStart = i + 1 } } } if wordStart != len(roles) { consume(wordStart, len(roles)) } }