// Copyright 2019 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. // Package fuzzy implements a fuzzy matching algorithm. package fuzzy import ( "bytes" "fmt" ) const ( // MaxInputSize is the maximum size of the input scored against the fuzzy matcher. Longer inputs // will be truncated to this size. MaxInputSize = 127 // MaxPatternSize is the maximum size of the pattern used to construct the fuzzy matcher. Longer // inputs are truncated to this size. MaxPatternSize = 63 ) type scoreVal int func (s scoreVal) val() int { return int(s) >> 1 } func (s scoreVal) prevK() int { return int(s) & 1 } func score(val int, prevK int /*0 or 1*/) scoreVal { return scoreVal(val<<1 + prevK) } // Matcher implements a fuzzy matching algorithm for scoring candidates against a pattern. // The matcher does not support parallel usage. type Matcher struct { pattern string patternLower []byte // lower-case version of the pattern patternShort []byte // first characters of the pattern caseSensitive bool // set if the pattern is mix-cased patternRoles []RuneRole // the role of each character in the pattern roles []RuneRole // the role of each character in the tested string scores [MaxInputSize + 1][MaxPatternSize + 1][2]scoreVal scoreScale float32 lastCandidateLen int // in bytes lastCandidateMatched bool // Here we save the last candidate in lower-case. This is basically a byte slice we reuse for // performance reasons, so the slice is not reallocated for every candidate. lowerBuf [MaxInputSize]byte rolesBuf [MaxInputSize]RuneRole } func (m *Matcher) bestK(i, j int) int { if m.scores[i][j][0].val() < m.scores[i][j][1].val() { return 1 } return 0 } // NewMatcher returns a new fuzzy matcher for scoring candidates against the provided pattern. func NewMatcher(pattern string) *Matcher { if len(pattern) > MaxPatternSize { pattern = pattern[:MaxPatternSize] } m := &Matcher{ pattern: pattern, patternLower: ToLower(pattern, nil), } for i, c := range m.patternLower { if pattern[i] != c { m.caseSensitive = true break } } if len(pattern) > 3 { m.patternShort = m.patternLower[:3] } else { m.patternShort = m.patternLower } m.patternRoles = RuneRoles(pattern, nil) if len(pattern) > 0 { maxCharScore := 4 m.scoreScale = 1 / float32(maxCharScore*len(pattern)) } return m } // Score returns the score returned by matching the candidate to the pattern. // This is not designed for parallel use. Multiple candidates must be scored sequentially. // Returns a score between 0 and 1 (0 - no match, 1 - perfect match). func (m *Matcher) Score(candidate string) float32 { if len(candidate) > MaxInputSize { candidate = candidate[:MaxInputSize] } lower := ToLower(candidate, m.lowerBuf[:]) m.lastCandidateLen = len(candidate) if len(m.pattern) == 0 { // Empty patterns perfectly match candidates. return 1 } if m.match(candidate, lower) { sc := m.computeScore(candidate, lower) if sc > minScore/2 && !m.poorMatch() { m.lastCandidateMatched = true if len(m.pattern) == len(candidate) { // Perfect match. return 1 } if sc < 0 { sc = 0 } normalizedScore := float32(sc) * m.scoreScale if normalizedScore > 1 { normalizedScore = 1 } return normalizedScore } } m.lastCandidateMatched = false return 0 } const minScore = -10000 // MatchedRanges returns matches ranges for the last scored string as a flattened array of // [begin, end) byte offset pairs. func (m *Matcher) MatchedRanges() []int { if len(m.pattern) == 0 || !m.lastCandidateMatched { return nil } i, j := m.lastCandidateLen, len(m.pattern) if m.scores[i][j][0].val() < minScore/2 && m.scores[i][j][1].val() < minScore/2 { return nil } var ret []int k := m.bestK(i, j) for i > 0 { take := (k == 1) k = m.scores[i][j][k].prevK() if take { if len(ret) == 0 || ret[len(ret)-1] != i { ret = append(ret, i) ret = append(ret, i-1) } else { ret[len(ret)-1] = i - 1 } j-- } i-- } // Reverse slice. for i := 0; i < len(ret)/2; i++ { ret[i], ret[len(ret)-1-i] = ret[len(ret)-1-i], ret[i] } return ret } func (m *Matcher) match(candidate string, candidateLower []byte) bool { i, j := 0, 0 for ; i < len(candidateLower) && j < len(m.patternLower); i++ { if candidateLower[i] == m.patternLower[j] { j++ } } if j != len(m.patternLower) { return false } // The input passes the simple test against pattern, so it is time to classify its characters. // Character roles are used below to find the last segment. m.roles = RuneRoles(candidate, m.rolesBuf[:]) return true } func (m *Matcher) computeScore(candidate string, candidateLower []byte) int { pattLen, candLen := len(m.pattern), len(candidate) for j := 0; j <= len(m.pattern); j++ { m.scores[0][j][0] = minScore << 1 m.scores[0][j][1] = minScore << 1 } m.scores[0][0][0] = score(0, 0) // Start with 0. segmentsLeft, lastSegStart := 1, 0 for i := 0; i < candLen; i++ { if m.roles[i] == RSep { segmentsLeft++ lastSegStart = i + 1 } } // A per-character bonus for a consecutive match. consecutiveBonus := 2 wordIdx := 0 // Word count within segment. for i := 1; i <= candLen; i++ { role := m.roles[i-1] isHead := role == RHead if isHead { wordIdx++ } else if role == RSep && segmentsLeft > 1 { wordIdx = 0 segmentsLeft-- } var skipPenalty int if i == 1 || (i-1) == lastSegStart { // Skipping the start of first or last segment. skipPenalty++ } for j := 0; j <= pattLen; j++ { // By default, we don't have a match. Fill in the skip data. m.scores[i][j][1] = minScore << 1 // Compute the skip score. k := 0 if m.scores[i-1][j][0].val() < m.scores[i-1][j][1].val() { k = 1 } skipScore := m.scores[i-1][j][k].val() // Do not penalize missing characters after the last matched segment. if j != pattLen { skipScore -= skipPenalty } m.scores[i][j][0] = score(skipScore, k) if j == 0 || candidateLower[i-1] != m.patternLower[j-1] { // Not a match. continue } pRole := m.patternRoles[j-1] if role == RTail && pRole == RHead { if j > 1 { // Not a match: a head in the pattern matches a tail character in the candidate. continue } // Special treatment for the first character of the pattern. We allow // matches in the middle of a word if they are long enough, at least // min(3, pattern.length) characters. if !bytes.HasPrefix(candidateLower[i-1:], m.patternShort) { continue } } // Compute the char score. var charScore int // Bonus 1: the char is in the candidate's last segment. if segmentsLeft <= 1 { charScore++ } // Bonus 2: Case match or a Head in the pattern aligns with one in the word. // Single-case patterns lack segmentation signals and we assume any character // can be a head of a segment. if candidate[i-1] == m.pattern[j-1] || role == RHead && (!m.caseSensitive || pRole == RHead) { charScore++ } // Penalty 1: pattern char is Head, candidate char is Tail. if role == RTail && pRole == RHead { charScore-- } // Penalty 2: first pattern character matched in the middle of a word. if j == 1 && role == RTail { charScore -= 4 } // Third dimension encodes whether there is a gap between the previous match and the current // one. for k := 0; k < 2; k++ { sc := m.scores[i-1][j-1][k].val() + charScore isConsecutive := k == 1 || i-1 == 0 || i-1 == lastSegStart if isConsecutive { // Bonus 3: a consecutive match. First character match also gets a bonus to // ensure prefix final match score normalizes to 1.0. // Logically, this is a part of charScore, but we have to compute it here because it // only applies for consecutive matches (k == 1). sc += consecutiveBonus } if k == 0 { // Penalty 3: Matching inside a segment (and previous char wasn't matched). Penalize for the lack // of alignment. if role == RTail || role == RUCTail { sc -= 3 } } if sc > m.scores[i][j][1].val() { m.scores[i][j][1] = score(sc, k) } } } } result := m.scores[len(candidate)][len(m.pattern)][m.bestK(len(candidate), len(m.pattern))].val() return result } // ScoreTable returns the score table computed for the provided candidate. Used only for debugging. func (m *Matcher) ScoreTable(candidate string) string { var buf bytes.Buffer var line1, line2, separator bytes.Buffer line1.WriteString("\t") line2.WriteString("\t") for j := 0; j < len(m.pattern); j++ { line1.WriteString(fmt.Sprintf("%c\t\t", m.pattern[j])) separator.WriteString("----------------") } buf.WriteString(line1.String()) buf.WriteString("\n") buf.WriteString(separator.String()) buf.WriteString("\n") for i := 1; i <= len(candidate); i++ { line1.Reset() line2.Reset() line1.WriteString(fmt.Sprintf("%c\t", candidate[i-1])) line2.WriteString("\t") for j := 1; j <= len(m.pattern); j++ { line1.WriteString(fmt.Sprintf("M%6d(%c)\t", m.scores[i][j][0].val(), dir(m.scores[i][j][0].prevK()))) line2.WriteString(fmt.Sprintf("H%6d(%c)\t", m.scores[i][j][1].val(), dir(m.scores[i][j][1].prevK()))) } buf.WriteString(line1.String()) buf.WriteString("\n") buf.WriteString(line2.String()) buf.WriteString("\n") buf.WriteString(separator.String()) buf.WriteString("\n") } return buf.String() } func dir(prevK int) rune { if prevK == 0 { return 'M' } return 'H' } func (m *Matcher) poorMatch() bool { if len(m.pattern) < 2 { return false } i, j := m.lastCandidateLen, len(m.pattern) k := m.bestK(i, j) var counter, len int for i > 0 { take := (k == 1) k = m.scores[i][j][k].prevK() if take { len++ if k == 0 && len < 3 && m.roles[i-1] == RTail { // Short match in the middle of a word counter++ if counter > 1 { return true } } j-- } else { len = 0 } i-- } return false }