1 // Copyright 2019 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 // Package fuzzy implements a fuzzy matching algorithm.
14 // MaxInputSize is the maximum size of the input scored against the fuzzy matcher. Longer inputs
15 // will be truncated to this size.
17 // MaxPatternSize is the maximum size of the pattern used to construct the fuzzy matcher. Longer
18 // inputs are truncated to this size.
24 func (s scoreVal) val() int {
28 func (s scoreVal) prevK() int {
32 func score(val int, prevK int /*0 or 1*/) scoreVal {
33 return scoreVal(val<<1 + prevK)
36 // Matcher implements a fuzzy matching algorithm for scoring candidates against a pattern.
37 // The matcher does not support parallel usage.
40 patternLower []byte // lower-case version of the pattern
41 patternShort []byte // first characters of the pattern
42 caseSensitive bool // set if the pattern is mix-cased
44 patternRoles []RuneRole // the role of each character in the pattern
45 roles []RuneRole // the role of each character in the tested string
47 scores [MaxInputSize + 1][MaxPatternSize + 1][2]scoreVal
51 lastCandidateLen int // in bytes
52 lastCandidateMatched bool
54 // Here we save the last candidate in lower-case. This is basically a byte slice we reuse for
55 // performance reasons, so the slice is not reallocated for every candidate.
56 lowerBuf [MaxInputSize]byte
57 rolesBuf [MaxInputSize]RuneRole
60 func (m *Matcher) bestK(i, j int) int {
61 if m.scores[i][j][0].val() < m.scores[i][j][1].val() {
67 // NewMatcher returns a new fuzzy matcher for scoring candidates against the provided pattern.
68 func NewMatcher(pattern string) *Matcher {
69 if len(pattern) > MaxPatternSize {
70 pattern = pattern[:MaxPatternSize]
75 patternLower: ToLower(pattern, nil),
78 for i, c := range m.patternLower {
80 m.caseSensitive = true
86 m.patternShort = m.patternLower[:3]
88 m.patternShort = m.patternLower
91 m.patternRoles = RuneRoles(pattern, nil)
95 m.scoreScale = 1 / float32(maxCharScore*len(pattern))
101 // Score returns the score returned by matching the candidate to the pattern.
102 // This is not designed for parallel use. Multiple candidates must be scored sequentially.
103 // Returns a score between 0 and 1 (0 - no match, 1 - perfect match).
104 func (m *Matcher) Score(candidate string) float32 {
105 if len(candidate) > MaxInputSize {
106 candidate = candidate[:MaxInputSize]
108 lower := ToLower(candidate, m.lowerBuf[:])
109 m.lastCandidateLen = len(candidate)
111 if len(m.pattern) == 0 {
112 // Empty patterns perfectly match candidates.
116 if m.match(candidate, lower) {
117 sc := m.computeScore(candidate, lower)
118 if sc > minScore/2 && !m.poorMatch() {
119 m.lastCandidateMatched = true
120 if len(m.pattern) == len(candidate) {
128 normalizedScore := float32(sc) * m.scoreScale
129 if normalizedScore > 1 {
133 return normalizedScore
137 m.lastCandidateMatched = false
141 const minScore = -10000
143 // MatchedRanges returns matches ranges for the last scored string as a flattened array of
144 // [begin, end) byte offset pairs.
145 func (m *Matcher) MatchedRanges() []int {
146 if len(m.pattern) == 0 || !m.lastCandidateMatched {
149 i, j := m.lastCandidateLen, len(m.pattern)
150 if m.scores[i][j][0].val() < minScore/2 && m.scores[i][j][1].val() < minScore/2 {
158 k = m.scores[i][j][k].prevK()
160 if len(ret) == 0 || ret[len(ret)-1] != i {
162 ret = append(ret, i-1)
164 ret[len(ret)-1] = i - 1
171 for i := 0; i < len(ret)/2; i++ {
172 ret[i], ret[len(ret)-1-i] = ret[len(ret)-1-i], ret[i]
177 func (m *Matcher) match(candidate string, candidateLower []byte) bool {
179 for ; i < len(candidateLower) && j < len(m.patternLower); i++ {
180 if candidateLower[i] == m.patternLower[j] {
184 if j != len(m.patternLower) {
188 // The input passes the simple test against pattern, so it is time to classify its characters.
189 // Character roles are used below to find the last segment.
190 m.roles = RuneRoles(candidate, m.rolesBuf[:])
195 func (m *Matcher) computeScore(candidate string, candidateLower []byte) int {
196 pattLen, candLen := len(m.pattern), len(candidate)
198 for j := 0; j <= len(m.pattern); j++ {
199 m.scores[0][j][0] = minScore << 1
200 m.scores[0][j][1] = minScore << 1
202 m.scores[0][0][0] = score(0, 0) // Start with 0.
204 segmentsLeft, lastSegStart := 1, 0
205 for i := 0; i < candLen; i++ {
206 if m.roles[i] == RSep {
212 // A per-character bonus for a consecutive match.
213 consecutiveBonus := 2
214 wordIdx := 0 // Word count within segment.
215 for i := 1; i <= candLen; i++ {
218 isHead := role == RHead
222 } else if role == RSep && segmentsLeft > 1 {
228 if i == 1 || (i-1) == lastSegStart {
229 // Skipping the start of first or last segment.
233 for j := 0; j <= pattLen; j++ {
234 // By default, we don't have a match. Fill in the skip data.
235 m.scores[i][j][1] = minScore << 1
237 // Compute the skip score.
239 if m.scores[i-1][j][0].val() < m.scores[i-1][j][1].val() {
243 skipScore := m.scores[i-1][j][k].val()
244 // Do not penalize missing characters after the last matched segment.
246 skipScore -= skipPenalty
248 m.scores[i][j][0] = score(skipScore, k)
250 if j == 0 || candidateLower[i-1] != m.patternLower[j-1] {
254 pRole := m.patternRoles[j-1]
256 if role == RTail && pRole == RHead {
258 // Not a match: a head in the pattern matches a tail character in the candidate.
261 // Special treatment for the first character of the pattern. We allow
262 // matches in the middle of a word if they are long enough, at least
263 // min(3, pattern.length) characters.
264 if !bytes.HasPrefix(candidateLower[i-1:], m.patternShort) {
269 // Compute the char score.
271 // Bonus 1: the char is in the candidate's last segment.
272 if segmentsLeft <= 1 {
275 // Bonus 2: Case match or a Head in the pattern aligns with one in the word.
276 // Single-case patterns lack segmentation signals and we assume any character
277 // can be a head of a segment.
278 if candidate[i-1] == m.pattern[j-1] || role == RHead && (!m.caseSensitive || pRole == RHead) {
282 // Penalty 1: pattern char is Head, candidate char is Tail.
283 if role == RTail && pRole == RHead {
286 // Penalty 2: first pattern character matched in the middle of a word.
287 if j == 1 && role == RTail {
291 // Third dimension encodes whether there is a gap between the previous match and the current
293 for k := 0; k < 2; k++ {
294 sc := m.scores[i-1][j-1][k].val() + charScore
296 isConsecutive := k == 1 || i-1 == 0 || i-1 == lastSegStart
298 // Bonus 3: a consecutive match. First character match also gets a bonus to
299 // ensure prefix final match score normalizes to 1.0.
300 // Logically, this is a part of charScore, but we have to compute it here because it
301 // only applies for consecutive matches (k == 1).
302 sc += consecutiveBonus
305 // Penalty 3: Matching inside a segment (and previous char wasn't matched). Penalize for the lack
307 if role == RTail || role == RUCTail {
312 if sc > m.scores[i][j][1].val() {
313 m.scores[i][j][1] = score(sc, k)
319 result := m.scores[len(candidate)][len(m.pattern)][m.bestK(len(candidate), len(m.pattern))].val()
324 // ScoreTable returns the score table computed for the provided candidate. Used only for debugging.
325 func (m *Matcher) ScoreTable(candidate string) string {
328 var line1, line2, separator bytes.Buffer
329 line1.WriteString("\t")
330 line2.WriteString("\t")
331 for j := 0; j < len(m.pattern); j++ {
332 line1.WriteString(fmt.Sprintf("%c\t\t", m.pattern[j]))
333 separator.WriteString("----------------")
336 buf.WriteString(line1.String())
337 buf.WriteString("\n")
338 buf.WriteString(separator.String())
339 buf.WriteString("\n")
341 for i := 1; i <= len(candidate); i++ {
345 line1.WriteString(fmt.Sprintf("%c\t", candidate[i-1]))
346 line2.WriteString("\t")
348 for j := 1; j <= len(m.pattern); j++ {
349 line1.WriteString(fmt.Sprintf("M%6d(%c)\t", m.scores[i][j][0].val(), dir(m.scores[i][j][0].prevK())))
350 line2.WriteString(fmt.Sprintf("H%6d(%c)\t", m.scores[i][j][1].val(), dir(m.scores[i][j][1].prevK())))
352 buf.WriteString(line1.String())
353 buf.WriteString("\n")
354 buf.WriteString(line2.String())
355 buf.WriteString("\n")
356 buf.WriteString(separator.String())
357 buf.WriteString("\n")
363 func dir(prevK int) rune {
370 func (m *Matcher) poorMatch() bool {
371 if len(m.pattern) < 2 {
375 i, j := m.lastCandidateLen, len(m.pattern)
381 k = m.scores[i][j][k].prevK()
384 if k == 0 && len < 3 && m.roles[i-1] == RTail {
385 // Short match in the middle of a word