This commit is contained in:
2026-01-18 18:40:26 +00:00
commit 185e73da47
51 changed files with 14073 additions and 0 deletions
+375
View File
@@ -0,0 +1,375 @@
// Package fuzzy provides fuzzy string matching using Levenshtein distance.
package fuzzy
import (
"sort"
"strings"
"unicode"
)
// Match represents a fuzzy match result.
//
// Values are produced by Matcher.Match, one per accepted candidate.
type Match struct {
// Text is the candidate exactly as it was supplied (original casing).
Text string
// Distance is the Levenshtein distance between the lower-cased query and
// the lower-cased candidate.
Distance int
// Similarity is 1 - Distance/length, where length is the longer of the
// query and candidate lengths.
Similarity float64
// Score is the composite value from calculateScore; Matcher.Match sorts
// its results by Score in descending order.
Score float64
}
// Matcher provides fuzzy matching capabilities.
type Matcher struct {
// threshold is the largest edit distance Match accepts for a candidate
// that does not contain the query as a substring.
threshold int
}
// New returns a Matcher configured with the given threshold.
//
// threshold is the maximum edit distance at which a candidate still counts
// as a match (typically 1-3). Candidates that contain the query as a
// substring are accepted by Match regardless of this limit.
func New(threshold int) *Matcher {
	m := new(Matcher)
	m.threshold = threshold
	return m
}
// Match performs case-insensitive fuzzy matching of query against candidates.
//
// A candidate is accepted when its Levenshtein distance to the query is at
// most the matcher's threshold, or when it contains the query as a substring
// (important for identifiers, where the query is often a fragment of a much
// longer name). Accepted candidates are returned sorted by Score, highest
// first; candidates with equal scores keep their input order.
//
// An empty query returns nil. A query that matches nothing returns an empty,
// non-nil slice.
func (m *Matcher) Match(query string, candidates []string) []Match {
	if query == "" {
		return nil
	}

	queryLower := strings.ToLower(query)
	// The distance is counted in runes, so the normalising length must be
	// too; byte lengths would skew similarity for non-ASCII text.
	queryLen := len([]rune(queryLower))

	matches := make([]Match, 0, len(candidates)/10)
	for _, candidate := range candidates {
		candidateLower := strings.ToLower(candidate)
		dist := levenshteinDistance(queryLower, candidateLower)

		// Over-threshold candidates survive only as substring matches.
		if dist > m.threshold && !strings.Contains(candidateLower, queryLower) {
			continue
		}

		// Similarity in [0.0, 1.0]; queryLen >= 1 here, so maxLen is never 0.
		maxLen := max(queryLen, len([]rune(candidateLower)))
		similarity := 1.0 - float64(dist)/float64(maxLen)

		matches = append(matches, Match{
			Text:       candidate,
			Distance:   dist,
			Similarity: similarity,
			Score:      m.calculateScore(queryLower, candidateLower, dist, similarity),
		})
	}

	// Stable sort keeps the ranking deterministic when scores tie.
	sort.SliceStable(matches, func(i, j int) bool {
		return matches[i].Score > matches[j].Score
	})
	return matches
}
// calculateScore folds several signals into one ranking value.
//
// Starting from similarity it adds bonuses for an exact match, a prefix match
// and a word-boundary match, then subtracts small penalties proportional to
// the byte-length difference and to the edit distance dist.
//
// Match calls this with already lower-cased strings, so the camelCase branch
// of containsWordBoundary cannot trigger from that call path.
func (m *Matcher) calculateScore(query, candidate string, dist int, similarity float64) float64 {
	const (
		exactBonus      = 2.0
		prefixBonus     = 1.0
		boundaryBonus   = 0.5
		lengthPenalty   = 0.01
		distancePenalty = 0.1
	)

	total := similarity
	if candidate == query {
		total += exactBonus
	}
	if strings.HasPrefix(candidate, query) {
		// Prefix hits matter most when searching identifiers.
		total += prefixBonus
	}
	if containsWordBoundary(candidate, query) {
		total += boundaryBonus
	}

	// Prefer candidates whose length is close to the query's.
	total -= float64(abs(len(candidate)-len(query))) * lengthPenalty
	total -= float64(dist) * distancePenalty
	return total
}
// levenshteinDistance returns the Levenshtein distance between s1 and s2:
// the minimum number of single-character insertions, deletions and
// substitutions needed to turn one into the other.
//
// Characters are Unicode code points (runes), not bytes, so "café" and
// "cafe" are one edit apart and the distance from "" to "é" is 1.
//
// It uses the two-row Wagner-Fischer formulation. The rows are sized by the
// shorter input, giving O(min(m,n)) extra space and O(m*n) time.
func levenshteinDistance(s1, s2 string) int {
	if s1 == s2 {
		return 0
	}

	// Convert before measuring: len on a string counts bytes.
	long, short := []rune(s1), []rune(s2)
	if len(long) < len(short) {
		// The distance is symmetric, so the operands can be swapped freely.
		long, short = short, long
	}
	if len(short) == 0 {
		return len(long)
	}

	prev := make([]int, len(short)+1)
	curr := make([]int, len(short)+1)
	for j := range prev {
		prev[j] = j
	}

	for i, lr := range long {
		curr[0] = i + 1
		for j, sr := range short {
			cost := 1
			if lr == sr {
				cost = 0
			}
			best := prev[j+1] + 1 // deletion
			if v := curr[j] + 1; v < best { // insertion
				best = v
			}
			if v := prev[j] + cost; v < best { // substitution
				best = v
			}
			curr[j+1] = best
		}
		// The finished row becomes the reference row for the next pass.
		prev, curr = curr, prev
	}
	return prev[len(short)]
}
// DamerauLevenshteinDistance computes the restricted Damerau-Levenshtein
// distance (also called optimal string alignment) between s1 and s2:
// Levenshtein distance plus transposition of two adjacent characters as a
// single edit. This is more accurate for typos where neighbouring
// characters are swapped.
//
// In the restricted variant no substring is edited more than once, so
// "ca" -> "abc" costs 3 rather than the 2 of the unrestricted metric.
// Characters are Unicode code points (runes), not bytes. Time and space are
// both O(m*n).
func DamerauLevenshteinDistance(s1, s2 string) int {
	if s1 == s2 {
		return 0
	}

	// Measure in runes; len on a string would count bytes.
	r1, r2 := []rune(s1), []rune(s2)
	len1, len2 := len(r1), len(r2)
	if len1 == 0 {
		return len2
	}
	if len2 == 0 {
		return len1
	}

	// d[i][j] is the distance between the first i runes of r1 and the
	// first j runes of r2.
	d := make([][]int, len1+1)
	for i := range d {
		d[i] = make([]int, len2+1)
		d[i][0] = i
	}
	for j := 1; j <= len2; j++ {
		d[0][j] = j
	}

	for i := 1; i <= len1; i++ {
		for j := 1; j <= len2; j++ {
			cost := 1
			if r1[i-1] == r2[j-1] {
				cost = 0
			}
			best := d[i-1][j] + 1 // deletion
			if v := d[i][j-1] + 1; v < best { // insertion
				best = v
			}
			if v := d[i-1][j-1] + cost; v < best { // substitution
				best = v
			}
			// Adjacent swap: the last two runes of each prefix are mirrored.
			if i > 1 && j > 1 && r1[i-1] == r2[j-2] && r1[i-2] == r2[j-1] {
				if v := d[i-2][j-2] + 1; v < best {
					best = v
				}
			}
			d[i][j] = best
		}
	}
	return d[len1][len2]
}
// JaroWinklerSimilarity returns the Jaro-Winkler similarity of s1 and s2 in
// the range 0.0 to 1.0. It suits short strings and names.
//
// Identical strings (including two empty strings) score 1.0; if exactly one
// side is empty the score is 0.0. Otherwise the Jaro similarity is boosted
// in proportion to the length of the common prefix, capped at four runes.
func JaroWinklerSimilarity(s1, s2 string) float64 {
	if s1 == s2 {
		return 1.0
	}

	a, b := []rune(s1), []rune(s2)
	if len(a) == 0 || len(b) == 0 {
		return 0.0
	}

	jaro := jaroSimilarity(a, b)

	// Length of the shared prefix, looking at no more than four runes.
	limit := min(len(a), len(b), 4)
	shared := 0
	for shared < limit && a[shared] == b[shared] {
		shared++
	}

	// Winkler's scaling factor for the prefix bonus.
	const scaling = 0.1
	return jaro + float64(shared)*scaling*(1.0-jaro)
}
// jaroSimilarity returns the Jaro similarity of two rune sequences.
//
// Runes match when they are equal and no further apart than
// max(len(r1), len(r2))/2 - 1 positions (never less than zero). The result
// is the mean of the matched fraction of r1, the matched fraction of r2, and
// the fraction of matches that are not transposed. It is 0.0 when nothing
// matches.
func jaroSimilarity(r1, r2 []rune) float64 {
	n1, n2 := len(r1), len(r2)

	// Search window: half the longer length minus one, floored at zero.
	window := n1
	if n2 > window {
		window = n2
	}
	window = window/2 - 1
	if window < 0 {
		window = 0
	}

	used1 := make([]bool, n1)
	used2 := make([]bool, n2)

	// Pair each rune of r1 with the first free equal rune of r2 in range.
	common := 0
	for i := 0; i < n1; i++ {
		lo := i - window
		if lo < 0 {
			lo = 0
		}
		hi := i + window + 1
		if hi > n2 {
			hi = n2
		}
		for j := lo; j < hi; j++ {
			if !used2[j] && r1[i] == r2[j] {
				used1[i], used2[j] = true, true
				common++
				break
			}
		}
	}
	if common == 0 {
		return 0.0
	}

	// Walk both matched sequences in order and count positions that differ.
	outOfOrder := 0
	next := 0
	for i := 0; i < n1; i++ {
		if !used1[i] {
			continue
		}
		for !used2[next] {
			next++
		}
		if r1[i] != r2[next] {
			outOfOrder++
		}
		next++
	}

	// Each transposition is seen twice above, hence the halving.
	return (float64(common)/float64(n1) +
		float64(common)/float64(n2) +
		float64(common-outOfOrder/2)/float64(common)) / 3.0
}
// containsWordBoundary reports whether query occurs in text, ignoring case,
// at a position that starts a word.
//
// An occurrence starts a word when it is at the very beginning of text, when
// the preceding character is neither a letter nor a digit (underscore,
// space, punctuation, ...), or when a lower-case character is followed by an
// upper-case one (camelCase). Every occurrence is examined, not only the
// first, so "name" is found at a boundary in "username_name".
//
// An empty query matches at the start of any text, including empty text.
func containsWordBoundary(text, query string) bool {
	// Work on runes of the original text. Byte offsets taken from a
	// lower-cased copy are unreliable because lower-casing can change a
	// character's encoded width, and a single byte is not a character.
	textRunes := []rune(text)
	queryRunes := []rune(query)
	for i, r := range queryRunes {
		queryRunes[i] = unicode.ToLower(r)
	}

	for start := 0; start+len(queryRunes) <= len(textRunes); start++ {
		found := true
		for k, q := range queryRunes {
			if unicode.ToLower(textRunes[start+k]) != q {
				found = false
				break
			}
		}
		if !found {
			continue
		}

		if start == 0 {
			return true
		}
		prev := textRunes[start-1]
		// Separator boundary, e.g. "get_user".
		if !unicode.IsLetter(prev) && !unicode.IsDigit(prev) {
			return true
		}
		// camelCase boundary, e.g. "getUser".
		if unicode.IsLower(prev) && unicode.IsUpper(textRunes[start]) {
			return true
		}
	}
	return false
}
// Helper functions
// min returns the smallest of values, or 0 when called with no arguments.
// Within this package it takes the place of the built-in min.
func min(values ...int) int {
	smallest := 0
	for i, v := range values {
		if i == 0 || v < smallest {
			smallest = v
		}
	}
	return smallest
}
// max returns the largest of values, or 0 when called with no arguments.
// Within this package it takes the place of the built-in max.
func max(values ...int) int {
	largest := 0
	for i, v := range values {
		if i == 0 || v > largest {
			largest = v
		}
	}
	return largest
}
// abs returns the absolute value of x.
func abs(x int) int {
	if x >= 0 {
		return x
	}
	return -x
}
+275
View File
@@ -0,0 +1,275 @@
package fuzzy
import (
"testing"
)
// TestLevenshteinDistance checks levenshteinDistance against a table of
// known distances, covering empty inputs and a non-ASCII pair.
func TestLevenshteinDistance(t *testing.T) {
	cases := []struct {
		a    string
		b    string
		want int
	}{
		{"", "", 0},
		{"", "abc", 3},
		{"abc", "", 3},
		{"abc", "abc", 0},
		{"abc", "abd", 1},
		{"kitten", "sitting", 3},
		{"saturday", "sunday", 3},
		{"book", "back", 2},
		{"café", "cafe", 1}, // é counts as a single character
	}

	for _, tc := range cases {
		if got := levenshteinDistance(tc.a, tc.b); got != tc.want {
			t.Errorf("levenshteinDistance(%q, %q) = %d, want %d", tc.a, tc.b, got, tc.want)
		}
	}
}
// TestDamerauLevenshteinDistance checks DamerauLevenshteinDistance on a
// small table that includes an adjacent transposition.
func TestDamerauLevenshteinDistance(t *testing.T) {
	cases := []struct {
		a    string
		b    string
		want int
	}{
		{"abc", "abc", 0},
		{"abc", "acb", 1}, // one adjacent swap
		{"ca", "abc", 3},  // restricted variant: swapped runes cannot be edited again
		{"", "abc", 3},
	}

	for _, tc := range cases {
		if got := DamerauLevenshteinDistance(tc.a, tc.b); got != tc.want {
			t.Errorf("DamerauLevenshteinDistance(%q, %q) = %d, want %d", tc.a, tc.b, got, tc.want)
		}
	}
}
// TestJaroWinklerSimilarity checks that JaroWinklerSimilarity reaches at
// least a lower bound for each pair. Only the lower bound is enforced.
func TestJaroWinklerSimilarity(t *testing.T) {
	cases := []struct {
		a     string
		b     string
		floor float64 // smallest acceptable similarity
	}{
		{"", "", 1.0},
		{"abc", "abc", 1.0},
		{"martha", "marhta", 0.96},  // transposition keeps the score high
		{"dixon", "dicksonx", 0.76}, // moderate similarity
		{"", "abc", 0.0},
	}

	for _, tc := range cases {
		got := JaroWinklerSimilarity(tc.a, tc.b)
		if got >= tc.floor {
			continue
		}
		t.Errorf("JaroWinklerSimilarity(%q, %q) = %.2f, want >= %.2f", tc.a, tc.b, got, tc.floor)
	}
}
// TestMatcher_Match checks that Match returns at least a minimum number of
// results per query and that the first result does not score below the last.
func TestMatcher_Match(t *testing.T) {
	matcher := New(2) // accept edit distances up to 2

	pool := []string{
		"getUserName",
		"getUsername",
		"get_user_name",
		"getUserId",
		"setUserName",
		"findUser",
		"userName",
		"usernameField",
	}

	// NOTE(review): topMatch is recorded for each case but never asserted
	// below; confirm the intended ranking before adding such a check.
	cases := []struct {
		query     string
		topMatch  string
		expectMin int
	}{
		// Exact match plus similar variants.
		{query: "getUserName", topMatch: "getUserName", expectMin: 3},
		// Should find getUserName and getUsername at minimum.
		{query: "getuser", topMatch: "getUserName", expectMin: 2},
		// Case-insensitive matches.
		{query: "username", topMatch: "userName", expectMin: 2},
	}

	for _, tc := range cases {
		results := matcher.Match(tc.query, pool)
		if len(results) < tc.expectMin {
			t.Errorf("Match(%q) returned %d matches, want at least %d", tc.query, len(results), tc.expectMin)
		}
		if len(results) == 0 {
			continue
		}
		// The head of the list must score at least as high as the tail.
		first, last := results[0], results[len(results)-1]
		if first.Score < last.Score {
			t.Errorf("Match(%q) results not sorted by score", tc.query)
		}
	}
}
// TestMatcher_EmptyQuery checks that an empty query yields a nil result.
func TestMatcher_EmptyQuery(t *testing.T) {
	matcher := New(2)

	got := matcher.Match("", []string{"test", "example"})
	if got == nil {
		return
	}
	t.Errorf("Match with empty query should return nil, got %v", got)
}
// TestMatcher_PrefixBonus checks that a candidate starting with the query
// is ranked first.
func TestMatcher_PrefixBonus(t *testing.T) {
	matcher := New(2)

	results := matcher.Match("get", []string{
		"getUserName",  // starts with the query
		"findUserName", // does not start with the query
	})
	if len(results) < 1 {
		t.Fatal("Expected at least one match")
	}

	if top := results[0].Text; top != "getUserName" {
		t.Errorf("Expected prefix match to rank first, got %q", top)
	}
}
// TestMatcher_ExactMatchBonus checks that an exact match is ranked first
// and that its score reflects the exact-match bonus.
func TestMatcher_ExactMatchBonus(t *testing.T) {
	matcher := New(2)

	results := matcher.Match("test", []string{"test", "testing", "tester"})
	if len(results) < 1 {
		t.Fatal("Expected at least one match")
	}

	top := results[0]
	if top.Text != "test" {
		t.Errorf("Expected exact match to rank first, got %q", top.Text)
	}
	// The exact-match bonus alone should lift the score to 2.0 or more.
	if top.Score < 2.0 {
		t.Errorf("Exact match score too low: %.2f", top.Score)
	}
}
// TestContainsWordBoundary checks containsWordBoundary for matches at the
// start, after a separator, at a camelCase hump, and inside a word.
func TestContainsWordBoundary(t *testing.T) {
	cases := []struct {
		text  string
		query string
		want  bool
	}{
		{"getUserName", "get", true},    // start of text
		{"getUserName", "user", true},   // lower-to-upper transition
		{"get_user_name", "user", true}, // follows an underscore
		{"getUserName", "Name", true},   // lower-to-upper transition
		{"getUserName", "ser", false},   // inside a word
		{"", "test", false},             // nothing to search
	}

	for _, tc := range cases {
		if got := containsWordBoundary(tc.text, tc.query); got != tc.want {
			t.Errorf("containsWordBoundary(%q, %q) = %v, want %v", tc.text, tc.query, got, tc.want)
		}
	}
}
// TestMatcher_UnicodeHandling checks that an ASCII query still finds a
// candidate that differs only by an accented character.
func TestMatcher_UnicodeHandling(t *testing.T) {
	matcher := New(2)

	results := matcher.Match("cafe", []string{"café", "resume", "naïve"})
	if len(results) == 0 {
		t.Error("Expected matches for Unicode strings")
	}

	// "café" must be present with a small edit distance.
	sawCafe := false
	for i := range results {
		if results[i].Text == "café" && results[i].Distance <= 2 {
			sawCafe = true
			break
		}
	}
	if !sawCafe {
		t.Error("Failed to fuzzy match Unicode string 'café'")
	}
}
func BenchmarkLevenshteinDistance(b *testing.B) {
s1 := "the quick brown fox jumps over the lazy dog"
s2 := "the quikc brown fox jumps ovver the lazy dog"
b.ResetTimer()
for i := range b.N {
_ = levenshteinDistance(s1, s2)
_ = i // use i to avoid unused warning
}
}
func BenchmarkDamerauLevenshteinDistance(b *testing.B) {
s1 := "the quick brown fox jumps over the lazy dog"
s2 := "the quikc brown fox jumps ovver the lazy dog"
b.ResetTimer()
for i := range b.N {
_ = DamerauLevenshteinDistance(s1, s2)
_ = i
}
}
func BenchmarkJaroWinklerSimilarity(b *testing.B) {
s1 := "martha"
s2 := "marhta"
b.ResetTimer()
for i := range b.N {
_ = JaroWinklerSimilarity(s1, s2)
_ = i
}
}
// BenchmarkMatcher_Match times a full Match call for the query "getuser"
// against twelve identifier-like candidates.
func BenchmarkMatcher_Match(b *testing.B) {
	matcher := New(2)
	pool := []string{
		"getUserName", "getUsername", "get_user_name", "getUserId",
		"setUserName", "findUser", "userName", "usernameField",
		"userAccount", "accountUser", "userProfile", "profileUser",
	}

	b.ResetTimer()
	for range b.N {
		_ = matcher.Match("getuser", pool)
	}
}