Ho hum.

2026-06-11 23:09:02 +00:00 · 2026-01-18 18:40:26 +00:00
commit 185e73da47
51 changed files with 14073 additions and 0 deletions
@@ -0,0 +1,375 @@
+// Package fuzzy provides fuzzy string matching using Levenshtein distance.
+package fuzzy
+
+import (
+	"sort"
+	"strings"
+	"unicode"
+)
+
+// Match represents a fuzzy match result: one candidate kept by Matcher.Match together with its metrics.
+type Match struct {
+	Text       string  // candidate exactly as supplied by the caller
+	Distance   int     // Levenshtein distance between the lower-cased query and candidate
+	Similarity float64 // 1 - Distance/maxLen, maxLen being the longer of the query and candidate lengths
+	Score      float64 // composite ranking value from calculateScore; results are sorted by it, descending
+}
+
+// Matcher provides fuzzy matching capabilities; New builds one from an edit-distance threshold.
+type Matcher struct {
+	threshold int // largest edit distance Match accepts (substring hits are kept regardless)
+}
+
+// New returns a Matcher whose Match method accepts candidates within the given
+// edit distance of the query. Typical thresholds are 1-3.
+func New(threshold int) *Matcher {
+	m := new(Matcher)
+	m.threshold = threshold
+	return m
+}
+
+// Match performs case-insensitive fuzzy matching of query against candidates.
+//
+// A candidate is kept when its Levenshtein distance to the query is within the
+// matcher's threshold, or when it contains the query as a substring (substring
+// hits are kept regardless of distance, which matters for identifiers). Each
+// result carries the original candidate text, the distance, a similarity in
+// [0, 1] and a composite score. Results are sorted by descending score, and
+// candidates with equal scores keep their input order. An empty query yields
+// nil.
+func (m *Matcher) Match(query string, candidates []string) []Match {
+	if query == "" {
+		return nil
+	}
+
+	matches := make([]Match, 0, len(candidates)/10)
+	queryLower := strings.ToLower(query)
+	// The distance is counted in runes, so the normalising length must be too.
+	queryLen := len([]rune(queryLower))
+
+	for _, candidate := range candidates {
+		candidateLower := strings.ToLower(candidate)
+		dist := levenshteinDistance(queryLower, candidateLower)
+
+		// Over-threshold candidates survive only as substring matches.
+		if dist > m.threshold && !strings.Contains(candidateLower, queryLower) {
+			continue
+		}
+
+		// Normalise by the rune count of the strings actually compared; byte
+		// lengths would overstate similarity for non-ASCII text.
+		maxLen := max(queryLen, len([]rune(candidateLower)))
+		similarity := 1.0 - float64(dist)/float64(maxLen)
+		if similarity < 0 {
+			// Keep the documented range even if the distance exceeds maxLen.
+			similarity = 0
+		}
+
+		matches = append(matches, Match{
+			Text:       candidate,
+			Distance:   dist,
+			Similarity: similarity,
+			Score:      m.calculateScore(queryLower, candidateLower, dist, similarity),
+		})
+	}
+
+	// A stable sort keeps the ranking deterministic when scores tie.
+	sort.SliceStable(matches, func(i, j int) bool {
+		return matches[i].Score > matches[j].Score
+	})
+
+	return matches
+}
+
+// calculateScore computes a composite score considering multiple factors.
+func (m *Matcher) calculateScore(query, candidate string, dist int, similarity float64) float64 {
+	score := similarity
+
+	// Bonus for exact match
+	if query == candidate {
+		score += 2.0
+	}
+
+	// Bonus for prefix match (important for identifier search)
+	if strings.HasPrefix(candidate, query) {
+		score += 1.0
+	}
+
+	// Bonus for word boundary matches (e.g., "getName" matches "get")
+	if containsWordBoundary(candidate, query) {
+		score += 0.5
+	}
+
+	// Penalty for length difference (prefer similar-length matches)
+	lenDiff := abs(len(candidate) - len(query))
+	score -= float64(lenDiff) * 0.01
+
+	// Penalty for edit distance
+	score -= float64(dist) * 0.1
+
+	return score
+}
+
+// levenshteinDistance returns the Levenshtein edit distance between s1 and s2:
+// the minimum number of single-rune insertions, deletions and substitutions
+// needed to turn one string into the other.
+//
+// Both strings are compared rune by rune, so the result (including the cases
+// where one string is empty) is counted in Unicode code points, never bytes.
+// It uses the two-row Wagner-Fischer algorithm with rows sized by the shorter
+// input, giving O(min(m,n)) working space on top of the rune copies.
+func levenshteinDistance(s1, s2 string) int {
+	if s1 == s2 {
+		return 0
+	}
+
+	// Decode first so that every length below is a rune count.
+	long, short := []rune(s1), []rune(s2)
+	if len(long) < len(short) {
+		long, short = short, long
+	}
+	if len(short) == 0 {
+		return len(long)
+	}
+
+	// Only two rows of the matrix are needed, each spanning the shorter string.
+	previous := make([]int, len(short)+1)
+	current := make([]int, len(short)+1)
+	for j := range previous {
+		previous[j] = j
+	}
+
+	for i, lr := range long {
+		current[0] = i + 1
+
+		for j, sr := range short {
+			cost := 1
+			if lr == sr {
+				cost = 0
+			}
+
+			current[j+1] = min(
+				previous[j+1]+1,  // deletion
+				current[j]+1,     // insertion
+				previous[j]+cost, // substitution
+			)
+		}
+
+		// The row just filled becomes the reference for the next rune.
+		previous, current = current, previous
+	}
+
+	return previous[len(short)]
+}
+
+// DamerauLevenshteinDistance computes the restricted Damerau-Levenshtein
+// distance (also called optimal string alignment) between s1 and s2.
+//
+// On top of insertions, deletions and substitutions it counts a swap of two
+// adjacent runes as a single edit, which suits typos. Because it is the
+// restricted variant, a swapped pair is not edited again, so "ca" -> "abc"
+// costs 3. Strings are compared rune by rune and the result, including the
+// cases where one string is empty, is counted in runes. It fills a full
+// (len1+1) x (len2+1) matrix.
+func DamerauLevenshteinDistance(s1, s2 string) int {
+	if s1 == s2 {
+		return 0
+	}
+
+	r1 := []rune(s1)
+	r2 := []rune(s2)
+	len1, len2 := len(r1), len(r2)
+
+	// Empty-string shortcuts use rune counts, not byte lengths.
+	if len1 == 0 {
+		return len2
+	}
+	if len2 == 0 {
+		return len1
+	}
+
+	// d[i][j] is the distance between the first i runes of r1 and the first j
+	// runes of r2.
+	d := make([][]int, len1+1)
+	for i := range d {
+		d[i] = make([]int, len2+1)
+		d[i][0] = i
+	}
+	for j := 0; j <= len2; j++ {
+		d[0][j] = j
+	}
+
+	for i := 1; i <= len1; i++ {
+		for j := 1; j <= len2; j++ {
+			cost := 1
+			if r1[i-1] == r2[j-1] {
+				cost = 0
+			}
+
+			d[i][j] = min(
+				d[i-1][j]+1,      // deletion
+				d[i][j-1]+1,      // insertion
+				d[i-1][j-1]+cost, // substitution
+			)
+
+			// Adjacent transposition: the last two runes of each prefix are
+			// each other's mirror image.
+			if i > 1 && j > 1 && r1[i-1] == r2[j-2] && r1[i-2] == r2[j-1] {
+				d[i][j] = min(d[i][j], d[i-2][j-2]+cost)
+			}
+		}
+	}
+
+	return d[len1][len2]
+}
+
+// JaroWinklerSimilarity returns the Jaro-Winkler similarity of s1 and s2, a
+// value between 0.0 and 1.0 that works well for short strings and names.
+//
+// Identical strings score 1.0; otherwise an empty string on either side scores
+// 0.0. The Jaro similarity is then boosted by 0.1 per rune of common prefix,
+// counting at most four runes.
+func JaroWinklerSimilarity(s1, s2 string) float64 {
+	if s1 == s2 {
+		return 1.0
+	}
+
+	a, b := []rune(s1), []rune(s2)
+	if len(a) == 0 || len(b) == 0 {
+		return 0.0
+	}
+
+	jaro := jaroSimilarity(a, b)
+
+	// Length of the shared prefix, capped at four runes.
+	limit := min(len(a), len(b), 4)
+	shared := 0
+	for shared < limit && a[shared] == b[shared] {
+		shared++
+	}
+
+	// Winkler's adjustment moves the score towards 1.0 in proportion to the prefix.
+	const prefixScale = 0.1
+	return jaro + float64(shared)*prefixScale*(1.0-jaro)
+}
+
+// jaroSimilarity computes the Jaro similarity of two rune sequences.
+//
+// Runes match when they are equal and lie within max(len1, len2)/2 - 1
+// positions of each other; each rune is matched at most once. The result is
+// the mean of the matched fraction of r1, the matched fraction of r2 and the
+// fraction of matches that are not transposed. It is 0.0 when nothing matches.
+func jaroSimilarity(r1, r2 []rune) float64 {
+	n1, n2 := len(r1), len(r2)
+
+	// How far apart two equal runes may sit and still count as a match.
+	window := max(max(n1, n2)/2-1, 0)
+
+	used1 := make([]bool, n1)
+	used2 := make([]bool, n2)
+
+	// Greedily pair each rune of r1 with the first free equal rune in range.
+	common := 0
+	for i, c := range r1 {
+		lo := max(0, i-window)
+		hi := min(i+window+1, n2)
+
+		for j := lo; j < hi; j++ {
+			if !used2[j] && c == r2[j] {
+				used1[i], used2[j] = true, true
+				common++
+				break
+			}
+		}
+	}
+
+	if common == 0 {
+		return 0.0
+	}
+
+	// Walk the matched runes of both sequences in order and count the
+	// positions where they disagree.
+	outOfOrder := 0
+	next := 0
+	for i, c := range r1 {
+		if !used1[i] {
+			continue
+		}
+		for !used2[next] {
+			next++
+		}
+		if c != r2[next] {
+			outOfOrder++
+		}
+		next++
+	}
+
+	// Half the out-of-order count (integer division) is the transposition count.
+	return (float64(common)/float64(n1) +
+		float64(common)/float64(n2) +
+		float64(common-outOfOrder/2)/float64(common)) / 3.0
+}
+
+// containsWordBoundary reports whether query occurs in text, ignoring case, at
+// a position that starts a word.
+//
+// An occurrence starts a word when it sits at the very beginning of text, when
+// the rune before it is neither a letter nor a digit (for example '_'), or
+// when a lower-case rune is followed by an upper-case one (camelCase). Every
+// occurrence is examined, not just the first. Matching is done rune by rune on
+// the original text so positions stay valid for non-ASCII input. An empty
+// query is treated as occurring at the start and yields true.
+func containsWordBoundary(text, query string) bool {
+	haystack := []rune(text)
+	needle := []rune(query)
+	for i, r := range needle {
+		needle[i] = unicode.ToLower(r)
+	}
+
+	// matchAt reports whether needle occurs at haystack[start:], ignoring case.
+	matchAt := func(start int) bool {
+		for k, q := range needle {
+			if unicode.ToLower(haystack[start+k]) != q {
+				return false
+			}
+		}
+		return true
+	}
+
+	for start := 0; start+len(needle) <= len(haystack); start++ {
+		if !matchAt(start) {
+			continue
+		}
+		if start == 0 {
+			return true
+		}
+
+		prev := haystack[start-1]
+		// Underscore or any other non-alphanumeric separator.
+		if !unicode.IsLetter(prev) && !unicode.IsDigit(prev) {
+			return true
+		}
+		// camelCase boundary: lower-case rune followed by an upper-case one.
+		if unicode.IsLower(prev) && unicode.IsUpper(haystack[start]) {
+			return true
+		}
+	}
+
+	return false
+}
+
+// Helper functions
+
+// min returns the smallest of values, or 0 when called with no arguments.
+func min(values ...int) int {
+	smallest := 0
+	for idx, v := range values {
+		if idx == 0 || v < smallest {
+			smallest = v
+		}
+	}
+	return smallest
+}
+
+// max returns the largest of values, or 0 when called with no arguments.
+func max(values ...int) int {
+	largest := 0
+	for idx, v := range values {
+		if idx == 0 || v > largest {
+			largest = v
+		}
+	}
+	return largest
+}
+
+// abs returns the absolute value of x.
+func abs(x int) int {
+	if x >= 0 {
+		return x
+	}
+	return -x
+}
@@ -0,0 +1,275 @@
+package fuzzy
+
+import (
+	"testing"
+)
+
+func TestLevenshteinDistance(t *testing.T) {
+	tests := []struct {
+		s1       string
+		s2       string
+		expected int
+	}{
+		{"", "", 0},
+		{"", "abc", 3},
+		{"abc", "", 3},
+		{"abc", "abc", 0},
+		{"abc", "abd", 1},
+		{"kitten", "sitting", 3},
+		{"saturday", "sunday", 3},
+		{"book", "back", 2},
+		{"café", "cafe", 1}, // Unicode handling
+	}
+
+	for _, tt := range tests {
+		got := levenshteinDistance(tt.s1, tt.s2)
+		if got != tt.expected {
+			t.Errorf("levenshteinDistance(%q, %q) = %d, want %d", tt.s1, tt.s2, got, tt.expected)
+		}
+	}
+}
+
+func TestDamerauLevenshteinDistance(t *testing.T) {
+	tests := []struct {
+		s1       string
+		s2       string
+		expected int
+	}{
+		{"abc", "abc", 0},
+		{"abc", "acb", 1}, // Transposition
+		{"ca", "abc", 3},  // Delete a, delete b, insert c = 3 operations
+		{"", "abc", 3},
+	}
+
+	for _, tt := range tests {
+		got := DamerauLevenshteinDistance(tt.s1, tt.s2)
+		if got != tt.expected {
+			t.Errorf("DamerauLevenshteinDistance(%q, %q) = %d, want %d", tt.s1, tt.s2, got, tt.expected)
+		}
+	}
+}
+
+func TestJaroWinklerSimilarity(t *testing.T) {
+	tests := []struct {
+		s1       string
+		s2       string
+		minScore float64 // Minimum expected similarity
+	}{
+		{"", "", 1.0},
+		{"abc", "abc", 1.0},
+		{"martha", "marhta", 0.96},  // High similarity for transposition
+		{"dixon", "dicksonx", 0.76}, // Moderate similarity
+		{"", "abc", 0.0},
+	}
+
+	for _, tt := range tests {
+		got := JaroWinklerSimilarity(tt.s1, tt.s2)
+		if got < tt.minScore {
+			t.Errorf("JaroWinklerSimilarity(%q, %q) = %.2f, want >= %.2f", tt.s1, tt.s2, got, tt.minScore)
+		}
+	}
+}
+
+// TestMatcher_Match checks, for several queries, that Match returns enough
+// results, that a known-good candidate is among them, and that the whole
+// result list is ordered by descending score.
+func TestMatcher_Match(t *testing.T) {
+	m := New(2) // Allow edit distance up to 2
+
+	candidates := []string{
+		"getUserName",
+		"getUsername",
+		"get_user_name",
+		"getUserId",
+		"setUserName",
+		"findUser",
+		"userName",
+		"usernameField",
+	}
+
+	tests := []struct {
+		query       string
+		mustInclude string // candidate that has to appear somewhere in the results
+		expectMin   int    // minimum number of results
+	}{
+		{
+			query:       "getUserName",
+			expectMin:   3, // Exact + similar variants
+			mustInclude: "getUserName",
+		},
+		{
+			query:       "getuser",
+			expectMin:   2, // Should match getUserName, getUsername at minimum
+			mustInclude: "getUserName",
+		},
+		{
+			query:       "username",
+			expectMin:   2, // Case-insensitive matches
+			mustInclude: "userName",
+		},
+	}
+
+	for _, tt := range tests {
+		matches := m.Match(tt.query, candidates)
+
+		if len(matches) < tt.expectMin {
+			t.Errorf("Match(%q) returned %d matches, want at least %d", tt.query, len(matches), tt.expectMin)
+		}
+
+		included := false
+		for _, match := range matches {
+			if match.Text == tt.mustInclude {
+				included = true
+				break
+			}
+		}
+		if !included {
+			t.Errorf("Match(%q) results do not include %q", tt.query, tt.mustInclude)
+		}
+
+		// Every adjacent pair must be in descending score order, not just the
+		// first and last elements.
+		for i := 1; i < len(matches); i++ {
+			if matches[i-1].Score < matches[i].Score {
+				t.Errorf("Match(%q) results not sorted by score", tt.query)
+				break
+			}
+		}
+	}
+}
+
+// TestMatcher_EmptyQuery checks that an empty query produces a nil result.
+func TestMatcher_EmptyQuery(t *testing.T) {
+	got := New(2).Match("", []string{"test", "example"})
+	if got == nil {
+		return
+	}
+	t.Errorf("Match with empty query should return nil, got %v", got)
+}
+
+// TestMatcher_PrefixBonus checks that a candidate starting with the query
+// outranks one that merely contains it.
+func TestMatcher_PrefixBonus(t *testing.T) {
+	m := New(2)
+	candidates := []string{
+		"targetUserName", // contains "get" but not as a prefix
+		"getUserName",    // prefix match
+	}
+
+	matches := m.Match("get", candidates)
+
+	// Both candidates contain the query, so both must come back; otherwise the
+	// ranking assertion below would be vacuous.
+	if len(matches) != len(candidates) {
+		t.Fatalf("Expected %d matches, got %d", len(candidates), len(matches))
+	}
+
+	// Prefix match should score higher
+	if matches[0].Text != "getUserName" {
+		t.Errorf("Expected prefix match to rank first, got %q", matches[0].Text)
+	}
+	if matches[0].Score <= matches[1].Score {
+		t.Errorf("Expected prefix match to score strictly higher: %.2f vs %.2f", matches[0].Score, matches[1].Score)
+	}
+}
+
+// TestMatcher_ExactMatchBonus checks that an exact match ranks first and
+// carries the exact-match bonus in its score.
+func TestMatcher_ExactMatchBonus(t *testing.T) {
+	matcher := New(2)
+	results := matcher.Match("test", []string{"test", "testing", "tester"})
+
+	if len(results) < 1 {
+		t.Fatal("Expected at least one match")
+	}
+
+	best := results[0]
+
+	// The exact match has to lead the list.
+	if best.Text != "test" {
+		t.Errorf("Expected exact match to rank first, got %q", best.Text)
+	}
+
+	// A score of at least 2.0 shows the exact-match bonus was applied.
+	if best.Score < 2.0 {
+		t.Errorf("Exact match score too low: %.2f", best.Score)
+	}
+}
+
+func TestContainsWordBoundary(t *testing.T) {
+	tests := []struct {
+		text     string
+		query    string
+		expected bool
+	}{
+		{"getUserName", "get", true},    // At start
+		{"getUserName", "user", true},   // After lowercase->uppercase boundary
+		{"get_user_name", "user", true}, // After underscore
+		{"getUserName", "Name", true},   // After lowercase->uppercase
+		{"getUserName", "ser", false},   // Middle of word
+		{"", "test", false},             // Empty text
+	}
+
+	for _, tt := range tests {
+		got := containsWordBoundary(tt.text, tt.query)
+		if got != tt.expected {
+			t.Errorf("containsWordBoundary(%q, %q) = %v, want %v", tt.text, tt.query, got, tt.expected)
+		}
+	}
+}
+
+// TestMatcher_UnicodeHandling checks that an ASCII query finds an accented
+// candidate within a small edit distance.
+func TestMatcher_UnicodeHandling(t *testing.T) {
+	matcher := New(2)
+	words := []string{"café", "resume", "naïve"}
+
+	results := matcher.Match("cafe", words)
+	if len(results) == 0 {
+		t.Error("Expected matches for Unicode strings")
+	}
+
+	// Look for café among the results with a distance of at most 2.
+	hit := false
+	for i := range results {
+		if results[i].Text != "café" || results[i].Distance > 2 {
+			continue
+		}
+		hit = true
+		break
+	}
+
+	if !hit {
+		t.Error("Failed to fuzzy match Unicode string 'café'")
+	}
+}
+
+func BenchmarkLevenshteinDistance(b *testing.B) {
+	s1 := "the quick brown fox jumps over the lazy dog"
+	s2 := "the quikc brown fox jumps ovver the lazy dog"
+
+	b.ResetTimer()
+	for i := range b.N {
+		_ = levenshteinDistance(s1, s2)
+		_ = i // use i to avoid unused warning
+	}
+}
+
+func BenchmarkDamerauLevenshteinDistance(b *testing.B) {
+	s1 := "the quick brown fox jumps over the lazy dog"
+	s2 := "the quikc brown fox jumps ovver the lazy dog"
+
+	b.ResetTimer()
+	for i := range b.N {
+		_ = DamerauLevenshteinDistance(s1, s2)
+		_ = i
+	}
+}
+
+func BenchmarkJaroWinklerSimilarity(b *testing.B) {
+	s1 := "martha"
+	s2 := "marhta"
+
+	b.ResetTimer()
+	for i := range b.N {
+		_ = JaroWinklerSimilarity(s1, s2)
+		_ = i
+	}
+}
+
+// BenchmarkMatcher_Match times a full Match call for one query over a dozen
+// identifier-like candidates.
+func BenchmarkMatcher_Match(b *testing.B) {
+	matcher := New(2)
+	pool := []string{
+		"getUserName", "getUsername", "get_user_name", "getUserId",
+		"setUserName", "findUser", "userName", "usernameField",
+		"userAccount", "accountUser", "userProfile", "profileUser",
+	}
+
+	b.ResetTimer()
+	for range b.N {
+		_ = matcher.Match("getuser", pool)
+	}
+}