Ho hum.

2026-06-05 22:23:50 +00:00 · 2026-01-18 18:40:26 +00:00
commit 185e73da47
51 changed files with 14073 additions and 0 deletions
@@ -0,0 +1,375 @@
+// Package fuzzy provides fuzzy string matching using Levenshtein distance.
+package fuzzy
+
+import (
+	"sort"
+	"strings"
+	"unicode"
+)
+
+// Match represents a fuzzy match result.
+//
+// Matcher.Match produces one value per accepted candidate.
+type Match struct {
+	Text       string  // candidate exactly as supplied by the caller (original casing)
+	Distance   int     // Levenshtein distance between the lowercased query and lowercased candidate
+	Similarity float64 // 1 - Distance/maxLen, where maxLen is the longer of the two lengths
+	Score      float64 // composite ranking value from calculateScore; higher ranks first
+}
+
+// Matcher provides fuzzy matching capabilities.
+//
+// Construct one with New; the threshold is fixed at construction time.
+type Matcher struct {
+	threshold int // maximum edit distance Match accepts before falling back to substring containment
+}
+
+// New returns a Matcher configured with the given threshold.
+//
+// threshold is the largest edit distance that still counts as a match;
+// values between 1 and 3 are typical.
+func New(threshold int) *Matcher {
+	m := new(Matcher)
+	m.threshold = threshold
+	return m
+}
+
+// Match performs case-insensitive fuzzy matching of query against candidates.
+//
+// A candidate is kept when the Levenshtein distance between the lowercased
+// query and the lowercased candidate is at most the matcher's threshold, or
+// when the lowercased candidate contains the lowercased query as a substring
+// (whatever the distance). Each result carries the candidate's original text.
+//
+// Results are sorted by Score, highest first; candidates with equal scores
+// keep their relative order from candidates. An empty query returns nil.
+func (m *Matcher) Match(query string, candidates []string) []Match {
+	if query == "" {
+		return nil
+	}
+
+	queryLower := strings.ToLower(query)
+	// The edit distance is counted in runes over the lowercased strings, so the
+	// similarity has to be normalized by rune counts of those same strings.
+	// Byte lengths would overstate the similarity of non-ASCII text.
+	queryLen := len([]rune(queryLower))
+
+	matches := make([]Match, 0, len(candidates)/10)
+	for _, candidate := range candidates {
+		candidateLower := strings.ToLower(candidate)
+		dist := levenshteinDistance(queryLower, candidateLower)
+
+		// Beyond the threshold only substring hits survive (important for
+		// identifiers, where a short query sits inside a long name).
+		if dist > m.threshold && !strings.Contains(candidateLower, queryLower) {
+			continue
+		}
+
+		// maxLen is at least 1 because query is non-empty.
+		maxLen := max(queryLen, len([]rune(candidateLower)))
+		similarity := 1.0 - float64(dist)/float64(maxLen)
+
+		matches = append(matches, Match{
+			Text:       candidate,
+			Distance:   dist,
+			Similarity: similarity,
+			Score:      m.calculateScore(queryLower, candidateLower, dist, similarity),
+		})
+	}
+
+	// A stable sort keeps the ranking deterministic when scores tie.
+	sort.SliceStable(matches, func(i, j int) bool {
+		return matches[i].Score > matches[j].Score
+	})
+
+	return matches
+}
+
+// calculateScore folds similarity and several match-quality signals into one
+// ranking value; a higher value means a better match.
+//
+// Starting from similarity it adds 2.0 when query equals candidate, 1.0 when
+// candidate starts with query, and 0.5 when containsWordBoundary reports a hit
+// (e.g. "get" inside "getName"). It then subtracts 0.01 per byte of length
+// difference and 0.1 per unit of dist.
+func (m *Matcher) calculateScore(query, candidate string, dist int, similarity float64) float64 {
+	total := similarity
+
+	if candidate == query {
+		total += 2.0
+	}
+	if strings.HasPrefix(candidate, query) {
+		// Prefix hits matter most when searching identifiers.
+		total += 1.0
+	}
+	if containsWordBoundary(candidate, query) {
+		total += 0.5
+	}
+
+	// Favor candidates whose length is close to the query's.
+	gap := len(candidate) - len(query)
+	if gap < 0 {
+		gap = -gap
+	}
+	total -= float64(gap) * 0.01
+
+	// Each edit costs a little more on top of what similarity already reflects.
+	total -= float64(dist) * 0.1
+
+	return total
+}
+
+// levenshteinDistance returns the Levenshtein edit distance between s1 and s2,
+// counted in Unicode code points (runes) rather than bytes.
+//
+// It uses the two-row Wagner-Fischer formulation. The rows are sized by the
+// shorter input, so the working space is O(min(m, n)) ints in addition to the
+// rune conversions of both strings.
+func levenshteinDistance(s1, s2 string) int {
+	if s1 == s2 {
+		return 0
+	}
+
+	// Work on runes so that a multi-byte character counts as one edit.
+	long, short := []rune(s1), []rune(s2)
+	if len(long) < len(short) {
+		// The distance is symmetric, so the inputs can be swapped freely; the
+		// shorter one becomes the row dimension to keep the rows small.
+		long, short = short, long
+	}
+	if len(short) == 0 {
+		// Turning "" into a string takes one insertion per rune.
+		return len(long)
+	}
+
+	// Only two rows of the matrix are needed at any time.
+	previous := make([]int, len(short)+1)
+	current := make([]int, len(short)+1)
+	for j := range previous {
+		previous[j] = j
+	}
+
+	for i, lr := range long {
+		current[0] = i + 1
+
+		for j, sr := range short {
+			cost := 1
+			if lr == sr {
+				cost = 0
+			}
+
+			current[j+1] = min(
+				previous[j+1]+1, // deletion
+				current[j]+1,    // insertion
+				previous[j]+cost, // substitution
+			)
+		}
+
+		// The row just filled becomes the reference for the next iteration.
+		previous, current = current, previous
+	}
+
+	return previous[len(short)]
+}
+
+// DamerauLevenshteinDistance computes an edit distance that, in addition to
+// insertions, deletions and substitutions, counts a swap of two adjacent
+// characters as a single edit. This suits typos where neighboring characters
+// are transposed.
+//
+// The implementation is the optimal string alignment (restricted) variant: no
+// substring is edited more than once, so for example "ca" -> "abc" costs 3.
+// Distances are counted in runes, not bytes.
+func DamerauLevenshteinDistance(s1, s2 string) int {
+	if s1 == s2 {
+		return 0
+	}
+
+	r1 := []rune(s1)
+	r2 := []rune(s2)
+	len1 := len(r1)
+	len2 := len(r2)
+
+	// Against an empty string the distance is the other string's rune count
+	// (its byte length would over-count multi-byte characters).
+	if len1 == 0 {
+		return len2
+	}
+	if len2 == 0 {
+		return len1
+	}
+
+	// d[i][j] holds the distance between the first i runes of r1 and the
+	// first j runes of r2.
+	d := make([][]int, len1+1)
+	for i := range d {
+		d[i] = make([]int, len2+1)
+	}
+
+	// Reaching or leaving the empty prefix costs one edit per rune.
+	for i := 0; i <= len1; i++ {
+		d[i][0] = i
+	}
+	for j := 0; j <= len2; j++ {
+		d[0][j] = j
+	}
+
+	for i := 1; i <= len1; i++ {
+		for j := 1; j <= len2; j++ {
+			cost := 1
+			if r1[i-1] == r2[j-1] {
+				cost = 0
+			}
+
+			d[i][j] = min(
+				d[i-1][j]+1,      // deletion
+				d[i][j-1]+1,      // insertion
+				d[i-1][j-1]+cost, // substitution
+			)
+
+			// Adjacent transposition: the last two runes of each prefix are
+			// the same pair in opposite order. A swap always costs one edit.
+			if i > 1 && j > 1 && r1[i-1] == r2[j-2] && r1[i-2] == r2[j-1] {
+				d[i][j] = min(d[i][j], d[i-2][j-2]+1)
+			}
+		}
+	}
+
+	return d[len1][len2]
+}
+
+// JaroWinklerSimilarity returns the Jaro-Winkler similarity of s1 and s2 as a
+// value from 0.0 to 1.0. It tends to work well for short strings and names.
+//
+// Identical inputs score 1.0; if exactly one input is empty the score is 0.0.
+// Otherwise the Jaro similarity is raised according to the length of the
+// common prefix, of which at most 4 runes are considered.
+func JaroWinklerSimilarity(s1, s2 string) float64 {
+	if s1 == s2 {
+		return 1.0
+	}
+
+	a, b := []rune(s1), []rune(s2)
+	if len(a) == 0 || len(b) == 0 {
+		return 0.0
+	}
+
+	base := jaroSimilarity(a, b)
+
+	// Count the leading runes both strings share, capped at 4.
+	limit := min(min(len(a), len(b)), 4)
+	shared := 0
+	for shared < limit && a[shared] == b[shared] {
+		shared++
+	}
+
+	// Winkler's adjustment closes part of the gap to 1.0 for each shared
+	// prefix rune.
+	const scale = 0.1
+	return base + float64(shared)*scale*(1.0-base)
+}
+
+// jaroSimilarity returns the Jaro similarity of the rune sequences r1 and r2.
+//
+// Two runes match when they are equal and their positions differ by at most
+// max(len(r1), len(r2))/2 - 1 (never less than 0). With no matching runes the
+// result is 0.0. Half of the out-of-order matches (integer division) are
+// counted as transpositions.
+func jaroSimilarity(r1, r2 []rune) float64 {
+	n1, n2 := len(r1), len(r2)
+
+	// How far apart two equal runes may sit and still count as a match.
+	window := max(n1, n2)/2 - 1
+	if window < 0 {
+		window = 0
+	}
+
+	used1 := make([]bool, n1)
+	used2 := make([]bool, n2)
+
+	// Pair each rune of r1 with the first unused equal rune of r2 in range.
+	common := 0
+	for i, ch := range r1 {
+		lo := max(0, i-window)
+		hi := min(i+window+1, n2)
+
+		for j := lo; j < hi; j++ {
+			if !used2[j] && r2[j] == ch {
+				used1[i], used2[j] = true, true
+				common++
+				break
+			}
+		}
+	}
+
+	if common == 0 {
+		return 0.0
+	}
+
+	// Walk the matched runes of both sequences in order and count the
+	// positions where they disagree.
+	outOfOrder := 0
+	cursor := 0
+	for i, ch := range r1 {
+		if !used1[i] {
+			continue
+		}
+		for !used2[cursor] {
+			cursor++
+		}
+		if ch != r2[cursor] {
+			outOfOrder++
+		}
+		cursor++
+	}
+
+	return (float64(common)/float64(n1) +
+		float64(common)/float64(n2) +
+		float64(common-outOfOrder/2)/float64(common)) / 3.0
+}
+
+// containsWordBoundary reports whether query occurs in text, ignoring case, at
+// a position that starts a word. An occurrence starts a word when it
+//
+//   - begins at the start of text,
+//   - follows a rune that is neither a letter nor a digit (e.g. '_'), or
+//   - begins with an uppercase rune that follows a lowercase rune (camelCase).
+//
+// Every occurrence is examined, not just the first. An empty query matches at
+// the start of text and therefore reports true.
+//
+// The camelCase rule inspects the casing of text itself, so it can only apply
+// when text is passed with its original casing (not pre-lowercased).
+func containsWordBoundary(text, query string) bool {
+	// Compare rune by rune: lowercasing can change a character's byte length,
+	// so byte offsets into a lowercased copy do not line up with text.
+	original := []rune(text)
+	lowered := make([]rune, len(original))
+	for i, r := range original {
+		lowered[i] = unicode.ToLower(r)
+	}
+	needle := []rune(strings.ToLower(query))
+
+	// matchesAt reports whether needle occurs in lowered at rune offset start.
+	matchesAt := func(start int) bool {
+		for k, r := range needle {
+			if lowered[start+k] != r {
+				return false
+			}
+		}
+		return true
+	}
+
+	for start := 0; start+len(needle) <= len(lowered); start++ {
+		if !matchesAt(start) {
+			continue
+		}
+		if start == 0 {
+			return true
+		}
+
+		prev := original[start-1]
+		if !unicode.IsLetter(prev) && !unicode.IsDigit(prev) {
+			return true
+		}
+		// start > 0 implies needle is non-empty here (an empty needle already
+		// returned at start 0), so original[start] is in range.
+		if unicode.IsLower(prev) && unicode.IsUpper(original[start]) {
+			return true
+		}
+	}
+
+	return false
+}
+
+// Helper functions
+
+// min returns the smallest of values, or 0 when called with no arguments.
+//
+// NOTE(review): on Go 1.21+ this package-level declaration shadows the
+// built-in min throughout the package.
+func min(values ...int) int {
+	smallest := 0
+	for i, v := range values {
+		if i == 0 || v < smallest {
+			smallest = v
+		}
+	}
+	return smallest
+}
+
+// max returns the largest of values, or 0 when called with no arguments.
+//
+// NOTE(review): on Go 1.21+ this package-level declaration shadows the
+// built-in max throughout the package.
+func max(values ...int) int {
+	largest := 0
+	for i, v := range values {
+		if i == 0 || v > largest {
+			largest = v
+		}
+	}
+	return largest
+}
+
+// abs returns the absolute value of x.
+func abs(x int) int {
+	if x >= 0 {
+		return x
+	}
+	return -x
+}