mirror of
https://github.com/lukaszraczylo/filepuff-mcp.git
synced 2026-06-05 22:23:50 +00:00
Ho hum.
This commit is contained in:
@@ -0,0 +1,375 @@
|
||||
// Package fuzzy provides fuzzy string matching using Levenshtein distance.
|
||||
package fuzzy
|
||||
|
||||
import (
|
||||
"sort"
|
||||
"strings"
|
||||
"unicode"
|
||||
)
|
||||
|
||||
// Match is one scored candidate produced by Matcher.Match.
type Match struct {
	// Text is the candidate exactly as the caller supplied it (original casing).
	Text string
	// Distance is the Levenshtein distance between the lowercased query and
	// the lowercased candidate.
	Distance int
	// Similarity is Distance normalised by the longer of the two inputs:
	// 1.0 means identical, values near 0.0 mean almost nothing in common.
	Similarity float64
	// Score is the composite ranking value; results are ordered by it,
	// highest first.
	Score float64
}
|
||||
|
||||
// Matcher ranks candidate strings against a query while tolerating small
// differences such as typos.
type Matcher struct {
	// threshold is the largest edit distance still accepted as a fuzzy match.
	threshold int
}
|
||||
|
||||
// New creates a new fuzzy matcher with the given threshold.
|
||||
// Threshold is the maximum edit distance to consider a match (typically 1-3).
|
||||
func New(threshold int) *Matcher {
|
||||
return &Matcher{
|
||||
threshold: threshold,
|
||||
}
|
||||
}
|
||||
|
||||
// Match performs fuzzy matching of query against candidates.
|
||||
func (m *Matcher) Match(query string, candidates []string) []Match {
|
||||
if query == "" {
|
||||
return nil
|
||||
}
|
||||
|
||||
matches := make([]Match, 0, len(candidates)/10)
|
||||
queryLower := strings.ToLower(query)
|
||||
|
||||
for _, candidate := range candidates {
|
||||
candidateLower := strings.ToLower(candidate)
|
||||
|
||||
// Calculate Levenshtein distance
|
||||
dist := levenshteinDistance(queryLower, candidateLower)
|
||||
|
||||
// Skip if distance exceeds threshold
|
||||
if dist > m.threshold {
|
||||
// Check if it's a substring match (important for identifiers)
|
||||
if !strings.Contains(candidateLower, queryLower) {
|
||||
continue
|
||||
}
|
||||
// Allow substring matches even if edit distance is high
|
||||
}
|
||||
|
||||
// Calculate similarity (0.0 to 1.0)
|
||||
maxLen := max(len(query), len(candidate))
|
||||
similarity := 1.0 - float64(dist)/float64(maxLen)
|
||||
|
||||
// Calculate composite score
|
||||
score := m.calculateScore(queryLower, candidateLower, dist, similarity)
|
||||
|
||||
matches = append(matches, Match{
|
||||
Text: candidate,
|
||||
Distance: dist,
|
||||
Similarity: similarity,
|
||||
Score: score,
|
||||
})
|
||||
}
|
||||
|
||||
// Sort by score descending
|
||||
sort.Slice(matches, func(i, j int) bool {
|
||||
return matches[i].Score > matches[j].Score
|
||||
})
|
||||
|
||||
return matches
|
||||
}
|
||||
|
||||
// calculateScore computes a composite score considering multiple factors.
|
||||
func (m *Matcher) calculateScore(query, candidate string, dist int, similarity float64) float64 {
|
||||
score := similarity
|
||||
|
||||
// Bonus for exact match
|
||||
if query == candidate {
|
||||
score += 2.0
|
||||
}
|
||||
|
||||
// Bonus for prefix match (important for identifier search)
|
||||
if strings.HasPrefix(candidate, query) {
|
||||
score += 1.0
|
||||
}
|
||||
|
||||
// Bonus for word boundary matches (e.g., "getName" matches "get")
|
||||
if containsWordBoundary(candidate, query) {
|
||||
score += 0.5
|
||||
}
|
||||
|
||||
// Penalty for length difference (prefer similar-length matches)
|
||||
lenDiff := abs(len(candidate) - len(query))
|
||||
score -= float64(lenDiff) * 0.01
|
||||
|
||||
// Penalty for edit distance
|
||||
score -= float64(dist) * 0.1
|
||||
|
||||
return score
|
||||
}
|
||||
|
||||
// levenshteinDistance returns the minimum number of single-rune insertions,
// deletions and substitutions needed to turn s1 into s2.
//
// Inputs are compared rune by rune, so a multi-byte character counts as one
// edit; this also holds when one side is empty. Comparison is
// case-sensitive. The implementation is the two-row Wagner-Fischer
// algorithm with rows sized by the shorter input: O(len(s1)*len(s2)) time
// and O(min(len(s1), len(s2))) extra space, lengths measured in runes.
func levenshteinDistance(s1, s2 string) int {
	if s1 == s2 {
		return 0
	}

	// The distance is symmetric, so the inputs can be ordered freely; keeping
	// the shorter one as the row dimension minimises memory.
	short, long := []rune(s1), []rune(s2)
	if len(short) > len(long) {
		short, long = long, short
	}
	if len(short) == 0 {
		// Every rune of the other string has to be inserted.
		return len(long)
	}

	prev := make([]int, len(short)+1)
	cur := make([]int, len(short)+1)

	// Row 0: turning the empty prefix into short[:j] takes j insertions.
	for j := range prev {
		prev[j] = j
	}

	for i, lr := range long {
		cur[0] = i + 1

		for j, sr := range short {
			cost := 1
			if lr == sr {
				cost = 0
			}

			cur[j+1] = min(
				prev[j+1]+1,  // deletion
				cur[j]+1,     // insertion
				prev[j]+cost, // substitution
			)
		}

		// The row just filled becomes the reference row for the next rune.
		prev, cur = cur, prev
	}

	return prev[len(short)]
}
|
||||
|
||||
// DamerauLevenshteinDistance returns the edit distance between s1 and s2
// where, in addition to insertion, deletion and substitution, swapping two
// adjacent runes counts as a single edit. This suits typos in which
// neighbouring characters are transposed.
//
// This is the restricted ("optimal string alignment") variant: a transposed
// pair is not edited again, so for example "ca" -> "abc" costs 3.
// Inputs are compared rune by rune and case-sensitively. Time and space are
// both O(len(s1)*len(s2)), lengths measured in runes.
func DamerauLevenshteinDistance(s1, s2 string) int {
	if s1 == s2 {
		return 0
	}

	r1, r2 := []rune(s1), []rune(s2)
	len1, len2 := len(r1), len(r2)

	// With one side empty the distance is the rune count of the other side.
	if len1 == 0 {
		return len2
	}
	if len2 == 0 {
		return len1
	}

	// d[i][j] holds the distance between r1[:i] and r2[:j].
	d := make([][]int, len1+1)
	for i := range d {
		d[i] = make([]int, len2+1)
		d[i][0] = i
	}
	for j := range d[0] {
		d[0][j] = j
	}

	for i := 1; i <= len1; i++ {
		for j := 1; j <= len2; j++ {
			cost := 1
			if r1[i-1] == r2[j-1] {
				cost = 0
			}

			d[i][j] = min(
				d[i-1][j]+1,      // deletion
				d[i][j-1]+1,      // insertion
				d[i-1][j-1]+cost, // substitution
			)

			// Adjacent transposition: the last two runes of each prefix are
			// the same pair in opposite order.
			if i > 1 && j > 1 && r1[i-1] == r2[j-2] && r1[i-2] == r2[j-1] {
				d[i][j] = min(d[i][j], d[i-2][j-2]+cost)
			}
		}
	}

	return d[len1][len2]
}
|
||||
|
||||
// JaroWinklerSimilarity computes Jaro-Winkler similarity (0.0 to 1.0).
|
||||
// Better for short strings and names.
|
||||
func JaroWinklerSimilarity(s1, s2 string) float64 {
|
||||
if s1 == s2 {
|
||||
return 1.0
|
||||
}
|
||||
|
||||
r1 := []rune(s1)
|
||||
r2 := []rune(s2)
|
||||
|
||||
if len(r1) == 0 || len(r2) == 0 {
|
||||
return 0.0
|
||||
}
|
||||
|
||||
// Calculate Jaro similarity first
|
||||
jaro := jaroSimilarity(r1, r2)
|
||||
|
||||
// Calculate common prefix length (up to 4 characters)
|
||||
prefixLen := 0
|
||||
for i := 0; i < min(min(len(r1), len(r2)), 4); i++ {
|
||||
if r1[i] == r2[i] {
|
||||
prefixLen++
|
||||
} else {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// Jaro-Winkler adds bonus for common prefix
|
||||
const p = 0.1
|
||||
return jaro + float64(prefixLen)*p*(1.0-jaro)
|
||||
}
|
||||
|
||||
// jaroSimilarity returns the Jaro similarity of two rune sequences.
//
// Runes "match" when they are equal and their positions differ by at most
// max(len(r1), len(r2))/2 - 1. The result averages the fraction of r1 that
// matched, the fraction of r2 that matched, and the fraction of matches that
// are in the same relative order. It is 0.0 when nothing matches.
func jaroSimilarity(r1, r2 []rune) float64 {
	n1, n2 := len(r1), len(r2)

	// Half-width of the window in which a rune may find its partner.
	longest := n1
	if n2 > longest {
		longest = n2
	}
	window := longest/2 - 1
	if window < 0 {
		window = 0
	}

	used1 := make([]bool, n1)
	used2 := make([]bool, n2)

	// Pair each rune of r1 with the first unused equal rune of r2 that lies
	// inside the window.
	common := 0
	for i, c := range r1 {
		lo := i - window
		if lo < 0 {
			lo = 0
		}
		hi := i + window + 1
		if hi > n2 {
			hi = n2
		}

		for j := lo; j < hi; j++ {
			if !used2[j] && r2[j] == c {
				used1[i], used2[j] = true, true
				common++
				break
			}
		}
	}

	if common == 0 {
		return 0.0
	}

	// Walk both sequences of matched runes in order and count the positions
	// where they disagree.
	outOfOrder := 0
	cursor := 0
	for i, c := range r1 {
		if !used1[i] {
			continue
		}
		for !used2[cursor] {
			cursor++
		}
		if c != r2[cursor] {
			outOfOrder++
		}
		cursor++
	}

	// Each transposition shows up as two disagreeing positions, hence the
	// (integer) halving.
	return (float64(common)/float64(n1) +
		float64(common)/float64(n2) +
		float64(common-outOfOrder/2)/float64(common)) / 3.0
}
|
||||
|
||||
// containsWordBoundary reports whether query occurs in text, ignoring case,
// at a position that starts a word.
//
// An occurrence starts a word when it is at the very beginning of text,
// when the rune before it is neither a letter nor a digit (e.g. '_' or '-'),
// or when it sits on a camelCase boundary, i.e. a lowercase rune followed by
// an uppercase one in the original text. Every occurrence is examined, so
// "get" is found at a boundary in "forget_get". An empty query matches at
// the start of any text.
//
// All work is done on runes: lowercasing maps rune to rune, so a position
// found in the lowercased text refers to the same rune in the original even
// when lowercasing changes byte lengths.
func containsWordBoundary(text, query string) bool {
	original := []rune(text)
	lowered := make([]rune, len(original))
	for i, r := range original {
		lowered[i] = unicode.ToLower(r)
	}
	needle := []rune(strings.ToLower(query))

scan:
	for start := 0; start+len(needle) <= len(lowered); start++ {
		for k, r := range needle {
			if lowered[start+k] != r {
				continue scan
			}
		}

		// Occurrence at the very start of text.
		if start == 0 {
			return true
		}

		// Preceded by a separator such as '_', '-', '.' or a space.
		before := original[start-1]
		if !unicode.IsLetter(before) && !unicode.IsDigit(before) {
			return true
		}

		// camelCase boundary. The needle is non-empty here (an empty needle
		// already matched at start == 0), so original[start] exists.
		if unicode.IsLower(before) && unicode.IsUpper(original[start]) {
			return true
		}
	}

	return false
}
|
||||
|
||||
// Helper functions
|
||||
|
||||
// min returns the smallest of values, or 0 when called with no arguments.
// Within this package it takes precedence over the predeclared min.
func min(values ...int) int {
	smallest := 0
	for i, v := range values {
		if i == 0 || v < smallest {
			smallest = v
		}
	}
	return smallest
}
|
||||
|
||||
// max returns the largest of values, or 0 when called with no arguments.
// Within this package it takes precedence over the predeclared max.
func max(values ...int) int {
	largest := 0
	for i, v := range values {
		if i == 0 || v > largest {
			largest = v
		}
	}
	return largest
}
|
||||
|
||||
// abs returns the absolute value of x.
func abs(x int) int {
	if x >= 0 {
		return x
	}
	return -x
}
|
||||
Reference in New Issue
Block a user