Files
compaction-mcp/index.go
T
lukaszraczylo dded4ec04c Add release infrastructure and complete implementation
- Dockerfile: distroless container for MCP server
- GoReleaser: multi-platform binary and Docker builds with cosign signing
- GitHub Actions: release workflow using shared actions
- Semver config for automatic version calculation
- Persistence layer, content indexing, and improved tool handlers
2026-03-07 18:31:00 +00:00

191 lines
4.5 KiB
Go

package main
import (
"math"
"regexp"
"sort"
"strings"
"unicode"
)
// Index is a BM25 inverted index for full-text search over stored documents.
//
// All state lives in plain maps keyed by document ID or term; Add and Remove
// keep them consistent with each other.
// NOTE(review): no locking is visible in this type — callers presumably
// serialize access; confirm before using it from multiple goroutines.
type Index struct {
	docs map[string]map[string]int // docID -> term -> frequency of that term in the document content
	docLen map[string]int // docID -> total number of content tokens kept by tokenize
	postings map[string]map[string]struct{} // term -> set of docIDs whose content contains the term
	docTags map[string]map[string]struct{} // docID -> set of tokenized tag terms (boosted 5x in Search)
	n int // number of documents currently indexed
	avgDL float64 // mean of docLen over all documents; 0 when the index is empty
}
// SearchResult holds a document ID and its BM25 relevance score.
// Search returns values of this type, one per matching document.
type SearchResult struct {
	ID string // document ID, as stored in the index
	Score float64 // accumulated score over all query terms, including tag boosts
}
// NewIndex returns a ready-to-use, empty BM25 index. Every internal map is
// allocated up front so Add can write to them immediately; the document count
// and average document length start at zero.
func NewIndex() *Index {
	idx := new(Index)
	idx.docs = map[string]map[string]int{}
	idx.docLen = map[string]int{}
	idx.postings = map[string]map[string]struct{}{}
	idx.docTags = map[string]map[string]struct{}{}
	return idx
}
// Add indexes the document id, replacing any version previously stored under
// the same ID.
//
// content is tokenized into per-term frequencies and registered in the posting
// lists. tags are tokenized into a separate per-document set; they do not
// contribute to the document length or to the posting lists, and Search gives
// matches against them a 5x boost.
func (idx *Index) Add(id, content string, tags []string) {
	// Drop any earlier version first so stale postings and lengths do not
	// linger. Remove returns immediately when id is not indexed.
	idx.Remove(id)

	words := tokenize(content)
	freq := make(map[string]int, len(words))
	for _, w := range words {
		freq[w]++
	}
	idx.docs[id] = freq
	idx.docLen[id] = len(words)

	// Register the document under every distinct term it contains.
	for term := range freq {
		set := idx.postings[term]
		if set == nil {
			set = make(map[string]struct{})
			idx.postings[term] = set
		}
		set[id] = struct{}{}
	}

	// Tags go through the same tokenizer as content, so a multi-word or
	// camelCase tag contributes each of its terms.
	tagTerms := make(map[string]struct{}, len(tags))
	for i := range tags {
		for _, w := range tokenize(tags[i]) {
			tagTerms[w] = struct{}{}
		}
	}
	idx.docTags[id] = tagTerms

	idx.n++
	idx.recalcAvgDL()
}
// Remove drops the document id from the index. Unknown IDs are ignored.
//
// The document is taken out of every posting list it appears in (posting lists
// left empty are deleted outright), its per-document records are discarded,
// and the document count and average length are updated.
func (idx *Index) Remove(id string) {
	freq, found := idx.docs[id]
	if !found {
		return
	}

	for term := range freq {
		set, ok := idx.postings[term]
		if !ok {
			continue
		}
		delete(set, id)
		if len(set) == 0 {
			// Keep postings free of empty sets so a lookup hit always
			// means at least one document contains the term.
			delete(idx.postings, term)
		}
	}

	delete(idx.docTags, id)
	delete(idx.docLen, id)
	delete(idx.docs, id)

	idx.n--
	idx.recalcAvgDL()
}
// Search returns the documents matching query, ranked by descending BM25
// score. Documents with equal scores are ordered by ascending ID so that the
// result order (and which documents survive the limit cut-off) is the same on
// every call.
//
// The query is split with tokenize. For each query term a document earns:
//   - the BM25 score of the term in its content, if the content contains it;
//   - 5x the BM25 score of a single occurrence (TF = 1), if one of its tags
//     contains the term. This applies even when no document content contains
//     the term, so a document can be found by its tags alone.
//
// The IDF of a term is derived from content postings only; tags do not affect
// document frequency.
//
// limit caps the number of results; a value <= 0 returns every match.
// Search returns nil when the query yields no terms or the index is empty,
// and an empty non-nil slice when nothing matches.
func (idx *Index) Search(query string, limit int) []SearchResult {
	terms := tokenize(query)
	if len(terms) == 0 || idx.n == 0 {
		return nil
	}

	const (
		k1       = 1.2
		b        = 0.75
		tagBoost = 5.0
	)

	// tfWeight is the length-normalized term-frequency part of BM25 for a
	// term occurring tf times in docID.
	tfWeight := func(tf float64, docID string) float64 {
		// avgDL is zero when every indexed document has empty content.
		// Treat all documents as average length then, rather than
		// computing 0/0 and poisoning the score with NaN.
		ratio := 1.0
		if idx.avgDL > 0 {
			ratio = float64(idx.docLen[docID]) / idx.avgDL
		}
		return tf * (k1 + 1) / (tf + k1*(1-b+b*ratio))
	}

	scores := make(map[string]float64)
	for _, term := range terms {
		// docSet is nil when no content contains the term. Do not skip the
		// term in that case: it may still match tags below.
		docSet := idx.postings[term]
		df := float64(len(docSet))
		idf := math.Log((float64(idx.n)-df+0.5)/(df+0.5) + 1.0)

		for docID := range docSet {
			tf := float64(idx.docs[docID][term])
			scores[docID] += idf * tfWeight(tf, docID)
		}

		// Tag boost: a tag hit counts as a single occurrence, weighted 5x.
		for docID, tagSet := range idx.docTags {
			if _, hit := tagSet[term]; hit {
				scores[docID] += tagBoost * idf * tfWeight(1.0, docID)
			}
		}
	}

	results := make([]SearchResult, 0, len(scores))
	for id, score := range scores {
		results = append(results, SearchResult{ID: id, Score: score})
	}

	// Map iteration order is random and sort.Slice is not stable, so ties
	// need an explicit secondary key to produce a reproducible ranking.
	sort.Slice(results, func(i, j int) bool {
		if results[i].Score != results[j].Score {
			return results[i].Score > results[j].Score
		}
		return results[i].ID < results[j].ID
	})

	if limit > 0 && len(results) > limit {
		results = results[:limit]
	}
	return results
}
// recalcAvgDL recomputes avgDL as the mean content length over all indexed
// documents. An empty index gets an average of zero, which also avoids
// dividing by a zero document count.
func (idx *Index) recalcAvgDL() {
	idx.avgDL = 0
	if idx.n == 0 {
		return
	}

	var sum int
	for id := range idx.docLen {
		sum += idx.docLen[id]
	}
	idx.avgDL = float64(sum) / float64(idx.n)
}
// camelRe finds a lowercase ASCII letter immediately followed by an uppercase
// ASCII letter, i.e. the hump boundary in identifiers such as "handleCompact".
var camelRe = regexp.MustCompile(`([a-z])([A-Z])`)

// tokenize breaks s into lowercase search terms.
//
// camelCase humps are separated first, then the text is lowercased and split
// on every rune that is neither a letter nor a digit, which covers snake_case,
// punctuation and whitespace. Terms whose length is under 2 bytes are dropped;
// the length is measured with len, so a single multi-byte letter is kept.
// The result is never nil, even when no term survives.
func tokenize(s string) []string {
	// The split must happen before lowercasing, otherwise the case
	// boundary camelRe looks for is gone.
	spaced := camelRe.ReplaceAllString(s, "${1} ${2}")
	lowered := strings.ToLower(spaced)

	isSeparator := func(r rune) bool {
		return !(unicode.IsLetter(r) || unicode.IsDigit(r))
	}
	fields := strings.FieldsFunc(lowered, isSeparator)

	kept := make([]string, 0, len(fields))
	for _, f := range fields {
		if len(f) < 2 {
			continue
		}
		kept = append(kept, f)
	}
	return kept
}