// compaction-mcp/index.go

package main

import (
	"math"
	"regexp"
	"sort"
	"strings"
	"unicode"
)

// Index is a BM25 inverted index for full-text search over stored documents.
//
// Terms taken from document content live in docs and postings; terms taken
// from tags are kept apart in docTags so that Search can weight them
// differently. The struct holds no lock of its own, and its maps are only
// allocated by NewIndex.
type Index struct {
	docs     map[string]map[string]int      // docID -> term -> frequency
	docLen   map[string]int                 // docID -> total terms
	postings map[string]map[string]struct{} // term -> set of docIDs
	docTags  map[string]map[string]struct{} // docID -> tag set (boosted 5x)
	n        int                            // number of indexed documents (Add increments, Remove decrements)
	avgDL    float64                        // mean docLen across all documents; 0 when the index is empty
}

// SearchResult holds a document ID and its BM25 relevance score.
// Search returns these ordered from highest to lowest Score.
type SearchResult struct {
	// ID is the identifier the document was registered under via Add.
	ID string
	// Score is the sum, over the query terms, of the document's BM25
	// contribution plus any tag boost.
	Score float64
}

// NewIndex returns an empty index whose lookup tables are all allocated, so
// it is immediately ready for Add, Remove and Search. The document count and
// the average document length start at zero.
func NewIndex() *Index {
	idx := new(Index)
	idx.docs = map[string]map[string]int{}
	idx.docLen = map[string]int{}
	idx.postings = map[string]map[string]struct{}{}
	idx.docTags = map[string]map[string]struct{}{}
	return idx
}

// Add indexes a document with the given content and tags, replacing any
// document previously stored under the same id.
//
// Content is tokenized into per-term frequencies and registered in the
// postings lists. Tags are tokenized too, but kept in a separate per-document
// set; they receive a 5x score boost during search and do not count towards
// the document's length.
func (idx *Index) Add(id, content string, tags []string) {
	// Remove is a no-op for an unknown id, so calling it unconditionally
	// clears stale data exactly when a previous version exists.
	idx.Remove(id)

	words := tokenize(content)
	counts := make(map[string]int, len(words))
	for i := range words {
		counts[words[i]]++
	}

	tagTerms := make(map[string]struct{}, len(tags))
	for _, raw := range tags {
		for _, piece := range tokenize(raw) {
			tagTerms[piece] = struct{}{}
		}
	}

	idx.docs[id] = counts
	idx.docLen[id] = len(words)
	idx.docTags[id] = tagTerms

	for term := range counts {
		bucket := idx.postings[term]
		if bucket == nil {
			bucket = make(map[string]struct{})
			idx.postings[term] = bucket
		}
		bucket[id] = struct{}{}
	}

	idx.n++
	idx.recalcAvgDL()
}

// Remove deletes a document from the index: its term frequencies, length,
// tags and every postings entry that points at it. Postings lists left empty
// are dropped so the term no longer appears in the index. Removing an id
// that is not indexed does nothing.
func (idx *Index) Remove(id string) {
	terms, known := idx.docs[id]
	if !known {
		return
	}

	for term := range terms {
		bucket, found := idx.postings[term]
		if !found {
			continue
		}
		delete(bucket, id)
		if len(bucket) == 0 {
			delete(idx.postings, term)
		}
	}

	delete(idx.docTags, id)
	delete(idx.docLen, id)
	delete(idx.docs, id)

	idx.n--
	idx.recalcAvgDL()
}

// Search returns the top `limit` documents ranked by BM25 score for the query.
//
// The query is tokenized with the same rules as indexed content. Every query
// term adds, with k1 = 1.2 and b = 0.75:
//
//   - its BM25 score to each document whose content contains the term, and
//   - tagBoost (5x) times the BM25 score of a single occurrence to each
//     document whose tags contain the term.
//
// The IDF of a term is derived from the number of documents whose content
// contains it. A term that appears only in tags therefore still scores, using
// a document frequency of zero.
//
// Results are sorted by descending score; equal scores are ordered by
// ascending ID so the output is deterministic. A limit <= 0 returns every
// match. Search returns nil when the query yields no terms or the index is
// empty.
func (idx *Index) Search(query string, limit int) []SearchResult {
	terms := tokenize(query)
	if len(terms) == 0 || idx.n == 0 {
		return nil
	}

	const (
		k1       = 1.2
		b        = 0.75
		tagBoost = 5.0
	)

	// lengthNorm is the document-length part of the BM25 denominator.
	// When avgDL is 0 every document is empty, so each one is treated as
	// being of average length instead of dividing 0 by 0.
	lengthNorm := func(docID string) float64 {
		ratio := 1.0
		if idx.avgDL > 0 {
			ratio = float64(idx.docLen[docID]) / idx.avgDL
		}
		return k1 * (1 - b + b*ratio)
	}

	scores := make(map[string]float64)

	for _, term := range terms {
		// docSet is nil when no content contains the term. Do not skip the
		// term in that case: documents may still carry it as a tag.
		docSet := idx.postings[term]
		df := float64(len(docSet))
		idf := math.Log((float64(idx.n)-df+0.5)/(df+0.5) + 1.0)

		for docID := range docSet {
			tfVal := float64(idx.docs[docID][term])
			num := tfVal * (k1 + 1)
			denom := tfVal + lengthNorm(docID)
			scores[docID] += idf * (num / denom)
		}

		// Tag boost: add 5x the IDF-weighted score for docs whose tags match.
		for docID, tagSet := range idx.docTags {
			if _, hit := tagSet[term]; !hit {
				continue
			}
			// Use a synthetic TF of 1 for tag matches.
			num := 1.0 * (k1 + 1)
			denom := 1.0 + lengthNorm(docID)
			scores[docID] += tagBoost * idf * (num / denom)
		}
	}

	results := make([]SearchResult, 0, len(scores))
	for id, score := range scores {
		results = append(results, SearchResult{ID: id, Score: score})
	}

	// Map iteration order is random and sort.Slice is not stable, so break
	// score ties on ID to keep the ranking reproducible.
	sort.Slice(results, func(i, j int) bool {
		if results[i].Score != results[j].Score {
			return results[i].Score > results[j].Score
		}
		return results[i].ID < results[j].ID
	})

	if limit > 0 && len(results) > limit {
		results = results[:limit]
	}
	return results
}

// recalcAvgDL recomputes avgDL as the mean of all stored document lengths,
// or sets it to 0 when the index holds no documents. It walks every entry of
// docLen on each call.
func (idx *Index) recalcAvgDL() {
	mean := 0.0
	if idx.n != 0 {
		var sum int
		for id := range idx.docLen {
			sum += idx.docLen[id]
		}
		mean = float64(sum) / float64(idx.n)
	}
	idx.avgDL = mean
}

// camelRe matches a lowercase ASCII letter immediately followed by an
// uppercase ASCII letter, i.e. the word boundary inside a camelCase
// identifier (e.g. "handleCompact").
var camelRe = regexp.MustCompile(`([a-z])([A-Z])`)

// tokenize splits text into lowercase terms, handling camelCase and snake_case.
//
// A space is inserted at every lowercase-to-uppercase ASCII boundary, the
// text is lowercased, and it is then split on every rune that is neither a
// letter nor a digit. Runs of capitals such as "HTTPServer" contain no such
// boundary and stay a single token.
//
// Tokens shorter than 2 characters are filtered out. Length is measured in
// runes, so a lone non-ASCII letter is dropped just like a lone ASCII one.
// The returned slice is non-nil and may be empty.
func tokenize(s string) []string {
	// Split camelCase: insert space at lowercase-to-uppercase boundary.
	s = camelRe.ReplaceAllString(s, "${1} ${2}")

	// Split on any non-letter, non-digit character (handles snake_case,
	// punctuation, whitespace).
	isSeparator := func(r rune) bool {
		return !unicode.IsLetter(r) && !unicode.IsDigit(r)
	}
	parts := strings.FieldsFunc(strings.ToLower(s), isSeparator)

	tokens := make([]string, 0, len(parts))
	for _, p := range parts {
		// len(p) would count bytes and let single multi-byte characters
		// through; count runes instead.
		if len([]rune(p)) >= 2 {
			tokens = append(tokens, p)
		}
	}
	return tokens
}