claude-mnemonic/internal/worker/handlers_import_export.go

// Package worker provides import, export, and archive HTTP handlers.
package worker

import (
	"context"
	"database/sql"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"sort"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/go-chi/chi/v5"
	"github.com/lukaszraczylo/claude-mnemonic/internal/db/gorm"
	"github.com/lukaszraczylo/claude-mnemonic/pkg/models"
	"github.com/lukaszraczylo/claude-mnemonic/pkg/similarity"
	"github.com/rs/zerolog/log"
)

// BulkImportRequest is the JSON request body for bulk observation import.
//
// Project names the project the observations are stored under and must be
// non-empty. Observations is the batch to import; handleBulkImport requires
// between 1 and 100 entries.
type BulkImportRequest struct {
	Project      string                 `json:"project"`
	Observations []BulkObservationInput `json:"observations"`
}

// BulkObservationInput represents a single observation in a bulk import.
//
// Type must be a valid observation type (checked with IsValidObservationType).
// The remaining fields are copied unchanged into the stored observation;
// Title, Subtitle and Narrative are additionally used to detect duplicates
// within the same batch.
type BulkObservationInput struct {
	Type          string   `json:"type"`
	Title         string   `json:"title"`
	Subtitle      string   `json:"subtitle,omitempty"`
	Narrative     string   `json:"narrative,omitempty"`
	Scope         string   `json:"scope,omitempty"`
	Facts         []string `json:"facts,omitempty"`
	Concepts      []string `json:"concepts,omitempty"`
	FilesRead     []string `json:"files_read,omitempty"`
	FilesModified []string `json:"files_modified,omitempty"`
}

// BulkImportResponse contains the result of a bulk import operation.
//
// Imported counts the observations that were stored. Failed counts entries
// rejected for an invalid type or a storage error, each with a message in
// Errors. SkippedDuplicates counts entries dropped because they were too
// similar to an earlier entry of the same batch.
type BulkImportResponse struct {
	Errors            []string `json:"errors,omitempty"`
	Imported          int      `json:"imported"`
	Failed            int      `json:"failed"`
	SkippedDuplicates int      `json:"skipped_duplicates,omitempty"`
}

// handleBulkImport handles bulk import of observations.
// This is useful for migrating data or importing observations from external sources.
//
// The request body is a BulkImportRequest holding a non-empty project name and
// between 1 and 100 observations. Entries with an invalid type or a storage
// error are counted as failed and reported in the response's error list;
// entries that are too similar to an earlier entry of the same batch are
// skipped as duplicates. Stored observations are handed to the vector store
// asynchronously when one is configured.
//
// Responds with 429 when the bulk-operation limiter refuses the call, 400 for
// an invalid request, 500 when the import session cannot be created, and
// otherwise writes a BulkImportResponse via writeJSON.
func (s *Service) handleBulkImport(w http.ResponseWriter, r *http.Request) {
	const (
		// maxBatchSize limits the batch to prevent overwhelming the system.
		maxBatchSize = 100
		// dedupThreshold is the similarity above which an observation is
		// considered a duplicate of one already imported from this batch.
		dedupThreshold = 0.7
	)

	// Rate limit bulk operations to prevent DoS
	if s.bulkOpLimiter != nil && !s.bulkOpLimiter.CanExecute() {
		remaining := s.bulkOpLimiter.CooldownRemaining()
		http.Error(w, fmt.Sprintf("bulk import rate limited, retry in %d seconds", remaining), http.StatusTooManyRequests)
		return
	}

	var req BulkImportRequest
	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
		http.Error(w, "invalid request body: "+err.Error(), http.StatusBadRequest)
		return
	}

	if req.Project == "" {
		http.Error(w, "project is required", http.StatusBadRequest)
		return
	}

	// Validate project name to prevent path traversal
	if err := ValidateProjectName(req.Project); err != nil {
		http.Error(w, err.Error(), http.StatusBadRequest)
		return
	}

	if len(req.Observations) == 0 {
		http.Error(w, "at least one observation is required", http.StatusBadRequest)
		return
	}

	if len(req.Observations) > maxBatchSize {
		http.Error(w, fmt.Sprintf("batch size exceeds maximum of %d", maxBatchSize), http.StatusBadRequest)
		return
	}

	ctx := r.Context()

	// Create a synthetic session for bulk import
	sessionID, err := s.sessionStore.CreateSDKSession(ctx, fmt.Sprintf("bulk-import-%d", time.Now().UnixMilli()), req.Project, "bulk import")
	if err != nil {
		http.Error(w, "failed to create import session: "+err.Error(), http.StatusInternalServerError)
		return
	}

	var imported, failed, skippedDupes int
	var errors []string

	// Track imported observations for deduplication within the batch
	importedObs := make([]*models.Observation, 0, len(req.Observations))

	for i, obsInput := range req.Observations {
		// Validate observation type using O(1) map lookup
		if !IsValidObservationType(obsInput.Type) {
			failed++
			errors = append(errors, fmt.Sprintf("observation %d: invalid type '%s'", i, obsInput.Type))
			continue
		}

		// Build parsed observation
		parsedObs := &models.ParsedObservation{
			Type:          models.ObservationType(obsInput.Type),
			Title:         obsInput.Title,
			Subtitle:      obsInput.Subtitle,
			Facts:         obsInput.Facts,
			Narrative:     obsInput.Narrative,
			Concepts:      obsInput.Concepts,
			FilesRead:     obsInput.FilesRead,
			FilesModified: obsInput.FilesModified,
			Scope:         models.ObservationScope(obsInput.Scope),
		}

		// Convert to temporary observation for similarity check; only the
		// text fields are populated because nothing else is compared here.
		tempObs := &models.Observation{
			Title:     sql.NullString{String: parsedObs.Title, Valid: parsedObs.Title != ""},
			Subtitle:  sql.NullString{String: parsedObs.Subtitle, Valid: parsedObs.Subtitle != ""},
			Narrative: sql.NullString{String: parsedObs.Narrative, Valid: parsedObs.Narrative != ""},
		}

		// Check for duplicates within this import batch
		if similarity.IsSimilarToAny(tempObs, importedObs, dedupThreshold) {
			skippedDupes++
			continue
		}

		// Store observation
		obsID, _, err := s.observationStore.StoreObservation(
			ctx,
			fmt.Sprintf("bulk-import-%d", sessionID),
			req.Project,
			parsedObs,
			0, // prompt number
			0, // discovery tokens
		)
		if err != nil {
			failed++
			errors = append(errors, fmt.Sprintf("observation %d: %v", i, err))
			continue
		}

		// Sync to vector DB asynchronously with rate limiting
		if s.vectorSync != nil {
			s.asyncVectorSync(func() {
				// Use service context as parent to respect shutdown signals;
				// the request context may already be done when this runs.
				syncCtx, cancel := context.WithTimeout(s.ctx, 10*time.Second)
				defer cancel()
				obs, err := s.observationStore.GetObservationByID(syncCtx, obsID)
				if err == nil && obs != nil {
					if syncErr := s.vectorSync.SyncObservation(syncCtx, obs); syncErr != nil {
						if s.ctx.Err() == nil { // Don't log during shutdown
							log.Debug().Err(syncErr).Int64("id", obsID).Msg("Failed to sync observation during bulk import")
						}
					}
				}
			})
		}

		// Track for deduplication of subsequent observations in this batch
		importedObs = append(importedObs, tempObs)
		imported++
	}

	log.Info().
		Str("project", req.Project).
		Int("imported", imported).
		Int("failed", failed).
		Int("skipped_duplicates", skippedDupes).
		Msg("Bulk import completed")

	// Invalidate observation count cache after import. The project is
	// guaranteed to be non-empty by the validation above, so only that
	// project's cache entry needs to be dropped.
	if imported > 0 {
		s.invalidateObsCountCache(req.Project)
	}

	// Broadcast observation event for dashboard refresh
	s.sseBroadcaster.Broadcast(map[string]any{
		"type":    "observation",
		"action":  "bulk_import",
		"project": req.Project,
		"count":   imported,
	})

	writeJSON(w, BulkImportResponse{
		Imported:          imported,
		Failed:            failed,
		SkippedDuplicates: skippedDupes,
		Errors:            errors,
	})
}

// ArchiveRequest is the request body for archiving observations.
//
// When IDs is non-empty those observations are archived individually and
// MaxAgeDays is ignored. Otherwise at least one of Project or MaxAgeDays must
// be set and archival is delegated to ArchiveOldObservations. Reason is passed
// through to the store in both modes.
type ArchiveRequest struct {
	Project    string  `json:"project,omitempty"`
	Reason     string  `json:"reason,omitempty"`
	IDs        []int64 `json:"ids,omitempty"`
	MaxAgeDays int     `json:"max_age_days,omitempty"`
}

// handleArchiveObservations archives observations by ID or by age.
// Supports batch archival with error tracking per observation.
//
// With a non-empty "ids" list every ID is archived individually; batches larger
// than five IDs are processed by a fixed pool of five workers so that neither
// the database nor the scheduler is flooded, and results are reported in
// request order. Without IDs, a project and/or max_age_days selects the
// age-based path, which fails the whole request with 500 on error.
//
// The JSON response always carries "archived_count" and "archived_ids" (an
// empty array when nothing was archived); "failed_count", "failed_ids" and
// "errors" are added when at least one ID failed.
func (s *Service) handleArchiveObservations(w http.ResponseWriter, r *http.Request) {
	// maxWorkers bounds concurrent archive calls to avoid overwhelming the database.
	const maxWorkers = 5

	var req ArchiveRequest
	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
		http.Error(w, "invalid request body: "+err.Error(), http.StatusBadRequest)
		return
	}

	ctx := r.Context()
	byID := len(req.IDs) > 0

	var archivedIDs, failedIDs []int64
	var errors []string

	switch {
	case byID:
		// outcomes[i] holds the result for req.IDs[i]; every worker writes to
		// distinct indices, so no extra locking is needed.
		outcomes := make([]error, len(req.IDs))

		if len(req.IDs) > maxWorkers {
			jobs := make(chan int)
			var wg sync.WaitGroup
			for n := 0; n < maxWorkers; n++ {
				wg.Add(1)
				go func() {
					defer wg.Done()
					for idx := range jobs {
						outcomes[idx] = s.observationStore.ArchiveObservation(ctx, req.IDs[idx], req.Reason)
					}
				}()
			}
			for idx := range req.IDs {
				jobs <- idx
			}
			close(jobs)
			wg.Wait()
		} else {
			// Sequential for small batches
			for idx, id := range req.IDs {
				outcomes[idx] = s.observationStore.ArchiveObservation(ctx, id, req.Reason)
			}
		}

		for idx, archErr := range outcomes {
			id := req.IDs[idx]
			if archErr != nil {
				log.Warn().Err(archErr).Int64("id", id).Msg("Failed to archive observation")
				failedIDs = append(failedIDs, id)
				errors = append(errors, fmt.Sprintf("id %d: %v", id, archErr))
				continue
			}
			archivedIDs = append(archivedIDs, id)
		}

	case req.Project != "" || req.MaxAgeDays > 0:
		// Archive by age
		var err error
		archivedIDs, err = s.observationStore.ArchiveOldObservations(ctx, req.Project, req.MaxAgeDays, req.Reason)
		if err != nil {
			http.Error(w, "failed to archive: "+err.Error(), http.StatusInternalServerError)
			return
		}

	default:
		http.Error(w, "either 'ids' or 'project'/'max_age_days' is required", http.StatusBadRequest)
		return
	}

	log.Info().
		Str("project", req.Project).
		Int("archived", len(archivedIDs)).
		Int("failed", len(failedIDs)).
		Msg("Observations archived")

	// Invalidate cache if any observations were archived. Explicit IDs are not
	// checked against req.Project, so they may belong to any project; only the
	// age-based path can safely restrict invalidation to a single project.
	if len(archivedIDs) > 0 {
		if !byID && req.Project != "" {
			s.invalidateObsCountCache(req.Project)
		} else {
			s.invalidateAllObsCountCache()
		}
	}

	// Encode "archived_ids" as [] rather than null when nothing was archived.
	if archivedIDs == nil {
		archivedIDs = []int64{}
	}

	response := map[string]any{
		"archived_count": len(archivedIDs),
		"archived_ids":   archivedIDs,
	}
	if len(failedIDs) > 0 {
		response["failed_count"] = len(failedIDs)
		response["failed_ids"] = failedIDs
		response["errors"] = errors
	}

	writeJSON(w, response)
}

// handleUnarchiveObservation restores a single archived observation.
//
// The observation is identified by the "id" URL parameter, which must parse
// as a base-10 int64 (400 otherwise). A store failure is reported as 500; on
// success the handler writes {"success": true, "id": <id>}.
func (s *Service) handleUnarchiveObservation(w http.ResponseWriter, r *http.Request) {
	obsID, parseErr := strconv.ParseInt(chi.URLParam(r, "id"), 10, 64)
	if parseErr != nil {
		http.Error(w, "invalid observation id", http.StatusBadRequest)
		return
	}

	restoreErr := s.observationStore.UnarchiveObservation(r.Context(), obsID)
	if restoreErr != nil {
		http.Error(w, restoreErr.Error(), http.StatusInternalServerError)
		return
	}

	// The owning project is not known here, so every cached count is dropped.
	s.invalidateAllObsCountCache()

	payload := map[string]any{
		"success": true,
		"id":      obsID,
	}
	writeJSON(w, payload)
}

// handleGetArchivedObservations lists archived observations.
//
// Query parameters: "project" (optional filter passed to the store) and the
// limit parsed by gorm.ParseLimitParam with DefaultObservationsLimit as the
// default. A nil result is written as an empty JSON array; store errors are
// reported as 500.
func (s *Service) handleGetArchivedObservations(w http.ResponseWriter, r *http.Request) {
	var (
		projectFilter = r.URL.Query().Get("project")
		maxResults    = gorm.ParseLimitParam(r, DefaultObservationsLimit)
	)

	archived, fetchErr := s.observationStore.GetArchivedObservations(r.Context(), projectFilter, maxResults)
	switch {
	case fetchErr != nil:
		http.Error(w, fetchErr.Error(), http.StatusInternalServerError)
	case archived == nil:
		// Emit [] instead of null so clients can always iterate the result.
		writeJSON(w, []*models.Observation{})
	default:
		writeJSON(w, archived)
	}
}

// handleGetArchivalStats writes the archival statistics reported by the
// observation store, optionally restricted by the "project" query parameter.
// Store errors are reported as 500.
func (s *Service) handleGetArchivalStats(w http.ResponseWriter, r *http.Request) {
	projectFilter := r.URL.Query().Get("project")

	archivalStats, statsErr := s.observationStore.GetArchivalStats(r.Context(), projectFilter)
	if statsErr == nil {
		writeJSON(w, archivalStats)
		return
	}

	http.Error(w, statsErr.Error(), http.StatusInternalServerError)
}

// handleExportObservations exports observations in JSON or CSV format.
// Supports query parameters: project, format (json/csv), scope, type, limit.
//
// format defaults to "json"; any other value than "json" or "csv" yields 400.
// limit defaults to 1000 and is capped at 5000. The scope and type filters are
// applied in memory after at most limit observations have been fetched, so a
// filtered export can contain fewer rows than limit even when more matching
// observations exist.
//
// The response is sent as an attachment whose filename embeds the export
// timestamp and, when a project is given, a sanitized project name.
func (s *Service) handleExportObservations(w http.ResponseWriter, r *http.Request) {
	// maxProjectInFilename is the longest project-name prefix, in bytes, that
	// is embedded in the download filename.
	const maxProjectInFilename = 50

	query := r.URL.Query()
	project := query.Get("project")
	format := query.Get("format")
	if format == "" {
		format = "json"
	}
	scope := query.Get("scope")                         // project, global, or empty for all
	obsType := query.Get("type")                        // bugfix, feature, etc.
	limit := gorm.ParseLimitParamWithMax(r, 1000, 5000) // Higher limit for exports, capped at 5000

	// Validate format
	if format != "json" && format != "csv" {
		http.Error(w, "format must be 'json' or 'csv'", http.StatusBadRequest)
		return
	}

	// Get observations with filters
	ctx := r.Context()
	var observations []*models.Observation
	var err error

	if project != "" {
		observations, _, err = s.observationStore.GetObservationsByProjectStrictPaginated(ctx, project, limit, 0)
	} else {
		observations, _, err = s.observationStore.GetAllRecentObservationsPaginated(ctx, limit, 0)
	}

	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	// Apply additional filters
	if scope != "" || obsType != "" {
		filtered := make([]*models.Observation, 0, len(observations))
		for _, obs := range observations {
			if scope != "" && string(obs.Scope) != scope {
				continue
			}
			if obsType != "" && string(obs.Type) != obsType {
				continue
			}
			filtered = append(filtered, obs)
		}
		observations = filtered
	}

	// Take the time once so the filename and "exported_at" always agree.
	now := time.Now()

	// Generate filename
	timestamp := now.Format("20060102-150405")
	filename := fmt.Sprintf("observations-%s.%s", timestamp, format)
	if project != "" {
		// Sanitize project name for filename
		sanitized := strings.ReplaceAll(project, "/", "_")
		sanitized = strings.ReplaceAll(sanitized, "\\", "_")
		if len(sanitized) > maxProjectInFilename {
			// Cut at the last rune boundary within the byte budget so a
			// multi-byte UTF-8 character is never split in half.
			cut := 0
			for i := range sanitized {
				if i > maxProjectInFilename {
					break
				}
				cut = i
			}
			sanitized = sanitized[:cut]
		}
		filename = fmt.Sprintf("observations-%s-%s.%s", sanitized, timestamp, format)
	}

	disposition := fmt.Sprintf("attachment; filename=%q", filename)

	switch format {
	case "csv":
		w.Header().Set("Content-Type", "text/csv")
		w.Header().Set("Content-Disposition", disposition)
		s.writeObservationsCSV(w, observations)
	default: // json
		w.Header().Set("Content-Type", "application/json")
		w.Header().Set("Content-Disposition", disposition)
		writeJSON(w, map[string]any{
			"exported_at":  now.Format(time.RFC3339),
			"project":      project,
			"count":        len(observations),
			"observations": observations,
		})
	}
}

// writeObservationsCSV writes observations in CSV format.
// Uses fmt.Fprintf directly to avoid building each row as an intermediate string.
//
// The first line is the column header; each observation becomes one row.
// Concepts and facts are joined with ";" into a single field. Every
// free-text field, including type and scope, goes through escapeCsvField so
// that embedded commas, quotes or line breaks cannot corrupt the row layout.
//
// Writing stops at the first write error (typically a disconnected client);
// the headers are already sent at that point, so the error is only logged.
func (s *Service) writeObservationsCSV(w http.ResponseWriter, observations []*models.Observation) {
	// Write CSV header
	if _, err := io.WriteString(w, "id,type,scope,project,title,subtitle,narrative,concepts,facts,created_at,importance_score\n"); err != nil {
		log.Debug().Err(err).Msg("Failed to write CSV export header")
		return
	}

	for _, obs := range observations {
		// Write directly to avoid string allocation per row
		_, err := fmt.Fprintf(w, "%d,%s,%s,%s,%s,%s,%s,%s,%s,%s,%.2f\n",
			obs.ID,
			escapeCsvField(string(obs.Type)),
			escapeCsvField(string(obs.Scope)),
			escapeCsvField(obs.Project),
			escapeCsvField(obs.Title.String),
			escapeCsvField(obs.Subtitle.String),
			escapeCsvField(obs.Narrative.String),
			escapeCsvField(strings.Join(obs.Concepts, ";")),
			escapeCsvField(strings.Join(obs.Facts, ";")),
			obs.CreatedAt,
			obs.ImportanceScore,
		)
		if err != nil {
			// No point formatting further rows once the writer is broken.
			log.Debug().Err(err).Int64("id", obs.ID).Msg("Failed to write CSV export row")
			return
		}
	}
}

// escapeCsvField returns s in a form that is safe to place between CSV
// separators. Values without a comma, double quote, line feed or carriage
// return are returned unchanged; anything else is wrapped in double quotes
// with each embedded double quote doubled.
func escapeCsvField(s string) string {
	if !strings.ContainsAny(s, ",\"\n\r") {
		return s
	}

	var quoted strings.Builder
	quoted.Grow(len(s) + 2)
	quoted.WriteByte('"')
	quoted.WriteString(strings.ReplaceAll(s, `"`, `""`))
	quoted.WriteByte('"')
	return quoted.String()
}

// BulkStatusRequest represents a request to update status for multiple observations.
//
// Action selects the operation: "supersede", "archive" or "set_feedback".
// Reason is only used by "archive" and Feedback (-1, 0 or 1) only by
// "set_feedback". IDs lists the target observations; handleBulkStatusUpdate
// requires between 1 and 500 of them.
type BulkStatusRequest struct {
	Action   string  `json:"action"`
	Reason   string  `json:"reason,omitempty"`
	IDs      []int64 `json:"ids"`
	Feedback int     `json:"feedback,omitempty"`
}

// handleBulkStatusUpdate applies one status action to a list of observations.
//
// The body is a BulkStatusRequest with 1 to 500 IDs and an action of
// "supersede", "archive" or "set_feedback" (feedback must be -1, 0 or 1).
// Each ID is processed independently: failures are counted and described in
// "errors" while the remaining IDs are still attempted. The response reports
// the action together with the updated and failed counts.
func (s *Service) handleBulkStatusUpdate(w http.ResponseWriter, r *http.Request) {
	var req BulkStatusRequest
	if decodeErr := json.NewDecoder(r.Body).Decode(&req); decodeErr != nil {
		http.Error(w, "invalid request body: "+decodeErr.Error(), http.StatusBadRequest)
		return
	}

	switch idCount := len(req.IDs); {
	case idCount == 0:
		http.Error(w, "ids is required", http.StatusBadRequest)
		return
	case idCount > 500:
		http.Error(w, "maximum 500 ids per request", http.StatusBadRequest)
		return
	}

	ctx := r.Context()

	// Resolve the action to a per-ID operation up front so that a single loop
	// can do the bookkeeping for every action.
	var apply func(id int64) error
	switch req.Action {
	case "supersede":
		apply = func(id int64) error {
			return s.observationStore.MarkAsSuperseded(ctx, id)
		}
	case "archive":
		apply = func(id int64) error {
			return s.observationStore.ArchiveObservation(ctx, id, req.Reason)
		}
	case "set_feedback":
		if req.Feedback < -1 || req.Feedback > 1 {
			http.Error(w, "feedback must be -1, 0, or 1", http.StatusBadRequest)
			return
		}
		apply = func(id int64) error {
			return s.observationStore.UpdateObservationFeedback(ctx, id, req.Feedback)
		}
	default:
		http.Error(w, "action must be 'supersede', 'archive', or 'set_feedback'", http.StatusBadRequest)
		return
	}

	succeeded, failures := 0, 0
	var messages []string
	for _, id := range req.IDs {
		if opErr := apply(id); opErr != nil {
			failures++
			messages = append(messages, fmt.Sprintf("id %d: %v", id, opErr))
			continue
		}
		succeeded++
	}

	// Archiving changes observation counts; the affected projects are not
	// known here, so all cached counts are dropped.
	if req.Action == "archive" && succeeded > 0 {
		s.invalidateAllObsCountCache()
	}

	result := map[string]any{
		"action":  req.Action,
		"updated": succeeded,
		"failed":  failures,
	}
	if len(messages) > 0 {
		result["errors"] = messages
	}

	writeJSON(w, result)
}

// handleFindDuplicates finds potential duplicate observations using similarity clustering.
// Returns groups of similar observations that may be candidates for merging or archival.
//
// Query parameters: "project" (optional), "threshold" (similarity in the open
// interval (0, 1); anything else falls back to 0.6) and the limit parsed by
// gorm.ParseLimitParam with a default of 100.
//
// Grouping is greedy: each not-yet-grouped observation is compared with every
// later one, and those at or above the threshold join its group and are not
// considered again. A group's "similarity" is the lowest similarity of any
// member to the group's first observation.
//
// The response always has the same keys ("duplicate_groups", "total_checked",
// "groups_found", "threshold", "project"), including when fewer than two
// observations were available.
func (s *Service) handleFindDuplicates(w http.ResponseWriter, r *http.Request) {
	query := r.URL.Query()
	project := query.Get("project")
	limit := gorm.ParseLimitParam(r, 100)

	// Parse threshold (default 0.6 = 60% similarity)
	threshold := 0.6
	if thresholdStr := query.Get("threshold"); thresholdStr != "" {
		if t, err := strconv.ParseFloat(thresholdStr, 64); err == nil && t > 0 && t < 1 {
			threshold = t
		}
	}

	// Get recent observations
	ctx := r.Context()
	var observations []*models.Observation
	var err error

	if project != "" {
		observations, _, err = s.observationStore.GetObservationsByProjectStrictPaginated(ctx, project, limit, 0)
	} else {
		observations, _, err = s.observationStore.GetAllRecentObservationsPaginated(ctx, limit, 0)
	}

	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	// Find duplicates using similarity comparison
	type duplicateGroup struct {
		Observations []map[string]any `json:"observations"`
		Similarity   float64          `json:"similarity"`
	}

	// Non-nil so that "duplicate_groups" encodes as [] when nothing is found.
	groups := []duplicateGroup{}
	processed := make(map[int64]bool)

	for i, obs1 := range observations {
		if processed[obs1.ID] {
			continue
		}

		terms1 := similarity.ExtractObservationTerms(obs1)
		if len(terms1) == 0 {
			continue
		}

		group := duplicateGroup{
			Observations: []map[string]any{obs1.ToMap()},
			Similarity:   1.0,
		}

		for j := i + 1; j < len(observations); j++ {
			obs2 := observations[j]
			if processed[obs2.ID] {
				continue
			}

			// NOTE(review): terms for obs2 are re-extracted for every group
			// leader it is compared against; caching them per observation
			// would avoid the repeated work if this shows up in profiles.
			terms2 := similarity.ExtractObservationTerms(obs2)
			sim := similarity.JaccardSimilarity(terms1, terms2)

			if sim >= threshold {
				obsMap := obs2.ToMap()
				obsMap["similarity_to_first"] = sim
				group.Observations = append(group.Observations, obsMap)
				group.Similarity = min(group.Similarity, sim)
				processed[obs2.ID] = true
			}
		}

		if len(group.Observations) > 1 {
			processed[obs1.ID] = true
			groups = append(groups, group)
		}
	}

	// Sort groups by size (largest first); the stable sort keeps groups of
	// equal size in discovery order so the output is predictable.
	sort.SliceStable(groups, func(i, j int) bool {
		return len(groups[i].Observations) > len(groups[j].Observations)
	})

	writeJSON(w, map[string]any{
		"duplicate_groups": groups,
		"total_checked":    len(observations),
		"groups_found":     len(groups),
		"threshold":        threshold,
		"project":          project,
	})
}