mirror of
https://github.com/lukaszraczylo/graphql-monitoring-proxy.git
synced 2026-06-05 23:03:48 +00:00
c2c75d69c0
Performance / resource usage: - circuit_breaker_metrics: fix data race on failCounters map (RWMutex + double-checked locking) - server.go: drop user_id and op_name metric labels (Prometheus cardinality bound); de-duplicate extractUserInfo - graphql.go: gate runtime.ReadMemStats per-request behind ENABLE_ALLOCATION_TRACKING flag (default off) - graphql.go: collapse two-pass AST scan into single pass; lower-case once - sanitization.go: cache compiled redaction regexes per pattern via sync.Map; hoist inner constants to pkg vars - proxy.go: hoist connection/timeout substrings to pkg vars; sentinel errors for static error paths; drop dead Headers map alloc - metrics_aggregator.go: log-field allocation guarded by Logger.IsLevelEnabled - logging/logger.go: add IsLevelEnabled helper - lru_cache.go: 16-shard sharding, FNV-1a routing (concurrent throughput +22%) - cache/memory/lru_memory_cache.go: gzip compress/decompress moved outside mu.Lock - rps_tracker.go: RWMutex+uint64 -> atomic.Uint64 - retry_budget.go: drop unused mutex - api.go: bannedUsersIDs map+RWMutex -> sync.Map (+ snapshot/replace helpers) - tracing/tracing.go: pkg-level constSpanAttrs, copy-then-append in StartSpanWithAttributes - admin_dashboard.go: handleStatsWebSocket reuses bytes.Buffer + json.Encoder per connection Build / runtime: - Makefile: -ldflags="-s -w" -trimpath, CGO_ENABLED=0 for build (=1 for test recipes) - Dockerfile + Dockerfile.goreleaser: ENV GOMEMLIMIT=512MiB - main.go: blank import go.uber.org/automaxprocs (cgroup-aware GOMAXPROCS) - main.go: PPROF_PORT env var wires net/http/pprof on 127.0.0.1 only with full server timeouts - README.md: env-var docs + metric-label docs updated; cardinality note Test coverage push (per package): - main 51.2% -> 74.7% - cache 66.3% -> 93.7% - cache/redis 45.5% -> 98.2% - tracing 66.7% -> 72.9% - (cache/memory 91.6%, logging 91.9%, monitoring 77.6%, pkg/pools 100% unchanged) New test files: coverage_micro_test, coverage_extras_test, server_handlers_test, api_health_test, admin_dashboard_cluster_test, metrics_aggregator_test, concerns_test, cache/cache_coverage_test, cache/redis/redis_coverage_test, tracing/tracing_coverage_test. Bug fix: connection_resilience_test.go TestIntegratedHealthManagement.health_manager_startup was sync.Once-coupled to InitializeBackendHealth and panicked when another test (e.g. via parseConfig) had already triggered Once. Use NewBackendHealthManager directly.
423 lines
13 KiB
Go
423 lines
13 KiB
Go
package main
|
|
|
|
import (
|
|
"fmt"
|
|
"strconv"
|
|
"time"
|
|
|
|
"github.com/goccy/go-json"
|
|
fiber "github.com/gofiber/fiber/v2"
|
|
"github.com/gofiber/fiber/v2/middleware/cors"
|
|
"github.com/google/uuid"
|
|
|
|
graphql "github.com/lukaszraczylo/go-simple-graphql"
|
|
libpack_cache "github.com/lukaszraczylo/graphql-monitoring-proxy/cache"
|
|
libpack_config "github.com/lukaszraczylo/graphql-monitoring-proxy/config"
|
|
libpack_logger "github.com/lukaszraczylo/graphql-monitoring-proxy/logging"
|
|
libpack_monitoring "github.com/lukaszraczylo/graphql-monitoring-proxy/monitoring"
|
|
)
|
|
|
|
const (
|
|
healthCheckQueryStr = `{ __typename }`
|
|
)
|
|
|
|
// HealthCheckResponse represents the response structure for health check endpoints
|
|
type HealthCheckResponse struct {
|
|
Status string `json:"status"` // overall status: "healthy" or "unhealthy"
|
|
Dependencies map[string]DependencyStatus `json:"dependencies"` // status of each dependency
|
|
Timestamp string `json:"timestamp"` // when the health check was performed
|
|
}
|
|
|
|
// DependencyStatus represents the status of a dependency
|
|
type DependencyStatus struct {
|
|
Error *string `json:"error,omitempty"`
|
|
Status string `json:"status"`
|
|
ResponseTime int64 `json:"responseTime"`
|
|
}
|
|
|
|
// StartHTTPProxy initializes and starts the HTTP proxy server.
|
|
func StartHTTPProxy() error {
|
|
cfg.Logger.Debug(&libpack_logger.LogMessage{
|
|
Message: "Starting the HTTP proxy",
|
|
})
|
|
|
|
serverConfig := fiber.Config{
|
|
DisableStartupMessage: true,
|
|
AppName: fmt.Sprintf("GraphQL Monitoring Proxy - %s v%s", libpack_config.PKG_NAME, libpack_config.PKG_VERSION),
|
|
IdleTimeout: time.Duration(cfg.Client.ClientTimeout) * time.Second,
|
|
ReadTimeout: time.Duration(cfg.Client.ClientTimeout) * time.Second,
|
|
WriteTimeout: time.Duration(cfg.Client.ClientTimeout) * time.Second,
|
|
JSONEncoder: json.Marshal,
|
|
JSONDecoder: json.Unmarshal,
|
|
}
|
|
|
|
server := fiber.New(serverConfig)
|
|
|
|
server.Use(cors.New(cors.Config{
|
|
AllowOrigins: "*",
|
|
}))
|
|
|
|
server.Use(AddRequestUUID)
|
|
|
|
server.Get("/healthz", healthCheck)
|
|
server.Get("/livez", healthCheck)
|
|
server.Get("/health", healthCheck)
|
|
|
|
// Register admin dashboard routes if enabled
|
|
if cfg.AdminDashboard.Enable {
|
|
adminDash := NewAdminDashboard(cfg.Logger)
|
|
adminDash.RegisterRoutes(server)
|
|
}
|
|
|
|
// WebSocket support - must be registered before catch-all routes
|
|
if cfg.WebSocket.Enable {
|
|
server.Get("/v1/graphql", func(c *fiber.Ctx) error {
|
|
if IsWebSocketRequest(c) {
|
|
wsp := GetWebSocketProxy()
|
|
if wsp != nil {
|
|
return wsp.HandleWebSocket(c)
|
|
}
|
|
}
|
|
return proxyTheRequestToDefault(c)
|
|
})
|
|
}
|
|
|
|
server.Post("/*", processGraphQLRequest)
|
|
server.Get("/*", proxyTheRequestToDefault)
|
|
|
|
cfg.Logger.Info(&libpack_logger.LogMessage{
|
|
Message: "GraphQL proxy starting",
|
|
Pairs: map[string]any{"port": cfg.Server.PortGraphQL},
|
|
})
|
|
|
|
if err := server.Listen(fmt.Sprintf(":%d", cfg.Server.PortGraphQL)); err != nil {
|
|
return fmt.Errorf("failed to start HTTP proxy server on port %d: %w",
|
|
cfg.Server.PortGraphQL, err)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// proxyTheRequestToDefault proxies the request to the default GraphQL endpoint.
|
|
func proxyTheRequestToDefault(c *fiber.Ctx) error {
|
|
return proxyTheRequest(c, cfg.Server.HostGraphQL)
|
|
}
|
|
|
|
// AddRequestUUID adds a unique request UUID to the context.
|
|
func AddRequestUUID(c *fiber.Ctx) error {
|
|
c.Locals("request_uuid", uuid.NewString())
|
|
return c.Next()
|
|
}
|
|
|
|
// checkAllowedURLs checks if the requested URL is allowed.
|
|
func checkAllowedURLs(c *fiber.Ctx) bool {
|
|
if len(allowedUrls) == 0 {
|
|
return true
|
|
}
|
|
path := c.OriginalURL()
|
|
_, ok := allowedUrls[path]
|
|
return ok
|
|
}
|
|
|
|
// healthCheck performs a comprehensive health check on the GraphQL server and its dependencies.
|
|
func healthCheck(c *fiber.Ctx) error {
|
|
// Prepare the response structure
|
|
response := HealthCheckResponse{
|
|
Status: "healthy",
|
|
Dependencies: make(map[string]DependencyStatus),
|
|
Timestamp: time.Now().UTC().Format(time.RFC3339),
|
|
}
|
|
|
|
// Configure checks from query parameters
|
|
checkGraphQL := true
|
|
checkRedis := cfg.Cache.CacheRedisEnable
|
|
|
|
// Parse query parameters to enable/disable specific checks
|
|
if c.Query("check_graphql") == "false" {
|
|
checkGraphQL = false
|
|
}
|
|
if c.Query("check_redis") == "false" {
|
|
checkRedis = false
|
|
}
|
|
|
|
// Check GraphQL backend service
|
|
if checkGraphQL {
|
|
startTime := time.Now()
|
|
graphqlStatus := DependencyStatus{
|
|
Status: "up",
|
|
}
|
|
|
|
// Try to connect to main GraphQL endpoint
|
|
endpoint := cfg.Server.HostGraphQL
|
|
if len(cfg.Server.HealthcheckGraphQL) > 0 {
|
|
endpoint = cfg.Server.HealthcheckGraphQL
|
|
}
|
|
|
|
// Create a new GraphQL client for the health check
|
|
tempClient := graphql.NewConnection()
|
|
tempClient.SetEndpoint(endpoint)
|
|
_, err := tempClient.Query(healthCheckQueryStr, nil, nil)
|
|
|
|
graphqlStatus.ResponseTime = time.Since(startTime).Milliseconds()
|
|
|
|
if err != nil {
|
|
errorMsg := err.Error()
|
|
graphqlStatus.Status = "down"
|
|
graphqlStatus.Error = &errorMsg
|
|
response.Status = "unhealthy"
|
|
|
|
cfg.Logger.Error(&libpack_logger.LogMessage{
|
|
Message: "Health check: Can't reach the GraphQL server",
|
|
Pairs: map[string]any{
|
|
"endpoint": endpoint,
|
|
"error": errorMsg,
|
|
"response_time_ms": graphqlStatus.ResponseTime,
|
|
},
|
|
})
|
|
cfg.Monitoring.Increment(libpack_monitoring.MetricsFailed, nil)
|
|
}
|
|
|
|
response.Dependencies["graphql"] = graphqlStatus
|
|
}
|
|
|
|
// Check Redis connectivity if enabled
|
|
if checkRedis && cfg.Cache.CacheRedisEnable {
|
|
startTime := time.Now()
|
|
redisStatus := DependencyStatus{
|
|
Status: "up",
|
|
}
|
|
|
|
// Implement proper Redis connectivity test
|
|
redisAccessible := false
|
|
var redisError error
|
|
|
|
if libpack_cache.IsCacheInitialized() {
|
|
// Try a simple Redis operation to test connectivity
|
|
testKey := "health_check_test"
|
|
testValue := []byte("test")
|
|
|
|
// Try to set and get a test value
|
|
libpack_cache.CacheStore(testKey, testValue)
|
|
retrievedValue := libpack_cache.CacheLookup(testKey)
|
|
|
|
if retrievedValue != nil && string(retrievedValue) == "test" {
|
|
redisAccessible = true
|
|
// Clean up test key
|
|
libpack_cache.CacheDelete(testKey)
|
|
} else {
|
|
redisError = fmt.Errorf("redis test operation failed")
|
|
}
|
|
} else {
|
|
redisError = fmt.Errorf("cache not initialized")
|
|
}
|
|
|
|
redisStatus.ResponseTime = time.Since(startTime).Milliseconds()
|
|
|
|
if !redisAccessible {
|
|
errorMsg := "Failed to connect to Redis"
|
|
if redisError != nil {
|
|
errorMsg = redisError.Error()
|
|
}
|
|
redisStatus.Status = "down"
|
|
redisStatus.Error = &errorMsg
|
|
response.Status = "unhealthy"
|
|
|
|
cfg.Logger.Error(&libpack_logger.LogMessage{
|
|
Message: "Health check: Can't connect to Redis",
|
|
Pairs: map[string]any{
|
|
"server": cfg.Cache.CacheRedisURL,
|
|
"error": errorMsg,
|
|
"response_time_ms": redisStatus.ResponseTime,
|
|
},
|
|
})
|
|
}
|
|
|
|
response.Dependencies["redis"] = redisStatus
|
|
}
|
|
|
|
// Determine appropriate HTTP status code
|
|
httpStatus := fiber.StatusOK
|
|
if response.Status == "unhealthy" {
|
|
httpStatus = fiber.StatusServiceUnavailable
|
|
}
|
|
|
|
cfg.Logger.Debug(&libpack_logger.LogMessage{
|
|
Message: "Health check completed",
|
|
Pairs: map[string]any{
|
|
"status": response.Status,
|
|
"dependencies": response.Dependencies,
|
|
},
|
|
})
|
|
|
|
// Return JSON response
|
|
return c.Status(httpStatus).JSON(response)
|
|
}
|
|
|
|
// processGraphQLRequest handles the incoming GraphQL requests.
|
|
func processGraphQLRequest(c *fiber.Ctx) error {
|
|
startTime := time.Now()
|
|
|
|
// Extract user information and check permissions
|
|
extractedUserID, extractedRoleName := extractUserInfo(c)
|
|
|
|
// Check if user is banned
|
|
if checkIfUserIsBanned(c, extractedUserID) {
|
|
return c.Status(fiber.StatusForbidden).SendString("User is banned")
|
|
}
|
|
|
|
// Apply rate limiting if enabled
|
|
if cfg.Client.RoleRateLimit && !rateLimitedRequest(extractedUserID, extractedRoleName) {
|
|
return c.Status(fiber.StatusTooManyRequests).SendString("Rate limit exceeded, try again later")
|
|
}
|
|
|
|
// Parse the GraphQL query
|
|
parsedResult := parseGraphQLQuery(c)
|
|
|
|
// Debug logging for mutation routing analysis (enabled when LOG_LEVEL=DEBUG)
|
|
if cfg.LogLevel == "DEBUG" {
|
|
var m map[string]any
|
|
if err := json.Unmarshal(c.Body(), &m); err == nil {
|
|
if query, ok := m["query"].(string); ok {
|
|
debugParseGraphQLQuery(c, query)
|
|
}
|
|
}
|
|
}
|
|
|
|
if parsedResult.shouldBlock {
|
|
return c.Status(fiber.StatusForbidden).SendString("Request blocked")
|
|
}
|
|
|
|
// Handle non-GraphQL requests
|
|
if parsedResult.shouldIgnore {
|
|
return proxyTheRequest(c, parsedResult.activeEndpoint)
|
|
}
|
|
|
|
// Handle caching
|
|
wasCached, err := handleCaching(c, parsedResult, extractedUserID, extractedRoleName)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// Log and monitor the request
|
|
logAndMonitorRequest(c, extractedUserID, parsedResult.operationType, parsedResult.operationName, wasCached, time.Since(startTime), startTime)
|
|
|
|
return nil
|
|
}
|
|
|
|
// extractUserInfo extracts user ID and role from request headers
|
|
func extractUserInfo(c *fiber.Ctx) (string, string) {
|
|
extractedUserID := "-"
|
|
extractedRoleName := "-"
|
|
|
|
// Extract from JWT if available
|
|
if authorization := c.Get("Authorization"); authorization != "" &&
|
|
(len(cfg.Client.JWTUserClaimPath) > 0 || len(cfg.Client.JWTRoleClaimPath) > 0) {
|
|
extractedUserID, extractedRoleName = extractClaimsFromJWTHeader(authorization)
|
|
}
|
|
|
|
// Override role from header if configured
|
|
if cfg.Client.RoleFromHeader != "" {
|
|
if role := c.Get(cfg.Client.RoleFromHeader); role != "" {
|
|
extractedRoleName = role
|
|
}
|
|
}
|
|
|
|
return extractedUserID, extractedRoleName
|
|
}
|
|
|
|
// handleCaching manages the caching logic for GraphQL requests
|
|
func handleCaching(c *fiber.Ctx, parsedResult *parseGraphQLQueryResult, userID, userRole string) (bool, error) {
|
|
// Calculate query hash for cache key - now includes user context for security
|
|
calculatedQueryHash := libpack_cache.CalculateHash(c, userID, userRole)
|
|
|
|
// Set cache time from header or default
|
|
if parsedResult.cacheTime == 0 {
|
|
if cacheQuery := c.Get("X-Cache-Graphql-Query"); cacheQuery != "" {
|
|
parsedResult.cacheTime, _ = strconv.Atoi(cacheQuery)
|
|
} else {
|
|
parsedResult.cacheTime = cfg.Cache.CacheTTL
|
|
}
|
|
}
|
|
|
|
// Handle cache refresh directive
|
|
if parsedResult.cacheRefresh {
|
|
libpack_cache.CacheDelete(calculatedQueryHash)
|
|
}
|
|
|
|
// Check if caching is enabled
|
|
cacheEnabled := parsedResult.cacheRequest || cfg.Cache.CacheEnable || cfg.Cache.CacheRedisEnable
|
|
if !cacheEnabled {
|
|
// No caching, just proxy the request
|
|
if err := proxyTheRequest(c, parsedResult.activeEndpoint); err != nil {
|
|
cfg.Monitoring.Increment(libpack_monitoring.MetricsFailed, nil)
|
|
return false, c.Status(fiber.StatusInternalServerError).SendString("Can't proxy the request - try again later")
|
|
}
|
|
return false, nil
|
|
}
|
|
|
|
// Try to get from cache
|
|
if cachedResponse := libpack_cache.CacheLookup(calculatedQueryHash); cachedResponse != nil {
|
|
cfg.Monitoring.Increment(libpack_monitoring.MetricsCacheHit, nil)
|
|
c.Set("X-Cache-Hit", "true")
|
|
c.Set("Content-Type", "application/json")
|
|
return true, c.Send(cachedResponse)
|
|
}
|
|
|
|
// Cache miss, proxy and cache
|
|
cfg.Monitoring.Increment(libpack_monitoring.MetricsCacheMiss, nil)
|
|
if err := proxyAndCacheTheRequest(c, calculatedQueryHash, parsedResult.cacheTime, parsedResult.activeEndpoint); err != nil {
|
|
return false, err
|
|
}
|
|
|
|
return false, nil
|
|
}
|
|
|
|
// proxyAndCacheTheRequest proxies and caches the request if needed.
|
|
func proxyAndCacheTheRequest(c *fiber.Ctx, queryCacheHash string, cacheTime int, currentEndpoint string) error {
|
|
if err := proxyTheRequest(c, currentEndpoint); err != nil {
|
|
cfg.Logger.Error(&libpack_logger.LogMessage{
|
|
Message: "Can't proxy the request",
|
|
Pairs: map[string]any{"error": err.Error()},
|
|
})
|
|
cfg.Monitoring.Increment(libpack_monitoring.MetricsFailed, nil)
|
|
return c.Status(fiber.StatusInternalServerError).SendString("Can't proxy the request - try again later")
|
|
}
|
|
|
|
libpack_cache.CacheStoreWithTTL(queryCacheHash, c.Response().Body(), time.Duration(cacheTime)*time.Second)
|
|
cfg.Monitoring.Increment(libpack_monitoring.MetricsQueriesCached, nil)
|
|
return c.Send(c.Response().Body())
|
|
}
|
|
|
|
// logAndMonitorRequest logs and monitors the request processing.
|
|
func logAndMonitorRequest(c *fiber.Ctx, userID, opType, opName string, wasCached bool, duration time.Duration, startTime time.Time) {
|
|
// Low-cardinality labels only: user_id and op_name dropped to prevent Prometheus explosion.
|
|
labels := map[string]string{
|
|
"op_type": opType,
|
|
"cached": strconv.FormatBool(wasCached),
|
|
}
|
|
|
|
if cfg.Server.AccessLog {
|
|
cfg.Logger.Info(&libpack_logger.LogMessage{
|
|
Message: "Request processed",
|
|
Pairs: map[string]any{
|
|
"ip": c.IP(),
|
|
"fwd-ip": c.Get("X-Forwarded-For"),
|
|
"user_id": userID,
|
|
"op_type": opType,
|
|
"op_name": opName,
|
|
"time": duration,
|
|
"cache": wasCached,
|
|
"request_uuid": c.Locals("request_uuid"),
|
|
},
|
|
})
|
|
}
|
|
|
|
cfg.Monitoring.Increment(libpack_monitoring.MetricsSucceeded, nil)
|
|
cfg.Monitoring.Increment(libpack_monitoring.MetricsExecutedQuery, labels)
|
|
|
|
if !wasCached {
|
|
cfg.Monitoring.UpdateDuration(libpack_monitoring.MetricsTimedQuery, labels, startTime)
|
|
cfg.Monitoring.Update(libpack_monitoring.MetricsTimedQuery, labels, float64(duration.Milliseconds()))
|
|
}
|
|
}
|