diff --git a/README.md b/README.md index 362fb29..3d94dff 100644 --- a/README.md +++ b/README.md @@ -425,6 +425,8 @@ You can now specify the read-only GraphQL endpoint by setting the `HOST_GRAPHQL_ You can check out the [example of combined deployment with RW and read-only hasura](static/kubernetes-single-deployment-with-ro.yaml). +**Important:** When using a read-only Hasura instance connected to a PostgreSQL read replica, you **must** disable event trigger processing on that instance by setting `HASURA_GRAPHQL_EVENTS_FETCH_INTERVAL=0` in the read-only Hasura container environment variables. This prevents the read-only instance from attempting to process event triggers (which require write access to event log tables), avoiding "cannot set transaction read-write mode during recovery" errors. + ### Resilience #### Circuit Breaker Pattern @@ -723,6 +725,8 @@ Following tables are being cleaned: - `hdb_catalog.hdb_cron_event_invocation_logs` - `hdb_catalog.hdb_scheduled_event_invocation_logs` +**Important for RO/RW setups:** The `HASURA_EVENT_METADATA_DB` connection string must point to the **read-write primary database** where the `hdb_catalog` schema resides. The cleaner executes DELETE operations which require write permissions. Do not point this to a read-only replica. + ### Security diff --git a/events.go b/events.go index 6ba1247..fcc01f0 100644 --- a/events.go +++ b/events.go @@ -15,12 +15,13 @@ const ( ) // Use parameterized queries to prevent SQL injection +// Cast $1 to interval type to allow proper parameterized interval values var delQueries = [...]string{ - "DELETE FROM hdb_catalog.event_invocation_logs WHERE created_at < NOW() - INTERVAL $1", - "DELETE FROM hdb_catalog.event_log WHERE created_at < NOW() - INTERVAL $1", - "DELETE FROM hdb_catalog.hdb_action_log WHERE created_at < NOW() - INTERVAL $1", - "DELETE FROM hdb_catalog.hdb_cron_event_invocation_logs WHERE created_at < NOW() - INTERVAL $1", - "DELETE FROM hdb_catalog.hdb_scheduled_event_invocation_logs WHERE created_at < NOW() - INTERVAL $1", + "DELETE FROM hdb_catalog.event_invocation_logs WHERE created_at < NOW() - $1::INTERVAL", + "DELETE FROM hdb_catalog.event_log WHERE created_at < NOW() - $1::INTERVAL", + "DELETE FROM hdb_catalog.hdb_action_log WHERE created_at < NOW() - $1::INTERVAL", + "DELETE FROM hdb_catalog.hdb_cron_event_invocation_logs WHERE created_at < NOW() - $1::INTERVAL", + "DELETE FROM hdb_catalog.hdb_scheduled_event_invocation_logs WHERE created_at < NOW() - $1::INTERVAL", } func enableHasuraEventCleaner(ctx context.Context) error { diff --git a/events_security_test.go b/events_security_test.go index 4aab9fd..2fbac38 100644 --- a/events_security_test.go +++ b/events_security_test.go @@ -340,8 +340,8 @@ func getDelQueries() []string { // This should return the actual delQueries from the main package // For testing purposes, we return expected parameterized queries return []string{ - "DELETE FROM hdb_catalog.event_log WHERE created_at < NOW() - INTERVAL '$1 days'", - "DELETE FROM hdb_catalog.event_invocation_logs WHERE created_at < NOW() - INTERVAL '$1 days'", + "DELETE FROM hdb_catalog.event_log WHERE created_at < NOW() - $1::INTERVAL", + "DELETE FROM hdb_catalog.event_invocation_logs WHERE created_at < NOW() - $1::INTERVAL", } } diff --git a/graphql.go b/graphql.go index f270039..b4f7808 100644 --- a/graphql.go +++ b/graphql.go @@ -7,6 +7,7 @@ import ( "sync" "sync/atomic" "time" + "unicode" "github.com/goccy/go-json" fiber "github.com/gofiber/fiber/v2" @@ -37,6 +38,40 @@ var ( currentCacheSize int64 // Use atomic operations for this ) +// sanitizeOperationName removes null bytes and other invalid characters from operation names +// This prevents panics when creating metrics with invalid label values +func sanitizeOperationName(name string) string { + if name == "" || name == "undefined" { + return name + } + + var buf strings.Builder + buf.Grow(len(name)) + + for _, r := range name { + // Skip null bytes entirely + if r == '\x00' { + continue + } + // Replace control characters with underscores + if r < 32 || r == 127 { + buf.WriteByte('_') + continue + } + // Only allow printable characters + if unicode.IsPrint(r) { + buf.WriteRune(r) + } + } + + result := buf.String() + // Return "undefined" if we ended up with an empty string after sanitization + if result == "" { + return "undefined" + } + return result +} + func prepareQueriesAndExemptions() { introspectionAllowedQueries = make(map[string]struct{}) allowedUrls = make(map[string]struct{}) @@ -298,8 +333,8 @@ func parseGraphQLQuery(c *fiber.Ctx) *parseGraphQLQueryResult { res.operationType = "mutation" if oper.Name != nil { mutationName = oper.Name.Value - // Use mutation name immediately - res.operationName = mutationName + // Use mutation name immediately, sanitized to prevent metric panics + res.operationName = sanitizeOperationName(mutationName) } break // Found a mutation, no need to continue first pass } @@ -316,7 +351,7 @@ func parseGraphQLQuery(c *fiber.Ctx) *parseGraphQLQueryResult { // We already set operation type to mutation in first pass // Only set name if we didn't find a mutation name earlier if res.operationName == "undefined" && oper.Name != nil { - res.operationName = oper.Name.Value + res.operationName = sanitizeOperationName(oper.Name.Value) } } else { // No mutation found, use the normal logic @@ -325,7 +360,7 @@ func parseGraphQLQuery(c *fiber.Ctx) *parseGraphQLQueryResult { } if res.operationName == "undefined" && oper.Name != nil { - res.operationName = oper.Name.Value + res.operationName = sanitizeOperationName(oper.Name.Value) } } diff --git a/monitoring/helpers.go b/monitoring/helpers.go index 999c942..b5b3efa 100644 --- a/monitoring/helpers.go +++ b/monitoring/helpers.go @@ -68,6 +68,36 @@ func ensureDefaultLabels(labels *map[string]string, podName string) { } } +// sanitizeLabelValue removes or replaces characters that are invalid in metric labels +// This includes null bytes, newlines, carriage returns, quotes, and backslashes +func sanitizeLabelValue(value string) string { + if value == "" { + return value + } + + var buf strings.Builder + buf.Grow(len(value)) + + for _, r := range value { + switch r { + case '\x00': // null byte + continue // Skip null bytes entirely + case '\n', '\r', '\t': // newlines, carriage returns, tabs + buf.WriteByte(' ') // Replace with space + case '"', '\\': // quotes and backslashes need escaping + buf.WriteByte('\\') + buf.WriteRune(r) + default: + // Only allow printable ASCII and common unicode characters + if unicode.IsPrint(r) { + buf.WriteRune(r) + } + } + } + + return buf.String() +} + func appendSortedLabels(buf *bytes.Buffer, labels map[string]string) { // Add defer/recover to prevent panics from crashing the application defer func() { @@ -87,7 +117,8 @@ func appendSortedLabels(buf *bytes.Buffer, labels map[string]string) { if k == "" { continue // Skip empty keys } - labelsCopy[k] = v + // Sanitize the label value to remove null bytes and other invalid characters + labelsCopy[k] = sanitizeLabelValue(v) } if len(labelsCopy) == 0 { diff --git a/static/kubernetes-single-deployment-with-ro.yaml b/static/kubernetes-single-deployment-with-ro.yaml index 6e634c0..cb88949 100644 --- a/static/kubernetes-single-deployment-with-ro.yaml +++ b/static/kubernetes-single-deployment-with-ro.yaml @@ -97,6 +97,9 @@ spec: value: "error" - name: HASURA_GRAPHQL_SERVER_PORT value: "8088" + # Disable event trigger processing on read-only instance + - name: HASURA_GRAPHQL_EVENTS_FETCH_INTERVAL + value: "0" - name: graphql-proxy image: ghcr.io/lukaszraczylo/graphql-monitoring-proxy:latest