From 0b642f8be19d18d211f65b7bee5c0862f5f5ea92 Mon Sep 17 00:00:00 2001 From: Lukasz Raczylo Date: Thu, 16 Nov 2023 17:45:48 +0100 Subject: [PATCH] Add ability to reset metrics between crawl to limit payload absorbed (#5) by the prometheus/victoria metric crawlers. --- Makefile | 2 +- README.md | 8 ++++++++ main.go | 1 + monitoring.go | 2 +- monitoring/monitoring.go | 33 ++++++++++++++++++++++----------- struct_config.go | 1 + 6 files changed, 34 insertions(+), 13 deletions(-) diff --git a/Makefile b/Makefile index 3cb04f2..eaaeb09 100644 --- a/Makefile +++ b/Makefile @@ -11,7 +11,7 @@ help: ## display this help .PHONY: run run: build ## run application - @LOG_LEVEL=debug BLOCK_SCHEMA_INTROSPECTION=false CACHE_TTL=10 JWT_ROLE_RATE_LIMIT=false JWT_ROLE_CLAIM_PATH="Hasura.x-hasura-default-role" JWT_USER_CLAIM_PATH="Hasura.x-hasura-user-id" HOST_GRAPHQL=https://hasura8.lan/ HEALTHCHECK_GRAPHQL_URL=https://hasura8.lan/v1/graphql ./graphql-proxy + @LOG_LEVEL=debug PURGE_METRICS_ON_CRAWL=true BLOCK_SCHEMA_INTROSPECTION=false CACHE_TTL=10 JWT_ROLE_RATE_LIMIT=false JWT_ROLE_CLAIM_PATH="Hasura.x-hasura-default-role" JWT_USER_CLAIM_PATH="Hasura.x-hasura-user-id" HOST_GRAPHQL=https://hasura8.lan/ HEALTHCHECK_GRAPHQL_URL=https://hasura8.lan/v1/graphql ./graphql-proxy .PHONY: build build: ## build the binary diff --git a/README.md b/README.md index 37aa987..c0003c5 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,7 @@ This project is in active use by [telegram-bot.app](https://telegram-bot.app), a - [API endpoints](#api-endpoints) - [Ban or unban the user](#ban-or-unban-the-user) - [General](#general) + - [Metrics which matter](#metrics-which-matter) - [Healthcheck](#healthcheck) - [Monitoring endpoint](#monitoring-endpoint) @@ -123,6 +124,7 @@ In this case, both proxy and websockets will be available under the `/v1/graphql | `API_PORT` | The port to expose the monitoring API | `9090` | | `BANNED_USERS_FILE` | The path to the file with banned users | 
`/go/src/app/banned_users.json` | | `PROXIED_CLIENT_TIMEOUT` | The timeout for the proxied client in seconds | `120` | +| `PURGE_METRICS_ON_CRAWL` | Purge metrics on each /metrics crawl | `false` | ### Speed @@ -227,6 +229,12 @@ Ban details will be stored in the `banned_users.json` file, which you can mount ### General +#### Metrics which matter + +You can enable the `PURGE_METRICS_ON_CRAWL` environment variable to purge the metrics on each `/metrics` crawl. This will allow you to see only the current metrics, without potential leftovers from the previous crawls. This is useful if you want to monitor the metrics in real time and/or limit the amount of data ingested into the monitoring system. When it is enabled, you will most likely need to update your monitoring queries. + +With `PURGE_METRICS_ON_CRAWL` enabled, the `graphql_proxy_requests_failed`, `graphql_proxy_requests_skipped` and `graphql_proxy_requests_succesful` metrics are not purged and persist between crawls. + #### Healthcheck If you'd like the `/healthz` endpoint to perform actual check for the connectivity to the graphql endpoint - set the `HEALTHCHECK_GRAPHQL_URL` environment variable to the exact URL of the graphql endpoint. The query executed will be `query { __typename }` and if the response is not `200 OK` - the healthcheck will fail. Remember that the endpoint is a full URL which you'd like to check, so it should include the protocol, host and path - for example `http://localhost:8080/v1/graphql` and it's NOT the same as value of `HOST_GRAPHQL` environment variable which should provide only the host, without path, ending with slash. 
diff --git a/main.go b/main.go index cc3c3a8..f305781 100644 --- a/main.go +++ b/main.go @@ -55,6 +55,7 @@ func parseConfig() { c.Server.EnableApi = envutil.GetBool("ENABLE_API", false) c.Server.ApiPort = envutil.GetInt("API_PORT", 9090) c.Api.BannedUsersFile = envutil.Getenv("BANNED_USERS_FILE", "/go/src/app/banned_users.json") + c.Server.PurgeOnCrawl = envutil.GetBool("PURGE_METRICS_ON_CRAWL", false) cfg = &c enableCache() // takes close to no resources, but can be used with dynamic query cache loadRatelimitConfig() diff --git a/monitoring.go b/monitoring.go index 4089935..fc21d4e 100644 --- a/monitoring.go +++ b/monitoring.go @@ -5,7 +5,7 @@ import ( ) func StartMonitoringServer() { - cfg.Monitoring = libpack_monitoring.NewMonitoring() + cfg.Monitoring = libpack_monitoring.NewMonitoring(cfg.Server.PurgeOnCrawl) cfg.Monitoring.AddMetricsPrefix("graphql_proxy") cfg.Monitoring.RegisterDefaultMetrics() } diff --git a/monitoring/monitoring.go b/monitoring/monitoring.go index d8462dd..8b2cd6e 100644 --- a/monitoring/monitoring.go +++ b/monitoring/monitoring.go @@ -15,18 +15,22 @@ import ( ) type MetricsSetup struct { - metrics_prefix string - metrics_set *metrics.Set + metrics_prefix string + metrics_set *metrics.Set + metrics_set_custom *metrics.Set } var ( - log *logging.LogConfig + log *logging.LogConfig + purgeMetricsOnCrawl bool ) -func NewMonitoring() *MetricsSetup { +func NewMonitoring(purgeOnCrawl bool) *MetricsSetup { + purgeMetricsOnCrawl = purgeOnCrawl log = logging.NewLogger() ms := &MetricsSetup{} ms.metrics_set = metrics.NewSet() + ms.metrics_set_custom = metrics.NewSet() go ms.startPrometheusEndpoint() return ms } @@ -45,6 +49,10 @@ func (ms *MetricsSetup) startPrometheusEndpoint() { func (ms *MetricsSetup) metricsEndpoint(c *fiber.Ctx) error { ms.metrics_set.WritePrometheus(c.Response().BodyWriter()) + ms.metrics_set_custom.WritePrometheus(c.Response().BodyWriter()) + if purgeMetricsOnCrawl { + ms.PurgeMetrics() + } return nil } @@ -61,7 +69,7 @@ func 
(ms *MetricsSetup) RegisterMetricsGauge(metric_name string, labels map[stri log.Critical("RegisterMetricsGauge() error", map[string]interface{}{"_error": "Invalid metric name", "_metric_name": metric_name}) return nil } - return ms.metrics_set.GetOrCreateGauge(ms.get_metrics_name(metric_name, labels), func() float64 { + return ms.metrics_set_custom.GetOrCreateGauge(ms.get_metrics_name(metric_name, labels), func() float64 { // get current value of the gauge and add val to it return val }) @@ -72,7 +80,10 @@ func (ms *MetricsSetup) RegisterMetricsCounter(metric_name string, labels map[st log.Critical("RegisterMetricsCounter() error", map[string]interface{}{"_error": "Invalid metric name", "_metric_name": metric_name}) return nil } - return ms.metrics_set.GetOrCreateCounter(ms.get_metrics_name(metric_name, labels)) + if metric_name == MetricsSucceeded || metric_name == MetricsFailed || metric_name == MetricsSkipped { + return ms.metrics_set.GetOrCreateCounter(ms.get_metrics_name(metric_name, labels)) + } + return ms.metrics_set_custom.GetOrCreateCounter(ms.get_metrics_name(metric_name, labels)) } func (ms *MetricsSetup) RegisterFloatCounter(metric_name string, labels map[string]string) *metrics.FloatCounter { @@ -80,7 +91,7 @@ func (ms *MetricsSetup) RegisterFloatCounter(metric_name string, labels map[stri log.Critical("RegisterFloatCounter() error", map[string]interface{}{"_error": "Invalid metric name", "_metric_name": metric_name}) return nil } - return ms.metrics_set.GetOrCreateFloatCounter(ms.get_metrics_name(metric_name, labels)) + return ms.metrics_set_custom.GetOrCreateFloatCounter(ms.get_metrics_name(metric_name, labels)) } func (ms *MetricsSetup) RegisterMetricsSummary(metric_name string, labels map[string]string) *metrics.Summary { @@ -88,7 +99,7 @@ func (ms *MetricsSetup) RegisterMetricsSummary(metric_name string, labels map[st log.Critical("RegisterMetricsSummary() error", map[string]interface{}{"_error": "Invalid metric name", "_metric_name": 
metric_name}) return nil } - return ms.metrics_set.GetOrCreateSummary(ms.get_metrics_name(metric_name, labels)) + return ms.metrics_set_custom.GetOrCreateSummary(ms.get_metrics_name(metric_name, labels)) } func (ms *MetricsSetup) RegisterMetricsHistogram(metric_name string, labels map[string]string) *metrics.Histogram { @@ -96,7 +107,7 @@ func (ms *MetricsSetup) RegisterMetricsHistogram(metric_name string, labels map[ log.Critical("RegisterMetricsHistogram() error", map[string]interface{}{"_error": "Invalid metric name", "_metric_name": metric_name}) return nil } - return ms.metrics_set.GetOrCreateHistogram(ms.get_metrics_name(metric_name, labels)) + return ms.metrics_set_custom.GetOrCreateHistogram(ms.get_metrics_name(metric_name, labels)) } func (ms *MetricsSetup) Increment(metric_name string, labels map[string]string) { @@ -124,9 +135,9 @@ func (ms *MetricsSetup) UpdateSummary(metric_name string, labels map[string]stri } func (ms *MetricsSetup) RemoveMetrics(metric_name string, labels map[string]string) { - ms.metrics_set.UnregisterMetric(ms.get_metrics_name(metric_name, labels)) + ms.metrics_set_custom.UnregisterMetric(ms.get_metrics_name(metric_name, labels)) } func (ms *MetricsSetup) PurgeMetrics() { - ms.metrics_set.UnregisterAllMetrics() + ms.metrics_set_custom.UnregisterAllMetrics() } diff --git a/struct_config.go b/struct_config.go index 5c5a4ae..d0bf35a 100644 --- a/struct_config.go +++ b/struct_config.go @@ -24,6 +24,7 @@ type config struct { AllowURLs []string EnableApi bool ApiPort int + PurgeOnCrawl bool } Client struct {