mirror of
https://github.com/lukaszraczylo/kubemirror.git
synced 2026-06-05 22:43:51 +00:00
147 lines
5.6 KiB
YAML
147 lines
5.6 KiB
YAML
---
# PrometheusRule for KubeMirror: one group of alerting rules and one group of
# recording rules, both evaluated every 30s.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: kubemirror-alerts
  namespace: kubemirror-system
  labels:
    app.kubernetes.io/name: kubemirror
    app.kubernetes.io/component: monitoring
spec:
  groups:
    - name: kubemirror.rules
      interval: 30s
      rules:
        # Controller health alerts

        # Scrape target for the controller metrics service has reported
        # up == 0 for 5 minutes.
        - alert: KubeMirrorControllerDown
          expr: up{service="kubemirror-controller-metrics"} == 0
          for: 5m
          labels:
            severity: critical
            component: kubemirror
          annotations:
            summary: "KubeMirror controller is down"
            description: "KubeMirror controller in namespace {{ $labels.namespace }} has been down for more than 5 minutes."

        # Per-controller reconcile error rate above 0.1 errors/sec.
        # The value is a rate, not a ratio, so it is rendered with `humanize`
        # (humanizePercentage would show 0.1 errors/sec as "10%").
        - alert: KubeMirrorHighReconcileErrors
          expr: |
            rate(controller_runtime_reconcile_errors_total{controller=~"secret|configmap"}[5m]) > 0.1
          for: 10m
          labels:
            severity: warning
            component: kubemirror
          annotations:
            summary: "High reconciliation error rate in KubeMirror"
            description: "KubeMirror controller {{ $labels.controller }} is experiencing high error rate: {{ $value | humanize }} errors/sec"

        # p99 reconcile duration above 5 seconds.
        - alert: KubeMirrorReconcileLatencyHigh
          expr: |
            histogram_quantile(0.99,
              rate(controller_runtime_reconcile_time_seconds_bucket{controller=~"secret|configmap"}[5m])
            ) > 5
          for: 10m
          labels:
            severity: warning
            component: kubemirror
          annotations:
            summary: "High reconciliation latency in KubeMirror"
            description: "KubeMirror {{ $labels.controller }} controller p99 latency is {{ $value | humanizeDuration }}"

        # More than 100 items queued for 15 minutes.
        - alert: KubeMirrorWorkqueueDepthHigh
          expr: |
            workqueue_depth{name=~"secret|configmap"} > 100
          for: 15m
          labels:
            severity: warning
            component: kubemirror
          annotations:
            summary: "High workqueue depth in KubeMirror"
            description: "KubeMirror {{ $labels.name }} workqueue has {{ $value }} items pending for more than 15 minutes"

        # The gauge is 1 on the leader and 0 on every standby replica, so
        # comparing individual series to 0 would fire for each healthy
        # standby. Aggregate with max so the alert only fires when no replica
        # holds the lease.
        - alert: KubeMirrorLeaderElectionLost
          expr: |
            max by (namespace, name) (
              leader_election_master_status{name="kubemirror-controller-leader"}
            ) == 0
          for: 2m
          labels:
            severity: warning
            component: kubemirror
          annotations:
            summary: "KubeMirror lost leader election"
            description: "No KubeMirror controller replica has held the {{ $labels.name }} lease for more than 2 minutes"

        # Resource mirror alerts

        # Share of reconciles that end in an error, across both controllers,
        # above 5%.
        - alert: KubeMirrorHighFailureRate
          expr: |
            sum(rate(controller_runtime_reconcile_errors_total{controller=~"secret|configmap"}[5m]))
            /
            sum(rate(controller_runtime_reconcile_total{controller=~"secret|configmap"}[5m]))
            > 0.05
          for: 10m
          labels:
            severity: warning
            component: kubemirror
          annotations:
            summary: "High mirror operation failure rate"
            description: "KubeMirror has {{ $value | humanizePercentage }} failure rate over the last 10 minutes"

        # Working set above 90% of the memory limit.
        # The limit metric is 0 when no limit is configured; the `> 0` filter
        # drops those series so the division cannot produce +Inf and fire the
        # alert for a container that has no limit at all.
        - alert: KubeMirrorMemoryHigh
          expr: |
            container_memory_working_set_bytes{pod=~"kubemirror-.*",container="controller"}
            /
            (container_spec_memory_limit_bytes{pod=~"kubemirror-.*",container="controller"} > 0)
            > 0.9
          for: 5m
          labels:
            severity: warning
            component: kubemirror
          annotations:
            summary: "KubeMirror controller high memory usage"
            description: "KubeMirror controller {{ $labels.pod }} is using {{ $value | humanizePercentage }} of its memory limit"

        # More than 0.5 seconds of CFS throttling per second of wall time.
        - alert: KubeMirrorCPUThrottling
          expr: |
            rate(container_cpu_cfs_throttled_seconds_total{pod=~"kubemirror-.*",container="controller"}[5m]) > 0.5
          for: 10m
          labels:
            severity: warning
            component: kubemirror
          annotations:
            summary: "KubeMirror controller is being CPU throttled"
            description: "KubeMirror controller {{ $labels.pod }} is experiencing CPU throttling: {{ $value | humanizeDuration }}/sec"

    - name: kubemirror.recording
      interval: 30s
      rules:
        # Recording rules for better query performance

        # Reconcile duration quantiles over a 5m window.
        - record: kubemirror:reconcile_duration_seconds:p99
          expr: |
            histogram_quantile(0.99,
              rate(controller_runtime_reconcile_time_seconds_bucket{controller=~"secret|configmap"}[5m])
            )

        - record: kubemirror:reconcile_duration_seconds:p95
          expr: |
            histogram_quantile(0.95,
              rate(controller_runtime_reconcile_time_seconds_bucket{controller=~"secret|configmap"}[5m])
            )

        - record: kubemirror:reconcile_duration_seconds:p50
          expr: |
            histogram_quantile(0.50,
              rate(controller_runtime_reconcile_time_seconds_bucket{controller=~"secret|configmap"}[5m])
            )

        # Reconcile throughput per controller and result.
        - record: kubemirror:reconcile_rate:5m
          expr: |
            sum(rate(controller_runtime_reconcile_total{controller=~"secret|configmap"}[5m])) by (controller, result)

        # Reconcile error rate per controller.
        - record: kubemirror:reconcile_errors:rate5m
          expr: |
            sum(rate(controller_runtime_reconcile_errors_total{controller=~"secret|configmap"}[5m])) by (controller)

        # Deepest workqueue per queue name.
        - record: kubemirror:workqueue_depth:max
          expr: |
            max(workqueue_depth{name=~"secret|configmap"}) by (name)