# File: kubemirror/monitoring/prometheusrule.yaml
# 2025-12-25 22:10:57 +00:00 · 147 lines · 5.6 KiB · YAML

---
# PrometheusRule (Prometheus Operator CRD) that carries the KubeMirror
# alerting and recording rule groups.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: kubemirror-alerts
  namespace: kubemirror-system
  labels:
    app.kubernetes.io/name: kubemirror
    app.kubernetes.io/component: monitoring
spec:
  groups:
    # Alerting rules; the group is evaluated every 30 seconds.
    - name: kubemirror.rules
      interval: 30s
      rules:
# Controller health alerts

# Fires when the controller's metrics target fails its scrape, or when the
# target has disappeared altogether. A bare `up == 0` returns no series once
# `up` itself is gone (e.g. the pod or Service was deleted), so the absent()
# branch is needed to cover that case.
- alert: KubeMirrorControllerDown
  expr: |
    up{service="kubemirror-controller-metrics"} == 0
    or
    absent(up{service="kubemirror-controller-metrics"})
  for: 5m
  labels:
    severity: critical
    component: kubemirror
  annotations:
    summary: "KubeMirror controller is down"
    # The absent() branch only carries the `service` label, so the namespace
    # clause is rendered only when a namespace label is present.
    description: >-
      KubeMirror controller{{ if $labels.namespace }} in namespace
      {{ $labels.namespace }}{{ end }} has been down for more than 5 minutes.
# Fires when the secret or configmap controller sustains more than 0.1
# reconcile errors per second (5m rate) for 10 minutes. The regex matcher is
# fully anchored, so it selects exactly those two controllers.
- alert: KubeMirrorHighReconcileErrors
  expr: |
    rate(controller_runtime_reconcile_errors_total{controller=~"secret|configmap"}[5m]) > 0.1
  for: 10m
  labels:
    severity: warning
    component: kubemirror
  annotations:
    summary: "High reconciliation error rate in KubeMirror"
    # $value is an absolute rate (errors/sec), not a 0-1 ratio, so it is
    # printed as a plain number; humanizePercentage would render 0.2 errors/sec
    # as "20%".
    description: >-
      KubeMirror controller {{ $labels.controller }} is experiencing high error
      rate: {{ printf "%.2f" $value }} errors/sec
# Fires when the p99 reconcile latency of the secret/configmap controllers,
# derived from the reconcile-time histogram over a 5m window, stays above
# 5 seconds for 10 minutes.
- alert: KubeMirrorReconcileLatencyHigh
  expr: |
    histogram_quantile(0.99,
      rate(controller_runtime_reconcile_time_seconds_bucket{controller=~"secret|configmap"}[5m])
    ) > 5
  for: 10m
  labels:
    component: kubemirror
    severity: warning
  annotations:
    summary: 'High reconciliation latency in KubeMirror'
    description: >-
      KubeMirror {{ $labels.controller }} controller p99 latency is
      {{ $value | humanizeDuration }}
# Fires when the secret or configmap workqueue holds more than 100 items
# continuously for 15 minutes.
- alert: KubeMirrorWorkqueueDepthHigh
  expr: |
    workqueue_depth{name=~"secret|configmap"} > 100
  for: 15m
  labels:
    component: kubemirror
    severity: warning
  annotations:
    summary: 'High workqueue depth in KubeMirror'
    description: >-
      KubeMirror {{ $labels.name }} workqueue has {{ $value }} items pending
      for more than 15 minutes
# Fires when no scraped replica reports holding the leader lease for 2 minutes.
# leader_election_master_status is 1 on the replica that currently leads and 0
# on the others, so a per-series `== 0` test would fire permanently for every
# healthy standby replica. Taking max() across replicas alerts only when
# nobody is leader. A total absence of the metric is not covered here; that
# case is a scrape/target failure rather than a lost election.
- alert: KubeMirrorLeaderElectionLost
  expr: |
    max(leader_election_master_status{name="kubemirror-controller-leader"}) == 0
  for: 2m
  labels:
    severity: warning
    component: kubemirror
  annotations:
    summary: "KubeMirror lost leader election"
    # max() drops per-pod labels, so the message no longer names a pod.
    description: >-
      No KubeMirror controller replica has held the
      kubemirror-controller-leader lease for more than 2 minutes
# Resource mirror alerts

# Ratio of reconcile errors to all reconciles, summed across the secret and
# configmap controllers (5m rates); fires when it stays above 5% for
# 10 minutes.
- alert: KubeMirrorHighFailureRate
  expr: |
    sum(rate(controller_runtime_reconcile_errors_total{controller=~"secret|configmap"}[5m]))
    /
    sum(rate(controller_runtime_reconcile_total{controller=~"secret|configmap"}[5m]))
    > 0.05
  for: 10m
  labels:
    component: kubemirror
    severity: warning
  annotations:
    summary: 'High mirror operation failure rate'
    description: >-
      KubeMirror has {{ $value | humanizePercentage }} failure rate over the
      last 10 minutes
# Fires when the controller container's working-set memory stays above 90% of
# its configured memory limit for 5 minutes.
- alert: KubeMirrorMemoryHigh
  # The `> 0` filter drops containers that have no memory limit: their limit
  # is reported as 0, which would make the ratio +Inf and fire the alert
  # spuriously.
  expr: |
    container_memory_working_set_bytes{pod=~"kubemirror-.*",container="controller"}
    /
    (container_spec_memory_limit_bytes{pod=~"kubemirror-.*",container="controller"} > 0)
    > 0.9
  for: 5m
  labels:
    severity: warning
    component: kubemirror
  annotations:
    summary: "KubeMirror controller high memory usage"
    description: >-
      KubeMirror controller {{ $labels.pod }} is using
      {{ $value | humanizePercentage }} of its memory limit
# Fires when the 5m rate of the controller container's CFS throttled-seconds
# counter stays above 0.5 for 10 minutes.
- alert: KubeMirrorCPUThrottling
  expr: |
    rate(container_cpu_cfs_throttled_seconds_total{pod=~"kubemirror-.*",container="controller"}[5m]) > 0.5
  for: 10m
  labels:
    component: kubemirror
    severity: warning
  annotations:
    summary: 'KubeMirror controller is being CPU throttled'
    description: >-
      KubeMirror controller {{ $labels.pod }} is experiencing CPU throttling:
      {{ $value | humanizeDuration }}/sec
# Recording-rule group, evaluated every 30 seconds.
- name: kubemirror.recording
  interval: 30s
  rules:
    # Recording rules for better query performance

    # Reconcile latency quantiles (p99 / p95 / p50) for the secret and
    # configmap controllers, each taken from the 5m rate of the reconcile-time
    # histogram buckets.
    - record: kubemirror:reconcile_duration_seconds:p99
      expr: |
        histogram_quantile(0.99,
          rate(controller_runtime_reconcile_time_seconds_bucket{controller=~"secret|configmap"}[5m])
        )
    - record: kubemirror:reconcile_duration_seconds:p95
      expr: |
        histogram_quantile(0.95,
          rate(controller_runtime_reconcile_time_seconds_bucket{controller=~"secret|configmap"}[5m])
        )
    - record: kubemirror:reconcile_duration_seconds:p50
      expr: |
        histogram_quantile(0.50,
          rate(controller_runtime_reconcile_time_seconds_bucket{controller=~"secret|configmap"}[5m])
        )

    # Reconciles per second (5m rate), broken down by controller and result.
    - record: kubemirror:reconcile_rate:5m
      expr: |
        sum(rate(controller_runtime_reconcile_total{controller=~"secret|configmap"}[5m])) by (controller, result)

    # Reconcile errors per second (5m rate), per controller.
    - record: kubemirror:reconcile_errors:rate5m
      expr: |
        sum(rate(controller_runtime_reconcile_errors_total{controller=~"secret|configmap"}[5m])) by (controller)

    # Largest current workqueue depth, per queue name.
    - record: kubemirror:workqueue_depth:max
      expr: |
        max(workqueue_depth{name=~"secret|configmap"}) by (name)