alert: etcdInsufficientMembers
expr: sum
by(job) (up{job=~".*etcd.*"} == bool 1) < ((count by(job) (up{job=~".*etcd.*"})
+ 1) / 2)
for: 3m
labels:
severity: critical
annotations:
message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value
}}).'
State: ok | Last evaluation: 5.37s ago | Evaluation time: 1.587ms

alert: etcdNoLeader
expr: etcd_server_has_leader{job=~".*etcd.*"}
== 0
for: 1m
labels:
severity: critical
annotations:
message: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }}
has no leader.'
State: ok | Last evaluation: 5.369s ago | Evaluation time: 240.1us

alert: etcdHighNumberOfLeaderChanges
expr: rate(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}[15m])
> 3
for: 15m
labels:
severity: warning
annotations:
message: 'etcd cluster "{{ $labels.job }}": instance {{ $labels.instance
}} has seen {{ $value }} leader changes within the last hour.'
State: ok | Last evaluation: 5.369s ago | Evaluation time: 259.3us

alert: etcdHighNumberOfFailedGRPCRequests
expr: 100
* sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{grpc_code!="OK",job=~".*etcd.*"}[5m]))
/ sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{job=~".*etcd.*"}[5m]))
> 1
for: 10m
labels:
severity: warning
annotations:
message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for
{{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
State: ok | Last evaluation: 5.369s ago | Evaluation time: 506.1us

alert: etcdHighNumberOfFailedGRPCRequests
expr: 100
* sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{grpc_code!="OK",job=~".*etcd.*"}[5m]))
/ sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{job=~".*etcd.*"}[5m]))
> 5
for: 5m
labels:
severity: critical
annotations:
message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for
{{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
State: ok | Last evaluation: 5.369s ago | Evaluation time: 462.3us

alert: etcdGRPCRequestsSlow
expr: histogram_quantile(0.99,
sum by(job, instance, grpc_service, grpc_method, le) (rate(grpc_server_handling_seconds_bucket{grpc_type="unary",job=~".*etcd.*"}[5m])))
> 0.15
for: 10m
labels:
severity: critical
annotations:
message: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method
}} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
State: ok | Last evaluation: 5.369s ago | Evaluation time: 568.1us

alert: etcdMemberCommunicationSlow
expr: histogram_quantile(0.99,
rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.15
for: 10m
labels:
severity: warning
annotations:
message: 'etcd cluster "{{ $labels.job }}": member communication with {{
$labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
State: ok | Last evaluation: 5.369s ago | Evaluation time: 303.4us

alert: etcdHighNumberOfFailedProposals
expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m])
> 5
for: 15m
labels:
severity: warning
annotations:
message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures
within the last hour on etcd instance {{ $labels.instance }}.'
State: ok | Last evaluation: 5.369s ago | Evaluation time: 230.4us

alert: etcdHighFsyncDurations
expr: histogram_quantile(0.99,
rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.5
for: 10m
labels:
severity: warning
annotations:
message: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations
are {{ $value }}s on etcd instance {{ $labels.instance }}.'
State: ok | Last evaluation: 5.369s ago | Evaluation time: 257.4us

alert: etcdHighCommitDurations
expr: histogram_quantile(0.99,
rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.25
for: 10m
labels:
severity: warning
annotations:
message: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations are
{{ $value }}s on etcd instance {{ $labels.instance }}.'
State: ok | Last evaluation: 5.369s ago | Evaluation time: 246.9us

alert: etcdHighNumberOfFailedHTTPRequests
expr: sum
by(method) (rate(etcd_http_failed_total{code!="404",job=~".*etcd.*"}[5m]))
/ sum by(method) (rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) >
0.01
for: 10m
labels:
severity: warning
annotations:
message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance
{{ $labels.instance }}'
State: ok | Last evaluation: 5.369s ago | Evaluation time: 437.6us

alert: etcdHighNumberOfFailedHTTPRequests
expr: sum
by(method) (rate(etcd_http_failed_total{code!="404",job=~".*etcd.*"}[5m]))
/ sum by(method) (rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) >
0.05
for: 10m
labels:
severity: critical
annotations:
message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance
{{ $labels.instance }}.'
State: ok | Last evaluation: 5.369s ago | Evaluation time: 483us

alert: etcdHTTPRequestsSlow
expr: histogram_quantile(0.99,
rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
for: 10m
labels:
severity: warning
annotations:
message: etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method
}} are slow.
State: ok | Last evaluation: 5.369s ago | Evaluation time: 159.7us
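
Rule sets like this one are typically loaded by the Prometheus Operator from PrometheusRule custom resources, which is also how you would add or override rules yourself. A minimal sketch carrying one of the etcd alerts above in such an object; the object name, namespace and the `release: prometheus` selector label are assumptions about the installation, not part of this listing:

```yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: etcd-custom-rules          # hypothetical name
  namespace: monitoring
  labels:
    release: prometheus            # assumed to match the operator's ruleSelector
spec:
  groups:
    - name: etcd-custom
      rules:
        - alert: etcdInsufficientMembers
          expr: sum by(job) (up{job=~".*etcd.*"} == bool 1) < ((count by(job) (up{job=~".*etcd.*"}) + 1) / 2)
          for: 3m
          labels:
            severity: critical
          annotations:
            message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).'
```
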
Rule group | Last evaluation: 21.053s ago | Evaluation time: 3.642ms

alert: AlertmanagerDown
expr: absent(up{job="prometheus-prometheus-oper-alertmanager",namespace="monitoring"}
== 1)
for: 15m
labels:
severity: critical
annotations:
message: Alertmanager has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerdown
State: ok | Last evaluation: 9.31s ago | Evaluation time: 315.4us

alert: KubeAPIDown
expr: absent(up{job="apiserver"}
== 1)
for: 15m
labels:
severity: critical
annotations:
message: KubeAPI has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown
State: ok | Last evaluation: 9.31s ago | Evaluation time: 197.9us

alert: KubeControllerManagerDown
expr: absent(up{job="kube-controller-manager"}
== 1)
for: 15m
labels:
severity: critical
annotations:
message: KubeControllerManager has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontrollermanagerdown
State: ok | Last evaluation: 9.31s ago | Evaluation time: 472.3us

alert: KubeSchedulerDown
expr: absent(up{job="kube-scheduler"}
== 1)
for: 15m
labels:
severity: critical
annotations:
message: KubeScheduler has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeschedulerdown
State: ok | Last evaluation: 9.309s ago | Evaluation time: 403.5us

alert: KubeStateMetricsDown
expr: absent(up{job="kube-state-metrics"}
== 1)
for: 15m
labels:
severity: critical
annotations:
message: KubeStateMetrics has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricsdown
State: ok | Last evaluation: 9.309s ago | Evaluation time: 132.2us

alert: KubeletDown
expr: absent(up{job="kubelet"}
== 1)
for: 15m
labels:
severity: critical
annotations:
message: Kubelet has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown
State: ok | Last evaluation: 9.309s ago | Evaluation time: 165.7us

alert: NodeExporterDown
expr: absent(up{job="node-exporter"}
== 1)
for: 15m
labels:
severity: critical
annotations:
message: NodeExporter has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeexporterdown
State: ok | Last evaluation: 9.309s ago | Evaluation time: 159.6us

alert: PrometheusDown
expr: absent(up{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}
== 1)
for: 15m
labels:
severity: critical
annotations:
message: Prometheus has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusdown
State: ok | Last evaluation: 9.309s ago | Evaluation time: 135.3us

alert: PrometheusOperatorDown
expr: absent(up{job="prometheus-prometheus-oper-operator",namespace="monitoring"}
== 1)
for: 15m
labels:
severity: critical
annotations:
message: PrometheusOperator has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatordown
State: ok | Last evaluation: 9.309s ago | Evaluation time: 122.4us
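
Every alert in this group uses the same absent(up{job="..."} == 1) pattern to catch a scrape target vanishing entirely. Rules of this shape can be exercised offline with promtool's rule unit tests; a minimal sketch, assuming the group above has been exported to a local file named kube-absent-rules.yaml (the file names and the sample series are illustrative only):

```yaml
# kube-absent-tests.yaml; run with: promtool test rules kube-absent-tests.yaml
rule_files:
  - kube-absent-rules.yaml         # assumed export of the group above
evaluation_interval: 1m
tests:
  - interval: 1m
    input_series:
      # a healthy apiserver target reporting up=1 for 30 minutes
      - series: 'up{job="apiserver", instance="10.0.0.1:6443"}'
        values: '1+0x30'
    alert_rule_test:
      - eval_time: 20m
        alertname: KubeAPIDown
        exp_alerts: []             # target present and up, so KubeAPIDown must not fire
```
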
Rule group | Last evaluation: 3.944s ago | Evaluation time: 36.04ms

alert: KubePodCrashLooping
expr: rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m])
* 60 * 5 > 0
for: 1h
labels:
severity: critical
annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }})
is restarting {{ printf "%.2f" $value }} times / 5 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
State: ok | Last evaluation: 3.944s ago | Evaluation time: 12.53ms

alert: KubePodNotReady
expr: sum
by(namespace, pod) (kube_pod_status_phase{job="kube-state-metrics",phase=~"Failed|Pending|Unknown"})
> 0
for: 1h
labels:
severity: critical
annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state
for longer than an hour.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
State: ok | Last evaluation: 3.932s ago | Evaluation time: 10.69ms

alert: KubeDeploymentGenerationMismatch
expr: kube_deployment_status_observed_generation{job="kube-state-metrics"}
!= kube_deployment_metadata_generation{job="kube-state-metrics"}
for: 15m
labels:
severity: critical
annotations:
message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
}} does not match, this indicates that the Deployment has failed but has not been
rolled back.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch
State: ok | Last evaluation: 3.921s ago | Evaluation time: 4.503ms

alert: KubeDeploymentReplicasMismatch
expr: kube_deployment_spec_replicas{job="kube-state-metrics"}
!= kube_deployment_status_replicas_available{job="kube-state-metrics"}
for: 1h
labels:
severity: critical
annotations:
message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched
the expected number of replicas for longer than an hour.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch
State: ok | Last evaluation: 3.917s ago | Evaluation time: 4.39ms

alert: KubeStatefulSetReplicasMismatch
expr: kube_statefulset_status_replicas_ready{job="kube-state-metrics"}
!= kube_statefulset_status_replicas{job="kube-state-metrics"}
for: 15m
labels:
severity: critical
annotations:
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched
the expected number of replicas for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch
State: ok | Last evaluation: 3.913s ago | Evaluation time: 386us

alert: KubeStatefulSetGenerationMismatch
expr: kube_statefulset_status_observed_generation{job="kube-state-metrics"}
!= kube_statefulset_metadata_generation{job="kube-state-metrics"}
for: 15m
labels:
severity: critical
annotations:
message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
}} does not match, this indicates that the StatefulSet has failed but has not
been rolled back.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch
State: ok | Last evaluation: 3.912s ago | Evaluation time: 395.6us

alert: KubeStatefulSetUpdateNotRolledOut
expr: max
without(revision) (kube_statefulset_status_current_revision{job="kube-state-metrics"}
unless kube_statefulset_status_update_revision{job="kube-state-metrics"})
* (kube_statefulset_replicas{job="kube-state-metrics"} != kube_statefulset_status_replicas_updated{job="kube-state-metrics"})
for: 15m
labels:
severity: critical
annotations:
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has
not been rolled out.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout
State: ok | Last evaluation: 3.912s ago | Evaluation time: 718.7us

alert: KubeDaemonSetRolloutStuck
expr: kube_daemonset_status_number_ready{job="kube-state-metrics"}
/ kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
* 100 < 100
for: 15m
labels:
severity: critical
annotations:
message: Only {{ $value }}% of the desired Pods of DaemonSet {{ $labels.namespace
}}/{{ $labels.daemonset }} are scheduled and ready.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
State: ok | Last evaluation: 3.912s ago | Evaluation time: 245us

alert: KubeDaemonSetNotScheduled
expr: kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
- kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"}
> 0
for: 10m
labels:
severity: warning
annotations:
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
}} are not scheduled.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled
State: ok | Last evaluation: 3.912s ago | Evaluation time: 184.1us

alert: KubeDaemonSetMisScheduled
expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics"}
> 0
for: 10m
labels:
severity: warning
annotations:
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
}} are running where they are not supposed to run.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled
State: ok | Last evaluation: 3.912s ago | Evaluation time: 98.78us

alert: KubeCronJobRunning
expr: time()
- kube_cronjob_next_schedule_time{job="kube-state-metrics"} > 3600
for: 1h
labels:
severity: warning
annotations:
message: CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than
1h to complete.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecronjobrunning
State: ok | Last evaluation: 3.912s ago | Evaluation time: 90.31us

alert: KubeJobCompletion
expr: kube_job_spec_completions{job="kube-state-metrics"}
- kube_job_status_succeeded{job="kube-state-metrics"} > 0
for: 1h
labels:
severity: warning
annotations:
message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than
one hour to complete.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion
State: ok | Last evaluation: 3.912s ago | Evaluation time: 882.1us

alert: KubeJobFailed
expr: kube_job_status_failed{job="kube-state-metrics"}
> 0
for: 1h
labels:
severity: warning
annotations:
message: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed
State: ok | Last evaluation: 3.911s ago | Evaluation time: 887.8us
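
Every alert in these groups carries a severity label of warning or critical, which is the label Alertmanager routing is normally keyed on. A sketch of that idea, with placeholder receiver names and no notification integrations filled in (this fragment is illustrative, not the configuration shipped with the stack):

```yaml
# Alertmanager routing fragment keyed on the severity label set by the rules above
route:
  receiver: default
  group_by: ['alertname', 'namespace']
  routes:
    - match:
        severity: critical
      receiver: on-call            # e.g. a PagerDuty receiver
    - match:
        severity: warning
      receiver: team-channel       # e.g. a Slack receiver
receivers:
  - name: default
  - name: on-call
  - name: team-channel
```
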
Rule group | Last evaluation: 19.253s ago | Evaluation time: 10.47ms

alert: KubeCPUOvercommit
expr: sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum)
/ sum(kube_node_status_allocatable_cpu_cores) > (count(kube_node_status_allocatable_cpu_cores)
- 1) / count(kube_node_status_allocatable_cpu_cores)
for: 5m
labels:
severity: warning
annotations:
message: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate
node failure.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
State: ok | Last evaluation: 19.253s ago | Evaluation time: 1.197ms
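
The right-hand side of KubeCPUOvercommit encodes "could the remaining nodes absorb the load if one node failed": with n schedulable nodes, the requests-to-allocatable ratio is compared against (n - 1)/n. A worked example for a four-node cluster:

```
sum(cpu requests) / sum(allocatable cpu) > (4 - 1) / 4 = 0.75
```

so the alert fires once more than 75% of the cluster's allocatable CPU has been requested, the point at which three nodes could no longer hold the pods of a failed fourth. KubeMemOvercommit below applies the same reasoning to memory.
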
alert: KubeMemOvercommit
expr: sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum)
/ sum(kube_node_status_allocatable_memory_bytes) > (count(kube_node_status_allocatable_memory_bytes)
- 1) / count(kube_node_status_allocatable_memory_bytes)
for: 5m
labels:
severity: warning
annotations:
message: Cluster has overcommitted memory resource requests for Pods and cannot
tolerate node failure.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit
State: ok | Last evaluation: 19.252s ago | Evaluation time: 1.65ms

alert: KubeCPUOvercommit
expr: sum(kube_resourcequota{job="kube-state-metrics",resource="cpu",type="hard"})
/ sum(kube_node_status_allocatable_cpu_cores) > 1.5
for: 5m
labels:
severity: warning
annotations:
message: Cluster has overcommitted CPU resource requests for Namespaces.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
State: ok | Last evaluation: 19.251s ago | Evaluation time: 384.9us

alert: KubeMemOvercommit
expr: sum(kube_resourcequota{job="kube-state-metrics",resource="memory",type="hard"})
/ sum(kube_node_status_allocatable_memory_bytes{job="node-exporter"}) >
1.5
for: 5m
labels:
severity: warning
annotations:
message: Cluster has overcommitted memory resource requests for Namespaces.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit
State: ok | Last evaluation: 19.25s ago | Evaluation time: 257.1us

alert: KubeQuotaExceeded
expr: 100
* kube_resourcequota{job="kube-state-metrics",type="used"} / ignoring(instance,
job, type) (kube_resourcequota{job="kube-state-metrics",type="hard"}
> 0) > 90
for: 15m
labels:
severity: warning
annotations:
message: Namespace {{ $labels.namespace }} is using {{ printf "%0.0f" $value
}}% of its {{ $labels.resource }} quota.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded
State: ok | Last evaluation: 19.25s ago | Evaluation time: 314.2us

alert: CPUThrottlingHigh
expr: 100
* sum by(container, pod, namespace) (increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m]))
/ sum by(container, pod, namespace) (increase(container_cpu_cfs_periods_total[5m]))
> 25
for: 15m
labels:
severity: warning
annotations:
message: '{{ printf "%0.0f" $value }}% throttling of CPU in namespace {{
$labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod
}}.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
State: ok | Last evaluation: 19.25s ago | Evaluation time: 6.636ms

Rule group | Last evaluation: 9.088s ago | Evaluation time: 13.26ms

alert: KubeNodeNotReady
expr: kube_node_status_condition{condition="Ready",job="kube-state-metrics",status="true"}
== 0
for: 1h
labels:
severity: warning
annotations:
message: '{{ $labels.node }} has been unready for more than an hour.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready
State: ok | Last evaluation: 28.312s ago | Evaluation time: 354.3us

alert: KubeVersionMismatch
expr: count(count
by(gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},
"gitVersion", "$1", "gitVersion", "(v[0-9]*.[0-9]*.[0-9]*).*")))
> 1
for: 1h
labels:
severity: warning
annotations:
message: There are {{ $value }} different semantic versions of Kubernetes components
running.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch
State: ok | Last evaluation: 28.311s ago | Evaluation time: 388.1us

alert: KubeClientErrors
expr: (sum
by(instance, job) (rate(rest_client_requests_total{code=~"5.."}[5m])) /
sum by(instance, job) (rate(rest_client_requests_total[5m]))) * 100 > 1
for: 15m
labels:
severity: warning
annotations:
message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
}}' is experiencing {{ printf "%0.0f" $value }}% errors.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
State: ok | Last evaluation: 28.311s ago | Evaluation time: 1.239ms

alert: KubeClientErrors
expr: sum
by(instance, job) (rate(ksm_scrape_error_total{job="kube-state-metrics"}[5m]))
> 0.1
for: 15m
labels:
severity: warning
annotations:
message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
}}' is experiencing {{ printf "%0.0f" $value }} errors / second.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
State: ok | Last evaluation: 28.31s ago | Evaluation time: 107.3us

alert: KubeletTooManyPods
expr: kubelet_running_pod_count{job="kubelet"}
> 110 * 0.9
for: 15m
labels:
severity: warning
annotations:
message: Kubelet {{ $labels.instance }} is running {{ $value }} Pods, close to the
limit of 110.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
State: ok | Last evaluation: 28.31s ago | Evaluation time: 66.78us

alert: KubeAPILatencyHigh
expr: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"}
> 1
for: 10m
labels:
severity: warning
annotations:
message: The API server has a 99th percentile latency of {{ $value }} seconds for
{{ $labels.verb }} {{ $labels.resource }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
State: ok | Last evaluation: 28.31s ago | Evaluation time: 1.685ms

alert: KubeAPILatencyHigh
expr: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"}
> 4
for: 10m
labels:
severity: critical
annotations:
message: The API server has a 99th percentile latency of {{ $value }} seconds for
{{ $labels.verb }} {{ $labels.resource }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
State: ok | Last evaluation: 28.309s ago | Evaluation time: 1.473ms

alert: KubeAPIErrorsHigh
expr: sum(rate(apiserver_request_total{code=~"^(?:5..)$",job="apiserver"}[5m]))
/ sum(rate(apiserver_request_total{job="apiserver"}[5m])) * 100 > 3
for: 10m
labels:
severity: critical
annotations:
message: API server is returning errors for {{ $value }}% of requests.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
State: ok | Last evaluation: 28.308s ago | Evaluation time: 12.77ms

alert: KubeAPIErrorsHigh
expr: sum(rate(apiserver_request_total{code=~"^(?:5..)$",job="apiserver"}[5m]))
/ sum(rate(apiserver_request_total{job="apiserver"}[5m])) * 100 > 1
for: 10m
labels:
severity: warning
annotations:
message: API server is returning errors for {{ $value }}% of requests.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
State: ok | Last evaluation: 28.295s ago | Evaluation time: 11.93ms

alert: KubeAPIErrorsHigh
expr: sum
by(resource, subresource, verb) (rate(apiserver_request_total{code=~"^(?:5..)$",job="apiserver"}[5m]))
/ sum by(resource, subresource, verb) (rate(apiserver_request_total{job="apiserver"}[5m]))
* 100 > 10
for: 10m
labels:
severity: critical
annotations:
message: API server is returning errors for {{ $value }}% of requests for {{ $labels.verb
}} {{ $labels.resource }} {{ $labels.subresource }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
State: ok | Last evaluation: 28.283s ago | Evaluation time: 12.72ms

alert: KubeAPIErrorsHigh
expr: sum
by(resource, subresource, verb) (rate(apiserver_request_total{code=~"^(?:5..)$",job="apiserver"}[5m]))
/ sum by(resource, subresource, verb) (rate(apiserver_request_total{job="apiserver"}[5m]))
* 100 > 5
for: 10m
labels:
severity: warning
annotations:
message: API server is returning errors for {{ $value }}% of requests for {{ $labels.verb
}} {{ $labels.resource }} {{ $labels.subresource }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
State: ok | Last evaluation: 28.271s ago | Evaluation time: 12.63ms

alert: KubeClientCertificateExpiration
expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"}
> 0 and histogram_quantile(0.01, sum by(job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m])))
< 604800
labels:
severity: warning
annotations:
message: A client certificate used to authenticate to the apiserver is expiring
in less than 7.0 days.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
State: ok | Last evaluation: 28.258s ago | Evaluation time: 898.7us

alert: KubeClientCertificateExpiration
expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"}
> 0 and histogram_quantile(0.01, sum by(job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m])))
< 86400
labels:
severity: critical
annotations:
message: A client certificate used to authenticate to the apiserver is expiring
in less than 24.0 hours.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
State: ok | Last evaluation: 28.258s ago | Evaluation time: 769us

Rule group | Last evaluation: 6.194s ago | Evaluation time: 6.661ms

record: :kube_pod_info_node_count:
expr: sum(min
by(node) (kube_pod_info))
State: ok | Last evaluation: 25.639s ago | Evaluation time: 4.925ms

record: node_namespace_pod:kube_pod_info:
expr: max
by(node, namespace, pod) (label_replace(kube_pod_info{job="kube-state-metrics"},
"pod", "$1", "pod", "(.*)"))
State: ok | Last evaluation: 25.635s ago | Evaluation time: 11.16ms

record: node:node_num_cpu:sum
expr: count
by(node) (sum by(node, cpu) (node_cpu_seconds_total{job="node-exporter"}
* on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:))
State: ok | Last evaluation: 25.623s ago | Evaluation time: 29.39ms

record: :node_cpu_utilisation:avg1m
expr: 1
- avg(rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m]))
State: ok | Last evaluation: 25.594s ago | Evaluation time: 2.64ms

record: node:node_cpu_utilisation:avg1m
expr: 1
- avg by(node) (rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m])
* on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:)
State: ok | Last evaluation: 25.592s ago | Evaluation time: 5.691ms

record: node:cluster_cpu_utilisation:ratio
expr: node:node_cpu_utilisation:avg1m
* node:node_num_cpu:sum / scalar(sum(node:node_num_cpu:sum))
State: ok | Last evaluation: 25.586s ago | Evaluation time: 271.3us

record: :node_cpu_saturation_load1:
expr: sum(node_load1{job="node-exporter"})
/ sum(node:node_num_cpu:sum)
State: ok | Last evaluation: 25.586s ago | Evaluation time: 205.1us

record: node:node_cpu_saturation_load1:
expr: sum
by(node) (node_load1{job="node-exporter"} * on(namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:) / node:node_num_cpu:sum
State: ok | Last evaluation: 25.586s ago | Evaluation time: 5.34ms

record: :node_memory_utilisation:
expr: 1
- sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"}
+ node_memory_Buffers_bytes{job="node-exporter"}) / sum(node_memory_MemTotal_bytes{job="node-exporter"})
State: ok | Last evaluation: 25.581s ago | Evaluation time: 745.3us

record: :node_memory_MemFreeCachedBuffers_bytes:sum
expr: sum(node_memory_MemFree_bytes{job="node-exporter"}
+ node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
State: ok | Last evaluation: 25.58s ago | Evaluation time: 421.2us

record: :node_memory_MemTotal_bytes:sum
expr: sum(node_memory_MemTotal_bytes{job="node-exporter"})
State: ok | Last evaluation: 25.58s ago | Evaluation time: 137.8us

record: node:node_memory_bytes_available:sum
expr: sum
by(node) ((node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"}
+ node_memory_Buffers_bytes{job="node-exporter"}) * on(namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:)
State: ok | Last evaluation: 25.58s ago | Evaluation time: 5.935ms

record: node:node_memory_bytes_total:sum
expr: sum
by(node) (node_memory_MemTotal_bytes{job="node-exporter"} * on(namespace,
pod) group_left(node) node_namespace_pod:kube_pod_info:)
State: ok | Last evaluation: 25.574s ago | Evaluation time: 4.5ms

record: node:node_memory_utilisation:ratio
expr: (node:node_memory_bytes_total:sum
- node:node_memory_bytes_available:sum) / node:node_memory_bytes_total:sum
State: ok | Last evaluation: 25.569s ago | Evaluation time: 339.1us

record: node:cluster_memory_utilisation:ratio
expr: (node:node_memory_bytes_total:sum
- node:node_memory_bytes_available:sum) / scalar(sum(node:node_memory_bytes_total:sum))
State: ok | Last evaluation: 25.569s ago | Evaluation time: 222.4us

record: :node_memory_swap_io_bytes:sum_rate
expr: 1000
* sum((rate(node_vmstat_pgpgin{job="node-exporter"}[1m]) + rate(node_vmstat_pgpgout{job="node-exporter"}[1m])))
State: ok | Last evaluation: 25.569s ago | Evaluation time: 296us

record: node:node_memory_utilisation:
expr: 1
- sum by(node) ((node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"}
+ node_memory_Buffers_bytes{job="node-exporter"}) * on(namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:) / sum by(node) (node_memory_MemTotal_bytes{job="node-exporter"}
* on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:)
State: ok | Last evaluation: 25.569s ago | Evaluation time: 7.494ms

record: node:node_memory_utilisation_2:
expr: 1
- (node:node_memory_bytes_available:sum / node:node_memory_bytes_total:sum)
State: ok | Last evaluation: 25.561s ago | Evaluation time: 300.5us

record: node:node_memory_swap_io_bytes:sum_rate
expr: 1000
* sum by(node) ((rate(node_vmstat_pgpgin{job="node-exporter"}[1m]) + rate(node_vmstat_pgpgout{job="node-exporter"}[1m]))
* on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:)
State: ok | Last evaluation: 25.561s ago | Evaluation time: 4.917ms

record: :node_disk_utilisation:avg_irate
expr: avg(irate(node_disk_io_time_seconds_total{device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+",job="node-exporter"}[1m]))
State: ok | Last evaluation: 25.556s ago | Evaluation time: 2.324ms

record: node:node_disk_utilisation:avg_irate
expr: avg
by(node) (irate(node_disk_io_time_seconds_total{device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+",job="node-exporter"}[1m])
* on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:)
State: ok | Last evaluation: 25.554s ago | Evaluation time: 6.031ms

record: :node_disk_saturation:avg_irate
expr: avg(irate(node_disk_io_time_weighted_seconds_total{device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+",job="node-exporter"}[1m]))
State: ok | Last evaluation: 25.548s ago | Evaluation time: 1.601ms

record: node:node_disk_saturation:avg_irate
expr: avg
by(node) (irate(node_disk_io_time_weighted_seconds_total{device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+",job="node-exporter"}[1m])
* on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:)
State: ok | Last evaluation: 25.547s ago | Evaluation time: 4.213ms

record: node:node_filesystem_usage:
expr: max
by(instance, namespace, pod, device) ((node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}
- node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
State: ok | Last evaluation: 25.543s ago | Evaluation time: 489.4us

record: node:node_filesystem_avail:
expr: max
by(instance, namespace, pod, device) (node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}
/ node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
State: ok | Last evaluation: 25.542s ago | Evaluation time: 299.8us

record: :node_net_utilisation:sum_irate
expr: sum(irate(node_network_receive_bytes_total{device!~"veth.+",job="node-exporter"}[1m]))
+ sum(irate(node_network_transmit_bytes_total{device!~"veth.+",job="node-exporter"}[1m]))
State: ok | Last evaluation: 25.542s ago | Evaluation time: 2.533ms

record: node:node_net_utilisation:sum_irate
expr: sum
by(node) ((irate(node_network_receive_bytes_total{device!~"veth.+",job="node-exporter"}[1m])
+ irate(node_network_transmit_bytes_total{device!~"veth.+",job="node-exporter"}[1m]))
* on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:)
State: ok | Last evaluation: 25.54s ago | Evaluation time: 4.741ms

record: :node_net_saturation:sum_irate
expr: sum(irate(node_network_receive_drop_total{device!~"veth.+",job="node-exporter"}[1m]))
+ sum(irate(node_network_transmit_drop_total{device!~"veth.+",job="node-exporter"}[1m]))
State: ok | Last evaluation: 25.535s ago | Evaluation time: 1.656ms

record: node:node_net_saturation:sum_irate
expr: sum
by(node) ((irate(node_network_receive_drop_total{device!~"veth.+",job="node-exporter"}[1m])
+ irate(node_network_transmit_drop_total{device!~"veth.+",job="node-exporter"}[1m]))
* on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:)
State: ok | Last evaluation: 25.533s ago | Evaluation time: 4.361ms

record: node:node_inodes_total:
expr: max
by(node) (max by(node, host_ip) (kube_pod_info{host_ip!="",job="kube-state-metrics"})
* on(host_ip) group_right(node) label_replace((max by(instance) (node_filesystem_files{job="node-exporter",mountpoint="/"})),
"host_ip", "$1", "instance", "(.*):.*"))
State: ok | Last evaluation: 25.529s ago | Evaluation time: 3.216ms

record: node:node_inodes_free:
expr: max
by(node) (max by(node, host_ip) (kube_pod_info{host_ip!="",job="kube-state-metrics"})
* on(host_ip) group_right(node) label_replace((max by(instance) (node_filesystem_files_free{job="node-exporter",mountpoint="/"})),
"host_ip", "$1", "instance", "(.*):.*"))
State: ok | Last evaluation: 25.526s ago | Evaluation time: 2.977ms
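
These record: rules precompute expensive aggregations so that dashboards and alerts can reference a single cheap series instead of re-running the raw queries. As an illustration only (this alert is not part of the shipped set; the group name and the 0.85 threshold are hypothetical), an alert built on top of node:node_filesystem_usage: could look like this:

```yaml
groups:
  - name: node-disk-custom           # hypothetical group
    rules:
      - alert: NodeFilesystemAlmostFull
        # reuse the precomputed usage ratio instead of repeating the node_filesystem_* expression
        expr: node:node_filesystem_usage: > 0.85
        for: 30m
        labels:
          severity: warning
        annotations:
          message: 'Filesystem {{ $labels.device }} on instance {{ $labels.instance }} has usage ratio {{ printf "%.2f" $value }}.'
```
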
Rule group | Last evaluation: 28.868s ago | Evaluation time: 915.4us

alert: PrometheusBadConfig
expr: max_over_time(prometheus_config_last_reload_successful{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])
== 0
for: 10m
labels:
severity: critical
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to reload
its configuration.
summary: Failed Prometheus configuration reload.
State: ok | Last evaluation: 25.378s ago | Evaluation time: 434.1us

alert: PrometheusNotificationQueueRunningFull
expr: (predict_linear(prometheus_notifications_queue_length{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m],
60 * 30) > min_over_time(prometheus_notifications_queue_capacity{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]))
for: 15m
labels:
severity: warning
annotations:
description: Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}}
is running full.
summary: Prometheus alert notification queue predicted to run full in less than
30m.
State: ok | Last evaluation: 25.378s ago | Evaluation time: 335.7us

alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
expr: (rate(prometheus_notifications_errors_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])
/ rate(prometheus_notifications_sent_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]))
* 100 > 1
for: 15m
labels:
severity: warning
annotations:
description: '{{ printf "%.1f" $value }}% errors while sending alerts from
Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}.'
summary: Prometheus has encountered more than 1% errors sending alerts to a specific
Alertmanager.
State: ok | Last evaluation: 25.378s ago | Evaluation time: 419.5us

alert: PrometheusErrorSendingAlertsToAnyAlertmanager
expr: min
without(alertmanager) (rate(prometheus_notifications_errors_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])
/ rate(prometheus_notifications_sent_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]))
* 100 > 3
for: 15m
labels:
severity: critical
annotations:
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts
from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
State: ok | Last evaluation: 25.377s ago | Evaluation time: 341.2us

alert: PrometheusNotConnectedToAlertmanagers
expr: max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])
< 1
for: 10m
labels:
severity: warning
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to
any Alertmanagers.
summary: Prometheus is not connected to any Alertmanagers.
State: ok | Last evaluation: 25.377s ago | Evaluation time: 164us

alert: PrometheusTSDBReloadsFailing
expr: increase(prometheus_tsdb_reloads_failures_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[3h])
> 0
for: 4h
labels:
severity: warning
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value
| humanize}} reload failures over the last 3h.
summary: Prometheus has issues reloading blocks from disk.
State: ok | Last evaluation: 25.377s ago | Evaluation time: 340.8us

alert: PrometheusTSDBCompactionsFailing
expr: increase(prometheus_tsdb_compactions_failed_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[3h])
> 0
for: 4h
labels:
severity: warning
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value
| humanize}} compaction failures over the last 3h.
summary: Prometheus has issues compacting blocks.
State: ok | Last evaluation: 25.377s ago | Evaluation time: 267.5us

alert: PrometheusTSDBWALCorruptions
expr: increase(tsdb_wal_corruptions_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[3h])
> 0
for: 4h
labels:
severity: warning
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value
| humanize}} corruptions of the write-ahead log (WAL) over the last 3h.
summary: Prometheus is detecting WAL corruptions.
State: ok | Last evaluation: 25.377s ago | Evaluation time: 154.4us

alert: PrometheusNotIngestingSamples
expr: rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])
<= 0
for: 10m
labels:
severity: warning
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting samples.
summary: Prometheus is not ingesting samples.
State: ok | Last evaluation: 25.377s ago | Evaluation time: 166.3us

alert: PrometheusDuplicateTimestamps
expr: rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])
> 0
for: 10m
labels:
severity: warning
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{$value
| humanize}} samples/s with different values but duplicated timestamp.
summary: Prometheus is dropping samples with duplicate timestamps.
State: ok | Last evaluation: 25.377s ago | Evaluation time: 158.5us

alert: PrometheusOutOfOrderTimestamps
expr: rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])
> 0
for: 10m
labels:
severity: warning
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{$value
| humanize}} samples/s with timestamps arriving out of order.
summary: Prometheus drops samples with out-of-order timestamps.
State: ok | Last evaluation: 25.377s ago | Evaluation time: 146.8us

alert: PrometheusRemoteStorageFailures
expr: (rate(prometheus_remote_storage_failed_samples_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])
/ (rate(prometheus_remote_storage_failed_samples_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])
+ rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])))
* 100 > 1
for: 15m
labels:
severity: critical
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send {{
printf "%.1f" $value }}% of the samples to queue {{$labels.queue}}.
summary: Prometheus fails to send samples to remote storage.
State: ok | Last evaluation: 25.377s ago | Evaluation time: 345.2us

alert: PrometheusRemoteWriteBehind
expr: (max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])
- on(job, instance) group_right() max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]))
> 120
for: 15m
labels:
severity: critical
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write is {{
printf "%.1f" $value }}s behind for queue {{$labels.queue}}.
summary: Prometheus remote write is behind.
State: ok | Last evaluation: 25.377s ago | Evaluation time: 290.1us

alert: PrometheusRuleFailures
expr: increase(prometheus_rule_evaluation_failures_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])
> 0
for: 15m
labels:
severity: critical
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to evaluate
{{ printf "%.0f" $value }} rules in the last 5m.
summary: Prometheus is failing rule evaluations.
State: ok | Last evaluation: 25.377s ago | Evaluation time: 170.3us

alert: PrometheusMissingRuleEvaluations
expr: increase(prometheus_rule_group_iterations_missed_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])
> 0
for: 15m
labels:
severity: warning
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed {{ printf
"%.0f" $value }} rule group evaluations in the last 5m.
summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
State: ok | Last evaluation: 25.377s ago | Evaluation time: 167.5us