| Rule | State | Error | Last Evaluation | Evaluation Time |
| alert: etcdInsufficientMembers
expr: sum
by(job) (up{job=~".*etcd.*"} == bool 1) < ((count by(job) (up{job=~".*etcd.*"})
+ 1) / 2)
for: 3m
labels:
severity: critical
annotations:
message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value
}}).'
| ok | | 6.199s ago | 1.768ms |
| alert: etcdNoLeader
expr: etcd_server_has_leader{job=~".*etcd.*"}
== 0
for: 1m
labels:
severity: critical
annotations:
message: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }}
has no leader.'
| ok | | 6.198s ago | 226.2us |
| alert: etcdHighNumberOfLeaderChanges
expr: rate(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}[15m])
> 3
for: 15m
labels:
severity: warning
annotations:
message: 'etcd cluster "{{ $labels.job }}": instance {{ $labels.instance
}} has seen {{ $value }} leader changes within the last hour.'
| ok | | 6.198s ago | 216.7us |
| alert: etcdHighNumberOfFailedGRPCRequests
expr: 100
* sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{grpc_code!="OK",job=~".*etcd.*"}[5m]))
/ sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{job=~".*etcd.*"}[5m]))
> 1
for: 10m
labels:
severity: warning
annotations:
message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for
{{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
| ok | | 6.198s ago | 417.4us |
| alert: etcdHighNumberOfFailedGRPCRequests
expr: 100
* sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{grpc_code!="OK",job=~".*etcd.*"}[5m]))
/ sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{job=~".*etcd.*"}[5m]))
> 5
for: 5m
labels:
severity: critical
annotations:
message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for
{{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
| ok | | 6.198s ago | 404.8us |
| alert: etcdGRPCRequestsSlow
expr: histogram_quantile(0.99,
sum by(job, instance, grpc_service, grpc_method, le) (rate(grpc_server_handling_seconds_bucket{grpc_type="unary",job=~".*etcd.*"}[5m])))
> 0.15
for: 10m
labels:
severity: critical
annotations:
message: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method
}} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
| ok | | 6.197s ago | 261.9us |
| alert: etcdMemberCommunicationSlow
expr: histogram_quantile(0.99,
rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.15
for: 10m
labels:
severity: warning
annotations:
message: 'etcd cluster "{{ $labels.job }}": member communication with {{
$labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
| ok | | 6.197s ago | 208.8us |
| alert: etcdHighNumberOfFailedProposals
expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m])
> 5
for: 15m
labels:
severity: warning
annotations:
message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures
within the last hour on etcd instance {{ $labels.instance }}.'
| ok | | 6.197s ago | 177.8us |
| alert: etcdHighFsyncDurations
expr: histogram_quantile(0.99,
rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.5
for: 10m
labels:
severity: warning
annotations:
message: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations
are {{ $value }}s on etcd instance {{ $labels.instance }}.'
| ok | | 6.197s ago | 283.8us |
| alert: etcdHighCommitDurations
expr: histogram_quantile(0.99,
rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.25
for: 10m
labels:
severity: warning
annotations:
message: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations
are {{ $value }}s on etcd instance {{ $labels.instance }}.'
| ok | | 6.197s ago | 225.9us |
| alert: etcdHighNumberOfFailedHTTPRequests
expr: sum
by(method) (rate(etcd_http_failed_total{code!="404",job=~".*etcd.*"}[5m]))
/ sum by(method) (rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) >
0.01
for: 10m
labels:
severity: warning
annotations:
message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance
{{ $labels.instance }}.'
| ok | | 6.197s ago | 366.6us |
| alert: etcdHighNumberOfFailedHTTPRequests
expr: sum
by(method) (rate(etcd_http_failed_total{code!="404",job=~".*etcd.*"}[5m]))
/ sum by(method) (rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) >
0.05
for: 10m
labels:
severity: critical
annotations:
message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance
{{ $labels.instance }}.'
| ok | | 6.197s ago | 352.1us |
| alert: etcdHTTPRequestsSlow
expr: histogram_quantile(0.99,
rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
for: 10m
labels:
severity: warning
annotations:
message: etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method
}} are slow.
| ok | | 6.197s ago | 131.3us |
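
The etcd alerts above can be unit-tested offline with `promtool test rules`. The sketch below exercises only etcdNoLeader; the test file name, rule file name, job label, and instance address are assumptions, while the expression, `for` duration, and message template come from the rule as listed.

```yaml
# etcd_no_leader_test.yaml (hypothetical) -- run with: promtool test rules etcd_no_leader_test.yaml
rule_files:
  - etcd.rules.yaml              # assumed file holding the etcd rules shown above
evaluation_interval: 30s
tests:
  - interval: 30s
    input_series:
      # One member reporting no leader for the whole window (labels are invented).
      - series: 'etcd_server_has_leader{job="etcd", instance="10.0.0.1:2379"}'
        values: '0+0x10'
    alert_rule_test:
      # With "for: 1m", the alert should be firing by the two-minute mark.
      - eval_time: 2m
        alertname: etcdNoLeader
        exp_alerts:
          - exp_labels:
              severity: critical
              job: etcd
              instance: "10.0.0.1:2379"
            exp_annotations:
              message: 'etcd cluster "etcd": member 10.0.0.1:2379 has no leader.'
```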
| 21.88s ago | 4.446ms |
| Rule | State | Error | Last Evaluation | Evaluation Time |
| alert: AlertmanagerDown
expr: absent(up{job="prometheus-prometheus-oper-alertmanager",namespace="monitoring"}
== 1)
for: 15m
labels:
severity: critical
annotations:
message: Alertmanager has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerdown
| ok | | 10.138s ago | 499.1us |
| alert: KubeAPIDown
expr: absent(up{job="apiserver"}
== 1)
for: 15m
labels:
severity: critical
annotations:
message: KubeAPI has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown
| ok | | 10.137s ago | 357.5us |
| alert: KubeControllerManagerDown
expr: absent(up{job="kube-controller-manager"}
== 1)
for: 15m
labels:
severity: critical
annotations:
message: KubeControllerManager has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontrollermanagerdown
| ok | | 10.137s ago | 829us |
| alert: KubeSchedulerDown
expr: absent(up{job="kube-scheduler"}
== 1)
for: 15m
labels:
severity: critical
annotations:
message: KubeScheduler has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeschedulerdown
| ok | | 10.136s ago | 719.4us |
| alert: KubeStateMetricsDown
expr: absent(up{job="kube-state-metrics"}
== 1)
for: 15m
labels:
severity: critical
annotations:
message: KubeStateMetrics has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricsdown
| ok | | 10.136s ago | 219.8us |
| alert: KubeletDown
expr: absent(up{job="kubelet"}
== 1)
for: 15m
labels:
severity: critical
annotations:
message: Kubelet has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown
| ok | | 10.136s ago | 336.9us |
| alert: NodeExporterDown
expr: absent(up{job="node-exporter"}
== 1)
for: 15m
labels:
severity: critical
annotations:
message: NodeExporter has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeexporterdown
| ok | | 10.135s ago | 242.8us |
| alert: PrometheusDown
expr: absent(up{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}
== 1)
for: 15m
labels:
severity: critical
annotations:
message: Prometheus has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusdown
| ok | | 10.135s ago | 216.7us |
| alert: PrometheusOperatorDown
expr: absent(up{job="prometheus-prometheus-oper-operator",namespace="monitoring"}
== 1)
for: 15m
labels:
severity: critical
annotations:
message: PrometheusOperator has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatordown
| ok | | 10.135s ago | 206.5us |
| 4.772s ago | 30.22ms |
| Rule | State | Error | Last Evaluation | Evaluation Time |
| alert: KubePodCrashLooping
expr: rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m])
* 60 * 5 > 0
for: 1h
labels:
severity: critical
annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }})
is restarting {{ printf "%.2f" $value }} times / 5 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
| ok | | 4.772s ago | 8.106ms |
| alert: KubePodNotReady
expr: sum
by(namespace, pod) (kube_pod_status_phase{job="kube-state-metrics",phase=~"Failed|Pending|Unknown"})
> 0
for: 1h
labels:
severity: critical
annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state
for longer than an hour.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
| ok | | 4.764s ago | 10.6ms |
| alert: KubeDeploymentGenerationMismatch
expr: kube_deployment_status_observed_generation{job="kube-state-metrics"}
!= kube_deployment_metadata_generation{job="kube-state-metrics"}
for: 15m
labels:
severity: critical
annotations:
message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
}} does not match; this indicates that the Deployment has failed but has not been
rolled back.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch
| ok | | 4.754s ago | 4.108ms |
| alert: KubeDeploymentReplicasMismatch
expr: kube_deployment_spec_replicas{job="kube-state-metrics"}
!= kube_deployment_status_replicas_available{job="kube-state-metrics"}
for: 1h
labels:
severity: critical
annotations:
message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched
the expected number of replicas for longer than an hour.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch
| ok | | 4.75s ago | 4.117ms |
| alert: KubeStatefulSetReplicasMismatch
expr: kube_statefulset_status_replicas_ready{job="kube-state-metrics"}
!= kube_statefulset_status_replicas{job="kube-state-metrics"}
for: 15m
labels:
severity: critical
annotations:
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched
the expected number of replicas for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch
| ok | | 4.746s ago | 367us |
| alert: KubeStatefulSetGenerationMismatch
expr: kube_statefulset_status_observed_generation{job="kube-state-metrics"}
!= kube_statefulset_metadata_generation{job="kube-state-metrics"}
for: 15m
labels:
severity: critical
annotations:
message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
}} does not match; this indicates that the StatefulSet has failed but has not
been rolled back.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch
| ok | | 4.746s ago | 341.4us |
| alert: KubeStatefulSetUpdateNotRolledOut
expr: max
without(revision) (kube_statefulset_status_current_revision{job="kube-state-metrics"}
unless kube_statefulset_status_update_revision{job="kube-state-metrics"})
* (kube_statefulset_replicas{job="kube-state-metrics"} != kube_statefulset_status_replicas_updated{job="kube-state-metrics"})
for: 15m
labels:
severity: critical
annotations:
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has
not been rolled out.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout
| ok | | 4.745s ago | 705.8us |
| alert: KubeDaemonSetRolloutStuck
expr: kube_daemonset_status_number_ready{job="kube-state-metrics"}
/ kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
* 100 < 100
for: 15m
labels:
severity: critical
annotations:
message: Only {{ $value }}% of the desired Pods of DaemonSet {{ $labels.namespace
}}/{{ $labels.daemonset }} are scheduled and ready.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
| ok | | 4.745s ago | 219.8us |
| alert: KubeDaemonSetNotScheduled
expr: kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
- kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"}
> 0
for: 10m
labels:
severity: warning
annotations:
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
}} are not scheduled.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled
| ok | | 4.745s ago | 182.1us |
| alert: KubeDaemonSetMisScheduled
expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics"}
> 0
for: 10m
labels:
severity: warning
annotations:
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
}} are running where they are not supposed to run.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled
| ok | | 4.745s ago | 94.55us |
| alert: KubeCronJobRunning
expr: time()
- kube_cronjob_next_schedule_time{job="kube-state-metrics"} > 3600
for: 1h
labels:
severity: warning
annotations:
message: CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than
1h to complete.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecronjobrunning
| ok | | 4.745s ago | 87.27us |
| alert: KubeJobCompletion
expr: kube_job_spec_completions{job="kube-state-metrics"}
- kube_job_status_succeeded{job="kube-state-metrics"} > 0
for: 1h
labels:
severity: warning
annotations:
message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than
one hour to complete.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion
| ok | | 4.745s ago | 913.2us |
| alert: KubeJobFailed
expr: kube_job_status_failed{job="kube-state-metrics"}
> 0
for: 1h
labels:
severity: warning
annotations:
message: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed
| ok | | 4.744s ago | 336.6us |
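
The application alerts above can be verified the same way. Below is a hypothetical `promtool` unit test for KubePodNotReady that drives a single Pod stuck in Pending; the namespace, Pod name, and file names are invented, the rest is taken from the rule as listed.

```yaml
# kube_pod_not_ready_test.yaml (hypothetical)
rule_files:
  - kubernetes-apps.rules.yaml   # assumed file holding the KubePodNotReady rule above
evaluation_interval: 1m
tests:
  - interval: 1m
    input_series:
      # A Pod stuck in Pending for the whole test window (labels are invented).
      - series: 'kube_pod_status_phase{job="kube-state-metrics", namespace="demo", pod="web-0", phase="Pending"}'
        values: '1+0x70'
    alert_rule_test:
      # With "for: 1h", the alert should be firing shortly after the one-hour mark.
      - eval_time: 65m
        alertname: KubePodNotReady
        exp_alerts:
          - exp_labels:
              severity: critical
              namespace: demo
              pod: web-0
            exp_annotations:
              message: Pod demo/web-0 has been in a non-ready state for longer than an hour.
              runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
```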
| 20.081s ago | 11.99ms |
| Rule | State | Error | Last Evaluation | Evaluation Time |
| alert: KubeCPUOvercommit
expr: sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum)
/ sum(kube_node_status_allocatable_cpu_cores) > (count(kube_node_status_allocatable_cpu_cores)
- 1) / count(kube_node_status_allocatable_cpu_cores)
for: 5m
labels:
severity: warning
annotations:
message: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate
node failure.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
| ok | | 20.081s ago | 1.275ms |
| alert: KubeMemOvercommit
expr: sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum)
/ sum(kube_node_status_allocatable_memory_bytes) > (count(kube_node_status_allocatable_memory_bytes)
- 1) / count(kube_node_status_allocatable_memory_bytes)
for: 5m
labels:
severity: warning
annotations:
message: Cluster has overcommitted memory resource requests for Pods and cannot
tolerate node failure.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit
| ok | | 20.08s ago | 1.573ms |
| alert: KubeCPUOvercommit
expr: sum(kube_resourcequota{job="kube-state-metrics",resource="cpu",type="hard"})
/ sum(kube_node_status_allocatable_cpu_cores) > 1.5
for: 5m
labels:
severity: warning
annotations:
message: Cluster has overcommitted CPU resource requests for Namespaces.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
| ok | | 20.078s ago | 376.2us |
| alert: KubeMemOvercommit
expr: sum(kube_resourcequota{job="kube-state-metrics",resource="memory",type="hard"})
/ sum(kube_node_status_allocatable_memory_bytes{job="node-exporter"}) >
1.5
for: 5m
labels:
severity: warning
annotations:
message: Cluster has overcommitted memory resource requests for Namespaces.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit
| ok | | 20.078s ago | 271.4us |
| alert: KubeQuotaExceeded
expr: 100
* kube_resourcequota{job="kube-state-metrics",type="used"} / ignoring(instance,
job, type) (kube_resourcequota{job="kube-state-metrics",type="hard"}
> 0) > 90
for: 15m
labels:
severity: warning
annotations:
message: Namespace {{ $labels.namespace }} is using {{ printf "%0.0f" $value
}}% of its {{ $labels.resource }} quota.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded
| ok | | 20.078s ago | 309.4us |
| alert: CPUThrottlingHigh
expr: 100
* sum by(container, pod, namespace) (increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m]))
/ sum by(container, pod, namespace) (increase(container_cpu_cfs_periods_total[5m]))
> 25
for: 15m
labels:
severity: warning
annotations:
message: '{{ printf "%0.0f" $value }}% throttling of CPU in namespace {{
$labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod
}}.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
| ok | | 20.078s ago | 8.156ms |
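
The KubeQuotaExceeded expression above reads as 100 * used / hard, filtered to quotas over 90% utilisation. A hypothetical `promtool` expression test with an invented namespace, resource, and values makes the arithmetic concrete:

```yaml
# kube_quota_exceeded_test.yaml (hypothetical) -- checks the expression only
rule_files: []
tests:
  - interval: 1m
    input_series:
      - series: 'kube_resourcequota{job="kube-state-metrics", namespace="demo", resource="pods", type="used"}'
        values: '95+0x10'
      - series: 'kube_resourcequota{job="kube-state-metrics", namespace="demo", resource="pods", type="hard"}'
        values: '100+0x10'
    promql_expr_test:
      - expr: >-
          100 * kube_resourcequota{job="kube-state-metrics",type="used"}
          / ignoring(instance, job, type)
          (kube_resourcequota{job="kube-state-metrics",type="hard"} > 0) > 90
        eval_time: 5m
        exp_samples:
          # 100 * 95 / 100 = 95, which clears the 90% threshold.
          - labels: '{namespace="demo", resource="pods"}'
            value: 95
```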
| 9.916s ago | 13.34ms |
| Rule | State | Error | Last Evaluation | Evaluation Time |
| alert: KubeNodeNotReady
expr: kube_node_status_condition{condition="Ready",job="kube-state-metrics",status="true"}
== 0
for: 1h
labels:
severity: warning
annotations:
message: '{{ $labels.node }} has been unready for more than an hour.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready
| ok | | 29.14s ago | 406.1us |
| alert: KubeVersionMismatch
expr: count(count
by(gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},
"gitVersion", "$1", "gitVersion", "(v[0-9]*.[0-9]*.[0-9]*).*")))
> 1
for: 1h
labels:
severity: warning
annotations:
message: There are {{ $value }} different semantic versions of Kubernetes components
running.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch
| ok | | 29.139s ago | 703.4us |
| alert: KubeClientErrors
expr: (sum
by(instance, job) (rate(rest_client_requests_total{code=~"5.."}[5m])) /
sum by(instance, job) (rate(rest_client_requests_total[5m]))) * 100 > 1
for: 15m
labels:
severity: warning
annotations:
message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
}}' is experiencing {{ printf "%0.0f" $value }}% errors.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
| ok | | 29.139s ago | 1.972ms |
| alert: KubeClientErrors
expr: sum
by(instance, job) (rate(ksm_scrape_error_total{job="kube-state-metrics"}[5m]))
> 0.1
for: 15m
labels:
severity: warning
annotations:
message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
}}' is experiencing {{ printf "%0.0f" $value }} errors / second.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
| ok | | 29.137s ago | 150.6us |
| alert: KubeletTooManyPods
expr: kubelet_running_pod_count{job="kubelet"}
> 110 * 0.9
for: 15m
labels:
severity: warning
annotations:
message: Kubelet {{ $labels.instance }} is running {{ $value }} Pods, close to the
limit of 110.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
| ok | | 29.137s ago | 84.1us |
| alert: KubeAPILatencyHigh
expr: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"}
> 1
for: 10m
labels:
severity: warning
annotations:
message: The API server has a 99th percentile latency of {{ $value }} seconds for
{{ $labels.verb }} {{ $labels.resource }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
| ok | | 29.137s ago | 2.367ms |
| alert: KubeAPILatencyHigh
expr: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"}
> 4
for: 10m
labels:
severity: critical
annotations:
message: The API server has a 99th percentile latency of {{ $value }} seconds for
{{ $labels.verb }} {{ $labels.resource }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
| ok | | 29.135s ago | 2.166ms |
| alert: KubeAPIErrorsHigh
expr: sum(rate(apiserver_request_total{code=~"^(?:5..)$",job="apiserver"}[5m]))
/ sum(rate(apiserver_request_total{job="apiserver"}[5m])) * 100 > 3
for: 10m
labels:
severity: critical
annotations:
message: API server is returning errors for {{ $value }}% of requests.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
| ok | | 29.133s ago | 25.94ms |
| alert: KubeAPIErrorsHigh
expr: sum(rate(apiserver_request_total{code=~"^(?:5..)$",job="apiserver"}[5m]))
/ sum(rate(apiserver_request_total{job="apiserver"}[5m])) * 100 > 1
for: 10m
labels:
severity: warning
annotations:
message: API server is returning errors for {{ $value }}% of requests.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
| ok | | 29.108s ago | 21.82ms |
| alert: KubeAPIErrorsHigh
expr: sum
by(resource, subresource, verb) (rate(apiserver_request_total{code=~"^(?:5..)$",job="apiserver"}[5m]))
/ sum by(resource, subresource, verb) (rate(apiserver_request_total{job="apiserver"}[5m]))
* 100 > 10
for: 10m
labels:
severity: critical
annotations:
message: API server is returning errors for {{ $value }}% of requests for {{ $labels.verb
}} {{ $labels.resource }} {{ $labels.subresource }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
| ok | | 29.086s ago | 19.16ms |
| alert: KubeAPIErrorsHigh
expr: sum
by(resource, subresource, verb) (rate(apiserver_request_total{code=~"^(?:5..)$",job="apiserver"}[5m]))
/ sum by(resource, subresource, verb) (rate(apiserver_request_total{job="apiserver"}[5m]))
* 100 > 5
for: 10m
labels:
severity: warning
annotations:
message: API server is returning errors for {{ $value }}% of requests for {{ $labels.verb
}} {{ $labels.resource }} {{ $labels.subresource }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
| ok | | 29.067s ago | 17.65ms |
| alert: KubeClientCertificateExpiration
expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"}
> 0 and histogram_quantile(0.01, sum by(job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m])))
< 604800
labels:
severity: warning
annotations:
message: A client certificate used to authenticate to the apiserver is expiring
in less than 7.0 days.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
| ok | | 29.049s ago | 883.6us |
| alert: KubeClientCertificateExpiration
expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"}
> 0 and histogram_quantile(0.01, sum by(job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m])))
< 86400
labels:
severity: critical
annotations:
message: A client certificate used to authenticate to the apiserver is expiring
in less than 24.0 hours.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
| ok | | 29.049s ago | 748.7us |
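
The KubeletTooManyPods threshold above is 90% of the default 110-pods-per-node limit, so the alert triggers above 99 running Pods. A hypothetical expression check (instance and pod count invented) shows the comparison acting as a filter:

```yaml
# kubelet_too_many_pods_test.yaml (hypothetical) -- checks the expression only
rule_files: []
tests:
  - interval: 1m
    input_series:
      - series: 'kubelet_running_pod_count{job="kubelet", instance="10.0.0.3:10250"}'
        values: '105+0x10'
    promql_expr_test:
      - expr: kubelet_running_pod_count{job="kubelet"} > 110 * 0.9
        eval_time: 5m
        exp_samples:
          # 105 > 99, so the series passes the filter with its value unchanged.
          - labels: 'kubelet_running_pod_count{job="kubelet", instance="10.0.0.3:10250"}'
            value: 105
```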
| 7.022s ago | 4.524ms |
| Rule | State | Error | Last Evaluation | Evaluation Time |
| record: :kube_pod_info_node_count:
expr: sum(min
by(node) (kube_pod_info))
| ok | | 26.467s ago | 6.721ms |
| record: node_namespace_pod:kube_pod_info:
expr: max
by(node, namespace, pod) (label_replace(kube_pod_info{job="kube-state-metrics"},
"pod", "$1", "pod", "(.*)"))
| ok | | 26.461s ago | 7.875ms |
| record: node:node_num_cpu:sum
expr: count
by(node) (sum by(node, cpu) (node_cpu_seconds_total{job="node-exporter"}
* on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:))
| ok | | 26.453s ago | 18.97ms |
| record: :node_cpu_utilisation:avg1m
expr: 1
- avg(rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m]))
| ok | | 26.434s ago | 2.145ms |
| record: node:node_cpu_utilisation:avg1m
expr: 1
- avg by(node) (rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m])
* on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:)
| ok | | 26.432s ago | 5.037ms |
| record: node:cluster_cpu_utilisation:ratio
expr: node:node_cpu_utilisation:avg1m
* node:node_num_cpu:sum / scalar(sum(node:node_num_cpu:sum))
| ok | | 26.427s ago | 255us |
| record: :node_cpu_saturation_load1:
expr: sum(node_load1{job="node-exporter"})
/ sum(node:node_num_cpu:sum)
| ok | | 26.427s ago | 180.8us |
| record: node:node_cpu_saturation_load1:
expr: sum
by(node) (node_load1{job="node-exporter"} * on(namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:) / node:node_num_cpu:sum
| ok | | 26.427s ago | 2.809ms |
| record: :node_memory_utilisation:
expr: 1
- sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"}
+ node_memory_Buffers_bytes{job="node-exporter"}) / sum(node_memory_MemTotal_bytes{job="node-exporter"})
| ok | | 26.424s ago | 453.7us |
| record: :node_memory_MemFreeCachedBuffers_bytes:sum
expr: sum(node_memory_MemFree_bytes{job="node-exporter"}
+ node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
| ok | | 26.424s ago | 277.4us |
| record: :node_memory_MemTotal_bytes:sum
expr: sum(node_memory_MemTotal_bytes{job="node-exporter"})
| ok | | 26.424s ago | 107.7us |
| record: node:node_memory_bytes_available:sum
expr: sum
by(node) ((node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"}
+ node_memory_Buffers_bytes{job="node-exporter"}) * on(namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:)
| ok | | 26.424s ago | 2.92ms |
| record: node:node_memory_bytes_total:sum
expr: sum
by(node) (node_memory_MemTotal_bytes{job="node-exporter"} * on(namespace,
pod) group_left(node) node_namespace_pod:kube_pod_info:)
| ok | | 26.421s ago | 2.723ms |
| record: node:node_memory_utilisation:ratio
expr: (node:node_memory_bytes_total:sum
- node:node_memory_bytes_available:sum) / node:node_memory_bytes_total:sum
| ok | | 26.418s ago | 229us |
| record: node:cluster_memory_utilisation:ratio
expr: (node:node_memory_bytes_total:sum
- node:node_memory_bytes_available:sum) / scalar(sum(node:node_memory_bytes_total:sum))
| ok | | 26.418s ago | 221.2us |
| record: :node_memory_swap_io_bytes:sum_rate
expr: 1000
* sum((rate(node_vmstat_pgpgin{job="node-exporter"}[1m]) + rate(node_vmstat_pgpgout{job="node-exporter"}[1m])))
| ok | | 26.418s ago | 273.5us |
| record: node:node_memory_utilisation:
expr: 1
- sum by(node) ((node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"}
+ node_memory_Buffers_bytes{job="node-exporter"}) * on(namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:) / sum by(node) (node_memory_MemTotal_bytes{job="node-exporter"}
* on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:)
| ok | | 26.418s ago | 5.574ms |
| record: node:node_memory_utilisation_2:
expr: 1
- (node:node_memory_bytes_available:sum / node:node_memory_bytes_total:sum)
| ok | | 26.413s ago | 201.7us |
| record: node:node_memory_swap_io_bytes:sum_rate
expr: 1000
* sum by(node) ((rate(node_vmstat_pgpgin{job="node-exporter"}[1m]) + rate(node_vmstat_pgpgout{job="node-exporter"}[1m]))
* on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:)
| ok | | 26.413s ago | 2.891ms |
| record: :node_disk_utilisation:avg_irate
expr: avg(irate(node_disk_io_time_seconds_total{device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+",job="node-exporter"}[1m]))
| ok | | 26.41s ago | 1.36ms |
| record: node:node_disk_utilisation:avg_irate
expr: avg
by(node) (irate(node_disk_io_time_seconds_total{device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+",job="node-exporter"}[1m])
* on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:)
| ok | | 26.408s ago | 3.981ms |
| record: :node_disk_saturation:avg_irate
expr: avg(irate(node_disk_io_time_weighted_seconds_total{device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+",job="node-exporter"}[1m]))
| ok | | 26.405s ago | 1.332ms |
| record: node:node_disk_saturation:avg_irate
expr: avg
by(node) (irate(node_disk_io_time_weighted_seconds_total{device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+",job="node-exporter"}[1m])
* on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:)
| ok | | 26.403s ago | 3.994ms |
| record: node:node_filesystem_usage:
expr: max
by(instance, namespace, pod, device) ((node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}
- node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
| ok | | 26.4s ago | 471.4us |
| record: node:node_filesystem_avail:
expr: max
by(instance, namespace, pod, device) (node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}
/ node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
| ok | | 26.399s ago | 304.3us |
| record: :node_net_utilisation:sum_irate
expr: sum(irate(node_network_receive_bytes_total{device!~"veth.+",job="node-exporter"}[1m]))
+ sum(irate(node_network_transmit_bytes_total{device!~"veth.+",job="node-exporter"}[1m]))
| ok | | 26.399s ago | 1.911ms |
| record: node:node_net_utilisation:sum_irate
expr: sum
by(node) ((irate(node_network_receive_bytes_total{device!~"veth.+",job="node-exporter"}[1m])
+ irate(node_network_transmit_bytes_total{device!~"veth.+",job="node-exporter"}[1m]))
* on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:)
| ok | | 26.397s ago | 4.652ms |
| record: :node_net_saturation:sum_irate
expr: sum(irate(node_network_receive_drop_total{device!~"veth.+",job="node-exporter"}[1m]))
+ sum(irate(node_network_transmit_drop_total{device!~"veth.+",job="node-exporter"}[1m]))
| ok | | 26.393s ago | 1.638ms |
| record: node:node_net_saturation:sum_irate
expr: sum
by(node) ((irate(node_network_receive_drop_total{device!~"veth.+",job="node-exporter"}[1m])
+ irate(node_network_transmit_drop_total{device!~"veth.+",job="node-exporter"}[1m]))
* on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:)
| ok | | 26.391s ago | 8.367ms |
| record: node:node_inodes_total:
expr: max
by(node) (max by(node, host_ip) (kube_pod_info{host_ip!="",job="kube-state-metrics"})
* on(host_ip) group_right(node) label_replace((max by(instance) (node_filesystem_files{job="node-exporter",mountpoint="/"})),
"host_ip", "$1", "instance", "(.*):.*"))
| ok | | 26.383s ago | 5.084ms |
| record: node:node_inodes_free:
expr: max
by(node) (max by(node, host_ip) (kube_pod_info{host_ip!="",job="kube-state-metrics"})
* on(host_ip) group_right(node) label_replace((max by(instance) (node_filesystem_files_free{job="node-exporter",mountpoint="/"})),
"host_ip", "$1", "instance", "(.*):.*"))
| ok | | 26.378s ago | 4.383ms |
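
The recording rules above follow the level:metric:operations naming convention and are evaluated as an ordinary rule group. Below is a minimal sketch of how two of them would appear in a rule file; the file and group names are assumptions, the expressions are copied from the table:

```yaml
# node.rules.yaml (assumed file and group name)
groups:
  - name: node.rules
    rules:
      # Cluster-wide CPU utilisation over the last minute.
      - record: :node_cpu_utilisation:avg1m
        expr: 1 - avg(rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m]))
      # Per-node CPU count, joined to the node name via node_namespace_pod:kube_pod_info:.
      - record: node:node_num_cpu:sum
        expr: |
          count by(node) (sum by(node, cpu) (node_cpu_seconds_total{job="node-exporter"}
            * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:))
```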
| 29.697s ago | 634.7us |
| Rule | State | Error | Last Evaluation | Evaluation Time |
| alert: PrometheusBadConfig
expr: max_over_time(prometheus_config_last_reload_successful{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])
== 0
for: 10m
labels:
severity: critical
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to reload
its configuration.
summary: Failed Prometheus configuration reload.
| ok | | 26.206s ago | 514.9us |
| alert: PrometheusNotificationQueueRunningFull
expr: (predict_linear(prometheus_notifications_queue_length{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m],
60 * 30) > min_over_time(prometheus_notifications_queue_capacity{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]))
for: 15m
labels:
severity: warning
annotations:
description: Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}}
is running full.
summary: Prometheus alert notification queue predicted to run full in less than
30m.
| ok | | 26.206s ago | 516.6us |
| alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
expr: (rate(prometheus_notifications_errors_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])
/ rate(prometheus_notifications_sent_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]))
* 100 > 1
for: 15m
labels:
severity: warning
annotations:
description: '{{ printf "%.1f" $value }}% errors while sending alerts from
Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}.'
summary: Prometheus has encountered more than 1% errors sending alerts to a specific
Alertmanager.
| ok | | 26.205s ago | 400.8us |
| alert: PrometheusErrorSendingAlertsToAnyAlertmanager
expr: min
without(alertmanager) (rate(prometheus_notifications_errors_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])
/ rate(prometheus_notifications_sent_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]))
* 100 > 3
for: 15m
labels:
severity: critical
annotations:
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts
from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
| ok | | 26.205s ago | 306.5us |
| alert: PrometheusNotConnectedToAlertmanagers
expr: max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])
< 1
for: 10m
labels:
severity: warning
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to
any Alertmanagers.
summary: Prometheus is not connected to any Alertmanagers.
| ok | | 26.205s ago | 142.2us |
| alert: PrometheusTSDBReloadsFailing
expr: increase(prometheus_tsdb_reloads_failures_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[3h])
> 0
for: 4h
labels:
severity: warning
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value
| humanize}} reload failures over the last 3h.
summary: Prometheus has issues reloading blocks from disk.
| ok | | 26.205s ago | 297.3us |
| alert: PrometheusTSDBCompactionsFailing
expr: increase(prometheus_tsdb_compactions_failed_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[3h])
> 0
for: 4h
labels:
severity: warning
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value
| humanize}} compaction failures over the last 3h.
summary: Prometheus has issues compacting blocks.
| ok | | 26.205s ago | 251.4us |
| alert: PrometheusTSDBWALCorruptions
expr: increase(tsdb_wal_corruptions_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[3h])
> 0
for: 4h
labels:
severity: warning
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value
| humanize}} corruptions of the write-ahead log (WAL) over the last 3h.
summary: Prometheus is detecting WAL corruptions.
| ok | | 26.205s ago | 127.1us |
| alert: PrometheusNotIngestingSamples
expr: rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])
<= 0
for: 10m
labels:
severity: warning
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting samples.
summary: Prometheus is not ingesting samples.
| ok | | 26.205s ago | 151.9us |
| alert: PrometheusDuplicateTimestamps
expr: rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])
> 0
for: 10m
labels:
severity: warning
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{$value
| humanize}} samples/s with different values but duplicated timestamp.
summary: Prometheus is dropping samples with duplicate timestamps.
| ok | | 26.205s ago | 136.3us |
| alert: PrometheusOutOfOrderTimestamps
expr: rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])
> 0
for: 10m
labels:
severity: warning
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{$value
| humanize}} samples/s with timestamps arriving out of order.
summary: Prometheus drops samples with out-of-order timestamps.
| ok | | 26.205s ago | 145us |
| alert: PrometheusRemoteStorageFailures
expr: (rate(prometheus_remote_storage_failed_samples_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])
/ (rate(prometheus_remote_storage_failed_samples_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])
+ rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])))
* 100 > 1
for: 15m
labels:
severity: critical
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send {{
printf "%.1f" $value }}% of the samples to queue {{$labels.queue}}.
summary: Prometheus fails to send samples to remote storage.
| ok | | 26.205s ago | 324.1us |
| alert: PrometheusRemoteWriteBehind
expr: (max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])
- on(job, instance) group_right() max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]))
> 120
for: 15m
labels:
severity: critical
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write is {{
printf "%.1f" $value }}s behind for queue {{$labels.queue}}.
summary: Prometheus remote write is behind.
| ok | | 26.205s ago | 240.3us |
| alert: PrometheusRuleFailures
expr: increase(prometheus_rule_evaluation_failures_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])
> 0
for: 15m
labels:
severity: critical
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to evaluate
{{ printf "%.0f" $value }} rules in the last 5m.
summary: Prometheus is failing rule evaluations.
| ok | | 26.205s ago | 138.4us |
| alert: PrometheusMissingRuleEvaluations
expr: increase(prometheus_rule_group_iterations_missed_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])
> 0
for: 15m
labels:
severity: warning
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed {{ printf
"%.0f" $value }} rule group evaluations in the last 5m.
summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
| ok | | 26.205s ago | 143.8us |
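
The Prometheus self-monitoring alerts above can be unit-tested in the same way. Here is a hypothetical `promtool` test for PrometheusNotIngestingSamples, feeding a head-appended-samples counter that never moves; the pod name and rule file name are assumptions.

```yaml
# prometheus_not_ingesting_test.yaml (hypothetical)
rule_files:
  - prometheus.rules.yaml        # assumed file holding the rules above
evaluation_interval: 1m
tests:
  - interval: 1m
    input_series:
      # A flat counter, i.e. no samples are being appended.
      - series: 'prometheus_tsdb_head_samples_appended_total{job="prometheus-prometheus-oper-prometheus", namespace="monitoring", pod="prometheus-0"}'
        values: '1000+0x30'
    alert_rule_test:
      # rate() over a flat counter is 0, so after "for: 10m" the alert fires.
      - eval_time: 15m
        alertname: PrometheusNotIngestingSamples
        exp_alerts:
          - exp_labels:
              severity: warning
              job: prometheus-prometheus-oper-prometheus
              namespace: monitoring
              pod: prometheus-0
            exp_annotations:
              description: Prometheus monitoring/prometheus-0 is not ingesting samples.
              summary: Prometheus is not ingesting samples.
```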