| Rule |
State |
Error |
Last Evaluation |
Evaluation Time |
| alert: etcdInsufficientMembers
expr: sum
by(job) (up{job=~".*etcd.*"} == bool 1) < ((count by(job) (up{job=~".*etcd.*"})
+ 1) / 2)
for: 3m
labels:
severity: critical
annotations:
message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value
}}).'
|
ok
|
|
2.897s ago
|
1.973ms |
| alert: etcdNoLeader
expr: etcd_server_has_leader{job=~".*etcd.*"}
== 0
for: 1m
labels:
severity: critical
annotations:
message: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }}
has no leader.'
|
ok
|
|
2.896s ago
|
329.9us |
| alert: etcdHighNumberOfLeaderChanges
expr: rate(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}[15m])
> 3
for: 15m
labels:
severity: warning
annotations:
message: 'etcd cluster "{{ $labels.job }}": instance {{ $labels.instance
}} has seen {{ $value }} leader changes within the last 15 minutes.'
|
ok
|
|
2.895s ago
|
386us |
| alert: etcdHighNumberOfFailedGRPCRequests
expr: 100
* sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{grpc_code!="OK",job=~".*etcd.*"}[5m]))
/ sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{job=~".*etcd.*"}[5m]))
> 1
for: 10m
labels:
severity: warning
annotations:
message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for
{{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
|
ok
|
|
2.895s ago
|
777.1us |
| alert: etcdHighNumberOfFailedGRPCRequests
expr: 100
* sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{grpc_code!="OK",job=~".*etcd.*"}[5m]))
/ sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{job=~".*etcd.*"}[5m]))
> 5
for: 5m
labels:
severity: critical
annotations:
message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for
{{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
|
ok
|
|
2.895s ago
|
748.4us |
| alert: etcdGRPCRequestsSlow
expr: histogram_quantile(0.99,
sum by(job, instance, grpc_service, grpc_method, le) (rate(grpc_server_handling_seconds_bucket{grpc_type="unary",job=~".*etcd.*"}[5m])))
> 0.15
for: 10m
labels:
severity: critical
annotations:
message: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method
}} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
ok
|
|
2.894s ago
|
261.2us |
| alert: etcdMemberCommunicationSlow
expr: histogram_quantile(0.99,
rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.15
for: 10m
labels:
severity: warning
annotations:
message: 'etcd cluster "{{ $labels.job }}": member communication with {{
$labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
ok
|
|
2.894s ago
|
194.7us |
| alert: etcdHighNumberOfFailedProposals
expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m])
> 5
for: 15m
labels:
severity: warning
annotations:
message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures
within the last 15 minutes on etcd instance {{ $labels.instance }}.'
|
ok
|
|
2.894s ago
|
168.5us |
| alert: etcdHighFsyncDurations
expr: histogram_quantile(0.99,
rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.5
for: 10m
labels:
severity: warning
annotations:
message: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations
are {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
ok
|
|
2.894s ago
|
212.5us |
| alert: etcdHighCommitDurations
expr: histogram_quantile(0.99,
rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.25
for: 10m
labels:
severity: warning
annotations:
message: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations
are {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
ok
|
|
2.894s ago
|
195.1us |
| alert: etcdHighNumberOfFailedHTTPRequests
expr: sum
by(method) (rate(etcd_http_failed_total{code!="404",job=~".*etcd.*"}[5m]))
/ sum by(method) (rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) >
0.01
for: 10m
labels:
severity: warning
annotations:
message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance
{{ $labels.instance }}.'
|
ok
|
|
2.894s ago
|
350.1us |
| alert: etcdHighNumberOfFailedHTTPRequests
expr: sum
by(method) (rate(etcd_http_failed_total{code!="404",job=~".*etcd.*"}[5m]))
/ sum by(method) (rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) >
0.05
for: 10m
labels:
severity: critical
annotations:
message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance
{{ $labels.instance }}.'
|
ok
|
|
2.894s ago
|
366.8us |
| alert: etcdHTTPRequestsSlow
expr: histogram_quantile(0.99,
rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
for: 10m
labels:
severity: warning
annotations:
message: etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method
}} are slow.
|
ok
|
|
2.893s ago
|
129.1us |
|
18.579s ago |
4.489ms |
| Rule |
State |
Error |
Last Evaluation |
Evaluation Time |
| alert: AlertmanagerDown
expr: absent(up{job="prometheus-prometheus-oper-alertmanager",namespace="monitoring"}
== 1)
for: 15m
labels:
severity: critical
annotations:
message: Alertmanager has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerdown
|
ok
|
|
6.834s ago
|
402.4us |
| alert: KubeAPIDown
expr: absent(up{job="apiserver"}
== 1)
for: 15m
labels:
severity: critical
annotations:
message: KubeAPI has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown
|
ok
|
|
6.834s ago
|
266.5us |
| alert: KubeControllerManagerDown
expr: absent(up{job="kube-controller-manager"}
== 1)
for: 15m
labels:
severity: critical
annotations:
message: KubeControllerManager has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontrollermanagerdown
|
ok
|
|
6.834s ago
|
960.1us |
| alert: KubeSchedulerDown
expr: absent(up{job="kube-scheduler"}
== 1)
for: 15m
labels:
severity: critical
annotations:
message: KubeScheduler has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeschedulerdown
|
ok
|
|
6.833s ago
|
952.5us |
| alert: KubeStateMetricsDown
expr: absent(up{job="kube-state-metrics"}
== 1)
for: 15m
labels:
severity: critical
annotations:
message: KubeStateMetrics has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricsdown
|
ok
|
|
6.832s ago
|
267.1us |
| alert: KubeletDown
expr: absent(up{job="kubelet"}
== 1)
for: 15m
labels:
severity: critical
annotations:
message: Kubelet has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown
|
ok
|
|
6.832s ago
|
291.1us |
| alert: NodeExporterDown
expr: absent(up{job="node-exporter"}
== 1)
for: 15m
labels:
severity: critical
annotations:
message: NodeExporter has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeexporterdown
|
ok
|
|
6.832s ago
|
192.5us |
| alert: PrometheusDown
expr: absent(up{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}
== 1)
for: 15m
labels:
severity: critical
annotations:
message: Prometheus has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusdown
|
ok
|
|
6.832s ago
|
165us |
| alert: PrometheusOperatorDown
expr: absent(up{job="prometheus-prometheus-oper-operator",namespace="monitoring"}
== 1)
for: 15m
labels:
severity: critical
annotations:
message: PrometheusOperator has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatordown
|
ok
|
|
6.832s ago
|
182.1us |
|
1.468s ago |
41.08ms |
| Rule |
State |
Error |
Last Evaluation |
Evaluation Time |
| alert: KubePodCrashLooping
expr: rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m])
* 60 * 5 > 0
for: 1h
labels:
severity: critical
annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }})
is restarting {{ printf "%.2f" $value }} times / 5 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
|
ok
|
|
1.468s ago
|
10.84ms |
| alert: KubePodNotReady
expr: sum
by(namespace, pod) (kube_pod_status_phase{job="kube-state-metrics",phase=~"Failed|Pending|Unknown"})
> 0
for: 1h
labels:
severity: critical
annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state
for longer than an hour.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
|
ok
|
|
1.458s ago
|
18.38ms |
| alert: KubeDeploymentGenerationMismatch
expr: kube_deployment_status_observed_generation{job="kube-state-metrics"}
!= kube_deployment_metadata_generation{job="kube-state-metrics"}
for: 15m
labels:
severity: critical
annotations:
message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
}} does not match, this indicates that the Deployment has failed but has not been
rolled back.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch
|
ok
|
|
1.439s ago
|
4.395ms |
| alert: KubeDeploymentReplicasMismatch
expr: kube_deployment_spec_replicas{job="kube-state-metrics"}
!= kube_deployment_status_replicas_available{job="kube-state-metrics"}
for: 1h
labels:
severity: critical
annotations:
message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched
the expected number of replicas for longer than an hour.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch
|
ok
|
|
1.435s ago
|
4.42ms |
| alert: KubeStatefulSetReplicasMismatch
expr: kube_statefulset_status_replicas_ready{job="kube-state-metrics"}
!= kube_statefulset_status_replicas{job="kube-state-metrics"}
for: 15m
labels:
severity: critical
annotations:
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched
the expected number of replicas for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch
|
ok
|
|
1.431s ago
|
398.7us |
| alert: KubeStatefulSetGenerationMismatch
expr: kube_statefulset_status_observed_generation{job="kube-state-metrics"}
!= kube_statefulset_metadata_generation{job="kube-state-metrics"}
for: 15m
labels:
severity: critical
annotations:
message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
}} does not match, this indicates that the StatefulSet has failed but has not
been rolled back.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch
|
ok
|
|
1.431s ago
|
393.3us |
| alert: KubeStatefulSetUpdateNotRolledOut
expr: max
without(revision) (kube_statefulset_status_current_revision{job="kube-state-metrics"}
unless kube_statefulset_status_update_revision{job="kube-state-metrics"})
* (kube_statefulset_replicas{job="kube-state-metrics"} != kube_statefulset_status_replicas_updated{job="kube-state-metrics"})
for: 15m
labels:
severity: critical
annotations:
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has
not been rolled out.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout
|
ok
|
|
1.43s ago
|
787.7us |
| alert: KubeDaemonSetRolloutStuck
expr: kube_daemonset_status_number_ready{job="kube-state-metrics"}
/ kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
* 100 < 100
for: 15m
labels:
severity: critical
annotations:
message: Only {{ $value }}% of the desired Pods of DaemonSet {{ $labels.namespace
}}/{{ $labels.daemonset }} are scheduled and ready.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
|
ok
|
|
1.43s ago
|
272.8us |
| alert: KubeDaemonSetNotScheduled
expr: kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
- kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"}
> 0
for: 10m
labels:
severity: warning
annotations:
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
}} are not scheduled.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled
|
ok
|
|
1.43s ago
|
199.4us |
| alert: KubeDaemonSetMisScheduled
expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics"}
> 0
for: 10m
labels:
severity: warning
annotations:
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
}} are running where they are not supposed to run.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled
|
ok
|
|
1.43s ago
|
106us |
| alert: KubeCronJobRunning
expr: time()
- kube_cronjob_next_schedule_time{job="kube-state-metrics"} > 3600
for: 1h
labels:
severity: warning
annotations:
message: CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than
1h to complete.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecronjobrunning
|
ok
|
|
1.43s ago
|
107.2us |
| alert: KubeJobCompletion
expr: kube_job_spec_completions{job="kube-state-metrics"}
- kube_job_status_succeeded{job="kube-state-metrics"} > 0
for: 1h
labels:
severity: warning
annotations:
message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than
one hour to complete.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion
|
ok
|
|
1.43s ago
|
595.3us |
| alert: KubeJobFailed
expr: kube_job_status_failed{job="kube-state-metrics"}
> 0
for: 1h
labels:
severity: warning
annotations:
message: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed
|
ok
|
|
1.429s ago
|
142.3us |
|
16.777s ago |
9.41ms |
| Rule |
State |
Error |
Last Evaluation |
Evaluation Time |
| alert: KubeCPUOvercommit
expr: sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum)
/ sum(kube_node_status_allocatable_cpu_cores) > (count(kube_node_status_allocatable_cpu_cores)
- 1) / count(kube_node_status_allocatable_cpu_cores)
for: 5m
labels:
severity: warning
annotations:
message: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate
node failure.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
|
ok
|
|
16.777s ago
|
1.103ms |
| alert: KubeMemOvercommit
expr: sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum)
/ sum(kube_node_status_allocatable_memory_bytes) > (count(kube_node_status_allocatable_memory_bytes)
- 1) / count(kube_node_status_allocatable_memory_bytes)
for: 5m
labels:
severity: warning
annotations:
message: Cluster has overcommitted memory resource requests for Pods and cannot
tolerate node failure.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit
|
ok
|
|
16.776s ago
|
1.685ms |
| alert: KubeCPUOvercommit
expr: sum(kube_resourcequota{job="kube-state-metrics",resource="cpu",type="hard"})
/ sum(kube_node_status_allocatable_cpu_cores) > 1.5
for: 5m
labels:
severity: warning
annotations:
message: Cluster has overcommitted CPU resource requests for Namespaces.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
|
ok
|
|
16.775s ago
|
361.1us |
| alert: KubeMemOvercommit
expr: sum(kube_resourcequota{job="kube-state-metrics",resource="memory",type="hard"})
/ sum(kube_node_status_allocatable_memory_bytes{job="node-exporter"}) >
1.5
for: 5m
labels:
severity: warning
annotations:
message: Cluster has overcommitted memory resource requests for Namespaces.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit
|
ok
|
|
16.775s ago
|
302.5us |
| alert: KubeQuotaExceeded
expr: 100
* kube_resourcequota{job="kube-state-metrics",type="used"} / ignoring(instance,
job, type) (kube_resourcequota{job="kube-state-metrics",type="hard"}
> 0) > 90
for: 15m
labels:
severity: warning
annotations:
message: Namespace {{ $labels.namespace }} is using {{ printf "%0.0f" $value
}}% of its {{ $labels.resource }} quota.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded
|
ok
|
|
16.774s ago
|
367us |
| alert: CPUThrottlingHigh
expr: 100
* sum by(container, pod, namespace) (increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m]))
/ sum by(container, pod, namespace) (increase(container_cpu_cfs_periods_total[5m]))
> 25
for: 15m
labels:
severity: warning
annotations:
message: '{{ printf "%0.0f" $value }}% throttling of CPU in namespace {{
$labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod
}}.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
|
ok
|
|
16.774s ago
|
5.558ms |
|
6.612s ago |
16.12ms |
| Rule |
State |
Error |
Last Evaluation |
Evaluation Time |
| alert: KubeNodeNotReady
expr: kube_node_status_condition{condition="Ready",job="kube-state-metrics",status="true"}
== 0
for: 1h
labels:
severity: warning
annotations:
message: '{{ $labels.node }} has been unready for more than an hour.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready
|
ok
|
|
25.836s ago
|
440.7us |
| alert: KubeVersionMismatch
expr: count(count
by(gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},
"gitVersion", "$1", "gitVersion", "(v[0-9]*.[0-9]*.[0-9]*).*")))
> 1
for: 1h
labels:
severity: warning
annotations:
message: There are {{ $value }} different semantic versions of Kubernetes components
running.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch
|
ok
|
|
25.835s ago
|
798us |
| alert: KubeClientErrors
expr: (sum
by(instance, job) (rate(rest_client_requests_total{code=~"5.."}[5m])) /
sum by(instance, job) (rate(rest_client_requests_total[5m]))) * 100 > 1
for: 15m
labels:
severity: warning
annotations:
message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
}}' is experiencing {{ printf "%0.0f" $value }}% errors.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
|
ok
|
|
25.835s ago
|
2.556ms |
| alert: KubeClientErrors
expr: sum
by(instance, job) (rate(ksm_scrape_error_total{job="kube-state-metrics"}[5m]))
> 0.1
for: 15m
labels:
severity: warning
annotations:
message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
}}' is experiencing {{ printf "%0.0f" $value }} errors / second.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
|
ok
|
|
25.832s ago
|
214.1us |
| alert: KubeletTooManyPods
expr: kubelet_running_pod_count{job="kubelet"}
> 110 * 0.9
for: 15m
labels:
severity: warning
annotations:
message: Kubelet {{ $labels.instance }} is running {{ $value }} Pods, close to the
limit of 110.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
|
ok
|
|
25.832s ago
|
177.8us |
| alert: KubeAPILatencyHigh
expr: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"}
> 1
for: 10m
labels:
severity: warning
annotations:
message: The API server has a 99th percentile latency of {{ $value }} seconds for
{{ $labels.verb }} {{ $labels.resource }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
|
ok
|
|
25.832s ago
|
3.408ms |
| alert: KubeAPILatencyHigh
expr: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"}
> 4
for: 10m
labels:
severity: critical
annotations:
message: The API server has a 99th percentile latency of {{ $value }} seconds for
{{ $labels.verb }} {{ $labels.resource }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
|
ok
|
|
25.829s ago
|
2.982ms |
| alert: KubeAPIErrorsHigh
expr: sum(rate(apiserver_request_total{code=~"^(?:5..)$",job="apiserver"}[5m]))
/ sum(rate(apiserver_request_total{job="apiserver"}[5m])) * 100 > 3
for: 10m
labels:
severity: critical
annotations:
message: API server is returning errors for {{ $value }}% of requests.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
|
ok
|
|
25.826s ago
|
15.39ms |
| alert: KubeAPIErrorsHigh
expr: sum(rate(apiserver_request_total{code=~"^(?:5..)$",job="apiserver"}[5m]))
/ sum(rate(apiserver_request_total{job="apiserver"}[5m])) * 100 > 1
for: 10m
labels:
severity: warning
annotations:
message: API server is returning errors for {{ $value }}% of requests.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
|
ok
|
|
25.811s ago
|
13.55ms |
| alert: KubeAPIErrorsHigh
expr: sum
by(resource, subresource, verb) (rate(apiserver_request_total{code=~"^(?:5..)$",job="apiserver"}[5m]))
/ sum by(resource, subresource, verb) (rate(apiserver_request_total{job="apiserver"}[5m]))
* 100 > 10
for: 10m
labels:
severity: critical
annotations:
message: API server is returning errors for {{ $value }}% of requests for {{ $labels.verb
}} {{ $labels.resource }} {{ $labels.subresource }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
|
ok
|
|
25.798s ago
|
13.83ms |
| alert: KubeAPIErrorsHigh
expr: sum
by(resource, subresource, verb) (rate(apiserver_request_total{code=~"^(?:5..)$",job="apiserver"}[5m]))
/ sum by(resource, subresource, verb) (rate(apiserver_request_total{job="apiserver"}[5m]))
* 100 > 5
for: 10m
labels:
severity: warning
annotations:
message: API server is returning errors for {{ $value }}% of requests for {{ $labels.verb
}} {{ $labels.resource }} {{ $labels.subresource }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
|
ok
|
|
25.784s ago
|
13.77ms |
| alert: KubeClientCertificateExpiration
expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"}
> 0 and histogram_quantile(0.01, sum by(job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m])))
< 604800
labels:
severity: warning
annotations:
message: A client certificate used to authenticate to the apiserver is expiring
in less than 7.0 days.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
|
ok
|
|
25.771s ago
|
832.1us |
| alert: KubeClientCertificateExpiration
expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"}
> 0 and histogram_quantile(0.01, sum by(job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m])))
< 86400
labels:
severity: critical
annotations:
message: A client certificate used to authenticate to the apiserver is expiring
in less than 24.0 hours.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
|
ok
|
|
25.77s ago
|
692.7us |
|
3.718s ago |
6.569ms |
| Rule |
State |
Error |
Last Evaluation |
Evaluation Time |
| record: :kube_pod_info_node_count:
expr: sum(min
by(node) (kube_pod_info))
|
ok
|
|
23.163s ago
|
5.296ms |
| record: node_namespace_pod:kube_pod_info:
expr: max
by(node, namespace, pod) (label_replace(kube_pod_info{job="kube-state-metrics"},
"pod", "$1", "pod", "(.*)"))
|
ok
|
|
23.158s ago
|
10.42ms |
| record: node:node_num_cpu:sum
expr: count
by(node) (sum by(node, cpu) (node_cpu_seconds_total{job="node-exporter"}
* on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:))
|
ok
|
|
23.148s ago
|
18.87ms |
| record: :node_cpu_utilisation:avg1m
expr: 1
- avg(rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m]))
|
ok
|
|
23.129s ago
|
2.05ms |
| record: node:node_cpu_utilisation:avg1m
expr: 1
- avg by(node) (rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m])
* on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:)
|
ok
|
|
23.127s ago
|
4.742ms |
| record: node:cluster_cpu_utilisation:ratio
expr: node:node_cpu_utilisation:avg1m
* node:node_num_cpu:sum / scalar(sum(node:node_num_cpu:sum))
|
ok
|
|
23.122s ago
|
256.2us |
| record: :node_cpu_saturation_load1:
expr: sum(node_load1{job="node-exporter"})
/ sum(node:node_num_cpu:sum)
|
ok
|
|
23.122s ago
|
174.7us |
| record: node:node_cpu_saturation_load1:
expr: sum
by(node) (node_load1{job="node-exporter"} * on(namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:) / node:node_num_cpu:sum
|
ok
|
|
23.122s ago
|
2.646ms |
| record: :node_memory_utilisation:
expr: 1
- sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"}
+ node_memory_Buffers_bytes{job="node-exporter"}) / sum(node_memory_MemTotal_bytes{job="node-exporter"})
|
ok
|
|
23.119s ago
|
373us |
| record: :node_memory_MemFreeCachedBuffers_bytes:sum
expr: sum(node_memory_MemFree_bytes{job="node-exporter"}
+ node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
|
ok
|
|
23.119s ago
|
246.9us |
| record: :node_memory_MemTotal_bytes:sum
expr: sum(node_memory_MemTotal_bytes{job="node-exporter"})
|
ok
|
|
23.119s ago
|
89.31us |
| record: node:node_memory_bytes_available:sum
expr: sum
by(node) ((node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"}
+ node_memory_Buffers_bytes{job="node-exporter"}) * on(namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:)
|
ok
|
|
23.119s ago
|
2.723ms |
| record: node:node_memory_bytes_total:sum
expr: sum
by(node) (node_memory_MemTotal_bytes{job="node-exporter"} * on(namespace,
pod) group_left(node) node_namespace_pod:kube_pod_info:)
|
ok
|
|
23.116s ago
|
2.508ms |
| record: node:node_memory_utilisation:ratio
expr: (node:node_memory_bytes_total:sum
- node:node_memory_bytes_available:sum) / node:node_memory_bytes_total:sum
|
ok
|
|
23.114s ago
|
225us |
| record: node:cluster_memory_utilisation:ratio
expr: (node:node_memory_bytes_total:sum
- node:node_memory_bytes_available:sum) / scalar(sum(node:node_memory_bytes_total:sum))
|
ok
|
|
23.114s ago
|
204.1us |
| record: :node_memory_swap_io_bytes:sum_rate
expr: 1000
* sum((rate(node_vmstat_pgpgin{job="node-exporter"}[1m]) + rate(node_vmstat_pgpgout{job="node-exporter"}[1m])))
|
ok
|
|
23.114s ago
|
259.9us |
| record: node:node_memory_utilisation:
expr: 1
- sum by(node) ((node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"}
+ node_memory_Buffers_bytes{job="node-exporter"}) * on(namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:) / sum by(node) (node_memory_MemTotal_bytes{job="node-exporter"}
* on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:)
|
ok
|
|
23.114s ago
|
5.147ms |
| record: node:node_memory_utilisation_2:
expr: 1
- (node:node_memory_bytes_available:sum / node:node_memory_bytes_total:sum)
|
ok
|
|
23.109s ago
|
188.3us |
| record: node:node_memory_swap_io_bytes:sum_rate
expr: 1000
* sum by(node) ((rate(node_vmstat_pgpgin{job="node-exporter"}[1m]) + rate(node_vmstat_pgpgout{job="node-exporter"}[1m]))
* on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:)
|
ok
|
|
23.108s ago
|
2.646ms |
| record: :node_disk_utilisation:avg_irate
expr: avg(irate(node_disk_io_time_seconds_total{device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+",job="node-exporter"}[1m]))
|
ok
|
|
23.106s ago
|
1.323ms |
| record: node:node_disk_utilisation:avg_irate
expr: avg
by(node) (irate(node_disk_io_time_seconds_total{device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+",job="node-exporter"}[1m])
* on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:)
|
ok
|
|
23.105s ago
|
6.605ms |
| record: :node_disk_saturation:avg_irate
expr: avg(irate(node_disk_io_time_weighted_seconds_total{device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+",job="node-exporter"}[1m]))
|
ok
|
|
23.098s ago
|
3.949ms |
| record: node:node_disk_saturation:avg_irate
expr: avg
by(node) (irate(node_disk_io_time_weighted_seconds_total{device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+",job="node-exporter"}[1m])
* on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:)
|
ok
|
|
23.094s ago
|
10.49ms |
| record: node:node_filesystem_usage:
expr: max
by(instance, namespace, pod, device) ((node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}
- node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
|
ok
|
|
23.084s ago
|
1.211ms |
| record: node:node_filesystem_avail:
expr: max
by(instance, namespace, pod, device) (node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}
/ node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
|
ok
|
|
23.083s ago
|
794.6us |
| record: :node_net_utilisation:sum_irate
expr: sum(irate(node_network_receive_bytes_total{device!~"veth.+",job="node-exporter"}[1m]))
+ sum(irate(node_network_transmit_bytes_total{device!~"veth.+",job="node-exporter"}[1m]))
|
ok
|
|
23.082s ago
|
5.748ms |
| record: node:node_net_utilisation:sum_irate
expr: sum
by(node) ((irate(node_network_receive_bytes_total{device!~"veth.+",job="node-exporter"}[1m])
+ irate(node_network_transmit_bytes_total{device!~"veth.+",job="node-exporter"}[1m]))
* on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:)
|
ok
|
|
23.077s ago
|
12.02ms |
| record: :node_net_saturation:sum_irate
expr: sum(irate(node_network_receive_drop_total{device!~"veth.+",job="node-exporter"}[1m]))
+ sum(irate(node_network_transmit_drop_total{device!~"veth.+",job="node-exporter"}[1m]))
|
ok
|
|
23.065s ago
|
4.66ms |
| record: node:node_net_saturation:sum_irate
expr: sum
by(node) ((irate(node_network_receive_drop_total{device!~"veth.+",job="node-exporter"}[1m])
+ irate(node_network_transmit_drop_total{device!~"veth.+",job="node-exporter"}[1m]))
* on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:)
|
ok
|
|
23.06s ago
|
11.07ms |
| record: node:node_inodes_total:
expr: max
by(node) (max by(node, host_ip) (kube_pod_info{host_ip!="",job="kube-state-metrics"})
* on(host_ip) group_right(node) label_replace((max by(instance) (node_filesystem_files{job="node-exporter",mountpoint="/"})),
"host_ip", "$1", "instance", "(.*):.*"))
|
ok
|
|
23.049s ago
|
7.361ms |
| record: node:node_inodes_free:
expr: max
by(node) (max by(node, host_ip) (kube_pod_info{host_ip!="",job="kube-state-metrics"})
* on(host_ip) group_right(node) label_replace((max by(instance) (node_filesystem_files_free{job="node-exporter",mountpoint="/"})),
"host_ip", "$1", "instance", "(.*):.*"))
|
ok
|
|
23.042s ago
|
6.207ms |
|
26.392s ago |
813.3us |
| Rule |
State |
Error |
Last Evaluation |
Evaluation Time |
| alert: PrometheusBadConfig
expr: max_over_time(prometheus_config_last_reload_successful{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])
== 0
for: 10m
labels:
severity: critical
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to reload
its configuration.
summary: Failed Prometheus configuration reload.
|
ok
|
|
22.902s ago
|
477.2us |
| alert: PrometheusNotificationQueueRunningFull
expr: (predict_linear(prometheus_notifications_queue_length{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m],
60 * 30) > min_over_time(prometheus_notifications_queue_capacity{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]))
for: 15m
labels:
severity: warning
annotations:
description: Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}}
is running full.
summary: Prometheus alert notification queue predicted to run full in less than
30m.
|
ok
|
|
22.901s ago
|
762.8us |
| alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
expr: (rate(prometheus_notifications_errors_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])
/ rate(prometheus_notifications_sent_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]))
* 100 > 1
for: 15m
labels:
severity: warning
annotations:
description: '{{ printf "%.1f" $value }}% errors while sending alerts from
Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}.'
summary: Prometheus has encountered more than 1% errors sending alerts to a specific
Alertmanager.
|
ok
|
|
22.901s ago
|
867.4us |
| alert: PrometheusErrorSendingAlertsToAnyAlertmanager
expr: min
without(alertmanager) (rate(prometheus_notifications_errors_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])
/ rate(prometheus_notifications_sent_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]))
* 100 > 3
for: 15m
labels:
severity: critical
annotations:
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts
from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
|
ok
|
|
22.9s ago
|
935.8us |
| alert: PrometheusNotConnectedToAlertmanagers
expr: max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])
< 1
for: 10m
labels:
severity: warning
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to
any Alertmanagers.
summary: Prometheus is not connected to any Alertmanagers.
|
ok
|
|
22.899s ago
|
404.2us |
| alert: PrometheusTSDBReloadsFailing
expr: increase(prometheus_tsdb_reloads_failures_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[3h])
> 0
for: 4h
labels:
severity: warning
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value
| humanize}} reload failures over the last 3h.
summary: Prometheus has issues reloading blocks from disk.
|
ok
|
|
22.899s ago
|
689.5us |
| alert: PrometheusTSDBCompactionsFailing
expr: increase(prometheus_tsdb_compactions_failed_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[3h])
> 0
for: 4h
labels:
severity: warning
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value
| humanize}} compaction failures over the last 3h.
summary: Prometheus has issues compacting blocks.
|
ok
|
|
22.899s ago
|
603.8us |
| alert: PrometheusTSDBWALCorruptions
expr: increase(tsdb_wal_corruptions_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[3h])
> 0
for: 4h
labels:
severity: warning
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value
| humanize}} corruptions of the write-ahead log (WAL) over the last 3h.
summary: Prometheus is detecting WAL corruptions.
|
ok
|
|
22.898s ago
|
335.2us |
| alert: PrometheusNotIngestingSamples
expr: rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])
<= 0
for: 10m
labels:
severity: warning
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting samples.
summary: Prometheus is not ingesting samples.
|
ok
|
|
22.898s ago
|
391.2us |
| alert: PrometheusDuplicateTimestamps
expr: rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])
> 0
for: 10m
labels:
severity: warning
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{$value
| humanize}} samples/s with different values but duplicated timestamp.
summary: Prometheus is dropping samples with duplicate timestamps.
|
ok
|
|
22.898s ago
|
346us |
| alert: PrometheusOutOfOrderTimestamps
expr: rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])
> 0
for: 10m
labels:
severity: warning
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{$value
| humanize}} samples/s with timestamps arriving out of order.
summary: Prometheus drops samples with out-of-order timestamps.
|
ok
|
|
22.898s ago
|
373.5us |
| alert: PrometheusRemoteStorageFailures
expr: (rate(prometheus_remote_storage_failed_samples_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])
/ (rate(prometheus_remote_storage_failed_samples_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])
+ rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])))
* 100 > 1
for: 15m
labels:
severity: critical
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send {{
printf "%.1f" $value }}% of the samples to queue {{$labels.queue}}.
summary: Prometheus fails to send samples to remote storage.
|
ok
|
|
22.897s ago
|
818.2us |
| alert: PrometheusRemoteWriteBehind
expr: (max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])
- on(job, instance) group_right() max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]))
> 120
for: 15m
labels:
severity: critical
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write is {{
printf "%.1f" $value }}s behind for queue {{$labels.queue}}.
summary: Prometheus remote write is behind.
|
ok
|
|
22.897s ago
|
667.6us |
| alert: PrometheusRuleFailures
expr: increase(prometheus_rule_evaluation_failures_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])
> 0
for: 15m
labels:
severity: critical
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to evaluate
{{ printf "%.0f" $value }} rules in the last 5m.
summary: Prometheus is failing rule evaluations.
|
ok
|
|
22.896s ago
|
397us |
| alert: PrometheusMissingRuleEvaluations
expr: increase(prometheus_rule_group_iterations_missed_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])
> 0
for: 15m
labels:
severity: warning
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed {{ printf
"%.0f" $value }} rule group evaluations in the last 5m.
summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
|
ok
|
|
22.896s ago
|
1.278ms |