Rules

alertmanager.rules (last evaluation: 20.626s ago, evaluation time: 1.738ms)

Rule State Error Last Evaluation Evaluation Time
alert: AlertmanagerConfigInconsistent expr: count_values by(service) ("config_hash", alertmanager_config_hash{job="prometheus-prometheus-oper-alertmanager",namespace="monitoring"}) / on(service) group_left() label_replace(prometheus_operator_spec_replicas{controller="alertmanager",job="prometheus-prometheus-oper-operator",namespace="monitoring"}, "service", "$1", "name", "(.*)") != 1 for: 5m labels: severity: critical annotations: message: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` is out of sync. ok 20.627s ago 1.022ms
alert: AlertmanagerFailedReload expr: alertmanager_config_last_reload_successful{job="prometheus-prometheus-oper-alertmanager",namespace="monitoring"} == 0 for: 10m labels: severity: warning annotations: message: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}. ok 20.626s ago 226.6us
alert: AlertmanagerMembersInconsistent expr: alertmanager_cluster_members{job="prometheus-prometheus-oper-alertmanager",namespace="monitoring"} != on(service) group_left() count by(service) (alertmanager_cluster_members{job="prometheus-prometheus-oper-alertmanager",namespace="monitoring"}) for: 5m labels: severity: critical annotations: message: Alertmanager has not found all other members of the cluster. ok 20.626s ago 469.5us
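
Each row above is a flattened rendering of a single rule definition followed by its state, last evaluation, and evaluation time. As a sketch only (reconstructed from the AlertmanagerFailedReload row, not copied from the deployed file), the same rule expressed in Prometheus rule-file form would look roughly like this:

groups:
  - name: alertmanager.rules
    rules:
      - alert: AlertmanagerFailedReload
        expr: alertmanager_config_last_reload_successful{job="prometheus-prometheus-oper-alertmanager",namespace="monitoring"} == 0
        for: 10m
        labels:
          severity: warning
        annotations:
          message: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod }}.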

etcd (last evaluation: 2.897s ago, evaluation time: 6.132ms)

Rule State Error Last Evaluation Evaluation Time
alert: etcdInsufficientMembers expr: sum by(job) (up{job=~".*etcd.*"} == bool 1) < ((count by(job) (up{job=~".*etcd.*"}) + 1) / 2) for: 3m labels: severity: critical annotations: message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).' ok 2.897s ago 1.973ms
alert: etcdNoLeader expr: etcd_server_has_leader{job=~".*etcd.*"} == 0 for: 1m labels: severity: critical annotations: message: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.' ok 2.896s ago 329.9us
alert: etcdHighNumberOfLeaderChanges expr: rate(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}[15m]) > 3 for: 15m labels: severity: warning annotations: message: 'etcd cluster "{{ $labels.job }}": instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour.' ok 2.895s ago 386us
alert: etcdHighNumberOfFailedGRPCRequests expr: 100 * sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{grpc_code!="OK",job=~".*etcd.*"}[5m])) / sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) > 1 for: 10m labels: severity: warning annotations: message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.' ok 2.895s ago 777.1us
alert: etcdHighNumberOfFailedGRPCRequests expr: 100 * sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{grpc_code!="OK",job=~".*etcd.*"}[5m])) / sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) > 5 for: 5m labels: severity: critical annotations: message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.' ok 2.895s ago 748.4us
alert: etcdGRPCRequestsSlow expr: histogram_quantile(0.99, sum by(job, instance, grpc_service, grpc_method, le) (rate(grpc_server_handling_seconds_bucket{grpc_type="unary",job=~".*etcd.*"}[5m]))) > 0.15 for: 10m labels: severity: critical annotations: message: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.' ok 2.894s ago 261.2us
alert: etcdMemberCommunicationSlow expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.15 for: 10m labels: severity: warning annotations: message: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.' ok 2.894s ago 194.7us
alert: etcdHighNumberOfFailedProposals expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5 for: 15m labels: severity: warning annotations: message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last hour on etcd instance {{ $labels.instance }}.' ok 2.894s ago 168.5us
alert: etcdHighFsyncDurations expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.5 for: 10m labels: severity: warning annotations: message: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.' ok 2.894s ago 212.5us
alert: etcdHighCommitDurations expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.25 for: 10m labels: severity: warning annotations: message: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations are {{ $value }}s on etcd instance {{ $labels.instance }}.' ok 2.894s ago 195.1us
alert: etcdHighNumberOfFailedHTTPRequests expr: sum by(method) (rate(etcd_http_failed_total{code!="404",job=~".*etcd.*"}[5m])) / sum by(method) (rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) > 0.01 for: 10m labels: severity: warning annotations: message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}' ok 2.894s ago 350.1us
alert: etcdHighNumberOfFailedHTTPRequests expr: sum by(method) (rate(etcd_http_failed_total{code!="404",job=~".*etcd.*"}[5m])) / sum by(method) (rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) > 0.05 for: 10m labels: severity: critical annotations: message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}.' ok 2.894s ago 366.8us
alert: etcdHTTPRequestsSlow expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15 for: 10m labels: severity: warning annotations: message: etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow. ok 2.893s ago 129.1us

general.rules (last evaluation: 18.579s ago, evaluation time: 4.489ms)

Rule State Error Last Evaluation Evaluation Time
alert: TargetDown expr: 100 * (count by(job) (up == 0) / count by(job) (up)) > 10 for: 10m labels: severity: warning annotations: message: '{{ $value }}% of the {{ $labels.job }} targets are down.' ok 18.58s ago 4.078ms
alert: Watchdog expr: vector(1) labels: severity: none annotations: message: | This is an alert meant to ensure that the entire alerting pipeline is functional. This alert is always firing; therefore, it should always be firing in Alertmanager and always fire against a receiver. There are integrations with various notification mechanisms that send a notification when this alert is not firing. For example, the "DeadMansSnitch" integration in PagerDuty. ok 18.576s ago 390.8us
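
The Watchdog alert acts as a dead man's switch: it fires continuously so that an external system can page when notifications stop arriving. A minimal Alertmanager routing sketch for wiring it to such an integration is shown below; the receiver name and webhook URL are hypothetical placeholders, not part of this deployment:

route:
  routes:
    - match:
        alertname: Watchdog
      receiver: watchdog-deadmanssnitch   # hypothetical receiver name
      repeat_interval: 5m
receivers:
  - name: watchdog-deadmanssnitch
    webhook_configs:
      - url: https://example.com/watchdog-heartbeat   # placeholder endpoint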

k8s.rules (last evaluation: 6.873s ago, evaluation time: 75.29ms)

Rule State Error Last Evaluation Evaluation Time
record: namespace:container_cpu_usage_seconds_total:sum_rate expr: sum by(namespace) (rate(container_cpu_usage_seconds_total{container!="POD",image!="",job="kubelet"}[5m])) ok 6.873s ago 15.89ms
record: namespace_pod_container:container_cpu_usage_seconds_total:sum_rate expr: sum by(namespace, pod, container) (rate(container_cpu_usage_seconds_total{container!="POD",image!="",job="kubelet"}[5m])) ok 6.857s ago 15.44ms
record: namespace:container_memory_usage_bytes:sum expr: sum by(namespace) (container_memory_usage_bytes{container!="POD",image!="",job="kubelet"}) ok 6.842s ago 6.594ms
record: namespace:container_memory_usage_bytes:sum expr: sum by(namespace, label_name) (sum by(pod, namespace) (container_memory_usage_bytes{container!="POD",image!="",job="kubelet"}) * on(namespace, pod) group_left(label_name) kube_pod_labels{job="kube-state-metrics"}) ok 6.835s ago 10.7ms
record: namespace:kube_pod_container_resource_requests_memory_bytes:sum expr: sum by(namespace, label_name) (sum by(namespace, pod) (kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"} * on(endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)) * on(namespace, pod) group_left(label_name) kube_pod_labels{job="kube-state-metrics"}) ok 6.825s ago 12.79ms
record: namespace:kube_pod_container_resource_requests_cpu_cores:sum expr: sum by(namespace, label_name) (sum by(namespace, pod) (kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"} * on(endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)) * on(namespace, pod) group_left(label_name) kube_pod_labels{job="kube-state-metrics"}) ok 6.812s ago 9.645ms
record: mixin_pod_workload expr: sum by(namespace, workload, pod) (label_replace(label_replace(kube_pod_owner{job="kube-state-metrics",owner_kind="ReplicaSet"}, "replicaset", "$1", "owner_name", "(.*)") * on(replicaset, namespace) group_left(owner_name) kube_replicaset_owner{job="kube-state-metrics"}, "workload", "$1", "owner_name", "(.*)")) labels: workload_type: deployment ok 6.802s ago 3.291ms
record: mixin_pod_workload expr: sum by(namespace, workload, pod) (label_replace(kube_pod_owner{job="kube-state-metrics",owner_kind="DaemonSet"}, "workload", "$1", "owner_name", "(.*)")) labels: workload_type: daemonset ok 6.799s ago 455.4us
record: mixin_pod_workload expr: sum by(namespace, workload, pod) (label_replace(kube_pod_owner{job="kube-state-metrics",owner_kind="StatefulSet"}, "workload", "$1", "owner_name", "(.*)")) labels: workload_type: statefulset ok 6.799s ago 433.7us
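
Judging by the prometheus-prometheus-oper-* job labels, these groups are managed by the Prometheus Operator, so they are normally delivered as PrometheusRule objects rather than edited as flat rule files. A minimal sketch wrapping the first recording rule of this group in such an object (the metadata name and the release label are illustrative assumptions):

apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: k8s-rules            # illustrative name
  namespace: monitoring
  labels:
    release: prometheus      # assumed to match the Prometheus ruleSelector
spec:
  groups:
    - name: k8s.rules
      rules:
        - record: namespace:container_cpu_usage_seconds_total:sum_rate
          expr: sum by(namespace) (rate(container_cpu_usage_seconds_total{container!="POD",image!="",job="kubelet"}[5m]))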

kube-apiserver.rules (last evaluation: 29.982s ago, evaluation time: 1.233s)

Rule State Error Last Evaluation Evaluation Time
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile expr: histogram_quantile(0.99, sum without(instance, pod) (rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m]))) labels: quantile: "0.99" ok 29.982s ago 464.2ms
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile expr: histogram_quantile(0.9, sum without(instance, pod) (rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m]))) labels: quantile: "0.9" ok 29.518s ago 391.2ms
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile expr: histogram_quantile(0.5, sum without(instance, pod) (rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m]))) labels: quantile: "0.5" ok 29.127s ago 377.5ms

kube-prometheus-node-alerting.rules (last evaluation: 4.058s ago, evaluation time: 3.377ms)

Rule State Error Last Evaluation Evaluation Time
alert: NodeDiskRunningFull expr: '(node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[6h], 3600 * 24) < 0)' for: 30m labels: severity: warning annotations: message: Device {{ $labels.device }} on node {{ $labels.instance }} will be full within the next 24 hours. ok 4.058s ago 2.542ms
alert: NodeDiskRunningFull expr: '(node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[30m], 3600 * 2) < 0)' for: 10m labels: severity: critical annotations: message: Device {{ $labels.device }} on node {{ $labels.instance }} will be full within the next 2 hours. ok 4.055s ago 816.1us
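
Both NodeDiskRunningFull variants pair a usage threshold with a linear forecast: node:node_filesystem_usage: > 0.85 requires the filesystem to already be more than 85% full, and predict_linear(node:node_filesystem_avail:[6h], 3600 * 24) < 0 extrapolates the last 6 hours of available space 24 hours (3600 * 24 seconds) ahead and matches when that projection crosses zero. The critical variant tightens this to a 30-minute sample window projected 2 hours (3600 * 2 seconds) ahead.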

kube-prometheus-node-recording.rules (last evaluation: 27.4s ago, evaluation time: 86.06ms)

Rule State Error Last Evaluation Evaluation Time
record: instance:node_cpu:rate:sum expr: sum by(instance) (rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) ok 27.4s ago 15.44ms
record: instance:node_filesystem_usage:sum expr: sum by(instance) ((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"})) ok 27.385s ago 416.1us
record: instance:node_network_receive_bytes:rate:sum expr: sum by(instance) (rate(node_network_receive_bytes_total[3m])) ok 27.385s ago 5.364ms
record: instance:node_network_transmit_bytes:rate:sum expr: sum by(instance) (rate(node_network_transmit_bytes_total[3m])) ok 27.379s ago 6.082ms
record: instance:node_cpu:ratio expr: sum without(cpu, mode) (rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) / on(instance) group_left() count by(instance) (sum by(instance, cpu) (node_cpu_seconds_total)) ok 27.373s ago 35.09ms
record: cluster:node_cpu:sum_rate5m expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) ok 27.338s ago 11.55ms
record: cluster:node_cpu:ratio expr: cluster:node_cpu_seconds_total:rate5m / count(sum by(instance, cpu) (node_cpu_seconds_total)) ok 27.327s ago 12.1ms

kube-scheduler.rules (last evaluation: 29.242s ago, evaluation time: 2.111ms)

Rule State Error Last Evaluation Evaluation Time
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile expr: histogram_quantile(0.99, sum without(instance, pod) (rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m]))) labels: quantile: "0.99" ok 29.243s ago 500.2us
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile expr: histogram_quantile(0.99, sum without(instance, pod) (rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m]))) labels: quantile: "0.99" ok 29.242s ago 253.2us
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile expr: histogram_quantile(0.99, sum without(instance, pod) (rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m]))) labels: quantile: "0.99" ok 29.242s ago 205.9us
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile expr: histogram_quantile(0.9, sum without(instance, pod) (rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m]))) labels: quantile: "0.9" ok 29.242s ago 206.3us
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile expr: histogram_quantile(0.9, sum without(instance, pod) (rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m]))) labels: quantile: "0.9" ok 29.242s ago 201.4us
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile expr: histogram_quantile(0.9, sum without(instance, pod) (rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m]))) labels: quantile: "0.9" ok 29.242s ago 174us
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile expr: histogram_quantile(0.5, sum without(instance, pod) (rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m]))) labels: quantile: "0.5" ok 29.242s ago 179.6us
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile expr: histogram_quantile(0.5, sum without(instance, pod) (rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m]))) labels: quantile: "0.5" ok 29.242s ago 181.2us
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile expr: histogram_quantile(0.5, sum without(instance, pod) (rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m]))) labels: quantile: "0.5" ok 29.242s ago 179.2us

kubernetes-absent (last evaluation: 6.834s ago, evaluation time: 3.714ms)

Rule State Error Last Evaluation Evaluation Time
alert: AlertmanagerDown expr: absent(up{job="prometheus-prometheus-oper-alertmanager",namespace="monitoring"} == 1) for: 15m labels: severity: critical annotations: message: Alertmanager has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerdown ok 6.834s ago 402.4us
alert: KubeAPIDown expr: absent(up{job="apiserver"} == 1) for: 15m labels: severity: critical annotations: message: KubeAPI has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown ok 6.834s ago 266.5us
alert: KubeControllerManagerDown expr: absent(up{job="kube-controller-manager"} == 1) for: 15m labels: severity: critical annotations: message: KubeControllerManager has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontrollermanagerdown ok 6.834s ago 960.1us
alert: KubeSchedulerDown expr: absent(up{job="kube-scheduler"} == 1) for: 15m labels: severity: critical annotations: message: KubeScheduler has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeschedulerdown ok 6.833s ago 952.5us
alert: KubeStateMetricsDown expr: absent(up{job="kube-state-metrics"} == 1) for: 15m labels: severity: critical annotations: message: KubeStateMetrics has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricsdown ok 6.832s ago 267.1us
alert: KubeletDown expr: absent(up{job="kubelet"} == 1) for: 15m labels: severity: critical annotations: message: Kubelet has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown ok 6.832s ago 291.1us
alert: NodeExporterDown expr: absent(up{job="node-exporter"} == 1) for: 15m labels: severity: critical annotations: message: NodeExporter has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeexporterdown ok 6.832s ago 192.5us
alert: PrometheusDown expr: absent(up{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"} == 1) for: 15m labels: severity: critical annotations: message: Prometheus has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusdown ok 6.832s ago 165us
alert: PrometheusOperatorDown expr: absent(up{job="prometheus-prometheus-oper-operator",namespace="monitoring"} == 1) for: 15m labels: severity: critical annotations: message: PrometheusOperator has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatordown ok 6.832s ago 182.1us
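
Every alert in this group uses the same absent(up{...} == 1) pattern: the expression only returns a value when no series for that job is currently up, so it fires both when the targets are scraped but down and when they have vanished from service discovery altogether. Extending the group for an additional scrape job would follow the same shape; the job name below is a hypothetical example:

- alert: MyExporterDown
  expr: absent(up{job="my-exporter"} == 1)
  for: 15m
  labels:
    severity: critical
  annotations:
    message: MyExporter has disappeared from Prometheus target discovery.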

kubernetes-apps (last evaluation: 1.468s ago, evaluation time: 41.08ms)

Rule State Error Last Evaluation Evaluation Time
alert: KubePodCrashLooping expr: rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) * 60 * 5 > 0 for: 1h labels: severity: critical annotations: message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping ok 1.468s ago 10.84ms
alert: KubePodNotReady expr: sum by(namespace, pod) (kube_pod_status_phase{job="kube-state-metrics",phase=~"Failed|Pending|Unknown"}) > 0 for: 1h labels: severity: critical annotations: message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than an hour. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready ok 1.458s ago 18.38ms
alert: KubeDeploymentGenerationMismatch expr: kube_deployment_status_observed_generation{job="kube-state-metrics"} != kube_deployment_metadata_generation{job="kube-state-metrics"} for: 15m labels: severity: critical annotations: message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match; this indicates that the Deployment has failed but has not been rolled back. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch ok 1.439s ago 4.395ms
alert: KubeDeploymentReplicasMismatch expr: kube_deployment_spec_replicas{job="kube-state-metrics"} != kube_deployment_status_replicas_available{job="kube-state-metrics"} for: 1h labels: severity: critical annotations: message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than an hour. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch ok 1.435s ago 4.42ms
alert: KubeStatefulSetReplicasMismatch expr: kube_statefulset_status_replicas_ready{job="kube-state-metrics"} != kube_statefulset_status_replicas{job="kube-state-metrics"} for: 15m labels: severity: critical annotations: message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch ok 1.431s ago 398.7us
alert: KubeStatefulSetGenerationMismatch expr: kube_statefulset_status_observed_generation{job="kube-state-metrics"} != kube_statefulset_metadata_generation{job="kube-state-metrics"} for: 15m labels: severity: critical annotations: message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match; this indicates that the StatefulSet has failed but has not been rolled back. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch ok 1.431s ago 393.3us
alert: KubeStatefulSetUpdateNotRolledOut expr: max without(revision) (kube_statefulset_status_current_revision{job="kube-state-metrics"} unless kube_statefulset_status_update_revision{job="kube-state-metrics"}) * (kube_statefulset_replicas{job="kube-state-metrics"} != kube_statefulset_status_replicas_updated{job="kube-state-metrics"}) for: 15m labels: severity: critical annotations: message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout ok 1.43s ago 787.7us
alert: KubeDaemonSetRolloutStuck expr: kube_daemonset_status_number_ready{job="kube-state-metrics"} / kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} * 100 < 100 for: 15m labels: severity: critical annotations: message: Only {{ $value }}% of the desired Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are scheduled and ready. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck ok 1.43s ago 272.8us
alert: KubeDaemonSetNotScheduled expr: kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} - kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0 for: 10m labels: severity: warning annotations: message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled ok 1.43s ago 199.4us
alert: KubeDaemonSetMisScheduled expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0 for: 10m labels: severity: warning annotations: message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled ok 1.43s ago 106us
alert: KubeCronJobRunning expr: time() - kube_cronjob_next_schedule_time{job="kube-state-metrics"} > 3600 for: 1h labels: severity: warning annotations: message: CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecronjobrunning ok 1.43s ago 107.2us
alert: KubeJobCompletion expr: kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0 for: 1h labels: severity: warning annotations: message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than one hour to complete. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion ok 1.43s ago 595.3us
alert: KubeJobFailed expr: kube_job_status_failed{job="kube-state-metrics"} > 0 for: 1h labels: severity: warning annotations: message: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed ok 1.429s ago 142.3us
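
In the KubePodCrashLooping expression, rate(kube_pod_container_status_restarts_total[15m]) is a per-second restart rate averaged over 15 minutes; multiplying by 60 * 5 rescales it to restarts per 5 minutes, which is the figure the annotation prints via {{ printf "%.2f" $value }}.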

kubernetes-resources (last evaluation: 16.777s ago, evaluation time: 9.41ms)

Rule State Error Last Evaluation Evaluation Time
alert: KubeCPUOvercommit expr: sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum) / sum(kube_node_status_allocatable_cpu_cores) > (count(kube_node_status_allocatable_cpu_cores) - 1) / count(kube_node_status_allocatable_cpu_cores) for: 5m labels: severity: warning annotations: message: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit ok 16.777s ago 1.103ms
alert: KubeMemOvercommit expr: sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum) / sum(kube_node_status_allocatable_memory_bytes) > (count(kube_node_status_allocatable_memory_bytes) - 1) / count(kube_node_status_allocatable_memory_bytes) for: 5m labels: severity: warning annotations: message: Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit ok 16.776s ago 1.685ms
alert: KubeCPUOvercommit expr: sum(kube_resourcequota{job="kube-state-metrics",resource="cpu",type="hard"}) / sum(kube_node_status_allocatable_cpu_cores) > 1.5 for: 5m labels: severity: warning annotations: message: Cluster has overcommitted CPU resource requests for Namespaces. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit ok 16.775s ago 361.1us
alert: KubeMemOvercommit expr: sum(kube_resourcequota{job="kube-state-metrics",resource="memory",type="hard"}) / sum(kube_node_status_allocatable_memory_bytes{job="node-exporter"}) > 1.5 for: 5m labels: severity: warning annotations: message: Cluster has overcommitted memory resource requests for Namespaces. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit ok 16.775s ago 302.5us
alert: KubeQuotaExceeded expr: 100 * kube_resourcequota{job="kube-state-metrics",type="used"} / ignoring(instance, job, type) (kube_resourcequota{job="kube-state-metrics",type="hard"} > 0) > 90 for: 15m labels: severity: warning annotations: message: Namespace {{ $labels.namespace }} is using {{ printf "%0.0f" $value }}% of its {{ $labels.resource }} quota. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded ok 16.774s ago 367us
alert: CPUThrottlingHigh expr: 100 * sum by(container, pod, namespace) (increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) / sum by(container, pod, namespace) (increase(container_cpu_cfs_periods_total[5m])) > 25 for: 15m labels: severity: warning annotations: message: '{{ printf "%0.0f" $value }}% throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh ok 16.774s ago 5.558ms
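
The KubeCPUOvercommit and KubeMemOvercommit pair compares total resource requests against the capacity the cluster would retain after losing one node: with n nodes the threshold is (n - 1) / n, so on a 4-node cluster the alerts fire once requests exceed 3/4 of total allocatable CPU or memory, the point beyond which the remaining nodes could no longer absorb a single node failure.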

kubernetes-storage (last evaluation: 6.612s ago, evaluation time: 16.12ms)

Rule State Error Last Evaluation Evaluation Time
alert: KubePersistentVolumeUsageCritical expr: 100 * kubelet_volume_stats_available_bytes{job="kubelet"} / kubelet_volume_stats_capacity_bytes{job="kubelet"} < 3 for: 1m labels: severity: critical annotations: message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ printf "%0.2f" $value }}% free. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical ok 6.612s ago 5.797ms
alert: KubePersistentVolumeFullInFourDays expr: 100 * (kubelet_volume_stats_available_bytes{job="kubelet"} / kubelet_volume_stats_capacity_bytes{job="kubelet"}) < 15 and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0 for: 5m labels: severity: critical annotations: message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ printf "%0.2f" $value }}% is available. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays ok 6.606s ago 8.974ms
alert: KubePersistentVolumeErrors expr: kube_persistentvolume_status_phase{job="kube-state-metrics",phase=~"Failed|Pending"} > 0 for: 5m labels: severity: critical annotations: message: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors ok 6.597s ago 1.328ms

kubernetes-system (last evaluation: 25.836s ago, evaluation time: 68.69ms)

Rule State Error Last Evaluation Evaluation Time
alert: KubeNodeNotReady expr: kube_node_status_condition{condition="Ready",job="kube-state-metrics",status="true"} == 0 for: 1h labels: severity: warning annotations: message: '{{ $labels.node }} has been unready for more than an hour.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready ok 25.836s ago 440.7us
alert: KubeVersionMismatch expr: count(count by(gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"}, "gitVersion", "$1", "gitVersion", "(v[0-9]*.[0-9]*.[0-9]*).*"))) > 1 for: 1h labels: severity: warning annotations: message: There are {{ $value }} different semantic versions of Kubernetes components running. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch ok 25.835s ago 798us
alert: KubeClientErrors expr: (sum by(instance, job) (rate(rest_client_requests_total{code=~"5.."}[5m])) / sum by(instance, job) (rate(rest_client_requests_total[5m]))) * 100 > 1 for: 15m labels: severity: warning annotations: message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ printf "%0.0f" $value }}% errors. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors ok 25.835s ago 2.556ms
alert: KubeClientErrors expr: sum by(instance, job) (rate(ksm_scrape_error_total{job="kube-state-metrics"}[5m])) > 0.1 for: 15m labels: severity: warning annotations: message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ printf "%0.0f" $value }} errors / second. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors ok 25.832s ago 214.1us
alert: KubeletTooManyPods expr: kubelet_running_pod_count{job="kubelet"} > 110 * 0.9 for: 15m labels: severity: warning annotations: message: Kubelet {{ $labels.instance }} is running {{ $value }} Pods, close to the limit of 110. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods ok 25.832s ago 177.8us
alert: KubeAPILatencyHigh expr: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1 for: 10m labels: severity: warning annotations: message: The API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh ok 25.832s ago 3.408ms
alert: KubeAPILatencyHigh expr: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4 for: 10m labels: severity: critical annotations: message: The API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh ok 25.829s ago 2.982ms
alert: KubeAPIErrorsHigh expr: sum(rate(apiserver_request_total{code=~"^(?:5..)$",job="apiserver"}[5m])) / sum(rate(apiserver_request_total{job="apiserver"}[5m])) * 100 > 3 for: 10m labels: severity: critical annotations: message: API server is returning errors for {{ $value }}% of requests. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh ok 25.826s ago 15.39ms
alert: KubeAPIErrorsHigh expr: sum(rate(apiserver_request_total{code=~"^(?:5..)$",job="apiserver"}[5m])) / sum(rate(apiserver_request_total{job="apiserver"}[5m])) * 100 > 1 for: 10m labels: severity: warning annotations: message: API server is returning errors for {{ $value }}% of requests. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh ok 25.811s ago 13.55ms
alert: KubeAPIErrorsHigh expr: sum by(resource, subresource, verb) (rate(apiserver_request_total{code=~"^(?:5..)$",job="apiserver"}[5m])) / sum by(resource, subresource, verb) (rate(apiserver_request_total{job="apiserver"}[5m])) * 100 > 10 for: 10m labels: severity: critical annotations: message: API server is returning errors for {{ $value }}% of requests for {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh ok 25.798s ago 13.83ms
alert: KubeAPIErrorsHigh expr: sum by(resource, subresource, verb) (rate(apiserver_request_total{code=~"^(?:5..)$",job="apiserver"}[5m])) / sum by(resource, subresource, verb) (rate(apiserver_request_total{job="apiserver"}[5m])) * 100 > 5 for: 10m labels: severity: warning annotations: message: API server is returning errors for {{ $value }}% of requests for {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh ok 25.784s ago 13.77ms
alert: KubeClientCertificateExpiration expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by(job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800 labels: severity: warning annotations: message: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration ok 25.771s ago 832.1us
alert: KubeClientCertificateExpiration expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by(job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 labels: severity: critical annotations: message: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration ok 25.77s ago 692.7us
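
The two KubeClientCertificateExpiration rules take the 1st percentile of client-certificate expiry times (histogram_quantile(0.01, ...)) and compare it against 604800 seconds (7 days) for the warning and 86400 seconds (24 hours) for the critical severity, matching the "7.0 days" and "24.0 hours" wording in the messages; the apiserver_client_certificate_expiration_seconds_count > 0 guard keeps the quantile from being evaluated on apiservers that have recorded no client-certificate observations at all.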

node-network (last evaluation: 3.718s ago, evaluation time: 6.569ms)

Rule State Error Last Evaluation Evaluation Time
alert: NetworkReceiveErrors expr: rate(node_network_receive_errs_total{device!~"veth.+",job="node-exporter"}[2m]) > 0 for: 2m labels: severity: warning annotations: message: Network interface "{{ $labels.device }}" showing receive errors on node-exporter {{ $labels.namespace }}/{{ $labels.pod }} ok 3.718s ago 2.578ms
alert: NetworkTransmitErrors expr: rate(node_network_transmit_errs_total{device!~"veth.+",job="node-exporter"}[2m]) > 0 for: 2m labels: severity: warning annotations: message: Network interface "{{ $labels.device }}" showing transmit errors on node-exporter {{ $labels.namespace }}/{{ $labels.pod }} ok 3.715s ago 2.074ms
alert: NodeNetworkInterfaceFlapping expr: changes(node_network_up{device!~"veth.+",job="node-exporter"}[2m]) > 2 for: 2m labels: severity: warning annotations: message: Network interface "{{ $labels.device }}" changing its up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }} ok 3.713s ago 1.892ms

node-time (last evaluation: 7.93s ago, evaluation time: 545.9us)

Rule State Error Last Evaluation Evaluation Time
alert: ClockSkewDetected expr: abs(node_timex_offset_seconds{job="node-exporter"}) > 0.05 for: 2m labels: severity: warning annotations: message: Clock skew detected on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}. Ensure NTP is configured correctly on this host. ok 7.931s ago 532.1us

node.rules (last evaluation: 23.163s ago, evaluation time: 130.6ms)

Rule State Error Last Evaluation Evaluation Time
record: :kube_pod_info_node_count: expr: sum(min by(node) (kube_pod_info)) ok 23.163s ago 5.296ms
record: node_namespace_pod:kube_pod_info: expr: max by(node, namespace, pod) (label_replace(kube_pod_info{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")) ok 23.158s ago 10.42ms
record: node:node_num_cpu:sum expr: count by(node) (sum by(node, cpu) (node_cpu_seconds_total{job="node-exporter"} * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:)) ok 23.148s ago 18.87ms
record: :node_cpu_utilisation:avg1m expr: 1 - avg(rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m])) ok 23.129s ago 2.05ms
record: node:node_cpu_utilisation:avg1m expr: 1 - avg by(node) (rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m]) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) ok 23.127s ago 4.742ms
record: node:cluster_cpu_utilisation:ratio expr: node:node_cpu_utilisation:avg1m * node:node_num_cpu:sum / scalar(sum(node:node_num_cpu:sum)) ok 23.122s ago 256.2us
record: :node_cpu_saturation_load1: expr: sum(node_load1{job="node-exporter"}) / sum(node:node_num_cpu:sum) ok 23.122s ago 174.7us
record: node:node_cpu_saturation_load1: expr: sum by(node) (node_load1{job="node-exporter"} * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) / node:node_num_cpu:sum ok 23.122s ago 2.646ms
record: :node_memory_utilisation: expr: 1 - sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"}) / sum(node_memory_MemTotal_bytes{job="node-exporter"}) ok 23.119s ago 373us
record: :node_memory_MemFreeCachedBuffers_bytes:sum expr: sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"}) ok 23.119s ago 246.9us
record: :node_memory_MemTotal_bytes:sum expr: sum(node_memory_MemTotal_bytes{job="node-exporter"}) ok 23.119s ago 89.31us
record: node:node_memory_bytes_available:sum expr: sum by(node) ((node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"}) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) ok 23.119s ago 2.723ms
record: node:node_memory_bytes_total:sum expr: sum by(node) (node_memory_MemTotal_bytes{job="node-exporter"} * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) ok 23.116s ago 2.508ms
record: node:node_memory_utilisation:ratio expr: (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum) / node:node_memory_bytes_total:sum ok 23.114s ago 225us
record: node:cluster_memory_utilisation:ratio expr: (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum) / scalar(sum(node:node_memory_bytes_total:sum)) ok 23.114s ago 204.1us
record: :node_memory_swap_io_bytes:sum_rate expr: 1000 * sum((rate(node_vmstat_pgpgin{job="node-exporter"}[1m]) + rate(node_vmstat_pgpgout{job="node-exporter"}[1m]))) ok 23.114s ago 259.9us
record: node:node_memory_utilisation: expr: 1 - sum by(node) ((node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"}) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) / sum by(node) (node_memory_MemTotal_bytes{job="node-exporter"} * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) ok 23.114s ago 5.147ms
record: node:node_memory_utilisation_2: expr: 1 - (node:node_memory_bytes_available:sum / node:node_memory_bytes_total:sum) ok 23.109s ago 188.3us
record: node:node_memory_swap_io_bytes:sum_rate expr: 1000 * sum by(node) ((rate(node_vmstat_pgpgin{job="node-exporter"}[1m]) + rate(node_vmstat_pgpgout{job="node-exporter"}[1m])) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) ok 23.108s ago 2.646ms
record: :node_disk_utilisation:avg_irate expr: avg(irate(node_disk_io_time_seconds_total{device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+",job="node-exporter"}[1m])) ok 23.106s ago 1.323ms
record: node:node_disk_utilisation:avg_irate expr: avg by(node) (irate(node_disk_io_time_seconds_total{device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+",job="node-exporter"}[1m]) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) ok 23.105s ago 6.605ms
record: :node_disk_saturation:avg_irate expr: avg(irate(node_disk_io_time_weighted_seconds_total{device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+",job="node-exporter"}[1m])) ok 23.098s ago 3.949ms
record: node:node_disk_saturation:avg_irate expr: avg by(node) (irate(node_disk_io_time_weighted_seconds_total{device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+",job="node-exporter"}[1m]) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) ok 23.094s ago 10.49ms
record: node:node_filesystem_usage: expr: max by(instance, namespace, pod, device) ((node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} - node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) ok 23.084s ago 1.211ms
record: node:node_filesystem_avail: expr: max by(instance, namespace, pod, device) (node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) ok 23.083s ago 794.6us
record: :node_net_utilisation:sum_irate expr: sum(irate(node_network_receive_bytes_total{device!~"veth.+",job="node-exporter"}[1m])) + sum(irate(node_network_transmit_bytes_total{device!~"veth.+",job="node-exporter"}[1m])) ok 23.082s ago 5.748ms
record: node:node_net_utilisation:sum_irate expr: sum by(node) ((irate(node_network_receive_bytes_total{device!~"veth.+",job="node-exporter"}[1m]) + irate(node_network_transmit_bytes_total{device!~"veth.+",job="node-exporter"}[1m])) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) ok 23.077s ago 12.02ms
record: :node_net_saturation:sum_irate expr: sum(irate(node_network_receive_drop_total{device!~"veth.+",job="node-exporter"}[1m])) + sum(irate(node_network_transmit_drop_total{device!~"veth.+",job="node-exporter"}[1m])) ok 23.065s ago 4.66ms
record: node:node_net_saturation:sum_irate expr: sum by(node) ((irate(node_network_receive_drop_total{device!~"veth.+",job="node-exporter"}[1m]) + irate(node_network_transmit_drop_total{device!~"veth.+",job="node-exporter"}[1m])) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) ok 23.06s ago 11.07ms
record: node:node_inodes_total: expr: max by(node) (max by(node, host_ip) (kube_pod_info{host_ip!="",job="kube-state-metrics"}) * on(host_ip) group_right(node) label_replace((max by(instance) (node_filesystem_files{job="node-exporter",mountpoint="/"})), "host_ip", "$1", "instance", "(.*):.*")) ok 23.049s ago 7.361ms
record: node:node_inodes_free: expr: max by(node) (max by(node, host_ip) (kube_pod_info{host_ip!="",job="kube-state-metrics"}) * on(host_ip) group_right(node) label_replace((max by(instance) (node_filesystem_files_free{job="node-exporter",mountpoint="/"})), "host_ip", "$1", "instance", "(.*):.*")) ok 23.042s ago 6.207ms

prometheus-operator (last evaluation: 26.392s ago, evaluation time: 813.3us)

Rule State Error Last Evaluation Evaluation Time
alert: PrometheusOperatorReconcileErrors expr: rate(prometheus_operator_reconcile_errors_total{job="prometheus-prometheus-oper-operator",namespace="monitoring"}[5m]) > 0.1 for: 10m labels: severity: warning annotations: message: Errors while reconciling {{ $labels.controller }} in {{ $labels.namespace }} Namespace. ok 26.393s ago 582.8us
alert: PrometheusOperatorNodeLookupErrors expr: rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-prometheus-oper-operator",namespace="monitoring"}[5m]) > 0.1 for: 10m labels: severity: warning annotations: message: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace. ok 26.392s ago 212.1us

prometheus (last evaluation: 22.901s ago, evaluation time: 9.417ms)

Rule State Error Last Evaluation Evaluation Time
alert: PrometheusBadConfig expr: max_over_time(prometheus_config_last_reload_successful{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) == 0 for: 10m labels: severity: critical annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to reload its configuration. summary: Failed Prometheus configuration reload. ok 22.902s ago 477.2us
alert: PrometheusNotificationQueueRunningFull expr: (predict_linear(prometheus_notifications_queue_length{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m], 60 * 30) > min_over_time(prometheus_notifications_queue_capacity{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])) for: 15m labels: severity: warning annotations: description: Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}} is running full. summary: Prometheus alert notification queue predicted to run full in less than 30m. ok 22.901s ago 762.8us
alert: PrometheusErrorSendingAlertsToSomeAlertmanagers expr: (rate(prometheus_notifications_errors_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])) * 100 > 1 for: 15m labels: severity: warning annotations: description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}.' summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager. ok 22.901s ago 867.4us
alert: PrometheusErrorSendingAlertsToAnyAlertmanager expr: min without(alertmanager) (rate(prometheus_notifications_errors_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])) * 100 > 3 for: 15m labels: severity: critical annotations: description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.' summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager. ok 22.9s ago 935.8us
alert: PrometheusNotConnectedToAlertmanagers expr: max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) < 1 for: 10m labels: severity: warning annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to any Alertmanagers. summary: Prometheus is not connected to any Alertmanagers. ok 22.899s ago 404.2us
alert: PrometheusTSDBReloadsFailing expr: increase(prometheus_tsdb_reloads_failures_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[3h]) > 0 for: 4h labels: severity: warning annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} reload failures over the last 3h. summary: Prometheus has issues reloading blocks from disk. ok 22.899s ago 689.5us
alert: PrometheusTSDBCompactionsFailing expr: increase(prometheus_tsdb_compactions_failed_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[3h]) > 0 for: 4h labels: severity: warning annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} compaction failures over the last 3h. summary: Prometheus has issues compacting blocks. ok 22.899s ago 603.8us
alert: PrometheusTSDBWALCorruptions expr: increase(tsdb_wal_corruptions_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[3h]) > 0 for: 4h labels: severity: warning annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} corruptions of the write-ahead log (WAL) over the last 3h. summary: Prometheus is detecting WAL corruptions. ok 22.898s ago 335.2us
alert: PrometheusNotIngestingSamples expr: rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) <= 0 for: 10m labels: severity: warning annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting samples. summary: Prometheus is not ingesting samples. ok 22.898s ago 391.2us
alert: PrometheusDuplicateTimestamps expr: rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) > 0 for: 10m labels: severity: warning annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{$value | humanize}} samples/s with different values but duplicated timestamp. summary: Prometheus is dropping samples with duplicate timestamps. ok 22.898s ago 346us
alert: PrometheusOutOfOrderTimestamps expr: rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) > 0 for: 10m labels: severity: warning annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{$value | humanize}} samples/s with timestamps arriving out of order. summary: Prometheus drops samples with out-of-order timestamps. ok 22.898s ago 373.5us
alert: PrometheusRemoteStorageFailures expr: (rate(prometheus_remote_storage_failed_samples_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) / (rate(prometheus_remote_storage_failed_samples_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) + rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]))) * 100 > 1 for: 15m labels: severity: critical annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send {{ printf "%.1f" $value }}% of the samples to queue {{$labels.queue}}. summary: Prometheus fails to send samples to remote storage. ok 22.897s ago 818.2us
alert: PrometheusRemoteWriteBehind expr: (max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) - on(job, instance) group_right() max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])) > 120 for: 15m labels: severity: critical annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write is {{ printf "%.1f" $value }}s behind for queue {{$labels.queue}}. summary: Prometheus remote write is behind. ok 22.897s ago 667.6us
alert: PrometheusRuleFailures expr: increase(prometheus_rule_evaluation_failures_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) > 0 for: 15m labels: severity: critical annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to evaluate {{ printf "%.0f" $value }} rules in the last 5m. summary: Prometheus is failing rule evaluations. ok 22.896s ago 397us
alert: PrometheusMissingRuleEvaluations expr: increase(prometheus_rule_group_iterations_missed_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) > 0 for: 15m labels: severity: warning annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed {{ printf "%.0f" $value }} rule group evaluations in the last 5m. summary: Prometheus is missing rule evaluations due to slow rule group evaluation. ok 22.896s ago 1.278ms