Rules

alertmanager.rules (last evaluation: 23.923s ago, evaluation time: 1.376ms)

Rule State Error Last Evaluation Evaluation Time
alert: AlertmanagerConfigInconsistent expr: count_values by(service) ("config_hash", alertmanager_config_hash{job="prometheus-prometheus-oper-alertmanager",namespace="monitoring"}) / on(service) group_left() label_replace(prometheus_operator_spec_replicas{controller="alertmanager",job="prometheus-prometheus-oper-operator",namespace="monitoring"}, "service", "$1", "name", "(.*)") != 1 for: 5m labels: severity: critical annotations: message: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync. ok 23.927s ago 758.5us
alert: AlertmanagerFailedReload expr: alertmanager_config_last_reload_successful{job="prometheus-prometheus-oper-alertmanager",namespace="monitoring"} == 0 for: 10m labels: severity: warning annotations: message: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}. ok 23.926s ago 250.3us
alert: AlertmanagerMembersInconsistent expr: alertmanager_cluster_members{job="prometheus-prometheus-oper-alertmanager",namespace="monitoring"} != on(service) group_left() count by(service) (alertmanager_cluster_members{job="prometheus-prometheus-oper-alertmanager",namespace="monitoring"}) for: 5m labels: severity: critical annotations: message: Alertmanager has not found all other members of the cluster. ok 23.926s ago 348.9us
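
For readability, here is the AlertmanagerFailedReload rule from this group expanded into the standard Prometheus rule-file YAML layout. The content is taken from the row above; only the multi-line formatting is a sketch.

```yaml
groups:
  - name: alertmanager.rules
    rules:
      - alert: AlertmanagerFailedReload
        expr: alertmanager_config_last_reload_successful{job="prometheus-prometheus-oper-alertmanager",namespace="monitoring"} == 0
        for: 10m
        labels:
          severity: warning
        annotations:
          message: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}.
```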

etcd (last evaluation: 6.199s ago, evaluation time: 5.081ms)

Rule State Error Last Evaluation Evaluation Time
alert: etcdInsufficientMembers expr: sum by(job) (up{job=~".*etcd.*"} == bool 1) < ((count by(job) (up{job=~".*etcd.*"}) + 1) / 2) for: 3m labels: severity: critical annotations: message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).' ok 6.199s ago 1.768ms
alert: etcdNoLeader expr: etcd_server_has_leader{job=~".*etcd.*"} == 0 for: 1m labels: severity: critical annotations: message: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.' ok 6.198s ago 226.2us
alert: etcdHighNumberOfLeaderChanges expr: rate(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}[15m]) > 3 for: 15m labels: severity: warning annotations: message: 'etcd cluster "{{ $labels.job }}": instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour.' ok 6.198s ago 216.7us
alert: etcdHighNumberOfFailedGRPCRequests expr: 100 * sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{grpc_code!="OK",job=~".*etcd.*"}[5m])) / sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) > 1 for: 10m labels: severity: warning annotations: message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.' ok 6.198s ago 417.4us
alert: etcdHighNumberOfFailedGRPCRequests expr: 100 * sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{grpc_code!="OK",job=~".*etcd.*"}[5m])) / sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) > 5 for: 5m labels: severity: critical annotations: message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.' ok 6.198s ago 404.8us
alert: etcdGRPCRequestsSlow expr: histogram_quantile(0.99, sum by(job, instance, grpc_service, grpc_method, le) (rate(grpc_server_handling_seconds_bucket{grpc_type="unary",job=~".*etcd.*"}[5m]))) > 0.15 for: 10m labels: severity: critical annotations: message: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.' ok 6.197s ago 261.9us
alert: etcdMemberCommunicationSlow expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.15 for: 10m labels: severity: warning annotations: message: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.' ok 6.197s ago 208.8us
alert: etcdHighNumberOfFailedProposals expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5 for: 15m labels: severity: warning annotations: message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last hour on etcd instance {{ $labels.instance }}.' ok 6.197s ago 177.8us
alert: etcdHighFsyncDurations expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.5 for: 10m labels: severity: warning annotations: message: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.' ok 6.197s ago 283.8us
alert: etcdHighCommitDurations expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.25 for: 10m labels: severity: warning annotations: message: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations are {{ $value }}s on etcd instance {{ $labels.instance }}.' ok 6.197s ago 225.9us
alert: etcdHighNumberOfFailedHTTPRequests expr: sum by(method) (rate(etcd_http_failed_total{code!="404",job=~".*etcd.*"}[5m])) / sum by(method) (rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) > 0.01 for: 10m labels: severity: warning annotations: message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}' ok 6.197s ago 366.6us
alert: etcdHighNumberOfFailedHTTPRequests expr: sum by(method) (rate(etcd_http_failed_total{code!="404",job=~".*etcd.*"}[5m])) / sum by(method) (rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) > 0.05 for: 10m labels: severity: critical annotations: message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}.' ok 6.197s ago 352.1us
alert: etcdHTTPRequestsSlow expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15 for: 10m labels: severity: warning annotations: message: etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow. ok 6.197s ago 131.3us

general.rules (last evaluation: 21.88s ago, evaluation time: 4.446ms)

Rule State Error Last Evaluation Evaluation Time
alert: TargetDown expr: 100 * (count by(job) (up == 0) / count by(job) (up)) > 10 for: 10m labels: severity: warning annotations: message: '{{ $value }}% of the {{ $labels.job }} targets are down.' ok 21.881s ago 4.05ms
alert: Watchdog expr: vector(1) labels: severity: none annotations: message: | This is an alert meant to ensure that the entire alerting pipeline is functional. This alert is always firing, therefore it should always be firing in Alertmanager and always fire against a receiver. There are integrations with various notification mechanisms that send a notification when this alert is not firing. For example the "DeadMansSnitch" integration in PagerDuty. ok 21.877s ago 375.6us
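
The Watchdog annotation describes a dead man's switch: the alert fires constantly, and an external service pages when the notifications stop arriving. A minimal sketch of the Alertmanager routing that typically accompanies it, assuming a webhook-based integration; the receiver name and URL below are placeholders, not part of this deployment.

```yaml
# Hypothetical Alertmanager config fragment for the always-firing Watchdog alert.
route:
  routes:
    - match:
        alertname: Watchdog
      receiver: watchdog          # placeholder receiver name
      repeat_interval: 5m         # keep re-notifying so the external service notices silence
receivers:
  - name: watchdog
    webhook_configs:
      - url: https://example.com/watchdog   # placeholder dead man's switch endpoint
```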

k8s.rules (last evaluation: 10.174s ago, evaluation time: 72.24ms)

Rule State Error Last Evaluation Evaluation Time
record: namespace:container_cpu_usage_seconds_total:sum_rate expr: sum by(namespace) (rate(container_cpu_usage_seconds_total{container!="POD",image!="",job="kubelet"}[5m])) ok 10.174s ago 15.74ms
record: namespace_pod_container:container_cpu_usage_seconds_total:sum_rate expr: sum by(namespace, pod, container) (rate(container_cpu_usage_seconds_total{container!="POD",image!="",job="kubelet"}[5m])) ok 10.159s ago 16.86ms
record: namespace:container_memory_usage_bytes:sum expr: sum by(namespace) (container_memory_usage_bytes{container!="POD",image!="",job="kubelet"}) ok 10.142s ago 6.426ms
record: namespace:container_memory_usage_bytes:sum expr: sum by(namespace, label_name) (sum by(pod, namespace) (container_memory_usage_bytes{container!="POD",image!="",job="kubelet"}) * on(namespace, pod) group_left(label_name) kube_pod_labels{job="kube-state-metrics"}) ok 10.136s ago 10.69ms
record: namespace:kube_pod_container_resource_requests_memory_bytes:sum expr: sum by(namespace, label_name) (sum by(namespace, pod) (kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"} * on(endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)) * on(namespace, pod) group_left(label_name) kube_pod_labels{job="kube-state-metrics"}) ok 10.126s ago 10.64ms
record: namespace:kube_pod_container_resource_requests_cpu_cores:sum expr: sum by(namespace, label_name) (sum by(namespace, pod) (kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"} * on(endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)) * on(namespace, pod) group_left(label_name) kube_pod_labels{job="kube-state-metrics"}) ok 10.115s ago 8.06ms
record: mixin_pod_workload expr: sum by(namespace, workload, pod) (label_replace(label_replace(kube_pod_owner{job="kube-state-metrics",owner_kind="ReplicaSet"}, "replicaset", "$1", "owner_name", "(.*)") * on(replicaset, namespace) group_left(owner_name) kube_replicaset_owner{job="kube-state-metrics"}, "workload", "$1", "owner_name", "(.*)")) labels: workload_type: deployment ok 10.107s ago 2.993ms
record: mixin_pod_workload expr: sum by(namespace, workload, pod) (label_replace(kube_pod_owner{job="kube-state-metrics",owner_kind="DaemonSet"}, "workload", "$1", "owner_name", "(.*)")) labels: workload_type: daemonset ok 10.105s ago 408.1us
record: mixin_pod_workload expr: sum by(namespace, workload, pod) (label_replace(kube_pod_owner{job="kube-state-metrics",owner_kind="StatefulSet"}, "workload", "$1", "owner_name", "(.*)")) labels: workload_type: statefulset ok 10.104s ago 376.9us
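
The mixin_pod_workload entries show the recording-rule form of the same rule file: record instead of alert, no for clause or annotations, and a static label added to every resulting series. The DaemonSet entry above, reformatted as a sketch:

```yaml
- record: mixin_pod_workload
  expr: |
    sum by(namespace, workload, pod) (
      label_replace(kube_pod_owner{job="kube-state-metrics",owner_kind="DaemonSet"},
        "workload", "$1", "owner_name", "(.*)")
    )
  labels:
    workload_type: daemonset
```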

kube-apiserver.rules (last evaluation: 3.284s ago, evaluation time: 1.501s)

Rule State Error Last Evaluation Evaluation Time
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile expr: histogram_quantile(0.99, sum without(instance, pod) (rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m]))) labels: quantile: "0.99" ok 3.284s ago 520.2ms
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile expr: histogram_quantile(0.9, sum without(instance, pod) (rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m]))) labels: quantile: "0.9" ok 2.764s ago 504.6ms
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile expr: histogram_quantile(0.5, sum without(instance, pod) (rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m]))) labels: quantile: "0.5" ok 2.26s ago 476.1ms

kube-prometheus-node-alerting.rules (last evaluation: 7.361s ago, evaluation time: 2.038ms)

Rule State Error Last Evaluation Evaluation Time
alert: NodeDiskRunningFull expr: '(node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[6h], 3600 * 24) < 0)' for: 30m labels: severity: warning annotations: message: Device {{ $labels.device }} on node {{ $labels.instance }} will be full within the next 24 hours. ok 7.361s ago 1.628ms
alert: NodeDiskRunningFull expr: '(node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[30m], 3600 * 2) < 0)' for: 10m labels: severity: critical annotations: message: Device {{ $labels.device }} on node {{ $labels.instance }} will be full within the next 2 hours. ok 7.359s ago 396.6us

kube-prometheus-node-recording.rules (last evaluation: 703ms ago, evaluation time: 85.65ms)

Rule State Error Last Evaluation Evaluation Time
record: instance:node_cpu:rate:sum expr: sum by(instance) (rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) ok 703ms ago 17.25ms
record: instance:node_filesystem_usage:sum expr: sum by(instance) ((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"})) ok 686ms ago 431us
record: instance:node_network_receive_bytes:rate:sum expr: sum by(instance) (rate(node_network_receive_bytes_total[3m])) ok 686ms ago 6.07ms
record: instance:node_network_transmit_bytes:rate:sum expr: sum by(instance) (rate(node_network_transmit_bytes_total[3m])) ok 680ms ago 7.013ms
record: instance:node_cpu:ratio expr: sum without(cpu, mode) (rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) / on(instance) group_left() count by(instance) (sum by(instance, cpu) (node_cpu_seconds_total)) ok 673ms ago 31.85ms
record: cluster:node_cpu:sum_rate5m expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) ok 641ms ago 10.21ms
record: cluster:node_cpu:ratio expr: cluster:node_cpu_seconds_total:rate5m / count(sum by(instance, cpu) (node_cpu_seconds_total)) ok 631ms ago 12.79ms

kube-scheduler.rules (last evaluation: 2.546s ago, evaluation time: 1.594ms)

Rule State Error Last Evaluation Evaluation Time
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile expr: histogram_quantile(0.99, sum without(instance, pod) (rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m]))) labels: quantile: "0.99" ok 2.546s ago 473.6us
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile expr: histogram_quantile(0.99, sum without(instance, pod) (rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m]))) labels: quantile: "0.99" ok 2.546s ago 141.9us
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile expr: histogram_quantile(0.99, sum without(instance, pod) (rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m]))) labels: quantile: "0.99" ok 2.546s ago 135.9us
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile expr: histogram_quantile(0.9, sum without(instance, pod) (rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m]))) labels: quantile: "0.9" ok 2.546s ago 156.5us
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile expr: histogram_quantile(0.9, sum without(instance, pod) (rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m]))) labels: quantile: "0.9" ok 2.546s ago 151.6us
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile expr: histogram_quantile(0.9, sum without(instance, pod) (rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m]))) labels: quantile: "0.9" ok 2.546s ago 119us
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile expr: histogram_quantile(0.5, sum without(instance, pod) (rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m]))) labels: quantile: "0.5" ok 2.546s ago 125.5us
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile expr: histogram_quantile(0.5, sum without(instance, pod) (rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m]))) labels: quantile: "0.5" ok 2.546s ago 124.5us
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile expr: histogram_quantile(0.5, sum without(instance, pod) (rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m]))) labels: quantile: "0.5" ok 2.546s ago 127.4us

kubernetes-absent (last evaluation: 10.138s ago, evaluation time: 3.661ms)

Rule State Error Last Evaluation Evaluation Time
alert: AlertmanagerDown expr: absent(up{job="prometheus-prometheus-oper-alertmanager",namespace="monitoring"} == 1) for: 15m labels: severity: critical annotations: message: Alertmanager has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerdown ok 10.138s ago 499.1us
alert: KubeAPIDown expr: absent(up{job="apiserver"} == 1) for: 15m labels: severity: critical annotations: message: KubeAPI has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown ok 10.137s ago 357.5us
alert: KubeControllerManagerDown expr: absent(up{job="kube-controller-manager"} == 1) for: 15m labels: severity: critical annotations: message: KubeControllerManager has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontrollermanagerdown ok 10.137s ago 829us
alert: KubeSchedulerDown expr: absent(up{job="kube-scheduler"} == 1) for: 15m labels: severity: critical annotations: message: KubeScheduler has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeschedulerdown ok 10.136s ago 719.4us
alert: KubeStateMetricsDown expr: absent(up{job="kube-state-metrics"} == 1) for: 15m labels: severity: critical annotations: message: KubeStateMetrics has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricsdown ok 10.136s ago 219.8us
alert: KubeletDown expr: absent(up{job="kubelet"} == 1) for: 15m labels: severity: critical annotations: message: Kubelet has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown ok 10.136s ago 336.9us
alert: NodeExporterDown expr: absent(up{job="node-exporter"} == 1) for: 15m labels: severity: critical annotations: message: NodeExporter has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeexporterdown ok 10.135s ago 242.8us
alert: PrometheusDown expr: absent(up{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"} == 1) for: 15m labels: severity: critical annotations: message: Prometheus has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusdown ok 10.135s ago 216.7us
alert: PrometheusOperatorDown expr: absent(up{job="prometheus-prometheus-oper-operator",namespace="monitoring"} == 1) for: 15m labels: severity: critical annotations: message: PrometheusOperator has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatordown ok 10.135s ago 206.5us
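
Every rule in this group uses the same absent(up{job="..."} == 1) pattern, which fires when a scrape job vanishes from target discovery entirely (a plain up == 0 alert cannot fire if there are no targets left to report up). A sketch of extending the pattern to an additional job; the job name my-app and the alert name are hypothetical.

```yaml
- alert: MyAppDown
  expr: absent(up{job="my-app"} == 1)   # hypothetical extra scrape job
  for: 15m
  labels:
    severity: critical
  annotations:
    message: MyApp has disappeared from Prometheus target discovery.
```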

kubernetes-apps (last evaluation: 4.772s ago, evaluation time: 30.22ms)

Rule State Error Last Evaluation Evaluation Time
alert: KubePodCrashLooping expr: rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) * 60 * 5 > 0 for: 1h labels: severity: critical annotations: message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping ok 4.772s ago 8.106ms
alert: KubePodNotReady expr: sum by(namespace, pod) (kube_pod_status_phase{job="kube-state-metrics",phase=~"Failed|Pending|Unknown"}) > 0 for: 1h labels: severity: critical annotations: message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than an hour. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready ok 4.764s ago 10.6ms
alert: KubeDeploymentGenerationMismatch expr: kube_deployment_status_observed_generation{job="kube-state-metrics"} != kube_deployment_metadata_generation{job="kube-state-metrics"} for: 15m labels: severity: critical annotations: message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch ok 4.754s ago 4.108ms
alert: KubeDeploymentReplicasMismatch expr: kube_deployment_spec_replicas{job="kube-state-metrics"} != kube_deployment_status_replicas_available{job="kube-state-metrics"} for: 1h labels: severity: critical annotations: message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than an hour. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch ok 4.75s ago 4.117ms
alert: KubeStatefulSetReplicasMismatch expr: kube_statefulset_status_replicas_ready{job="kube-state-metrics"} != kube_statefulset_status_replicas{job="kube-state-metrics"} for: 15m labels: severity: critical annotations: message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch ok 4.746s ago 367us
alert: KubeStatefulSetGenerationMismatch expr: kube_statefulset_status_observed_generation{job="kube-state-metrics"} != kube_statefulset_metadata_generation{job="kube-state-metrics"} for: 15m labels: severity: critical annotations: message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch ok 4.746s ago 341.4us
alert: KubeStatefulSetUpdateNotRolledOut expr: max without(revision) (kube_statefulset_status_current_revision{job="kube-state-metrics"} unless kube_statefulset_status_update_revision{job="kube-state-metrics"}) * (kube_statefulset_replicas{job="kube-state-metrics"} != kube_statefulset_status_replicas_updated{job="kube-state-metrics"}) for: 15m labels: severity: critical annotations: message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout ok 4.745s ago 705.8us
alert: KubeDaemonSetRolloutStuck expr: kube_daemonset_status_number_ready{job="kube-state-metrics"} / kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} * 100 < 100 for: 15m labels: severity: critical annotations: message: Only {{ $value }}% of the desired Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are scheduled and ready. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck ok 4.745s ago 219.8us
alert: KubeDaemonSetNotScheduled expr: kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} - kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0 for: 10m labels: severity: warning annotations: message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled ok 4.745s ago 182.1us
alert: KubeDaemonSetMisScheduled expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0 for: 10m labels: severity: warning annotations: message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled ok 4.745s ago 94.55us
alert: KubeCronJobRunning expr: time() - kube_cronjob_next_schedule_time{job="kube-state-metrics"} > 3600 for: 1h labels: severity: warning annotations: message: CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecronjobrunning ok 4.745s ago 87.27us
alert: KubeJobCompletion expr: kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0 for: 1h labels: severity: warning annotations: message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than one hour to complete. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion ok 4.745s ago 913.2us
alert: KubeJobFailed expr: kube_job_status_failed{job="kube-state-metrics"} > 0 for: 1h labels: severity: warning annotations: message: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed ok 4.744s ago 336.6us
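
In KubePodCrashLooping, rate(...[15m]) yields restarts per second averaged over 15 minutes, and the * 60 * 5 factor rescales that to restarts per 5 minutes, which is the figure the message prints. A commented restatement of the expression (not a change to the deployed rule):

```yaml
- alert: KubePodCrashLooping
  expr: |
    # restarts/second averaged over 15m, rescaled to restarts per 5 minutes (60 * 5 = 300)
    rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) * 60 * 5 > 0
  for: 1h
  labels:
    severity: critical
```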

kubernetes-resources (last evaluation: 20.081s ago, evaluation time: 11.99ms)

Rule State Error Last Evaluation Evaluation Time
alert: KubeCPUOvercommit expr: sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum) / sum(kube_node_status_allocatable_cpu_cores) > (count(kube_node_status_allocatable_cpu_cores) - 1) / count(kube_node_status_allocatable_cpu_cores) for: 5m labels: severity: warning annotations: message: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit ok 20.081s ago 1.275ms
alert: KubeMemOvercommit expr: sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum) / sum(kube_node_status_allocatable_memory_bytes) > (count(kube_node_status_allocatable_memory_bytes) - 1) / count(kube_node_status_allocatable_memory_bytes) for: 5m labels: severity: warning annotations: message: Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit ok 20.08s ago 1.573ms
alert: KubeCPUOvercommit expr: sum(kube_resourcequota{job="kube-state-metrics",resource="cpu",type="hard"}) / sum(kube_node_status_allocatable_cpu_cores) > 1.5 for: 5m labels: severity: warning annotations: message: Cluster has overcommitted CPU resource requests for Namespaces. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit ok 20.078s ago 376.2us
alert: KubeMemOvercommit expr: sum(kube_resourcequota{job="kube-state-metrics",resource="memory",type="hard"}) / sum(kube_node_status_allocatable_memory_bytes{job="node-exporter"}) > 1.5 for: 5m labels: severity: warning annotations: message: Cluster has overcommitted memory resource requests for Namespaces. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit ok 20.078s ago 271.4us
alert: KubeQuotaExceeded expr: 100 * kube_resourcequota{job="kube-state-metrics",type="used"} / ignoring(instance, job, type) (kube_resourcequota{job="kube-state-metrics",type="hard"} > 0) > 90 for: 15m labels: severity: warning annotations: message: Namespace {{ $labels.namespace }} is using {{ printf "%0.0f" $value }}% of its {{ $labels.resource }} quota. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded ok 20.078s ago 309.4us
alert: CPUThrottlingHigh expr: 100 * sum by(container, pod, namespace) (increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) / sum by(container, pod, namespace) (increase(container_cpu_cfs_periods_total[5m])) > 25 for: 15m labels: severity: warning annotations: message: '{{ printf "%0.0f" $value }}% throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh ok 20.078s ago 8.156ms
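
The overcommit rules compare total resource requests with allocatable capacity minus one node's worth: because of the (count(...) - 1) / count(...) term, a hypothetical cluster of 4 equally sized nodes has a threshold of 3/4, so the alert fires once requests exceed 75% of allocatable CPU, i.e. once the workload could no longer be rescheduled after losing a node. A commented restatement of KubeCPUOvercommit:

```yaml
- alert: KubeCPUOvercommit
  expr: |
    # total CPU requests as a fraction of total allocatable CPU ...
    sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum)
      / sum(kube_node_status_allocatable_cpu_cores)
    # ... compared against the capacity that remains after one node failure,
    # e.g. (4 - 1) / 4 = 0.75 for a 4-node cluster
      > (count(kube_node_status_allocatable_cpu_cores) - 1) / count(kube_node_status_allocatable_cpu_cores)
  for: 5m
  labels:
    severity: warning
```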

kubernetes-storage (last evaluation: 9.916s ago, evaluation time: 13.34ms)

Rule State Error Last Evaluation Evaluation Time
alert: KubePersistentVolumeUsageCritical expr: 100 * kubelet_volume_stats_available_bytes{job="kubelet"} / kubelet_volume_stats_capacity_bytes{job="kubelet"} < 3 for: 1m labels: severity: critical annotations: message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ printf "%0.2f" $value }}% free. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical ok 9.916s ago 3.318ms
alert: KubePersistentVolumeFullInFourDays expr: 100 * (kubelet_volume_stats_available_bytes{job="kubelet"} / kubelet_volume_stats_capacity_bytes{job="kubelet"}) < 15 and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0 for: 5m labels: severity: critical annotations: message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ printf "%0.2f" $value }}% is available. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays ok 9.913s ago 8.996ms
alert: KubePersistentVolumeErrors expr: kube_persistentvolume_status_phase{job="kube-state-metrics",phase=~"Failed|Pending"} > 0 for: 5m labels: severity: critical annotations: message: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors ok 9.904s ago 1.005ms
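
KubePersistentVolumeFullInFourDays combines a static threshold with a linear forecast: less than 15% of the volume is currently available, and a linear fit over the last 6h of available bytes, extrapolated 4 * 24 * 3600 = 345600 seconds ahead, drops below zero. A commented restatement of the expression:

```yaml
- alert: KubePersistentVolumeFullInFourDays
  expr: |
    # less than 15% of the volume's capacity is still available ...
    100 * (kubelet_volume_stats_available_bytes{job="kubelet"} / kubelet_volume_stats_capacity_bytes{job="kubelet"}) < 15
    and
    # ... and the 6h linear trend reaches zero within 4 days (4 * 24 * 3600 s)
    predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0
  for: 5m
  labels:
    severity: critical
```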

kubernetes-system (last evaluation: 29.139s ago, evaluation time: 94.09ms)

Rule State Error Last Evaluation Evaluation Time
alert: KubeNodeNotReady expr: kube_node_status_condition{condition="Ready",job="kube-state-metrics",status="true"} == 0 for: 1h labels: severity: warning annotations: message: '{{ $labels.node }} has been unready for more than an hour.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready ok 29.14s ago 406.1us
alert: KubeVersionMismatch expr: count(count by(gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"}, "gitVersion", "$1", "gitVersion", "(v[0-9]*.[0-9]*.[0-9]*).*"))) > 1 for: 1h labels: severity: warning annotations: message: There are {{ $value }} different semantic versions of Kubernetes components running. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch ok 29.139s ago 703.4us
alert: KubeClientErrors expr: (sum by(instance, job) (rate(rest_client_requests_total{code=~"5.."}[5m])) / sum by(instance, job) (rate(rest_client_requests_total[5m]))) * 100 > 1 for: 15m labels: severity: warning annotations: message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ printf "%0.0f" $value }}% errors. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors ok 29.139s ago 1.972ms
alert: KubeClientErrors expr: sum by(instance, job) (rate(ksm_scrape_error_total{job="kube-state-metrics"}[5m])) > 0.1 for: 15m labels: severity: warning annotations: message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ printf "%0.0f" $value }} errors / second. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors ok 29.137s ago 150.6us
alert: KubeletTooManyPods expr: kubelet_running_pod_count{job="kubelet"} > 110 * 0.9 for: 15m labels: severity: warning annotations: message: Kubelet {{ $labels.instance }} is running {{ $value }} Pods, close to the limit of 110. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods ok 29.137s ago 84.1us
alert: KubeAPILatencyHigh expr: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1 for: 10m labels: severity: warning annotations: message: The API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh ok 29.137s ago 2.367ms
alert: KubeAPILatencyHigh expr: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4 for: 10m labels: severity: critical annotations: message: The API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh ok 29.135s ago 2.166ms
alert: KubeAPIErrorsHigh expr: sum(rate(apiserver_request_total{code=~"^(?:5..)$",job="apiserver"}[5m])) / sum(rate(apiserver_request_total{job="apiserver"}[5m])) * 100 > 3 for: 10m labels: severity: critical annotations: message: API server is returning errors for {{ $value }}% of requests. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh ok 29.133s ago 25.94ms
alert: KubeAPIErrorsHigh expr: sum(rate(apiserver_request_total{code=~"^(?:5..)$",job="apiserver"}[5m])) / sum(rate(apiserver_request_total{job="apiserver"}[5m])) * 100 > 1 for: 10m labels: severity: warning annotations: message: API server is returning errors for {{ $value }}% of requests. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh ok 29.108s ago 21.82ms
alert: KubeAPIErrorsHigh expr: sum by(resource, subresource, verb) (rate(apiserver_request_total{code=~"^(?:5..)$",job="apiserver"}[5m])) / sum by(resource, subresource, verb) (rate(apiserver_request_total{job="apiserver"}[5m])) * 100 > 10 for: 10m labels: severity: critical annotations: message: API server is returning errors for {{ $value }}% of requests for {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh ok 29.086s ago 19.16ms
alert: KubeAPIErrorsHigh expr: sum by(resource, subresource, verb) (rate(apiserver_request_total{code=~"^(?:5..)$",job="apiserver"}[5m])) / sum by(resource, subresource, verb) (rate(apiserver_request_total{job="apiserver"}[5m])) * 100 > 5 for: 10m labels: severity: warning annotations: message: API server is returning errors for {{ $value }}% of requests for {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh ok 29.067s ago 17.65ms
alert: KubeClientCertificateExpiration expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by(job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800 labels: severity: warning annotations: message: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration ok 29.049s ago 883.6us
alert: KubeClientCertificateExpiration expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by(job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 labels: severity: critical annotations: message: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration ok 29.049s ago 748.7us

node-network (last evaluation: 7.022s ago, evaluation time: 4.524ms)

Rule State Error Last Evaluation Evaluation Time
alert: NetworkReceiveErrors expr: rate(node_network_receive_errs_total{device!~"veth.+",job="node-exporter"}[2m]) > 0 for: 2m labels: severity: warning annotations: message: Network interface "{{ $labels.device }}" showing receive errors on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}. ok 7.022s ago 1.853ms
alert: NetworkTransmitErrors expr: rate(node_network_transmit_errs_total{device!~"veth.+",job="node-exporter"}[2m]) > 0 for: 2m labels: severity: warning annotations: message: Network interface "{{ $labels.device }}" showing transmit errors on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}. ok 7.02s ago 1.396ms
alert: NodeNetworkInterfaceFlapping expr: changes(node_network_up{device!~"veth.+",job="node-exporter"}[2m]) > 2 for: 2m labels: severity: warning annotations: message: Network interface "{{ $labels.device }}" changing its up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}. ok 7.019s ago 1.26ms

node-time (last evaluation: 11.235s ago, evaluation time: 619.6us)

Rule State Error Last Evaluation Evaluation Time
alert: ClockSkewDetected expr: abs(node_timex_offset_seconds{job="node-exporter"}) > 0.05 for: 2m labels: severity: warning annotations: message: Clock skew detected on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}. Ensure NTP is configured correctly on this host. ok 11.235s ago 609.7us

node.rules (last evaluation: 26.467s ago, evaluation time: 97.41ms)

Rule State Error Last Evaluation Evaluation Time
record: :kube_pod_info_node_count: expr: sum(min by(node) (kube_pod_info)) ok 26.467s ago 6.721ms
record: node_namespace_pod:kube_pod_info: expr: max by(node, namespace, pod) (label_replace(kube_pod_info{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")) ok 26.461s ago 7.875ms
record: node:node_num_cpu:sum expr: count by(node) (sum by(node, cpu) (node_cpu_seconds_total{job="node-exporter"} * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:)) ok 26.453s ago 18.97ms
record: :node_cpu_utilisation:avg1m expr: 1 - avg(rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m])) ok 26.434s ago 2.145ms
record: node:node_cpu_utilisation:avg1m expr: 1 - avg by(node) (rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m]) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) ok 26.432s ago 5.037ms
record: node:cluster_cpu_utilisation:ratio expr: node:node_cpu_utilisation:avg1m * node:node_num_cpu:sum / scalar(sum(node:node_num_cpu:sum)) ok 26.427s ago 255us
record: :node_cpu_saturation_load1: expr: sum(node_load1{job="node-exporter"}) / sum(node:node_num_cpu:sum) ok 26.427s ago 180.8us
record: node:node_cpu_saturation_load1: expr: sum by(node) (node_load1{job="node-exporter"} * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) / node:node_num_cpu:sum ok 26.427s ago 2.809ms
record: :node_memory_utilisation: expr: 1 - sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"}) / sum(node_memory_MemTotal_bytes{job="node-exporter"}) ok 26.424s ago 453.7us
record: :node_memory_MemFreeCachedBuffers_bytes:sum expr: sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"}) ok 26.424s ago 277.4us
record: :node_memory_MemTotal_bytes:sum expr: sum(node_memory_MemTotal_bytes{job="node-exporter"}) ok 26.424s ago 107.7us
record: node:node_memory_bytes_available:sum expr: sum by(node) ((node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"}) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) ok 26.424s ago 2.92ms
record: node:node_memory_bytes_total:sum expr: sum by(node) (node_memory_MemTotal_bytes{job="node-exporter"} * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) ok 26.421s ago 2.723ms
record: node:node_memory_utilisation:ratio expr: (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum) / node:node_memory_bytes_total:sum ok 26.418s ago 229us
record: node:cluster_memory_utilisation:ratio expr: (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum) / scalar(sum(node:node_memory_bytes_total:sum)) ok 26.418s ago 221.2us
record: :node_memory_swap_io_bytes:sum_rate expr: 1000 * sum((rate(node_vmstat_pgpgin{job="node-exporter"}[1m]) + rate(node_vmstat_pgpgout{job="node-exporter"}[1m]))) ok 26.418s ago 273.5us
record: node:node_memory_utilisation: expr: 1 - sum by(node) ((node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"}) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) / sum by(node) (node_memory_MemTotal_bytes{job="node-exporter"} * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) ok 26.418s ago 5.574ms
record: node:node_memory_utilisation_2: expr: 1 - (node:node_memory_bytes_available:sum / node:node_memory_bytes_total:sum) ok 26.413s ago 201.7us
record: node:node_memory_swap_io_bytes:sum_rate expr: 1000 * sum by(node) ((rate(node_vmstat_pgpgin{job="node-exporter"}[1m]) + rate(node_vmstat_pgpgout{job="node-exporter"}[1m])) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) ok 26.413s ago 2.891ms
record: :node_disk_utilisation:avg_irate expr: avg(irate(node_disk_io_time_seconds_total{device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+",job="node-exporter"}[1m])) ok 26.41s ago 1.36ms
record: node:node_disk_utilisation:avg_irate expr: avg by(node) (irate(node_disk_io_time_seconds_total{device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+",job="node-exporter"}[1m]) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) ok 26.408s ago 3.981ms
record: :node_disk_saturation:avg_irate expr: avg(irate(node_disk_io_time_weighted_seconds_total{device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+",job="node-exporter"}[1m])) ok 26.405s ago 1.332ms
record: node:node_disk_saturation:avg_irate expr: avg by(node) (irate(node_disk_io_time_weighted_seconds_total{device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+",job="node-exporter"}[1m]) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) ok 26.403s ago 3.994ms
record: node:node_filesystem_usage: expr: max by(instance, namespace, pod, device) ((node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} - node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) ok 26.4s ago 471.4us
record: node:node_filesystem_avail: expr: max by(instance, namespace, pod, device) (node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) ok 26.399s ago 304.3us
record: :node_net_utilisation:sum_irate expr: sum(irate(node_network_receive_bytes_total{device!~"veth.+",job="node-exporter"}[1m])) + sum(irate(node_network_transmit_bytes_total{device!~"veth.+",job="node-exporter"}[1m])) ok 26.399s ago 1.911ms
record: node:node_net_utilisation:sum_irate expr: sum by(node) ((irate(node_network_receive_bytes_total{device!~"veth.+",job="node-exporter"}[1m]) + irate(node_network_transmit_bytes_total{device!~"veth.+",job="node-exporter"}[1m])) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) ok 26.397s ago 4.652ms
record: :node_net_saturation:sum_irate expr: sum(irate(node_network_receive_drop_total{device!~"veth.+",job="node-exporter"}[1m])) + sum(irate(node_network_transmit_drop_total{device!~"veth.+",job="node-exporter"}[1m])) ok 26.393s ago 1.638ms
record: node:node_net_saturation:sum_irate expr: sum by(node) ((irate(node_network_receive_drop_total{device!~"veth.+",job="node-exporter"}[1m]) + irate(node_network_transmit_drop_total{device!~"veth.+",job="node-exporter"}[1m])) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) ok 26.391s ago 8.367ms
record: node:node_inodes_total: expr: max by(node) (max by(node, host_ip) (kube_pod_info{host_ip!="",job="kube-state-metrics"}) * on(host_ip) group_right(node) label_replace((max by(instance) (node_filesystem_files{job="node-exporter",mountpoint="/"})), "host_ip", "$1", "instance", "(.*):.*")) ok 26.383s ago 5.084ms
record: node:node_inodes_free: expr: max by(node) (max by(node, host_ip) (kube_pod_info{host_ip!="",job="kube-state-metrics"}) * on(host_ip) group_right(node) label_replace((max by(instance) (node_filesystem_files_free{job="node-exporter",mountpoint="/"})), "host_ip", "$1", "instance", "(.*):.*")) ok 26.378s ago 4.383ms
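
Most per-node aggregations in this group rely on the same join: node-exporter series carry only namespace and pod labels, so they are multiplied by node_namespace_pod:kube_pod_info: with on(namespace, pod) group_left(node) to attach the node label before aggregating. The node:node_num_cpu:sum rule above, reformatted with comments to show the pattern:

```yaml
- record: node:node_num_cpu:sum
  expr: |
    count by(node) (
      sum by(node, cpu) (
        node_cpu_seconds_total{job="node-exporter"}
          # join on the exporting pod to pull in the node label
          * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:
      )
    )
```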

prometheus-operator (last evaluation: 29.697s ago, evaluation time: 634.7us)

Rule State Error Last Evaluation Evaluation Time
alert: PrometheusOperatorReconcileErrors expr: rate(prometheus_operator_reconcile_errors_total{job="prometheus-prometheus-oper-operator",namespace="monitoring"}[5m]) > 0.1 for: 10m labels: severity: warning annotations: message: Errors while reconciling {{ $labels.controller }} in {{ $labels.namespace }} Namespace. ok 29.697s ago 427us
alert: PrometheusOperatorNodeLookupErrors expr: rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-prometheus-oper-operator",namespace="monitoring"}[5m]) > 0.1 for: 10m labels: severity: warning annotations: message: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace. ok 29.697s ago 192.5us

prometheus (last evaluation: 26.206s ago, evaluation time: 3.878ms)

Rule State Error Last Evaluation Evaluation Time
alert: PrometheusBadConfig expr: max_over_time(prometheus_config_last_reload_successful{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) == 0 for: 10m labels: severity: critical annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to reload its configuration. summary: Failed Prometheus configuration reload. ok 26.206s ago 514.9us
alert: PrometheusNotificationQueueRunningFull expr: (predict_linear(prometheus_notifications_queue_length{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m], 60 * 30) > min_over_time(prometheus_notifications_queue_capacity{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])) for: 15m labels: severity: warning annotations: description: Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}} is running full. summary: Prometheus alert notification queue predicted to run full in less than 30m. ok 26.206s ago 516.6us
alert: PrometheusErrorSendingAlertsToSomeAlertmanagers expr: (rate(prometheus_notifications_errors_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])) * 100 > 1 for: 15m labels: severity: warning annotations: description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}.' summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager. ok 26.205s ago 400.8us
alert: PrometheusErrorSendingAlertsToAnyAlertmanager expr: min without(alertmanager) (rate(prometheus_notifications_errors_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])) * 100 > 3 for: 15m labels: severity: critical annotations: description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.' summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager. ok 26.205s ago 306.5us
alert: PrometheusNotConnectedToAlertmanagers expr: max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) < 1 for: 10m labels: severity: warning annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to any Alertmanagers. summary: Prometheus is not connected to any Alertmanagers. ok 26.205s ago 142.2us
alert: PrometheusTSDBReloadsFailing expr: increase(prometheus_tsdb_reloads_failures_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[3h]) > 0 for: 4h labels: severity: warning annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} reload failures over the last 3h. summary: Prometheus has issues reloading blocks from disk. ok 26.205s ago 297.3us
alert: PrometheusTSDBCompactionsFailing expr: increase(prometheus_tsdb_compactions_failed_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[3h]) > 0 for: 4h labels: severity: warning annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} compaction failures over the last 3h. summary: Prometheus has issues compacting blocks. ok 26.205s ago 251.4us
alert: PrometheusTSDBWALCorruptions expr: increase(tsdb_wal_corruptions_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[3h]) > 0 for: 4h labels: severity: warning annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} corruptions of the write-ahead log (WAL) over the last 3h. summary: Prometheus is detecting WAL corruptions. ok 26.205s ago 127.1us
alert: PrometheusNotIngestingSamples expr: rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) <= 0 for: 10m labels: severity: warning annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting samples. summary: Prometheus is not ingesting samples. ok 26.205s ago 151.9us
alert: PrometheusDuplicateTimestamps expr: rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) > 0 for: 10m labels: severity: warning annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{$value | humanize}} samples/s with different values but duplicated timestamp. summary: Prometheus is dropping samples with duplicate timestamps. ok 26.205s ago 136.3us
alert: PrometheusOutOfOrderTimestamps expr: rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) > 0 for: 10m labels: severity: warning annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{$value | humanize}} samples/s with timestamps arriving out of order. summary: Prometheus drops samples with out-of-order timestamps. ok 26.205s ago 145us
alert: PrometheusRemoteStorageFailures expr: (rate(prometheus_remote_storage_failed_samples_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) / (rate(prometheus_remote_storage_failed_samples_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) + rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]))) * 100 > 1 for: 15m labels: severity: critical annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send {{ printf "%.1f" $value }}% of the samples to queue {{$labels.queue}}. summary: Prometheus fails to send samples to remote storage. ok 26.205s ago 324.1us
alert: PrometheusRemoteWriteBehind expr: (max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) - on(job, instance) group_right() max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])) > 120 for: 15m labels: severity: critical annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write is {{ printf "%.1f" $value }}s behind for queue {{$labels.queue}}. summary: Prometheus remote write is behind. ok 26.205s ago 240.3us
alert: PrometheusRuleFailures expr: increase(prometheus_rule_evaluation_failures_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) > 0 for: 15m labels: severity: critical annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to evaluate {{ printf "%.0f" $value }} rules in the last 5m. summary: Prometheus is failing rule evaluations. ok 26.205s ago 138.4us
alert: PrometheusMissingRuleEvaluations expr: increase(prometheus_rule_group_iterations_missed_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) > 0 for: 15m labels: severity: warning annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed {{ printf "%.0f" $value }} rule group evaluations in the last 5m. summary: Prometheus is missing rule evaluations due to slow rule group evaluation. ok 26.205s ago 143.8us
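
Given the prometheus-prometheus-oper-* job names throughout this listing, these groups are presumably delivered to Prometheus by the Prometheus Operator rather than as static rule files. A minimal sketch of the PrometheusRule object that would carry one of them; the metadata name is a placeholder, and in practice the object's labels would have to match the operator's rule selector.

```yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: prometheus-rules        # placeholder
  namespace: monitoring
spec:
  groups:
    - name: prometheus
      rules:
        - alert: PrometheusBadConfig
          expr: max_over_time(prometheus_config_last_reload_successful{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) == 0
          for: 10m
          labels:
            severity: critical
          annotations:
            description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to reload its configuration.
            summary: Failed Prometheus configuration reload.
```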