Rules

alertmanager.rules

28.978s ago

965.8us

Rule State Error Last Evaluation Evaluation Time
alert: AlertmanagerConfigInconsistent expr: count_values by(service) ("config_hash", alertmanager_config_hash{job="prometheus-prometheus-oper-alertmanager",namespace="monitoring"}) / on(service) group_left() label_replace(prometheus_operator_spec_replicas{controller="alertmanager",job="prometheus-prometheus-oper-operator",namespace="monitoring"}, "service", "$1", "name", "(.*)") != 1 for: 5m labels: severity: critical annotations: message: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync. ok 28.979s ago 622us
alert: AlertmanagerFailedReload expr: alertmanager_config_last_reload_successful{job="prometheus-prometheus-oper-alertmanager",namespace="monitoring"} == 0 for: 10m labels: severity: warning annotations: message: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}. ok 28.978s ago 125.4us
alert: AlertmanagerMembersInconsistent expr: alertmanager_cluster_members{job="prometheus-prometheus-oper-alertmanager",namespace="monitoring"} != on(service) group_left() count by(service) (alertmanager_cluster_members{job="prometheus-prometheus-oper-alertmanager",namespace="monitoring"}) for: 5m labels: severity: critical annotations: message: Alertmanager has not found all other members of the cluster. ok 28.979s ago 201.9us

etcd

11.251s ago

4.752ms

Rule State Error Last Evaluation Evaluation Time
alert: etcdInsufficientMembers expr: sum by(job) (up{job=~".*etcd.*"} == bool 1) < ((count by(job) (up{job=~".*etcd.*"}) + 1) / 2) for: 3m labels: severity: critical annotations: message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).' ok 11.252s ago 1.083ms
alert: etcdNoLeader expr: etcd_server_has_leader{job=~".*etcd.*"} == 0 for: 1m labels: severity: critical annotations: message: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.' ok 11.251s ago 218.7us
alert: etcdHighNumberOfLeaderChanges expr: rate(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}[15m]) > 3 for: 15m labels: severity: warning annotations: message: 'etcd cluster "{{ $labels.job }}": instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour.' ok 11.251s ago 175.4us
alert: etcdHighNumberOfFailedGRPCRequests expr: 100 * sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{grpc_code!="OK",job=~".*etcd.*"}[5m])) / sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) > 1 for: 10m labels: severity: warning annotations: message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.' ok 11.251s ago 328.7us
alert: etcdHighNumberOfFailedGRPCRequests expr: 100 * sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{grpc_code!="OK",job=~".*etcd.*"}[5m])) / sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) > 5 for: 5m labels: severity: critical annotations: message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.' ok 11.251s ago 531.6us
alert: etcdGRPCRequestsSlow expr: histogram_quantile(0.99, sum by(job, instance, grpc_service, grpc_method, le) (rate(grpc_server_handling_seconds_bucket{grpc_type="unary",job=~".*etcd.*"}[5m]))) > 0.15 for: 10m labels: severity: critical annotations: message: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.' ok 11.251s ago 360.3us
alert: etcdMemberCommunicationSlow expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.15 for: 10m labels: severity: warning annotations: message: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.' ok 11.251s ago 265us
alert: etcdHighNumberOfFailedProposals expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5 for: 15m labels: severity: warning annotations: message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last hour on etcd instance {{ $labels.instance }}.' ok 11.251s ago 218.2us
alert: etcdHighFsyncDurations expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.5 for: 10m labels: severity: warning annotations: message: 'etcd cluster "{{ $labels.job }}": 99th percentile fync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.' ok 11.251s ago 245.5us
alert: etcdHighCommitDurations expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.25 for: 10m labels: severity: warning annotations: message: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.' ok 11.251s ago 242us
alert: etcdHighNumberOfFailedHTTPRequests expr: sum by(method) (rate(etcd_http_failed_total{code!="404",job=~".*etcd.*"}[5m])) / sum by(method) (rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) > 0.01 for: 10m labels: severity: warning annotations: message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}' ok 11.251s ago 473.7us
alert: etcdHighNumberOfFailedHTTPRequests expr: sum by(method) (rate(etcd_http_failed_total{code!="404",job=~".*etcd.*"}[5m])) / sum by(method) (rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) > 0.05 for: 10m labels: severity: critical annotations: message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}.' ok 11.251s ago 425.5us
alert: etcdHTTPRequestsSlow expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15 for: 10m labels: severity: warning annotations: message: etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow. ok 11.25s ago 147.3us

general.rules

26.934s ago

2.555ms

Rule State Error Last Evaluation Evaluation Time
alert: TargetDown expr: 100 * (count by(job) (up == 0) / count by(job) (up)) > 10 for: 10m labels: severity: warning annotations: message: '{{ $value }}% of the {{ $labels.job }} targets are down.' ok 26.934s ago 2.268ms
alert: Watchdog expr: vector(1) labels: severity: none annotations: message: | This is an alert meant to ensure that the entire alerting pipeline is functional. This alert is always firing, therefore it should always be firing in Alertmanager and always fire against a receiver. There are integrations with various notification mechanisms that send a notification when this alert is not firing. For example the "DeadMansSnitch" integration in PagerDuty. ok 26.932s ago 269.3us

k8s.rules

15.227s ago

96.36ms

Rule State Error Last Evaluation Evaluation Time
record: namespace:container_cpu_usage_seconds_total:sum_rate expr: sum by(namespace) (rate(container_cpu_usage_seconds_total{container!="POD",image!="",job="kubelet"}[5m])) ok 15.227s ago 19.55ms
record: namespace_pod_container:container_cpu_usage_seconds_total:sum_rate expr: sum by(namespace, pod, container) (rate(container_cpu_usage_seconds_total{container!="POD",image!="",job="kubelet"}[5m])) ok 15.208s ago 15.09ms
record: namespace:container_memory_usage_bytes:sum expr: sum by(namespace) (container_memory_usage_bytes{container!="POD",image!="",job="kubelet"}) ok 15.193s ago 7.66ms
record: namespace:container_memory_usage_bytes:sum expr: sum by(namespace, label_name) (sum by(pod, namespace) (container_memory_usage_bytes{container!="POD",image!="",job="kubelet"}) * on(namespace, pod) group_left(label_name) kube_pod_labels{job="kube-state-metrics"}) ok 15.185s ago 18.62ms
record: namespace:kube_pod_container_resource_requests_memory_bytes:sum expr: sum by(namespace, label_name) (sum by(namespace, pod) (kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"} * on(endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)) * on(namespace, pod) group_left(label_name) kube_pod_labels{job="kube-state-metrics"}) ok 15.167s ago 19.46ms
record: namespace:kube_pod_container_resource_requests_cpu_cores:sum expr: sum by(namespace, label_name) (sum by(namespace, pod) (kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"} * on(endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)) * on(namespace, pod) group_left(label_name) kube_pod_labels{job="kube-state-metrics"}) ok 15.148s ago 11.38ms
record: mixin_pod_workload expr: sum by(namespace, workload, pod) (label_replace(label_replace(kube_pod_owner{job="kube-state-metrics",owner_kind="ReplicaSet"}, "replicaset", "$1", "owner_name", "(.*)") * on(replicaset, namespace) group_left(owner_name) kube_replicaset_owner{job="kube-state-metrics"}, "workload", "$1", "owner_name", "(.*)")) labels: workload_type: deployment ok 15.136s ago 3.619ms
record: mixin_pod_workload expr: sum by(namespace, workload, pod) (label_replace(kube_pod_owner{job="kube-state-metrics",owner_kind="DaemonSet"}, "workload", "$1", "owner_name", "(.*)")) labels: workload_type: daemonset ok 15.133s ago 477.1us
record: mixin_pod_workload expr: sum by(namespace, workload, pod) (label_replace(kube_pod_owner{job="kube-state-metrics",owner_kind="StatefulSet"}, "workload", "$1", "owner_name", "(.*)")) labels: workload_type: statefulset ok 15.133s ago 457.5us

kube-apiserver.rules

8.336s ago

1.305s

Rule State Error Last Evaluation Evaluation Time
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile expr: histogram_quantile(0.99, sum without(instance, pod) (rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m]))) labels: quantile: "0.99" ok 8.337s ago 511.7ms
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile expr: histogram_quantile(0.9, sum without(instance, pod) (rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m]))) labels: quantile: "0.9" ok 7.825s ago 393.3ms
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile expr: histogram_quantile(0.5, sum without(instance, pod) (rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m]))) labels: quantile: "0.5" ok 7.432s ago 400.1ms

kube-prometheus-node-alerting.rules

12.413s ago

3.32ms

Rule State Error Last Evaluation Evaluation Time
alert: NodeDiskRunningFull expr: '(node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[6h], 3600 * 24) < 0)' for: 30m labels: severity: warning annotations: message: Device {{ $labels.device }} on node {{ $labels.instance }} will be full within the next 24 hours. ok 12.413s ago 2.615ms
alert: NodeDiskRunningFull expr: '(node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[30m], 3600 * 2) < 0)' for: 10m labels: severity: critical annotations: message: Device {{ $labels.device }} on node {{ $labels.instance }} will be full within the next 2 hours. ok 12.41s ago 672.7us

kube-prometheus-node-recording.rules

5.755s ago

94.62ms

Rule State Error Last Evaluation Evaluation Time
record: instance:node_cpu:rate:sum expr: sum by(instance) (rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) ok 5.755s ago 15.18ms
record: instance:node_filesystem_usage:sum expr: sum by(instance) ((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"})) ok 5.74s ago 358.5us
record: instance:node_network_receive_bytes:rate:sum expr: sum by(instance) (rate(node_network_receive_bytes_total[3m])) ok 5.74s ago 5.133ms
record: instance:node_network_transmit_bytes:rate:sum expr: sum by(instance) (rate(node_network_transmit_bytes_total[3m])) ok 5.735s ago 5.845ms
record: instance:node_cpu:ratio expr: sum without(cpu, mode) (rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) / on(instance) group_left() count by(instance) (sum by(instance, cpu) (node_cpu_seconds_total)) ok 5.729s ago 34.15ms
record: cluster:node_cpu:sum_rate5m expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) ok 5.695s ago 15.02ms
record: cluster:node_cpu:ratio expr: cluster:node_cpu_seconds_total:rate5m / count(sum by(instance, cpu) (node_cpu_seconds_total)) ok 5.681s ago 18.89ms

kube-scheduler.rules

7.598s ago

2.23ms

Rule State Error Last Evaluation Evaluation Time
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile expr: histogram_quantile(0.99, sum without(instance, pod) (rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m]))) labels: quantile: "0.99" ok 7.598s ago 643.1us
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile expr: histogram_quantile(0.99, sum without(instance, pod) (rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m]))) labels: quantile: "0.99" ok 7.598s ago 213.5us
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile expr: histogram_quantile(0.99, sum without(instance, pod) (rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m]))) labels: quantile: "0.99" ok 7.598s ago 196.2us
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile expr: histogram_quantile(0.9, sum without(instance, pod) (rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m]))) labels: quantile: "0.9" ok 7.598s ago 191.9us
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile expr: histogram_quantile(0.9, sum without(instance, pod) (rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m]))) labels: quantile: "0.9" ok 7.598s ago 198.4us
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile expr: histogram_quantile(0.9, sum without(instance, pod) (rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m]))) labels: quantile: "0.9" ok 7.598s ago 189.4us
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile expr: histogram_quantile(0.5, sum without(instance, pod) (rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m]))) labels: quantile: "0.5" ok 7.598s ago 197.5us
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile expr: histogram_quantile(0.5, sum without(instance, pod) (rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m]))) labels: quantile: "0.5" ok 7.598s ago 181us
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile expr: histogram_quantile(0.5, sum without(instance, pod) (rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m]))) labels: quantile: "0.5" ok 7.598s ago 186.4us

kubernetes-absent

15.19s ago

3.362ms

Rule State Error Last Evaluation Evaluation Time
alert: AlertmanagerDown expr: absent(up{job="prometheus-prometheus-oper-alertmanager",namespace="monitoring"} == 1) for: 15m labels: severity: critical annotations: message: Alertmanager has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerdown ok 15.19s ago 468.9us
alert: KubeAPIDown expr: absent(up{job="apiserver"} == 1) for: 15m labels: severity: critical annotations: message: KubeAPI has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown ok 15.19s ago 339.3us
alert: KubeControllerManagerDown expr: absent(up{job="kube-controller-manager"} == 1) for: 15m labels: severity: critical annotations: message: KubeControllerManager has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontrollermanagerdown ok 15.19s ago 739.4us
alert: KubeSchedulerDown expr: absent(up{job="kube-scheduler"} == 1) for: 15m labels: severity: critical annotations: message: KubeScheduler has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeschedulerdown ok 15.189s ago 631.7us
alert: KubeStateMetricsDown expr: absent(up{job="kube-state-metrics"} == 1) for: 15m labels: severity: critical annotations: message: KubeStateMetrics has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricsdown ok 15.189s ago 240.1us
alert: KubeletDown expr: absent(up{job="kubelet"} == 1) for: 15m labels: severity: critical annotations: message: Kubelet has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown ok 15.189s ago 241.5us
alert: NodeExporterDown expr: absent(up{job="node-exporter"} == 1) for: 15m labels: severity: critical annotations: message: NodeExporter has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeexporterdown ok 15.189s ago 261.7us
alert: PrometheusDown expr: absent(up{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"} == 1) for: 15m labels: severity: critical annotations: message: Prometheus has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusdown ok 15.188s ago 211.5us
alert: PrometheusOperatorDown expr: absent(up{job="prometheus-prometheus-oper-operator",namespace="monitoring"} == 1) for: 15m labels: severity: critical annotations: message: PrometheusOperator has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatordown ok 15.188s ago 195us

kubernetes-apps

9.825s ago

33.86ms

Rule State Error Last Evaluation Evaluation Time
alert: KubePodCrashLooping expr: rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) * 60 * 5 > 0 for: 1h labels: severity: critical annotations: message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping ok 9.825s ago 9.493ms
alert: KubePodNotReady expr: sum by(namespace, pod) (kube_pod_status_phase{job="kube-state-metrics",phase=~"Failed|Pending|Unknown"}) > 0 for: 1h labels: severity: critical annotations: message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than an hour. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready ok 9.816s ago 12.33ms
alert: KubeDeploymentGenerationMismatch expr: kube_deployment_status_observed_generation{job="kube-state-metrics"} != kube_deployment_metadata_generation{job="kube-state-metrics"} for: 15m labels: severity: critical annotations: message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch ok 9.804s ago 4.634ms
alert: KubeDeploymentReplicasMismatch expr: kube_deployment_spec_replicas{job="kube-state-metrics"} != kube_deployment_status_replicas_available{job="kube-state-metrics"} for: 1h labels: severity: critical annotations: message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than an hour. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch ok 9.799s ago 4.521ms
alert: KubeStatefulSetReplicasMismatch expr: kube_statefulset_status_replicas_ready{job="kube-state-metrics"} != kube_statefulset_status_replicas{job="kube-state-metrics"} for: 15m labels: severity: critical annotations: message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch ok 9.795s ago 398.7us
alert: KubeStatefulSetGenerationMismatch expr: kube_statefulset_status_observed_generation{job="kube-state-metrics"} != kube_statefulset_metadata_generation{job="kube-state-metrics"} for: 15m labels: severity: critical annotations: message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch ok 9.795s ago 376.2us
alert: KubeStatefulSetUpdateNotRolledOut expr: max without(revision) (kube_statefulset_status_current_revision{job="kube-state-metrics"} unless kube_statefulset_status_update_revision{job="kube-state-metrics"}) * (kube_statefulset_replicas{job="kube-state-metrics"} != kube_statefulset_status_replicas_updated{job="kube-state-metrics"}) for: 15m labels: severity: critical annotations: message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout ok 9.795s ago 749.6us
alert: KubeDaemonSetRolloutStuck expr: kube_daemonset_status_number_ready{job="kube-state-metrics"} / kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} * 100 < 100 for: 15m labels: severity: critical annotations: message: Only {{ $value }}% of the desired Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are scheduled and ready. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck ok 9.795s ago 249.7us
alert: KubeDaemonSetNotScheduled expr: kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} - kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0 for: 10m labels: severity: warning annotations: message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled ok 9.795s ago 200us
alert: KubeDaemonSetMisScheduled expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0 for: 10m labels: severity: warning annotations: message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled ok 9.795s ago 98.13us
alert: KubeCronJobRunning expr: time() - kube_cronjob_next_schedule_time{job="kube-state-metrics"} > 3600 for: 1h labels: severity: warning annotations: message: CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecronjobrunning ok 9.795s ago 97.53us
alert: KubeJobCompletion expr: kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0 for: 1h labels: severity: warning annotations: message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than one hour to complete. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion ok 9.795s ago 538.2us
alert: KubeJobFailed expr: kube_job_status_failed{job="kube-state-metrics"} > 0 for: 1h labels: severity: warning annotations: message: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed ok 9.794s ago 137.4us

kubernetes-resources

25.135s ago

8.143ms

Rule State Error Last Evaluation Evaluation Time
alert: KubeCPUOvercommit expr: sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum) / sum(kube_node_status_allocatable_cpu_cores) > (count(kube_node_status_allocatable_cpu_cores) - 1) / count(kube_node_status_allocatable_cpu_cores) for: 5m labels: severity: warning annotations: message: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit ok 25.135s ago 1.169ms
alert: KubeMemOvercommit expr: sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum) / sum(kube_node_status_allocatable_memory_bytes) > (count(kube_node_status_allocatable_memory_bytes) - 1) / count(kube_node_status_allocatable_memory_bytes) for: 5m labels: severity: warning annotations: message: Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit ok 25.134s ago 1.649ms
alert: KubeCPUOvercommit expr: sum(kube_resourcequota{job="kube-state-metrics",resource="cpu",type="hard"}) / sum(kube_node_status_allocatable_cpu_cores) > 1.5 for: 5m labels: severity: warning annotations: message: Cluster has overcommitted CPU resource requests for Namespaces. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit ok 25.132s ago 308.2us
alert: KubeMemOvercommit expr: sum(kube_resourcequota{job="kube-state-metrics",resource="memory",type="hard"}) / sum(kube_node_status_allocatable_memory_bytes{job="node-exporter"}) > 1.5 for: 5m labels: severity: warning annotations: message: Cluster has overcommitted memory resource requests for Namespaces. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit ok 25.132s ago 194.5us
alert: KubeQuotaExceeded expr: 100 * kube_resourcequota{job="kube-state-metrics",type="used"} / ignoring(instance, job, type) (kube_resourcequota{job="kube-state-metrics",type="hard"} > 0) > 90 for: 15m labels: severity: warning annotations: message: Namespace {{ $labels.namespace }} is using {{ printf "%0.0f" $value }}% of its {{ $labels.resource }} quota. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded ok 25.132s ago 254us
alert: CPUThrottlingHigh expr: 100 * sum by(container, pod, namespace) (increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) / sum by(container, pod, namespace) (increase(container_cpu_cfs_periods_total[5m])) > 25 for: 15m labels: severity: warning annotations: message: '{{ printf "%0.0f" $value }}% throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh ok 25.132s ago 4.541ms

kubernetes-storage

14.969s ago

14.23ms

Rule State Error Last Evaluation Evaluation Time
alert: KubePersistentVolumeUsageCritical expr: 100 * kubelet_volume_stats_available_bytes{job="kubelet"} / kubelet_volume_stats_capacity_bytes{job="kubelet"} < 3 for: 1m labels: severity: critical annotations: message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ printf "%0.2f" $value }}% free. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical ok 14.969s ago 2.772ms
alert: KubePersistentVolumeFullInFourDays expr: 100 * (kubelet_volume_stats_available_bytes{job="kubelet"} / kubelet_volume_stats_capacity_bytes{job="kubelet"}) < 15 and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0 for: 5m labels: severity: critical annotations: message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ printf "%0.2f" $value }}% is available. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays ok 14.967s ago 9.411ms
alert: KubePersistentVolumeErrors expr: kube_persistentvolume_status_phase{job="kube-state-metrics",phase=~"Failed|Pending"} > 0 for: 5m labels: severity: critical annotations: message: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors ok 14.957s ago 2.03ms

kubernetes-system

4.193s ago

86.39ms

Rule State Error Last Evaluation Evaluation Time
alert: KubeNodeNotReady expr: kube_node_status_condition{condition="Ready",job="kube-state-metrics",status="true"} == 0 for: 1h labels: severity: warning annotations: message: '{{ $labels.node }} has been unready for more than an hour.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready ok 4.193s ago 279.3us
alert: KubeVersionMismatch expr: count(count by(gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"}, "gitVersion", "$1", "gitVersion", "(v[0-9]*.[0-9]*.[0-9]*).*"))) > 1 for: 1h labels: severity: warning annotations: message: There are {{ $value }} different semantic versions of Kubernetes components running. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch ok 4.193s ago 531.6us
alert: KubeClientErrors expr: (sum by(instance, job) (rate(rest_client_requests_total{code=~"5.."}[5m])) / sum by(instance, job) (rate(rest_client_requests_total[5m]))) * 100 > 1 for: 15m labels: severity: warning annotations: message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ printf "%0.0f" $value }}% errors.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors ok 4.193s ago 1.594ms
alert: KubeClientErrors expr: sum by(instance, job) (rate(ksm_scrape_error_total{job="kube-state-metrics"}[5m])) > 0.1 for: 15m labels: severity: warning annotations: message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ printf "%0.0f" $value }} errors / second. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors ok 4.191s ago 137.3us
alert: KubeletTooManyPods expr: kubelet_running_pod_count{job="kubelet"} > 110 * 0.9 for: 15m labels: severity: warning annotations: message: Kubelet {{ $labels.instance }} is running {{ $value }} Pods, close to the limit of 110. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods ok 4.191s ago 82.92us
alert: KubeAPILatencyHigh expr: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1 for: 10m labels: severity: warning annotations: message: The API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh ok 4.191s ago 2.109ms
alert: KubeAPILatencyHigh expr: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4 for: 10m labels: severity: critical annotations: message: The API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh ok 4.189s ago 1.742ms
alert: KubeAPIErrorsHigh expr: sum(rate(apiserver_request_total{code=~"^(?:5..)$",job="apiserver"}[5m])) / sum(rate(apiserver_request_total{job="apiserver"}[5m])) * 100 > 3 for: 10m labels: severity: critical annotations: message: API server is returning errors for {{ $value }}% of requests. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh ok 4.188s ago 18.22ms
alert: KubeAPIErrorsHigh expr: sum(rate(apiserver_request_total{code=~"^(?:5..)$",job="apiserver"}[5m])) / sum(rate(apiserver_request_total{job="apiserver"}[5m])) * 100 > 1 for: 10m labels: severity: warning annotations: message: API server is returning errors for {{ $value }}% of requests. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh ok 4.17s ago 18.29ms
alert: KubeAPIErrorsHigh expr: sum by(resource, subresource, verb) (rate(apiserver_request_total{code=~"^(?:5..)$",job="apiserver"}[5m])) / sum by(resource, subresource, verb) (rate(apiserver_request_total{job="apiserver"}[5m])) * 100 > 10 for: 10m labels: severity: critical annotations: message: API server is returning errors for {{ $value }}% of requests for {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh ok 4.152s ago 18.93ms
alert: KubeAPIErrorsHigh expr: sum by(resource, subresource, verb) (rate(apiserver_request_total{code=~"^(?:5..)$",job="apiserver"}[5m])) / sum by(resource, subresource, verb) (rate(apiserver_request_total{job="apiserver"}[5m])) * 100 > 5 for: 10m labels: severity: warning annotations: message: API server is returning errors for {{ $value }}% of requests for {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh ok 4.133s ago 22.67ms
alert: KubeClientCertificateExpiration expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by(job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800 labels: severity: warning annotations: message: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration ok 4.11s ago 976.6us
alert: KubeClientCertificateExpiration expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by(job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 labels: severity: critical annotations: message: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration ok 4.109s ago 788.5us

node-network

12.075s ago

5.807ms

Rule State Error Last Evaluation Evaluation Time
alert: NetworkReceiveErrors expr: rate(node_network_receive_errs_total{device!~"veth.+",job="node-exporter"}[2m]) > 0 for: 2m labels: severity: warning annotations: message: Network interface "{{ $labels.device }}" showing receive errors on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}" ok 12.075s ago 2.234ms
alert: NetworkTransmitErrors expr: rate(node_network_transmit_errs_total{device!~"veth.+",job="node-exporter"}[2m]) > 0 for: 2m labels: severity: warning annotations: message: Network interface "{{ $labels.device }}" showing transmit errors on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}" ok 12.073s ago 1.886ms
alert: NodeNetworkInterfaceFlapping expr: changes(node_network_up{device!~"veth.+",job="node-exporter"}[2m]) > 2 for: 2m labels: severity: warning annotations: message: Network interface "{{ $labels.device }}" changing it's up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}" ok 12.071s ago 1.668ms

node-time

16.288s ago

604.2us

Rule State Error Last Evaluation Evaluation Time
alert: ClockSkewDetected expr: abs(node_timex_offset_seconds{job="node-exporter"}) > 0.05 for: 2m labels: severity: warning annotations: message: Clock skew detected on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}. Ensure NTP is configured correctly on this host. ok 16.288s ago 588.8us

node.rules

1.52s ago

98.62ms

Rule State Error Last Evaluation Evaluation Time
record: :kube_pod_info_node_count: expr: sum(min by(node) (kube_pod_info)) ok 1.52s ago 3.841ms
record: node_namespace_pod:kube_pod_info: expr: max by(node, namespace, pod) (label_replace(kube_pod_info{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")) ok 1.516s ago 7.916ms
record: node:node_num_cpu:sum expr: count by(node) (sum by(node, cpu) (node_cpu_seconds_total{job="node-exporter"} * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:)) ok 1.509s ago 24.33ms
record: :node_cpu_utilisation:avg1m expr: 1 - avg(rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m])) ok 1.484s ago 2.933ms
record: node:node_cpu_utilisation:avg1m expr: 1 - avg by(node) (rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m]) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) ok 1.481s ago 7.019ms
record: node:cluster_cpu_utilisation:ratio expr: node:node_cpu_utilisation:avg1m * node:node_num_cpu:sum / scalar(sum(node:node_num_cpu:sum)) ok 1.475s ago 341.7us
record: :node_cpu_saturation_load1: expr: sum(node_load1{job="node-exporter"}) / sum(node:node_num_cpu:sum) ok 1.474s ago 268.5us
record: node:node_cpu_saturation_load1: expr: sum by(node) (node_load1{job="node-exporter"} * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) / node:node_num_cpu:sum ok 1.474s ago 3.63ms
record: :node_memory_utilisation: expr: 1 - sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"}) / sum(node_memory_MemTotal_bytes{job="node-exporter"}) ok 1.471s ago 537.8us
record: :node_memory_MemFreeCachedBuffers_bytes:sum expr: sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"}) ok 1.47s ago 327.7us
record: :node_memory_MemTotal_bytes:sum expr: sum(node_memory_MemTotal_bytes{job="node-exporter"}) ok 1.47s ago 109.2us
record: node:node_memory_bytes_available:sum expr: sum by(node) ((node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"}) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) ok 1.47s ago 3.665ms
record: node:node_memory_bytes_total:sum expr: sum by(node) (node_memory_MemTotal_bytes{job="node-exporter"} * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) ok 1.466s ago 3.499ms
record: node:node_memory_utilisation:ratio expr: (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum) / node:node_memory_bytes_total:sum ok 1.463s ago 289.7us
record: node:cluster_memory_utilisation:ratio expr: (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum) / scalar(sum(node:node_memory_bytes_total:sum)) ok 1.463s ago 275.8us
record: :node_memory_swap_io_bytes:sum_rate expr: 1000 * sum((rate(node_vmstat_pgpgin{job="node-exporter"}[1m]) + rate(node_vmstat_pgpgout{job="node-exporter"}[1m]))) ok 1.462s ago 346.3us
record: node:node_memory_utilisation: expr: 1 - sum by(node) ((node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"}) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) / sum by(node) (node_memory_MemTotal_bytes{job="node-exporter"} * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) ok 1.462s ago 7.27ms
record: node:node_memory_utilisation_2: expr: 1 - (node:node_memory_bytes_available:sum / node:node_memory_bytes_total:sum) ok 1.455s ago 272.1us
record: node:node_memory_swap_io_bytes:sum_rate expr: 1000 * sum by(node) ((rate(node_vmstat_pgpgin{job="node-exporter"}[1m]) + rate(node_vmstat_pgpgout{job="node-exporter"}[1m])) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) ok 1.455s ago 3.632ms
record: :node_disk_utilisation:avg_irate expr: avg(irate(node_disk_io_time_seconds_total{device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+",job="node-exporter"}[1m])) ok 1.451s ago 1.605ms
record: node:node_disk_utilisation:avg_irate expr: avg by(node) (irate(node_disk_io_time_seconds_total{device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+",job="node-exporter"}[1m]) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) ok 1.45s ago 3.745ms
record: :node_disk_saturation:avg_irate expr: avg(irate(node_disk_io_time_weighted_seconds_total{device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+",job="node-exporter"}[1m])) ok 1.446s ago 1.214ms
record: node:node_disk_saturation:avg_irate expr: avg by(node) (irate(node_disk_io_time_weighted_seconds_total{device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+",job="node-exporter"}[1m]) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) ok 1.445s ago 3.738ms
record: node:node_filesystem_usage: expr: max by(instance, namespace, pod, device) ((node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} - node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) ok 1.441s ago 450.5us
record: node:node_filesystem_avail: expr: max by(instance, namespace, pod, device) (node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) ok 1.441s ago 270.7us
record: :node_net_utilisation:sum_irate expr: sum(irate(node_network_receive_bytes_total{device!~"veth.+",job="node-exporter"}[1m])) + sum(irate(node_network_transmit_bytes_total{device!~"veth.+",job="node-exporter"}[1m])) ok 1.441s ago 1.696ms
record: node:node_net_utilisation:sum_irate expr: sum by(node) ((irate(node_network_receive_bytes_total{device!~"veth.+",job="node-exporter"}[1m]) + irate(node_network_transmit_bytes_total{device!~"veth.+",job="node-exporter"}[1m])) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) ok 1.439s ago 4.191ms
record: :node_net_saturation:sum_irate expr: sum(irate(node_network_receive_drop_total{device!~"veth.+",job="node-exporter"}[1m])) + sum(irate(node_network_transmit_drop_total{device!~"veth.+",job="node-exporter"}[1m])) ok 1.435s ago 1.471ms
record: node:node_net_saturation:sum_irate expr: sum by(node) ((irate(node_network_receive_drop_total{device!~"veth.+",job="node-exporter"}[1m]) + irate(node_network_transmit_drop_total{device!~"veth.+",job="node-exporter"}[1m])) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) ok 1.434s ago 3.956ms
record: node:node_inodes_total: expr: max by(node) (max by(node, host_ip) (kube_pod_info{host_ip!="",job="kube-state-metrics"}) * on(host_ip) group_right(node) label_replace((max by(instance) (node_filesystem_files{job="node-exporter",mountpoint="/"})), "host_ip", "$1", "instance", "(.*):.*")) ok 1.43s ago 2.983ms
record: node:node_inodes_free: expr: max by(node) (max by(node, host_ip) (kube_pod_info{host_ip!="",job="kube-state-metrics"}) * on(host_ip) group_right(node) label_replace((max by(instance) (node_filesystem_files_free{job="node-exporter",mountpoint="/"})), "host_ip", "$1", "instance", "(.*):.*")) ok 1.427s ago 2.707ms

prometheus-operator

4.749s ago

614us

Rule State Error Last Evaluation Evaluation Time
alert: PrometheusOperatorReconcileErrors expr: rate(prometheus_operator_reconcile_errors_total{job="prometheus-prometheus-oper-operator",namespace="monitoring"}[5m]) > 0.1 for: 10m labels: severity: warning annotations: message: Errors while reconciling {{ $labels.controller }} in {{ $labels.namespace }} Namespace. ok 4.749s ago 435.5us
alert: PrometheusOperatorNodeLookupErrors expr: rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-prometheus-oper-operator",namespace="monitoring"}[5m]) > 0.1 for: 10m labels: severity: warning annotations: message: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace. ok 4.749s ago 166.7us

prometheus

1.258s ago

3.851ms

Rule State Error Last Evaluation Evaluation Time
alert: PrometheusBadConfig expr: max_over_time(prometheus_config_last_reload_successful{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) == 0 for: 10m labels: severity: critical annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to reload its configuration. summary: Failed Prometheus configuration reload. ok 1.258s ago 382us
alert: PrometheusNotificationQueueRunningFull expr: (predict_linear(prometheus_notifications_queue_length{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m], 60 * 30) > min_over_time(prometheus_notifications_queue_capacity{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])) for: 15m labels: severity: warning annotations: description: Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}} is running full. summary: Prometheus alert notification queue predicted to run full in less than 30m. ok 1.258s ago 343.2us
alert: PrometheusErrorSendingAlertsToSomeAlertmanagers expr: (rate(prometheus_notifications_errors_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])) * 100 > 1 for: 15m labels: severity: warning annotations: description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}.' summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager. ok 1.258s ago 435.5us
alert: PrometheusErrorSendingAlertsToAnyAlertmanager expr: min without(alertmanager) (rate(prometheus_notifications_errors_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])) * 100 > 3 for: 15m labels: severity: critical annotations: description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.' summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager. ok 1.258s ago 331.6us
alert: PrometheusNotConnectedToAlertmanagers expr: max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) < 1 for: 10m labels: severity: warning annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to any Alertmanagers. summary: Prometheus is not connected to any Alertmanagers. ok 1.258s ago 154.6us
alert: PrometheusTSDBReloadsFailing expr: increase(prometheus_tsdb_reloads_failures_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[3h]) > 0 for: 4h labels: severity: warning annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} reload failures over the last 3h. summary: Prometheus has issues reloading blocks from disk. ok 1.257s ago 368.1us
alert: PrometheusTSDBCompactionsFailing expr: increase(prometheus_tsdb_compactions_failed_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[3h]) > 0 for: 4h labels: severity: warning annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} compaction failures over the last 3h. summary: Prometheus has issues compacting blocks. ok 1.257s ago 289.1us
alert: PrometheusTSDBWALCorruptions expr: increase(tsdb_wal_corruptions_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[3h]) > 0 for: 4h labels: severity: warning annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} corruptions of the write-ahead log (WAL) over the last 3h. summary: Prometheus is detecting WAL corruptions. ok 1.257s ago 149us
alert: PrometheusNotIngestingSamples expr: rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) <= 0 for: 10m labels: severity: warning annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting samples. summary: Prometheus is not ingesting samples. ok 1.257s ago 168.9us
alert: PrometheusDuplicateTimestamps expr: rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) > 0 for: 10m labels: severity: warning annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{$value | humanize}} samples/s with different values but duplicated timestamp. summary: Prometheus is dropping samples with duplicate timestamps. ok 1.257s ago 152.4us
alert: PrometheusOutOfOrderTimestamps expr: rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) > 0 for: 10m labels: severity: warning annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{$value | humanize}} samples/s with timestamps arriving out of order. summary: Prometheus drops samples with out-of-order timestamps. ok 1.257s ago 151.5us
alert: PrometheusRemoteStorageFailures expr: (rate(prometheus_remote_storage_failed_samples_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) / (rate(prometheus_remote_storage_failed_samples_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) + rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]))) * 100 > 1 for: 15m labels: severity: critical annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send {{ printf "%.1f" $value }}% of the samples to queue {{$labels.queue}}. summary: Prometheus fails to send samples to remote storage. ok 1.257s ago 326.3us
alert: PrometheusRemoteWriteBehind expr: (max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) - on(job, instance) group_right() max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])) > 120 for: 15m labels: severity: critical annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write is {{ printf "%.1f" $value }}s behind for queue {{$labels.queue}}. summary: Prometheus remote write is behind. ok 1.257s ago 268.3us
alert: PrometheusRuleFailures expr: increase(prometheus_rule_evaluation_failures_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) > 0 for: 15m labels: severity: critical annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to evaluate {{ printf "%.0f" $value }} rules in the last 5m. summary: Prometheus is failing rule evaluations. ok 1.257s ago 150.9us
alert: PrometheusMissingRuleEvaluations expr: increase(prometheus_rule_group_iterations_missed_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) > 0 for: 15m labels: severity: warning annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed {{ printf "%.0f" $value }} rule group evaluations in the last 5m. summary: Prometheus is missing rule evaluations due to slow rule group evaluation. ok 1.257s ago 140.1us