Rules

alertmanager.rules (last evaluation 23.096s ago, evaluation time 1.469ms)

Rule State Error Last Evaluation Evaluation Time
alert: AlertmanagerConfigInconsistent expr: count_values by(service) ("config_hash", alertmanager_config_hash{job="prometheus-prometheus-oper-alertmanager",namespace="monitoring"}) / on(service) group_left() label_replace(prometheus_operator_spec_replicas{controller="alertmanager",job="prometheus-prometheus-oper-operator",namespace="monitoring"}, "service", "$1", "name", "(.*)") != 1 for: 5m labels: severity: critical annotations: message: The configurations of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync. ok 23.096s ago 864.2us
alert: AlertmanagerFailedReload expr: alertmanager_config_last_reload_successful{job="prometheus-prometheus-oper-alertmanager",namespace="monitoring"} == 0 for: 10m labels: severity: warning annotations: message: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}. ok 23.096s ago 223.6us
alert: AlertmanagerMembersInconsistent expr: alertmanager_cluster_members{job="prometheus-prometheus-oper-alertmanager",namespace="monitoring"} != on(service) group_left() count by(service) (alertmanager_cluster_members{job="prometheus-prometheus-oper-alertmanager",namespace="monitoring"}) for: 5m labels: severity: critical annotations: message: Alertmanager has not found all other members of the cluster. ok 23.096s ago 362.9us
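
Read left to right, each row above packs a full rule definition (alert name, expr, for, labels, annotations) followed by its state, last evaluation, and evaluation time. As a reading aid, here is a sketch of how the AlertmanagerFailedReload row would look in a Prometheus rule file; the groups:/rules: scaffolding is the standard rule-file layout and is not part of the listing itself.

groups:
  - name: alertmanager.rules
    rules:
      - alert: AlertmanagerFailedReload
        expr: alertmanager_config_last_reload_successful{job="prometheus-prometheus-oper-alertmanager",namespace="monitoring"} == 0
        for: 10m
        labels:
          severity: warning
        annotations:
          # annotation text copied verbatim from the row above
          message: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}.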

etcd (last evaluation 5.369s ago, evaluation time 5.785ms)

Rule State Error Last Evaluation Evaluation Time
alert: etcdInsufficientMembers expr: sum by(job) (up{job=~".*etcd.*"} == bool 1) < ((count by(job) (up{job=~".*etcd.*"}) + 1) / 2) for: 3m labels: severity: critical annotations: message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).' ok 5.37s ago 1.587ms
alert: etcdNoLeader expr: etcd_server_has_leader{job=~".*etcd.*"} == 0 for: 1m labels: severity: critical annotations: message: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.' ok 5.369s ago 240.1us
alert: etcdHighNumberOfLeaderChanges expr: rate(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}[15m]) > 3 for: 15m labels: severity: warning annotations: message: 'etcd cluster "{{ $labels.job }}": instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last 15 minutes.' ok 5.369s ago 259.3us
alert: etcdHighNumberOfFailedGRPCRequests expr: 100 * sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{grpc_code!="OK",job=~".*etcd.*"}[5m])) / sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) > 1 for: 10m labels: severity: warning annotations: message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.' ok 5.369s ago 506.1us
alert: etcdHighNumberOfFailedGRPCRequests expr: 100 * sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{grpc_code!="OK",job=~".*etcd.*"}[5m])) / sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) > 5 for: 5m labels: severity: critical annotations: message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.' ok 5.369s ago 462.3us
alert: etcdGRPCRequestsSlow expr: histogram_quantile(0.99, sum by(job, instance, grpc_service, grpc_method, le) (rate(grpc_server_handling_seconds_bucket{grpc_type="unary",job=~".*etcd.*"}[5m]))) > 0.15 for: 10m labels: severity: critical annotations: message: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.' ok 5.369s ago 568.1us
alert: etcdMemberCommunicationSlow expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.15 for: 10m labels: severity: warning annotations: message: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.' ok 5.369s ago 303.4us
alert: etcdHighNumberOfFailedProposals expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5 for: 15m labels: severity: warning annotations: message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last 15 minutes on etcd instance {{ $labels.instance }}.' ok 5.369s ago 230.4us
alert: etcdHighFsyncDurations expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.5 for: 10m labels: severity: warning annotations: message: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.' ok 5.369s ago 257.4us
alert: etcdHighCommitDurations expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.25 for: 10m labels: severity: warning annotations: message: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations are {{ $value }}s on etcd instance {{ $labels.instance }}.' ok 5.369s ago 246.9us
alert: etcdHighNumberOfFailedHTTPRequests expr: sum by(method) (rate(etcd_http_failed_total{code!="404",job=~".*etcd.*"}[5m])) / sum by(method) (rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) > 0.01 for: 10m labels: severity: warning annotations: message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}' ok 5.369s ago 437.6us
alert: etcdHighNumberOfFailedHTTPRequests expr: sum by(method) (rate(etcd_http_failed_total{code!="404",job=~".*etcd.*"}[5m])) / sum by(method) (rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) > 0.05 for: 10m labels: severity: critical annotations: message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}.' ok 5.369s ago 483us
alert: etcdHTTPRequestsSlow expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15 for: 10m labels: severity: warning annotations: message: etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow. ok 5.369s ago 159.7us
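
The two etcdHighNumberOfFailedHTTPRequests rows illustrate the tiered-threshold pattern used throughout this group: the same ratio expression is registered twice, once above 0.01 (1%) as a warning and once above 0.05 (5%) as critical. A sketch of that pair in rule-file form, with the annotations omitted for brevity:

groups:
  - name: etcd
    rules:
      # failed HTTP requests (ignoring 404s) as a fraction of all received requests
      - alert: etcdHighNumberOfFailedHTTPRequests
        expr: sum by(method) (rate(etcd_http_failed_total{code!="404",job=~".*etcd.*"}[5m])) / sum by(method) (rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) > 0.01
        for: 10m
        labels:
          severity: warning
      - alert: etcdHighNumberOfFailedHTTPRequests
        expr: sum by(method) (rate(etcd_http_failed_total{code!="404",job=~".*etcd.*"}[5m])) / sum by(method) (rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) > 0.05
        for: 10m
        labels:
          severity: critical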

general.rules (last evaluation 21.053s ago, evaluation time 3.642ms)

Rule State Error Last Evaluation Evaluation Time
alert: TargetDown expr: 100 * (count by(job) (up == 0) / count by(job) (up)) > 10 for: 10m labels: severity: warning annotations: message: '{{ $value }}% of the {{ $labels.job }} targets are down.' ok 21.053s ago 3.223ms
alert: Watchdog expr: vector(1) labels: severity: none annotations: message: | This is an alert meant to ensure that the entire alerting pipeline is functional. This alert is always firing, therefore it should always be firing in Alertmanager and always fire against a receiver. There are integrations with various notification mechanisms that send a notification when this alert is not firing. For example the "DeadMansSnitch" integration in PagerDuty. ok 21.05s ago 398.4us
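
The Watchdog row carries a multi-line annotation (note the message: | block scalar), which is hard to read once flattened onto one line. The same rule, sketched in rule-file form:

groups:
  - name: general.rules
    rules:
      - alert: Watchdog
        expr: vector(1)  # a constant vector, so this alert is always firing by design
        labels:
          severity: none
        annotations:
          message: |
            This is an alert meant to ensure that the entire alerting pipeline is functional.
            This alert is always firing, therefore it should always be firing in Alertmanager
            and always fire against a receiver. There are integrations with various notification
            mechanisms that send a notification when this alert is not firing. For example the
            "DeadMansSnitch" integration in PagerDuty.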

k8s.rules (last evaluation 9.347s ago, evaluation time 107.1ms)

Rule State Error Last Evaluation Evaluation Time
record: namespace:container_cpu_usage_seconds_total:sum_rate expr: sum by(namespace) (rate(container_cpu_usage_seconds_total{container!="POD",image!="",job="kubelet"}[5m])) ok 9.347s ago 20.27ms
record: namespace_pod_container:container_cpu_usage_seconds_total:sum_rate expr: sum by(namespace, pod, container) (rate(container_cpu_usage_seconds_total{container!="POD",image!="",job="kubelet"}[5m])) ok 9.327s ago 15.51ms
record: namespace:container_memory_usage_bytes:sum expr: sum by(namespace) (container_memory_usage_bytes{container!="POD",image!="",job="kubelet"}) ok 9.312s ago 8.131ms
record: namespace:container_memory_usage_bytes:sum expr: sum by(namespace, label_name) (sum by(pod, namespace) (container_memory_usage_bytes{container!="POD",image!="",job="kubelet"}) * on(namespace, pod) group_left(label_name) kube_pod_labels{job="kube-state-metrics"}) ok 9.304s ago 14.44ms
record: namespace:kube_pod_container_resource_requests_memory_bytes:sum expr: sum by(namespace, label_name) (sum by(namespace, pod) (kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"} * on(endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)) * on(namespace, pod) group_left(label_name) kube_pod_labels{job="kube-state-metrics"}) ok 9.29s ago 25.55ms
record: namespace:kube_pod_container_resource_requests_cpu_cores:sum expr: sum by(namespace, label_name) (sum by(namespace, pod) (kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"} * on(endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)) * on(namespace, pod) group_left(label_name) kube_pod_labels{job="kube-state-metrics"}) ok 9.265s ago 14.02ms
record: mixin_pod_workload expr: sum by(namespace, workload, pod) (label_replace(label_replace(kube_pod_owner{job="kube-state-metrics",owner_kind="ReplicaSet"}, "replicaset", "$1", "owner_name", "(.*)") * on(replicaset, namespace) group_left(owner_name) kube_replicaset_owner{job="kube-state-metrics"}, "workload", "$1", "owner_name", "(.*)")) labels: workload_type: deployment ok 9.251s ago 7.445ms
record: mixin_pod_workload expr: sum by(namespace, workload, pod) (label_replace(kube_pod_owner{job="kube-state-metrics",owner_kind="DaemonSet"}, "workload", "$1", "owner_name", "(.*)")) labels: workload_type: daemonset ok 9.244s ago 916.5us
record: mixin_pod_workload expr: sum by(namespace, workload, pod) (label_replace(kube_pod_owner{job="kube-state-metrics",owner_kind="StatefulSet"}, "workload", "$1", "owner_name", "(.*)")) labels: workload_type: statefulset ok 9.243s ago 800.2us
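
Recording rules use record: instead of alert:. The three mixin_pod_workload rows additionally attach a static workload_type label, so a single series name covers Deployments, DaemonSets and StatefulSets. A sketch of the DaemonSet variant in rule-file form:

groups:
  - name: k8s.rules
    rules:
      - record: mixin_pod_workload
        # label_replace copies the owning DaemonSet's name into a "workload" label
        expr: sum by(namespace, workload, pod) (label_replace(kube_pod_owner{job="kube-state-metrics",owner_kind="DaemonSet"}, "workload", "$1", "owner_name", "(.*)"))
        labels:
          workload_type: daemonset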

kube-apiserver.rules (last evaluation 2.457s ago, evaluation time 1.152s)

Rule State Error Last Evaluation Evaluation Time
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile expr: histogram_quantile(0.99, sum without(instance, pod) (rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m]))) labels: quantile: "0.99" ok 2.457s ago 462.1ms
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile expr: histogram_quantile(0.9, sum without(instance, pod) (rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m]))) labels: quantile: "0.9" ok 1.995s ago 345.5ms
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile expr: histogram_quantile(0.5, sum without(instance, pod) (rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m]))) labels: quantile: "0.5" ok 1.65s ago 344ms
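
These three rules pre-compute apiserver latency quantiles over the raw histogram buckets, each tagged with a quantile label; the KubeAPILatencyHigh alerts in the kubernetes-system group below then select the recorded series instead of re-running histogram_quantile on every evaluation. A sketch of the 99th-percentile rule:

groups:
  - name: kube-apiserver.rules
    rules:
      - record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
        expr: histogram_quantile(0.99, sum without(instance, pod) (rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m])))
        labels:
          quantile: "0.99"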

kube-prometheus-node-alerting.rules (last evaluation 6.533s ago, evaluation time 3.008ms)

Rule State Error Last Evaluation Evaluation Time
alert: NodeDiskRunningFull expr: '(node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[6h], 3600 * 24) < 0)' for: 30m labels: severity: warning annotations: message: Device {{ $labels.device }} on node {{ $labels.instance }} will be full within the next 24 hours. ok 6.533s ago 2.43ms
alert: NodeDiskRunningFull expr: '(node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[30m], 3600 * 2) < 0)' for: 10m labels: severity: critical annotations: message: Device {{ $labels.device }} on node {{ $labels.instance }} will be full within the next 2 hours. ok 6.531s ago 561.5us
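
Both NodeDiskRunningFull variants combine a current-usage threshold with predict_linear over the recorded node:node_filesystem_avail: series (defined in node.rules further down): 6h of history projected 24h ahead for the warning, 30m of history projected 2h ahead for the critical. A sketch of the warning variant:

groups:
  - name: kube-prometheus-node-alerting.rules
    rules:
      - alert: NodeDiskRunningFull
        # fires when usage is already above 85% and the linear trend over the last
        # 6h predicts available space reaching zero within 24h (3600 * 24 seconds)
        expr: '(node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[6h], 3600 * 24) < 0)'
        for: 30m
        labels:
          severity: warning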

kube-prometheus-node-recording.rules (last evaluation 29.875s ago, evaluation time 89.4ms)

Rule State Error Last Evaluation Evaluation Time
record: instance:node_cpu:rate:sum expr: sum by(instance) (rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) ok 29.876s ago 20.73ms
record: instance:node_filesystem_usage:sum expr: sum by(instance) ((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"})) ok 29.855s ago 366.8us
record: instance:node_network_receive_bytes:rate:sum expr: sum by(instance) (rate(node_network_receive_bytes_total[3m])) ok 29.855s ago 4.171ms
record: instance:node_network_transmit_bytes:rate:sum expr: sum by(instance) (rate(node_network_transmit_bytes_total[3m])) ok 29.851s ago 4.365ms
record: instance:node_cpu:ratio expr: sum without(cpu, mode) (rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) / on(instance) group_left() count by(instance) (sum by(instance, cpu) (node_cpu_seconds_total)) ok 29.846s ago 33.77ms
record: cluster:node_cpu:sum_rate5m expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) ok 29.813s ago 11.27ms
record: cluster:node_cpu:ratio expr: cluster:node_cpu:sum_rate5m / count(sum by(instance, cpu) (node_cpu_seconds_total)) ok 29.801s ago 14.71ms

kube-scheduler.rules (last evaluation 1.718s ago, evaluation time 2.078ms)

Rule State Error Last Evaluation Evaluation Time
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile expr: histogram_quantile(0.99, sum without(instance, pod) (rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m]))) labels: quantile: "0.99" ok 1.718s ago 583.4us
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile expr: histogram_quantile(0.99, sum without(instance, pod) (rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m]))) labels: quantile: "0.99" ok 1.717s ago 228.9us
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile expr: histogram_quantile(0.99, sum without(instance, pod) (rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m]))) labels: quantile: "0.99" ok 1.717s ago 195.6us
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile expr: histogram_quantile(0.9, sum without(instance, pod) (rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m]))) labels: quantile: "0.9" ok 1.717s ago 177.1us
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile expr: histogram_quantile(0.9, sum without(instance, pod) (rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m]))) labels: quantile: "0.9" ok 1.717s ago 169.9us
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile expr: histogram_quantile(0.9, sum without(instance, pod) (rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m]))) labels: quantile: "0.9" ok 1.717s ago 184.6us
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile expr: histogram_quantile(0.5, sum without(instance, pod) (rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m]))) labels: quantile: "0.5" ok 1.717s ago 173us
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile expr: histogram_quantile(0.5, sum without(instance, pod) (rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m]))) labels: quantile: "0.5" ok 1.717s ago 167us
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile expr: histogram_quantile(0.5, sum without(instance, pod) (rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m]))) labels: quantile: "0.5" ok 1.717s ago 167.6us

kubernetes-absent (last evaluation 9.309s ago, evaluation time 2.127ms)

Rule State Error Last Evaluation Evaluation Time
alert: AlertmanagerDown expr: absent(up{job="prometheus-prometheus-oper-alertmanager",namespace="monitoring"} == 1) for: 15m labels: severity: critical annotations: message: Alertmanager has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerdown ok 9.31s ago 315.4us
alert: KubeAPIDown expr: absent(up{job="apiserver"} == 1) for: 15m labels: severity: critical annotations: message: KubeAPI has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown ok 9.31s ago 197.9us
alert: KubeControllerManagerDown expr: absent(up{job="kube-controller-manager"} == 1) for: 15m labels: severity: critical annotations: message: KubeControllerManager has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontrollermanagerdown ok 9.31s ago 472.3us
alert: KubeSchedulerDown expr: absent(up{job="kube-scheduler"} == 1) for: 15m labels: severity: critical annotations: message: KubeScheduler has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeschedulerdown ok 9.309s ago 403.5us
alert: KubeStateMetricsDown expr: absent(up{job="kube-state-metrics"} == 1) for: 15m labels: severity: critical annotations: message: KubeStateMetrics has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricsdown ok 9.309s ago 132.2us
alert: KubeletDown expr: absent(up{job="kubelet"} == 1) for: 15m labels: severity: critical annotations: message: Kubelet has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown ok 9.309s ago 165.7us
alert: NodeExporterDown expr: absent(up{job="node-exporter"} == 1) for: 15m labels: severity: critical annotations: message: NodeExporter has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeexporterdown ok 9.309s ago 159.6us
alert: PrometheusDown expr: absent(up{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"} == 1) for: 15m labels: severity: critical annotations: message: Prometheus has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusdown ok 9.309s ago 135.3us
alert: PrometheusOperatorDown expr: absent(up{job="prometheus-prometheus-oper-operator",namespace="monitoring"} == 1) for: 15m labels: severity: critical annotations: message: PrometheusOperator has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatordown ok 9.309s ago 122.4us
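
Every rule in this group is the same absent() pattern: absent(up{job="..."} == 1) is empty while at least one target of the job is up and becomes a single series with value 1 once none are, so the alert fires when a whole scrape target set disappears from discovery. A sketch of one entry:

groups:
  - name: kubernetes-absent
    rules:
      - alert: KubeletDown
        expr: absent(up{job="kubelet"} == 1)
        for: 15m  # debounces brief scrape gaps before paging
        labels:
          severity: critical
        annotations:
          message: Kubelet has disappeared from Prometheus target discovery.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown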

kubernetes-apps (last evaluation 3.944s ago, evaluation time 36.04ms)

Rule State Error Last Evaluation Evaluation Time
alert: KubePodCrashLooping expr: rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) * 60 * 5 > 0 for: 1h labels: severity: critical annotations: message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping ok 3.944s ago 12.53ms
alert: KubePodNotReady expr: sum by(namespace, pod) (kube_pod_status_phase{job="kube-state-metrics",phase=~"Failed|Pending|Unknown"}) > 0 for: 1h labels: severity: critical annotations: message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than an hour. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready ok 3.932s ago 10.69ms
alert: KubeDeploymentGenerationMismatch expr: kube_deployment_status_observed_generation{job="kube-state-metrics"} != kube_deployment_metadata_generation{job="kube-state-metrics"} for: 15m labels: severity: critical annotations: message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match; this indicates that the Deployment has failed but has not been rolled back. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch ok 3.921s ago 4.503ms
alert: KubeDeploymentReplicasMismatch expr: kube_deployment_spec_replicas{job="kube-state-metrics"} != kube_deployment_status_replicas_available{job="kube-state-metrics"} for: 1h labels: severity: critical annotations: message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than an hour. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch ok 3.917s ago 4.39ms
alert: KubeStatefulSetReplicasMismatch expr: kube_statefulset_status_replicas_ready{job="kube-state-metrics"} != kube_statefulset_status_replicas{job="kube-state-metrics"} for: 15m labels: severity: critical annotations: message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch ok 3.913s ago 386us
alert: KubeStatefulSetGenerationMismatch expr: kube_statefulset_status_observed_generation{job="kube-state-metrics"} != kube_statefulset_metadata_generation{job="kube-state-metrics"} for: 15m labels: severity: critical annotations: message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match; this indicates that the StatefulSet has failed but has not been rolled back. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch ok 3.912s ago 395.6us
alert: KubeStatefulSetUpdateNotRolledOut expr: max without(revision) (kube_statefulset_status_current_revision{job="kube-state-metrics"} unless kube_statefulset_status_update_revision{job="kube-state-metrics"}) * (kube_statefulset_replicas{job="kube-state-metrics"} != kube_statefulset_status_replicas_updated{job="kube-state-metrics"}) for: 15m labels: severity: critical annotations: message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout ok 3.912s ago 718.7us
alert: KubeDaemonSetRolloutStuck expr: kube_daemonset_status_number_ready{job="kube-state-metrics"} / kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} * 100 < 100 for: 15m labels: severity: critical annotations: message: Only {{ $value }}% of the desired Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are scheduled and ready. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck ok 3.912s ago 245us
alert: KubeDaemonSetNotScheduled expr: kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} - kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0 for: 10m labels: severity: warning annotations: message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled ok 3.912s ago 184.1us
alert: KubeDaemonSetMisScheduled expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0 for: 10m labels: severity: warning annotations: message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled ok 3.912s ago 98.78us
alert: KubeCronJobRunning expr: time() - kube_cronjob_next_schedule_time{job="kube-state-metrics"} > 3600 for: 1h labels: severity: warning annotations: message: CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecronjobrunning ok 3.912s ago 90.31us
alert: KubeJobCompletion expr: kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0 for: 1h labels: severity: warning annotations: message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than one hour to complete. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion ok 3.912s ago 882.1us
alert: KubeJobFailed expr: kube_job_status_failed{job="kube-state-metrics"} > 0 for: 1h labels: severity: warning annotations: message: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed ok 3.911s ago 887.8us
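
KubePodCrashLooping multiplies a per-second restart rate by 60 * 5 to express restarts per five minutes, which is exactly what the annotation reports. A sketch of that row in rule-file form:

groups:
  - name: kubernetes-apps
    rules:
      - alert: KubePodCrashLooping
        # rate() is per-second; * 60 * 5 converts it into restarts per 5 minutes
        expr: rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) * 60 * 5 > 0
        for: 1h
        labels:
          severity: critical
        annotations:
          message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping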

kubernetes-resources (last evaluation 19.253s ago, evaluation time 10.47ms)

Rule State Error Last Evaluation Evaluation Time
alert: KubeCPUOvercommit expr: sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum) / sum(kube_node_status_allocatable_cpu_cores) > (count(kube_node_status_allocatable_cpu_cores) - 1) / count(kube_node_status_allocatable_cpu_cores) for: 5m labels: severity: warning annotations: message: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit ok 19.253s ago 1.197ms
alert: KubeMemOvercommit expr: sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum) / sum(kube_node_status_allocatable_memory_bytes) > (count(kube_node_status_allocatable_memory_bytes) - 1) / count(kube_node_status_allocatable_memory_bytes) for: 5m labels: severity: warning annotations: message: Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit ok 19.252s ago 1.65ms
alert: KubeCPUOvercommit expr: sum(kube_resourcequota{job="kube-state-metrics",resource="cpu",type="hard"}) / sum(kube_node_status_allocatable_cpu_cores) > 1.5 for: 5m labels: severity: warning annotations: message: Cluster has overcommitted CPU resource requests for Namespaces. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit ok 19.251s ago 384.9us
alert: KubeMemOvercommit expr: sum(kube_resourcequota{job="kube-state-metrics",resource="memory",type="hard"}) / sum(kube_node_status_allocatable_memory_bytes{job="node-exporter"}) > 1.5 for: 5m labels: severity: warning annotations: message: Cluster has overcommitted memory resource requests for Namespaces. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit ok 19.25s ago 257.1us
alert: KubeQuotaExceeded expr: 100 * kube_resourcequota{job="kube-state-metrics",type="used"} / ignoring(instance, job, type) (kube_resourcequota{job="kube-state-metrics",type="hard"} > 0) > 90 for: 15m labels: severity: warning annotations: message: Namespace {{ $labels.namespace }} is using {{ printf "%0.0f" $value }}% of its {{ $labels.resource }} quota. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded ok 19.25s ago 314.2us
alert: CPUThrottlingHigh expr: 100 * sum by(container, pod, namespace) (increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) / sum by(container, pod, namespace) (increase(container_cpu_cfs_periods_total[5m])) > 25 for: 15m labels: severity: warning annotations: message: '{{ printf "%0.0f" $value }}% throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh ok 19.25s ago 6.636ms
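
The KubeCPUOvercommit and KubeMemOvercommit thresholds, (count(nodes) - 1) / count(nodes), encode the "cannot tolerate node failure" wording in the annotations: requests may use at most the capacity that would remain if one node were lost. A sketch with the arithmetic spelled out for a hypothetical four-node cluster:

groups:
  - name: kubernetes-resources
    rules:
      - alert: KubeCPUOvercommit
        # with N nodes reporting allocatable CPU the threshold is (N - 1) / N,
        # e.g. 3/4 = 0.75 of total allocatable CPU on a hypothetical 4-node cluster
        expr: sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum) / sum(kube_node_status_allocatable_cpu_cores) > (count(kube_node_status_allocatable_cpu_cores) - 1) / count(kube_node_status_allocatable_cpu_cores)
        for: 5m
        labels:
          severity: warning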

kubernetes-storage (last evaluation 9.088s ago, evaluation time 13.26ms)

Rule State Error Last Evaluation Evaluation Time
alert: KubePersistentVolumeUsageCritical expr: 100 * kubelet_volume_stats_available_bytes{job="kubelet"} / kubelet_volume_stats_capacity_bytes{job="kubelet"} < 3 for: 1m labels: severity: critical annotations: message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ printf "%0.2f" $value }}% free. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical ok 9.088s ago 2.625ms
alert: KubePersistentVolumeFullInFourDays expr: 100 * (kubelet_volume_stats_available_bytes{job="kubelet"} / kubelet_volume_stats_capacity_bytes{job="kubelet"}) < 15 and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0 for: 5m labels: severity: critical annotations: message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ printf "%0.2f" $value }}% is available. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays ok 9.085s ago 8.373ms
alert: KubePersistentVolumeErrors expr: kube_persistentvolume_status_phase{job="kube-state-metrics",phase=~"Failed|Pending"} > 0 for: 5m labels: severity: critical annotations: message: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors ok 9.077s ago 2.237ms

kubernetes-system (last evaluation 28.312s ago, evaluation time 57.08ms)

Rule State Error Last Evaluation Evaluation Time
alert: KubeNodeNotReady expr: kube_node_status_condition{condition="Ready",job="kube-state-metrics",status="true"} == 0 for: 1h labels: severity: warning annotations: message: '{{ $labels.node }} has been unready for more than an hour.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready ok 28.312s ago 354.3us
alert: KubeVersionMismatch expr: count(count by(gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"}, "gitVersion", "$1", "gitVersion", "(v[0-9]*.[0-9]*.[0-9]*).*"))) > 1 for: 1h labels: severity: warning annotations: message: There are {{ $value }} different semantic versions of Kubernetes components running. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch ok 28.311s ago 388.1us
alert: KubeClientErrors expr: (sum by(instance, job) (rate(rest_client_requests_total{code=~"5.."}[5m])) / sum by(instance, job) (rate(rest_client_requests_total[5m]))) * 100 > 1 for: 15m labels: severity: warning annotations: message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ printf "%0.0f" $value }}% errors. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors ok 28.311s ago 1.239ms
alert: KubeClientErrors expr: sum by(instance, job) (rate(ksm_scrape_error_total{job="kube-state-metrics"}[5m])) > 0.1 for: 15m labels: severity: warning annotations: message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ printf "%0.0f" $value }} errors / second. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors ok 28.31s ago 107.3us
alert: KubeletTooManyPods expr: kubelet_running_pod_count{job="kubelet"} > 110 * 0.9 for: 15m labels: severity: warning annotations: message: Kubelet {{ $labels.instance }} is running {{ $value }} Pods, close to the limit of 110. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods ok 28.31s ago 66.78us
alert: KubeAPILatencyHigh expr: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1 for: 10m labels: severity: warning annotations: message: The API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh ok 28.31s ago 1.685ms
alert: KubeAPILatencyHigh expr: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4 for: 10m labels: severity: critical annotations: message: The API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh ok 28.309s ago 1.473ms
alert: KubeAPIErrorsHigh expr: sum(rate(apiserver_request_total{code=~"^(?:5..)$",job="apiserver"}[5m])) / sum(rate(apiserver_request_total{job="apiserver"}[5m])) * 100 > 3 for: 10m labels: severity: critical annotations: message: API server is returning errors for {{ $value }}% of requests. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh ok 28.308s ago 12.77ms
alert: KubeAPIErrorsHigh expr: sum(rate(apiserver_request_total{code=~"^(?:5..)$",job="apiserver"}[5m])) / sum(rate(apiserver_request_total{job="apiserver"}[5m])) * 100 > 1 for: 10m labels: severity: warning annotations: message: API server is returning errors for {{ $value }}% of requests. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh ok 28.295s ago 11.93ms
alert: KubeAPIErrorsHigh expr: sum by(resource, subresource, verb) (rate(apiserver_request_total{code=~"^(?:5..)$",job="apiserver"}[5m])) / sum by(resource, subresource, verb) (rate(apiserver_request_total{job="apiserver"}[5m])) * 100 > 10 for: 10m labels: severity: critical annotations: message: API server is returning errors for {{ $value }}% of requests for {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh ok 28.283s ago 12.72ms
alert: KubeAPIErrorsHigh expr: sum by(resource, subresource, verb) (rate(apiserver_request_total{code=~"^(?:5..)$",job="apiserver"}[5m])) / sum by(resource, subresource, verb) (rate(apiserver_request_total{job="apiserver"}[5m])) * 100 > 5 for: 10m labels: severity: warning annotations: message: API server is returning errors for {{ $value }}% of requests for {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh ok 28.271s ago 12.63ms
alert: KubeClientCertificateExpiration expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by(job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800 labels: severity: warning annotations: message: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration ok 28.258s ago 898.7us
alert: KubeClientCertificateExpiration expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by(job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 labels: severity: critical annotations: message: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration ok 28.258s ago 769us

node-network (last evaluation 6.194s ago, evaluation time 6.661ms)

Rule State Error Last Evaluation Evaluation Time
alert: NetworkReceiveErrors expr: rate(node_network_receive_errs_total{device!~"veth.+",job="node-exporter"}[2m]) > 0 for: 2m labels: severity: warning annotations: message: Network interface "{{ $labels.device }}" is showing receive errors on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}. ok 6.194s ago 2.548ms
alert: NetworkTransmitErrors expr: rate(node_network_transmit_errs_total{device!~"veth.+",job="node-exporter"}[2m]) > 0 for: 2m labels: severity: warning annotations: message: Network interface "{{ $labels.device }}" is showing transmit errors on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}. ok 6.192s ago 2.121ms
alert: NodeNetworkInterfaceFlapping expr: changes(node_network_up{device!~"veth.+",job="node-exporter"}[2m]) > 2 for: 2m labels: severity: warning annotations: message: Network interface "{{ $labels.device }}" is changing its up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}. ok 6.19s ago 1.965ms

node-time (last evaluation 10.407s ago, evaluation time 656.1us)

Rule State Error Last Evaluation Evaluation Time
alert: ClockSkewDetected expr: abs(node_timex_offset_seconds{job="node-exporter"}) > 0.05 for: 2m labels: severity: warning annotations: message: Clock skew detected on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}. Ensure NTP is configured correctly on this host. ok 10.407s ago 640.1us

node.rules (last evaluation 25.639s ago, evaluation time 119.5ms)

Rule State Error Last Evaluation Evaluation Time
record: :kube_pod_info_node_count: expr: sum(min by(node) (kube_pod_info)) ok 25.639s ago 4.925ms
record: node_namespace_pod:kube_pod_info: expr: max by(node, namespace, pod) (label_replace(kube_pod_info{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")) ok 25.635s ago 11.16ms
record: node:node_num_cpu:sum expr: count by(node) (sum by(node, cpu) (node_cpu_seconds_total{job="node-exporter"} * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:)) ok 25.623s ago 29.39ms
record: :node_cpu_utilisation:avg1m expr: 1 - avg(rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m])) ok 25.594s ago 2.64ms
record: node:node_cpu_utilisation:avg1m expr: 1 - avg by(node) (rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m]) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) ok 25.592s ago 5.691ms
record: node:cluster_cpu_utilisation:ratio expr: node:node_cpu_utilisation:avg1m * node:node_num_cpu:sum / scalar(sum(node:node_num_cpu:sum)) ok 25.586s ago 271.3us
record: :node_cpu_saturation_load1: expr: sum(node_load1{job="node-exporter"}) / sum(node:node_num_cpu:sum) ok 25.586s ago 205.1us
record: node:node_cpu_saturation_load1: expr: sum by(node) (node_load1{job="node-exporter"} * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) / node:node_num_cpu:sum ok 25.586s ago 5.34ms
record: :node_memory_utilisation: expr: 1 - sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"}) / sum(node_memory_MemTotal_bytes{job="node-exporter"}) ok 25.581s ago 745.3us
record: :node_memory_MemFreeCachedBuffers_bytes:sum expr: sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"}) ok 25.58s ago 421.2us
record: :node_memory_MemTotal_bytes:sum expr: sum(node_memory_MemTotal_bytes{job="node-exporter"}) ok 25.58s ago 137.8us
record: node:node_memory_bytes_available:sum expr: sum by(node) ((node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"}) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) ok 25.58s ago 5.935ms
record: node:node_memory_bytes_total:sum expr: sum by(node) (node_memory_MemTotal_bytes{job="node-exporter"} * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) ok 25.574s ago 4.5ms
record: node:node_memory_utilisation:ratio expr: (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum) / node:node_memory_bytes_total:sum ok 25.569s ago 339.1us
record: node:cluster_memory_utilisation:ratio expr: (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum) / scalar(sum(node:node_memory_bytes_total:sum)) ok 25.569s ago 222.4us
record: :node_memory_swap_io_bytes:sum_rate expr: 1000 * sum((rate(node_vmstat_pgpgin{job="node-exporter"}[1m]) + rate(node_vmstat_pgpgout{job="node-exporter"}[1m]))) ok 25.569s ago 296us
record: node:node_memory_utilisation: expr: 1 - sum by(node) ((node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"}) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) / sum by(node) (node_memory_MemTotal_bytes{job="node-exporter"} * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) ok 25.569s ago 7.494ms
record: node:node_memory_utilisation_2: expr: 1 - (node:node_memory_bytes_available:sum / node:node_memory_bytes_total:sum) ok 25.561s ago 300.5us
record: node:node_memory_swap_io_bytes:sum_rate expr: 1000 * sum by(node) ((rate(node_vmstat_pgpgin{job="node-exporter"}[1m]) + rate(node_vmstat_pgpgout{job="node-exporter"}[1m])) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) ok 25.561s ago 4.917ms
record: :node_disk_utilisation:avg_irate expr: avg(irate(node_disk_io_time_seconds_total{device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+",job="node-exporter"}[1m])) ok 25.556s ago 2.324ms
record: node:node_disk_utilisation:avg_irate expr: avg by(node) (irate(node_disk_io_time_seconds_total{device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+",job="node-exporter"}[1m]) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) ok 25.554s ago 6.031ms
record: :node_disk_saturation:avg_irate expr: avg(irate(node_disk_io_time_weighted_seconds_total{device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+",job="node-exporter"}[1m])) ok 25.548s ago 1.601ms
record: node:node_disk_saturation:avg_irate expr: avg by(node) (irate(node_disk_io_time_weighted_seconds_total{device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+",job="node-exporter"}[1m]) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) ok 25.547s ago 4.213ms
record: node:node_filesystem_usage: expr: max by(instance, namespace, pod, device) ((node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} - node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) ok 25.543s ago 489.4us
record: node:node_filesystem_avail: expr: max by(instance, namespace, pod, device) (node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) ok 25.542s ago 299.8us
record: :node_net_utilisation:sum_irate expr: sum(irate(node_network_receive_bytes_total{device!~"veth.+",job="node-exporter"}[1m])) + sum(irate(node_network_transmit_bytes_total{device!~"veth.+",job="node-exporter"}[1m])) ok 25.542s ago 2.533ms
record: node:node_net_utilisation:sum_irate expr: sum by(node) ((irate(node_network_receive_bytes_total{device!~"veth.+",job="node-exporter"}[1m]) + irate(node_network_transmit_bytes_total{device!~"veth.+",job="node-exporter"}[1m])) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) ok 25.54s ago 4.741ms
record: :node_net_saturation:sum_irate expr: sum(irate(node_network_receive_drop_total{device!~"veth.+",job="node-exporter"}[1m])) + sum(irate(node_network_transmit_drop_total{device!~"veth.+",job="node-exporter"}[1m])) ok 25.535s ago 1.656ms
record: node:node_net_saturation:sum_irate expr: sum by(node) ((irate(node_network_receive_drop_total{device!~"veth.+",job="node-exporter"}[1m]) + irate(node_network_transmit_drop_total{device!~"veth.+",job="node-exporter"}[1m])) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) ok 25.533s ago 4.361ms
record: node:node_inodes_total: expr: max by(node) (max by(node, host_ip) (kube_pod_info{host_ip!="",job="kube-state-metrics"}) * on(host_ip) group_right(node) label_replace((max by(instance) (node_filesystem_files{job="node-exporter",mountpoint="/"})), "host_ip", "$1", "instance", "(.*):.*")) ok 25.529s ago 3.216ms
record: node:node_inodes_free: expr: max by(node) (max by(node, host_ip) (kube_pod_info{host_ip!="",job="kube-state-metrics"}) * on(host_ip) group_right(node) label_replace((max by(instance) (node_filesystem_files_free{job="node-exporter",mountpoint="/"})), "host_ip", "$1", "instance", "(.*):.*")) ok 25.526s ago 2.977ms
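
The node:node_filesystem_usage: and node:node_filesystem_avail: series consumed by the NodeDiskRunningFull alerts further up are defined in this group. A sketch of the pair in rule-file form (the record names end in a colon, so they are quoted here to keep the YAML unambiguous):

groups:
  - name: node.rules
    rules:
      - record: 'node:node_filesystem_usage:'
        # fraction of each ext/btrfs/xfs/zfs filesystem already used
        expr: max by(instance, namespace, pod, device) ((node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} - node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
      - record: 'node:node_filesystem_avail:'
        # fraction still available, fed into predict_linear by NodeDiskRunningFull
        expr: max by(instance, namespace, pod, device) (node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})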

prometheus-operator (last evaluation 28.868s ago, evaluation time 915.4us)

Rule State Error Last Evaluation Evaluation Time
alert: PrometheusOperatorReconcileErrors expr: rate(prometheus_operator_reconcile_errors_total{job="prometheus-prometheus-oper-operator",namespace="monitoring"}[5m]) > 0.1 for: 10m labels: severity: warning annotations: message: Errors while reconciling {{ $labels.controller }} in {{ $labels.namespace }} Namespace. ok 28.869s ago 613.6us
alert: PrometheusOperatorNodeLookupErrors expr: rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-prometheus-oper-operator",namespace="monitoring"}[5m]) > 0.1 for: 10m labels: severity: warning annotations: message: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace. ok 28.868s ago 280.4us

prometheus (last evaluation 25.378s ago, evaluation time 3.94ms)

Rule State Error Last Evaluation Evaluation Time
alert: PrometheusBadConfig expr: max_over_time(prometheus_config_last_reload_successful{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) == 0 for: 10m labels: severity: critical annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to reload its configuration. summary: Failed Prometheus configuration reload. ok 25.378s ago 434.1us
alert: PrometheusNotificationQueueRunningFull expr: (predict_linear(prometheus_notifications_queue_length{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m], 60 * 30) > min_over_time(prometheus_notifications_queue_capacity{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])) for: 15m labels: severity: warning annotations: description: Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}} is running full. summary: Prometheus alert notification queue predicted to run full in less than 30m. ok 25.378s ago 335.7us
alert: PrometheusErrorSendingAlertsToSomeAlertmanagers expr: (rate(prometheus_notifications_errors_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])) * 100 > 1 for: 15m labels: severity: warning annotations: description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}.' summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager. ok 25.378s ago 419.5us
alert: PrometheusErrorSendingAlertsToAnyAlertmanager expr: min without(alertmanager) (rate(prometheus_notifications_errors_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])) * 100 > 3 for: 15m labels: severity: critical annotations: description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.' summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager. ok 25.377s ago 341.2us
alert: PrometheusNotConnectedToAlertmanagers expr: max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) < 1 for: 10m labels: severity: warning annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to any Alertmanagers. summary: Prometheus is not connected to any Alertmanagers. ok 25.377s ago 164us
alert: PrometheusTSDBReloadsFailing expr: increase(prometheus_tsdb_reloads_failures_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[3h]) > 0 for: 4h labels: severity: warning annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} reload failures over the last 3h. summary: Prometheus has issues reloading blocks from disk. ok 25.377s ago 340.8us
alert: PrometheusTSDBCompactionsFailing expr: increase(prometheus_tsdb_compactions_failed_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[3h]) > 0 for: 4h labels: severity: warning annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} compaction failures over the last 3h. summary: Prometheus has issues compacting blocks. ok 25.377s ago 267.5us
alert: PrometheusTSDBWALCorruptions expr: increase(tsdb_wal_corruptions_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[3h]) > 0 for: 4h labels: severity: warning annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} corruptions of the write-ahead log (WAL) over the last 3h. summary: Prometheus is detecting WAL corruptions. ok 25.377s ago 154.4us
alert: PrometheusNotIngestingSamples expr: rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) <= 0 for: 10m labels: severity: warning annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting samples. summary: Prometheus is not ingesting samples. ok 25.377s ago 166.3us
alert: PrometheusDuplicateTimestamps expr: rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) > 0 for: 10m labels: severity: warning annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{$value | humanize}} samples/s with different values but duplicated timestamp. summary: Prometheus is dropping samples with duplicate timestamps. ok 25.377s ago 158.5us
alert: PrometheusOutOfOrderTimestamps expr: rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) > 0 for: 10m labels: severity: warning annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{$value | humanize}} samples/s with timestamps arriving out of order. summary: Prometheus drops samples with out-of-order timestamps. ok 25.377s ago 146.8us
alert: PrometheusRemoteStorageFailures expr: (rate(prometheus_remote_storage_failed_samples_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) / (rate(prometheus_remote_storage_failed_samples_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) + rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]))) * 100 > 1 for: 15m labels: severity: critical annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send {{ printf "%.1f" $value }}% of the samples to queue {{$labels.queue}}. summary: Prometheus fails to send samples to remote storage. ok 25.377s ago 345.2us
alert: PrometheusRemoteWriteBehind expr: (max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) - on(job, instance) group_right() max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])) > 120 for: 15m labels: severity: critical annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write is {{ printf "%.1f" $value }}s behind for queue {{$labels.queue}}. summary: Prometheus remote write is behind. ok 25.377s ago 290.1us
alert: PrometheusRuleFailures expr: increase(prometheus_rule_evaluation_failures_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) > 0 for: 15m labels: severity: critical annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to evaluate {{ printf "%.0f" $value }} rules in the last 5m. summary: Prometheus is failing rule evaluations. ok 25.377s ago 170.3us
alert: PrometheusMissingRuleEvaluations expr: increase(prometheus_rule_group_iterations_missed_total{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) > 0 for: 15m labels: severity: warning annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed {{ printf "%.0f" $value }} rule group evaluations in the last 5m. summary: Prometheus is missing rule evaluations due to slow rule group evaluation. ok 25.377s ago 167.5us
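
PrometheusRemoteWriteBehind compares the newest sample timestamp Prometheus has ingested with the newest timestamp each remote-write queue has actually sent; group_right() keeps the labels of the right-hand, per-queue series, so the alert fires per lagging queue once the gap exceeds 120 seconds. A sketch of that rule in rule-file form:

groups:
  - name: prometheus
    rules:
      - alert: PrometheusRemoteWriteBehind
        # left: highest ingested timestamp; right: highest timestamp sent per queue
        expr: (max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m]) - on(job, instance) group_right() max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus-prometheus-oper-prometheus",namespace="monitoring"}[5m])) > 120
        for: 15m
        labels:
          severity: critical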