/etc/alerts.d/node_alerting_rules.yml > container_cpu_usage_is_high

/etc/alerts.d/node_alerting_rules.yml > container_memory_usage_is_high

/etc/alerts.d/node_alerting_rules.yml > node_cpu_greater_than_80

/etc/alerts.d/node_alerting_rules.yml > node_disk_space_too_low

/etc/alerts.d/node_alerting_rules.yml > node_down

alert: NODE_DOWN
expr: up{component="node-exporter"} == 0
for: 3m
labels:
  severity: warning
annotations:
  description: '{{ $labels.job }} job failed to scrape instance {{ $labels.instance }} for more than 3 minutes. Node seems to be down.'
  summary: Node {{ $labels.kubernetes_node }} is down
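
Each entry in this listing is the body of a single Prometheus alerting rule; in the source file it sits under the usual groups/rules scaffolding. A minimal sketch of how the node_down group above would look as a complete rule file (the scaffolding is assumed, since only rule bodies appear in this listing):

groups:
  - name: node_down
    rules:
      - alert: NODE_DOWN
        expr: up{component="node-exporter"} == 0
        for: 3m
        labels:
          severity: warning
        annotations:
          description: '{{ $labels.job }} job failed to scrape instance {{ $labels.instance }} for more than 3 minutes. Node seems to be down.'
          summary: Node {{ $labels.kubernetes_node }} is down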
/etc/alerts.d/node_alerting_rules.yml > node_memory_left_lessser_than_10

/etc/alerts.d/spin_alerting_rules.yml > Front50-cache

/etc/alerts.d/spin_alerting_rules.yml > autopilot-component-jvm-errors

alert: jvm-memory-filling-up-for-oes-audit-client
expr: (sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_used_bytes{app="oes",area="heap",component="auditclient"}) / sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_max_bytes{app="oes",area="heap",component="auditclient"})) * 100 > 90
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    JVM memory is filling up for {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }} (> 90%)
    VALUE = {{ $value }}
  summary: JVM memory filling up for {{ $labels.component }} for pod {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }}

alert: jvm-memory-filling-up-for-oes-autopilot
expr: (sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_used_bytes{app="oes",area="heap",component="autopilot"}) / sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_max_bytes{app="oes",area="heap",component="autopilot"})) * 100 > 90
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    JVM memory is filling up for {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }} (> 90%)
    VALUE = {{ $value }}
  summary: JVM memory filling up for {{ $labels.component }} for pod {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }}

alert: jvm-memory-filling-up-for-oes-dashboard
expr: (sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_used_bytes{app="oes",area="heap",component="dashboard"}) / sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_max_bytes{app="oes",area="heap",component="dashboard"})) * 100 > 90
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    JVM memory is filling up for {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }} (> 90%)
    VALUE = {{ $value }}
  summary: JVM memory filling up for {{ $labels.component }} for pod {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }}

alert: jvm-memory-filling-up-for-oes-platform
expr: (sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_used_bytes{app="oes",area="heap",component="platform"}) / sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_max_bytes{app="oes",area="heap",component="platform"})) * 100 > 90
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    JVM memory is filling up for {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }} (> 90%)
    VALUE = {{ $value }}
  summary: JVM memory filling up for {{ $labels.component }} for pod {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }}

alert: jvm-memory-filling-up-for-oes-sapor
expr: (sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_used_bytes{app="oes",area="heap",component="sapor"}) / sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_max_bytes{app="oes",area="heap",component="sapor"})) * 100 > 90
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    JVM memory is filling up for {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }} (> 90%)
    VALUE = {{ $value }}
  summary: JVM memory filling up for {{ $labels.component }} for pod {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }}

alert: jvm-memory-filling-up-for-oes-visibility
expr: (sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_used_bytes{app="oes",area="heap",component="visibility"}) / sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_max_bytes{app="oes",area="heap",component="visibility"})) * 100 > 90
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    JVM memory is filling up for {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }} (> 90%)
    VALUE = {{ $value }}
  summary: JVM memory filling up for {{ $labels.component }} for pod {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }}
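
The six rules above differ only in their component matcher, and component is already part of the by() clause. A single rule without the matcher would therefore fire once per component with the same threshold; a sketch (assuming no other app="oes" components export heap metrics that should stay unalerted):

alert: jvm-memory-filling-up-for-oes-component
expr: (sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_used_bytes{app="oes",area="heap"}) / sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_max_bytes{app="oes",area="heap"})) * 100 > 90
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    JVM memory is filling up for {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }} (> 90%)
    VALUE = {{ $value }}
  summary: JVM memory filling up for {{ $labels.component }} for pod {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }}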
/etc/alerts.d/spin_alerting_rules.yml > autopilot-component-latency-too-high

alert: oes-audit-client-latency-too-high
expr: sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_sum{component="auditclient"}[2m])) / sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_count{component="auditclient"}[2m])) > 0.5
for: 2m
labels:
  severity: warning
annotations:
  description: Latency of the component {{ $labels.component }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the component {{ $labels.component }} in namespace {{ $labels.kubernetes_namespace }} is high

alert: oes-autopilot-latency-too-high
expr: sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_sum{component="autopilot"}[2m])) / sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_count{component="autopilot"}[2m])) > 0.5
for: 2m
labels:
  severity: warning
annotations:
  description: Latency of the component {{ $labels.component }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the component {{ $labels.component }} in namespace {{ $labels.kubernetes_namespace }} is high

alert: oes-dashboard-latency-too-high
expr: sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_sum{component="dashboard"}[2m])) / sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_count{component="dashboard"}[2m])) > 0.5
for: 2m
labels:
  severity: warning
annotations:
  description: Latency of the component {{ $labels.component }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the component {{ $labels.component }} in namespace {{ $labels.kubernetes_namespace }} is high

alert: oes-platform-latency-too-high
expr: sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_sum{component="platform"}[2m])) / sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_count{component="platform"}[2m])) > 0.5
for: 2m
labels:
  severity: warning
annotations:
  description: Latency of the component {{ $labels.component }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the component {{ $labels.component }} in namespace {{ $labels.kubernetes_namespace }} is high

alert: oes-sapor-latency-too-high
expr: sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_sum{component="sapor"}[2m])) / sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_count{component="sapor"}[2m])) > 0.5
for: 2m
labels:
  severity: warning
annotations:
  description: Latency of the component {{ $labels.component }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the component {{ $labels.component }} in namespace {{ $labels.kubernetes_namespace }} is high

alert: oes-visibility-latency-too-high
expr: sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_sum{component="visibility"}[2m])) / sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_count{component="visibility"}[2m])) > 0.5
for: 2m
labels:
  severity: warning
annotations:
  description: Latency of the component {{ $labels.component }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the component {{ $labels.component }} in namespace {{ $labels.kubernetes_namespace }} is high
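
The expressions above alert on mean latency (rate of http_server_requests_seconds_sum divided by rate of http_server_requests_seconds_count), which can hide slow outliers behind many fast requests. If the services also export histogram buckets for this metric (an assumption; Micrometer only emits _bucket series when percentile histograms are enabled), a 95th-percentile variant for one component would look like:

histogram_quantile(0.95, sum by(le, component, kubernetes_namespace) (rate(http_server_requests_seconds_bucket{component="autopilot"}[2m]))) > 0.5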
/etc/alerts.d/spin_alerting_rules.yml > autopilot-scrape-target-is-down

/etc/alerts.d/spin_alerting_rules.yml > igor-needs-attention

/etc/alerts.d/spin_alerting_rules.yml > jvm-too-high

/etc/alerts.d/spin_alerting_rules.yml > kube-api-server-is-down

/etc/alerts.d/spin_alerting_rules.yml > kubernetes-api-server-experiencing-high-error-rate

/etc/alerts.d/spin_alerting_rules.yml > latency-too-high

alert: clouddriver-caching-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(clouddriver_caching:controller:invocations__total{service="spin-clouddriver-caching"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(clouddriver_caching:controller:invocations__count_total{service="spin-clouddriver-caching"}[5m])) > 5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: clouddriver-ro-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(clouddriver_ro:controller:invocations__total{service="spin-clouddriver-ro"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(clouddriver_ro:controller:invocations__count_total{service="spin-clouddriver-ro"}[5m])) > 1
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: clouddriver-rw-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(clouddriver_rw:controller:invocations__total{service="spin-clouddriver-rw"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(clouddriver_rw:controller:invocations__count_total{service="spin-clouddriver-rw"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: clouddriver_ro_deck-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(clouddriver_ro_deck:controller:invocations__total{service="spin-clouddriver-ro-deck"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(clouddriver_ro_deck:controller:invocations__count_total{service="spin-clouddriver-ro-deck"}[5m])) > 5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: echo_scheduler-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(echo_scheduler:controller:invocations__total{service="spin-echo-scheduler"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(echo_scheduler:controller:invocations__count_total{service="spin-echo-scheduler"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: echo_worker-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(echo_worker:controller:invocations__total{service="spin-echo-worker"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(echo_worker:controller:invocations__count_total{service="spin-echo-worker"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: fiat-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(fiat:controller:invocations__total{service="spin-fiat"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(fiat:controller:invocations__count_total{service="spin-fiat"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: front50-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(front50:controller:invocations__total{service="spin-front50"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(front50:controller:invocations__count_total{service="spin-front50"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: gate-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(gate:controller:invocations__total{service="spin-gate"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(gate:controller:invocations__count_total{service="spin-gate"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: igor-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(igor:controller:invocations__total{service="spin-igor"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(igor:controller:invocations__count_total{service="spin-igor"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: orca-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(orca:controller:invocations__total{service="spin-orca"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(orca:controller:invocations__count_total{service="spin-orca"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: rosco-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(rosco:controller:invocations__total{service="spin-rosco"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(rosco:controller:invocations__count_total{service="spin-rosco"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high
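
All twelve rules share the same shape: mean controller latency over five minutes, thresholded per service. To preview current values in the Prometheus console before a rule fires, the comparison can be dropped and the division wrapped in topk; a sketch using the gate metrics from above:

topk(5, sum by(controller, method, service, namespace) (rate(gate:controller:invocations__total{service="spin-gate"}[5m])) / sum by(controller, method, service, namespace) (rate(gate:controller:invocations__count_total{service="spin-gate"}[5m])))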
/etc/alerts.d/spin_alerting_rules.yml > orca-queue-issue

/etc/alerts.d/spin_alerting_rules.yml > prometheus-job-down

alert: prometheus-job-is-down
expr: up{job="prometheus"} == 0
for: 5m
labels:
  severity: warning
annotations:
  description: Default Prometheus job is down. LABELS = {{ $labels }}
  summary: The default Prometheus job is down (job {{ $labels.job }})
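
Note that this rule self-monitors: if Prometheus itself is down, nothing evaluates the rule, so it can never fire during a full outage. A common complement is an always-firing watchdog whose disappearance downstream (in Alertmanager or a paging service) signals a dead alerting pipeline; a minimal sketch of that pattern:

alert: Watchdog
expr: vector(1)
labels:
  severity: none
annotations:
  summary: Always firing; if this alert stops arriving downstream, Prometheus or the alerting pipeline is down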
/etc/alerts.d/spin_alerting_rules.yml > spinnaker-service-is-down

/etc/alerts.d/spin_alerting_rules.yml > volume-is-almost-full (< 10% left)

State: firing
Active Since: 2025-12-24 06:49:05.000257301 +0000 UTC
Value: 0

Labels:
  alertname="pvc-storage-full"
  beta_kubernetes_io_arch="amd64"
  beta_kubernetes_io_instance_type="io1-15"
  beta_kubernetes_io_os="linux"
  failure_domain_beta_kubernetes_io_region="IAD"
  instance="prod-instance-1761572832308"
  job="kubernetes-nodes"
  kubernetes_io_arch="amd64"
  kubernetes_io_hostname="prod-instance-1761572832308"
  kubernetes_io_os="linux"
  namespace="forstairwell"
  node_kubernetes_io_instance_type="io1-15"
  nodepool_ngpc_rxt_io_name="72c2db09-dbb2-4768-805b-7a6b062d9df5"
  persistentvolumeclaim="isd-prometheus-server"
  servers_ngpc_rxt_io_class="gp.vs1.large-iad"
  servers_ngpc_rxt_io_k8sNodeName="prod-instance-1761572832308"
  servers_ngpc_rxt_io_type="spot"
  severity="warning"
  topology_cinder_csi_openstack_org_zone="nova"
  topology_kubernetes_io_region="IAD"

Annotations:
  description: |-
    Volume is almost full (< 10% left)
    VALUE = 0
    LABELS = map[beta_kubernetes_io_arch:amd64 beta_kubernetes_io_instance_type:io1-15 beta_kubernetes_io_os:linux failure_domain_beta_kubernetes_io_region:IAD instance:prod-instance-1761572832308 job:kubernetes-nodes kubernetes_io_arch:amd64 kubernetes_io_hostname:prod-instance-1761572832308 kubernetes_io_os:linux namespace:forstairwell node_kubernetes_io_instance_type:io1-15 nodepool_ngpc_rxt_io_name:72c2db09-dbb2-4768-805b-7a6b062d9df5 persistentvolumeclaim:isd-prometheus-server servers_ngpc_rxt_io_class:gp.vs1.large-iad servers_ngpc_rxt_io_k8sNodeName:prod-instance-1761572832308 servers_ngpc_rxt_io_type:spot topology_cinder_csi_openstack_org_zone:nova topology_kubernetes_io_region:IAD]
  summary: Kubernetes Volume running out of disk space for (persistentvolumeclaim isd-prometheus-server in namespace forstairwell)
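
The rule expression behind this firing alert is not shown in the listing, but the labels (job kubernetes-nodes, a persistentvolumeclaim label) suggest it is built on the kubelet's volume-stats metrics. A sketch of a matching expression, under that assumption:

kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10

With that expression, the VALUE = 0 in the annotation would mean the volume reports 0% of its capacity still available.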