Alerts


/etc/alerts.d/node_alerting_rules.yml > container_cpu_usage_is_high
POD_CPU_IS_HIGH (0 active)
alert: POD_CPU_IS_HIGH
expr: sum by(container, pod, namespace) (rate(container_cpu_usage_seconds_total{container!=""}[5m])) * 100 > 90
for: 1m
labels:
  severity: critical
annotations:
  description: Container {{ $labels.container }} CPU usage inside POD {{ $labels.pod }} is high in namespace {{ $labels.namespace }}
  summary: POD {{ $labels.pod }} CPU usage is high in namespace {{ $labels.namespace }}
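The view above flattens each rule; in the rules file itself the same alert sits inside a rule group. A minimal sketch of how this first rule would be laid out in /etc/alerts.d/node_alerting_rules.yml, assuming the group name shown in the header:

groups:
  - name: container_cpu_usage_is_high
    rules:
      - alert: POD_CPU_IS_HIGH
        expr: sum by(container, pod, namespace) (rate(container_cpu_usage_seconds_total{container!=""}[5m])) * 100 > 90
        for: 1m
        labels:
          severity: critical
        annotations:
          description: Container {{ $labels.container }} CPU usage inside POD {{ $labels.pod }} is high in namespace {{ $labels.namespace }}
          summary: POD {{ $labels.pod }} CPU usage is high in namespace {{ $labels.namespace }}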
/etc/alerts.d/node_alerting_rules.yml > container_memory_usage_is_high
POD_MEMORY_USAGE_IS_HIGH (0 active)
alert: POD_MEMORY_USAGE_IS_HIGH
expr: (sum by(container, pod, namespace) (container_memory_working_set_bytes{container!=""}) / sum by(container, pod, namespace) (container_spec_memory_limit_bytes > 0) * 100) > 80
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    Container Memory usage is above 80%
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Container {{ $labels.container }} memory usage inside POD {{ $labels.pod }} is high in namespace {{ $labels.namespace }}
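Note that the denominator keeps only series where container_spec_memory_limit_bytes > 0, so containers running without a memory limit can never trigger this alert. An illustrative query (not part of the rule set) that can be run in the expression browser to list such containers:

count by(namespace, pod, container) (container_spec_memory_limit_bytes{container!=""} == 0)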
/etc/alerts.d/node_alerting_rules.yml > node_cpu_greater_than_80
NODE_CPU_IS_HIGH (0 active)
alert: NODE_CPU_IS_HIGH
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90
for: 1m
labels:
  severity: critical
annotations:
  description: Node {{ $labels.kubernetes_node }} CPU usage is high
  summary: Node CPU usage is greater than 90 percent
/etc/alerts.d/node_alerting_rules.yml > node_disk_space_too_low
NODE_DISK_SPACE_IS_LOW (0 active)
alert: NODE_DISK_SPACE_IS_LOW
expr: (100 * ((node_filesystem_avail_bytes{fstype!="rootfs",mountpoint="/"}) / (node_filesystem_size_bytes{fstype!="rootfs",mountpoint="/"}))) < 10
for: 1m
labels:
  severity: critical
annotations:
  description: Node {{ $labels.node }} disk space is only {{ printf "%0.2f" $value }}% free.
  summary: Node disk space remaining is less than 10 percent
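This expression only watches the root filesystem (mountpoint="/"). If other mountpoints should be covered as well, the selector can be widened; a hedged variant (the tmpfs exclusion is an assumption, adjust to the filesystems actually in use):

(100 * node_filesystem_avail_bytes{fstype!~"rootfs|tmpfs"} / node_filesystem_size_bytes{fstype!~"rootfs|tmpfs"}) < 10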
/etc/alerts.d/node_alerting_rules.yml > node_down
NODE_DOWN (0 active)
alert: NODE_DOWN
expr: up{component="node-exporter"} == 0
for: 3m
labels:
  severity: warning
annotations:
  description: '{{ $labels.job }} job failed to scrape instance {{ $labels.instance }} for more than 3 minutes. Node seems to be down'
  summary: Node {{ $labels.kubernetes_node }} is down
/etc/alerts.d/node_alerting_rules.yml > node_memory_left_lessser_than_10
NODE_MEMORY_LESS_THAN_10% (4 active)
alert: NODE_MEMORY_LESS_THAN_10%
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
for: 1m
labels:
  severity: critical
annotations:
  description: Node {{ $labels.kubernetes_node }} has low available memory
  summary: Node available memory is less than 10 percent
Labels State Active Since Value
alertname="NODE_MEMORY_LESS_THAN_10%" app_kubernetes_io_component="metrics" app_kubernetes_io_instance="monitor" app_kubernetes_io_managed_by="Helm" app_kubernetes_io_name="prometheus-node-exporter" app_kubernetes_io_part_of="prometheus-node-exporter" app_kubernetes_io_version="1.9.1" helm_sh_chart="prometheus-node-exporter-4.46.1" instance="162.209.125.31:9100" job="kubernetes-service-endpoints" jobLabel="node-exporter" kubernetes_name="monitor-prometheus-node-exporter" kubernetes_namespace="spinhar16" kubernetes_node="prod-instance-17630262671353084" release="monitor" severity="critical" firing 2025-12-23 11:24:34.345417764 +0000 UTC 9.417531783150848
alertname="NODE_MEMORY_LESS_THAN_10%" app="prometheus" chart="prometheus-11.16.2" component="node-exporter" heritage="Helm" instance="162.209.125.97:9100" job="kubernetes-service-endpoints" kubernetes_name="isd-prometheus-node-exporter" kubernetes_namespace="isdall202508" kubernetes_node="prod-instance-1757954925652" release="isd" severity="critical" pending 2025-12-23 11:27:34.345417764 +0000 UTC 7.178954172416676
alertname="NODE_MEMORY_LESS_THAN_10%" app_kubernetes_io_component="metrics" app_kubernetes_io_instance="monitor" app_kubernetes_io_managed_by="Helm" app_kubernetes_io_name="prometheus-node-exporter" app_kubernetes_io_part_of="prometheus-node-exporter" app_kubernetes_io_version="1.9.1" helm_sh_chart="prometheus-node-exporter-4.46.1" instance="162.209.125.18:9100" job="kubernetes-service-endpoints" jobLabel="node-exporter" kubernetes_name="monitor-prometheus-node-exporter" kubernetes_namespace="spinhar16" kubernetes_node="prod-instance-17630262671063083" release="monitor" severity="critical" pending 2025-12-23 11:27:34.345417764 +0000 UTC 4.694559603776086
alertname="NODE_MEMORY_LESS_THAN_10%" app_kubernetes_io_component="metrics" app_kubernetes_io_instance="monitor" app_kubernetes_io_managed_by="Helm" app_kubernetes_io_name="prometheus-node-exporter" app_kubernetes_io_part_of="prometheus-node-exporter" app_kubernetes_io_version="1.9.1" helm_sh_chart="prometheus-node-exporter-4.46.1" instance="162.209.124.122:9100" job="kubernetes-service-endpoints" jobLabel="node-exporter" kubernetes_name="monitor-prometheus-node-exporter" kubernetes_namespace="spinhar16" kubernetes_node="prod-instance-1760074260256" release="monitor" severity="critical" firing 2025-12-23 11:24:34.345417764 +0000 UTC 6.595103088689165
/etc/alerts.d/spin_alerting_rules.yml > Front50-cache
front50:storageServiceSupport:cacheAge__value (0 active)
alert: front50:storageServiceSupport:cacheAge__value
expr: front50:storageServiceSupport:cacheAge__value > 300000
for: 2m
labels:
  severity: warning
annotations:
  description: front50 cacheAge for {{$labels.pod}} in namespace {{$labels.namespace}} has value = {{$value}}
  summary: front50 cacheAge too high
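Assuming cacheAge is reported in milliseconds (the usual unit for this front50 metric), the threshold of 300000 corresponds to a cache that has not refreshed for 5 minutes:

300000 ms / 1000 = 300 s = 5 minutes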
/etc/alerts.d/spin_alerting_rules.yml > autopilot-component-jvm-errors
jvm-memory-filling-up-for-oes-audit-client (0 active)
alert: jvm-memory-filling-up-for-oes-audit-client
expr: (sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_used_bytes{app="oes",area="heap",component="auditclient"}) / sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_max_bytes{app="oes",area="heap",component="auditclient"})) * 100 > 90
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    JVM memory is filling up for {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }} (> 90%)
      VALUE = {{ $value }}
  summary: JVM memory is filling up for {{ $labels.component }} pod {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }}
jvm-memory-filling-up-for-oes-autopilot (0 active)
alert: jvm-memory-filling-up-for-oes-autopilot
expr: (sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_used_bytes{app="oes",area="heap",component="autopilot"}) / sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_max_bytes{app="oes",area="heap",component="autopilot"})) * 100 > 90
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    JVM memory is filling up for {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }} (> 90%)
      VALUE = {{ $value }}
  summary: JVM memory is filling up for {{ $labels.component }} pod {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }}
jvm-memory-filling-up-for-oes-dashboard (0 active)
alert: jvm-memory-filling-up-for-oes-dashboard
expr: (sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_used_bytes{app="oes",area="heap",component="dashboard"}) / sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_max_bytes{app="oes",area="heap",component="dashboard"})) * 100 > 90
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    JVM memory is filling up for {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }} (> 90%)
      VALUE = {{ $value }}
  summary: JVM memory is filling up for {{ $labels.component }} pod {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }}
jvm-memory-filling-up-for-oes-platform (0 active)
alert: jvm-memory-filling-up-for-oes-platform
expr: (sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_used_bytes{app="oes",area="heap",component="platform"}) / sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_max_bytes{app="oes",area="heap",component="platform"})) * 100 > 90
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    JVM memory is filling up for {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }} (> 90%)
      VALUE = {{ $value }}
  summary: JVM memory is filling up for {{ $labels.component }} pod {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }}
jvm-memory-filling-up-for-oes-sapor (0 active)
alert: jvm-memory-filling-up-for-oes-sapor
expr: (sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_used_bytes{app="oes",area="heap",component="sapor"}) / sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_max_bytes{app="oes",area="heap",component="sapor"})) * 100 > 90
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    JVM memory is filling up for {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }} (> 90%)
      VALUE = {{ $value }}
  summary: JVM memory is filling up for {{ $labels.component }} pod {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }}
jvm-memory-filling-up-for-oes-visibility (0 active)
alert: jvm-memory-filling-up-for-oes-visibility
expr: (sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_used_bytes{app="oes",area="heap",component="visibility"}) / sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_max_bytes{app="oes",area="heap",component="visibility"})) * 100 > 90
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    JVM memory is filling up for {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }} (> 90%)
      VALUE = {{ $value }}
  summary: JVM memory is filling up for {{ $labels.component }} pod {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }}
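The six rules above differ only in the component selector. If per-component alert names are not required (the component label already identifies the service), they could be collapsed into a single rule with a regex matcher; a sketch under that assumption, with a hypothetical alert name:

alert: jvm-memory-filling-up-for-oes-component
expr: (sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_used_bytes{app="oes",area="heap",component=~"auditclient|autopilot|dashboard|platform|sapor|visibility"}) / sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_max_bytes{app="oes",area="heap",component=~"auditclient|autopilot|dashboard|platform|sapor|visibility"})) * 100 > 90
for: 5m
labels:
  severity: warning
annotations:
  description: JVM memory is filling up for {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }} (> 90%)
  summary: JVM memory is filling up for {{ $labels.component }} pod {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }}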
/etc/alerts.d/spin_alerting_rules.yml > autopilot-component-latency-too-high
oes-audit-client-latency-too-high (0 active)
alert: oes-audit-client-latency-too-high
expr: sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_sum{component="auditclient"}[2m])) / sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_count{component="auditclient"}[2m])) > 0.5
for: 2m
labels:
  severity: warning
annotations:
  description: Latency of the component {{ $labels.component }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the component {{ $labels.component }} in namespace {{$labels.kubernetes_namespace}} is high
oes-autopilot-latency-too-high (0 active)
alert: oes-autopilot-latency-too-high
expr: sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_sum{component="autopilot"}[2m])) / sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_count{component="autopilot"}[2m])) > 0.5
for: 2m
labels:
  severity: warning
annotations:
  description: Latency of the component {{ $labels.component }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the component {{ $labels.component }} in namespace {{$labels.kubernetes_namespace}} is high
oes-dashboard-latency-too-high (0 active)
alert: oes-dashboard-latency-too-high
expr: sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_sum{component="dashboard"}[2m])) / sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_count{component="dashboard"}[2m])) > 0.5
for: 2m
labels:
  severity: warning
annotations:
  description: Latency of the component {{ $labels.component }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the component {{ $labels.component }} in namespace {{$labels.kubernetes_namespace}} is high
oes-platform-latency-too-high (0 active)
alert: oes-platform-latency-too-high
expr: sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_sum{component="platform"}[2m])) / sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_count{component="platform"}[2m])) > 0.5
for: 2m
labels:
  severity: warning
annotations:
  description: Latency of the component {{ $labels.component }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the component {{ $labels.component }} in namespace {{$labels.kubernetes_namespace}} is high
oes-sapor-latency-too-high (0 active)
alert: oes-sapor-latency-too-high
expr: sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_sum{component="sapor"}[2m])) / sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_count{component="sapor"}[2m])) > 0.5
for: 2m
labels:
  severity: warning
annotations:
  description: Latency of the component {{ $labels.component }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the component {{ $labels.component }} in namespace {{$labels.kubernetes_namespace}} is high
oes-visibility-latency-too-high (0 active)
alert: oes-visibility-latency-too-high
expr: sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_sum{component="visibility"}[2m])) / sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_count{component="visibility"}[2m])) > 0.5
for: 2m
labels:
  severity: warning
annotations:
  description: Latency of the component {{ $labels.component }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the component {{ $labels.component }} in namespace {{$labels.kubernetes_namespace}} is high
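Each of these alerts compares the average request latency over 2 minutes (sum of request seconds divided by request count) against 0.5 s. If the Micrometer histogram buckets (http_server_requests_seconds_bucket) are also scraped — an assumption, they may not be enabled — a percentile-based variant would catch latency spikes that an average hides, for example:

histogram_quantile(0.95, sum by(le, component, kubernetes_namespace) (rate(http_server_requests_seconds_bucket{component="auditclient"}[2m]))) > 0.5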
/etc/alerts.d/spin_alerting_rules.yml > autopilot-scrape-target-is-down
oes-audit-client-scrape-target-is-down (1 active)
alert: oes-audit-client-scrape-target-is-down
expr: up{component="auditclient"} == 0
labels:
  severity: critical
annotations:
  description: The scrape target endpoint of component {{$labels.component}} in namespace {{$labels.kubernetes_namespace}} is down
  summary: oes-audit-client scrape target is down
Labels State Active Since Value
alertname="oes-audit-client-scrape-target-is-down" app="oes" chart="oes-4.0.29" component="auditclient" heritage="Helm" instance="10.20.173.140:8098" job="kubernetes-pods" kubernetes_namespace="isdupg4043to2508" kubernetes_pod_name="oes-audit-client-6bd5494c96-xnxcf" pod_template_hash="6bd5494c96" release="isd" severity="critical" firing 2025-12-23 11:19:13.633722481 +0000 UTC 0
oes-autopilot-scrape-target-is-down (2 active)
alert: oes-autopilot-scrape-target-is-down
expr: up{component="autopilot"} == 0
labels:
  severity: critical
annotations:
  description: The scrape target endpoint of component {{$labels.component}} in namespace {{$labels.kubernetes_namespace}} is down
  summary: oes-autopilot scrape target is down
Labels State Active Since Value
alertname="oes-autopilot-scrape-target-is-down" app="oes" chart="oes-4.0.29" component="autopilot" heritage="Helm" instance="10.20.173.156:8090" job="kubernetes-pods" kubernetes_namespace="isdspin251000" kubernetes_pod_name="oes-autopilot-69cb658b65-78d84" pod_template_hash="69cb658b65" release="isd" severity="critical" firing 2025-12-23 11:18:13.633722481 +0000 UTC 0
alertname="oes-autopilot-scrape-target-is-down" app="oes" chart="oes-4.0.29" component="autopilot" heritage="Helm" instance="10.20.173.146:8090" job="kubernetes-pods" kubernetes_namespace="isdupg4043to2508" kubernetes_pod_name="oes-autopilot-6ccd5c6f99-8bq4d" pod_template_hash="6ccd5c6f99" release="isd" severity="critical" firing 2025-12-23 11:19:13.633722481 +0000 UTC 0
oes-dashboard-scrape-target-is-down (1 active)
alert: oes-dashboard-scrape-target-is-down
expr: up{component="dashboard"} == 0
labels:
  severity: critical
annotations:
  description: The scrape target endpoint of component {{$labels.component}} in namespace {{$labels.kubernetes_namespace}} is down
  summary: oes-dashboard scrape target is down
Labels State Active Since Value
alertname="oes-dashboard-scrape-target-is-down" app="oes" chart="oes-4.0.29" component="dashboard" heritage="Helm" instance="10.20.173.141:8094" job="kubernetes-pods" kubernetes_namespace="isdupg4043to2508" kubernetes_pod_name="oes-dashboard-845698bdd7-hdv5z" pod_template_hash="845698bdd7" release="isd" severity="critical" firing 2025-12-23 11:19:13.633722481 +0000 UTC 0
oes-platform-scrape-target-is-down (1 active)
alert: oes-platform-scrape-target-is-down
expr: up{component="platform"} == 0
labels:
  severity: critical
annotations:
  description: The scrape target endpoint of component {{$labels.component}} in namespace {{$labels.kubernetes_namespace}} is down
  summary: oes-platform scrape target is down
Labels State Active Since Value
alertname="oes-platform-scrape-target-is-down" app="oes" chart="oes-4.0.29" component="platform" heritage="Helm" instance="10.20.173.189:8095" job="kubernetes-pods" kubernetes_namespace="isdupg4043to2508" kubernetes_pod_name="oes-platform-5d86cfc5bc-g4cz8" pod_template_hash="5d86cfc5bc" release="isd" severity="critical" firing 2025-12-23 11:19:13.633722481 +0000 UTC 0
oes-sapor-scrape-target-is-down (1 active)
alert: oes-sapor-scrape-target-is-down
expr: up{component="sapor"} == 0
labels:
  severity: critical
annotations:
  description: The scrape target endpoint of component {{$labels.component}} in namespace {{$labels.kubernetes_namespace}} is down
  summary: oes-sapor scrape target is down
Labels State Active Since Value
alertname="oes-sapor-scrape-target-is-down" app="oes" chart="oes-4.0.29" component="sapor" heritage="Helm" instance="10.20.173.134:8085" job="kubernetes-pods" kubernetes_namespace="isdspin251000" kubernetes_pod_name="oes-sapor-56865cdb8-6mdwh" pod_template_hash="56865cdb8" release="isd" severity="critical" firing 2025-12-23 11:19:13.633722481 +0000 UTC 0
oes-visibility-scrape-target-is-down (0 active)
alert: oes-visibility-scrape-target-is-down
expr: up{component="visibility"} == 0
labels:
  severity: critical
annotations:
  description: The scrape target endpoint of component {{$labels.component}} in namespace {{$labels.kubernetes_namespace}} is down
  summary: oes-visibility scrape target is down
/etc/alerts.d/spin_alerting_rules.yml > igor-needs-attention
igor-needs-attention (0 active)
alert: igor-needs-attention
expr: igor:pollingMonitor:itemsOverThreshold__value > 0
labels:
  severity: critical
annotations:
  description: Igor in namespace {{$labels.namespace}} needs human help
  summary: Igor needs attention
/etc/alerts.d/spin_alerting_rules.yml > jvm-too-high
clouddriver-caching-pod-may-be-evicted-soon (0 active)
alert: clouddriver-caching-pod-may-be-evicted-soon
expr: (sum by(instance, area) (clouddriver_caching:jvm:memory:used__value) / sum by(instance, area) (clouddriver_caching:jvm:memory:max__value)) > 0.9
labels:
  severity: warning
annotations:
  description: Service {{$labels.service}} in namespace {{$labels.namespace}} may be evicted soon
  summary: Clouddriver-caching JVM memory too high
clouddriver-ro-pod-may-be-evicted-soon (0 active)
alert: clouddriver-ro-pod-may-be-evicted-soon
expr: (sum by(instance, area) (clouddriver_ro:jvm:memory:used__value) / sum by(instance, area) (clouddriver_ro:jvm:memory:max__value)) > 0.9
labels:
  severity: warning
annotations:
  description: Service {{$labels.service}} in namespace {{$labels.namespace}} may be evicted soon
  summary: Clouddriver-ro JVM memory too high
clouddriver-rw-pod-may-be-evicted-soon (0 active)
alert: clouddriver-rw-pod-may-be-evicted-soon
expr: (sum by(instance, area) (clouddriver_rw:jvm:memory:used__value) / sum by(instance, area) (clouddriver_rw:jvm:memory:max__value)) > 0.9
labels:
  severity: warning
annotations:
  description: Service {{$labels.service}} in namespace {{$labels.namespace}} may be evicted soon
  summary: Clouddriver-rw JVM memory too high
echo-scheduler-pod-may-be-evicted-soon (0 active)
alert: echo-scheduler-pod-may-be-evicted-soon
expr: (sum by(instance, area) (echo_scheduler:jvm:memory:used__value) / sum by(instance, area) (echo_scheduler:jvm:memory:max__value)) > 0.9
labels:
  severity: warning
annotations:
  description: Service {{$labels.service}} in namespace {{$labels.namespace}} may be evicted soon
  summary: echo-scheduler JVM memory too high
echo-worker-pod-may-be-evicted-soon (0 active)
alert: echo-worker-pod-may-be-evicted-soon
expr: (sum by(instance, area) (echo_worker:jvm:memory:used__value) / sum by(instance, area) (echo_worker:jvm:memory:max__value)) > 0.9
labels:
  severity: warning
annotations:
  description: Service {{$labels.service}} in namespace {{$labels.namespace}} may be evicted soon
  summary: echo-worker JVM memory too high
front50-pod-may-be-evicted-soon (0 active)
alert: front50-pod-may-be-evicted-soon
expr: (sum by(instance, area) (front50:jvm:memory:used__value) / sum by(instance, area) (front50:jvm:memory:max__value)) > 0.9
labels:
  severity: warning
annotations:
  description: Service {{$labels.service}} in namespace {{$labels.namespace}} may be evicted soon
  summary: Front50 JVM memory too high
gate-pod-may-be-evicted-soon (0 active)
alert: gate-pod-may-be-evicted-soon
expr: (sum by(instance, area) (gate:jvm:memory:used__value) / sum by(instance, area) (gate:jvm:memory:max__value)) > 0.9
labels:
  severity: warning
annotations:
  description: Service {{$labels.service}} in namespace {{$labels.namespace}} may be evicted soon
  summary: gate JVM memory too high
igor-pod-may-be-evicted-soon (0 active)
alert: igor-pod-may-be-evicted-soon
expr: (sum by(instance, area) (igor:jvm:memory:used__value) / sum by(instance, area) (igor:jvm:memory:max__value)) > 0.9
labels:
  severity: warning
annotations:
  description: Service {{$labels.service}} in namespace {{$labels.namespace}} may be evicted soon
  summary: igor JVM memory too high
orca-pod-may-be-evicted-soon (0 active)
alert: orca-pod-may-be-evicted-soon
expr: (sum by(instance, area) (orca:jvm:gc:liveDataSize__value) / sum by(instance, area) (orca:jvm:gc:maxDataSize__value)) > 0.9
labels:
  severity: warning
annotations:
  description: Service {{$labels.service}} in namespace {{$labels.namespace}} may be evicted soon
  summary: orca JVM memory too high
/etc/alerts.d/spin_alerting_rules.yml > kube-api-server-is-down
kube-api-server-down (0 active)
alert: kube-api-server-down
expr: up{job="kubernetes-apiservers"} == 0
for: 2m
labels:
  severity: critical
annotations:
  description: The Kubernetes API server went down. LABELS = {{ $labels }}
  summary: Kube API Server job {{ $labels.job }} is down
/etc/alerts.d/spin_alerting_rules.yml > kubernetes-api-server-experiencing-high-error-rate
kube-api-server-errors (0 active)
alert: kube-api-server-errors
expr: sum(rate(apiserver_request_total{code=~"^(?:5..)$",job="kubernetes-apiservers"}[2m])) / sum(rate(apiserver_request_total{job="kubernetes-apiservers"}[2m])) * 100 > 3
for: 2m
labels:
  severity: critical
annotations:
  description: |-
    Kubernetes API server is experiencing high error rate
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Kubernetes API server errors (instance {{ $labels.instance }})
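As a worked example of the threshold: if the API server handles 200 requests per second and 8 of them per second return a 5xx code, the expression evaluates to 8 / 200 * 100 = 4%, which exceeds 3%, so the alert fires once the condition has held for 2 minutes.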
/etc/alerts.d/spin_alerting_rules.yml > latency-too-high
clouddriver-caching-latency-too-high (0 active)
clouddriver-ro-latency-too-high (0 active)
alert: clouddriver-ro-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(clouddriver_ro:controller:invocations__total{service="spin-clouddriver-ro"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(clouddriver_ro:controller:invocations__count_total{service="spin-clouddriver-ro"}[5m])) > 1
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the Service {{$labels.service}} is {{$value}} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{$labels.namespace}} is high
clouddriver-rw-latency-too-high (0 active)
alert: clouddriver-rw-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(clouddriver_rw:controller:invocations__total{service="spin-clouddriver-rw"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(clouddriver_rw:controller:invocations__count_total{service="spin-clouddriver-rw"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the Service {{$labels.service}} is {{$value}} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{$labels.namespace}} is high
clouddriver_ro_deck-latency-too-high (0 active)
echo_scheduler-latency-too-high (0 active)
alert: echo_scheduler-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(echo_scheduler:controller:invocations__total{service="spin-echo-scheduler"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(echo_scheduler:controller:invocations__count_total{service="spin-echo-scheduler"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the Service {{$labels.service}} is {{$value}} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{$labels.namespace}} is high
echo_worker-latency-too-high (0 active)
alert: echo_worker-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(echo_worker:controller:invocations__total{service="spin-echo-worker"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(echo_worker:controller:invocations__count_total{service="spin-echo-worker"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the Service {{$labels.service}} is {{$value}} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{$labels.namespace}} is high
fiat-latency-too-high (0 active)
alert: fiat-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(fiat:controller:invocations__total{service="spin-fiat"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(fiat:controller:invocations__count_total{service="spin-fiat"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the Service {{$labels.service}} is {{$value}} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{$labels.namespace}} is high
front50-latency-too-high (0 active)
alert: front50-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(front50:controller:invocations__total{service="spin-front50"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(front50:controller:invocations__count_total{service="spin-front50"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the Service {{$labels.service}} is {{$value}} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{$labels.namespace}} is high
gate-latency-too-high (0 active)
alert: gate-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(gate:controller:invocations__total{service="spin-gate"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(gate:controller:invocations__count_total{service="spin-gate"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the Service {{$labels.service}} is {{$value}} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{$labels.namespace}} is high
igor-latency-too-high (0 active)
alert: igor-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(igor:controller:invocations__total{service="spin-igor"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(igor:controller:invocations__count_total{service="spin-igor"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the Service {{$labels.service}} is {{$value}} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{$labels.namespace}} is high
orca-latency-too-high (0 active)
alert: orca-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(orca:controller:invocations__total{service="spin-orca"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(orca:controller:invocations__count_total{service="spin-orca"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the Service {{$labels.service}} is {{$value}} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{$labels.namespace}} is high
rosco-latency-too-high (0 active)
alert: rosco-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(rosco:controller:invocations__total{service="spin-rosco"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(rosco:controller:invocations__count_total{service="spin-rosco"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the Service {{$labels.service}} is {{$value}} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{$labels.namespace}} is high
/etc/alerts.d/spin_alerting_rules.yml > orca-queue-issue
orca-queue-depth-high (0 active)
alert: orca-queue-depth-high
expr: (sum by(instance) (orca:queue:ready:depth__value{namespace!=""})) > 10
labels:
  severity: warning
annotations:
  description: Orca queue depth for service {{$labels.service}} in namespace {{$labels.namespace}} is {{$value}}
  summary: Orca queue depth is high
orca-queue-lag-high (0 active)
alert: orca-queue-lag-high
expr: sum by(instance, service, namespace) (rate(orca:controller:invocations__total[2m])) / sum by(instance, service, namespace) (rate(orca:controller:invocations__count_total[2m])) > 0.5
labels:
  severity: warning
annotations:
  description: Service {{$labels.service}} in namespace {{$labels.namespace}} has Lag value of {{$value}}
  summary: Orca queue lag is high
/etc/alerts.d/spin_alerting_rules.yml > prometheus-job-down
prometheus-job-is-down (0 active)
alert: prometheus-job-is-down
expr: up{job="prometheus"} == 0
for: 5m
labels:
  severity: warning
annotations:
  description: The default Prometheus job is down. LABELS = {{ $labels }}
  summary: The Default Prometheus Job is Down (job {{ $labels.job}})
/etc/alerts.d/spin_alerting_rules.yml > spinnaker-service-is-down
clouddriver-caching-is-down (0 active)
alert: clouddriver-caching-is-down
expr: up{job="opsmx_spinnaker_metrics",service="spin-clouddriver-caching"} == 0
labels:
  severity: critical
annotations:
  description: Service {{$labels.service}} with pod name {{$labels.pod}} in namespace {{$labels.namespace}} is not responding
  summary: Clouddriver-caching Spinnaker service is down
clouddriver-ro-deck-is-down (0 active)
alert: clouddriver-ro-deck-is-down
expr: up{job="opsmx_spinnaker_metrics",service="spin-clouddriver-ro-deck"} == 0
labels:
  severity: critical
annotations:
  description: Service {{$labels.service}} with pod name {{$labels.pod}} in namespace {{$labels.namespace}} is not responding
  summary: Clouddriver-ro-deck Spinnaker service is down
clouddriver-ro-is-down (0 active)
alert: clouddriver-ro-is-down
expr: up{job="opsmx_spinnaker_metrics",service="spin-clouddriver-ro"} == 0
labels:
  severity: critical
annotations:
  description: Service {{$labels.service}} with pod name {{$labels.pod}} in namespace {{$labels.namespace}} is not responding
  summary: Clouddriver-ro Spinnaker service is down
clouddriver-rw-is-down (0 active)
alert: clouddriver-rw-is-down
expr: up{job="opsmx_spinnaker_metrics",service="spin-clouddriver-rw"} == 0
labels:
  severity: critical
annotations:
  description: Service {{$labels.service}} with pod name {{$labels.pod}} in namespace {{$labels.namespace}} is not responding
  summary: Clouddriver-rw Spinnaker service is down
echo-scheduler-is-down (0 active)
alert: echo-scheduler-is-down
expr: up{job="opsmx_spinnaker_metrics",service="spin-echo-scheduler"} == 0
labels:
  severity: critical
annotations:
  description: Service {{$labels.service}} with pod name {{$labels.pod}} in namespace {{$labels.namespace}} is not responding
  summary: Echo-Scheduler Spinnaker service is down
echo-worker-is-down (0 active)
alert: echo-worker-is-down
expr: up{job="opsmx_spinnaker_metrics",service="spin-echo-worker"} == 0
labels:
  severity: critical
annotations:
  description: Service {{$labels.service}} with pod name {{$labels.pod}} in namespace {{$labels.namespace}} is not responding
  summary: Echo-worker Spinnaker service is down
fiat-is-down (0 active)
alert: fiat-is-down
expr: up{job="opsmx_spinnaker_metrics",service="spin-fiat"} == 0
labels:
  severity: critical
annotations:
  description: Service {{$labels.service}} with pod name {{$labels.pod}} in namespace {{$labels.namespace}} is not responding
  summary: Fiat Spinnaker service is down
front50-is-down (0 active)
alert: front50-is-down
expr: up{job="opsmx_spinnaker_metrics",service="spin-front50"} == 0
labels:
  severity: critical
annotations:
  description: Service {{$labels.service}} with pod name {{$labels.pod}} in namespace {{$labels.namespace}} is not responding
  summary: Front50 Spinnaker service is down
gate-is-down (0 active)
alert: gate-is-down
expr: up{job="opsmx_spinnaker_metrics",service="spin-gate"} == 0
labels:
  severity: critical
annotations:
  description: Service {{$labels.service}} with pod name {{$labels.pod}} in namespace {{$labels.namespace}} is not responding
  summary: Gate Spinnaker service is down
igor-is-down (0 active)
alert: igor-is-down
expr: up{job="opsmx_spinnaker_metrics",service="spin-igor"} == 0
labels:
  severity: critical
annotations:
  description: Service {{$labels.service}} with pod name {{$labels.pod}} in namespace {{$labels.namespace}} is not responding
  summary: Igor Spinnaker service is down
orca-is-down (0 active)
alert: orca-is-down
expr: up{job="opsmx_spinnaker_metrics",service="spin-orca"} == 0
labels:
  severity: critical
annotations:
  description: Service {{$labels.service}} with pod name {{$labels.pod}} in namespace {{$labels.namespace}} is not responding
  summary: Orca Spinnaker service is down
rosco-is-down (0 active)
alert: rosco-is-down
expr: up{job="opsmx_spinnaker_metrics",service="spin-rosco"} == 0
labels:
  severity: critical
annotations:
  description: Service {{$labels.service}} with pod name {{$labels.pod}} in namespace {{$labels.namespace}} is not responding
  summary: Rosco Spinnaker service is down
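All of the rules in this group are identical apart from the service selector. Since every Spinnaker service label starts with spin-, they could in principle be replaced by a single rule with a regex matcher; a sketch under that assumption, with a hypothetical alert name (per-service alert names and summaries would be lost):

alert: spinnaker-service-is-down
expr: up{job="opsmx_spinnaker_metrics",service=~"spin-.*"} == 0
labels:
  severity: critical
annotations:
  description: Service {{$labels.service}} with pod name {{$labels.pod}} in namespace {{$labels.namespace}} is not responding
  summary: Spinnaker service {{$labels.service}} is down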
/etc/alerts.d/spin_alerting_rules.yml > volume-is-almost-full (< 10% left)
pvc-storage-full (1 active)
alert: pvc-storage-full
expr: kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Volume is almost full (< 10% left)
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Kubernetes volume is running out of disk space (persistentvolumeclaim {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }})
Labels State Active Since Value
alertname="pvc-storage-full" beta_kubernetes_io_arch="amd64" beta_kubernetes_io_instance_type="io1-15" beta_kubernetes_io_os="linux" failure_domain_beta_kubernetes_io_region="IAD" instance="prod-instance-1761572832308" job="kubernetes-nodes" kubernetes_io_arch="amd64" kubernetes_io_hostname="prod-instance-1761572832308" kubernetes_io_os="linux" namespace="forstairwell" node_kubernetes_io_instance_type="io1-15" nodepool_ngpc_rxt_io_name="72c2db09-dbb2-4768-805b-7a6b062d9df5" persistentvolumeclaim="isd-prometheus-server" servers_ngpc_rxt_io_class="gp.vs1.large-iad" servers_ngpc_rxt_io_k8sNodeName="prod-instance-1761572832308" servers_ngpc_rxt_io_type="spot" severity="warning" topology_cinder_csi_openstack_org_zone="nova" topology_kubernetes_io_region="IAD" firing 2025-12-23 11:19:05.000257301 +0000 UTC 0