/etc/alerts.d/node_alerting_rules.yml > container_cpu_usage_is_high
/etc/alerts.d/node_alerting_rules.yml > container_memory_usage_is_high
/etc/alerts.d/node_alerting_rules.yml > node_cpu_greater_than_80
/etc/alerts.d/node_alerting_rules.yml > node_disk_space_too_low
/etc/alerts.d/node_alerting_rules.yml > node_down
alert: NODE_DOWN
expr: up{component="node-exporter"} == 0
for: 3m
labels:
  severity: warning
annotations:
  description: '{{ $labels.job }} job failed to scrape instance {{ $labels.instance }} for more than 3 minutes. Node seems to be down.'
  summary: Node {{ $labels.kubernetes_node }} is down
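For context, Prometheus loads definitions like NODE_DOWN from a rule-group file referenced in prometheus.yml. A minimal sketch of how this rule would sit inside /etc/alerts.d/node_alerting_rules.yml; the group name and the rule_files glob are assumptions, not taken from the deployed configuration:

groups:
  - name: node_alerting_rules            # group name assumed
    rules:
      - alert: NODE_DOWN
        expr: up{component="node-exporter"} == 0
        for: 3m
        labels:
          severity: warning
        annotations:
          description: '{{ $labels.job }} job failed to scrape instance {{ $labels.instance }} for more than 3 minutes. Node seems to be down.'
          summary: Node {{ $labels.kubernetes_node }} is down

The file would then be picked up through something like rule_files: ['/etc/alerts.d/*.yml'] in prometheus.yml (again an assumption about this deployment).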
/etc/alerts.d/node_alerting_rules.yml > node_memory_left_lessser_than_10
Labels:
  alertname="NODE_MEMORY_LESS_THAN_10%"
  app_kubernetes_io_component="metrics"
  app_kubernetes_io_instance="monitor"
  app_kubernetes_io_managed_by="Helm"
  app_kubernetes_io_name="prometheus-node-exporter"
  app_kubernetes_io_part_of="prometheus-node-exporter"
  app_kubernetes_io_version="1.9.1"
  helm_sh_chart="prometheus-node-exporter-4.46.1"
  instance="162.209.125.31:9100"
  job="kubernetes-service-endpoints"
  jobLabel="node-exporter"
  kubernetes_name="monitor-prometheus-node-exporter"
  kubernetes_namespace="spinhar16"
  kubernetes_node="prod-instance-17630262671353084"
  release="monitor"
  severity="critical"
State: firing
Active Since: 2025-12-23 11:24:34.345417764 +0000 UTC
Value: 9.417531783150848
Annotations:
  description: node prod-instance-17630262671353084 memory left is low
  summary: node memory left is less than 10 percent

Labels:
  alertname="NODE_MEMORY_LESS_THAN_10%"
  app="prometheus"
  chart="prometheus-11.16.2"
  component="node-exporter"
  heritage="Helm"
  instance="162.209.125.97:9100"
  job="kubernetes-service-endpoints"
  kubernetes_name="isd-prometheus-node-exporter"
  kubernetes_namespace="isdall202508"
  kubernetes_node="prod-instance-1757954925652"
  release="isd"
  severity="critical"
State: pending
Active Since: 2025-12-23 11:27:34.345417764 +0000 UTC
Value: 7.178954172416676
Annotations:
  description: node prod-instance-1757954925652 memory left is low
  summary: node memory left is less than 10 percent

Labels:
  alertname="NODE_MEMORY_LESS_THAN_10%"
  app_kubernetes_io_component="metrics"
  app_kubernetes_io_instance="monitor"
  app_kubernetes_io_managed_by="Helm"
  app_kubernetes_io_name="prometheus-node-exporter"
  app_kubernetes_io_part_of="prometheus-node-exporter"
  app_kubernetes_io_version="1.9.1"
  helm_sh_chart="prometheus-node-exporter-4.46.1"
  instance="162.209.125.18:9100"
  job="kubernetes-service-endpoints"
  jobLabel="node-exporter"
  kubernetes_name="monitor-prometheus-node-exporter"
  kubernetes_namespace="spinhar16"
  kubernetes_node="prod-instance-17630262671063083"
  release="monitor"
  severity="critical"
State: pending
Active Since: 2025-12-23 11:27:34.345417764 +0000 UTC
Value: 4.694559603776086
Annotations:
  description: node prod-instance-17630262671063083 memory left is low
  summary: node memory left is less than 10 percent

Labels:
  alertname="NODE_MEMORY_LESS_THAN_10%"
  app_kubernetes_io_component="metrics"
  app_kubernetes_io_instance="monitor"
  app_kubernetes_io_managed_by="Helm"
  app_kubernetes_io_name="prometheus-node-exporter"
  app_kubernetes_io_part_of="prometheus-node-exporter"
  app_kubernetes_io_version="1.9.1"
  helm_sh_chart="prometheus-node-exporter-4.46.1"
  instance="162.209.124.122:9100"
  job="kubernetes-service-endpoints"
  jobLabel="node-exporter"
  kubernetes_name="monitor-prometheus-node-exporter"
  kubernetes_namespace="spinhar16"
  kubernetes_node="prod-instance-1760074260256"
  release="monitor"
  severity="critical"
State: firing
Active Since: 2025-12-23 11:24:34.345417764 +0000 UTC
Value: 6.595103088689165
Annotations:
  description: node prod-instance-1760074260256 memory left is low
  summary: node memory left is less than 10 percent
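The expression behind NODE_MEMORY_LESS_THAN_10% is not expanded in this dump. Given the node-exporter labels and the percentage-style values above, it is almost certainly a ratio of available to total memory; a minimal sketch using standard node-exporter metrics, with the threshold, for duration and exact metric names assumed rather than read from the rule file:

alert: NODE_MEMORY_LESS_THAN_10%
expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 < 10
for: 3m                                  # duration assumed
labels:
  severity: critical
annotations:
  description: node {{ $labels.kubernetes_node }} memory left is low
  summary: node memory left is less than 10 percent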
/etc/alerts.d/spin_alerting_rules.yml > Front50-cache
/etc/alerts.d/spin_alerting_rules.yml > autopilot-component-jvm-errors
alert: jvm-memory-filling-up-for-oes-audit-client
expr: (sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_used_bytes{app="oes",area="heap",component="auditclient"}) / sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_max_bytes{app="oes",area="heap",component="auditclient"})) * 100 > 90
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    JVM memory is filling up for {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }} (> 90%)
    VALUE = {{ $value }}
  summary: JVM memory filling up for {{ $labels.component }} for pod {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }}

alert: jvm-memory-filling-up-for-oes-autopilot
expr: (sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_used_bytes{app="oes",area="heap",component="autopilot"}) / sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_max_bytes{app="oes",area="heap",component="autopilot"})) * 100 > 90
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    JVM memory is filling up for {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }} (> 90%)
    VALUE = {{ $value }}
  summary: JVM memory filling up for {{ $labels.component }} for pod {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }}

alert: jvm-memory-filling-up-for-oes-dashboard
expr: (sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_used_bytes{app="oes",area="heap",component="dashboard"}) / sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_max_bytes{app="oes",area="heap",component="dashboard"})) * 100 > 90
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    JVM memory is filling up for {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }} (> 90%)
    VALUE = {{ $value }}
  summary: JVM memory filling up for {{ $labels.component }} for pod {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }}

alert: jvm-memory-filling-up-for-oes-platform
expr: (sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_used_bytes{app="oes",area="heap",component="platform"}) / sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_max_bytes{app="oes",area="heap",component="platform"})) * 100 > 90
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    JVM memory is filling up for {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }} (> 90%)
    VALUE = {{ $value }}
  summary: JVM memory filling up for {{ $labels.component }} for pod {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }}

alert: jvm-memory-filling-up-for-oes-sapor
expr: (sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_used_bytes{app="oes",area="heap",component="sapor"}) / sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_max_bytes{app="oes",area="heap",component="sapor"})) * 100 > 90
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    JVM memory is filling up for {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }} (> 90%)
    VALUE = {{ $value }}
  summary: JVM memory filling up for {{ $labels.component }} for pod {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }}

alert: jvm-memory-filling-up-for-oes-visibility
expr: (sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_used_bytes{app="oes",area="heap",component="visibility"}) / sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_max_bytes{app="oes",area="heap",component="visibility"})) * 100 > 90
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    JVM memory is filling up for {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }} (> 90%)
    VALUE = {{ $value }}
  summary: JVM memory filling up for {{ $labels.component }} for pod {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }}
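These six rules differ only in the component selector. If every app="oes" heap is meant to be covered, a single rule grouped by component would express the same check; a sketch under that assumption, not part of the deployed file:

alert: jvm-memory-filling-up-for-oes-component
expr: (sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_used_bytes{app="oes",area="heap"}) / sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_max_bytes{app="oes",area="heap"})) * 100 > 90
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    JVM memory is filling up for {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }} (> 90%)
    VALUE = {{ $value }}
  summary: JVM memory filling up for {{ $labels.component }} for pod {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }}

The trade-off: new oes components are picked up automatically, but individual components can no longer be silenced or tuned by alert name.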
/etc/alerts.d/spin_alerting_rules.yml > autopilot-component-latency-too-high
alert: oes-audit-client-latency-too-high
expr: sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_sum{component="auditclient"}[2m])) / sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_count{component="auditclient"}[2m])) > 0.5
for: 2m
labels:
  severity: warning
annotations:
  description: Latency of the component {{ $labels.component }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the component {{ $labels.component }} in namespace {{ $labels.kubernetes_namespace }} is high

alert: oes-autopilot-latency-too-high
expr: sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_sum{component="autopilot"}[2m])) / sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_count{component="autopilot"}[2m])) > 0.5
for: 2m
labels:
  severity: warning
annotations:
  description: Latency of the component {{ $labels.component }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the component {{ $labels.component }} in namespace {{ $labels.kubernetes_namespace }} is high

alert: oes-dashboard-latency-too-high
expr: sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_sum{component="dashboard"}[2m])) / sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_count{component="dashboard"}[2m])) > 0.5
for: 2m
labels:
  severity: warning
annotations:
  description: Latency of the component {{ $labels.component }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the component {{ $labels.component }} in namespace {{ $labels.kubernetes_namespace }} is high

alert: oes-platform-latency-too-high
expr: sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_sum{component="platform"}[2m])) / sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_count{component="platform"}[2m])) > 0.5
for: 2m
labels:
  severity: warning
annotations:
  description: Latency of the component {{ $labels.component }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the component {{ $labels.component }} in namespace {{ $labels.kubernetes_namespace }} is high

alert: oes-sapor-latency-too-high
expr: sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_sum{component="sapor"}[2m])) / sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_count{component="sapor"}[2m])) > 0.5
for: 2m
labels:
  severity: warning
annotations:
  description: Latency of the component {{ $labels.component }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the component {{ $labels.component }} in namespace {{ $labels.kubernetes_namespace }} is high

alert: oes-visibility-latency-too-high
expr: sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_sum{component="visibility"}[2m])) / sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_count{component="visibility"}[2m])) > 0.5
for: 2m
labels:
  severity: warning
annotations:
  description: Latency of the component {{ $labels.component }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the component {{ $labels.component }} in namespace {{ $labels.kubernetes_namespace }} is high
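Because each expression divides rate(http_server_requests_seconds_sum[2m]) by rate(http_server_requests_seconds_count[2m]), the value compared against 0.5 is the average request latency in seconds over the 2-minute window, not a percentile. If the Micrometer histogram buckets (http_server_requests_seconds_bucket) are exposed for these components, which this dump does not show, a tail-latency variant is possible; a sketch for one component under that assumption:

alert: oes-autopilot-p95-latency-too-high
expr: histogram_quantile(0.95, sum by(le, kubernetes_pod_name, component, kubernetes_namespace) (rate(http_server_requests_seconds_bucket{component="autopilot"}[5m]))) > 0.5
for: 2m
labels:
  severity: warning
annotations:
  description: p95 latency of the component {{ $labels.component }} is {{ $value }} seconds
  summary: p95 latency of the component {{ $labels.component }} in namespace {{ $labels.kubernetes_namespace }} is high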
/etc/alerts.d/spin_alerting_rules.yml > autopilot-scrape-target-is-down
Labels:
  alertname="oes-audit-client-scrape-target-is-down"
  app="oes"
  chart="oes-4.0.29"
  component="auditclient"
  heritage="Helm"
  instance="10.20.173.140:8098"
  job="kubernetes-pods"
  kubernetes_namespace="isdupg4043to2508"
  kubernetes_pod_name="oes-audit-client-6bd5494c96-xnxcf"
  pod_template_hash="6bd5494c96"
  release="isd"
  severity="critical"
State: firing
Active Since: 2025-12-23 11:19:13.633722481 +0000 UTC
Value: 0
Annotations:
  description: The scrape target endpoint of component auditclient in namespace isdupg4043to2508 is down
  summary: oes-audit-client scrape target is down

Labels:
  alertname="oes-autopilot-scrape-target-is-down"
  app="oes"
  chart="oes-4.0.29"
  component="autopilot"
  heritage="Helm"
  instance="10.20.173.156:8090"
  job="kubernetes-pods"
  kubernetes_namespace="isdspin251000"
  kubernetes_pod_name="oes-autopilot-69cb658b65-78d84"
  pod_template_hash="69cb658b65"
  release="isd"
  severity="critical"
State: firing
Active Since: 2025-12-23 11:18:13.633722481 +0000 UTC
Value: 0
Annotations:
  description: The scrape target endpoint of component autopilot in namespace isdspin251000 is down
  summary: oes-autopilot scrape target is down

Labels:
  alertname="oes-autopilot-scrape-target-is-down"
  app="oes"
  chart="oes-4.0.29"
  component="autopilot"
  heritage="Helm"
  instance="10.20.173.146:8090"
  job="kubernetes-pods"
  kubernetes_namespace="isdupg4043to2508"
  kubernetes_pod_name="oes-autopilot-6ccd5c6f99-8bq4d"
  pod_template_hash="6ccd5c6f99"
  release="isd"
  severity="critical"
State: firing
Active Since: 2025-12-23 11:19:13.633722481 +0000 UTC
Value: 0
Annotations:
  description: The scrape target endpoint of component autopilot in namespace isdupg4043to2508 is down
  summary: oes-autopilot scrape target is down

Labels:
  alertname="oes-dashboard-scrape-target-is-down"
  app="oes"
  chart="oes-4.0.29"
  component="dashboard"
  heritage="Helm"
  instance="10.20.173.141:8094"
  job="kubernetes-pods"
  kubernetes_namespace="isdupg4043to2508"
  kubernetes_pod_name="oes-dashboard-845698bdd7-hdv5z"
  pod_template_hash="845698bdd7"
  release="isd"
  severity="critical"
State: firing
Active Since: 2025-12-23 11:19:13.633722481 +0000 UTC
Value: 0
Annotations:
  description: The scrape target endpoint of component dashboard in namespace isdupg4043to2508 is down
  summary: oes-dashboard scrape target is down

Labels:
  alertname="oes-platform-scrape-target-is-down"
  app="oes"
  chart="oes-4.0.29"
  component="platform"
  heritage="Helm"
  instance="10.20.173.189:8095"
  job="kubernetes-pods"
  kubernetes_namespace="isdupg4043to2508"
  kubernetes_pod_name="oes-platform-5d86cfc5bc-g4cz8"
  pod_template_hash="5d86cfc5bc"
  release="isd"
  severity="critical"
State: firing
Active Since: 2025-12-23 11:19:13.633722481 +0000 UTC
Value: 0
Annotations:
  description: The scrape target endpoint of component platform in namespace isdupg4043to2508 is down
  summary: oes-platform scrape target is down

Labels:
  alertname="oes-sapor-scrape-target-is-down"
  app="oes"
  chart="oes-4.0.29"
  component="sapor"
  heritage="Helm"
  instance="10.20.173.134:8085"
  job="kubernetes-pods"
  kubernetes_namespace="isdspin251000"
  kubernetes_pod_name="oes-sapor-56865cdb8-6mdwh"
  pod_template_hash="56865cdb8"
  release="isd"
  severity="critical"
State: firing
Active Since: 2025-12-23 11:19:13.633722481 +0000 UTC
Value: 0
Annotations:
  description: The scrape target endpoint of component sapor in namespace isdspin251000 is down
  summary: oes-sapor scrape target is down
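The rule bodies for this group are collapsed in the dump, but the firing value of 0 and the kubernetes-pods job label point to a plain up check per component. A sketch of what one of these rules most likely looks like; the exact selector and for duration are assumptions:

alert: oes-autopilot-scrape-target-is-down
expr: up{job="kubernetes-pods", app="oes", component="autopilot"} == 0
for: 1m                                  # duration assumed
labels:
  severity: critical
annotations:
  description: The scrape target endpoint of component {{ $labels.component }} in namespace {{ $labels.kubernetes_namespace }} is down
  summary: oes-autopilot scrape target is down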
/etc/alerts.d/spin_alerting_rules.yml > igor-needs-attention
/etc/alerts.d/spin_alerting_rules.yml > jvm-too-high
/etc/alerts.d/spin_alerting_rules.yml > kube-api-server-is-down
/etc/alerts.d/spin_alerting_rules.yml > kubernetes-api-server-experiencing-high-error-rate
/etc/alerts.d/spin_alerting_rules.yml > latency-too-high
alert: clouddriver-caching-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(clouddriver_caching:controller:invocations__total{service="spin-clouddriver-caching"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(clouddriver_caching:controller:invocations__count_total{service="spin-clouddriver-caching"}[5m])) > 5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: clouddriver-ro-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(clouddriver_ro:controller:invocations__total{service="spin-clouddriver-ro"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(clouddriver_ro:controller:invocations__count_total{service="spin-clouddriver-ro"}[5m])) > 1
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: clouddriver-rw-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(clouddriver_rw:controller:invocations__total{service="spin-clouddriver-rw"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(clouddriver_rw:controller:invocations__count_total{service="spin-clouddriver-rw"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: clouddriver_ro_deck-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(clouddriver_ro_deck:controller:invocations__total{service="spin-clouddriver-ro-deck"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(clouddriver_ro_deck:controller:invocations__count_total{service="spin-clouddriver-ro-deck"}[5m])) > 5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: echo_scheduler-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(echo_scheduler:controller:invocations__total{service="spin-echo-scheduler"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(echo_scheduler:controller:invocations__count_total{service="spin-echo-scheduler"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: echo_worker-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(echo_worker:controller:invocations__total{service="spin-echo-worker"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(echo_worker:controller:invocations__count_total{service="spin-echo-worker"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: fiat-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(fiat:controller:invocations__total{service="spin-fiat"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(fiat:controller:invocations__count_total{service="spin-fiat"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: front50-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(front50:controller:invocations__total{service="spin-front50"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(front50:controller:invocations__count_total{service="spin-front50"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: gate-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(gate:controller:invocations__total{service="spin-gate"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(gate:controller:invocations__count_total{service="spin-gate"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: igor-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(igor:controller:invocations__total{service="spin-igor"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(igor:controller:invocations__count_total{service="spin-igor"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: orca-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(orca:controller:invocations__total{service="spin-orca"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(orca:controller:invocations__count_total{service="spin-orca"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: rosco-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(rosco:controller:invocations__total{service="spin-rosco"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(rosco:controller:invocations__count_total{service="spin-rosco"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high
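Each of these expressions recomputes the same rate()/rate() average-latency ratio inline (total seconds divided by total calls over 5 minutes, so > 0.5 means an average above 500 ms sustained for 15 minutes). A recording rule can precompute that ratio once per service and keep the alert expression short; a sketch for one service, where the recording rule name is an assumption and nothing like it appears in the dumped files:

record: gate:controller:latency_seconds:avg5m
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(gate:controller:invocations__total{service="spin-gate"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(gate:controller:invocations__count_total{service="spin-gate"}[5m]))

alert: gate-latency-too-high
expr: gate:controller:latency_seconds:avg5m > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high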
/etc/alerts.d/spin_alerting_rules.yml > orca-queue-issue
/etc/alerts.d/spin_alerting_rules.yml > prometheus-job-down
alert: prometheus-job-is-down
expr: up{job="prometheus"} == 0
for: 5m
labels:
  severity: warning
annotations:
  description: Default Prometheus Job is Down LABELS = {{ $labels }}
  summary: The Default Prometheus Job is Down (job {{ $labels.job }})
/etc/alerts.d/spin_alerting_rules.yml > spinnaker-service-is-down
/etc/alerts.d/spin_alerting_rules.yml > volume-is-almost-full (< 10% left)
Labels:
  alertname="pvc-storage-full"
  beta_kubernetes_io_arch="amd64"
  beta_kubernetes_io_instance_type="io1-15"
  beta_kubernetes_io_os="linux"
  failure_domain_beta_kubernetes_io_region="IAD"
  instance="prod-instance-1761572832308"
  job="kubernetes-nodes"
  kubernetes_io_arch="amd64"
  kubernetes_io_hostname="prod-instance-1761572832308"
  kubernetes_io_os="linux"
  namespace="forstairwell"
  node_kubernetes_io_instance_type="io1-15"
  nodepool_ngpc_rxt_io_name="72c2db09-dbb2-4768-805b-7a6b062d9df5"
  persistentvolumeclaim="isd-prometheus-server"
  servers_ngpc_rxt_io_class="gp.vs1.large-iad"
  servers_ngpc_rxt_io_k8sNodeName="prod-instance-1761572832308"
  servers_ngpc_rxt_io_type="spot"
  severity="warning"
  topology_cinder_csi_openstack_org_zone="nova"
  topology_kubernetes_io_region="IAD"
State: firing
Active Since: 2025-12-23 11:19:05.000257301 +0000 UTC
Value: 0
Annotations:
  description: |-
    Volume is almost full (< 10% left)
    VALUE = 0
    LABELS = map[beta_kubernetes_io_arch:amd64 beta_kubernetes_io_instance_type:io1-15 beta_kubernetes_io_os:linux failure_domain_beta_kubernetes_io_region:IAD instance:prod-instance-1761572832308 job:kubernetes-nodes kubernetes_io_arch:amd64 kubernetes_io_hostname:prod-instance-1761572832308 kubernetes_io_os:linux namespace:forstairwell node_kubernetes_io_instance_type:io1-15 nodepool_ngpc_rxt_io_name:72c2db09-dbb2-4768-805b-7a6b062d9df5 persistentvolumeclaim:isd-prometheus-server servers_ngpc_rxt_io_class:gp.vs1.large-iad servers_ngpc_rxt_io_k8sNodeName:prod-instance-1761572832308 servers_ngpc_rxt_io_type:spot topology_cinder_csi_openstack_org_zone:nova topology_kubernetes_io_region:IAD]
  summary: Kubernetes Volume running out of disk space for (persistentvolumeclaim isd-prometheus-server in namespace forstairwell)
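The pvc-storage-full expression is not shown in this dump. Given the persistentvolumeclaim label and the kubernetes-nodes job, it most likely compares the kubelet volume-stats metrics for available versus total capacity; a sketch under that assumption, with the metric pair, threshold and for duration assumed rather than read from the rule file:

alert: pvc-storage-full
expr: (kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) * 100 < 10
for: 5m                                  # duration assumed
labels:
  severity: warning
annotations:
  description: |-
    Volume is almost full (< 10% left)
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Kubernetes Volume running out of disk space (persistentvolumeclaim {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }})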