|
/etc/prometheus/rules/alerts.yml > Blackbox exporter alerts
|
|
|
# Fires when a probe scraped under job "blackbox" has reported
# probe_success == 0 continuously for 10 minutes.
alert: ProbeFailure
expr: probe_success{job="blackbox"} == 0
for: 10m
labels:
  severity: critical
annotations:
  # NOTE(review): the text below reads $labels.module; this assumes a
  # `module` label is attached to probe_success (e.g. via relabeling) -
  # confirm, otherwise it renders as an empty string.
  description: >-
    The {{ $labels.module }} probe to {{ $labels.instance }} has failed
    due to protocol errors or failed checks.
  generic_summary: Blackbox probe failed
  summary: The probe to {{ $labels.instance }} has failed
|
|
|
|
|
|
/etc/prometheus/rules/alerts.yml > Cronjob alerts
|
|
|
|
|
# Informational alert: batch_running_time_seconds has stayed above
# 7200 s (2 hours) for 5 minutes.
# NOTE(review): the expression has no upper bound, so it keeps matching
# above 14400 s, where StuckCronJob also matches - confirm whether both
# notifications are wanted or an inhibition rule handles the overlap.
alert: SlowCronJob
expr: batch_running_time_seconds > 7200
for: 5m
labels:
  severity: info
annotations:
  description: >-
    The last run of cronjob {{ $labels.job }} in {{ $labels.instance }}
    has taken more than 2 hours.
  generic_summary: Cronjob too slow
  summary: Cronjob {{ $labels.job }} in {{ $labels.instance }} is too slow
|
# Warning alert: batch_running_time_seconds has stayed above
# 14400 s (4 hours) for 5 minutes; the run is then treated as stuck/hung.
alert: StuckCronJob
expr: batch_running_time_seconds > 14400
for: 5m
labels:
  severity: warning
annotations:
  description: >-
    The last run of cronjob {{ $labels.job }} in {{ $labels.instance }}
    has taken more than 4 hours, and it is considered stuck/hung.
  generic_summary: Cronjob stuck
  summary: Cronjob {{ $labels.job }} in {{ $labels.instance }} is stuck
|
|
/etc/prometheus/rules/alerts.yml > General alerts
|
# Critical alert: either `up` or `pg_up` has been 0 for 5 minutes.
# `or` returns every series from the left-hand side plus those
# right-hand series whose label set has no match on the left.
alert: InstanceDown
expr: up == 0 or pg_up == 0
for: 5m
labels:
  severity: critical
annotations:
  # A block scalar is used because the text starts with `{{`, which a
  # plain (unquoted) scalar would parse as a flow mapping.
  description: >-
    {{ $labels.instance }} of job {{ $labels.job }} has been down for
    more than 5 minutes.
  generic_summary: Service down
  summary: Instance {{ $labels.instance }} down
| Labels |
State |
Active Since |
Value |
|
alertname="InstanceDown"
env="test"
instance="miller.infra.assekuransa.com:9100"
job="node"
severity="critical"
|
firing |
2025-10-03 14:06:34.863625018 +0000 UTC |
0 |
| Annotations |
- description
- miller.infra.assekuransa.com:9100 of job node has been down for more than 5 minutes.
- generic_summary
- Service down
- summary
- Instance miller.infra.assekuransa.com:9100 down
|
|
|
/etc/prometheus/rules/alerts.yml > Node alerts
|
|
|
|
|
|
|
|
|
|
|
# Informational alert: node_load15 for job "node" has stayed above 100
# for 5 minutes.
# NOTE(review): 100 is an absolute threshold, not scaled by CPU count -
# confirm this is intended for every host scraped under job "node".
alert: HighLoadAvg
expr: node_load15{job="node"} > 100
for: 5m
labels:
  severity: info
annotations:
  description: >-
    The 15-minute load average in {{ $labels.instance }} has been over
    100 for more than 5 minutes.
  generic_summary: Load average too high
  summary: The load average in {{ $labels.instance }} is too high
|
|
|
|
|
|
|
|
|
# Warning alert: a process has had more than 80% of its file-descriptor
# limit (process_open_fds / process_max_fds) in use for 5 minutes.
alert: ProcessNearFDLimits
expr: process_open_fds / process_max_fds * 100 > 80
for: 5m
labels:
  severity: warning
annotations:
  # $value is the percentage computed by expr; it is formatted to one
  # decimal place so the message does not show a full-precision float.
  description: >-
    The process for {{ $labels.job }} in {{ $labels.instance }} has
    {{ printf "%.1f" $value }}% of available file descriptors in use.
  generic_summary: Too many files open
  summary: The process in {{ $labels.instance }} has too many files open.
|