Alerts

Inactive (19) Pending (0) Firing (1)

/etc/prometheus/rules/alerts.yml > Blackbox exporter alerts

HTTPSNotUsed (0 active)

# Warns when the blackbox job reports probe_http_ssl == 0 for a target probed
# with the "https" or "https_ipv6" module, sustained for 10 minutes.
alert: HTTPSNotUsed
expr: >-
  probe_http_ssl{job="blackbox",module=~"https(_ipv6)?"} == 0
for: 10m
labels:
  severity: warning
annotations:
  description: >-
    The HTTP server at {{ $labels.instance }} did not redirect to HTTPS,
    or SSL failed.
  generic_summary: HTTPS not used
  summary: >-
    The HTTP server at {{ $labels.instance }} did not force SSL

ProbeFailure (0 active)

# Critical alert: any blackbox probe whose probe_success has been 0 for
# 10 minutes.
alert: ProbeFailure
expr: >-
  probe_success{job="blackbox"} == 0
for: 10m
labels:
  severity: critical
annotations:
  description: >-
    The {{ $labels.module }} probe to {{ $labels.instance }} has failed
    due to protocol errors or failed checks.
  generic_summary: Blackbox probe failed
  summary: >-
    The probe to {{ $labels.instance }} has failed

SSLCertExpiringSoon (0 active)

# Warning tier: the earliest certificate expiry seen by the blackbox probe is
# less than 15 days (86400 s * 15) away, sustained for 10 minutes.
# $value is the left-hand side of the comparison, i.e. the remaining lifetime
# in seconds.
alert: SSLCertExpiringSoon
expr: >-
  probe_ssl_earliest_cert_expiry{job="blackbox"} - time() < 86400 * 15
for: 10m
labels:
  severity: warning
annotations:
  # humanizeDuration already renders its own units (e.g. "14d 3h 0m 0s"),
  # so no unit word is appended after it.
  description: >-
    The SSL certificate at {{ $labels.instance }} will expire in
    {{ humanizeDuration $value }}.
  generic_summary: SSL certificate expiring soon
  summary: >-
    The SSL certificate at {{ $labels.instance }} will expire soon

SSLCertExpiringSoon (0 active)

# Critical tier: the earliest certificate expiry seen by the blackbox probe is
# less than 7 days (86400 s * 7) away, sustained for 10 minutes.
# $value is the left-hand side of the comparison, i.e. the remaining lifetime
# in seconds.
alert: SSLCertExpiringSoon
expr: >-
  probe_ssl_earliest_cert_expiry{job="blackbox"} - time() < 86400 * 7
for: 10m
labels:
  severity: critical
annotations:
  # humanizeDuration already renders its own units (e.g. "6d 3h 0m 0s"),
  # so no unit word is appended after it.
  description: >-
    The SSL certificate at {{ $labels.instance }} will expire in
    {{ humanizeDuration $value }}.
  generic_summary: SSL certificate expiring VERY soon
  summary: >-
    The SSL certificate at {{ $labels.instance }} will expire VERY soon

/etc/prometheus/rules/alerts.yml > Cronjob alerts

FailedCronJob (0 active)

# Warns when a job's last finish timestamp is newer than its last success
# timestamp for 5 minutes, i.e. the most recent completed run did not succeed.
alert: FailedCronJob
expr: >-
  batch_last_finish_seconds > batch_last_success_seconds
for: 5m
labels:
  severity: warning
annotations:
  description: >-
    The last run of cronjob {{ $labels.job }} in {{ $labels.instance }}
    has failed.
  generic_summary: Cronjob failed
  summary: >-
    Cronjob {{ $labels.job }} in {{ $labels.instance }} has failed

MissingCronJob (0 active)

# Warns when the time elapsed since batch_last_start_seconds exceeds the
# job's own batch_period_seconds, sustained for 5 minutes.
alert: MissingCronJob
expr: >-
  time() - batch_last_start_seconds > batch_period_seconds
for: 5m
labels:
  severity: warning
annotations:
  description: >-
    The cronjob {{ $labels.job }} in {{ $labels.instance }} has not run
    in the expected period.
  generic_summary: Cronjob missing
  summary: >-
    Cronjob {{ $labels.job }} in {{ $labels.instance }} has not run

SlowCronJob (0 active)

# Informational: batch_running_time_seconds above 7200 s (2 hours) for
# 5 minutes.
alert: SlowCronJob
expr: >-
  batch_running_time_seconds > 7200
for: 5m
labels:
  severity: info
annotations:
  description: >-
    The last run of cronjob {{ $labels.job }} in {{ $labels.instance }}
    has taken more than 2 hours.
  generic_summary: Cronjob too slow
  summary: >-
    Cronjob {{ $labels.job }} in {{ $labels.instance }} is too slow

StuckCronJob (0 active)

# Warning: batch_running_time_seconds above 14400 s (4 hours) for 5 minutes.
alert: StuckCronJob
expr: >-
  batch_running_time_seconds > 14400
for: 5m
labels:
  severity: warning
annotations:
  description: >-
    The last run of cronjob {{ $labels.job }} in {{ $labels.instance }}
    has taken more than 4 hours, and it is considered stuck/hung.
  generic_summary: Cronjob stuck
  summary: >-
    Cronjob {{ $labels.job }} in {{ $labels.instance }} is stuck

/etc/prometheus/rules/alerts.yml > General alerts

InstanceDown (1 active)

# Critical: a scrape target reports up == 0, or pg_up reports 0, for
# 5 minutes.
alert: InstanceDown
expr: >-
  up == 0 or pg_up == 0
for: 5m
labels:
  severity: critical
annotations:
  description: >-
    {{ $labels.instance }} of job {{ $labels.job }} has been down for
    more than 5 minutes.
  generic_summary: Service down
  summary: >-
    Instance {{ $labels.instance }} down

Labels	State	Active Since	Value
alertname="InstanceDown" env="test" instance="miller.infra.assekuransa.com:9100" job="node" severity="critical"	firing	2025-10-03 14:06:34.863625018 +0000 UTC	0
Annotations
description miller.infra.assekuransa.com:9100 of job node has been down for more than 5 minutes. generic_summary Service down summary Instance miller.infra.assekuransa.com:9100 down

/etc/prometheus/rules/alerts.yml > Node alerts

FSFull (0 active)

# Warning tier: the available/size ratio of a node filesystem, expressed as a
# percentage, is at or below 1 for 5 minutes.
alert: FSFull
expr: >-
  instance:node_filesystem_avail_bytes_per_node_filesystem_size_bytes:ratio{job="node"}
  * 100 <= 1
for: 5m
labels:
  severity: warning
annotations:
  # The text states the same threshold the expression enforces (1%).
  # NOTE(review): if 5% was the intended threshold, change the expression
  # instead and restore the old wording.
  description: >-
    The {{ $labels.mountpoint }} filesystem in {{ $labels.instance }} has
    1% or less available space.
  generic_summary: Filesystem almost full
  summary: >-
    Filesystem {{ $labels.mountpoint }} in {{ $labels.instance }} is almost
    full

FSFull (0 active)

# Critical tier: the available/size ratio of a node filesystem, expressed as
# a percentage, is at or below 0.5 for 5 minutes.
alert: FSFull
expr: >-
  instance:node_filesystem_avail_bytes_per_node_filesystem_size_bytes:ratio{job="node"}
  * 100 <= 0.5
for: 5m
labels:
  severity: critical
annotations:
  description: >-
    The {{ $labels.mountpoint }} filesystem in {{ $labels.instance }} is
    full.
  generic_summary: Filesystem full
  summary: >-
    Filesystem {{ $labels.mountpoint }} in {{ $labels.instance }} is full

FSFullSoon (0 active)

# Informational: a linear extrapolation over the last 12h of available bytes
# predicts a value <= 0 at 24 hours (24 * 3600 s) ahead, for 5 minutes.
# NOTE(review): the annotations read $labels.mountpoint; confirm the
# instance:node_filesystem_avail_bytes:sum recording rule keeps that label.
alert: FSFullSoon
expr: >-
  predict_linear(instance:node_filesystem_avail_bytes:sum{job="node"}[12h],
  24 * 3600) <= 0
for: 5m
labels:
  severity: info
annotations:
  description: >-
    The {{ $labels.mountpoint }} filesystem in {{ $labels.instance }} will
    be full in 24 hours at the current rate.
  generic_summary: Filesystem full soon
  summary: >-
    Filesystem {{ $labels.mountpoint }} in {{ $labels.instance }} will fill
    soon

FSFullSoon (0 active)

# Warning tier: a linear extrapolation over the last 4h of available bytes
# predicts a value <= 0 at 4 hours (4 * 3600 s) ahead, for 30 minutes.
# NOTE(review): the annotations read $labels.mountpoint; confirm the
# instance:node_filesystem_avail_bytes:sum recording rule keeps that label.
alert: FSFullSoon
expr: >-
  predict_linear(instance:node_filesystem_avail_bytes:sum{job="node"}[4h],
  4 * 3600) <= 0
for: 30m
labels:
  severity: warning
annotations:
  description: >-
    The {{ $labels.mountpoint }} filesystem in {{ $labels.instance }} will
    be full in 4 hours at the current rate.
  generic_summary: Filesystem full VERY soon
  summary: >-
    Filesystem {{ $labels.mountpoint }} in {{ $labels.instance }} will fill
    VERY soon

HighCpuUsage (0 active)

# Informational: (1 - idle CPU rate) * 100 is above 90 for 5 minutes, using
# the instance:node_cpu_seconds_total:avg_rate5m series with mode="idle".
alert: HighCpuUsage
expr: >-
  (1 - instance:node_cpu_seconds_total:avg_rate5m{job="node",mode="idle"})
  * 100 > 90
for: 5m
labels:
  severity: info
annotations:
  description: >-
    The CPU usage in {{ $labels.instance }} has been over 90% for more
    than 5 minutes.
  generic_summary: CPU usage too high
  summary: >-
    CPU usage in {{ $labels.instance }} is too high

HighLoadAvg (0 active)

# Informational: node_load15 for the node job is above 100 for 5 minutes.
alert: HighLoadAvg
expr: >-
  node_load15{job="node"} > 100
for: 5m
labels:
  severity: info
annotations:
  description: >-
    The 15-minute load average in {{ $labels.instance }} has been over
    100 for more than 5 minutes.
  generic_summary: Load average too high
  summary: >-
    The load average in {{ $labels.instance }} is too high

MemFull (0 active)

# Informational tier: the MemUsed/MemTotal ratio, expressed as a percentage,
# is above 90 for 15 minutes.
alert: MemFull
expr: >-
  instance:node_memory_MemUsed_bytes_per_node_memory_MemTotal_bytes:ratio{job="node"}
  * 100 > 90
for: 15m
labels:
  severity: info
annotations:
  description: >-
    The memory usage in {{ $labels.instance }} has been over 90% for more
    than 15 minutes.
  generic_summary: Memory usage too high
  summary: >-
    Memory usage in {{ $labels.instance }} is too high

MemFull (0 active)

# Warning tier: the MemUsed/MemTotal ratio, expressed as a percentage, is
# above 95 for 5 minutes.
# NOTE(review): the annotations say "critical" while the severity label is
# "warning" - confirm which one is intended.
alert: MemFull
expr: >-
  instance:node_memory_MemUsed_bytes_per_node_memory_MemTotal_bytes:ratio{job="node"}
  * 100 > 95
for: 5m
labels:
  severity: warning
annotations:
  description: >-
    The memory usage in {{ $labels.instance }} has been over 95% for more
    than 5 minutes.
  generic_summary: Memory usage critical
  summary: >-
    Memory usage in {{ $labels.instance }} is critical

MemFullSoon (0 active)

# Informational tier: a linear extrapolation over the last 12h of the
# MemUsed/MemTotal ratio predicts more than 99% at 24 hours (24 * 3600 s)
# ahead, sustained for 5 minutes.
alert: MemFullSoon
expr: >-
  predict_linear(instance:node_memory_MemUsed_bytes_per_node_memory_MemTotal_bytes:ratio{job="node"}[12h],
  24 * 3600) * 100 > 99
for: 5m
labels:
  severity: info
annotations:
  # The instance is named once; the previous text repeated
  # "in {{ $labels.instance }}" twice.
  description: >-
    The memory usage in {{ $labels.instance }}
    will reach 100% in 24 hours at the current rate.
  generic_summary: Memory full soon
  summary: >-
    Memory in {{ $labels.instance }} will fill in 24h

MemFullSoon (0 active)

# Warning tier: a linear extrapolation over the last 8h of the
# MemUsed/MemTotal ratio predicts more than 99% at 4 hours (4 * 3600 s)
# ahead, sustained for 30 minutes.
alert: MemFullSoon
expr: >-
  predict_linear(instance:node_memory_MemUsed_bytes_per_node_memory_MemTotal_bytes:ratio{job="node"}[8h],
  4 * 3600) * 100 > 99
for: 30m
labels:
  severity: warning
annotations:
  # The instance is named once; the previous text repeated
  # "in {{ $labels.instance }}" twice.
  description: >-
    The memory usage in {{ $labels.instance }}
    will reach 100% in 4 hours at the current rate.
  generic_summary: Memory full VERY soon
  summary: >-
    Memory in {{ $labels.instance }} will fill in 4h

ProcessNearFDLimits (0 active)

# Warns when a process has more than 80% of its file-descriptor limit open
# (process_open_fds / process_max_fds * 100) for 5 minutes.
# $value is that percentage.
alert: ProcessNearFDLimits
expr: >-
  process_open_fds / process_max_fds * 100 > 80
for: 5m
labels:
  severity: warning
annotations:
  # Format the percentage to one decimal; the raw float would render as
  # e.g. "83.33333333333334%".
  description: >-
    The process for {{ $labels.job }} in {{ $labels.instance }} has
    {{ printf "%.1f" $value }}% of available file descriptors in use.
  generic_summary: Too many files open
  summary: >-
    The process in {{ $labels.instance }} has too many files open.