veylant/deploy/prometheus/rules.yml
2026-02-23 13:35:04 +01:00

148 lines
5.7 KiB
YAML
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

groups:
# ── Recording rules — pre-computed latency percentiles and request/error rates ──
- name: veylant_recording_rules
  # Every rule in this group is re-evaluated on a 30-second cadence.
  interval: 30s
  rules:
  # 99th-percentile request duration (seconds), derived from the histogram
  # bucket rates over the trailing 5 minutes; one series per model/provider.
  - record: veylant:request_duration:p99
    expr: |
      histogram_quantile(
        0.99,
        sum by (le, model, provider) (
          rate(veylant_request_duration_seconds_bucket[5m])
        )
      )
  # 95th-percentile request duration, same window and grouping as the p99 rule.
  # Intended for dashboards and alerting.
  - record: veylant:request_duration:p95
    expr: |
      histogram_quantile(
        0.95,
        sum by (le, model, provider) (
          rate(veylant_request_duration_seconds_bucket[5m])
        )
      )
  # Requests per second over the last minute, grouped by provider and
  # status_code.
  - record: veylant:request_rate:1m
    expr: |
      sum by (provider, status_code) (
        rate(veylant_request_total[1m])
      )
  # Share of requests answered with a 4xx or 5xx status over the last
  # 5 minutes, per provider. The result is a ratio (0–1), not a percentage.
  - record: veylant:error_rate:5m
    expr: |
      sum by (provider) (
        rate(veylant_request_total{status_code=~"[45].."}[5m])
      )
      /
      sum by (provider) (
        rate(veylant_request_total[5m])
      )
# ── Alert rules ────────────────────────────────────────────────────────────
- name: veylant_alerts
  rules:
  # Fire when p99 latency exceeds 500ms for more than 5 minutes.
  # Reads the pre-computed veylant:request_duration:p99 series, so the alert
  # carries the model and provider labels of that recording rule.
  - alert: VeylantHighLatencyP99
    expr: veylant:request_duration:p99 > 0.5
    for: 5m
    labels:
      severity: warning
      team: platform
    annotations:
      summary: "Veylant proxy p99 latency is above 500ms"
      description: >
        p99 latency for model={{ $labels.model }} provider={{ $labels.provider }}
        is {{ $value | humanizeDuration }} (threshold: 500ms).
        Check upstream provider health and connection pool utilisation.
      runbook: "https://docs.veylant.ai/runbooks/high-latency"
  # Fire when error rate exceeds 5% for more than 2 minutes.
  # veylant:error_rate:5m is a 0–1 ratio, hence the 0.05 threshold and the
  # humanizePercentage formatter in the description.
  - alert: VeylantHighErrorRate
    expr: veylant:error_rate:5m > 0.05
    for: 2m
    labels:
      severity: critical
      team: platform
    annotations:
      summary: "Veylant proxy error rate is above 5%"
      description: >
        Error rate for provider={{ $labels.provider }} is
        {{ $value | humanizePercentage }} over the last 5 minutes.
      runbook: "https://docs.veylant.ai/runbooks/high-error-rate"
  # Fire when a circuit breaker opens (provider is failing).
  - alert: VeylantCircuitBreakerOpen
    expr: veylant_circuit_breaker_state{state="open"} == 1
    for: 1m
    labels:
      severity: critical
      team: platform
    annotations:
      summary: "Circuit breaker open for provider {{ $labels.provider }}"
      description: >
        The circuit breaker for provider={{ $labels.provider }} has been open
        for more than 1 minute. Requests are being rejected.
      runbook: "https://docs.veylant.ai/runbooks/provider-down"
  # Fire when the proxy is not reachable by Prometheus scrape.
  # `up == 0` only matches while the target is still registered. When the
  # target disappears entirely (e.g. the pod is gone and dropped from service
  # discovery) no `up` series exists and that comparison returns nothing, so
  # absent() is OR-ed in to cover the "pod is not running" case that the
  # description promises.
  # NOTE(review): the runbook URL is the same one used by
  # VeylantCircuitBreakerOpen — confirm a proxy-specific runbook is not intended.
  - alert: VeylantProxyDown
    expr: |
      up{job="veylant-proxy"} == 0
      or
      absent(up{job="veylant-proxy"})
    for: 1m
    labels:
      severity: critical
      team: platform
    annotations:
      summary: "Veylant proxy is down"
      description: >
        The Prometheus scrape target for job="veylant-proxy" has been unreachable
        for more than 1 minute. The proxy may be crashed or the pod is not running.
      runbook: "https://docs.veylant.ai/runbooks/provider-down"
  # Fire when a TLS certificate expires in less than 30 days.
  # The comparison filters rather than returning 0/1, so $value in the
  # description is the left-hand side: seconds remaining until expiry.
  # NOTE(review): the probe metric is selected with job="veylant-proxy", the
  # same job label the `up` alert uses — confirm the TLS probe is scraped
  # under that job name.
  - alert: VeylantCertExpiringSoon
    expr: |
      probe_ssl_earliest_cert_expiry{job="veylant-proxy"} - time() < 30 * 24 * 3600
    for: 1h
    labels:
      severity: warning
      team: platform
    annotations:
      summary: "TLS certificate expiring within 30 days"
      description: >
        The TLS certificate for the Veylant proxy expires in
        {{ $value | humanizeDuration }}. Renew immediately to avoid service disruption.
      runbook: "https://docs.veylant.ai/runbooks/certificate-expired"
  # Fire when PostgreSQL active connections are high (pool exhaustion risk).
  - alert: VeylantDBConnectionsHigh
    expr: veylant_db_connections_active > 20
    for: 5m
    labels:
      severity: warning
      team: platform
    annotations:
      summary: "PostgreSQL active connections above threshold"
      description: >
        PostgreSQL active connections = {{ $value }} (threshold: 20).
        Risk of connection pool exhaustion — check for slow queries or connection leaks.
      runbook: "https://docs.veylant.ai/runbooks/database-full"
  # Fire when PII detection volume is anomalously high (possible data exfiltration attempt).
  # Compares the current 5-minute detection rate against three times the
  # average of that same rate sampled every 5 minutes across the last hour
  # (the [1h:5m] subquery).
  # NOTE(review): with a near-zero baseline even a small burst satisfies this
  # condition — consider whether a minimum-rate floor is wanted.
  - alert: VeylantPIIVolumeAnomaly
    expr: |
      rate(veylant_pii_entities_detected_total[5m])
        > 3 * avg_over_time(rate(veylant_pii_entities_detected_total[5m])[1h:5m])
    for: 5m
    labels:
      severity: warning
      team: security
    annotations:
      summary: "PII detection volume anomaly detected"
      description: >
        PII entity detection rate is {{ $value | humanize }} entities/sec —
        more than 3× the 1-hour baseline. Possible data exfiltration or misconfigured client.
      runbook: "https://docs.veylant.ai/runbooks/pii-breach"