---
# Prometheus recording and alerting rules for the Veylant proxy.
groups:
# ── Recording rules — pre-compute expensive percentile queries ─────────────
# The series recorded here are consumed by the alert expressions in this file
# (veylant:request_duration:p99, veylant:error_rate:5m); keep the record names
# stable.
- name: veylant_recording_rules
  # Evaluation interval for every rule in this group.
  interval: 30s
  rules:
  # p99 request duration over a 5-minute sliding window, per model and provider.
  # `le` must stay in the aggregation so histogram_quantile can see the buckets.
  - record: veylant:request_duration:p99
    expr: |
      histogram_quantile(
        0.99,
        sum by (le, model, provider) (
          rate(veylant_request_duration_seconds_bucket[5m])
        )
      )

  # p95 request duration (for dashboard and alerting), same window and
  # grouping as the p99 rule.
  - record: veylant:request_duration:p95
    expr: |
      histogram_quantile(
        0.95,
        sum by (le, model, provider) (
          rate(veylant_request_duration_seconds_bucket[5m])
        )
      )

  # Request rate (RPS) per provider and status code, over a 1-minute window.
  - record: veylant:request_rate:1m
    expr: |
      sum by (provider, status_code) (
        rate(veylant_request_total[1m])
      )

  # Error rate (4xx/5xx) as a fraction of total requests, per provider, over a
  # 5-minute window. The result is a ratio in [0, 1], not a percentage.
  - record: veylant:error_rate:5m
    expr: |
      sum by (provider) (
        rate(veylant_request_total{status_code=~"[45].."}[5m])
      )
      /
      sum by (provider) (
        rate(veylant_request_total[5m])
      )

# ── Alert rules ────────────────────────────────────────────────────────────
# No group-level `interval` is set, so these rules run at the server's global
# evaluation interval. Every alert carries `severity` and `team` labels and a
# `runbook` annotation.
- name: veylant_alerts
  rules:
  # Fire when p99 latency exceeds 500ms for more than 5 minutes.
  # Reads the recorded series veylant:request_duration:p99 (seconds), which
  # carries the `model` and `provider` labels used in the description.
  - alert: VeylantHighLatencyP99
    expr: veylant:request_duration:p99 > 0.5
    for: 5m
    labels:
      severity: warning
      team: platform
    annotations:
      summary: "Veylant proxy p99 latency is above 500ms"
      description: >
        p99 latency for model={{ $labels.model }} provider={{ $labels.provider }}
        is {{ $value | humanizeDuration }} (threshold: 500ms).
        Check upstream provider health and connection pool utilisation.
      runbook: "https://docs.veylant.ai/runbooks/high-latency"

  # Fire when error rate exceeds 5% for more than 2 minutes.
  # Reads the recorded ratio veylant:error_rate:5m (0.05 == 5%).
  - alert: VeylantHighErrorRate
    expr: veylant:error_rate:5m > 0.05
    for: 2m
    labels:
      severity: critical
      team: platform
    annotations:
      summary: "Veylant proxy error rate is above 5%"
      description: >
        Error rate for provider={{ $labels.provider }} is
        {{ $value | humanizePercentage }} over the last 5 minutes.
      runbook: "https://docs.veylant.ai/runbooks/high-error-rate"

  # Fire when a circuit breaker opens (provider is failing).
  - alert: VeylantCircuitBreakerOpen
    expr: veylant_circuit_breaker_state{state="open"} == 1
    for: 1m
    labels:
      severity: critical
      team: platform
    annotations:
      summary: "Circuit breaker open for provider {{ $labels.provider }}"
      description: >
        The circuit breaker for provider={{ $labels.provider }} has been open
        for more than 1 minute. Requests are being rejected.
      runbook: "https://docs.veylant.ai/runbooks/provider-down"

  # Fire when the proxy is not reachable by Prometheus scrape.
  # `up == 0` only matches targets that are still being scraped; if the target
  # disappears from service discovery there is no `up` series at all, so
  # `absent(...)` covers that case as well.
  # NOTE(review): the runbook URL is the same one VeylantCircuitBreakerOpen
  # uses (provider-down) — confirm it is the intended runbook for a proxy outage.
  - alert: VeylantProxyDown
    expr: |
      up{job="veylant-proxy"} == 0
      or
      absent(up{job="veylant-proxy"})
    for: 1m
    labels:
      severity: critical
      team: platform
    annotations:
      summary: "Veylant proxy is down"
      description: >
        The Prometheus scrape target for job="veylant-proxy" has been unreachable
        for more than 1 minute. The proxy may be crashed or the pod is not running.
      runbook: "https://docs.veylant.ai/runbooks/provider-down"

  # Fire when a TLS certificate expires in less than 30 days.
  # The expression value is the remaining lifetime in seconds
  # (expiry timestamp minus now), which the description humanizes.
  - alert: VeylantCertExpiringSoon
    expr: |
      probe_ssl_earliest_cert_expiry{job="veylant-proxy"} - time() < 30 * 24 * 3600
    for: 1h
    labels:
      severity: warning
      team: platform
    annotations:
      summary: "TLS certificate expiring within 30 days"
      description: >
        The TLS certificate for the Veylant proxy expires in
        {{ $value | humanizeDuration }}. Renew immediately to avoid service disruption.
      runbook: "https://docs.veylant.ai/runbooks/certificate-expired"

  # Fire when PostgreSQL active connections are high (pool exhaustion risk).
  - alert: VeylantDBConnectionsHigh
    expr: veylant_db_connections_active > 20
    for: 5m
    labels:
      severity: warning
      team: platform
    annotations:
      summary: "PostgreSQL active connections above threshold"
      description: >
        PostgreSQL active connections = {{ $value }} (threshold: 20).
        Risk of connection pool exhaustion — check for slow queries or connection leaks.
      runbook: "https://docs.veylant.ai/runbooks/database-full"

  # Fire when PII detection volume is anomalously high (possible data exfiltration attempt).
  # Compares the current 5-minute rate with 3× its own 1-hour average, sampled
  # every 5 minutes via a subquery.
  # NOTE(review): there is no minimum-rate floor, so after a quiet hour a small
  # absolute rate can exceed 3× the baseline — consider adding a floor if this
  # proves noisy.
  - alert: VeylantPIIVolumeAnomaly
    expr: |
      rate(veylant_pii_entities_detected_total[5m])
      > 3 * avg_over_time(rate(veylant_pii_entities_detected_total[5m])[1h:5m])
    for: 5m
    labels:
      severity: warning
      team: security
    annotations:
      summary: "PII detection volume anomaly detected"
      description: >
        PII entity detection rate is {{ $value | humanize }} entities/sec —
        more than 3× the 1-hour baseline. Possible data exfiltration or misconfigured client.
      runbook: "https://docs.veylant.ai/runbooks/pii-breach"