# Prometheus rule file for the Veylant proxy.
#
# Two groups are defined:
#   * veylant_recording_rules — pre-computed latency / traffic aggregates.
#   * veylant_alerts          — alerting rules (some read the recorded series).
groups:
  # ── Recording rules — pre-compute expensive percentile queries ─────────────
  - name: veylant_recording_rules
    interval: 30s
    rules:
      # p99 request duration over a 5-minute sliding window, per model and
      # provider.
      - record: veylant:request_duration:p99
        expr: |
          histogram_quantile(
            0.99,
            sum by (le, model, provider) (
              rate(veylant_request_duration_seconds_bucket[5m])
            )
          )

      # p95 request duration (for dashboard and alerting); same window and
      # grouping as the p99 rule.
      - record: veylant:request_duration:p95
        expr: |
          histogram_quantile(
            0.95,
            sum by (le, model, provider) (
              rate(veylant_request_duration_seconds_bucket[5m])
            )
          )

      # Request rate (RPS) per provider and status code, 1-minute window.
      - record: veylant:request_rate:1m
        expr: |
          sum by (provider, status_code) (
            rate(veylant_request_total[1m])
          )

      # Error rate (4xx/5xx) as a fraction of total requests, per provider.
      # A provider with no 4xx/5xx series yields no sample here (not 0),
      # because the division only keeps label sets present on both sides.
      - record: veylant:error_rate:5m
        expr: |
          sum by (provider) (
            rate(veylant_request_total{status_code=~"[45].."}[5m])
          )
          /
          sum by (provider) (
            rate(veylant_request_total[5m])
          )

  # ── Alert rules ────────────────────────────────────────────────────────────
  - name: veylant_alerts
    rules:
      # Fire when p99 latency exceeds 500ms for more than 5 minutes.
      # The recorded series is in seconds, hence the 0.5 threshold.
      - alert: VeylantHighLatencyP99
        expr: veylant:request_duration:p99 > 0.5
        for: 5m
        labels:
          severity: warning
          team: platform
        annotations:
          summary: "Veylant proxy p99 latency is above 500ms"
          description: >
            p99 latency for model={{ $labels.model }}
            provider={{ $labels.provider }} is
            {{ $value | humanizeDuration }} (threshold: 500ms).
            Check upstream provider health and connection pool utilisation.
          runbook: "https://docs.veylant.ai/runbooks/high-latency"

      # Fire when error rate exceeds 5% for more than 2 minutes.
      - alert: VeylantHighErrorRate
        expr: veylant:error_rate:5m > 0.05
        for: 2m
        labels:
          severity: critical
          team: platform
        annotations:
          summary: "Veylant proxy error rate is above 5%"
          description: >
            Error rate for provider={{ $labels.provider }} is
            {{ $value | humanizePercentage }} over the last 5 minutes.
          runbook: "https://docs.veylant.ai/runbooks/high-error-rate"

      # Fire when a circuit breaker opens (provider is failing).
      - alert: VeylantCircuitBreakerOpen
        expr: veylant_circuit_breaker_state{state="open"} == 1
        for: 1m
        labels:
          severity: critical
          team: platform
        annotations:
          summary: "Circuit breaker open for provider {{ $labels.provider }}"
          description: >
            The circuit breaker for provider={{ $labels.provider }} has been
            open for more than 1 minute. Requests are being rejected.
          runbook: "https://docs.veylant.ai/runbooks/provider-down"

      # Fire when the proxy is not reachable by Prometheus scrape.
      # `up == 0` only matches targets that are still being scraped; the
      # `absent(...)` arm also covers the case where the target has vanished
      # entirely (e.g. the pod is gone), which would otherwise never alert.
      # NOTE(review): the runbook URL is the same "provider-down" page used by
      # VeylantCircuitBreakerOpen — confirm that is intentional.
      - alert: VeylantProxyDown
        expr: |
          up{job="veylant-proxy"} == 0
          or
          absent(up{job="veylant-proxy"})
        for: 1m
        labels:
          severity: critical
          team: platform
        annotations:
          summary: "Veylant proxy is down"
          description: >
            The Prometheus scrape target for job="veylant-proxy" has been
            unreachable or missing for more than 1 minute. The proxy may be
            crashed or the pod is not running.
          runbook: "https://docs.veylant.ai/runbooks/provider-down"

      # Fire when a TLS certificate expires in less than 30 days.
      # The left-hand side is the remaining lifetime in seconds, which is what
      # `$value` renders in the description.
      # NOTE(review): this selects the probe metric with job="veylant-proxy",
      # the same job name used for the `up` check — confirm the probe job
      # really carries that label.
      - alert: VeylantCertExpiringSoon
        expr: |
          probe_ssl_earliest_cert_expiry{job="veylant-proxy"} - time()
          < 30 * 24 * 3600
        for: 1h
        labels:
          severity: warning
          team: platform
        annotations:
          summary: "TLS certificate expiring within 30 days"
          description: >
            The TLS certificate for the Veylant proxy expires in
            {{ $value | humanizeDuration }}. Renew immediately to avoid
            service disruption.
          runbook: "https://docs.veylant.ai/runbooks/certificate-expired"

      # Fire when PostgreSQL active connections are high (pool exhaustion
      # risk).
      - alert: VeylantDBConnectionsHigh
        expr: veylant_db_connections_active > 20
        for: 5m
        labels:
          severity: warning
          team: platform
        annotations:
          summary: "PostgreSQL active connections above threshold"
          description: >
            PostgreSQL active connections = {{ $value }} (threshold: 20).
            Risk of connection pool exhaustion — check for slow queries or
            connection leaks.
          runbook: "https://docs.veylant.ai/runbooks/database-full"

      # Fire when PII detection volume is anomalously high (possible data
      # exfiltration attempt): the current 5m rate is compared against 3× its
      # own average over the last hour, sampled every 5 minutes.
      # NOTE(review): with a near-zero baseline any detection exceeds 3× the
      # average — consider whether a minimum absolute rate is wanted.
      - alert: VeylantPIIVolumeAnomaly
        expr: |
          rate(veylant_pii_entities_detected_total[5m])
          > 3 * avg_over_time(
            rate(veylant_pii_entities_detected_total[5m])[1h:5m]
          )
        for: 5m
        labels:
          severity: warning
          team: security
        annotations:
          summary: "PII detection volume anomaly detected"
          description: >
            PII entity detection rate is {{ $value | humanize }} entities/sec —
            more than 3× the 1-hour baseline. Possible data exfiltration or
            misconfigured client.
          runbook: "https://docs.veylant.ai/runbooks/pii-breach"