# veylant/deploy/prometheus/rules.yml

groups:
  # ── Recording rules — pre-compute expensive percentile queries ─────────────
  - name: veylant_recording_rules
    interval: 30s
    rules:
      # 99th-percentile request duration, derived from the
      # veylant_request_duration_seconds histogram buckets using a 5-minute
      # rate window. One series is produced per (model, provider) pair; `le`
      # must stay in the grouping because histogram_quantile needs it.
      - record: veylant:request_duration:p99
        expr: |
          histogram_quantile(
            0.99,
            sum(
              rate(veylant_request_duration_seconds_bucket[5m])
            ) by (le, model, provider)
          )

      # 95th-percentile request duration (precomputed for dashboards and
      # alerting). Built from the veylant_request_duration_seconds histogram
      # buckets over a 5-minute rate window, per (model, provider) pair.
      - record: veylant:request_duration:p95
        expr: |
          histogram_quantile(
            0.95,
            sum(
              rate(veylant_request_duration_seconds_bucket[5m])
            ) by (le, model, provider)
          )

      # Requests per second over a 1-minute rate window, broken down by
      # provider and status_code.
      # NOTE(review): rate() needs at least two samples inside the window —
      # confirm the scrape interval for this metric is comfortably below 1m.
      - record: veylant:request_rate:1m
        expr: |
          sum(
            rate(veylant_request_total[1m])
          ) by (provider, status_code)

      # Share of requests per provider that finished with a 4xx or 5xx status
      # code, measured over a 5-minute rate window. The result is a fraction
      # (0.05 means 5%). A provider with no 4xx/5xx series on the left-hand
      # side produces no sample at all rather than 0.
      - record: veylant:error_rate:5m
        expr: |
          sum(rate(veylant_request_total{status_code=~"[45].."}[5m])) by (provider)
          /
          sum(rate(veylant_request_total[5m])) by (provider)

  # ── Alert rules ────────────────────────────────────────────────────────────
  - name: veylant_alerts
    rules:
      # Warn when the recorded p99 latency of any (model, provider) series
      # stays above 0.5 seconds for 5 consecutive minutes.
      - alert: VeylantHighLatencyP99
        expr: |
          veylant:request_duration:p99 > 0.5
        for: 5m
        labels:
          team: platform
          severity: warning
        annotations:
          summary: "Veylant proxy p99 latency is above 500ms"
          description: >
            p99 latency for model={{ $labels.model }}
            provider={{ $labels.provider }} is {{ $value | humanizeDuration }}
            (threshold: 500ms). Check upstream provider health and
            connection pool utilisation.
          runbook: "https://docs.veylant.ai/runbooks/high-latency"

      # Page when the recorded 5-minute error fraction of any provider stays
      # above 0.05 (5%) for 2 consecutive minutes.
      - alert: VeylantHighErrorRate
        expr: |
          veylant:error_rate:5m > 0.05
        for: 2m
        labels:
          team: platform
          severity: critical
        annotations:
          summary: "Veylant proxy error rate is above 5%"
          description: >
            Error rate for provider={{ $labels.provider }}
            is {{ $value | humanizePercentage }} over the
            last 5 minutes.
          runbook: "https://docs.veylant.ai/runbooks/high-error-rate"

      # Page when a veylant_circuit_breaker_state series with state="open"
      # reports the value 1 continuously for 1 minute.
      - alert: VeylantCircuitBreakerOpen
        expr: |
          veylant_circuit_breaker_state{state="open"} == 1
        for: 1m
        labels:
          team: platform
          severity: critical
        annotations:
          summary: "Circuit breaker open for provider {{ $labels.provider }}"
          description: >
            The circuit breaker for provider={{ $labels.provider }}
            has been open for more than 1 minute.
            Requests are being rejected.
          runbook: "https://docs.veylant.ai/runbooks/provider-down"

      # Fire when the proxy cannot be scraped by Prometheus: either the scrape
      # fails (up == 0) or no target exists for the job at all.
      - alert: VeylantProxyDown
        # `up == 0` only matches targets that are still being discovered. If
        # every target for the job disappears (for example, no pod is running),
        # the `up` series vanishes and the comparison returns nothing, so the
        # alert would stay silent. absent() covers that case; the series it
        # yields carries only the job label (no instance label).
        expr: |
          up{job="veylant-proxy"} == 0
          or
          absent(up{job="veylant-proxy"})
        for: 1m
        labels:
          severity: critical
          team: platform
        annotations:
          summary: "Veylant proxy is down"
          description: >
            The Prometheus scrape target for job="veylant-proxy" has been unreachable
            for more than 1 minute. The proxy may be crashed or the pod is not running.
          # NOTE(review): this is the same runbook as the circuit-breaker alert
          # (provider-down) — confirm whether a proxy-specific runbook exists.
          runbook: "https://docs.veylant.ai/runbooks/provider-down"

      # Warn when the earliest certificate expiry timestamp is less than
      # 30 days (30 * 24 * 3600 seconds) away from the current evaluation
      # time, and has been for at least 1 hour.
      # NOTE(review): probe_ssl_earliest_cert_expiry looks like a probe-style
      # metric — confirm it is really scraped under job="veylant-proxy".
      - alert: VeylantCertExpiringSoon
        expr: |
          (probe_ssl_earliest_cert_expiry{job="veylant-proxy"} - time())
          < (30 * 24 * 3600)
        for: 1h
        labels:
          team: platform
          severity: warning
        annotations:
          summary: "TLS certificate expiring within 30 days"
          description: >
            The TLS certificate for the Veylant proxy
            expires in {{ $value | humanizeDuration }}.
            Renew immediately to avoid service disruption.
          runbook: "https://docs.veylant.ai/runbooks/certificate-expired"

      # Warn when veylant_db_connections_active stays above 20 for 5
      # consecutive minutes (early signal of connection pool exhaustion).
      - alert: VeylantDBConnectionsHigh
        expr: |
          veylant_db_connections_active > 20
        for: 5m
        labels:
          team: platform
          severity: warning
        annotations:
          summary: "PostgreSQL active connections above threshold"
          description: >
            PostgreSQL active connections = {{ $value }}
            (threshold: 20). Risk of connection pool exhaustion —
            check for slow queries or connection leaks.
          runbook: "https://docs.veylant.ai/runbooks/database-full"

      # Warn the security team when the 5-minute PII detection rate of a
      # series exceeds three times its own average over the last hour
      # (subquery sampled every 5 minutes) for 5 consecutive minutes.
      # NOTE(review): the 1h baseline includes the current window, and when
      # it is close to zero a small burst already exceeds 3x — consider
      # whether a minimum-rate floor is needed.
      - alert: VeylantPIIVolumeAnomaly
        expr: |
          rate(veylant_pii_entities_detected_total[5m])
          >
          avg_over_time(
            rate(veylant_pii_entities_detected_total[5m])[1h:5m]
          ) * 3
        for: 5m
        labels:
          team: security
          severity: warning
        annotations:
          summary: "PII detection volume anomaly detected"
          description: >
            PII entity detection rate is {{ $value | humanize }}
            entities/sec — more than 3× the 1-hour baseline.
            Possible data exfiltration or misconfigured client.
          runbook: "https://docs.veylant.ai/runbooks/pii-breach"