veylant/deploy/grafana/dashboards/production-slo.json
2026-02-23 13:35:04 +01:00

257 lines
6.8 KiB
JSON

{
  "uid": "veylant-production-slo",
  "title": "Veylant — Production SLO & Error Budget",
  "tags": [
    "slo",
    "production",
    "veylant"
  ],
  "schemaVersion": 38,
  "version": 1,
  "time": {
    "from": "now-30d",
    "to": "now"
  },
  "refresh": "1m",
"panels": [
{
  "id": 1,
  "title": "Uptime SLO — 30-day rolling (target: 99.5%)",
  "description": "Share of requests that did not error over the trailing 30 days (1 - errors / requests). Evaluated as a single instant query at the end of the selected time range.",
  "type": "gauge",
  "gridPos": { "h": 8, "w": 6, "x": 0, "y": 0 },
  "options": {
    "reduceOptions": { "calcs": ["lastNotNull"] },
    "orientation": "auto",
    "showThresholdLabels": true,
    "showThresholdMarkers": true
  },
  "fieldConfig": {
    "defaults": {
      "unit": "percentunit",
      "min": 0.99,
      "max": 1,
      "thresholds": {
        "mode": "absolute",
        "steps": [
          { "color": "red", "value": null },
          { "color": "yellow", "value": 0.995 },
          { "color": "green", "value": 0.999 }
        ]
      }
    }
  },
  "targets": [
    {
      "expr": "1 - (sum(increase(veylant_request_errors_total[30d])) / sum(increase(veylant_requests_total[30d])))",
      "legendFormat": "Uptime SLO",
      "instant": true,
      "range": false
    }
  ]
},
{
  "id": 2,
  "title": "Error Budget Remaining (minutes)",
  "description": "SLO target: 99.5% uptime over 30 days = 216 min allowed downtime. The value is 216 minus the 30-day error ratio scaled to minutes; it goes negative once the budget is exhausted.",
  "type": "stat",
  "gridPos": { "h": 8, "w": 6, "x": 6, "y": 0 },
  "options": {
    "reduceOptions": { "calcs": ["lastNotNull"] },
    "colorMode": "background"
  },
  "fieldConfig": {
    "defaults": {
      "unit": "m",
      "thresholds": {
        "mode": "absolute",
        "steps": [
          { "color": "red", "value": null },
          { "color": "yellow", "value": 43 },
          { "color": "green", "value": 108 }
        ]
      }
    }
  },
  "targets": [
    {
      "expr": "(0.005 * 30 * 24 * 60) - (sum(increase(veylant_request_errors_total[30d])) / sum(increase(veylant_requests_total[30d])) * 30 * 24 * 60)",
      "legendFormat": "Budget remaining (min)",
      "instant": true,
      "range": false
    }
  ]
},
{
  "id": 3,
  "title": "p99 Latency SLO (target: < 500ms)",
  "description": "99th-percentile request duration computed from the histogram buckets over the last 5 minutes. The gauge scale is fixed at 0 to 1 s so the 300 ms and 500 ms threshold markers stay in the same place.",
  "type": "gauge",
  "gridPos": { "h": 8, "w": 6, "x": 12, "y": 0 },
  "options": {
    "reduceOptions": { "calcs": ["lastNotNull"] },
    "orientation": "auto",
    "showThresholdMarkers": true
  },
  "fieldConfig": {
    "defaults": {
      "unit": "s",
      "min": 0,
      "max": 1,
      "thresholds": {
        "mode": "absolute",
        "steps": [
          { "color": "green", "value": null },
          { "color": "yellow", "value": 0.3 },
          { "color": "red", "value": 0.5 }
        ]
      }
    }
  },
  "targets": [
    {
      "expr": "histogram_quantile(0.99, sum by (le) (rate(veylant_request_duration_seconds_bucket[5m])))",
      "legendFormat": "p99 latency",
      "instant": true,
      "range": false
    }
  ]
},
{
  "id": 4,
  "title": "Active Alerts",
  "description": "Number of firing alerts whose job label matches veylant.*. Falls back to 0 when no alert is firing, so the panel shows a green 0 instead of 'No data'.",
  "type": "stat",
  "gridPos": { "h": 8, "w": 6, "x": 18, "y": 0 },
  "options": {
    "reduceOptions": { "calcs": ["lastNotNull"] },
    "colorMode": "background"
  },
  "fieldConfig": {
    "defaults": {
      "thresholds": {
        "mode": "absolute",
        "steps": [
          { "color": "green", "value": null },
          { "color": "yellow", "value": 1 },
          { "color": "red", "value": 3 }
        ]
      }
    }
  },
  "targets": [
    {
      "expr": "sum(ALERTS{alertstate=\"firing\",job=~\"veylant.*\"}) or vector(0)",
      "legendFormat": "Firing alerts",
      "instant": true,
      "range": false
    }
  ]
},
{
  "id": 5,
  "title": "PII Entities Detected — Rate by Type (per min)",
  "description": "Detected PII entities per minute, split by entity_type. The rate window adapts to the panel resolution, so long time ranges do not skip samples between points.",
  "type": "timeseries",
  "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
  "targets": [
    {
      "expr": "sum by (entity_type) (rate(veylant_pii_entities_detected_total[$__rate_interval])) * 60",
      "legendFormat": "{{ entity_type }}"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "unit": "short",
      "custom": { "lineWidth": 2 }
    }
  }
},
{
  "id": 6,
  "title": "PostgreSQL Active Connections",
  "description": "Active and idle database connection counts. Threshold lines are drawn at 15 (yellow) and 20 (red) connections.",
  "type": "timeseries",
  "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
  "targets": [
    {
      "expr": "veylant_db_connections_active",
      "legendFormat": "Active connections"
    },
    {
      "expr": "veylant_db_connections_idle",
      "legendFormat": "Idle connections"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "unit": "short",
      "custom": {
        "thresholdsStyle": { "mode": "line" }
      },
      "thresholds": {
        "mode": "absolute",
        "steps": [
          { "color": "green", "value": null },
          { "color": "yellow", "value": 15 },
          { "color": "red", "value": 20 }
        ]
      }
    }
  }
},
{
  "type": "piechart",
  "id": 7,
  "title": "Provider RPS Breakdown",
  "gridPos": {
    "x": 0,
    "y": 16,
    "w": 8,
    "h": 8
  },
  "targets": [
    {
      "legendFormat": "{{ provider }}",
      "expr": "sum by (provider) (rate(veylant_requests_total[5m]))"
    }
  ],
  "options": {
    "displayLabels": [
      "name",
      "percent"
    ],
    "pieType": "donut"
  }
},
{
  "id": 8,
  "title": "Provider RPS — Time Series",
  "description": "Requests per second, split by provider. The rate window adapts to the panel resolution, so long time ranges do not skip samples between points.",
  "type": "timeseries",
  "gridPos": { "h": 8, "w": 16, "x": 8, "y": 16 },
  "targets": [
    {
      "expr": "sum by (provider) (rate(veylant_requests_total[$__rate_interval]))",
      "legendFormat": "{{ provider }}"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "unit": "reqps",
      "custom": { "lineWidth": 2 }
    }
  }
},
{
  "id": 9,
  "title": "Redis Memory Usage %",
  "description": "Used memory as a percentage of the reported maximum. Series whose reported maximum is 0 are omitted to avoid a division by zero. Threshold lines are drawn at 70% (yellow) and 90% (red).",
  "type": "timeseries",
  "gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 },
  "targets": [
    {
      "expr": "redis_memory_used_bytes / (redis_memory_max_bytes > 0) * 100",
      "legendFormat": "Redis memory %"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "unit": "percent",
      "custom": {
        "thresholdsStyle": { "mode": "line" }
      },
      "thresholds": {
        "mode": "absolute",
        "steps": [
          { "color": "green", "value": null },
          { "color": "yellow", "value": 70 },
          { "color": "red", "value": 90 }
        ]
      }
    }
  }
},
{
  "type": "timeseries",
  "id": 10,
  "title": "Error Rate by Provider (5m avg)",
  "gridPos": {
    "x": 12,
    "y": 24,
    "w": 12,
    "h": 8
  },
  "fieldConfig": {
    "defaults": {
      "custom": {
        "lineWidth": 2
      },
      "unit": "percent"
    }
  },
  "targets": [
    {
      "legendFormat": "{{ provider }} error %",
      "expr": "veylant:error_rate:5m * 100"
    }
  ]
}
]
}