veylant/web/src/pages/docs/guides/CircuitBreakerGuide.tsx
2026-02-27 23:33:07 +01:00

119 lines
4.6 KiB
TypeScript

import { Callout } from "../components/Callout";
import { CodeBlock } from "../components/CodeBlock";
/** One card in the "Circuit Breaker States" grid. */
interface BreakerStateCard {
  /** State name; rendered as the card heading and used as the React key. */
  state: string;
  /** Border color class applied to the card's left border. */
  color: string;
  /** Background classes (light and dark variants). */
  bg: string;
  /** Short description rendered under the heading. */
  desc: string;
}

/** Static card data, hoisted so the array is not rebuilt on every render. */
const BREAKER_STATES: readonly BreakerStateCard[] = [
  {
    state: "Closed",
    color: "border-green-400",
    bg: "bg-green-50 dark:bg-green-950/30",
    desc: "Normal operation. Requests are forwarded to the provider. Failures are counted.",
  },
  {
    state: "Open",
    color: "border-red-400",
    bg: "bg-red-50 dark:bg-red-950/30",
    desc: "Provider bypassed. All requests use the fallback chain. Stays open for open_ttl seconds.",
  },
  {
    state: "Half-Open",
    color: "border-amber-400",
    bg: "bg-amber-50 dark:bg-amber-950/30",
    desc: "Testing if provider has recovered. One probe request sent. Success → Closed; Failure → Open.",
  },
];

// Code samples live at column 0 so the indentation inside each template
// literal is exactly what the reader sees (and can copy-paste).

/** YAML sample: breaker settings nested under the `circuit_breaker` key. */
const CONFIG_YAML = `circuit_breaker:
  threshold: 5    # consecutive failures to open the breaker
  open_ttl: 60s   # how long to stay open before half-open probe`;

/** JSON sample: a routing rule with a fallback chain. Kept comment-free so it is valid JSON. */
const ROUTING_RULE_JSON = `{
  "provider": "azure",
  "fallback_providers": ["anthropic", "openai"],
  "conditions": [{"field": "user.department", "operator": "eq", "value": "legal"}]
}`;

/** Shell sample: status request followed by an example response. `\\` renders as a single line-continuation backslash. */
const STATUS_CURL = `curl http://localhost:8090/v1/admin/providers/status \\
  -H "Authorization: Bearer $TOKEN"
# Response:
{
  "data": [
    {"provider": "openai", "state": "closed", "failures": 0, "last_failure": null},
    {"provider": "anthropic", "state": "closed", "failures": 0, "last_failure": null},
    {"provider": "azure", "state": "open", "failures": 5, "last_failure": "2026-01-15T14:30:00Z"},
    {"provider": "mistral", "state": "half-open", "failures": 3, "last_failure": "2026-01-15T14:28:00Z"},
    {"provider": "ollama", "state": "closed", "failures": 0, "last_failure": null}
  ]
}`;

/** YAML sample: the alert rule, with its fields indented under the list item. */
const ALERT_RULE_YAML = `- alert: CircuitBreakerOpen
  expr: veylant_circuit_breaker_state > 0
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: "Provider {{ $labels.provider }} circuit breaker is open"
    description: "The circuit breaker for {{ $labels.provider }} has opened after repeated failures."`;

/**
 * Documentation page for the circuit breaker and failover feature.
 *
 * Renders, in order: the three breaker states, the configuration sample,
 * the fallback chain, the status endpoint, the Prometheus alert rule, and
 * the development-mode degradation notes. The component takes no props and
 * holds no state; everything it shows is static content.
 */
export function CircuitBreakerGuide() {
  return (
    <div>
      <h1 id="circuit-breaker">Circuit Breaker & Failover</h1>
      <p>
        Veylant IA includes a per-provider circuit breaker that prevents cascading failures when an
        LLM provider is degraded or unreachable.
      </p>

      <h2 id="states">Circuit Breaker States</h2>
      <div className="grid grid-cols-1 sm:grid-cols-3 gap-4 my-4">
        {BREAKER_STATES.map((item) => (
          <div key={item.state} className={`rounded-lg border-l-4 p-4 ${item.color} ${item.bg}`}>
            <h3 className="font-semibold text-sm mb-2">{item.state}</h3>
            <p className="text-sm text-muted-foreground leading-relaxed">{item.desc}</p>
          </div>
        ))}
      </div>

      <h2 id="configuration">Configuration</h2>
      <CodeBlock language="yaml" code={CONFIG_YAML} />
      <Callout type="tip" title="Per-provider isolation">
        Each provider has an independent circuit breaker. A failing Azure deployment does not affect
        OpenAI or Anthropic calls.
      </Callout>

      <h2 id="fallback">Fallback Chain</h2>
      <p>
        When the primary provider's circuit is open, the routing engine uses the{" "}
        <code>fallback_providers</code> array from the matched routing rule:
      </p>
      <CodeBlock language="json" code={ROUTING_RULE_JSON} />
      {/* Explanation sits outside the sample because JSON has no comment syntax. */}
      <p>
        If azure is open → try anthropic → if anthropic is open → try openai → if all fail → 503
      </p>

      <h2 id="check-status">Checking Status</h2>
      <CodeBlock language="bash" code={STATUS_CURL} />

      <h2 id="prometheus">Prometheus Alert</h2>
      <p>
        The <code>CircuitBreakerOpen</code> alert fires when any provider is in the open state:
      </p>
      <CodeBlock language="yaml" code={ALERT_RULE_YAML} />

      <h2 id="graceful-degradation">Development Mode Degradation</h2>
      <Callout type="info">
        In <code>server.env=development</code>, Veylant IA degrades gracefully if services are
        unreachable:
        <ul className="mt-2 space-y-1">
          <li><strong>Keycloak unreachable</strong> → MockVerifier (auth bypassed)</li>
          <li><strong>PostgreSQL unreachable</strong> → routing disabled, feature flags use in-memory defaults</li>
          <li><strong>ClickHouse unreachable</strong> → audit logging disabled</li>
          <li><strong>PII service unreachable</strong> → PII skipped if <code>fail_open=true</code></li>
        </ul>
        In <code>production</code> mode, any of the above causes a fatal startup error.
      </Callout>
    </div>
  );
}