veylant/internal/pii/http.go
2026-03-06 18:38:04 +01:00

185 lines
5.6 KiB
Go

package pii
import (
"encoding/json"
"fmt"
"net/http"
"regexp"
"sort"
"strings"
"go.uber.org/zap"
"github.com/veylant/ia-gateway/internal/apierror"
)
// AnalyzeRequest is the JSON body accepted by POST /v1/pii/analyze.
type AnalyzeRequest struct {
// Text is the raw input to scan for PII. The handler answers an empty Text
// with an empty result without running any detection.
Text string `json:"text"`
}
// AnalyzeEntity is a single PII entity returned by the analyze endpoint.
type AnalyzeEntity struct {
// Type is the entity label, e.g. "EMAIL_ADDRESS" or "IBAN_CODE".
Type string `json:"type"`
// Start and End delimit the entity inside the analyzed text. On the regex
// fallback path they are byte offsets with End exclusive; on the service path
// they are copied from the service response as-is.
// NOTE(review): the unit of the service offsets (bytes vs. characters) is not
// visible from this file — confirm both paths agree for non-ASCII input.
Start int `json:"start"`
End int `json:"end"`
// Confidence is the detector's score for this entity. The regex fallback
// always reports 0.95.
Confidence float64 `json:"confidence"`
// Layer names the detection layer that produced the entity. The regex
// fallback reports "regex-local".
Layer string `json:"layer"`
}
// AnalyzeResponse is the JSON response of POST /v1/pii/analyze.
type AnalyzeResponse struct {
// Anonymized is the input text with detected entities replaced. On the regex
// fallback path each entity becomes a placeholder of the form [TYPE_n].
Anonymized string `json:"anonymized"`
// Entities lists the detected entities. The code in this file always fills it
// with a non-nil slice, so "no PII" is encoded as [] rather than null.
Entities []AnalyzeEntity `json:"entities"`
}
// AnalyzeHandler wraps a pii.Client as an HTTP handler for the playground.
// It is safe to call when client is nil: falls back to regex detection.
type AnalyzeHandler struct {
// client is used to call the PII detection service. nil means the service is
// disabled and only the local regex fallback is used.
client *Client
// logger records why a request fell back to regex detection.
logger *zap.Logger
}
// NewAnalyzeHandler creates a new AnalyzeHandler.
//
// client may be nil (PII service disabled) — the handler then falls back to
// regex detection. logger may also be nil, in which case a no-op logger is
// substituted so the code paths that log a fallback cannot dereference a nil
// *zap.Logger.
func NewAnalyzeHandler(client *Client, logger *zap.Logger) *AnalyzeHandler {
	if logger == nil {
		logger = zap.NewNop()
	}
	return &AnalyzeHandler{client: client, logger: logger}
}
// ServeHTTP handles POST /v1/pii/analyze for the playground.
//
// The request body is decoded as an AnalyzeRequest; malformed JSON is answered
// through apierror with a bad-request error. An empty text short-circuits to
// an empty result. Otherwise the handler asks the PII service (when a client
// is configured) and falls back to local regex detection if the service
// returns an error or signals the degraded fail-open path. Every non-error
// outcome is written as a 200 JSON AnalyzeResponse.
//
// NOTE(review): the body is read without a size cap; if no upstream
// middleware limits it, consider wrapping r.Body in http.MaxBytesReader.
func (h *AnalyzeHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
	var req AnalyzeRequest
	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
		apierror.WriteError(w, apierror.NewBadRequestError("invalid JSON: "+err.Error()))
		return
	}

	if req.Text == "" {
		h.writeAnalyzeResponse(w, AnalyzeResponse{Anonymized: "", Entities: []AnalyzeEntity{}})
		return
	}

	// Attempt real PII detection if service is available.
	// NOTE: when fail_open=true, Detect() returns (result, nil) even on RPC
	// failure, but sets result.Entities to nil to signal the degraded path.
	// A real successful response always has a non-nil Entities slice.
	if h.client != nil {
		resp, err := h.client.Detect(r.Context(), req.Text, "playground", "playground-analyze", true, false)
		switch {
		case err != nil:
			h.logger.Warn("PII service error — falling back to regex detection", zap.Error(err))
		case resp.Entities != nil:
			// Real PII service response (may be empty if no PII detected).
			entities := make([]AnalyzeEntity, 0, len(resp.Entities))
			for _, e := range resp.Entities {
				entities = append(entities, AnalyzeEntity{
					Type:       e.EntityType,
					Start:      int(e.Start),
					End:        int(e.End),
					Confidence: float64(e.Confidence),
					Layer:      e.DetectionLayer,
				})
			}
			h.writeAnalyzeResponse(w, AnalyzeResponse{
				Anonymized: resp.AnonymizedText,
				Entities:   entities,
			})
			return
		default:
			h.logger.Debug("PII service unavailable (fail-open) — falling back to regex detection")
		}
	}

	// Fallback: local regex detection so the playground stays useful when the
	// PII sidecar is not running (dev mode, demo environments).
	h.writeAnalyzeResponse(w, regexDetect(req.Text))
}

// writeAnalyzeResponse sends resp as a 200 application/json body. Once the
// header has been written the status can no longer change, so an encoding or
// write failure is logged instead of being silently dropped.
func (h *AnalyzeHandler) writeAnalyzeResponse(w http.ResponseWriter, resp AnalyzeResponse) {
	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(http.StatusOK)
	// The logger nil check keeps the client-less path usable for handlers
	// constructed without a logger.
	if err := json.NewEncoder(w).Encode(resp); err != nil && h.logger != nil {
		h.logger.Warn("failed to write PII analyze response", zap.Error(err))
	}
}
// ── Regex-based local detection ───────────────────────────────────────────────
// Package-level compiled regexes (compiled once at startup).
// Only rePiiSSN is anchored with \b; the other patterns can also match inside
// a longer run of characters.
var (
// rePiiIBAN: "FR" (case-insensitive) followed by 2 digits, then five 4-digit
// groups and a final 3-digit group (25 digits in total). Each group may be
// preceded by one whitespace character.
rePiiIBAN = regexp.MustCompile(`(?i)FR\d{2}[\s]?\d{4}[\s]?\d{4}[\s]?\d{4}[\s]?\d{4}[\s]?\d{4}[\s]?\d{3}`)
// rePiiEmail: a local part of letters, digits and ._%+- , an "@", a domain of
// letters, digits, dots and hyphens, then a dot and at least two letters.
rePiiEmail = regexp.MustCompile(`[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}`)
// rePiiPhone: prefix "+33", "0033" or "0", optional whitespace, one digit 1-9,
// then four 2-digit pairs, each optionally preceded by whitespace, "." or "-".
rePiiPhone = regexp.MustCompile(`(?:\+33|0033|0)[\s]?[1-9](?:[\s.\-]?\d{2}){4}`)
// rePiiCC: 16 digits in four groups of 4, groups optionally separated by one
// space or "-". The first group must start with 4, 51-55, 34 or 37.
// NOTE(review): 34/37 prefixes are matched with the same 16-digit layout as
// the others — confirm that is the intended card length for them.
rePiiCC = regexp.MustCompile(`(?:4\d{3}|5[1-5]\d{2}|3[47]\d{2})(?:[ \-]?\d{4}){3}`)
// rePiiSSN: 15 digits between word boundaries, grouped 1-2-2-2-3-3-2 with
// optional whitespace between groups. The first digit must be 1 or 2.
rePiiSSN = regexp.MustCompile(`\b[12][\s]?\d{2}[\s]?\d{2}[\s]?\d{2}[\s]?\d{3}[\s]?\d{3}[\s]?\d{2}\b`)
)
// regexPattern pairs a compiled regex with the entity type label reported for
// its matches.
type regexPattern struct {
// re is the compiled pattern run against the input text.
re *regexp.Regexp
// typ is the label copied into AnalyzeEntity.Type and into the placeholder
// that replaces the match in the anonymized text.
typ string
}
// piiPatterns is the list of patterns regexDetect applies, iterated in this
// order. Each entry maps one of the package-level regexes to its entity label.
var piiPatterns = []regexPattern{
{rePiiIBAN, "IBAN_CODE"},
{rePiiEmail, "EMAIL_ADDRESS"},
{rePiiPhone, "PHONE_NUMBER"},
{rePiiCC, "CREDIT_CARD"},
{rePiiSSN, "FR_SSN"},
}
// rawMatch is one regex hit collected by regexDetect before overlapping
// matches are resolved.
type rawMatch struct {
// typ is the entity label of the pattern that matched.
typ string
// start and end are the byte offsets of the match in the scanned text, as
// returned by FindAllStringIndex (end is exclusive).
start int
end int
}
// regexDetect is the local fallback detector. It runs every pattern in
// piiPatterns over text and returns the kept matches as entities together with
// an anonymized copy of text in which each kept match is replaced by a
// placeholder of the form [TYPE_n] (n counts per type, starting at 1).
//
// Entity Start/End are byte offsets into text (End exclusive). When matches
// overlap, the one that starts first wins; among matches starting at the same
// offset the longest wins, and a remaining tie goes to the pattern listed
// first in piiPatterns. This makes the result deterministic and avoids
// replacing only a prefix of a longer entity. The returned Entities slice is
// never nil, so it encodes as [] rather than null.
func regexDetect(text string) AnalyzeResponse {
	var matches []rawMatch
	for _, p := range piiPatterns {
		for _, loc := range p.re.FindAllStringIndex(text, -1) {
			matches = append(matches, rawMatch{typ: p.typ, start: loc[0], end: loc[1]})
		}
	}
	if len(matches) == 0 {
		return AnalyzeResponse{Anonymized: text, Entities: []AnalyzeEntity{}}
	}

	// Stable sort: earliest start first, longest match first on equal start.
	// Stability keeps piiPatterns order as the final tie-break.
	sort.SliceStable(matches, func(i, j int) bool {
		if matches[i].start != matches[j].start {
			return matches[i].start < matches[j].start
		}
		return matches[i].end > matches[j].end
	})

	// Single pass: skip matches that overlap an already-kept one, and build
	// the anonymized text and the entity list for the rest.
	var sb strings.Builder
	sb.Grow(len(text))
	counters := make(map[string]int, len(piiPatterns))
	entities := make([]AnalyzeEntity, 0, len(matches))
	cursor := 0
	for _, m := range matches {
		if m.start < cursor {
			continue // overlaps the previously kept match
		}
		sb.WriteString(text[cursor:m.start])
		counters[m.typ]++
		fmt.Fprintf(&sb, "[%s_%d]", m.typ, counters[m.typ])
		entities = append(entities, AnalyzeEntity{
			Type:       m.typ,
			Start:      m.start,
			End:        m.end,
			Confidence: 0.95,
			Layer:      "regex-local",
		})
		cursor = m.end
	}
	sb.WriteString(text[cursor:])

	return AnalyzeResponse{
		Anonymized: sb.String(),
		Entities:   entities,
	}
}