185 lines
5.6 KiB
Go
185 lines
5.6 KiB
Go
package pii
|
|
|
|
import (
|
|
"encoding/json"
|
|
"fmt"
|
|
"net/http"
|
|
"regexp"
|
|
"sort"
|
|
"strings"
|
|
|
|
"go.uber.org/zap"
|
|
|
|
"github.com/veylant/ia-gateway/internal/apierror"
|
|
)
|
|
|
|
// AnalyzeRequest is the JSON payload decoded from the body of
// POST /v1/pii/analyze.
type AnalyzeRequest struct {
	// Text is the raw input to scan for PII. The handler answers an empty
	// Text with an empty result.
	Text string `json:"text"`
}
|
|
|
|
// AnalyzeEntity describes one PII finding reported by the analyze endpoint.
type AnalyzeEntity struct {
	// Type is the entity label; the local regex fallback emits values such
	// as "EMAIL_ADDRESS" or "IBAN_CODE".
	Type string `json:"type"`
	// Start is the offset at which the finding begins in the analyzed text.
	// The regex fallback reports byte offsets; values coming from the PII
	// service are passed through unchanged.
	Start int `json:"start"`
	// End is the offset at which the finding stops (exclusive for the regex
	// fallback).
	End int `json:"end"`
	// Confidence is the detector's score; the regex fallback always reports
	// 0.95.
	Confidence float64 `json:"confidence"`
	// Layer names the detector that produced the finding ("regex-local" for
	// the fallback).
	Layer string `json:"layer"`
}
|
|
|
|
// AnalyzeResponse is the JSON response of POST /v1/pii/analyze.
|
|
type AnalyzeResponse struct {
|
|
Anonymized string `json:"anonymized"`
|
|
Entities []AnalyzeEntity `json:"entities"`
|
|
}
|
|
|
|
// AnalyzeHandler wraps a pii.Client as an HTTP handler for the playground.
|
|
// It is safe to call when client is nil: falls back to regex detection.
|
|
type AnalyzeHandler struct {
|
|
client *Client
|
|
logger *zap.Logger
|
|
}
|
|
|
|
// NewAnalyzeHandler creates a new AnalyzeHandler.
|
|
// client may be nil (PII service disabled) — the handler falls back to regex.
|
|
func NewAnalyzeHandler(client *Client, logger *zap.Logger) *AnalyzeHandler {
|
|
return &AnalyzeHandler{client: client, logger: logger}
|
|
}
|
|
|
|
func (h *AnalyzeHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
|
|
var req AnalyzeRequest
|
|
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
|
apierror.WriteError(w, apierror.NewBadRequestError("invalid JSON: "+err.Error()))
|
|
return
|
|
}
|
|
if req.Text == "" {
|
|
w.Header().Set("Content-Type", "application/json")
|
|
w.WriteHeader(http.StatusOK)
|
|
_ = json.NewEncoder(w).Encode(AnalyzeResponse{Anonymized: "", Entities: []AnalyzeEntity{}})
|
|
return
|
|
}
|
|
|
|
// Attempt real PII detection if service is available.
|
|
// NOTE: when fail_open=true, Detect() returns (result, nil) even on RPC
|
|
// failure, but sets result.Entities to nil to signal the degraded path.
|
|
// A real successful response always has a non-nil Entities slice.
|
|
if h.client != nil {
|
|
resp, err := h.client.Detect(r.Context(), req.Text, "playground", "playground-analyze", true, false)
|
|
if err == nil && resp.Entities != nil {
|
|
// Real PII service response (may be empty if no PII detected).
|
|
entities := make([]AnalyzeEntity, 0, len(resp.Entities))
|
|
for _, e := range resp.Entities {
|
|
entities = append(entities, AnalyzeEntity{
|
|
Type: e.EntityType,
|
|
Start: int(e.Start),
|
|
End: int(e.End),
|
|
Confidence: float64(e.Confidence),
|
|
Layer: e.DetectionLayer,
|
|
})
|
|
}
|
|
w.Header().Set("Content-Type", "application/json")
|
|
w.WriteHeader(http.StatusOK)
|
|
_ = json.NewEncoder(w).Encode(AnalyzeResponse{
|
|
Anonymized: resp.AnonymizedText,
|
|
Entities: entities,
|
|
})
|
|
return
|
|
}
|
|
if err != nil {
|
|
h.logger.Warn("PII service error — falling back to regex detection", zap.Error(err))
|
|
} else {
|
|
h.logger.Debug("PII service unavailable (fail-open) — falling back to regex detection")
|
|
}
|
|
}
|
|
|
|
// Fallback: local regex detection so the playground stays useful when the
|
|
// PII sidecar is not running (dev mode, demo environments).
|
|
result := regexDetect(req.Text)
|
|
w.Header().Set("Content-Type", "application/json")
|
|
w.WriteHeader(http.StatusOK)
|
|
_ = json.NewEncoder(w).Encode(result)
|
|
}
|
|
|
|
// ── Regex-based local detection ───────────────────────────────────────────────
|
|
|
|
// Regexes used by the local fallback detector. They are compiled once, when
// the package is initialised; regexp.MustCompile panics on an invalid pattern.
var (
	// rePiiIBAN matches "FR" (any letter case) followed by 25 digits laid out
	// as 2 + 4 + 4 + 4 + 4 + 4 + 3, each group after the first optionally
	// preceded by one whitespace character.
	rePiiIBAN = regexp.MustCompile(`(?i)FR\d{2}[\s]?\d{4}[\s]?\d{4}[\s]?\d{4}[\s]?\d{4}[\s]?\d{4}[\s]?\d{3}`)

	// rePiiEmail matches local-part@domain.tld where the final label has at
	// least two ASCII letters.
	rePiiEmail = regexp.MustCompile(`[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}`)

	// rePiiPhone matches a "+33", "0033" or "0" prefix, an optional whitespace
	// character, one digit 1-9 and then four digit pairs, each optionally
	// preceded by whitespace, a dot or a hyphen. The pattern has no boundary
	// anchors, so it can also match inside a longer digit run.
	rePiiPhone = regexp.MustCompile(`(?:\+33|0033|0)[\s]?[1-9](?:[\s.\-]?\d{2}){4}`)

	// rePiiCC matches four groups of four digits, optionally separated by a
	// space or a hyphen, where the first group starts with 4, 51-55, 34 or 37.
	rePiiCC = regexp.MustCompile(`(?:4\d{3}|5[1-5]\d{2}|3[47]\d{2})(?:[ \-]?\d{4}){3}`)

	// rePiiSSN matches, between word boundaries, a leading 1 or 2 followed by
	// digit groups of 2, 2, 2, 3, 3 and 2, each optionally preceded by one
	// whitespace character (15 digits in total).
	rePiiSSN = regexp.MustCompile(`\b[12][\s]?\d{2}[\s]?\d{2}[\s]?\d{2}[\s]?\d{3}[\s]?\d{3}[\s]?\d{2}\b`)
)
|
|
|
|
// regexPattern associates a compiled expression with the entity type that is
// reported for its matches.
type regexPattern struct {
	// re is the expression run against the analyzed text.
	re *regexp.Regexp
	// typ is the label copied into the Type of every resulting entity.
	typ string
}
|
|
|
|
var piiPatterns = []regexPattern{
|
|
{rePiiIBAN, "IBAN_CODE"},
|
|
{rePiiEmail, "EMAIL_ADDRESS"},
|
|
{rePiiPhone, "PHONE_NUMBER"},
|
|
{rePiiCC, "CREDIT_CARD"},
|
|
{rePiiSSN, "FR_SSN"},
|
|
}
|
|
|
|
// rawMatch records one regex hit before overlapping hits are filtered out.
type rawMatch struct {
	// typ is the entity type of the pattern that produced the hit.
	typ string
	// start and end are the offsets returned by the regexp index search
	// (end is exclusive).
	start, end int
}
|
|
|
|
// regexDetect runs a set of compiled regexes over text and returns matched
|
|
// entities with their byte offsets, plus an anonymized version of the text.
|
|
func regexDetect(text string) AnalyzeResponse {
|
|
var raw []rawMatch
|
|
for _, p := range piiPatterns {
|
|
for _, loc := range p.re.FindAllStringIndex(text, -1) {
|
|
raw = append(raw, rawMatch{typ: p.typ, start: loc[0], end: loc[1]})
|
|
}
|
|
}
|
|
// Sort by start position.
|
|
sort.Slice(raw, func(i, j int) bool { return raw[i].start < raw[j].start })
|
|
|
|
// Remove overlapping matches (keep the first / longest-starting one).
|
|
filtered := raw[:0]
|
|
cursor := 0
|
|
for _, m := range raw {
|
|
if m.start >= cursor {
|
|
filtered = append(filtered, m)
|
|
cursor = m.end
|
|
}
|
|
}
|
|
|
|
if len(filtered) == 0 {
|
|
return AnalyzeResponse{Anonymized: text, Entities: []AnalyzeEntity{}}
|
|
}
|
|
|
|
// Build anonymized text and entity list simultaneously.
|
|
var sb strings.Builder
|
|
counters := map[string]int{}
|
|
entities := make([]AnalyzeEntity, 0, len(filtered))
|
|
cursor = 0
|
|
|
|
for _, m := range filtered {
|
|
sb.WriteString(text[cursor:m.start])
|
|
counters[m.typ]++
|
|
sb.WriteString(fmt.Sprintf("[%s_%d]", m.typ, counters[m.typ]))
|
|
entities = append(entities, AnalyzeEntity{
|
|
Type: m.typ,
|
|
Start: m.start,
|
|
End: m.end,
|
|
Confidence: 0.95,
|
|
Layer: "regex-local",
|
|
})
|
|
cursor = m.end
|
|
}
|
|
sb.WriteString(text[cursor:])
|
|
|
|
return AnalyzeResponse{
|
|
Anonymized: sb.String(),
|
|
Entities: entities,
|
|
}
|
|
}
|