veylant/services/pii/pseudonymize.py
2026-02-23 13:35:04 +01:00

124 lines
4.4 KiB
Python

"""Pseudonymization and de-pseudonymization of PII entities.
Tokens: [PII:<TYPE>:<8-hex-chars>]
Mappings stored in Redis encrypted with AES-256-GCM.
"""
from __future__ import annotations
import base64
import os
import re
import uuid
from typing import Optional
from cryptography.hazmat.primitives.ciphers.aead import AESGCM
from layers.regex_layer import DetectedEntity
# ---------------------------------------------------------------------------
# Pseudonym token pattern + AES-256-GCM encryptor
# ---------------------------------------------------------------------------
# Matches pseudonym tokens of the form [PII:<TYPE>:<8-hex-chars>], e.g.
# "[PII:EMAIL:3fa4b1c2]" (see PseudonymMapper.anonymize, which builds them).
# NOTE(review): [A-Z_]+ assumes entity types are uppercase ASCII letters and
# underscores only (no digits) — confirm against DetectedEntity.entity_type.
_TOKEN_RE = re.compile(r"\[PII:[A-Z_]+:[0-9a-f]{8}\]")
class AESEncryptor:
    """Symmetric AES-256-GCM helper built from a base64-encoded 256-bit key.

    Wire format produced by :meth:`encrypt` and consumed by :meth:`decrypt`:
    a 12-byte random nonce followed by the GCM ciphertext (which itself
    carries the authentication tag).
    """

    _NONCE_SIZE = 12  # standard GCM nonce length, in bytes

    def __init__(self, key_b64: str) -> None:
        """Decode *key_b64* and initialise the cipher.

        Raises:
            ValueError: if the decoded key is not exactly 32 bytes.
        """
        raw_key = base64.b64decode(key_b64)
        if len(raw_key) != 32:
            raise ValueError(f"Encryption key must be 32 bytes, got {len(raw_key)}")
        self._aesgcm = AESGCM(raw_key)

    def encrypt(self, plaintext: str) -> bytes:
        """Encrypt *plaintext*; return nonce (12 bytes) + ciphertext."""
        nonce = os.urandom(self._NONCE_SIZE)
        return nonce + self._aesgcm.encrypt(nonce, plaintext.encode(), None)

    def decrypt(self, data: bytes) -> str:
        """Inverse of :meth:`encrypt`: split off the nonce, decrypt the rest."""
        nonce = data[: self._NONCE_SIZE]
        body = data[self._NONCE_SIZE:]
        return self._aesgcm.decrypt(nonce, body, None).decode()
# ---------------------------------------------------------------------------
# PseudonymMapper
# ---------------------------------------------------------------------------
class PseudonymMapper:
    """Map PII entities to pseudonym tokens, persisted in Redis.

    Tokens have the form ``[PII:<TYPE>:<8-hex-chars>]``.  Original values are
    encrypted with AES-256-GCM and stored in Redis under
    ``pii:<tenant>:<request>:<entity_type>:<short_id>``.  Embedding the entity
    type in the key lets :meth:`load_mapping_from_redis` rebuild full tokens,
    so its result is directly usable by :meth:`depseudonymize` (previously it
    could only return short-id keys, which no caller could use).
    """

    # Kept next to the token-building code in anonymize() so format and
    # matcher stay in sync.  NOTE(review): [A-Z_]+ assumes entity types are
    # uppercase letters/underscores only — confirm against DetectedEntity.
    _TOKEN_RE = re.compile(r"\[PII:[A-Z_]+:[0-9a-f]{8}\]")

    def __init__(self, redis_client, encryptor: AESEncryptor, ttl_seconds: int = 3600) -> None:
        """Store collaborators.

        Args:
            redis_client: redis-py-compatible client (``set``/``get``/``scan_iter``).
            encryptor: AES-256-GCM encryptor for the stored original values.
            ttl_seconds: expiry applied to every stored mapping entry.
        """
        self._redis = redis_client
        self._encryptor = encryptor
        self._ttl = ttl_seconds

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def anonymize(
        self,
        text: str,
        entities: list[DetectedEntity],
        tenant_id: str,
        request_id: str,
    ) -> tuple[str, dict[str, str]]:
        """Replace PII entities in *text* with pseudonym tokens.

        Entities are processed right-to-left so earlier offsets stay valid
        after each substitution.  Entities that overlap an already-replaced
        span (or lie outside the text) are skipped instead of corrupting
        the output.

        Returns:
            (anonymized_text, mapping) where mapping is {token: original_value}.
        """
        mapping: dict[str, str] = {}
        result = text
        cursor = len(text)  # left edge of the most recent replacement
        for entity in sorted(entities, key=lambda e: e.start, reverse=True):
            if entity.start < 0 or entity.end > cursor:
                # Overlaps a previous replacement or exceeds text bounds —
                # replacing it would splice the token at corrupted offsets.
                continue
            short_id = uuid.uuid4().hex[:8]
            token = f"[PII:{entity.entity_type}:{short_id}]"
            mapping[token] = entity.original_value
            result = result[: entity.start] + token + result[entity.end :]
            cursor = entity.start
            # Persist the encrypted original; the entity type is part of the
            # key so load_mapping_from_redis() can rebuild the full token.
            redis_key = f"pii:{tenant_id}:{request_id}:{entity.entity_type}:{short_id}"
            encrypted = self._encryptor.encrypt(entity.original_value)
            self._redis.set(redis_key, encrypted, ex=self._ttl)
        return result, mapping

    def depseudonymize(self, text: str, mapping: dict[str, str]) -> str:
        """Replace pseudonym tokens in *text* with their original values.

        Tokens absent from *mapping* are left untouched.
        """
        def replace(m: re.Match) -> str:
            token = m.group(0)
            return mapping.get(token, token)

        return self._TOKEN_RE.sub(replace, text)

    def load_mapping_from_redis(
        self, tenant_id: str, request_id: str
    ) -> dict[str, str]:
        """Reconstruct the {token: original_value} mapping from Redis.

        The returned dict is keyed by the full pseudonym token, so it can be
        passed straight to :meth:`depseudonymize`.  Entries that expire
        between SCAN and GET are skipped.
        """
        pattern = f"pii:{tenant_id}:{request_id}:*"
        mapping: dict[str, str] = {}
        for key in self._redis.scan_iter(pattern):
            encrypted = self._redis.get(key)
            if encrypted is None:
                continue  # expired between SCAN and GET
            text_key = key.decode() if isinstance(key, bytes) else key
            # Parse from the right: tenant/request ids may themselves
            # contain ':' characters.
            _, entity_type, short_id = text_key.rsplit(":", 2)
            original = self._encryptor.decrypt(encrypted)
            mapping[f"[PII:{entity_type}:{short_id}]"] = original
        return mapping