veylant/services/pii/tests/test_pseudo.py
2026-02-23 13:35:04 +01:00

173 lines
6.3 KiB
Python

"""Tests for pseudonymization: AES-256-GCM encryptor + PseudonymMapper."""
from __future__ import annotations
import base64
import re
from unittest.mock import MagicMock
import pytest
from layers.regex_layer import DetectedEntity
from pseudonymize import AESEncryptor, PseudonymMapper
# Stable development key for the tests: the raw literal is exactly 32 bytes
# and is handed to AESEncryptor in its base64-encoded text form.
_KEY_B64 = base64.b64encode(b"veylant-dev-key-32bytes-padding-").decode()
# Shape of a pseudonym token: "[PII:<TYPE>:<8 lowercase hex digits>]", where
# <TYPE> consists of uppercase ASCII letters and underscores.
_TOKEN_RE = re.compile(r"\[PII:[A-Z_]+:[0-9a-f]{8}\]")
# ---------------------------------------------------------------------------
# AESEncryptor tests
# ---------------------------------------------------------------------------
class TestAESEncryptor:
    """Round-trip and failure-mode checks for ``AESEncryptor``."""

    @pytest.fixture
    def enc(self) -> AESEncryptor:
        """Provide an encryptor built from the shared development key."""
        encryptor = AESEncryptor(_KEY_B64)
        return encryptor

    def test_encrypt_returns_bytes(self, enc):
        """``encrypt`` hands back a ``bytes`` object."""
        assert isinstance(enc.encrypt("secret"), bytes)

    def test_decrypt_round_trip(self, enc):
        """Decrypting an encrypted value yields the original string."""
        original = "jean.dupont@example.com"
        ciphertext = enc.encrypt(original)
        assert enc.decrypt(ciphertext) == original

    def test_different_encryptions_of_same_value(self, enc):
        """Encrypting the same input twice must not give the same ciphertext.

        This is the behaviour expected from AES-GCM with a fresh random nonce
        per call.
        """
        first = enc.encrypt("secret")
        second = enc.encrypt("secret")
        assert first != second
        # Each ciphertext must still decrypt to the same plaintext.
        recovered_first = enc.decrypt(first)
        recovered_second = enc.decrypt(second)
        assert recovered_first == recovered_second
        assert recovered_second == "secret"

    def test_empty_string_roundtrip(self, enc):
        """The empty string survives an encrypt/decrypt cycle."""
        ciphertext = enc.encrypt("")
        assert enc.decrypt(ciphertext) == ""

    def test_unicode_roundtrip(self, enc):
        """Non-ASCII characters survive an encrypt/decrypt cycle."""
        original = "Óscar García-López"
        ciphertext = enc.encrypt(original)
        assert enc.decrypt(ciphertext) == original

    def test_wrong_key_raises(self, enc):
        """Decrypting with a different key must raise."""
        ciphertext = enc.encrypt("secret")
        other_key_b64 = base64.b64encode(b"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx").decode()
        # Built outside the ``raises`` block so only ``decrypt`` may satisfy it.
        other_enc = AESEncryptor(other_key_b64)
        with pytest.raises(Exception):
            other_enc.decrypt(ciphertext)

    def test_invalid_key_length_raises(self):
        """A key that is too short is rejected with ``ValueError``."""
        too_short_b64 = base64.b64encode(b"short").decode()
        with pytest.raises(ValueError):
            AESEncryptor(too_short_b64)
# ---------------------------------------------------------------------------
# PseudonymMapper tests
# ---------------------------------------------------------------------------
@pytest.fixture
def redis_mock() -> MagicMock:
    """Return a stand-in Redis client whose ``set`` call returns ``True``."""
    fake_redis = MagicMock()
    fake_redis.configure_mock(**{"set.return_value": True})
    return fake_redis
@pytest.fixture
def mapper(redis_mock) -> PseudonymMapper:
    """Build a ``PseudonymMapper`` over the mocked Redis with a 3600 s TTL."""
    return PseudonymMapper(redis_mock, AESEncryptor(_KEY_B64), ttl_seconds=3600)
class TestPseudonymMapper:
    """Tests for ``PseudonymMapper.anonymize`` and ``depseudonymize``.

    All tests use the ``mapper`` fixture, which is backed by a mocked Redis
    client. Entities are built as ``DetectedEntity(type, value, start, end)``
    with ``end`` exclusive (presumed from the offsets used below).
    """

    def test_anonymize_replaces_entity(self, mapper):
        """The detected value disappears from the text and a token appears."""
        text = "Email: alice@example.com"
        entities = [DetectedEntity("EMAIL", "alice@example.com", 7, 24)]
        anon, _ = mapper.anonymize(text, entities, "tenant1", "req1")
        assert "alice@example.com" not in anon
        assert _TOKEN_RE.search(anon) is not None

    def test_anonymize_returns_mapping(self, mapper):
        """The returned mapping has one entry holding the original value."""
        text = "alice@example.com"
        entities = [DetectedEntity("EMAIL", "alice@example.com", 0, 17)]
        _, mapping = mapper.anonymize(text, entities, "tenant1", "req1")
        assert len(mapping) == 1
        assert "alice@example.com" in mapping.values()

    def test_token_format(self, mapper):
        """Mapping keys are tokens of the exact form ``[PII:EMAIL:<8 hex>]``."""
        text = "alice@example.com"
        entities = [DetectedEntity("EMAIL", "alice@example.com", 0, 17)]
        _, mapping = mapper.anonymize(text, entities, "t", "r")
        token = next(iter(mapping))
        # fullmatch, not match: trailing characters after "]" must fail too.
        assert re.fullmatch(r"\[PII:EMAIL:[0-9a-f]{8}\]", token)

    def test_depseudonymize_restores_original(self, mapper):
        """anonymize followed by depseudonymize reproduces the input text."""
        text = "Email: alice@example.com"
        entities = [DetectedEntity("EMAIL", "alice@example.com", 7, 24)]
        anon, mapping = mapper.anonymize(text, entities, "t", "r")
        restored = mapper.depseudonymize(anon, mapping)
        assert restored == text

    def test_multiple_entities_anonymized(self, mapper):
        """Every entity is replaced and each gets its own mapping entry."""
        text = "alice@example.com et bob@example.com"
        entities = [
            DetectedEntity("EMAIL", "alice@example.com", 0, 17),
            DetectedEntity("EMAIL", "bob@example.com", 21, 36),
        ]
        anon, mapping = mapper.anonymize(text, entities, "t", "r")
        assert "alice@example.com" not in anon
        assert "bob@example.com" not in anon
        assert len(mapping) == 2

    def test_depseudonymize_multiple(self, mapper):
        """A text with several entities round-trips exactly."""
        text = "alice@example.com et bob@example.com"
        entities = [
            DetectedEntity("EMAIL", "alice@example.com", 0, 17),
            DetectedEntity("EMAIL", "bob@example.com", 21, 36),
        ]
        anon, mapping = mapper.anonymize(text, entities, "t", "r")
        restored = mapper.depseudonymize(anon, mapping)
        # Exact equality also catches damage to the text between the entities.
        assert restored == text

    def test_redis_set_called_per_entity(self, mapper, redis_mock):
        """One Redis ``set`` call is issued per anonymized entity."""
        text = "alice@example.com et 06 12 34 56 78"
        entities = [
            DetectedEntity("EMAIL", "alice@example.com", 0, 17),
            DetectedEntity("PHONE_FR", "06 12 34 56 78", 21, 35),
        ]
        mapper.anonymize(text, entities, "tenant1", "req1")
        assert redis_mock.set.call_count == 2

    def test_redis_ttl_passed(self, mapper, redis_mock):
        """The mapper forwards a TTL of 3600 to Redis via the ``ex`` keyword."""
        text = "alice@example.com"
        entities = [DetectedEntity("EMAIL", "alice@example.com", 0, 17)]
        mapper.anonymize(text, entities, "t", "r")
        # call_args unpacks as (positional args, keyword args).
        _, call_kwargs = redis_mock.set.call_args
        assert call_kwargs.get("ex") == 3600

    def test_empty_entities_no_change(self, mapper):
        """With no entities the text is returned unchanged with an empty mapping."""
        text = "Texte sans PII"
        anon, mapping = mapper.anonymize(text, [], "t", "r")
        assert anon == text
        assert mapping == {}

    def test_depseudonymize_unknown_token_left_as_is(self, mapper):
        """A token missing from the mapping is left in the text untouched."""
        text = "[PII:EMAIL:deadbeef]"
        result = mapper.depseudonymize(text, {})
        assert result == "[PII:EMAIL:deadbeef]"

    def test_overlapping_entities_correct_offsets(self, mapper):
        """Replacing one entity must not invalidate the other's offsets.

        Despite the test name, the two spans here do not overlap: they are
        adjacent and separated by a single space. Tokens differ in length
        from the values they replace, so the mapper has to substitute in an
        order that keeps the remaining offsets valid (e.g. right to left).
        """
        text = "a@b.com c@d.com"
        entities = [
            DetectedEntity("EMAIL", "a@b.com", 0, 7),
            DetectedEntity("EMAIL", "c@d.com", 8, 15),
        ]
        anon, mapping = mapper.anonymize(text, entities, "t", "r")
        # Neither original value may survive, even partially shifted.
        assert "a@b.com" not in anon
        assert "c@d.com" not in anon
        restored = mapper.depseudonymize(anon, mapping)
        # An offset error would corrupt the text, so require an exact match.
        assert restored == text