173 lines
6.3 KiB
Python
173 lines
6.3 KiB
Python
"""Tests for pseudonymization: AES-256-GCM encryptor + PseudonymMapper."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import base64
|
|
import re
|
|
from unittest.mock import MagicMock
|
|
|
|
import pytest
|
|
|
|
from layers.regex_layer import DetectedEntity
|
|
from pseudonymize import AESEncryptor, PseudonymMapper
|
|
|
|
# Stable 32-byte development key, base64-encoded because that is the form
# AESEncryptor is constructed with throughout this module. Test-only value.
_KEY_B64 = base64.b64encode(b"veylant-dev-key-32bytes-padding-").decode()

# Shape of a pseudonym token: [PII:<ENTITY_TYPE>:<8 lowercase hex chars>],
# e.g. "[PII:EMAIL:deadbeef]".
_TOKEN_RE = re.compile(r"\[PII:[A-Z_]+:[0-9a-f]{8}\]")
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# AESEncryptor tests
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestAESEncryptor:
    """Round-trip and failure-mode tests for ``AESEncryptor``."""

    @pytest.fixture
    def enc(self) -> AESEncryptor:
        """Return an encryptor built from the shared development key."""
        return AESEncryptor(_KEY_B64)

    def test_encrypt_returns_bytes(self, enc: AESEncryptor) -> None:
        """``encrypt`` returns a ``bytes`` object."""
        result = enc.encrypt("secret")
        assert isinstance(result, bytes)

    def test_decrypt_round_trip(self, enc: AESEncryptor) -> None:
        """Decrypting an encrypted value yields the original plaintext."""
        plaintext = "jean.dupont@example.com"
        assert enc.decrypt(enc.encrypt(plaintext)) == plaintext

    def test_different_encryptions_of_same_value(self, enc: AESEncryptor) -> None:
        """AES-GCM uses a random nonce: same input gives different ciphertext."""
        a = enc.encrypt("secret")
        b = enc.encrypt("secret")
        assert a != b
        # Both ciphertexts must still decrypt to the same plaintext.
        assert enc.decrypt(a) == enc.decrypt(b) == "secret"

    def test_empty_string_roundtrip(self, enc: AESEncryptor) -> None:
        """The empty string survives an encrypt/decrypt round trip."""
        assert enc.decrypt(enc.encrypt("")) == ""

    def test_unicode_roundtrip(self, enc: AESEncryptor) -> None:
        """Non-ASCII characters survive an encrypt/decrypt round trip."""
        value = "Óscar García-López"
        assert enc.decrypt(enc.encrypt(value)) == value

    def test_wrong_key_raises(self, enc: AESEncryptor) -> None:
        """Decrypting with a different key of the same length fails."""
        encrypted = enc.encrypt("secret")
        # Same length as the dev key (32 bytes) so that construction is not
        # what fails; only the decryption below is expected to raise.
        wrong_enc = AESEncryptor(base64.b64encode(b"x" * 32).decode())
        # NOTE(review): the concrete exception type raised by decrypt() is not
        # visible from this module (presumably an authentication-tag error
        # from the crypto backend); narrow this once confirmed.
        with pytest.raises(Exception):
            wrong_enc.decrypt(encrypted)

    def test_invalid_key_length_raises(self) -> None:
        """Constructing with a key that is too short raises ``ValueError``."""
        bad_key = base64.b64encode(b"short").decode()
        with pytest.raises(ValueError):
            AESEncryptor(bad_key)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# PseudonymMapper tests
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@pytest.fixture
def redis_mock() -> MagicMock:
    """Return a ``MagicMock`` standing in for the Redis client.

    ``set`` is configured to return ``True``; every other attribute is a
    default auto-created mock.
    """
    mock = MagicMock()
    # presumably mirrors a successful SET on the real client; verify against
    # how PseudonymMapper uses the return value.
    mock.set.return_value = True
    return mock
|
|
|
|
|
|
@pytest.fixture
def mapper(redis_mock) -> PseudonymMapper:
    """Return a ``PseudonymMapper`` wired to the mocked Redis client.

    Uses an ``AESEncryptor`` built from the shared development key and a TTL
    of 3600 seconds.
    """
    enc = AESEncryptor(_KEY_B64)
    return PseudonymMapper(redis_mock, enc, ttl_seconds=3600)
|
|
|
|
|
|
class TestPseudonymMapper:
    """Tests for ``PseudonymMapper.anonymize`` and ``depseudonymize``.

    All tests use the module-level ``mapper`` fixture; those that inspect
    storage calls also take the ``redis_mock`` fixture it is built on.
    """

    def test_anonymize_replaces_entity(self, mapper) -> None:
        """The detected value is removed and a PII token appears instead."""
        text = "Email: alice@example.com"
        entities = [DetectedEntity("EMAIL", "alice@example.com", 7, 24)]
        anon, _ = mapper.anonymize(text, entities, "tenant1", "req1")
        assert "alice@example.com" not in anon
        assert _TOKEN_RE.search(anon) is not None

    def test_anonymize_returns_mapping(self, mapper) -> None:
        """The returned mapping has one entry whose value is the original."""
        text = "alice@example.com"
        entities = [DetectedEntity("EMAIL", "alice@example.com", 0, 17)]
        _, mapping = mapper.anonymize(text, entities, "tenant1", "req1")
        assert len(mapping) == 1
        assert "alice@example.com" in mapping.values()

    def test_token_format(self, mapper) -> None:
        """Mapping keys are tokens of the form ``[PII:EMAIL:<8 hex>]``."""
        text = "alice@example.com"
        entities = [DetectedEntity("EMAIL", "alice@example.com", 0, 17)]
        _, mapping = mapper.anonymize(text, entities, "t", "r")
        token = next(iter(mapping))
        assert re.match(r"\[PII:EMAIL:[0-9a-f]{8}\]", token)

    def test_depseudonymize_restores_original(self, mapper) -> None:
        """Anonymize followed by depseudonymize reproduces the input text."""
        text = "Email: alice@example.com"
        entities = [DetectedEntity("EMAIL", "alice@example.com", 7, 24)]
        anon, mapping = mapper.anonymize(text, entities, "t", "r")
        restored = mapper.depseudonymize(anon, mapping)
        assert restored == text

    def test_multiple_entities_anonymized(self, mapper) -> None:
        """Every entity is replaced and each gets its own mapping entry."""
        text = "alice@example.com et bob@example.com"
        entities = [
            DetectedEntity("EMAIL", "alice@example.com", 0, 17),
            DetectedEntity("EMAIL", "bob@example.com", 21, 36),
        ]
        anon, mapping = mapper.anonymize(text, entities, "t", "r")
        assert "alice@example.com" not in anon
        assert "bob@example.com" not in anon
        assert len(mapping) == 2

    def test_depseudonymize_multiple(self, mapper) -> None:
        """A text with several entities round-trips exactly."""
        text = "alice@example.com et bob@example.com"
        entities = [
            DetectedEntity("EMAIL", "alice@example.com", 0, 17),
            DetectedEntity("EMAIL", "bob@example.com", 21, 36),
        ]
        anon, mapping = mapper.anonymize(text, entities, "t", "r")
        restored = mapper.depseudonymize(anon, mapping)
        # Exact equality also covers the text between the entities, which
        # substring checks on the two e-mail addresses would not.
        assert restored == text

    def test_redis_set_called_per_entity(self, mapper, redis_mock) -> None:
        """``set`` is called on the Redis client once per entity."""
        text = "alice@example.com et 06 12 34 56 78"
        entities = [
            DetectedEntity("EMAIL", "alice@example.com", 0, 17),
            DetectedEntity("PHONE_FR", "06 12 34 56 78", 21, 35),
        ]
        mapper.anonymize(text, entities, "tenant1", "req1")
        assert redis_mock.set.call_count == 2

    def test_redis_ttl_passed(self, mapper, redis_mock) -> None:
        """The mapper's TTL is forwarded to ``set`` as the ``ex`` keyword."""
        text = "alice@example.com"
        entities = [DetectedEntity("EMAIL", "alice@example.com", 0, 17)]
        mapper.anonymize(text, entities, "t", "r")
        # call_args unpacks as (positional args, keyword args).
        _, call_kwargs = redis_mock.set.call_args
        assert call_kwargs.get("ex") == 3600

    def test_empty_entities_no_change(self, mapper) -> None:
        """With no entities the text is unchanged and the mapping is empty."""
        text = "Texte sans PII"
        anon, mapping = mapper.anonymize(text, [], "t", "r")
        assert anon == text
        assert mapping == {}

    def test_depseudonymize_unknown_token_left_as_is(self, mapper) -> None:
        """A token missing from the mapping is left untouched."""
        text = "[PII:EMAIL:deadbeef]"
        result = mapper.depseudonymize(text, {})
        assert result == "[PII:EMAIL:deadbeef]"

    def test_overlapping_entities_correct_offsets(self, mapper) -> None:
        """Replacing several entities must not corrupt later offsets.

        Despite the test name, the two spans used here are adjacent
        (separated by one space), not overlapping. Tokens differ in length
        from the values they replace, so this checks that replacing one
        entity does not shift the span used for the other.
        """
        text = "a@b.com c@d.com"
        entities = [
            DetectedEntity("EMAIL", "a@b.com", 0, 7),
            DetectedEntity("EMAIL", "c@d.com", 8, 15),
        ]
        anon, mapping = mapper.anonymize(text, entities, "t", "r")
        restored = mapper.depseudonymize(anon, mapping)
        # An offset error would leave stray or missing characters around the
        # restored values; only exact equality detects that.
        assert restored == text
|