veylant/services/pii/tests/test_regex.py
2026-02-23 13:35:04 +01:00

285 lines
9.6 KiB
Python

"""Tests for the regex detection layer.
Coverage: IBAN, EMAIL, PHONE_FR, PHONE_INTL, FR_SSN, CREDIT_CARD.
"""
from __future__ import annotations
import pytest
from layers.regex_layer import RegexLayer, _iban_valid, _luhn_valid
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
@pytest.fixture
def layer() -> RegexLayer:
    """Build the RegexLayer instance injected into tests that request ``layer``."""
    regex_layer = RegexLayer()
    return regex_layer
# ---------------------------------------------------------------------------
# IBAN — checksum validation (valid and invalid values)
# ---------------------------------------------------------------------------
class TestIBAN:
    """IBAN detection: accepted samples, rejected samples and entity metadata."""

    # Samples the layer is expected to report as IBAN.
    VALID_IBANS = [
        "FR7630006000011234567890189",  # FR — no spaces
        "FR76 3000 6000 0112 3456 7890 189",  # FR — with spaces
        "DE89370400440532013000",  # DE
        "GB29NWBK60161331926819",  # GB
        "ES9121000418450200051332",  # ES
        "IT60X0542811101000000123456",  # IT
        "NL91ABNA0417164300",  # NL
        "BE68539007547034",  # BE
    ]
    # Samples that must never be reported as IBAN.
    INVALID_IBANS = [
        "FR7630006000011234567890188",  # wrong checksum
        "XX0000000000000000000000000",  # invalid country
        "FR76",  # too short
        "notaniban",
    ]

    @pytest.mark.parametrize("iban", VALID_IBANS)
    def test_valid_iban_detected(self, layer, iban):
        """Each valid sample, embedded in a sentence, yields an IBAN entity."""
        detected = [
            hit.entity_type for hit in layer.detect(f"Mon IBAN est {iban} merci")
        ]
        assert "IBAN" in detected, f"Expected IBAN detected in: {iban}"

    @pytest.mark.parametrize("iban", INVALID_IBANS)
    def test_invalid_iban_not_detected(self, layer, iban):
        """Each invalid sample, passed on its own, yields no IBAN entity."""
        detected = [hit.entity_type for hit in layer.detect(iban)]
        assert "IBAN" not in detected, f"IBAN should not be detected: {iban}"

    def test_iban_entity_fields(self, layer):
        """The IBAN entity carries the expected confidence, layer name and span."""
        hits = layer.detect("IBAN: FR7630006000011234567890189")
        # Take the first entity labelled IBAN; other entity types are ignored.
        iban = next(hit for hit in hits if hit.entity_type == "IBAN")
        assert iban.confidence == 1.0
        assert iban.detection_layer == "regex"
        # The span must be non-negative and non-empty.
        assert iban.start >= 0
        assert iban.end > iban.start

    def test_iban_checksum_validation(self):
        """``_iban_valid`` accepts a correct IBAN and rejects one with an altered last digit."""
        good = "FR7630006000011234567890189"
        bad = "FR7630006000011234567890188"
        assert _iban_valid(good) is True
        assert _iban_valid(bad) is False

    def test_multiple_ibans_in_text(self, layer):
        """Two distinct IBANs in one text produce exactly two IBAN entities."""
        hits = layer.detect(
            "Premier: FR7630006000011234567890189, second: DE89370400440532013000"
        )
        iban_count = sum(1 for hit in hits if hit.entity_type == "IBAN")
        assert iban_count == 2
# ---------------------------------------------------------------------------
# EMAIL
# ---------------------------------------------------------------------------
class TestEmail:
    """EMAIL detection: accepted samples, rejected samples and extraction in context."""

    # Addresses the layer is expected to report as EMAIL.
    VALID_EMAILS = [
        "user@example.com",
        "first.last@subdomain.example.org",
        "user+tag@example.co.uk",
        "USER@EXAMPLE.COM",
        "user123@example-domain.com",
        "user@xn--nxasmq6b.com",  # IDN domain
        "a@b.io",
    ]
    # Strings that must never be reported as EMAIL.
    INVALID_EMAILS = [
        "notanemail",
        "@nodomain.com",
        "user@",
        "user@.com",
    ]

    @pytest.mark.parametrize("email", VALID_EMAILS)
    def test_valid_email_detected(self, layer, email):
        """Each valid address after a ``Contact:`` prefix yields an EMAIL entity."""
        detected = [hit.entity_type for hit in layer.detect(f"Contact: {email}")]
        assert "EMAIL" in detected

    @pytest.mark.parametrize("email", INVALID_EMAILS)
    def test_invalid_email_not_detected(self, layer, email):
        """Each malformed address, passed on its own, yields no EMAIL entity."""
        detected = [hit.entity_type for hit in layer.detect(email)]
        assert "EMAIL" not in detected

    def test_email_in_sentence(self, layer):
        """The reported value is exactly the address embedded in the sentence."""
        hits = layer.detect(
            "Envoyez votre CV à recrutement@veylant.io pour postuler."
        )
        # Take the first entity labelled EMAIL.
        email = next(hit for hit in hits if hit.entity_type == "EMAIL")
        assert email.original_value == "recrutement@veylant.io"

    def test_multiple_emails(self, layer):
        """Two addresses in one text produce exactly two EMAIL entities."""
        hits = layer.detect("Copie à alice@example.com et bob@example.org")
        email_count = sum(1 for hit in hits if hit.entity_type == "EMAIL")
        assert email_count == 2
# ---------------------------------------------------------------------------
# PHONE_FR
# ---------------------------------------------------------------------------
class TestPhoneFR:
    """PHONE_FR detection across common formats of French mobile numbers."""

    # Numbers the layer is expected to report as PHONE_FR.
    VALID_FR_PHONES = [
        "0612345678",
        "06 12 34 56 78",
        "06-12-34-56-78",
        "06.12.34.56.78",
        "0712345678",
        "+33612345678",
        "+33 6 12 34 56 78",
        "0033612345678",
    ]
    # Numbers that must never be reported as PHONE_FR.
    INVALID_FR_PHONES = [
        "0512345678",  # landline, not mobile (05 not 06/07)
        "0123456789",  # landline 01
        "123456",  # too short
    ]

    @pytest.mark.parametrize("phone", VALID_FR_PHONES)
    def test_valid_fr_phone_detected(self, layer, phone):
        """Each valid number inside a sentence yields a PHONE_FR entity."""
        detected = [
            hit.entity_type for hit in layer.detect(f"Appelez-moi au {phone}")
        ]
        assert "PHONE_FR" in detected, f"Expected PHONE_FR for: {phone}"

    @pytest.mark.parametrize("phone", INVALID_FR_PHONES)
    def test_non_mobile_not_detected_as_fr(self, layer, phone):
        """Landline or truncated numbers, passed on their own, yield no PHONE_FR entity."""
        detected = [hit.entity_type for hit in layer.detect(phone)]
        assert "PHONE_FR" not in detected
# ---------------------------------------------------------------------------
# PHONE_INTL
# ---------------------------------------------------------------------------
class TestPhoneIntl:
    """Detection of international phone numbers written with a ``+`` prefix."""

    # Numbers for which some phone entity must be reported.
    VALID_INTL_PHONES = [
        "+12025550123",  # US
        "+441632960789",  # UK
        "+4915123456789",  # DE mobile
        "+34612345678",  # ES mobile
    ]

    @pytest.mark.parametrize("phone", VALID_INTL_PHONES)
    def test_valid_intl_phone_detected(self, layer, phone):
        """Each number inside a sentence yields a phone entity of either kind."""
        detected = [hit.entity_type for hit in layer.detect(f"Call me at {phone}")]
        # +33 FR numbers are captured as PHONE_FR, others as PHONE_INTL,
        # so either label is accepted here.
        phone_found = any(
            label in detected for label in ("PHONE_FR", "PHONE_INTL")
        )
        assert phone_found, f"No phone detected for: {phone}"
# ---------------------------------------------------------------------------
# FR_SSN
# ---------------------------------------------------------------------------
class TestFRSSN:
    """FR_SSN detection for 15-digit French social security numbers."""

    # Numbers the layer is expected to report as FR_SSN.
    VALID_SSNS = [
        "175086912345678",  # Male born Aug 1975
        "299051234567890",  # Female born May 1999
        "182011234512345",  # Male born Jan 1982
        "120031234512345",  # Male born Mar 2020 — dept 12
    ]
    # Strings that must never be reported as FR_SSN.
    INVALID_SSNS = [
        "375086912345678",  # invalid first digit (3)
        "000086912345678",  # year 00 could be valid but first digit invalid
        "short",
    ]

    @pytest.mark.parametrize("ssn", VALID_SSNS)
    def test_valid_ssn_detected(self, layer, ssn):
        """Each valid number after a French label yields an FR_SSN entity."""
        detected = [
            hit.entity_type
            for hit in layer.detect(f"Numéro de sécurité sociale: {ssn}")
        ]
        assert "FR_SSN" in detected, f"Expected FR_SSN for: {ssn}"

    @pytest.mark.parametrize("ssn", INVALID_SSNS)
    def test_invalid_ssn_not_detected(self, layer, ssn):
        """Each invalid sample, passed on its own, yields no FR_SSN entity."""
        detected = [hit.entity_type for hit in layer.detect(ssn)]
        assert "FR_SSN" not in detected
# ---------------------------------------------------------------------------
# CREDIT_CARD (Luhn)
# ---------------------------------------------------------------------------
class TestCreditCard:
    """CREDIT_CARD detection, including the Luhn checksum helper."""

    # Card numbers the layer is expected to report as CREDIT_CARD.
    VALID_CARDS = [
        "4532015112830366",  # Visa
        "5425233430109903",  # Mastercard
        "4532 0151 1283 0366",  # with spaces
        "4532-0151-1283-0366",  # with hyphens
        "4111111111111111",  # Visa test card
        "5500005555555559",  # Mastercard test card
    ]
    # Digit sequences that must never be reported as CREDIT_CARD.
    INVALID_CARDS = [
        "1234567890123456",  # Fails Luhn
        "4532015112830365",  # Valid format but wrong Luhn
        "1234 5678 9012 3456",  # Fails Luhn
    ]

    @pytest.mark.parametrize("card", VALID_CARDS)
    def test_valid_card_detected(self, layer, card):
        """Each valid card after a ``Carte:`` prefix yields a CREDIT_CARD entity."""
        detected = [hit.entity_type for hit in layer.detect(f"Carte: {card}")]
        assert "CREDIT_CARD" in detected, f"Expected CREDIT_CARD for: {card}"

    @pytest.mark.parametrize("card", INVALID_CARDS)
    def test_invalid_card_not_detected(self, layer, card):
        """Each sample failing the checksum, passed on its own, yields no CREDIT_CARD entity."""
        detected = [hit.entity_type for hit in layer.detect(card)]
        assert "CREDIT_CARD" not in detected, f"Should not detect: {card}"

    def test_luhn_algorithm(self):
        """``_luhn_valid`` returns True for valid numbers and False otherwise."""
        # Accepted numbers.
        assert _luhn_valid("4532015112830366") is True
        assert _luhn_valid("4111111111111111") is True
        # Rejected numbers.
        assert _luhn_valid("4532015112830365") is False
        assert _luhn_valid("1234567890123456") is False
# ---------------------------------------------------------------------------
# Mixed / integration
# ---------------------------------------------------------------------------
class TestMixedPII:
    """Integration-style checks on texts mixing several PII types, or none."""

    def test_multiple_pii_types_in_prompt(self, layer):
        """An email, an IBAN and a French mobile number in one prompt are all reported."""
        text = (
            "Bonjour, je suis Jean Dupont (jean.dupont@example.com). "
            "Mon IBAN est FR7630006000011234567890189. "
            "Appelez-moi au 06 12 34 56 78."
        )
        results = layer.detect(text)
        types = {e.entity_type for e in results}
        assert "EMAIL" in types
        assert "IBAN" in types
        assert "PHONE_FR" in types

    def test_empty_text_returns_empty(self, layer):
        """An empty string yields an empty list."""
        assert layer.detect("") == []

    def test_no_pii_text(self, layer):
        """A sentence without any PII yields an empty list."""
        text = "Le projet avance bien et l'équipe est motivée."
        assert layer.detect(text) == []

    def test_entities_sorted_by_position(self, layer):
        """Entities are returned in ascending order of their start offset."""
        text = "Email: a@b.com IBAN: FR7630006000011234567890189"
        results = layer.detect(text)
        # Guard against a vacuous pass: with fewer than two entities the
        # ordering comparison below is trivially true and checks nothing.
        assert len(results) >= 2, "Expected both the email and the IBAN to be detected"
        positions = [e.start for e in results]
        assert positions == sorted(positions)