"""Tests for the regex detection layer. Coverage: IBAN, EMAIL, PHONE_FR, PHONE_INTL, FR_SSN, CREDIT_CARD. """ from __future__ import annotations import pytest from layers.regex_layer import RegexLayer, _iban_valid, _luhn_valid # --------------------------------------------------------------------------- # Fixtures # --------------------------------------------------------------------------- @pytest.fixture def layer() -> RegexLayer: return RegexLayer() # --------------------------------------------------------------------------- # IBAN — valid checksums # --------------------------------------------------------------------------- class TestIBAN: VALID_IBANS = [ "FR7630006000011234567890189", # FR — no spaces "FR76 3000 6000 0112 3456 7890 189", # FR — with spaces "DE89370400440532013000", # DE "GB29NWBK60161331926819", # GB "ES9121000418450200051332", # ES "IT60X0542811101000000123456", # IT "NL91ABNA0417164300", # NL "BE68539007547034", # BE ] INVALID_IBANS = [ "FR7630006000011234567890188", # wrong checksum "XX0000000000000000000000000", # invalid country "FR76", # too short "notaniban", ] @pytest.mark.parametrize("iban", VALID_IBANS) def test_valid_iban_detected(self, layer, iban): results = layer.detect(f"Mon IBAN est {iban} merci") types = [e.entity_type for e in results] assert "IBAN" in types, f"Expected IBAN detected in: {iban}" @pytest.mark.parametrize("iban", INVALID_IBANS) def test_invalid_iban_not_detected(self, layer, iban): results = layer.detect(iban) types = [e.entity_type for e in results] assert "IBAN" not in types, f"IBAN should not be detected: {iban}" def test_iban_entity_fields(self, layer): text = "IBAN: FR7630006000011234567890189" results = layer.detect(text) iban = next(e for e in results if e.entity_type == "IBAN") assert iban.confidence == 1.0 assert iban.detection_layer == "regex" assert iban.start >= 0 assert iban.end > iban.start def test_iban_checksum_validation(self): assert _iban_valid("FR7630006000011234567890189") is True assert _iban_valid("FR7630006000011234567890188") is False def test_multiple_ibans_in_text(self, layer): text = "Premier: FR7630006000011234567890189, second: DE89370400440532013000" results = layer.detect(text) ibans = [e for e in results if e.entity_type == "IBAN"] assert len(ibans) == 2 # --------------------------------------------------------------------------- # EMAIL # --------------------------------------------------------------------------- class TestEmail: VALID_EMAILS = [ "user@example.com", "first.last@subdomain.example.org", "user+tag@example.co.uk", "USER@EXAMPLE.COM", "user123@example-domain.com", "user@xn--nxasmq6b.com", # IDN domain "a@b.io", ] INVALID_EMAILS = [ "notanemail", "@nodomain.com", "user@", "user@.com", ] @pytest.mark.parametrize("email", VALID_EMAILS) def test_valid_email_detected(self, layer, email): results = layer.detect(f"Contact: {email}") types = [e.entity_type for e in results] assert "EMAIL" in types @pytest.mark.parametrize("email", INVALID_EMAILS) def test_invalid_email_not_detected(self, layer, email): results = layer.detect(email) types = [e.entity_type for e in results] assert "EMAIL" not in types def test_email_in_sentence(self, layer): text = "Envoyez votre CV à recrutement@veylant.io pour postuler." results = layer.detect(text) email = next(e for e in results if e.entity_type == "EMAIL") assert email.original_value == "recrutement@veylant.io" def test_multiple_emails(self, layer): text = "Copie à alice@example.com et bob@example.org" results = layer.detect(text) emails = [e for e in results if e.entity_type == "EMAIL"] assert len(emails) == 2 # --------------------------------------------------------------------------- # PHONE_FR # --------------------------------------------------------------------------- class TestPhoneFR: VALID_FR_PHONES = [ "0612345678", "06 12 34 56 78", "06-12-34-56-78", "06.12.34.56.78", "0712345678", "+33612345678", "+33 6 12 34 56 78", "0033612345678", ] INVALID_FR_PHONES = [ "0512345678", # landline, not mobile (05 not 06/07) "0123456789", # landline 01 "123456", # too short ] @pytest.mark.parametrize("phone", VALID_FR_PHONES) def test_valid_fr_phone_detected(self, layer, phone): results = layer.detect(f"Appelez-moi au {phone}") types = [e.entity_type for e in results] assert "PHONE_FR" in types, f"Expected PHONE_FR for: {phone}" @pytest.mark.parametrize("phone", INVALID_FR_PHONES) def test_non_mobile_not_detected_as_fr(self, layer, phone): results = layer.detect(phone) types = [e.entity_type for e in results] assert "PHONE_FR" not in types # --------------------------------------------------------------------------- # PHONE_INTL # --------------------------------------------------------------------------- class TestPhoneIntl: VALID_INTL_PHONES = [ "+12025550123", # US "+441632960789", # UK "+4915123456789", # DE mobile "+34612345678", # ES mobile ] @pytest.mark.parametrize("phone", VALID_INTL_PHONES) def test_valid_intl_phone_detected(self, layer, phone): results = layer.detect(f"Call me at {phone}") types = [e.entity_type for e in results] # +33 FR numbers are captured as PHONE_FR, others as PHONE_INTL assert "PHONE_FR" in types or "PHONE_INTL" in types, f"No phone detected for: {phone}" # --------------------------------------------------------------------------- # FR_SSN # --------------------------------------------------------------------------- class TestFRSSN: VALID_SSNS = [ "175086912345678", # Male born Aug 1975 "299051234567890", # Female born May 1999 "182011234512345", # Male born Jan 1982 "120031234512345", # Male born Mar 2020 — dept 12 ] INVALID_SSNS = [ "375086912345678", # invalid first digit (3) "000086912345678", # year 00 could be valid but first digit invalid "short", ] @pytest.mark.parametrize("ssn", VALID_SSNS) def test_valid_ssn_detected(self, layer, ssn): results = layer.detect(f"Numéro de sécurité sociale: {ssn}") types = [e.entity_type for e in results] assert "FR_SSN" in types, f"Expected FR_SSN for: {ssn}" @pytest.mark.parametrize("ssn", INVALID_SSNS) def test_invalid_ssn_not_detected(self, layer, ssn): results = layer.detect(ssn) types = [e.entity_type for e in results] assert "FR_SSN" not in types # --------------------------------------------------------------------------- # CREDIT_CARD (Luhn) # --------------------------------------------------------------------------- class TestCreditCard: VALID_CARDS = [ "4532015112830366", # Visa "5425233430109903", # Mastercard "4532 0151 1283 0366", # with spaces "4532-0151-1283-0366", # with hyphens "4111111111111111", # Visa test card "5500005555555559", # Mastercard test card ] INVALID_CARDS = [ "1234567890123456", # Fails Luhn "4532015112830365", # Valid format but wrong Luhn "1234 5678 9012 3456", # Fails Luhn ] @pytest.mark.parametrize("card", VALID_CARDS) def test_valid_card_detected(self, layer, card): results = layer.detect(f"Carte: {card}") types = [e.entity_type for e in results] assert "CREDIT_CARD" in types, f"Expected CREDIT_CARD for: {card}" @pytest.mark.parametrize("card", INVALID_CARDS) def test_invalid_card_not_detected(self, layer, card): results = layer.detect(card) types = [e.entity_type for e in results] assert "CREDIT_CARD" not in types, f"Should not detect: {card}" def test_luhn_algorithm(self): assert _luhn_valid("4532015112830366") is True assert _luhn_valid("4532015112830365") is False assert _luhn_valid("4111111111111111") is True assert _luhn_valid("1234567890123456") is False # --------------------------------------------------------------------------- # Mixed / integration # --------------------------------------------------------------------------- class TestMixedPII: def test_multiple_pii_types_in_prompt(self, layer): text = ( "Bonjour, je suis Jean Dupont (jean.dupont@example.com). " "Mon IBAN est FR7630006000011234567890189. " "Appelez-moi au 06 12 34 56 78." ) results = layer.detect(text) types = {e.entity_type for e in results} assert "EMAIL" in types assert "IBAN" in types assert "PHONE_FR" in types def test_empty_text_returns_empty(self, layer): assert layer.detect("") == [] def test_no_pii_text(self, layer): text = "Le projet avance bien et l'équipe est motivée." assert layer.detect(text) == [] def test_entities_sorted_by_position(self, layer): text = "Email: a@b.com IBAN: FR7630006000011234567890189" results = layer.detect(text) positions = [e.start for e in results] assert positions == sorted(positions)