"""Tests for the pipeline orchestrator (deduplication, layer control).""" from __future__ import annotations from unittest.mock import MagicMock, patch import pytest from layers.regex_layer import DetectedEntity, RegexLayer from pipeline import Pipeline, _deduplicate # --------------------------------------------------------------------------- # Deduplication unit tests # --------------------------------------------------------------------------- class TestDeduplicate: def test_no_overlap_keeps_all(self): a = DetectedEntity("EMAIL", "a@b.com", 0, 7) b = DetectedEntity("IBAN", "FR76...", 10, 36) result = _deduplicate([a, b]) assert len(result) == 2 def test_overlapping_keeps_longer(self): shorter = DetectedEntity("EMAIL", "short", 0, 5, detection_layer="ner") longer = DetectedEntity("PERSON", "longer text", 0, 11, detection_layer="ner") result = _deduplicate([shorter, longer]) assert len(result) == 1 assert result[0].original_value == "longer text" def test_overlapping_same_length_regex_wins(self): regex_e = DetectedEntity("EMAIL", "a@b.com", 0, 7, detection_layer="regex") ner_e = DetectedEntity("PERSON", "a@b.com", 0, 7, detection_layer="ner") result = _deduplicate([regex_e, ner_e]) assert len(result) == 1 assert result[0].detection_layer == "regex" def test_adjacent_spans_not_merged(self): a = DetectedEntity("EMAIL", "a@b.com", 0, 7) b = DetectedEntity("IBAN", "FR76...", 7, 14) result = _deduplicate([a, b]) assert len(result) == 2 def test_result_sorted_by_position(self): b = DetectedEntity("IBAN", "FR76...", 20, 30) a = DetectedEntity("EMAIL", "a@b.com", 0, 7) result = _deduplicate([b, a]) assert result[0].start == 0 assert result[1].start == 20 # --------------------------------------------------------------------------- # Pipeline integration tests (regex only — NER disabled for speed) # --------------------------------------------------------------------------- def make_pipeline_regex_only() -> Pipeline: with patch("config.NER_ENABLED", False): return Pipeline() class TestPipelineRegexOnly: def test_detects_email_in_prompt(self): p = make_pipeline_regex_only() results = p.detect("Contact: test@example.com", enable_ner=False) assert any(e.entity_type == "EMAIL" for e in results) def test_detects_multiple_types(self): p = make_pipeline_regex_only() text = ( "Email: alice@corp.com, " "IBAN: FR7630006000011234567890189, " "Tel: 06 12 34 56 78" ) results = p.detect(text, enable_ner=False) types = {e.entity_type for e in results} assert "EMAIL" in types assert "IBAN" in types assert "PHONE_FR" in types def test_empty_text_returns_empty(self): p = make_pipeline_regex_only() assert p.detect("", enable_ner=False) == [] def test_no_pii_returns_empty(self): p = make_pipeline_regex_only() text = "Le projet avance bien, nous sommes en bonne voie." assert p.detect(text, enable_ner=False) == [] def test_deduplication_applied(self): p = make_pipeline_regex_only() # Email appears twice — should be two separate entities (different offsets) text = "alice@corp.com et alice@corp.com" results = p.detect(text, enable_ner=False) emails = [e for e in results if e.entity_type == "EMAIL"] assert len(emails) == 2 def test_long_text_performance(self): """2000-token text should complete without error.""" p = make_pipeline_regex_only() long_text = ("Le projet avance bien. " * 100) + "Email: perf@test.com" results = p.detect(long_text, enable_ner=False) assert any(e.entity_type == "EMAIL" for e in results) def test_five_pii_types_in_one_prompt(self): p = make_pipeline_regex_only() text = ( "Bonjour, je suis alice@example.com. " "IBAN: FR7630006000011234567890189. " "Tel: 0612345678. " "SS: 175086912345678. " "CB: 4111111111111111." ) results = p.detect(text, enable_ner=False) types = {e.entity_type for e in results} assert "EMAIL" in types assert "IBAN" in types assert "PHONE_FR" in types assert "FR_SSN" in types assert "CREDIT_CARD" in types # --------------------------------------------------------------------------- # NER mock tests (verify integration path without loading spaCy) # --------------------------------------------------------------------------- class TestPipelineWithNERMock: def test_ner_results_merged_and_deduplicated(self): """Verify pipeline merges NER results with regex results.""" from layers.regex_layer import DetectedEntity p = make_pipeline_regex_only() # Inject a mock NER layer mock_ner = MagicMock() mock_ner.detect.return_value = [ DetectedEntity("PERSON", "Jean Dupont", 0, 11, confidence=0.95, detection_layer="ner") ] p._ner = mock_ner text = "Jean Dupont a envoyé un email à alice@example.com" with patch("config.NER_ENABLED", True): results = p.detect(text, enable_ner=True) types = {e.entity_type for e in results} assert "PERSON" in types assert "EMAIL" in types def test_ner_failure_falls_back_to_regex(self): """If NER raises, pipeline returns regex results without crashing.""" p = make_pipeline_regex_only() mock_ner = MagicMock() mock_ner.detect.side_effect = RuntimeError("spaCy model not loaded") p._ner = mock_ner text = "Contact: alice@example.com" with patch("config.NER_ENABLED", True): results = p.detect(text, enable_ner=True) assert any(e.entity_type == "EMAIL" for e in results)