162 lines
5.9 KiB
Python
162 lines
5.9 KiB
Python
"""Tests for the pipeline orchestrator (deduplication, layer control)."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from unittest.mock import MagicMock, patch
|
|
|
|
import pytest
|
|
|
|
from layers.regex_layer import DetectedEntity, RegexLayer
|
|
from pipeline import Pipeline, _deduplicate
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Deduplication unit tests
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestDeduplicate:
    """Unit tests for ``_deduplicate`` using hand-built ``DetectedEntity`` spans."""

    def test_no_overlap_keeps_all(self):
        """Two spans covering disjoint ranges (0-7 and 10-36) both survive."""
        email = DetectedEntity("EMAIL", "a@b.com", 0, 7)
        iban = DetectedEntity("IBAN", "FR76...", 10, 36)

        kept = _deduplicate([email, iban])

        assert len(kept) == 2

    def test_overlapping_keeps_longer(self):
        """When two spans from the same layer overlap, only the longer one is kept."""
        short_span = DetectedEntity("EMAIL", "short", 0, 5, detection_layer="ner")
        long_span = DetectedEntity(
            "PERSON", "longer text", 0, 11, detection_layer="ner"
        )

        kept = _deduplicate([short_span, long_span])

        assert len(kept) == 1
        survivor = kept[0]
        assert survivor.original_value == "longer text"

    def test_overlapping_same_length_regex_wins(self):
        """For identical 0-7 spans, the regex-layer entity is kept over the NER one."""
        from_regex = DetectedEntity(
            "EMAIL", "a@b.com", 0, 7, detection_layer="regex"
        )
        from_ner = DetectedEntity("PERSON", "a@b.com", 0, 7, detection_layer="ner")

        kept = _deduplicate([from_regex, from_ner])

        assert len(kept) == 1
        survivor = kept[0]
        assert survivor.detection_layer == "regex"

    def test_adjacent_spans_not_merged(self):
        """Spans that merely touch (0-7 followed by 7-14) are both kept."""
        left = DetectedEntity("EMAIL", "a@b.com", 0, 7)
        right = DetectedEntity("IBAN", "FR76...", 7, 14)

        kept = _deduplicate([left, right])

        assert len(kept) == 2

    def test_result_sorted_by_position(self):
        """Entities passed in reverse order come back ordered by start offset."""
        later = DetectedEntity("IBAN", "FR76...", 20, 30)
        earlier = DetectedEntity("EMAIL", "a@b.com", 0, 7)

        # Deliberately feed the later span first.
        kept = _deduplicate([later, earlier])

        first, second = kept[0], kept[1]
        assert first.start == 0
        assert second.start == 20
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Pipeline integration tests (regex only — NER disabled for speed)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def make_pipeline_regex_only() -> Pipeline:
    """Build a ``Pipeline`` while ``config.NER_ENABLED`` is patched to ``False``.

    The patch is only active for the duration of the constructor call; it is
    undone before the pipeline is handed back to the caller.
    """
    ner_disabled = patch("config.NER_ENABLED", False)
    with ner_disabled:
        pipeline = Pipeline()
    return pipeline
|
|
|
|
|
|
class TestPipelineRegexOnly:
    """Integration tests calling ``Pipeline.detect`` with ``enable_ner=False``."""

    @staticmethod
    def _detect(text):
        """Run *text* through a fresh regex-only pipeline with NER switched off.

        Not collected by pytest (name does not start with ``test``).
        """
        pipeline = make_pipeline_regex_only()
        return pipeline.detect(text, enable_ner=False)

    def test_detects_email_in_prompt(self):
        """An email address embedded in a short prompt is reported as EMAIL."""
        found = self._detect("Contact: test@example.com")
        assert any(entity.entity_type == "EMAIL" for entity in found)

    def test_detects_multiple_types(self):
        """Email, IBAN and French phone number in one prompt are all reported."""
        prompt = (
            "Email: alice@corp.com, "
            "IBAN: FR7630006000011234567890189, "
            "Tel: 06 12 34 56 78"
        )
        found = self._detect(prompt)
        detected_types = {entity.entity_type for entity in found}
        for expected in ("EMAIL", "IBAN", "PHONE_FR"):
            assert expected in detected_types

    def test_empty_text_returns_empty(self):
        """An empty prompt yields an empty list."""
        found = self._detect("")
        assert found == []

    def test_no_pii_returns_empty(self):
        """A sentence without any PII yields an empty list."""
        prompt = "Le projet avance bien, nous sommes en bonne voie."
        found = self._detect(prompt)
        assert found == []

    def test_deduplication_applied(self):
        """The same email at two different offsets is kept as two entities."""
        # Identical values, different positions: deduplication must not
        # collapse them into one.
        prompt = "alice@corp.com et alice@corp.com"
        found = self._detect(prompt)
        email_hits = [
            entity for entity in found if entity.entity_type == "EMAIL"
        ]
        assert len(email_hits) == 2

    def test_long_text_performance(self):
        """A long prompt is processed without error and its email is found.

        The prompt is 100 repetitions of a filler sentence followed by a
        single email address; no timing is measured.
        """
        filler = "Le projet avance bien. " * 100
        prompt = filler + "Email: perf@test.com"
        found = self._detect(prompt)
        assert any(entity.entity_type == "EMAIL" for entity in found)

    def test_five_pii_types_in_one_prompt(self):
        """Five distinct PII types in a single prompt are all reported."""
        prompt = (
            "Bonjour, je suis alice@example.com. "
            "IBAN: FR7630006000011234567890189. "
            "Tel: 0612345678. "
            "SS: 175086912345678. "
            "CB: 4111111111111111."
        )
        found = self._detect(prompt)
        detected_types = {entity.entity_type for entity in found}
        for expected in ("EMAIL", "IBAN", "PHONE_FR", "FR_SSN", "CREDIT_CARD"):
            assert expected in detected_types
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# NER mock tests (verify integration path without loading spaCy)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestPipelineWithNERMock:
    """Exercise the NER integration path with a mocked NER layer.

    A ``MagicMock`` is assigned to ``Pipeline._ner`` so that no real NER model
    is loaded; ``config.NER_ENABLED`` is patched to ``True`` around each
    ``detect`` call.
    """

    def test_ner_results_merged_and_deduplicated(self):
        """Verify pipeline merges NER results with regex results."""
        p = make_pipeline_regex_only()

        # Inject a mock NER layer that reports a single PERSON span.
        # DetectedEntity comes from the module-level import; no local
        # re-import is needed.
        mock_ner = MagicMock()
        mock_ner.detect.return_value = [
            DetectedEntity(
                "PERSON",
                "Jean Dupont",
                0,
                11,
                confidence=0.95,
                detection_layer="ner",
            )
        ]
        p._ner = mock_ner

        text = "Jean Dupont a envoyé un email à alice@example.com"
        with patch("config.NER_ENABLED", True):
            results = p.detect(text, enable_ner=True)

        # PERSON can only come from the mock; EMAIL is expected from the
        # regex layer.
        types = {e.entity_type for e in results}
        assert "PERSON" in types
        assert "EMAIL" in types

    def test_ner_failure_falls_back_to_regex(self):
        """If NER raises, pipeline returns regex results without crashing."""
        p = make_pipeline_regex_only()

        # Mock NER layer whose detect() always fails.
        mock_ner = MagicMock()
        mock_ner.detect.side_effect = RuntimeError("spaCy model not loaded")
        p._ner = mock_ner

        text = "Contact: alice@example.com"
        with patch("config.NER_ENABLED", True):
            results = p.detect(text, enable_ner=True)

        # Without this check the test would also pass if the NER layer were
        # never invoked, i.e. it would not prove the fallback path ran.
        mock_ner.detect.assert_called()
        assert any(e.entity_type == "EMAIL" for e in results)
|