veylant/services/pii/tests/test_pipeline.py
2026-02-23 13:35:04 +01:00

162 lines
5.9 KiB
Python

"""Tests for the pipeline orchestrator (deduplication, layer control)."""
from __future__ import annotations
from unittest.mock import MagicMock, patch
import pytest
from layers.regex_layer import DetectedEntity, RegexLayer
from pipeline import Pipeline, _deduplicate
# ---------------------------------------------------------------------------
# Deduplication unit tests
# ---------------------------------------------------------------------------
class TestDeduplicate:
    """Unit tests for ``_deduplicate`` using hand-built ``DetectedEntity`` lists."""

    def test_no_overlap_keeps_all(self):
        """Entities whose spans do not intersect are both returned."""
        email = DetectedEntity("EMAIL", "a@b.com", 0, 7)
        iban = DetectedEntity("IBAN", "FR76...", 10, 36)

        kept = _deduplicate([email, iban])

        assert len(kept) == 2

    def test_overlapping_keeps_longer(self):
        """When two same-layer entities overlap, only the longer one survives."""
        candidates = [
            DetectedEntity("EMAIL", "short", 0, 5, detection_layer="ner"),
            DetectedEntity("PERSON", "longer text", 0, 11, detection_layer="ner"),
        ]

        kept = _deduplicate(candidates)

        assert len(kept) == 1
        survivor = kept[0]
        assert survivor.original_value == "longer text"

    def test_overlapping_same_length_regex_wins(self):
        """On an identical span, the regex-layer entity beats the ner-layer one."""
        from_regex = DetectedEntity("EMAIL", "a@b.com", 0, 7, detection_layer="regex")
        from_ner = DetectedEntity("PERSON", "a@b.com", 0, 7, detection_layer="ner")

        kept = _deduplicate([from_regex, from_ner])

        assert len(kept) == 1
        survivor = kept[0]
        assert survivor.detection_layer == "regex"

    def test_adjacent_spans_not_merged(self):
        """Spans that only touch (7 / 7) are treated as two distinct entities."""
        left = DetectedEntity("EMAIL", "a@b.com", 0, 7)
        right = DetectedEntity("IBAN", "FR76...", 7, 14)

        kept = _deduplicate([left, right])

        assert len(kept) == 2

    def test_result_sorted_by_position(self):
        """Input given in reverse order comes back ordered by ``start``."""
        later = DetectedEntity("IBAN", "FR76...", 20, 30)
        earlier = DetectedEntity("EMAIL", "a@b.com", 0, 7)

        ordered = _deduplicate([later, earlier])

        first, second = ordered[0], ordered[1]
        assert first.start == 0
        assert second.start == 20
# ---------------------------------------------------------------------------
# Pipeline integration tests (regex only — NER disabled for speed)
# ---------------------------------------------------------------------------
def make_pipeline_regex_only() -> Pipeline:
    """Construct a ``Pipeline`` while ``config.NER_ENABLED`` is patched to False.

    The patch is only active during construction; it is undone before the
    instance is handed back to the caller.
    """
    ner_disabled = patch("config.NER_ENABLED", False)
    with ner_disabled:
        pipeline = Pipeline()
    return pipeline
class TestPipelineRegexOnly:
    """Tests of ``Pipeline.detect`` always called with ``enable_ner=False``."""

    def test_detects_email_in_prompt(self):
        """A single email address in the text is reported as EMAIL."""
        pipeline = make_pipeline_regex_only()
        found = pipeline.detect("Contact: test@example.com", enable_ner=False)
        detected_types = [entity.entity_type for entity in found]
        assert "EMAIL" in detected_types

    def test_detects_multiple_types(self):
        """Email, IBAN and French phone number are all found in one text."""
        pipeline = make_pipeline_regex_only()
        text = (
            "Email: alice@corp.com, "
            "IBAN: FR7630006000011234567890189, "
            "Tel: 06 12 34 56 78"
        )
        found = pipeline.detect(text, enable_ner=False)
        detected_types = {entity.entity_type for entity in found}
        missing = {"EMAIL", "IBAN", "PHONE_FR"} - detected_types
        assert not missing

    def test_empty_text_returns_empty(self):
        """An empty string yields an empty list."""
        pipeline = make_pipeline_regex_only()
        found = pipeline.detect("", enable_ner=False)
        assert found == []

    def test_no_pii_returns_empty(self):
        """A sentence without any PII yields an empty list."""
        pipeline = make_pipeline_regex_only()
        text = "Le projet avance bien, nous sommes en bonne voie."
        found = pipeline.detect(text, enable_ner=False)
        assert found == []

    def test_deduplication_applied(self):
        """The same email at two different offsets is reported twice."""
        pipeline = make_pipeline_regex_only()
        # Identical value, different positions: neither occurrence may be dropped.
        text = "alice@corp.com et alice@corp.com"
        found = pipeline.detect(text, enable_ner=False)
        email_count = sum(1 for entity in found if entity.entity_type == "EMAIL")
        assert email_count == 2

    def test_long_text_performance(self):
        """A sentence repeated 100 times plus a trailing email is fully scanned.

        No timing is measured; the test only checks that detection completes
        and still finds the email at the very end of the text.
        """
        pipeline = make_pipeline_regex_only()
        long_text = ("Le projet avance bien. " * 100) + "Email: perf@test.com"
        found = pipeline.detect(long_text, enable_ner=False)
        detected_types = [entity.entity_type for entity in found]
        assert "EMAIL" in detected_types

    def test_five_pii_types_in_one_prompt(self):
        """Five different PII categories are all detected in a single prompt."""
        pipeline = make_pipeline_regex_only()
        text = (
            "Bonjour, je suis alice@example.com. "
            "IBAN: FR7630006000011234567890189. "
            "Tel: 0612345678. "
            "SS: 175086912345678. "
            "CB: 4111111111111111."
        )
        found = pipeline.detect(text, enable_ner=False)
        detected_types = {entity.entity_type for entity in found}
        for expected in ("EMAIL", "IBAN", "PHONE_FR", "FR_SSN", "CREDIT_CARD"):
            assert expected in detected_types
# ---------------------------------------------------------------------------
# NER mock tests (verify integration path without loading spaCy)
# ---------------------------------------------------------------------------
class TestPipelineWithNERMock:
    """Exercise the NER branch of ``Pipeline.detect`` with a stand-in layer.

    Each test assigns a ``MagicMock`` to ``p._ner`` and patches
    ``config.NER_ENABLED`` to True around the ``detect`` call, so the
    integration path is checked without a real NER model.
    """

    def test_ner_results_merged_and_deduplicated(self):
        """Verify pipeline merges NER results with regex results."""
        p = make_pipeline_regex_only()
        # Inject a mock NER layer that reports one PERSON entity.
        mock_ner = MagicMock()
        mock_ner.detect.return_value = [
            DetectedEntity(
                "PERSON",
                "Jean Dupont",
                0,
                11,
                confidence=0.95,
                detection_layer="ner",
            )
        ]
        p._ner = mock_ner
        text = "Jean Dupont a envoyé un email à alice@example.com"
        with patch("config.NER_ENABLED", True):
            results = p.detect(text, enable_ner=True)
        types = {e.entity_type for e in results}
        # PERSON can only come from the mock, EMAIL from the regex layer.
        assert "PERSON" in types
        assert "EMAIL" in types

    def test_ner_failure_falls_back_to_regex(self):
        """If NER raises, pipeline returns regex results without crashing."""
        p = make_pipeline_regex_only()
        mock_ner = MagicMock()
        mock_ner.detect.side_effect = RuntimeError("spaCy model not loaded")
        p._ner = mock_ner
        text = "Contact: alice@example.com"
        with patch("config.NER_ENABLED", True):
            results = p.detect(text, enable_ner=True)
        # Without this check the test would also pass if the NER layer were
        # never invoked, i.e. if no failure had actually been triggered.
        mock_ner.detect.assert_called()
        assert any(e.entity_type == "EMAIL" for e in results)