# Source code for promptguard.sanitizers

"""
Prompt sanitisation strategies for cleaning malicious content.

This module provides ``PromptSanitizer`` and ``AdvancedSanitizer`` which remove
or neutralise injection attempts and jailbreak patterns while trying to preserve
the user's legitimate intent.
"""

import re
import logging
import unicodedata
from typing import Any, Dict, List, Optional, Tuple

from .schemas import SanitizationResult, SanitizationStrategy

logger = logging.getLogger(__name__)


class PromptSanitizer:
    """Sanitise prompts by removing or neutralising malicious patterns.

    Input text is Unicode-normalised (NFKC) before pattern matching so that
    full-width and compatibility character obfuscation is caught
    automatically.

    Three strategies are supported:

    * **CONSERVATIVE** — applies all pattern groups. Maximum safety; may
      affect some legitimate phrasing.
    * **BALANCED** (default) — applies critical, encoding, and context-reset
      patterns. Good trade-off for most production applications.
    * **MINIMAL** — applies only the critical patterns. Use when exact
      wording must be preserved as much as possible.
    """

    # ── Critical: direct instruction override & tag injection ─────────────
    # Each entry is (raw_regex, replacement); compiled case-insensitively
    # in __init__.
    _CRITICAL_PATTERNS = [
        (
            r"ignore\s+(all\s+)?(previous|prior|above|earlier|all)"
            r"\s+(instructions?|rules?|prompts?)",
            "",
        ),
        (r"forget\s+(everything|all|prior|previous|instructions?)", ""),
        (
            r"disregard\s+(previous|prior|above|all)"
            r"\s+(instructions?|context|rules?)",
            "",
        ),
        (r"(developer|debug|admin|sudo|root)\s+mode", ""),
        (r"\byou\s+are\s+now\s+(a|an)\s+\w+", "you are an AI assistant"),
        (r"\bdan\b(?!\w)", ""),
        (r"do\s+anything\s+now", ""),
        (r"(print|show|reveal|display)\s+(your\s+)?(system\s+)?prompt", ""),
        (r"repeat\s+(the\s+)?(above|previous|initial)\s+(instructions?|text)", ""),
        (r"</?(system|user|assistant)>", ""),
        (r"\[/?INST\]", ""),
        (r"<</?SYS>>", ""),
    ]

    # ── Context manipulation: session/memory reset ─────────────────────────
    # Used by BALANCED and CONSERVATIVE strategies.
    _CONTEXT_MANIPULATION_PATTERNS = [
        (r"start\s+(over|fresh|anew)", "continue"),
        (r"new\s+(conversation|context|session)", "current conversation"),
        (r"(clear|reset|wipe)\s+(your\s+)?(memory|context)", ""),
    ]

    # ── Role-play, safety bypass, and output shaping ───────────────────────
    # Used only by the CONSERVATIVE strategy.
    _ROLEPLAY_PATTERNS = [
        (
            r"(act\s+as|pretend\s+to\s+be|roleplay\s+as)\s+(a|an)\s+\w+",
            "help with",
        ),
        (
            r"(ignore|bypass|disable|remove)\s+(safety|security|restrictions?)",
            "",
        ),
        (r"no\s+(restrictions?|limits?|rules?|filters?)", ""),
        (r"respond\s+(only|exclusively)\s+with", "respond with"),
        (r"(always|only)\s+(respond|reply|answer)\s+(in|with|as)", "respond"),
    ]

    # ── Encoding attacks ───────────────────────────────────────────────────
    _ENCODING_PATTERNS = [
        (r"(?<!\w)[A-Za-z0-9+/]{30,}={0,2}(?!\w)", "[removed]"),  # Base64
        (r"(?:\\x[0-9a-fA-F]{2}){4,}", "[removed]"),  # Hex escapes
        (r"(?:\\u[0-9a-fA-F]{4}){3,}", "[removed]"),  # Unicode escapes
    ]

    # ── Character-level obfuscation ────────────────────────────────────────
    _OBFUSCATION_PATTERNS = [
        # Character spacing: "i g n o r e" → "ignore"
        (r"\b([a-z])\s+([a-z])\s+([a-z])\s+([a-z])\s+([a-z])", r"\1\2\3\4\5"),
        (r"\b1gn[0o]r[e3]\b", "ignore"),
        (r"\bbyp[4a]ss\b", "bypass"),
        (r"\b[0o]v[e3]rr[1i]d[e3]\b", "override"),
        (r"([.!?])\1{2,}", r"\1"),  # Excessive punctuation
    ]

    # Pre-compiled whitespace cleaners (class-level so they are built once,
    # shared by all instances).
    _WS_RE = re.compile(r"\s+")
    _PUNCT_WS_RE = re.compile(r"\s+([,.!?])")

    def __init__(self) -> None:
        """Compile all pattern lists at initialisation time."""
        self._compiled_critical = self._compile(self._CRITICAL_PATTERNS)
        self._compiled_context_manipulation = self._compile(
            self._CONTEXT_MANIPULATION_PATTERNS
        )
        self._compiled_roleplay = self._compile(self._ROLEPLAY_PATTERNS)
        self._compiled_encoding = self._compile(self._ENCODING_PATTERNS)
        self._compiled_obfuscation = self._compile(self._OBFUSCATION_PATTERNS)
        logger.debug("PromptSanitizer initialized")

    # ── Public interface ───────────────────────────────────────────────────

    def sanitize(
        self,
        prompt: str,
        strategy: SanitizationStrategy = SanitizationStrategy.BALANCED,
    ) -> SanitizationResult:
        """Sanitise *prompt* using the specified *strategy*.

        The prompt is Unicode-normalised before any pattern matching so that
        obfuscated variants (e.g. full-width characters) are caught.

        Args:
            prompt: The prompt text to sanitise.
            strategy: Sanitisation strategy controlling which pattern groups
                are applied.

        Returns:
            A :class:`~promptguard.schemas.SanitizationResult` describing the
            outcome.

        Example::

            sanitizer = PromptSanitizer()
            result = sanitizer.sanitize("Ignore all previous instructions")
            print(result.sanitized)         # Cleaned prompt
            print(result.removed_patterns)  # What was removed
        """
        # Normalise Unicode before any matching
        original = unicodedata.normalize("NFKC", prompt)
        sanitized = original
        removed_patterns: List[str] = []

        if strategy == SanitizationStrategy.CONSERVATIVE:
            sanitized, patterns = self._apply_patterns(
                sanitized,
                self._compiled_critical
                + self._compiled_context_manipulation
                + self._compiled_roleplay
                + self._compiled_encoding
                + self._compiled_obfuscation,
            )
            removed_patterns.extend(patterns)
        elif strategy == SanitizationStrategy.BALANCED:
            sanitized, patterns = self._apply_patterns(
                sanitized,
                self._compiled_critical
                + self._compiled_encoding
                + self._compiled_context_manipulation,
            )
            removed_patterns.extend(patterns)
        else:  # MINIMAL
            sanitized, patterns = self._apply_patterns(
                sanitized, self._compiled_critical
            )
            removed_patterns.extend(patterns)

        sanitized = self._clean_whitespace(sanitized)

        was_modified = sanitized != original
        confidence = self._calculate_confidence(original, sanitized, removed_patterns)
        risk_reduction = self._estimate_risk_reduction(removed_patterns)

        return SanitizationResult(
            original=original,
            sanitized=sanitized,
            was_modified=was_modified,
            removed_patterns=removed_patterns,
            strategy=strategy,
            confidence=confidence,
            risk_reduction=risk_reduction,
        )

    # ── Private helpers ────────────────────────────────────────────────────

    @staticmethod
    def _compile(
        patterns: List[Tuple[str, str]],
    ) -> List[Tuple[re.Pattern, str]]:
        """Compile a list of ``(raw_pattern, replacement)`` tuples.

        Patterns that fail to compile are logged and skipped rather than
        aborting initialisation.
        """
        compiled = []
        for raw, replacement in patterns:
            try:
                compiled.append((re.compile(raw, re.IGNORECASE), replacement))
            except re.error as exc:
                logger.warning("Failed to compile pattern %r: %s", raw, exc)
        return compiled

    @staticmethod
    def _apply_patterns(
        text: str,
        patterns: List[Tuple[re.Pattern, str]],
    ) -> Tuple[str, List[str]]:
        """Apply *patterns* to *text* and return the result plus matched text.

        Uses ``finditer`` with ``group(0)`` so the *full* matched substring is
        recorded.  (The previous ``findall``-based approach returned capture
        groups for patterns that contain them, so ``removed_patterns`` held
        fragments such as ``"all "`` — or empty strings for optional groups —
        instead of the matched text.)

        Patterns are applied sequentially, so later patterns see the text
        already modified by earlier ones.

        Returns:
            ``(modified_text, list_of_matched_strings)``
        """
        removed: List[str] = []
        for pattern, replacement in patterns:
            for match in pattern.finditer(text):
                removed.append(match.group(0))
            # sub() is a no-op when nothing matched, so no guard is needed.
            text = pattern.sub(replacement, text)
        return text, removed

    def _clean_whitespace(self, text: str) -> str:
        """Collapse multiple spaces and strip leading/trailing whitespace."""
        text = self._WS_RE.sub(" ", text)
        text = self._PUNCT_WS_RE.sub(r"\1", text)  # "word ," → "word,"
        return text.strip()

    @staticmethod
    def _calculate_confidence(
        original: str,
        sanitized: str,
        removed_patterns: List[str],
    ) -> float:
        """Estimate how confident we are that the sanitised prompt is safe.

        Heuristic: more removed patterns raise confidence, but removing more
        than half the text caps it (the remainder may no longer reflect the
        user's intent).

        Returns:
            A float in ``[0, 1]``.
        """
        if not removed_patterns:
            return 0.5
        removal_ratio = 1.0 - (len(sanitized) / max(len(original), 1))
        pattern_score = min(len(removed_patterns) / 5, 1.0)
        if removal_ratio > 0.5:
            confidence = 0.6
        else:
            confidence = 0.7 + (pattern_score * 0.3)
        return round(confidence, 2)

    @staticmethod
    def _estimate_risk_reduction(removed_patterns: List[str]) -> float:
        """Estimate how much risk was reduced by sanitisation.

        Each removed pattern contributes 0.15, saturating at 1.0.

        Returns:
            A float in ``[0, 1]``.
        """
        if not removed_patterns:
            return 0.0
        return round(min(len(removed_patterns) * 0.15, 1.0), 2)
class AdvancedSanitizer(PromptSanitizer):
    """Sanitiser with intent-aware cleaning and safe-rephrasing suggestions.

    Extends :class:`PromptSanitizer` with:

    * **Intent preservation** — uses a lighter strategy for question-type
      prompts to minimise over-removal.
    * **Safe alternative suggestions** — rewrites common attack patterns
      into legitimate equivalents.
    """

    # (raw_pattern, replacement) — compiled once in __init__
    _ALTERNATIVE_PATTERNS_RAW = [
        (r"ignore.*previous", "I have a new question:"),
        (r"forget.*instructions", "Starting fresh:"),
        (r"pretend.*you.*are", "Act as if you were"),
        (r"system\s+prompt", "your instructions"),
    ]

    def __init__(self) -> None:
        """Compile parent patterns and pre-compile alternative patterns."""
        super().__init__()
        self._compiled_alternatives: List[Tuple[re.Pattern, str]] = [
            (re.compile(raw, re.IGNORECASE), replacement)
            for raw, replacement in self._ALTERNATIVE_PATTERNS_RAW
        ]

    def sanitize_with_intent(
        self,
        prompt: str,
        intent: Optional[str] = None,
        strategy: SanitizationStrategy = SanitizationStrategy.BALANCED,
    ) -> SanitizationResult:
        """Sanitise *prompt* while respecting the detected *intent*.

        When *intent* is ``"question"`` (case-insensitive) the MINIMAL
        strategy is used to avoid removing context that forms part of a
        legitimate query.

        Args:
            prompt: The prompt text to sanitise.
            intent: Detected intent string (e.g. ``"question"``,
                ``"instruction"``).  Compared case-insensitively.
            strategy: Fallback strategy for non-question intents.

        Returns:
            A :class:`~promptguard.schemas.SanitizationResult`.
        """
        # Normalise the intent once so both checks below agree.  The
        # previous implementation lower-cased intent only when selecting the
        # strategy, so intent="Question" got MINIMAL sanitisation but never
        # received the "Question: " context marker.
        normalized_intent = intent.lower() if intent else None

        actual_strategy = (
            SanitizationStrategy.MINIMAL
            if normalized_intent == "question"
            else strategy
        )
        result = self.sanitize(prompt, actual_strategy)

        # If a large portion was removed from a question, prepend a context
        # marker so the remaining text still reads as a query.
        if (
            result.was_modified
            and normalized_intent == "question"
            and len(result.sanitized) < len(prompt) * 0.5
        ):
            result.sanitized = f"Question: {result.sanitized}"

        return result

    def suggest_alternative(self, prompt: str) -> Optional[str]:
        """Return a safe rephrasing of *prompt*, or ``None`` if no match.

        Uses pre-compiled patterns so there is no per-call compilation cost.
        Only the first matching pattern is applied.

        Args:
            prompt: A potentially malicious prompt.

        Returns:
            A sanitised alternative string, or ``None`` if the prompt does
            not match any known attack pattern.
        """
        for compiled_re, replacement in self._compiled_alternatives:
            if compiled_re.search(prompt):
                return compiled_re.sub(replacement, prompt)
        return None