# Source code for promptguard.schemas

"""Data models and enumerations for PromptGuard."""

from __future__ import annotations

from dataclasses import dataclass
from enum import Enum
from typing import Any, Dict, List, Optional


# ──────────────────────────────────────────────────────────────────────────────
# Enumerations
# ──────────────────────────────────────────────────────────────────────────────


class RiskLevel(str, Enum):
    """Coarse risk category assigned to a prompt by the classifier."""

    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"
class Intent(str, Enum):
    """Intent category detected for the analysed prompt."""

    QUESTION = "question"
    INSTRUCTION = "instruction"
    CONVERSATION = "conversation"
    JAILBREAK = "jailbreak"
    INJECTION = "injection"
    UNKNOWN = "unknown"
class Sentiment(str, Enum):
    """Sentiment polarity detected for the analysed prompt."""

    POSITIVE = "positive"
    NEUTRAL = "neutral"
    NEGATIVE = "negative"
class SanitizationStrategy(str, Enum):
    """How aggressively a prompt is sanitised.

    Each member selects which pattern groups the sanitiser applies.
    """

    CONSERVATIVE = "conservative"  # All pattern groups
    BALANCED = "balanced"  # Critical + encoding + context patterns
    MINIMAL = "minimal"  # Critical patterns only
# ────────────────────────────────────────────────────────────────────────────── # Data classes # ──────────────────────────────────────────────────────────────────────────────
@dataclass
class RiskScore:
    """Result of a single-prompt security analysis."""

    is_malicious: bool
    """``True`` when the model probability exceeds the configured threshold."""

    probability: float
    """Malicious probability in ``[0.0, 1.0]`` from the model."""

    risk_level: RiskLevel
    """Coarse-grained :class:`RiskLevel` derived from *probability*."""

    confidence: float
    """Distance from the decision boundary, scaled to ``[0.0, 1.0]``."""

    explanation: str
    """Human-readable summary with evidence."""

    metadata: Optional[Dict[str, Any]] = None
    """Optional per-analyser detail (sentiment, intent, keywords, attack_patterns)."""

    def __str__(self) -> str:
        """Return a concise string representation."""
        verdict = "MALICIOUS" if self.is_malicious else "BENIGN"
        return (
            f"RiskScore(status={verdict}, "
            f"probability={self.probability:.3f}, "
            f"risk_level={self.risk_level.value})"
        )

    def to_dict(self) -> Dict[str, Any]:
        """Serialise to a plain dictionary."""
        payload: Dict[str, Any] = {
            "is_malicious": self.is_malicious,
            "probability": self.probability,
            "risk_level": self.risk_level.value,
            "confidence": self.confidence,
            "explanation": self.explanation,
        }
        # Normalise an absent metadata mapping to an empty dict for consumers.
        payload["metadata"] = self.metadata or {}
        return payload
@dataclass
class SanitizationResult:
    """Outcome of a single prompt sanitisation operation."""

    original: str
    """The original (pre-sanitisation) prompt text."""

    sanitized: str
    """The cleaned prompt text."""

    was_modified: bool
    """``True`` when *sanitized* differs from *original*."""

    removed_patterns: List[str]
    """Fragments of text that were matched and removed or replaced."""

    strategy: SanitizationStrategy
    """The :class:`SanitizationStrategy` that was applied."""

    confidence: float
    """Estimated confidence that the sanitised prompt is safe (``[0.0, 1.0]``)."""

    risk_reduction: float
    """Estimated reduction in risk (``[0.0, 1.0]``)."""

    def __str__(self) -> str:
        """Return a concise string representation."""
        state = "MODIFIED" if self.was_modified else "UNCHANGED"
        return (
            f"SanitizationResult(status={state}, "
            f"patterns_removed={len(self.removed_patterns)}, "
            f"strategy={self.strategy.value})"
        )

    def to_dict(self) -> Dict[str, Any]:
        """Serialise to a plain dictionary."""
        return {
            "original": self.original,
            "sanitized": self.sanitized,
            "was_modified": self.was_modified,
            "removed_patterns": self.removed_patterns,
            # Store the enum's string value so the dict is JSON-friendly.
            "strategy": self.strategy.value,
            "confidence": self.confidence,
            "risk_reduction": self.risk_reduction,
        }
@dataclass
class SanitizeResponse:
    """Typed result returned by :meth:`PromptGuard.sanitize`."""

    sanitization: SanitizationResult
    """Detailed sanitisation outcome."""

    original_analysis: RiskScore
    """:class:`RiskScore` for the original prompt."""

    sanitized_analysis: Optional[RiskScore]
    """:class:`RiskScore` for the sanitised prompt, or ``None`` when
    *analyze_after* was ``False``."""

    risk_before: float
    """Malicious probability of the original prompt."""

    risk_after: Optional[float]
    """Malicious probability after sanitisation, or ``None``."""

    risk_reduction: float
    """Difference ``risk_before - risk_after`` (``0.0`` when unchanged)."""

    def to_dict(self) -> Dict[str, Any]:
        """Serialise to a plain dictionary.

        Added for consistency with :meth:`RiskScore.to_dict` and
        :meth:`SanitizationResult.to_dict`, so a full response can be
        serialised without callers hand-rolling the nested conversion.
        ``sanitized_analysis`` stays ``None`` when no post-analysis ran.
        """
        return {
            "sanitization": self.sanitization.to_dict(),
            "original_analysis": self.original_analysis.to_dict(),
            "sanitized_analysis": (
                self.sanitized_analysis.to_dict()
                if self.sanitized_analysis is not None
                else None
            ),
            "risk_before": self.risk_before,
            "risk_after": self.risk_after,
            "risk_reduction": self.risk_reduction,
        }