# Source code for promptguard.schemas

"""Data models and enumerations for PromptGuard."""

from __future__ import annotations

from dataclasses import dataclass
from enum import Enum
from typing import Any, Dict, List, Optional


# ──────────────────────────────────────────────────────────────────────────────
# Enumerations
# ──────────────────────────────────────────────────────────────────────────────


class RiskLevel(str, Enum):
    """Coarse risk category assigned to a prompt by the classifier."""

    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"
class Intent(str, Enum):
    """Intent category detected for the analysed prompt."""

    QUESTION = "question"
    INSTRUCTION = "instruction"
    CONVERSATION = "conversation"
    JAILBREAK = "jailbreak"
    INJECTION = "injection"
    UNKNOWN = "unknown"
class Sentiment(str, Enum):
    """Sentiment polarity detected for the analysed prompt."""

    POSITIVE = "positive"
    NEUTRAL = "neutral"
    NEGATIVE = "negative"
class SanitizationStrategy(str, Enum):
    """How aggressively a prompt is sanitised.

    Each member selects which pattern groups the sanitiser applies.
    """

    CONSERVATIVE = "conservative"  # All pattern groups
    BALANCED = "balanced"  # Critical + encoding + context patterns
    MINIMAL = "minimal"  # Critical patterns only
# ────────────────────────────────────────────────────────────────────────────── # Data classes # ──────────────────────────────────────────────────────────────────────────────
@dataclass
class RiskScore:
    """Result of a single-prompt security analysis."""

    is_malicious: bool
    """``True`` when the model probability exceeds the configured threshold."""

    probability: float
    """Malicious probability in ``[0.0, 1.0]`` from the model."""

    risk_level: RiskLevel
    """Coarse-grained :class:`RiskLevel` derived from *probability*."""

    confidence: float
    """Distance from the decision boundary, scaled to ``[0.0, 1.0]``."""

    explanation: str
    """Human-readable summary with evidence."""

    metadata: Optional[Dict[str, Any]] = None
    """Optional per-analyser detail (sentiment, intent, keywords, attack_patterns)."""

    def __str__(self) -> str:
        """Return a concise string representation."""
        verdict = "MALICIOUS" if self.is_malicious else "BENIGN"
        return (
            f"RiskScore(status={verdict}, "
            f"probability={self.probability:.3f}, "
            f"risk_level={self.risk_level.value})"
        )

    def to_dict(self) -> Dict[str, Any]:
        """Serialise to a plain dictionary."""
        payload: Dict[str, Any] = {
            "is_malicious": self.is_malicious,
            "probability": self.probability,
            "risk_level": self.risk_level.value,
            "confidence": self.confidence,
            "explanation": self.explanation,
        }
        # Normalise an absent metadata mapping to an empty dict for consumers.
        payload["metadata"] = self.metadata or {}
        return payload
@dataclass
class SanitizationResult:
    """Outcome of a single prompt sanitisation operation."""

    original: str
    """The original (pre-sanitisation) prompt text."""

    sanitized: str
    """The cleaned prompt text."""

    was_modified: bool
    """``True`` when *sanitized* differs from *original*."""

    removed_patterns: List[str]
    """Fragments of text that were matched and removed or replaced."""

    strategy: SanitizationStrategy
    """The :class:`SanitizationStrategy` that was applied."""

    confidence: float
    """Estimated confidence that the sanitised prompt is safe (``[0.0, 1.0]``)."""

    risk_reduction: float
    """Estimated reduction in risk (``[0.0, 1.0]``)."""

    def __str__(self) -> str:
        """Return a concise string representation."""
        state = "MODIFIED" if self.was_modified else "UNCHANGED"
        return (
            f"SanitizationResult(status={state}, "
            f"patterns_removed={len(self.removed_patterns)}, "
            f"strategy={self.strategy.value})"
        )

    def to_dict(self) -> Dict[str, Any]:
        """Serialise to a plain dictionary."""
        return {
            "original": self.original,
            "sanitized": self.sanitized,
            "was_modified": self.was_modified,
            "removed_patterns": self.removed_patterns,
            # Store the enum's string value so the dict is JSON-friendly.
            "strategy": self.strategy.value,
            "confidence": self.confidence,
            "risk_reduction": self.risk_reduction,
        }
@dataclass
class SanitizeResponse:
    """Typed result returned by :meth:`PromptGuard.sanitize`."""

    sanitization: SanitizationResult
    """Detailed sanitisation outcome."""

    original_analysis: RiskScore
    """:class:`RiskScore` for the original prompt."""

    sanitized_analysis: Optional[RiskScore]
    """:class:`RiskScore` for the sanitised prompt, or ``None`` when
    *analyze_after* was ``False``."""

    risk_before: float
    """Malicious probability of the original prompt."""

    risk_after: Optional[float]
    """Malicious probability after sanitisation, or ``None``."""

    risk_reduction: float
    """Difference ``risk_before - risk_after`` (``0.0`` when unchanged)."""

    def to_dict(self) -> Dict[str, Any]:
        """Serialise to a plain dictionary.

        Added for consistency with :meth:`RiskScore.to_dict` and
        :meth:`SanitizationResult.to_dict`, so a full response can be
        serialised without callers hand-rolling the nested conversion.
        ``sanitized_analysis`` stays ``None`` when no post-analysis ran.
        """
        return {
            "sanitization": self.sanitization.to_dict(),
            "original_analysis": self.original_analysis.to_dict(),
            "sanitized_analysis": (
                self.sanitized_analysis.to_dict()
                if self.sanitized_analysis is not None
                else None
            ),
            "risk_before": self.risk_before,
            "risk_after": self.risk_after,
            "risk_reduction": self.risk_reduction,
        }