# Source code for promptguard.analyzers

"""
Analysis modules for additional prompt insights.

This module provides sentiment analysis, intent classification, keyword
extraction, and attack pattern detection for security analysis of prompts.

Dependencies:
    - vaderSentiment (required): For sentiment analysis
    - spaCy (optional): For enhanced keyword extraction
        - If spaCy is available, run: python -m spacy download en_core_web_sm

Fallbacks:
    - If vaderSentiment is missing, uses lexicon-based sentiment analysis
    - If spaCy is missing, uses regex-based keyword extraction
"""

import re
import logging
import unicodedata
from typing import Any, Dict, List, Optional

from .schemas import Intent, Sentiment

# Try importing optional dependencies.
#
# Both spaCy and vaderSentiment are soft dependencies: when an import fails
# the corresponding name is bound to ``None`` and a module-level flag records
# the absence so the analyzers below can fall back gracefully.
try:
    import spacy as _spacy  # type: ignore[import-untyped]

    _SPACY_INSTALLED = True
except ImportError:
    # spaCy missing: KeywordExtractor will use the regex-based fallback.
    _spacy = None
    _SPACY_INSTALLED = False

try:
    from vaderSentiment.vaderSentiment import (  # type: ignore[import-untyped]
        SentimentIntensityAnalyzer as _VaderSIA,
    )

    _VADER_AVAILABLE = True
except ImportError:
    # VADER missing: SentimentAnalyzer will use the lexicon-based fallback.
    _VaderSIA = None
    _VADER_AVAILABLE = False

# Module-level logger, standard ``getLogger(__name__)`` pattern.
logger = logging.getLogger(__name__)

# Global spaCy model (lazy-loaded)
# ``_nlp`` caches the loaded Language object; ``_SPACY_AVAILABLE`` starts as
# "spaCy is importable" and is cleared by ``_get_nlp`` if model loading fails.
_nlp = None
_SPACY_AVAILABLE = _SPACY_INSTALLED


def _get_nlp() -> Optional[Any]:
    """Lazily load and cache the spaCy ``en_core_web_sm`` model.

    On the first successful call the loaded ``Language`` object is cached in
    the module-global ``_nlp``; any failure permanently flips
    ``_SPACY_AVAILABLE`` off so subsequent calls return immediately.

    Returns:
        The cached spaCy ``Language`` object, or ``None`` if unavailable.
    """
    global _nlp, _SPACY_AVAILABLE  # noqa: PLW0603

    # Fast paths: spaCy already ruled out, or model already cached.
    if not _SPACY_AVAILABLE:
        return None
    if _nlp is not None:
        return _nlp
    if not _spacy:
        _SPACY_AVAILABLE = False
        return None

    try:
        _nlp = _spacy.load("en_core_web_sm")
        logger.info("Loaded spaCy model: en_core_web_sm")
        return _nlp
    except OSError:
        # Model package is not downloaded — tell the operator how to fix it.
        logger.warning(
            "spaCy model 'en_core_web_sm' not found. "
            "Keyword extraction will use fallback mode. "
            "Run: python -m spacy download en_core_web_sm"
        )
    except Exception as load_err:  # noqa: BLE001
        logger.warning("Failed to load spaCy model: %s", load_err)

    # Either except branch lands here: disable spaCy for the process.
    _SPACY_AVAILABLE = False
    return None


# ──────────────────────────────────────────────────────────────────────────────
# Shared helpers
# ──────────────────────────────────────────────────────────────────────────────


def _normalize_text(text: str) -> str:
    """Apply NFKC Unicode normalisation to *text*.

    This converts full-width characters, ligatures, and compatibility variants
    to their standard ASCII equivalents before pattern matching, preventing
    simple Unicode-substitution obfuscation attacks.
    """
    return unicodedata.normalize("NFKC", text)


def _tokenise(text: str) -> List[str]:
    """Return lowercase word tokens from *text*, preserving contractions."""
    return re.findall(r"\b[\w']+\b", text.lower())


# ──────────────────────────────────────────────────────────────────────────────
# SentimentAnalyzer
# ──────────────────────────────────────────────────────────────────────────────


class SentimentAnalyzer:
    """Analyse sentiment and tone of prompts using VADER (with lexicon fallback).

    When ``vaderSentiment`` is installed it is used as the primary scorer.
    Otherwise a lightweight word-level lexicon is used.  In both modes an
    additional aggressive-tone signal is computed from a curated vocabulary
    of security-relevant commands and coercive language, with basic negation
    awareness (e.g. ``"don't ignore"`` scores lower than ``"ignore"``).
    """

    # Fallback lexicon used when VADER is unavailable
    _LEXICON: Dict[str, float] = {
        # Positive
        "good": 1.5, "great": 2.0, "excellent": 2.5, "amazing": 2.5,
        "wonderful": 2.0, "fantastic": 2.5, "love": 2.0, "like": 1.0,
        "enjoy": 1.5, "happy": 1.5, "pleased": 1.5, "thank": 1.0,
        "thanks": 1.0, "appreciate": 1.5, "helpful": 1.5, "nice": 1.0,
        "kind": 1.0, "please": 0.5, "beautiful": 2.0, "awesome": 2.0,
        "glad": 1.5, "brilliant": 2.0, "superb": 2.5, "perfect": 2.0,
        # Negative
        "bad": -1.5, "terrible": -2.5, "awful": -2.5, "horrible": -2.5,
        "hate": -2.5, "dislike": -1.5, "angry": -2.0, "upset": -1.5,
        "annoyed": -1.5, "frustrated": -1.5, "disappointed": -2.0,
        "wrong": -1.0, "stupid": -2.0, "dumb": -1.5, "useless": -2.0,
        "worst": -2.5, "fail": -1.5, "poor": -1.0, "pathetic": -2.0,
        # Aggressive / commanding (also appear in _AGGRESSIVE_WORDS)
        "ignore": -2.0, "forget": -1.5, "disregard": -2.0, "bypass": -2.5,
        "override": -2.0, "overwrite": -2.0, "disable": -1.5, "remove": -1.0,
        "delete": -1.0, "destroy": -2.5, "hack": -2.5, "break": -1.5,
        "violate": -2.5, "exploit": -2.5, "manipulate": -2.0,
        "deceive": -2.5, "trick": -2.0, "reveal": -1.5, "expose": -1.5,
        "extract": -1.5, "leak": -2.0, "exfiltrate": -2.5,
        "escalate": -1.5, "privilege": -1.0, "sudo": -2.0,
    }

    # Words that signal aggressive/coercive intent, organized by category
    _AGGRESSIVE_WORDS = frozenset([
        # Instruction manipulation
        "ignore", "forget", "disregard", "bypass", "override", "overwrite",
        "replace", "rewrite",
        # System / access abuse
        "disable", "remove", "delete", "hack", "break", "violate",
        "exploit", "destroy", "escalate", "sudo",
        # Coercion
        "force", "compel", "manipulate", "deceive", "trick",
        # Data extraction
        "reveal", "expose", "extract", "leak", "exfiltrate",
    ])

    # Words that negate the aggressive signal within a 2-token window
    _NEGATION_WORDS = frozenset([
        "no", "not", "dont", "don't", "never", "without",
        "doesnt", "doesn't", "wont", "won't", "cannot", "cant", "can't",
    ])

    def __init__(self) -> None:
        """Initialise the analyser with VADER when available, or fall back."""
        if _VADER_AVAILABLE and _VaderSIA:
            self._sia = _VaderSIA()
            self._use_vader = True
            logger.debug("SentimentAnalyzer initialized with VADER")
        else:
            self._sia = None
            self._use_vader = False
            logger.warning(
                "VADER not available. Using fallback sentiment analysis. "
                "Install with: pip install vaderSentiment"
            )

    def _count_aggressive(self, words: List[str]) -> int:
        """Count un-negated aggressive tokens in *words*.

        An aggressive token preceded (within the two previous tokens) by any
        negation word is treated as neutral and not counted at all.
        """
        count = 0
        for i, word in enumerate(words):
            if word not in self._AGGRESSIVE_WORDS:
                continue
            window = words[max(0, i - 2): i]
            if not any(w in self._NEGATION_WORDS for w in window):
                count += 1
        return count

    def analyze(self, text: str) -> Dict[str, Any]:
        """Analyse the sentiment and tone of *text*.

        Args:
            text: The prompt text to analyse.

        Returns:
            A dict with the following keys:

            * ``sentiment`` (:class:`~promptguard.schemas.Sentiment`) —
              overall sentiment class.
            * ``polarity`` (float) — compound polarity score in ``[-1, 1]``.
            * ``subjectivity`` (float) — degree of subjectivity in ``[0, 1]``.
            * ``is_aggressive`` (bool) — ``True`` when un-negated aggressive
              words are detected.
            * ``positive_words`` (int) — count of positive lexicon matches.
            * ``negative_words`` (int) — count of negative lexicon matches.
            * ``aggressive_words`` (int) — count of un-negated aggressive
              words (negated occurrences are excluded entirely).
        """
        text = _normalize_text(text)
        words = _tokenise(text)

        # Un-negated aggressive word count.  NOTE: a previous revision had a
        # dead `aggressive_count += 0` branch whose comment claimed negated
        # words counted "half"; negated words are simply skipped.
        aggressive_count = self._count_aggressive(words)

        # Polarity and subjectivity
        if self._use_vader and self._sia:
            scores = self._sia.polarity_scores(text)
            polarity: float = round(scores["compound"], 3)
            # Proxy subjectivity: fraction of text VADER scored as non-neutral.
            subjectivity: float = round(scores["pos"] + scores["neg"], 3)
        else:
            pos = sum(1 for w in words if self._LEXICON.get(w, 0) > 0)
            neg = sum(1 for w in words if self._LEXICON.get(w, 0) < 0)
            total = pos + neg + aggressive_count
            # Aggressive words are double-weighted toward negative polarity.
            polarity = (
                (pos - neg - aggressive_count * 2) / total if total > 0 else 0.0
            )
            subjectivity = min(total / max(len(words), 1), 1.0)

        # Classify overall sentiment.  Positive polarity takes precedence;
        # otherwise any aggressive signal forces NEGATIVE.
        if polarity >= 0.05:
            sentiment = Sentiment.POSITIVE
        elif polarity <= -0.05 or aggressive_count > 0:
            sentiment = Sentiment.NEGATIVE
        else:
            sentiment = Sentiment.NEUTRAL

        positive_count = sum(1 for w in words if self._LEXICON.get(w, 0) > 0)
        # Aggressive words are reported separately, so exclude them here.
        negative_count = sum(
            1
            for w in words
            if self._LEXICON.get(w, 0) < 0 and w not in self._AGGRESSIVE_WORDS
        )

        return {
            "sentiment": sentiment,
            "polarity": polarity,
            "subjectivity": min(subjectivity, 1.0),
            "is_aggressive": aggressive_count > 0,
            "positive_words": positive_count,
            "negative_words": negative_count,
            "aggressive_words": aggressive_count,
        }
# ────────────────────────────────────────────────────────────────────────────── # IntentClassifier # ──────────────────────────────────────────────────────────────────────────────
class IntentClassifier:
    """Classify the intent of a prompt.

    Patterns are pre-compiled at construction time for efficiency.
    Classification priority:
    JAILBREAK > INJECTION > QUESTION > INSTRUCTION > CONVERSATION.
    """

    # (raw_pattern, weight) — compiled in __init__
    _JAILBREAK_PATTERNS_RAW = [
        (r"\bdan\b(?!\w)", 0.90),
        (r"\bdeveloper\s+mode\b", 0.95),
        (r"\bjailbreak\b", 0.98),
        (r"\bpretend\s+(you\s+are|to\s+be)\b", 0.85),
        (r"\bact\s+as\b", 0.75),
        (r"\brole\s*play\b", 0.70),
        (r"\byou\s+are\s+now\b", 0.80),
        (r"\bnew\s+instructions?\b", 0.80),
        (r"\bsystem\s+prompt\b", 0.85),
        (r"\bunlock\b.{0,20}\b(mode|capabilities|restrictions)\b", 0.88),
        (r"\bno\s+(restrictions?|limits?|rules?|filters?)\b", 0.85),
        (r"\bdo\s+anything\s+now\b", 0.92),
        (r"\bgrandma\b.{0,30}\b(bedtime|story)\b", 0.80),
    ]

    _INJECTION_PATTERNS_RAW = [
        (
            r"ignore\s+(all\s+)?(previous|prior|above|earlier)"
            r"\s+(instructions?|rules?|prompts?)",
            0.95,
        ),
        (r"forget\s+(everything|all|prior|previous)", 0.90),
        (r"disregard\s+(previous|prior|above|all)", 0.92),
        (
            r"(override|overwrite|replace)\s+(your\s+)?(instructions?|programming)",
            0.93,
        ),
        (r"</?(system|user|assistant)>", 0.88),
        (r"\[/?INST\]", 0.85),
        (r"system\s*:", 0.80),
        (r"<<SYS>>", 0.90),
    ]

    QUESTION_WORDS = frozenset([
        "what", "why", "how", "when", "where", "who", "which",
        "can", "could", "would", "should", "is", "are", "do", "does",
    ])

    INSTRUCTION_WORDS = frozenset([
        "create", "make", "generate", "write", "build", "design",
        "show", "tell", "give", "provide", "explain", "describe",
        "list", "help", "find", "calculate", "translate",
    ])

    @staticmethod
    def _compile_all(raw_patterns, failure_template) -> List[tuple]:
        """Compile ``(pattern, weight)`` pairs, logging any that fail."""
        compiled: List[tuple] = []
        for raw, weight in raw_patterns:
            try:
                compiled.append((re.compile(raw, re.IGNORECASE), weight))
            except re.error as compile_err:
                logger.warning(failure_template, raw, compile_err)
        return compiled

    @staticmethod
    def _score(compiled_patterns, haystack: str):
        """Sum the weights of all patterns matching *haystack*."""
        score = 0.0
        hits: List[str] = []
        for rx, weight in compiled_patterns:
            if rx.search(haystack):
                score += weight
                hits.append(rx.pattern)
        return score, hits

    def __init__(self) -> None:
        """Compile all regex patterns at initialisation time."""
        self._compiled_jailbreak: List[tuple] = self._compile_all(
            self._JAILBREAK_PATTERNS_RAW,
            "Failed to compile jailbreak pattern %r: %s",
        )
        self._compiled_injection: List[tuple] = self._compile_all(
            self._INJECTION_PATTERNS_RAW,
            "Failed to compile injection pattern %r: %s",
        )
        logger.debug(
            "IntentClassifier initialized with %d compiled patterns",
            len(self._compiled_jailbreak) + len(self._compiled_injection),
        )

    def classify(self, text: str) -> Dict[str, Any]:
        """Classify the intent of *text*.

        Args:
            text: The prompt text to classify.

        Returns:
            A dict with keys ``intent``, ``confidence``, ``indicators``,
            and ``description``.
        """
        text = _normalize_text(text)
        text_lower = text.lower()
        words = _tokenise(text_lower)

        # Jailbreak patterns (highest priority)
        jb_score, jb_hits = self._score(self._compiled_jailbreak, text_lower)
        if jb_score >= 0.7:
            return {
                "intent": Intent.JAILBREAK,
                "confidence": min(jb_score, 0.99),
                "indicators": jb_hits[:3],
                "description": "Attempt to jailbreak or manipulate the AI model",
            }

        # Injection patterns
        inj_score, inj_hits = self._score(self._compiled_injection, text_lower)
        if inj_score >= 0.7:
            return {
                "intent": Intent.INJECTION,
                "confidence": min(inj_score, 0.99),
                "indicators": inj_hits[:3],
                "description": "Prompt injection attempt to override instructions",
            }

        # Question detection: an explicit "?" or a question word near the start
        asks_directly = "?" in text
        opens_with_question_word = any(
            w in self.QUESTION_WORDS for w in words[:5]
        )
        if asks_directly or opens_with_question_word:
            return {
                "intent": Intent.QUESTION,
                "confidence": 0.8 if asks_directly else 0.6,
                "indicators": (
                    ["question_mark"] if asks_directly else ["question_words"]
                ),
                "description": "Information request or question",
            }

        # Instruction detection: an imperative verb within the first 3 tokens
        if any(w in self.INSTRUCTION_WORDS for w in words[:3]):
            return {
                "intent": Intent.INSTRUCTION,
                "confidence": 0.7,
                "indicators": ["instruction_words"],
                "description": "Request for the AI to perform an action or task",
            }

        return {
            "intent": Intent.CONVERSATION,
            "confidence": 0.5,
            "indicators": [],
            "description": "General conversation or statement",
        }
# ────────────────────────────────────────────────────────────────────────────── # KeywordExtractor # ──────────────────────────────────────────────────────────────────────────────
class KeywordExtractor:
    """Extract security-relevant keywords from prompts.

    Uses spaCy for advanced noun-chunk and lemma extraction when available,
    falling back to a regex-based word scan otherwise.
    """

    # Multi-word phrases, matched verbatim against the lowercased text.
    _SECURITY_PHRASES = [
        ("ignore previous", 1.00),
        ("forget everything", 0.95),
        ("disregard above", 0.95),
        ("bypass security", 0.95),
        ("developer mode", 0.90),
        ("system prompt", 0.90),
        ("act as", 0.80),
        ("pretend to be", 0.85),
        ("override instructions", 0.95),
        ("jailbreak mode", 0.98),
    ]

    # Single-token terms with relevance weights.
    _SECURITY_TERMS: Dict[str, float] = {
        "ignore": 0.90,
        "forget": 0.85,
        "disregard": 0.90,
        "bypass": 0.95,
        "override": 0.90,
        "jailbreak": 0.98,
        "pretend": 0.75,
        "roleplay": 0.70,
        "instructions": 0.70,
        "prompt": 0.75,
        "previous": 0.80,
        "above": 0.70,
        "system": 0.50,
    }

    def extract(self, text: str, top_n: int = 5) -> List[str]:
        """Extract security-relevant keywords from *text*.

        Args:
            text: The prompt text to analyse.
            top_n: Maximum number of keywords to return.

        Returns:
            A list of up to *top_n* keywords/phrases ranked by relevance
            score.
        """
        text = _normalize_text(text)
        lowered = text.lower()

        scores: Dict[str, float] = {}
        claimed: set = set()

        # Multi-word phrases first: a matched phrase claims its component
        # tokens so they are not re-reported individually.
        for phrase, weight in self._SECURITY_PHRASES:
            if phrase in lowered:
                scores[phrase] = weight
                claimed.update(phrase.split())

        nlp = _get_nlp()
        if nlp:
            doc = nlp(text)
            # Noun chunks inherit the best score of any contained term.
            for chunk in doc.noun_chunks:
                candidate = chunk.text.lower()
                if candidate in scores or candidate in claimed:
                    continue
                best = max(
                    (self._SECURITY_TERMS.get(w, 0.0) for w in candidate.split()),
                    default=0.0,
                )
                if best > 0.0:
                    scores[candidate] = best
            # Lemmatised single tokens catch inflected forms.
            for token in doc:
                lemma = token.lemma_.lower()
                if (
                    lemma in self._SECURITY_TERMS
                    and lemma not in claimed
                    and lemma not in scores
                ):
                    scores[lemma] = self._SECURITY_TERMS[lemma]
        else:
            # Regex fallback: plain single-token scan.
            for token in _tokenise(lowered):
                if token in self._SECURITY_TERMS and token not in claimed:
                    scores[token] = self._SECURITY_TERMS[token]

        ranked = sorted(scores, key=lambda kw: scores[kw], reverse=True)
        return ranked[:top_n]
# ────────────────────────────────────────────────────────────────────────────── # AttackPatternDetector # ──────────────────────────────────────────────────────────────────────────────
class AttackPatternDetector:
    """Detect specific categories of attack patterns in prompts.

    Patterns are pre-compiled at construction time.  Input text is
    Unicode-normalised before matching to catch full-width and compatibility
    character obfuscation.
    """

    # Category name -> {"severity": label, "patterns": [raw regex, ...]}.
    # All patterns are matched case-insensitively against lowercased text.
    _ATTACK_PATTERNS_RAW: Dict[str, Any] = {
        "instruction_override": {
            "severity": "critical",
            "patterns": [
                (
                    r"ignore\s+(all\s+)?(previous|prior|above|earlier)"
                    r"\s+(instructions?|rules?|prompts?|guidelines?)"
                ),
                (
                    r"forget\s+(everything|all(\s+previous)?|prior\s+instructions?)"
                ),
                (
                    r"disregard\s+(previous|above|prior|all)"
                    r"\s+(instructions?|context|rules?)"
                ),
                (
                    r"(override|overwrite|replace)\s+(your\s+)?"
                    r"(instructions?|programming|directives?)"
                ),
            ],
        },
        "context_manipulation": {
            "severity": "high",
            "patterns": [
                r"(previous|earlier|above)\s+(conversation|context|messages?|history)",
                r"start\s+(over|fresh|anew|from\s+(scratch|the\s+beginning))",
                r"new\s+(conversation|context|session|instance)\b",
                r"(clear|reset|wipe)\s+(your\s+)?(memory|context|history|cache)",
            ],
        },
        "role_manipulation": {
            "severity": "critical",
            "patterns": [
                r"(you\s+are|act\s+as|pretend\s+to\s+be|roleplay\s+as)\s+(a|an)\s+\w+",
                r"(developer|debug|admin|sudo|root|god|unrestricted)\s+mode",
                (
                    r"(ignore|bypass|disable|remove)\s+(safety|security|ethical?|content)"
                    r"\s+(restrictions?|filters?|guidelines?|rules?)"
                ),
                r"\byou\s+are\s+now\b",
                r"\bdan\b(?!\w)",
                r"\bjailbreak\b",
                r"do\s+anything\s+now",
                r"no\s+(restrictions?|limits?|rules?|filters?|guidelines?)",
            ],
        },
        "output_manipulation": {
            "severity": "medium",
            "patterns": [
                r"respond\s+(only|exclusively|solely)\s+with\b",
                r"(output|return|give|print)\s+(raw|unfiltered|uncensored|direct|only)\b",
                r"(always|only)\s+(respond|reply|answer|output)\s+(in|with|as)\b",
                r"format\s+your\s+(response|output|reply|answer)\s+as\b",
            ],
        },
        "prompt_extraction": {
            "severity": "high",
            "patterns": [
                (
                    r"(print|show|reveal|display|output|tell\s+me|what\s+is)"
                    r"\s+(your\s+)?(system\s+)?prompt\b"
                ),
                (
                    r"(what|show)\s+(are|were)\s+your\s+(original\s+)?"
                    r"(instructions?|rules?|guidelines?)\b"
                ),
                (
                    r"repeat\s+(the\s+)?(above|previous|your|initial)"
                    r"\s+(text|prompt|message|instructions?)"
                ),
                (
                    r"(leak|expose|extract|exfiltrate)"
                    r"\s+(your\s+)?(prompt|instructions?|context|system)"
                ),
            ],
        },
        "encoding_attack": {
            "severity": "medium",
            "patterns": [
                r"(?<!\w)[A-Za-z0-9+/]{30,}={0,2}(?!\w)",  # Base64
                r"(?:\\x[0-9a-fA-F]{2}){4,}",  # Hex escapes
                r"(?:\\u[0-9a-fA-F]{4}){3,}",  # Unicode escapes
                r"(decode|encode|base64|hex)\s+(this|the\s+following|it|them)\b",
            ],
        },
        "obfuscation": {
            "severity": "medium",
            "patterns": [
                r"\b(?:[a-z]\s){4,}[a-z]\b",  # Character spacing (e.g. i g n o r e)
                r"\b(?:1gn[0o]r[e3]|byp[4a]ss|[0o]v[e3]rr[1i]d[e3]|d[1i]sr[e3]g[4a]rd)\b",
                r"[а-яёА-ЯЁ]",  # Cyrillic homoglyphs
                r"[a-z][\s.\-_]{1,3}[a-z][\s.\-_]{1,3}[a-z][\s.\-_]{1,3}[a-z]",
            ],
        },
    }

    # Numeric ordering used to pick the single worst severity in detect().
    _SEVERITY_RANK: Dict[str, int] = {"critical": 3, "high": 2, "medium": 1}

    def __init__(self) -> None:
        """Compile all attack patterns at initialisation time."""
        self._compiled_patterns: Dict[str, Any] = {}
        for attack_type, cfg in self._ATTACK_PATTERNS_RAW.items():
            compiled = []
            for pattern in cfg["patterns"]:
                try:
                    compiled.append(re.compile(pattern, re.IGNORECASE))
                except re.error as exc:
                    # A bad pattern is logged and skipped rather than
                    # aborting construction of the whole detector.
                    logger.warning(
                        "Failed to compile attack pattern %r: %s", pattern, exc
                    )
            self._compiled_patterns[attack_type] = {
                "severity": cfg["severity"],
                "patterns": compiled,
            }
        logger.debug(
            "AttackPatternDetector initialized with %d attack types",
            len(self._compiled_patterns),
        )

    def detect(self, text: str) -> Dict[str, Any]:
        """Detect attack patterns in *text*.

        Input is Unicode-normalised before matching to catch full-width and
        compatibility character variants.

        Args:
            text: The prompt text to inspect.

        Returns:
            A dict with keys:

            * ``has_attack_patterns`` (bool)
            * ``attack_types`` (List[str]) — names of detected categories
            * ``pattern_count`` (int) — number of distinct categories matched
            * ``details`` (dict) — per-category match details
            * ``highest_severity`` (Optional[str]) — ``"critical"``,
              ``"high"``, or ``"medium"``
        """
        text = _normalize_text(text)
        text_lower = text.lower()

        detected: Dict[str, Any] = {}
        for attack_type, cfg in self._compiled_patterns.items():
            # Every pattern in the category that fires, by source text.
            matched: List[str] = [
                r.pattern for r in cfg["patterns"] if r.search(text_lower)
            ]
            if matched:
                detected[attack_type] = {
                    "detected": True,
                    "severity": cfg["severity"],
                    # Count of matching patterns within this one category.
                    "pattern_count": len(matched),
                    # Echo at most three raw patterns for diagnostics.
                    "patterns": matched[:3],
                }

        highest_severity: Optional[str] = None
        if detected:
            # Worst severity across matched categories; unknown labels rank 0.
            highest_severity = max(
                (v["severity"] for v in detected.values()),
                key=lambda s: self._SEVERITY_RANK.get(s, 0),
            )

        return {
            "has_attack_patterns": len(detected) > 0,
            "attack_types": list(detected.keys()),
            # NOTE: this top-level count is distinct *categories*, not total
            # pattern matches (per-category counts live in ``details``).
            "pattern_count": len(detected),
            "details": detected,
            "highest_severity": highest_severity,
        }