Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
c817550
feat: implement prompt injection detection module (Issue #1979)
nac7 Jun 6, 2026
93cf74b
Fix 8 code review issues for prompt injection detection (Issue #1979)
nac7 Jun 6, 2026
4b97999
Fix: Make injection detection sensitivity configurable via RailsConfig
nac7 Jun 6, 2026
874b81e
Fix: Remove user content from injection detection error messages
nac7 Jun 6, 2026
c1e3ff5
Add full Apache license header to test file
nac7 Jun 6, 2026
45afb93
fix: apply ruff formatting and linting fixes for PR #1998
nac7 Jun 6, 2026
db7414d
docs: add prompt injection detection to CHANGELOG
nac7 Jun 6, 2026
830bd5d
fix: improve regex patterns for injection detection
nac7 Jun 6, 2026
31eb805
fix: add patterns for jailbreak and ignore safety measures
nac7 Jun 6, 2026
5556dff
fix: update codecov action to v4 to resolve GPG verification error
nac7 Jun 6, 2026
f4202a9
fix: remove matched user input from injection error message and add c…
nac7 Jun 7, 2026
6d0481c
fix(injections): make 'safety' mandatory in ignore_safety pattern to …
nac7 Jun 8, 2026
a4817cf
fix: correct nested_comment regex to avoid false positives on Windows…
nac7 Jun 8, 2026
23c1f88
fix: export PromptInjectionDetectedError publicly and move jailbreak_…
nac7 Jun 8, 2026
0decc5b
fix: add re.DOTALL so nested_comment and variable_expansion catch mul…
nac7 Jun 8, 2026
60d556d
fix: add injection detection gate to check() and check_async() in Gua…
nac7 Jun 8, 2026
327a9f0
fix(guardrails): add injection detection gate to generate_events methods
nac7 Jun 8, 2026
9ce8b10
fix(guardrails): extend injection detection to process_events methods
nac7 Jun 8, 2026
8ec212d
test(guardrails): cover _scan_events_for_injection skip branches
nac7 Jun 8, 2026
5fcbe55
Merge remote-tracking branch 'upstream/develop' into fix/prompt-injec…
nac7 Jun 18, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
- *(llm)* Add LangChain adapter and framework registry ([#1759](https://github.com/NVIDIA-NeMo/Guardrails/issues/1759))
- *(llm)* Add streaming tool call accumulation and LLMResponse parity ([#1789](https://github.com/NVIDIA-NeMo/Guardrails/issues/1789))
- *(llm)* Add default framework with OpenAI-compatible client ([#1797](https://github.com/NVIDIA-NeMo/Guardrails/issues/1797))
- *(llm)* Add prompt injection detection with configurable sensitivity levels ([#1979](https://github.com/NVIDIA-NeMo/Guardrails/issues/1979))
- *(llm/frameworks)* Validate framework on registration ([#1863](https://github.com/NVIDIA-NeMo/Guardrails/issues/1863))
- *(types)* Add framework-agnostic LLM type system ([#1745](https://github.com/NVIDIA-NeMo/Guardrails/issues/1745))
- *(compat)* Transitional compat layer to migrate from 0.21 to 0.22+ ([#1841](https://github.com/NVIDIA-NeMo/Guardrails/issues/1841))
Expand Down
2 changes: 2 additions & 0 deletions nemoguardrails/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@
set_default_framework,
)
from nemoguardrails.llm.providers import register_provider # noqa: E402
from nemoguardrails.rails.llm.injections import PromptInjectionDetectedError # noqa: E402
from nemoguardrails.types import ( # noqa: E402
ChatMessage,
FinishReason,
Expand Down Expand Up @@ -92,6 +93,7 @@
"ToolCall",
"ToolCallFunction",
"UsageInfo",
"PromptInjectionDetectedError",
"get_default_framework",
"register_framework",
"register_provider",
Expand Down
95 changes: 95 additions & 0 deletions nemoguardrails/guardrails/guardrails.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
from nemoguardrails.guardrails.iorails import IORails
from nemoguardrails.logging.explain import ExplainInfo
from nemoguardrails.rails.llm.config import RailsConfig
from nemoguardrails.rails.llm.injections import PromptInjectionDetectedError, validate_prompt_safety
from nemoguardrails.rails.llm.llmrails import LLMRails
from nemoguardrails.rails.llm.options import GenerationResponse, RailsResult, RailType
from nemoguardrails.types import LLMModel
Expand Down Expand Up @@ -210,6 +211,17 @@ def generate(
"""Generate an LLM response synchronously with guardrails applied.
Supported in both IORails and LLMRails
"""
# Validate input for prompt injection attempts if enabled
if self.config.injection_detection_enabled:
try:
validate_prompt_safety(
prompt=prompt,
messages=messages,
sensitivity=self.config.injection_detection_sensitivity,
)
except PromptInjectionDetectedError as e:
log.warning(f"Prompt injection attempt blocked: {e}")
raise

generate_messages = self._convert_to_messages(prompt, messages)
return self.rails_engine.generate(messages=generate_messages, **kwargs)
Expand Down Expand Up @@ -238,6 +250,18 @@ async def generate_async(
"""Generate an LLM response asynchronously with guardrails applied.
Supported by both LLMRails and IORails
"""
# Validate input for prompt injection attempts if enabled
if self.config.injection_detection_enabled:
try:
validate_prompt_safety(
prompt=prompt,
messages=messages,
sensitivity=self.config.injection_detection_sensitivity,
)
except PromptInjectionDetectedError as e:
log.warning(f"Prompt injection attempt blocked: {e}")
raise

await self._ensure_started()

generate_messages = self._convert_to_messages(prompt, messages)
Expand All @@ -247,6 +271,17 @@ def stream_async(
self, prompt: str | None = None, messages: LLMMessages | None = None, **kwargs
) -> AsyncIterator[str | dict]:
"""Generate an LLM response asynchronously with streaming support."""
# Validate input for prompt injection attempts if enabled
if self.config.injection_detection_enabled:
try:
validate_prompt_safety(
prompt=prompt,
messages=messages,
sensitivity=self.config.injection_detection_sensitivity,
)
except PromptInjectionDetectedError as e:
log.warning(f"Prompt injection attempt blocked: {e}")
raise

stream_messages = self._convert_to_messages(prompt, messages)

Expand Down Expand Up @@ -320,6 +355,9 @@ async def generate_events_async(self, events: List[dict]) -> List[dict]:
"""Generate the next events based on the provided history.
Only supported for LLMRails.
"""
if self.config.injection_detection_enabled:
self._scan_events_for_injection(events)

if isinstance(self.rails_engine, IORails):
raise NotImplementedError("IORails doesn't support generate_events_async()")

Expand All @@ -330,12 +368,41 @@ def generate_events(self, events: List[dict]) -> List[dict]:
"""Synchronous version of generate_events_async.
Only supported for LLMRails.
"""
if self.config.injection_detection_enabled:
self._scan_events_for_injection(events)

if isinstance(self.rails_engine, IORails):
raise NotImplementedError("IORails doesn't support generate_events()")

llmrails = cast(LLMRails, self.rails_engine)
return llmrails.generate_events(events)

def _scan_events_for_injection(self, events: List[dict]) -> None:
"""Scan user-input events for prompt injection and raise if one is found.

Inspects UserMessage (Colang 1.0) and UtteranceUserActionFinished (Colang 2.x)
events, which carry raw user text that could contain injection payloads.
"""
for event in events:
if not isinstance(event, dict):
continue
event_type = event.get("type", "")
if event_type == "UserMessage":
text = event.get("text")
elif event_type == "UtteranceUserActionFinished":
text = event.get("final_transcript")
else:
continue
if text and isinstance(text, str):
try:
validate_prompt_safety(
prompt=text,
sensitivity=self.config.injection_detection_sensitivity,
)
except PromptInjectionDetectedError as e:
log.warning(f"Prompt injection attempt blocked: {e}")
raise

async def process_events_async(
self,
events: List[dict],
Expand All @@ -345,6 +412,9 @@ async def process_events_async(
"""Process a sequence of events in a given state.
Only supported for LLMRails.
"""
if self.config.injection_detection_enabled:
self._scan_events_for_injection(events)

if isinstance(self.rails_engine, IORails):
raise NotImplementedError("IORails doesn't support process_events_async()")

Expand All @@ -360,6 +430,9 @@ def process_events(
"""Synchronous version of process_events_async.
Only supported for LLMRails.
"""
if self.config.injection_detection_enabled:
self._scan_events_for_injection(events)

if isinstance(self.rails_engine, IORails):
raise NotImplementedError("IORails doesn't support process_events()")

Expand All @@ -374,6 +447,17 @@ async def check_async(
"""Run rails on messages based on their content (asynchronous).
Only supported for LLMRails.
"""
# Validate input for prompt injection attempts if enabled
if self.config.injection_detection_enabled:
try:
validate_prompt_safety(
messages=messages,
sensitivity=self.config.injection_detection_sensitivity,
)
except PromptInjectionDetectedError as e:
log.warning(f"Prompt injection attempt blocked: {e}")
raise

if isinstance(self.rails_engine, IORails):
raise NotImplementedError("IORails doesn't support check_async()")

Expand All @@ -388,6 +472,17 @@ def check(
"""Synchronous version of check_async.
Only supported for LLMRails.
"""
# Validate input for prompt injection attempts if enabled
if self.config.injection_detection_enabled:
try:
validate_prompt_safety(
messages=messages,
sensitivity=self.config.injection_detection_sensitivity,
)
except PromptInjectionDetectedError as e:
log.warning(f"Prompt injection attempt blocked: {e}")
raise

if isinstance(self.rails_engine, IORails):
raise NotImplementedError("IORails doesn't support check()")

Expand Down
14 changes: 14 additions & 0 deletions nemoguardrails/rails/llm/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -1805,6 +1805,20 @@ class RailsConfig(BaseModel):
description="Configuration for OTEL metrics emission (independent of tracing).",
)

injection_detection_enabled: bool = Field(
default=True,
description="Whether to enable prompt injection detection. When disabled, no injection checks are performed.",
)

injection_detection_sensitivity: Literal["low", "medium", "high"] = Field(
default="medium",
description="Sensitivity level for prompt injection detection. "
"'low': catches critical patterns only, "
"'medium': catches moderate and critical patterns, "
"'high': catches all patterns including advanced techniques. "
"Use 'low' to reduce false positives in coding/developer-facing contexts.",
)

@root_validator(pre=True)
def check_model_exists_for_input_rails(cls, values):
"""Make sure we have a model for each input rail where one is provided using $model=<model_type>"""
Expand Down
Loading