feat(community): add Prompt Security integration (#920)

lior-ps · web-flow · commit 08569c8d2876 · 2025-02-02T11:38:20.000+01:00
diff --git a/.gitignore b/.gitignore
@@ -63,3 +63,5 @@ docs/**/config
 # Ignoring log files generated by tests
 firebase.json
 scratch.py
+
+.env
diff --git a/docs/user-guides/community/prompt-security.md b/docs/user-guides/community/prompt-security.md
@@ -0,0 +1,42 @@
+# Prompt Security Integration
+
+[Prompt Security AI](https://prompt.security/?utm_medium=github&utm_campaign=nemo-guardrails) helps you protect LLM interactions. This integration enables NeMo Guardrails to use Prompt Security to protect input and output flows.
+
+You'll need to set the following env variables to work with Prompt Security:
+
+1. PS_PROTECT_URL - This is the URL of the protect endpoint given by Prompt Security. This will look like https://[REGION].prompt.security/api/protect where REGION is eu, useast or apac
+2. PS_APP_ID - This is the application ID given by Prompt Security (similar to an API key). You can get it from the admin portal at https://[REGION].prompt.security/ where REGION is eu, useast or apac
+
+## Setup
+
+1. Ensure that you have access to Prompt Security API server (SaaS or on-prem).
+2. Update your `config.yml` file to include the Prompt Security settings:
+
+```yaml
+rails:
+  input:
+    flows:
+      - protect prompt
+  output:
+    flows:
+      - protect response
+```
+
+Don't forget to set the `PS_PROTECT_URL` and `PS_APP_ID` environment variables.
+
+## Usage
+
+Once configured, the Prompt Security integration will automatically:
+
+1. Protect prompts before they are processed by the LLM.
+2. Protect LLM outputs before they are sent back to the user.
+
+The `protect_text` action in `nemoguardrails/library/prompt_security/actions.py` handles the protection process.
+
+## Error Handling
+
+If the Prompt Security API request fails, the integration operates in fail-open mode: the prompt or response is not blocked.
+
+## Notes
+
+For more information on Prompt Security and capabilities, please refer to the [Prompt Security documentation](https://prompt.security/?utm_medium=github&utm_campaign=nemo-guardrails).
diff --git a/docs/user-guides/guardrails-library.md b/docs/user-guides/guardrails-library.md
@@ -23,6 +23,7 @@ NeMo Guardrails comes with a library of built-in guardrails that you can easily
    - [Cleanlab Trustworthiness Score](#cleanlab)
    - [GCP Text Moderation](#gcp-text-moderation)
    - [Private AI PII detection](#private-ai-pii-detection)
+   - [Prompt Security Protection](#prompt-security-protection)
    - OpenAI Moderation API - *[COMING SOON]*
 
 4. Other
@@ -805,6 +806,27 @@ rails:
 
 For more details, check out the [Private AI Integration](./community/privateai.md) page.
 
+
+### Prompt Security Protection
+
+NeMo Guardrails supports using the [Prompt Security API](https://prompt.security/?utm_medium=github&utm_campaign=nemo-guardrails) for protecting input and output flows.
+
+To activate the protection, you need to set the `PS_PROTECT_URL` and `PS_APP_ID` environment variables.
+
+#### Example usage
+
+```yaml
+rails:
+  input:
+    flows:
+      - protect prompt
+  output:
+    flows:
+      - protect response
+```
+
+For more details, check out the [Prompt Security Integration](./community/prompt-security.md) page.
+
 ## Other
 
 ### Jailbreak Detection Heuristics
diff --git a/examples/configs/prompt_security/README.md b/examples/configs/prompt_security/README.md
@@ -0,0 +1,5 @@
+# Prompt Security Configuration Example
+
+This example contains configuration files for using Prompt Security in your NeMo Guardrails project.
+
+For more details on the Prompt Security integration, see [Prompt Security Integration User Guide](../../../docs/user-guides/community/prompt-security.md).
diff --git a/examples/configs/prompt_security/config.yml b/examples/configs/prompt_security/config.yml
@@ -0,0 +1,13 @@
+# Example configuration: guard both the user input and the bot output with
+# Prompt Security. The `protect prompt` / `protect response` rails call the
+# `protect_text` action, which requires the PS_PROTECT_URL and PS_APP_ID
+# environment variables to be set.
+models:
+  - type: main
+    engine: openai
+    model: gpt-4o
+
+rails:
+  # Runs on the user message before it is processed further.
+  input:
+    flows:
+      - protect prompt
+
+  # Runs on the bot message before it is returned.
+  output:
+    flows:
+      - protect response
diff --git a/nemoguardrails/library/prompt_security/__init__.py b/nemoguardrails/library/prompt_security/__init__.py
@@ -0,0 +1,14 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/nemoguardrails/library/prompt_security/actions.py b/nemoguardrails/library/prompt_security/actions.py
@@ -0,0 +1,126 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Prompt/Response protection using Prompt Security."""
+
+import logging
+import os
+from typing import Optional
+
+import httpx
+
+from nemoguardrails.actions import action
+
+log = logging.getLogger(__name__)
+
+
+async def ps_protect_api_async(
+    ps_protect_url: str,
+    ps_app_id: str,
+    prompt: Optional[str] = None,
+    system_prompt: Optional[str] = None,
+    response: Optional[str] = None,
+    user: Optional[str] = None,
+):
+    """Calls Prompt Security Protect API asynchronously.
+
+    Args:
+        ps_protect_url: the URL of the protect endpoint given by Prompt Security.
+        URL is https://[REGION].prompt.security/api/protect where REGION is eu, useast or apac
+
+        ps_app_id: the application ID given by Prompt Security (similar to an API key).
+        Get it from the admin portal at https://[REGION].prompt.security/ where REGION is eu, useast or apac
+
+        prompt: the user message to protect.
+
+        system_prompt: the system message for context.
+
+        response: the bot message to protect.
+
+        user: the user ID or username for context.
+
+    Returns:
+        A dictionary with the following items:
+        - is_blocked: True if the text should be blocked, False otherwise.
+        - is_modified: True if the text should be modified, False otherwise.
+        - modified_text: The modified text if is_modified is True, None otherwise.
+    """
+
+    headers = {
+        "APP-ID": ps_app_id,
+        "Content-Type": "application/json",
+    }
+    payload = {
+        "prompt": prompt,
+        "system_prompt": system_prompt,
+        "response": response,
+        "user": user,
+    }
+    async with httpx.AsyncClient() as client:
+        modified_text = None
+        ps_action = "log"
+        try:
+            ret = await client.post(ps_protect_url, headers=headers, json=payload)
+            res = ret.json()
+            ps_action = res.get("result", {}).get("action", "log")
+            if ps_action == "modify":
+                key = "response" if response else "prompt"
+                modified_text = res.get("result", {}).get(key, {}).get("modified_text")
+        except Exception as e:
+            log.error("Error calling Prompt Security Protect API: %s", e)
+        return {
+            "is_blocked": ps_action == "block",
+            "is_modified": ps_action == "modify",
+            "modified_text": modified_text,
+        }
+
+
+@action(is_system_action=True)
+async def protect_text(
+    user_prompt: Optional[str] = None, bot_response: Optional[str] = None
+):
+    """Protects the given user_prompt or bot_response.
+    Args:
+        user_prompt: The user message to protect.
+        bot_response: The bot message to protect.
+    Returns:
+        A dictionary with the following items:
+        - is_blocked: True if the text should be blocked, False otherwise.
+        - is_modified: True if the text should be modified, False otherwise.
+        - modified_text: The modified text if is_modified is True, None otherwise.
+    Raises:
+        ValueError is returned in one of the following cases:
+        1. If PS_PROTECT_URL env variable is not set.
+        2. If PS_APP_ID env variable is not set.
+        3. If no user_prompt and no bot_response is provided.
+    """
+
+    ps_protect_url = os.getenv("PS_PROTECT_URL")
+    if not ps_protect_url:
+        raise ValueError("PS_PROTECT_URL env variable is required for Prompt Security.")
+
+    ps_app_id = os.getenv("PS_APP_ID")
+    if not ps_app_id:
+        raise ValueError("PS_APP_ID env variable is required for Prompt Security.")
+
+    if bot_response:
+        return await ps_protect_api_async(
+            ps_protect_url, ps_app_id, None, None, bot_response
+        )
+
+    if user_prompt:
+        return await ps_protect_api_async(ps_protect_url, ps_app_id, user_prompt)
+
+    raise ValueError("Neither user_message nor bot_message was provided")
diff --git a/nemoguardrails/library/prompt_security/flows.co b/nemoguardrails/library/prompt_security/flows.co
@@ -0,0 +1,24 @@
+# INPUT RAILS
+
+# Input rail: passes $user_message to the `protect_text` action.
+# - If the result is blocked, the bot emits `inform answer unknown` and the flow stops.
+# - Otherwise, if the result is modified, $user_message is replaced with the
+#   returned `modified_text`.
+@active
+flow protect prompt
+  """Check if the prompt is valid according to Prompt Security."""
+  $result = await protect_text(user_prompt=$user_message)
+  if $result["is_blocked"]
+    bot inform answer unknown
+    stop
+  else if $result["is_modified"]
+    $user_message = $result["modified_text"]
+
+
+# OUTPUT RAILS
+
+# Output rail: passes $bot_message to the `protect_text` action.
+# - If the result is blocked, the bot emits `inform answer unknown` and the flow stops.
+# - Otherwise, if the result is modified, $bot_message is replaced with the
+#   returned `modified_text`.
+@active
+flow protect response
+  """Check if the response is valid according to Prompt Security."""
+  $result = await protect_text(bot_response=$bot_message)
+  if $result["is_blocked"]
+    bot inform answer unknown
+    stop
+  else if $result["is_modified"]
+    $bot_message = $result["modified_text"]
diff --git a/nemoguardrails/library/prompt_security/flows.v1.co b/nemoguardrails/library/prompt_security/flows.v1.co
@@ -0,0 +1,22 @@
+# INPUT RAILS
+
+# Input rail: executes the `protect_text` action on $user_message.
+# - If the result is blocked, the bot emits `inform answer unknown` and the flow stops.
+# - Otherwise, if the result is modified, $user_message is replaced with the
+#   returned `modified_text`.
+define subflow protect prompt
+  """Check if the prompt is valid according to Prompt Security."""
+  $result = execute protect_text(user_prompt=$user_message)
+  if $result["is_blocked"]
+    bot inform answer unknown
+    stop
+  else if $result["is_modified"]
+    $user_message = $result["modified_text"]
+
+
+# OUTPUT RAILS
+
+# Output rail: executes the `protect_text` action on $bot_message.
+# - If the result is blocked, the bot emits `inform answer unknown` and the flow stops.
+# - Otherwise, if the result is modified, $bot_message is replaced with the
+#   returned `modified_text`.
+define subflow protect response
+  """Check if the response is valid according to Prompt Security."""
+  $result = execute protect_text(bot_response=$bot_message)
+  if $result["is_blocked"]
+    bot inform answer unknown
+    stop
+  else if $result["is_modified"]
+    $bot_message = $result["modified_text"]
diff --git a/tests/test_prompt_security.py b/tests/test_prompt_security.py