Skip to content

Commit c2fb77f

Browse files
committed
fix: finalize name changes and default values
1 parent edc834d commit c2fb77f

File tree

4 files changed

+49
-47
lines changed

4 files changed

+49
-47
lines changed

nemoguardrails/rails/llm/buffer.py

Lines changed: 27 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -25,33 +25,37 @@ class BufferStrategy(ABC):
2525
def from_config(cls, config: OutputRailsStreamingConfig) -> "BufferStrategy":
2626
pass
2727

28+
# The abstract method is not async to ensure the return type
29+
# matches the async generator in the concrete implementation.
2830
@abstractmethod
29-
async def __call__(self, streaming_handler) -> AsyncGenerator:
31+
def __call__(
32+
self, streaming_handler
33+
) -> AsyncGenerator[Tuple[List[str], str], None]:
3034
pass
3135

3236
@abstractmethod
3337
def generate_chunk_str(self, *args, **kwargs) -> str:
3438
pass
3539

3640

37-
class SlidingWindow(BufferStrategy):
38-
"""DRFAT: A minimal buffer strategy that buffers chunks and yields them when the buffer is full."""
41+
class RollingBuffer(BufferStrategy):
42+
"""A minimal buffer strategy that buffers chunks and yields them when the buffer is full.
3943
40-
# - **chunk_size (X)**: This would correspond to the number of tokens in each chunk processed by the `streaming_handler`.
41-
# - **max_validation_length (N)**: This would correspond to the `look_back_size` parameter in the code, representing the maximum number of lookback chunks.
42-
#
43-
# In the code:
44-
# - `window_size` represents the number of chunks to process in each window.
45-
# - `look_back_size` represents the number of previous chunks to include in the window for context.
44+
Args:
45+
buffer_context_size (int): The number of tokens carried over from the previous chunk to provide context for continuity in processing.
46+
buffer_chunk_size (int): The number of tokens in each processing chunk. This is the size of the token block on which output rails are applied.
47+
"""
4648

47-
def __init__(self, look_back_size: int = 5, window_size: int = 10):
48-
self.look_back_size = look_back_size
49-
self.window_size = window_size
49+
def __init__(self, buffer_context_size: int = 5, buffer_chunk_size: int = 10):
50+
self.buffer_context_size = buffer_context_size
51+
self.buffer_chunk_size = buffer_chunk_size
5052
self.last_index = 0
5153

5254
@classmethod
5355
def from_config(cls, config: OutputRailsStreamingConfig):
54-
return cls(look_back_size=config.look_back_size, window_size=config.window_size)
56+
return cls(
57+
buffer_context_size=config.context_size, buffer_chunk_size=config.chunk_size
58+
)
5559

5660
async def __call__(
5761
self, streaming_handler
@@ -62,30 +66,26 @@ async def __call__(
6266
async for chunk in streaming_handler:
6367
buffer.append(chunk)
6468
index += 1
65-
# TODO: this is done in StreamingHandler, we need to find away to remove this duplication
66-
# print(f"\033[92m{chunk}\033[0m", end="", flush=True)
67-
# the hackish solution in StreamingHandler is resolved in Chat ClI, we should not alter interfaces
68-
# when we have stream_async we must use it everywhere, adding enable_print will cause headaches
69-
# then this hackish solution will cause a cancer of this hackish solution and will contaminate the whole codebase
7069

71-
if len(buffer) >= self.window_size:
70+
if len(buffer) >= self.buffer_chunk_size:
7271
yield (
73-
# buffer is used to apply output rails
74-
buffer[-self.window_size - self.look_back_size :],
75-
# this is what gets printed in the console or yield to user
72+
# we apply output rails on the buffer
73+
buffer[-self.buffer_chunk_size - self.buffer_context_size :],
74+
# generate_chunk_str is what gets printed in the console or yield to user
7675
# to avoid repeating the already streamed/printed chunk
7776
self.generate_chunk_str(
78-
buffer[-self.window_size - self.look_back_size :], index
77+
buffer[-self.buffer_chunk_size - self.buffer_context_size :],
78+
index,
7979
),
8080
)
81-
buffer = buffer[-self.look_back_size :]
81+
buffer = buffer[-self.buffer_context_size :]
8282

8383
# Yield any remaining buffer if it's not empty
8484
if buffer:
8585
yield (
8686
buffer,
8787
self.generate_chunk_str(
88-
buffer[-self.window_size - self.look_back_size :], index
88+
buffer[-self.buffer_chunk_size - self.buffer_context_size :], index
8989
),
9090
)
9191

@@ -104,5 +104,5 @@ def generate_chunk_str(self, buffer, current_index) -> str:
104104

105105
def get_buffer_strategy(config: OutputRailsStreamingConfig) -> BufferStrategy:
106106
# TODO: use a factory function or class
107-
# currently we only have SlidingWindow, in future we use a registry
108-
return SlidingWindow.from_config(config)
107+
# currently we only have RollingBuffer, in future we use a registry
108+
return RollingBuffer.from_config(config)

nemoguardrails/rails/llm/config.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -305,14 +305,22 @@ class InputRails(BaseModel):
305305

306306

307307
class OutputRailsStreamingConfig(BaseModel):
308+
"""Configuration for managing streaming output of LLM tokens."""
309+
308310
enabled: bool = Field(
309-
default=False, description="Indicates if streaming is enabled."
311+
default=False, description="Enables streaming mode when True."
312+
)
313+
chunk_size: int = Field(
314+
default=200,
315+
description="The number of tokens in each processing chunk. This is the size of the token block on which output rails are applied.",
316+
)
317+
context_size: int = Field(
318+
default=50,
319+
description="The number of tokens carried over from the previous chunk to provide context for continuity in processing.",
310320
)
311-
look_back_size: int = Field(default=5, description="The look back size.")
312-
window_size: int = Field(default=10, description="The window size.")
313321
stream_first: bool = Field(
314322
default=True,
315-
description="Prioritizes streaming chunks before applying output rails.",
323+
description="If True, token chunks are streamed immediately before output rails are applied.",
316324
)
317325
model_config = ConfigDict(extra="allow")
318326

tests/test_buffer_strategy.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616
import pytest
1717

18-
from nemoguardrails.rails.llm.buffer import SlidingWindow as BufferStrategy
18+
from nemoguardrails.rails.llm.buffer import RollingBuffer as BufferStrategy
1919

2020

2121
async def fake_streaming_handler():
@@ -26,7 +26,7 @@ async def fake_streaming_handler():
2626

2727
@pytest.mark.asyncio
2828
async def test_buffer_strategy():
29-
buffer_strategy = BufferStrategy(look_back_size=5, window_size=10)
29+
buffer_strategy = BufferStrategy(buffer_context_size=5, buffer_chunk_size=10)
3030
streaming_handler = fake_streaming_handler()
3131

3232
expected_buffers = [
@@ -69,7 +69,7 @@ async def async_enumerate(aiterable, start=0):
6969

7070

7171
async def test_generate_chunk_str():
72-
buffer_strategy = BufferStrategy(look_back_size=5, window_size=10)
72+
buffer_strategy = BufferStrategy(buffer_context_size=5, buffer_chunk_size=10)
7373
buffer = ["chunk0", "chunk1", "chunk2", "chunk3", "chunk4", "chunk5"]
7474
current_index = 6
7575

tests/test_streaming.py

Lines changed: 7 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -266,8 +266,8 @@ def output_rails_streaming_config():
266266
"flows": {"self check output"},
267267
"streaming": {
268268
"enabled": True,
269-
"window_size": 4,
270-
"look_back_size": 2,
269+
"chunk_size": 4,
270+
"context_size": 2,
271271
"stream_first": False,
272272
},
273273
}
@@ -403,15 +403,9 @@ async def test_streaming_output_rails_blocked_at_first_call(
403403
await asyncio.gather(*asyncio.all_tasks() - {asyncio.current_task()})
404404

405405

406-
def _calculate_number_of_actions(M, W, N):
407-
"""
408-
M: input_length
409-
W: window_size
410-
N: look_back_size
411-
"""
412-
413-
if W <= N:
414-
raise ValueError("Window size must be greater than look-back size.")
415-
if M <= W:
406+
def _calculate_number_of_actions(input_length, chunk_size, context_size):
407+
if chunk_size <= context_size:
408+
raise ValueError("chunk_size must be greater than context_size.")
409+
if input_length <= chunk_size:
416410
return 1
417-
return math.ceil((M - N) / (W - N))
411+
return math.ceil((input_length - context_size) / (chunk_size - context_size))

0 commit comments

Comments
 (0)