Commit bf79f04

feat: support streaming_callback as run param for HF Chat generators (#8763)
* feat: support streaming_callback as run param for HF Chat generators
* add tests
Parent: c3d0643

File tree: 5 files changed, +133 -7 lines changed

- haystack/components/generators/chat/hugging_face_api.py
- haystack/components/generators/chat/hugging_face_local.py
- [new release note YAML; path not shown in this capture]
- test/components/generators/chat/test_hugging_face_api.py
- test/components/generators/chat/test_hugging_face_local.py
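To orient the reader, here is a minimal usage sketch of what this commit enables: passing the streaming callback at call time rather than at construction time. The model name and prompt are illustrative; import paths and the `StreamingChunk.content` attribute follow Haystack 2.x as of this commit, and a valid HF API token is assumed to be available in the environment.

```python
from haystack.components.generators.chat import HuggingFaceAPIChatGenerator
from haystack.dataclasses import ChatMessage, StreamingChunk
from haystack.utils.hf import HFGenerationAPIType

def print_chunk(chunk: StreamingChunk):
    # StreamingChunk carries the newly generated text in `content`.
    print(chunk.content, end="", flush=True)

# Note: no streaming_callback at init time.
generator = HuggingFaceAPIChatGenerator(
    api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API,
    api_params={"model": "meta-llama/Llama-2-13b-chat-hf"},  # illustrative model
)

# New in this commit: the callback can be supplied per run() call.
result = generator.run(
    [ChatMessage.from_user("Explain token streaming in one sentence.")],
    streaming_callback=print_chunk,
)
print(result["replies"][0].text)
```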

Diff for: haystack/components/generators/chat/hugging_face_api.py

+14 -4

@@ -220,6 +220,7 @@ def run(
         messages: List[ChatMessage],
         generation_kwargs: Optional[Dict[str, Any]] = None,
         tools: Optional[List[Tool]] = None,
+        streaming_callback: Optional[Callable[[StreamingChunk], None]] = None,
     ):
         """
         Invoke the text generation inference based on the provided messages and generation parameters.
@@ -231,6 +232,9 @@ def run(
         :param tools:
             A list of tools for which the model can prepare calls. If set, it will override the `tools` parameter set
             during component initialization.
+        :param streaming_callback:
+            An optional callable for handling streaming responses. If set, it will override the `streaming_callback`
+            parameter set during component initialization.
         :returns: A dictionary with the following keys:
             - `replies`: A list containing the generated responses as ChatMessage objects.
         """
@@ -245,16 +249,22 @@ def run(
             raise ValueError("Using tools and streaming at the same time is not supported. Please choose one.")
         _check_duplicate_tool_names(tools)

-        if self.streaming_callback:
-            return self._run_streaming(formatted_messages, generation_kwargs)
+        streaming_callback = streaming_callback or self.streaming_callback
+        if streaming_callback:
+            return self._run_streaming(formatted_messages, generation_kwargs, streaming_callback)

         hf_tools = None
         if tools:
             hf_tools = [{"type": "function", "function": {**t.tool_spec}} for t in tools]

         return self._run_non_streaming(formatted_messages, generation_kwargs, hf_tools)

-    def _run_streaming(self, messages: List[Dict[str, str]], generation_kwargs: Dict[str, Any]):
+    def _run_streaming(
+        self,
+        messages: List[Dict[str, str]],
+        generation_kwargs: Dict[str, Any],
+        streaming_callback: Callable[[StreamingChunk], None],
+    ):
         api_output: Iterable[ChatCompletionStreamOutput] = self._client.chat_completion(
             messages, stream=True, **generation_kwargs
         )
@@ -282,7 +292,7 @@ def _run_streaming(self, messages: List[Dict[str, str]], generation_kwargs: Dict
                 first_chunk_time = datetime.now().isoformat()

             stream_chunk = StreamingChunk(text, meta)
-            self.streaming_callback(stream_chunk)  # type: ignore # streaming_callback is not None (verified in the run method)
+            streaming_callback(stream_chunk)

         meta.update(
             {
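The heart of this change is the resolution line `streaming_callback = streaming_callback or self.streaming_callback`: a callback passed to `run()` takes precedence, and the one set at init time serves only as a fallback. A self-contained sketch of that precedence rule follows; the class and names are illustrative, not Haystack APIs.

```python
from typing import Callable, Optional

Callback = Callable[[str], None]

class StreamingComponent:
    """Toy component mirroring the callback-resolution pattern from the diff."""

    def __init__(self, streaming_callback: Optional[Callback] = None):
        self.streaming_callback = streaming_callback

    def resolve_callback(self, streaming_callback: Optional[Callback] = None) -> Optional[Callback]:
        # Run-time argument wins; the init-time value is only the fallback.
        # (Safe with `or` because callables are always truthy.)
        return streaming_callback or self.streaming_callback

init_cb: Callback = lambda text: print("init:", text)
run_cb: Callback = lambda text: print("run:", text)

assert StreamingComponent(init_cb).resolve_callback() is init_cb       # fallback to init-time
assert StreamingComponent().resolve_callback(run_cb) is run_cb         # run-time only
assert StreamingComponent(init_cb).resolve_callback(run_cb) is run_cb  # run-time wins
assert StreamingComponent().resolve_callback() is None                 # streaming disabled
```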

Diff for: haystack/components/generators/chat/hugging_face_local.py

+10 -3

@@ -233,12 +233,18 @@ def from_dict(cls, data: Dict[str, Any]) -> "HuggingFaceLocalChatGenerator":
         return default_from_dict(cls, data)

     @component.output_types(replies=List[ChatMessage])
-    def run(self, messages: List[ChatMessage], generation_kwargs: Optional[Dict[str, Any]] = None):
+    def run(
+        self,
+        messages: List[ChatMessage],
+        generation_kwargs: Optional[Dict[str, Any]] = None,
+        streaming_callback: Optional[Callable[[StreamingChunk], None]] = None,
+    ):
         """
         Invoke text generation inference based on the provided messages and generation parameters.

         :param messages: A list of ChatMessage objects representing the input messages.
         :param generation_kwargs: Additional keyword arguments for text generation.
+        :param streaming_callback: An optional callable for handling streaming responses.
         :returns:
             A list containing the generated responses as ChatMessage instances.
         """
@@ -259,7 +265,8 @@ def run(self, messages: List[ChatMessage], generation_kwargs: Optional[Dict[str,
         if stop_words_criteria:
             generation_kwargs["stopping_criteria"] = StoppingCriteriaList([stop_words_criteria])

-        if self.streaming_callback:
+        streaming_callback = streaming_callback or self.streaming_callback
+        if streaming_callback:
             num_responses = generation_kwargs.get("num_return_sequences", 1)
             if num_responses > 1:
                 msg = (
@@ -270,7 +277,7 @@ def run(self, messages: List[ChatMessage], generation_kwargs: Optional[Dict[str,
             logger.warning(msg, num_responses=num_responses)
             generation_kwargs["num_return_sequences"] = 1
         # streamer parameter hooks into HF streaming, HFTokenStreamingHandler is an adapter to our streaming
-        generation_kwargs["streamer"] = HFTokenStreamingHandler(tokenizer, self.streaming_callback, stop_words)
+        generation_kwargs["streamer"] = HFTokenStreamingHandler(tokenizer, streaming_callback, stop_words)

         hf_messages = [convert_message_to_hf_format(message) for message in messages]
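The same run-time override applies to the local generator; here is a minimal sketch. The model name is illustrative, loading it requires `transformers` plus the model weights, and `warm_up()` must run before `run()`.

```python
from haystack.components.generators.chat import HuggingFaceLocalChatGenerator
from haystack.dataclasses import ChatMessage, StreamingChunk

collected: list = []

def collect_chunk(chunk: StreamingChunk):
    # Accumulate the streamed fragments instead of printing them.
    collected.append(chunk.content)

generator = HuggingFaceLocalChatGenerator(model="HuggingFaceH4/zephyr-7b-beta")  # illustrative
generator.warm_up()  # builds the local text-generation pipeline

# New in this commit: per-call streaming for the local generator too.
result = generator.run(
    messages=[ChatMessage.from_user("Summarize token streaming in one line.")],
    streaming_callback=collect_chunk,
)
print("".join(collected))        # the text as it was streamed
print(result["replies"][0].text) # the final reply
```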

Diff for: [new release note YAML; path not shown in this capture]

+4

@@ -0,0 +1,4 @@
+---
+enhancements:
+  - |
+    Streaming callback run param support for HF chat generators.

Diff for: test/components/generators/chat/test_hugging_face_api.py

+64

@@ -395,6 +395,70 @@ def mock_iter(self):
         assert len(response["replies"]) > 0
         assert [isinstance(reply, ChatMessage) for reply in response["replies"]]

+    def test_run_with_streaming_callback_in_run_method(
+        self, mock_check_valid_model, mock_chat_completion, chat_messages
+    ):
+        streaming_call_count = 0
+
+        # Define the streaming callback function
+        def streaming_callback_fn(chunk: StreamingChunk):
+            nonlocal streaming_call_count
+            streaming_call_count += 1
+            assert isinstance(chunk, StreamingChunk)
+
+        generator = HuggingFaceAPIChatGenerator(
+            api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API,
+            api_params={"model": "meta-llama/Llama-2-13b-chat-hf"},
+        )
+
+        # Create a fake streamed response
+        # self needed here, don't remove
+        def mock_iter(self):
+            yield ChatCompletionStreamOutput(
+                choices=[
+                    ChatCompletionStreamOutputChoice(
+                        delta=ChatCompletionStreamOutputDelta(content="The", role="assistant"),
+                        index=0,
+                        finish_reason=None,
+                    )
+                ],
+                id="some_id",
+                model="some_model",
+                system_fingerprint="some_fingerprint",
+                created=1710498504,
+            )
+
+            yield ChatCompletionStreamOutput(
+                choices=[
+                    ChatCompletionStreamOutputChoice(
+                        delta=ChatCompletionStreamOutputDelta(content=None, role=None), index=0, finish_reason="length"
+                    )
+                ],
+                id="some_id",
+                model="some_model",
+                system_fingerprint="some_fingerprint",
+                created=1710498504,
+            )
+
+        mock_response = Mock(**{"__iter__": mock_iter})
+        mock_chat_completion.return_value = mock_response
+
+        # Generate text response with streaming callback
+        response = generator.run(chat_messages, streaming_callback=streaming_callback_fn)
+
+        # Check kwargs passed to chat_completion
+        _, kwargs = mock_chat_completion.call_args
+        assert kwargs == {"stop": [], "stream": True, "max_tokens": 512}
+
+        # Assert that the streaming callback was called twice
+        assert streaming_call_count == 2
+
+        # Assert that the response contains the generated replies
+        assert "replies" in response
+        assert isinstance(response["replies"], list)
+        assert len(response["replies"]) > 0
+        assert all(isinstance(reply, ChatMessage) for reply in response["replies"])
+
     def test_run_fail_with_tools_and_streaming(self, tools, mock_check_valid_model):
         component = HuggingFaceAPIChatGenerator(
             api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API,
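A side note on the test technique above: `Mock(**{"__iter__": mock_iter})` is how you make a plain Mock iterable. `unittest.mock` attaches magic methods to the mock's per-instance class, because Python looks dunder methods up on the type rather than the instance; that is also why `mock_iter` needs an explicit `self` parameter (it is invoked as a bound method). A self-contained sketch of the same pattern:

```python
from unittest.mock import Mock

# Attached to the mock's type, so it is called as a bound method:
# the mock instance arrives as `self`.
def fake_stream(self):
    yield "first chunk"
    yield "second chunk"

stream = Mock(**{"__iter__": fake_stream})
assert list(stream) == ["first chunk", "second chunk"]
```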

Diff for: test/components/generators/chat/test_hugging_face_local.py

+41

@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 from unittest.mock import Mock, patch

+from haystack.dataclasses.streaming_chunk import StreamingChunk
 import pytest
 from transformers import PreTrainedTokenizer

@@ -233,6 +234,46 @@ def test_run_with_custom_generation_parameters(self, model_info_mock, mock_pipel
         assert chat_message.is_from(ChatRole.ASSISTANT)
         assert chat_message.text == "Berlin is cool"

+    def test_run_with_streaming_callback(self, model_info_mock, mock_pipeline_tokenizer, chat_messages):
+        # Define the streaming callback function
+        def streaming_callback_fn(chunk: StreamingChunk): ...
+
+        generator = HuggingFaceLocalChatGenerator(
+            model="meta-llama/Llama-2-13b-chat-hf", streaming_callback=streaming_callback_fn
+        )
+
+        # Use the mocked pipeline from the fixture and simulate warm_up
+        generator.pipeline = mock_pipeline_tokenizer
+
+        results = generator.run(messages=chat_messages)
+
+        assert "replies" in results
+        assert isinstance(results["replies"][0], ChatMessage)
+        chat_message = results["replies"][0]
+        assert chat_message.is_from(ChatRole.ASSISTANT)
+        assert chat_message.text == "Berlin is cool"
+        generator.pipeline.assert_called_once()
+        assert generator.pipeline.call_args[1]["streamer"].token_handler == streaming_callback_fn
+
+    def test_run_with_streaming_callback_in_run_method(self, model_info_mock, mock_pipeline_tokenizer, chat_messages):
+        # Define the streaming callback function
+        def streaming_callback_fn(chunk: StreamingChunk): ...
+
+        generator = HuggingFaceLocalChatGenerator(model="meta-llama/Llama-2-13b-chat-hf")
+
+        # Use the mocked pipeline from the fixture and simulate warm_up
+        generator.pipeline = mock_pipeline_tokenizer
+
+        results = generator.run(messages=chat_messages, streaming_callback=streaming_callback_fn)
+
+        assert "replies" in results
+        assert isinstance(results["replies"][0], ChatMessage)
+        chat_message = results["replies"][0]
+        assert chat_message.is_from(ChatRole.ASSISTANT)
+        assert chat_message.text == "Berlin is cool"
+        generator.pipeline.assert_called_once()
+        assert generator.pipeline.call_args[1]["streamer"].token_handler == streaming_callback_fn
+
     @patch("haystack.components.generators.chat.hugging_face_local.convert_message_to_hf_format")
     def test_messages_conversion_is_called(self, mock_convert, model_info_mock):
         generator = HuggingFaceLocalChatGenerator(model="fake-model")
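For readers unfamiliar with the `streamer` hook asserted on above: `transformers` generation accepts a streamer object, and Haystack's `HFTokenStreamingHandler` adapts it to the `StreamingChunk` callback. Below is a simplified sketch of such an adapter built on transformers' `TextStreamer`; it illustrates the mechanism and is not the actual Haystack class.

```python
from typing import Callable
from transformers import TextStreamer

class CallbackTokenStreamer(TextStreamer):
    """Forwards each decoded text fragment to a callback instead of stdout (sketch)."""

    def __init__(self, tokenizer, token_handler: Callable[[str], None]):
        super().__init__(tokenizer, skip_prompt=True)
        self.token_handler = token_handler

    def on_finalized_text(self, text: str, stream_end: bool = False):
        # TextStreamer invokes this hook with each decoded span of text.
        self.token_handler(text)

# Usage sketch, mirroring generation_kwargs["streamer"] in the diff above:
# tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
# generation_kwargs = {"streamer": CallbackTokenStreamer(tokenizer, print)}
```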
