Skip to content

Commit 21dd03d

Browse files
vblagoje and anakin87
authored
feat: Add completion start time timestamp to relevant generators (#8728)
* OpenAIChatGenerator - add completion_start_time * HuggingFaceAPIChatGenerator - add completion_start_time * Add tests * Add reno note * Relax condition for cached responses * Add completion_start_time timestamping to non-chat generators * Update haystack/components/generators/chat/hugging_face_api.py Co-authored-by: Stefano Fiorucci <[email protected]> * PR feedback --------- Co-authored-by: Stefano Fiorucci <[email protected]>
1 parent 62ac27c commit 21dd03d

File tree

9 files changed

+71
-7
lines changed

9 files changed

+71
-7
lines changed

haystack/components/generators/chat/hugging_face_api.py

+6
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#
33
# SPDX-License-Identifier: Apache-2.0
44

5+
from datetime import datetime
56
from typing import Any, Callable, Dict, Iterable, List, Optional, Union
67

78
from haystack import component, default_from_dict, default_to_dict, logging
@@ -259,6 +260,7 @@ def _run_streaming(self, messages: List[Dict[str, str]], generation_kwargs: Dict
259260
)
260261

261262
generated_text = ""
263+
first_chunk_time = None
262264

263265
for chunk in api_output:
264266
# n is unused, so the API always returns only one choice
@@ -276,6 +278,9 @@ def _run_streaming(self, messages: List[Dict[str, str]], generation_kwargs: Dict
276278
if finish_reason:
277279
meta["finish_reason"] = finish_reason
278280

281+
if first_chunk_time is None:
282+
first_chunk_time = datetime.now().isoformat()
283+
279284
stream_chunk = StreamingChunk(text, meta)
280285
self.streaming_callback(stream_chunk) # type: ignore # streaming_callback is not None (verified in the run method)
281286

@@ -285,6 +290,7 @@ def _run_streaming(self, messages: List[Dict[str, str]], generation_kwargs: Dict
285290
"finish_reason": finish_reason,
286291
"index": 0,
287292
"usage": {"prompt_tokens": 0, "completion_tokens": 0}, # not available in streaming
293+
"completion_start_time": first_chunk_time,
288294
}
289295
)
290296

haystack/components/generators/chat/openai.py

+3
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
import json
66
import os
7+
from datetime import datetime
78
from typing import Any, Callable, Dict, List, Optional, Union
89

910
from openai import OpenAI, Stream
@@ -381,6 +382,7 @@ def _convert_streaming_chunks_to_chat_message(self, chunk: Any, chunks: List[Str
381382
"model": chunk.model,
382383
"index": 0,
383384
"finish_reason": chunk.choices[0].finish_reason,
385+
"completion_start_time": chunks[0].meta.get("received_at"), # first chunk received
384386
"usage": {}, # we don't have usage data for streaming responses
385387
}
386388

@@ -444,6 +446,7 @@ def _convert_chat_completion_chunk_to_streaming_chunk(self, chunk: ChatCompletio
444446
"index": choice.index,
445447
"tool_calls": choice.delta.tool_calls,
446448
"finish_reason": choice.finish_reason,
449+
"received_at": datetime.now().isoformat(),
447450
}
448451
)
449452
return chunk_message

haystack/components/generators/hugging_face_api.py

+9
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
# SPDX-License-Identifier: Apache-2.0
44

55
from dataclasses import asdict
6+
from datetime import datetime
67
from typing import Any, Callable, Dict, Iterable, List, Optional, Union
78

89
from haystack import component, default_from_dict, default_to_dict, logging
@@ -217,18 +218,26 @@ def _stream_and_build_response(
217218
self, hf_output: Iterable["TextGenerationStreamOutput"], streaming_callback: Callable[[StreamingChunk], None]
218219
):
219220
chunks: List[StreamingChunk] = []
221+
first_chunk_time = None
222+
220223
for chunk in hf_output:
221224
token: TextGenerationOutputToken = chunk.token
222225
if token.special:
223226
continue
227+
224228
chunk_metadata = {**asdict(token), **(asdict(chunk.details) if chunk.details else {})}
229+
if first_chunk_time is None:
230+
first_chunk_time = datetime.now().isoformat()
231+
225232
stream_chunk = StreamingChunk(token.text, chunk_metadata)
226233
chunks.append(stream_chunk)
227234
streaming_callback(stream_chunk)
235+
228236
metadata = {
229237
"finish_reason": chunks[-1].meta.get("finish_reason", None),
230238
"model": self._client.model,
231239
"usage": {"completion_tokens": chunks[-1].meta.get("generated_tokens", 0)},
240+
"completion_start_time": first_chunk_time,
232241
}
233242
return {"replies": ["".join([chunk.content for chunk in chunks])], "meta": [metadata]}
234243

haystack/components/generators/openai.py

+10-4
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
# SPDX-License-Identifier: Apache-2.0
44

55
import os
6+
from datetime import datetime
67
from typing import Any, Callable, Dict, List, Optional, Union
78

89
from openai import OpenAI, Stream
@@ -255,7 +256,7 @@ def _create_message_from_chunks(
255256
"model": completion_chunk.model,
256257
"index": 0,
257258
"finish_reason": finish_reason,
258-
# Usage is available when streaming only if the user explicitly requests it
259+
"completion_start_time": streamed_chunks[0].meta.get("received_at"), # first chunk received
259260
"usage": dict(completion_chunk.usage or {}),
260261
}
261262
)
@@ -296,12 +297,17 @@ def _build_chunk(chunk: Any) -> StreamingChunk:
296297
:returns:
297298
The StreamingChunk.
298299
"""
299-
# function or tools calls are not going to happen in non-chat generation
300-
# as users can not send ChatMessage with function or tools calls
301300
choice = chunk.choices[0]
302301
content = choice.delta.content or ""
303302
chunk_message = StreamingChunk(content)
304-
chunk_message.meta.update({"model": chunk.model, "index": choice.index, "finish_reason": choice.finish_reason})
303+
chunk_message.meta.update(
304+
{
305+
"model": chunk.model,
306+
"index": choice.index,
307+
"finish_reason": choice.finish_reason,
308+
"received_at": datetime.now().isoformat(),
309+
}
310+
)
305311
return chunk_message
306312

307313
@staticmethod
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
---
2+
enhancements:
3+
- |
4+
Added completion_start_time metadata to track time-to-first-token (TTFT) in streaming responses from Hugging Face API and OpenAI (Azure).

test/components/generators/chat/test_hugging_face_api.py

+8-3
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]>
22
#
33
# SPDX-License-Identifier: Apache-2.0
4+
from datetime import datetime
45
import os
56
from unittest.mock import MagicMock, Mock, patch
67

@@ -503,9 +504,13 @@ def test_live_run_serverless_streaming(self):
503504
assert isinstance(response["replies"], list)
504505
assert len(response["replies"]) > 0
505506
assert [isinstance(reply, ChatMessage) for reply in response["replies"]]
506-
assert "usage" in response["replies"][0].meta
507-
assert "prompt_tokens" in response["replies"][0].meta["usage"]
508-
assert "completion_tokens" in response["replies"][0].meta["usage"]
507+
508+
response_meta = response["replies"][0].meta
509+
assert "completion_start_time" in response_meta
510+
assert datetime.fromisoformat(response_meta["completion_start_time"]) <= datetime.now()
511+
assert "usage" in response_meta
512+
assert "prompt_tokens" in response_meta["usage"]
513+
assert "completion_tokens" in response_meta["usage"]
509514

510515
@pytest.mark.integration
511516
@pytest.mark.skipif(

test/components/generators/chat/test_openai.py

+4
Original file line numberDiff line numberDiff line change
@@ -546,6 +546,10 @@ def __call__(self, chunk: StreamingChunk) -> None:
546546
assert callback.counter > 1
547547
assert "Paris" in callback.responses
548548

549+
# check that the completion_start_time is set and valid ISO format
550+
assert "completion_start_time" in message.meta
551+
assert datetime.fromisoformat(message.meta["completion_start_time"]) < datetime.now()
552+
549553
@pytest.mark.skipif(
550554
not os.environ.get("OPENAI_API_KEY", None),
551555
reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.",

test/components/generators/test_hugging_face_api.py

+23
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
# SPDX-License-Identifier: Apache-2.0
44
import os
55
from unittest.mock import MagicMock, Mock, patch
6+
from datetime import datetime
67

78
import pytest
89
from huggingface_hub import (
@@ -312,3 +313,25 @@ def test_run_serverless(self):
312313
assert isinstance(response["meta"], list)
313314
assert len(response["meta"]) > 0
314315
assert [isinstance(meta, dict) for meta in response["meta"]]
316+
317+
@pytest.mark.integration
318+
@pytest.mark.skipif(
319+
not os.environ.get("HF_API_TOKEN", None),
320+
reason="Export an env var called HF_API_TOKEN containing the Hugging Face token to run this test.",
321+
)
322+
def test_live_run_streaming_check_completion_start_time(self):
323+
generator = HuggingFaceAPIGenerator(
324+
api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API,
325+
api_params={"model": "HuggingFaceH4/zephyr-7b-beta"},
326+
streaming_callback=streaming_callback_handler,
327+
)
328+
329+
results = generator.run("What is the capital of France?")
330+
331+
assert len(results["replies"]) == 1
332+
assert "Paris" in results["replies"][0]
333+
334+
# Verify completion start time in final metadata
335+
assert "completion_start_time" in results["meta"][0]
336+
completion_start = datetime.fromisoformat(results["meta"][0]["completion_start_time"])
337+
assert completion_start <= datetime.now()

test/components/generators/test_openai.py

+4
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]>
22
#
33
# SPDX-License-Identifier: Apache-2.0
4+
from datetime import datetime
45
import logging
56
import os
67
from typing import List
@@ -286,6 +287,9 @@ def __call__(self, chunk: StreamingChunk) -> None:
286287
assert "gpt-4o-mini" in metadata["model"]
287288
assert metadata["finish_reason"] == "stop"
288289

290+
assert "completion_start_time" in metadata
291+
assert datetime.fromisoformat(metadata["completion_start_time"]) <= datetime.now()
292+
289293
# unfortunately, the usage is not available for streaming calls
290294
# we keep the key in the metadata for compatibility
291295
assert "usage" in metadata and len(metadata["usage"]) == 0

0 commit comments

Comments (0)