Commit 9a15467

fix: fix llama.cpp types; add py.typed; Toolset support (#1973)
* fix: fix llama.cpp types; add py.typed; Toolset support
* missing check

1 parent 6fe3cf7 · commit 9a15467

File tree: 6 files changed (+158 −99 lines)

.github/workflows/llama_cpp.yml (1 addition, 3 deletions)

```diff
@@ -50,11 +50,9 @@ jobs:
       - name: Install Hatch
         run: pip install --upgrade hatch
 
-      # TODO: Once this integration is properly typed, use hatch run test:types
-      # https://github.com/deepset-ai/haystack-core-integrations/issues/1771
       - name: Lint
         if: matrix.python-version == '3.9' && runner.os == 'Linux'
-        run: hatch run fmt-check && hatch run lint:typing
+        run: hatch run fmt-check && hatch run test:types
 
       - name: Generate docs
         if: matrix.python-version == '3.9' && runner.os == 'Linux'
```

integrations/llama_cpp/pyproject.toml (7 additions, 16 deletions)

```diff
@@ -26,7 +26,7 @@ classifiers = [
   "Programming Language :: Python :: Implementation :: CPython",
   "Programming Language :: Python :: Implementation :: PyPy",
 ]
-dependencies = ["haystack-ai>=2.9.0", "llama-cpp-python>=0.2.87"]
+dependencies = ["haystack-ai>=2.13.0", "llama-cpp-python>=0.2.87"]
 
 [project.urls]
 Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/llama_cpp#readme"
@@ -68,18 +68,13 @@ unit = 'pytest -m "not integration" {args:tests}'
 integration = 'pytest -m "integration" {args:tests}'
 all = 'pytest {args:tests}'
 cov-retry = 'all --cov=haystack_integrations --reruns 3 --reruns-delay 30 -x'
-types = "mypy --install-types --non-interactive --explicit-package-bases {args:src/ tests}"
+types = "mypy -p haystack_integrations.components.generators.llama_cpp {args}"
 
-# TODO: remove lint environment once this integration is properly typed
-# test environment should be used instead
-# https://github.com/deepset-ai/haystack-core-integrations/issues/1771
-[tool.hatch.envs.lint]
-installer = "uv"
-detached = true
-dependencies = ["pip", "black>=23.1.0", "mypy>=1.0.0", "ruff>=0.0.243"]
-
-[tool.hatch.envs.lint.scripts]
-typing = "mypy --install-types --non-interactive --explicit-package-bases {args:src/ tests}"
+[tool.mypy]
+install_types = true
+non_interactive = true
+check_untyped_defs = true
+disallow_incomplete_defs = true
 
 [tool.hatch.metadata]
 allow-direct-references = true
@@ -169,7 +164,3 @@ markers = [
   "integration: marks tests as slow (deselect with '-m \"not integration\"')",
 ]
 addopts = ["--import-mode=importlib"]
-
-[[tool.mypy.overrides]]
-module = ["haystack.*", "haystack_integrations.*", "pytest.*", "llama_cpp.*"]
-ignore_missing_imports = true
```
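The old configuration silenced mypy with a blanket `ignore_missing_imports` override; the new `[tool.mypy]` block instead enables `check_untyped_defs` and `disallow_incomplete_defs`, and the `types` script (now also wired into the Lint CI step above) type-checks the installed package. As a rough illustration, a hypothetical snippet (not from this repo) showing what the two new flags catch:

```python
# Hypothetical examples, not part of this commit: what the stricter mypy
# settings report when running `hatch run test:types`.


def render(template, name: str) -> str:  # disallow_incomplete_defs: `template`
    return template.format(name=name)    # lacks an annotation on an otherwise
                                         # annotated function


def shout(text):
    # check_untyped_defs: bodies of fully unannotated functions are now
    # checked too, so this mismatch is reported instead of being skipped.
    return len(text) + "!"  # error: unsupported operand types ("int" and "str")
```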

integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py (95 additions, 58 deletions)

```diff
@@ -1,25 +1,32 @@
 import json
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Union
 
 from haystack import component, default_from_dict, default_to_dict, logging
 from haystack.dataclasses import ChatMessage, ToolCall
-from haystack.tools import Tool, _check_duplicate_tool_names
-
-# Compatibility with Haystack 2.12.0 and 2.13.0 - remove after 2.13.0 is released
-try:
-    from haystack.tools import deserialize_tools_or_toolset_inplace
-except ImportError:
-    from haystack.tools import deserialize_tools_inplace as deserialize_tools_or_toolset_inplace
-
-from llama_cpp import ChatCompletionResponseChoice, CreateChatCompletionResponse, Llama
+from haystack.tools import (
+    Tool,
+    Toolset,
+    _check_duplicate_tool_names,
+    deserialize_tools_or_toolset_inplace,
+    serialize_tools_or_toolset,
+)
+from llama_cpp import (
+    ChatCompletionMessageToolCall,
+    ChatCompletionRequestAssistantMessage,
+    ChatCompletionRequestMessage,
+    ChatCompletionResponseChoice,
+    ChatCompletionTool,
+    CreateChatCompletionResponse,
+    Llama,
+)
 from llama_cpp.llama_tokenizer import LlamaHFTokenizer
 
 logger = logging.getLogger(__name__)
 
 
-def _convert_message_to_llamacpp_format(message: ChatMessage) -> Dict[str, Any]:
+def _convert_message_to_llamacpp_format(message: ChatMessage) -> ChatCompletionRequestMessage:
     """
-    Convert a ChatMessage to the format expected by Ollama Chat API.
+    Convert a ChatMessage to the format expected by llama.cpp Chat API.
     """
     text_contents = message.texts
     tool_calls = message.tool_calls
@@ -33,38 +40,51 @@ def _convert_message_to_llamacpp_format(message: ChatMessage) -> Dict[str, Any]:
         raise ValueError(msg)
 
     role = message._role.value
-    if role == "tool":
-        role = "function"
-
-    llamacpp_msg: Dict[str, Any] = {"role": role}
 
-    if tool_call_results:
+    if role == "tool" and tool_call_results:
         if tool_call_results[0].origin.id is None:
             msg = "`ToolCall` must have a non-null `id` attribute to be used with llama.cpp."
             raise ValueError(msg)
-        llamacpp_msg["content"] = tool_call_results[0].result
-        llamacpp_msg["tool_call_id"] = tool_call_results[0].origin.id
-        # Llama.cpp does not provide a way to communicate errors in tool invocations, so we ignore the error field
-        return llamacpp_msg
-
-    if text_contents:
-        llamacpp_msg["content"] = text_contents[0]
-    if tool_calls:
-        llamacpp_tool_calls = []
-        for tc in tool_calls:
-            if tc.id is None:
-                msg = "`ToolCall` must have a non-null `id` attribute to be used with llama.cpp."
-                raise ValueError(msg)
-            llamacpp_tool_calls.append(
-                {
-                    "id": tc.id,
-                    "type": "function",
-                    # We disable ensure_ascii so special chars like emojis are not converted
-                    "function": {"name": tc.tool_name, "arguments": json.dumps(tc.arguments, ensure_ascii=False)},
-                }
-            )
-        llamacpp_msg["tool_calls"] = llamacpp_tool_calls
-    return llamacpp_msg
+        return {
+            "role": "function",
+            "content": tool_call_results[0].result,
+            "name": tool_call_results[0].origin.tool_name,
+        }
+
+    if role == "system":
+        content = text_contents[0] if text_contents else None
+        return {"role": "system", "content": content}
+
+    if role == "user":
+        content = text_contents[0] if text_contents else None
+        return {"role": "user", "content": content}
+
+    if role == "assistant":
+        result: ChatCompletionRequestAssistantMessage = {"role": "assistant"}
+
+        if text_contents:
+            result["content"] = text_contents[0]
+
+        if tool_calls:
+            llamacpp_tool_calls: List[ChatCompletionMessageToolCall] = []
+            for tc in tool_calls:
+                if tc.id is None:
+                    msg = "`ToolCall` must have a non-null `id` attribute to be used with llama.cpp."
+                    raise ValueError(msg)
+                llamacpp_tool_calls.append(
+                    {
+                        "id": tc.id,
+                        "type": "function",
+                        # We disable ensure_ascii so special chars like emojis are not converted
+                        "function": {"name": tc.tool_name, "arguments": json.dumps(tc.arguments, ensure_ascii=False)},
+                    }
+                )
+            result["tool_calls"] = llamacpp_tool_calls
+
+        return result
+
+    error_msg = f"Unknown role: {role}"
+    raise ValueError(error_msg)
 
 
 @component
```
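The rewritten converter returns llama.cpp's per-role TypedDicts instead of a loose `Dict[str, Any]`. A small sketch of the mapping it performs for an assistant message carrying a tool call (illustrative IDs and values; `ChatMessage` and `ToolCall` are the Haystack dataclasses imported above):

```python
from haystack.dataclasses import ChatMessage, ToolCall

# An assistant message that carries a tool call instead of text...
msg = ChatMessage.from_assistant(
    tool_calls=[ToolCall(id="call_1", tool_name="weather", arguments={"city": "Rome"})]
)

# ..._convert_message_to_llamacpp_format(msg) now produces a typed
# ChatCompletionRequestAssistantMessage:
# {
#     "role": "assistant",
#     "tool_calls": [
#         {
#             "id": "call_1",
#             "type": "function",
#             "function": {"name": "weather", "arguments": '{"city": "Rome"}'},
#         }
#     ],
# }
```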
```diff
@@ -94,7 +114,7 @@ def __init__(
         model_kwargs: Optional[Dict[str, Any]] = None,
         generation_kwargs: Optional[Dict[str, Any]] = None,
         *,
-        tools: Optional[List[Tool]] = None,
+        tools: Optional[Union[List[Tool], Toolset]] = None,
     ):
         """
         :param model: The path of a quantized model for text generation, for example, "zephyr-7b-beta.Q4_0.gguf".
@@ -110,7 +130,8 @@ def __init__(
             For more information on the available kwargs, see
             [llama.cpp documentation](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion).
         :param tools:
-            A list of tools for which the model can prepare calls.
+            A list of tools or a Toolset for which the model can prepare calls.
+            This parameter can accept either a list of `Tool` objects or a `Toolset` instance.
         """
 
         model_kwargs = model_kwargs or {}
@@ -122,14 +143,14 @@ def __init__(
         model_kwargs.setdefault("n_ctx", n_ctx)
         model_kwargs.setdefault("n_batch", n_batch)
 
-        _check_duplicate_tool_names(tools)
+        _check_duplicate_tool_names(list(tools or []))
 
         self.model_path = model
         self.n_ctx = n_ctx
         self.n_batch = n_batch
         self.model_kwargs = model_kwargs
         self.generation_kwargs = generation_kwargs
-        self._model = None
+        self._model: Optional[Llama] = None
         self.tools = tools
 
     def warm_up(self):
@@ -147,15 +168,14 @@ def to_dict(self) -> Dict[str, Any]:
         :returns:
             Dictionary with serialized data.
         """
-        serialized_tools = [tool.to_dict() for tool in self.tools] if self.tools else None
         return default_to_dict(
             self,
             model=self.model_path,
             n_ctx=self.n_ctx,
             n_batch=self.n_batch,
             model_kwargs=self.model_kwargs,
             generation_kwargs=self.generation_kwargs,
-            tools=serialized_tools,
+            tools=serialize_tools_or_toolset(self.tools),
         )
 
     @classmethod
@@ -177,8 +197,8 @@ def run(
         messages: List[ChatMessage],
         generation_kwargs: Optional[Dict[str, Any]] = None,
         *,
-        tools: Optional[List[Tool]] = None,
-    ):
+        tools: Optional[Union[List[Tool], Toolset]] = None,
+    ) -> Dict[str, List[ChatMessage]]:
         """
         Run the text generation model on the given list of ChatMessages.
 
@@ -188,8 +208,8 @@ def run(
            For more information on the available kwargs, see
            [llama.cpp documentation](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion).
         :param tools:
-            A list of tools for which the model can prepare calls. If set, it will override the `tools` parameter set
-            during component initialization.
+            A list of tools or a Toolset for which the model can prepare calls. If set, it will override the `tools`
+            parameter set during component initialization.
         :returns: A dictionary with the following keys:
             - `replies`: The responses from the model
         """
@@ -204,16 +224,33 @@ def run(
         formatted_messages = [_convert_message_to_llamacpp_format(msg) for msg in messages]
 
         tools = tools or self.tools
-        llamacpp_tools = {}
+        if isinstance(tools, Toolset):
+            tools = list(tools)
+        _check_duplicate_tool_names(tools)
+
+        llamacpp_tools: List[ChatCompletionTool] = []
         if tools:
-            tool_definitions = [{"type": "function", "function": {**t.tool_spec}} for t in tools]
-            llamacpp_tools = {"tools": tool_definitions}
+            for t in tools:
+                llamacpp_tools.append(
+                    {
+                        "type": "function",
+                        "function": {
+                            "name": t.tool_spec["name"],
+                            "description": t.tool_spec.get("description", ""),
+                            "parameters": t.tool_spec.get("parameters", {}),
+                        },
+                    }
+                )
 
         response = self._model.create_chat_completion(
-            messages=formatted_messages, **updated_generation_kwargs, **llamacpp_tools
+            messages=formatted_messages, tools=llamacpp_tools, **updated_generation_kwargs
         )
 
         replies = []
+        if not isinstance(response, dict):
+            msg = f"Expected a dictionary response, got a different object: {response}"
+            raise ValueError(msg)
+
         for choice in response["choices"]:
             chat_message = self._convert_chat_completion_choice_to_chat_message(choice, response)
             replies.append(chat_message)
@@ -239,10 +276,10 @@ def _convert_chat_completion_choice_to_chat_message(
         except json.JSONDecodeError:
             logger.warning(
                 "Llama.cpp returned a malformed JSON string for tool call arguments. This tool call "
-                "will be skipped. Tool call ID: %s, Tool name: %s, Arguments: %s",
-                llamacpp_tc["id"],
-                llamacpp_tc["function"]["name"],
-                arguments_str,
+                "will be skipped. Tool call ID: {tc_id}, Tool name: {tc_name}, Arguments: {tc_args}",
+                tc_id=llamacpp_tc["id"],
+                tc_name=llamacpp_tc["function"]["name"],
+                tc_args=arguments_str,
             )
 
         meta = {
```
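Together these changes let a `Toolset` be passed anywhere a `List[Tool]` was accepted before, both at construction and per `run()` call. A minimal usage sketch (the model path and the weather tool are illustrative, not from this commit):

```python
from haystack.dataclasses import ChatMessage
from haystack.tools import Tool, Toolset
from haystack_integrations.components.generators.llama_cpp import LlamaCppChatGenerator


def get_weather(city: str) -> str:
    """Toy tool function returning canned weather data."""
    return f"Sunny in {city}"


weather_tool = Tool(
    name="weather",
    description="Get the current weather for a city",
    parameters={
        "type": "object",
        "properties": {"city": {"type": "string"}},
        "required": ["city"],
    },
    function=get_weather,
)

# A Toolset is now accepted directly, both in __init__ and in run().
generator = LlamaCppChatGenerator(model="model.gguf", tools=Toolset([weather_tool]))
generator.warm_up()

result = generator.run(messages=[ChatMessage.from_user("What's the weather in Rome?")])
print(result["replies"][0])  # a ChatMessage, possibly carrying a ToolCall
```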

integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/generator.py (10 additions, 4 deletions)

```diff
@@ -1,4 +1,4 @@
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Union
 
 from haystack import component, logging
 
@@ -62,14 +62,16 @@ def __init__(
         self.n_batch = n_batch
         self.model_kwargs = model_kwargs
         self.generation_kwargs = generation_kwargs
-        self.model = None
+        self.model: Optional[Llama] = None
 
     def warm_up(self):
         if self.model is None:
             self.model = Llama(**self.model_kwargs)
 
     @component.output_types(replies=List[str], meta=List[Dict[str, Any]])
-    def run(self, prompt: str, generation_kwargs: Optional[Dict[str, Any]] = None):
+    def run(
+        self, prompt: str, generation_kwargs: Optional[Dict[str, Any]] = None
+    ) -> Dict[str, Union[List[str], List[Dict[str, Any]]]]:
         """
         Run the text generation model on the given prompt.
 
@@ -92,6 +94,10 @@ def run(self, prompt: str, generation_kwargs: Optional[Dict[str, Any]] = None):
         updated_generation_kwargs = {**self.generation_kwargs, **(generation_kwargs or {})}
 
         output = self.model.create_completion(prompt=prompt, **updated_generation_kwargs)
+        if not isinstance(output, dict):
+            msg = f"Expected a dictionary response, got a different object: {output}"
+            raise ValueError(msg)
+
         replies = [output["choices"][0]["text"]]
 
-        return {"replies": replies, "meta": [output]}
+        return {"replies": replies, "meta": [dict(output.items())]}
```

integrations/llama_cpp/src/haystack_integrations/components/generators/py.typed

New, empty marker file (whitespace-only changes).
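`py.typed` is the PEP 561 marker: its presence tells type checkers that this package portion ships inline annotations, so downstream projects see real types instead of `Any`. A sketch of the effect (the `reveal_type` call is evaluated by mypy, not at runtime):

```python
from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator

generator = LlamaCppGenerator(model="model.gguf")
result = generator.run(prompt="hello")

# Before this commit mypy treated the package as untyped; with py.typed it
# resolves roughly to Dict[str, Union[List[str], List[Dict[str, Any]]]]
reveal_type(result)
```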
