
Commit ff57a5a

Merge branch 'main' into feat/async_pipeline
2 parents f2bde4f + 1785ea6

File tree: 10 files changed (+300 −17)

Diff for: docs/pydoc/config/preprocessors_api.yml (+1 −1)

@@ -1,7 +1,7 @@
 loaders:
   - type: haystack_pydoc_tools.loaders.CustomPythonLoader
     search_path: [../../../haystack/components/preprocessors]
-    modules: ["document_cleaner", "document_splitter", "recursive_splitter", "text_cleaner"]
+    modules: ["csv_document_cleaner", "document_cleaner", "document_splitter", "recursive_splitter", "text_cleaner"]
     ignore_when_discovered: ["__init__"]
 processors:
   - type: filter

Diff for: haystack/components/generators/chat/hugging_face_api.py (+18 −7)

@@ -15,6 +15,7 @@
 
 with LazyImport(message="Run 'pip install \"huggingface_hub[inference]>=0.27.0\"'") as huggingface_hub_import:
     from huggingface_hub import (
+        ChatCompletionInputFunctionDefinition,
         ChatCompletionInputTool,
         ChatCompletionOutput,
         ChatCompletionStreamOutput,
@@ -255,8 +256,15 @@ def run(
 
         hf_tools = None
         if tools:
-            hf_tools = [{"type": "function", "function": {**t.tool_spec}} for t in tools]
-
+            hf_tools = [
+                ChatCompletionInputTool(
+                    function=ChatCompletionInputFunctionDefinition(
+                        name=tool.name, description=tool.description, arguments=tool.parameters
+                    ),
+                    type="function",
+                )
+                for tool in tools
+            ]
         return self._run_non_streaming(formatted_messages, generation_kwargs, hf_tools)
 
     def _run_streaming(
@@ -278,13 +286,12 @@ def _run_streaming(
            # see https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.chat_completion.n
            choice = chunk.choices[0]
 
-            text = choice.delta.content
-            if text:
-                generated_text += text
+            text = choice.delta.content or ""
+            generated_text += text
 
            finish_reason = choice.finish_reason
 
-        meta = {}
+        meta: Dict[str, Any] = {}
         if finish_reason:
             meta["finish_reason"] = finish_reason
@@ -336,7 +343,11 @@ def _run_non_streaming(
            )
            tool_calls.append(tool_call)
 
-        meta = {"model": self._client.model, "finish_reason": choice.finish_reason, "index": choice.index}
+        meta: Dict[str, Any] = {
+            "model": self._client.model,
+            "finish_reason": choice.finish_reason,
+            "index": choice.index,
+        }
 
         usage = {"prompt_tokens": 0, "completion_tokens": 0}
         if api_chat_output.usage:
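
Two things change above: the hand-rolled dict tool payload is replaced with huggingface_hub's typed input classes, and the streaming loop coalesces a possibly-None delta (`choice.delta.content or ""`) so concatenation cannot fail. A minimal sketch of the resulting tool conversion, using the same fields as the diff (the weather tool itself is a made-up example, not from the PR):

from huggingface_hub import ChatCompletionInputFunctionDefinition, ChatCompletionInputTool

# Hypothetical tool definition; name/description/arguments mirror the
# Tool attributes referenced in the diff above.
weather_tool = ChatCompletionInputTool(
    function=ChatCompletionInputFunctionDefinition(
        name="get_weather",
        description="Return the current weather for a city.",
        arguments={
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    ),
    type="function",
)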

Diff for: haystack/components/generators/chat/hugging_face_local.py (+1 −1)

@@ -145,7 +145,7 @@ def __init__(  # pylint: disable=too-many-positional-arguments
         elif isinstance(huggingface_pipeline_kwargs["model"], str):
             task = model_info(
                 huggingface_pipeline_kwargs["model"], token=huggingface_pipeline_kwargs["token"]
-            ).pipeline_tag
+            ).pipeline_tag  # type: ignore[assignment]  # we'll check below if task is in supported tasks
 
         if task not in PIPELINE_SUPPORTED_TASKS:
             raise ValueError(
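
The ignore is needed presumably because `pipeline_tag` is optional on the hub's model metadata while `task` is typed as `str`; it is safe because the `PIPELINE_SUPPORTED_TASKS` membership check right below rejects `None`. A self-contained sketch of the same pattern (stand-in names, not the real `model_info` signature):

from typing import Optional

def lookup_tag() -> Optional[str]:  # stands in for model_info(...).pipeline_tag
    return "text-generation"

SUPPORTED = {"text-generation", "text2text-generation"}

task: str = lookup_tag()  # type: ignore[assignment]  # validated right below
if task not in SUPPORTED:
    raise ValueError(f"Unsupported task: {task}")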

Diff for: haystack/components/generators/chat/openai.py (+3 −2)

@@ -431,8 +431,9 @@ def _convert_chat_completion_chunk_to_streaming_chunk(self, chunk: ChatCompletio
         Converts the streaming response chunk from the OpenAI API to a StreamingChunk.
 
         :param chunk: The chunk returned by the OpenAI API.
-        :param choice: The choice returned by the OpenAI API.
-        :return: The StreamingChunk.
+
+        :returns:
+            The StreamingChunk.
         """
         # we stream the content of the chunk if it's not a tool or function call
         choice: ChunkChoice = chunk.choices[0]

Diff for: haystack/components/generators/hugging_face_api.py (+5 −4)

@@ -4,7 +4,7 @@
 
 from dataclasses import asdict
 from datetime import datetime
-from typing import Any, Callable, Dict, Iterable, List, Optional, Union
+from typing import Any, Callable, Dict, Iterable, List, Optional, Union, cast
 
 from haystack import component, default_from_dict, default_to_dict, logging
 from haystack.dataclasses import StreamingChunk
@@ -17,8 +17,8 @@
     from huggingface_hub import (
         InferenceClient,
         TextGenerationOutput,
-        TextGenerationOutputToken,
         TextGenerationStreamOutput,
+        TextGenerationStreamOutputToken,
     )
 
 
@@ -212,7 +212,8 @@ def run(
         if streaming_callback is not None:
             return self._stream_and_build_response(hf_output, streaming_callback)
 
-        return self._build_non_streaming_response(hf_output)
+        # mypy doesn't know that hf_output is a TextGenerationOutput, so we cast it
+        return self._build_non_streaming_response(cast(TextGenerationOutput, hf_output))
 
     def _stream_and_build_response(
         self, hf_output: Iterable["TextGenerationStreamOutput"], streaming_callback: Callable[[StreamingChunk], None]
@@ -221,7 +222,7 @@
         first_chunk_time = None
 
         for chunk in hf_output:
-            token: TextGenerationOutputToken = chunk.token
+            token: TextGenerationStreamOutputToken = chunk.token
             if token.special:
                 continue
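
The added `cast` only narrows the type for mypy; nothing changes at runtime. A self-contained sketch of the pattern with stand-in classes (the real Union comes from `InferenceClient.text_generation`, whose return type depends on the `stream` flag):

from typing import Iterable, Union, cast

class FullOutput:  # stands in for TextGenerationOutput
    generated_text = "hello"

class StreamChunk:  # stands in for TextGenerationStreamOutput
    pass

def generate(stream: bool) -> Union[FullOutput, Iterable[StreamChunk]]:
    return iter([StreamChunk()]) if stream else FullOutput()

hf_output = generate(stream=False)
# mypy only sees the Union; we know the non-streaming branch returns FullOutput.
print(cast(FullOutput, hf_output).generated_text)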

Diff for: haystack/components/preprocessors/__init__.py (+2 −1)

@@ -2,9 +2,10 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+from .csv_document_cleaner import CSVDocumentCleaner
 from .document_cleaner import DocumentCleaner
 from .document_splitter import DocumentSplitter
 from .recursive_splitter import RecursiveDocumentSplitter
 from .text_cleaner import TextCleaner
 
-__all__ = ["DocumentSplitter", "DocumentCleaner", "RecursiveDocumentSplitter", "TextCleaner"]
+__all__ = ["CSVDocumentCleaner", "DocumentCleaner", "DocumentSplitter", "RecursiveDocumentSplitter", "TextCleaner"]
Diff for: haystack/components/preprocessors/csv_document_cleaner.py (new file, +116)

@@ -0,0 +1,116 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]>
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from io import StringIO
+from typing import Dict, List
+
+from haystack import Document, component, logging
+from haystack.lazy_imports import LazyImport
+
+with LazyImport("Run 'pip install pandas'") as pandas_import:
+    import pandas as pd
+
+logger = logging.getLogger(__name__)
+
+
+@component
+class CSVDocumentCleaner:
+    """
+    A component for cleaning CSV documents by removing empty rows and columns.
+
+    This component processes CSV content stored in Documents, allowing
+    for the optional ignoring of a specified number of rows and columns before performing
+    the cleaning operation.
+    """
+
+    def __init__(self, ignore_rows: int = 0, ignore_columns: int = 0) -> None:
+        """
+        Initializes the CSVDocumentCleaner component.
+
+        :param ignore_rows: Number of rows to ignore from the top of the CSV table before processing.
+        :param ignore_columns: Number of columns to ignore from the left of the CSV table before processing.
+
+        Rows and columns ignored using these parameters are preserved in the final output, meaning
+        they are not considered when removing empty rows and columns.
+        """
+        self.ignore_rows = ignore_rows
+        self.ignore_columns = ignore_columns
+        pandas_import.check()
+
+    @component.output_types(documents=List[Document])
+    def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
+        """
+        Cleans CSV documents by removing empty rows and columns while preserving specified ignored rows and columns.
+
+        :param documents: List of Documents containing CSV-formatted content.
+
+        Processing steps:
+        1. Reads each document's content as a CSV table.
+        2. Retains the specified number of `ignore_rows` from the top and `ignore_columns` from the left.
+        3. Drops any rows and columns that are entirely empty (all NaN values).
+        4. Reattaches the ignored rows and columns to maintain their original positions.
+        5. Returns the cleaned CSV content as a new `Document` object.
+        """
+        ignore_rows = self.ignore_rows
+        ignore_columns = self.ignore_columns
+
+        cleaned_documents = []
+        for document in documents:
+            try:
+                df = pd.read_csv(StringIO(document.content), header=None, dtype=object)  # type: ignore
+            except Exception as e:
+                logger.error(
+                    "Error processing document {id}. Keeping it, but skipping cleaning. Error: {error}",
+                    id=document.id,
+                    error=e,
+                )
+                cleaned_documents.append(document)
+                continue
+
+            if ignore_rows > df.shape[0] or ignore_columns > df.shape[1]:
+                logger.warning(
+                    "Document {id} has fewer rows {df_rows} or columns {df_cols} "
+                    "than the number of rows {rows} or columns {cols} to ignore. "
+                    "Keeping the entire document.",
+                    id=document.id,
+                    df_rows=df.shape[0],
+                    df_cols=df.shape[1],
+                    rows=ignore_rows,
+                    cols=ignore_columns,
+                )
+                cleaned_documents.append(document)
+                continue
+
+            # Save ignored rows
+            ignored_rows = None
+            if ignore_rows > 0:
+                ignored_rows = df.iloc[:ignore_rows, :]
+
+            # Save ignored columns
+            ignored_columns = None
+            if ignore_columns > 0:
+                ignored_columns = df.iloc[:, :ignore_columns]
+
+            # Drop rows and columns that are entirely empty
+            remaining_df = df.iloc[ignore_rows:, ignore_columns:]
+            final_df = remaining_df.dropna(axis=0, how="all").dropna(axis=1, how="all")
+
+            # Reattach ignored rows
+            if ignore_rows > 0 and ignored_rows is not None:
+                # Keep only relevant columns
+                ignored_rows = ignored_rows.loc[:, final_df.columns]
+                final_df = pd.concat([ignored_rows, final_df], axis=0)
+
+            # Reattach ignored columns
+            if ignore_columns > 0 and ignored_columns is not None:
+                # Keep only relevant rows
+                ignored_columns = ignored_columns.loc[final_df.index, :]
+                final_df = pd.concat([ignored_columns, final_df], axis=1)
+
+            cleaned_documents.append(
+                Document(
+                    content=final_df.to_csv(index=False, header=False, lineterminator="\n"), meta=document.meta.copy()
+                )
+            )
+        return {"documents": cleaned_documents}
Diff for: release note (new file, +6)

@@ -0,0 +1,6 @@
+---
+features:
+  - |
+    Introduced `CSVDocumentCleaner` component for cleaning CSV documents.
+    - Removes empty rows and columns, while preserving specified ignored rows and columns.
+    - Customizable number of rows and columns to ignore during processing.

Diff for: test/components/generators/test_hugging_face_api.py (+2 −1)

@@ -7,6 +7,7 @@
 
 import pytest
 from huggingface_hub import (
+    TextGenerationOutput,
     TextGenerationOutputToken,
     TextGenerationStreamOutput,
     TextGenerationStreamOutputStreamDetails,
@@ -30,7 +31,7 @@ def mock_check_valid_model():
 @pytest.fixture
 def mock_text_generation():
     with patch("huggingface_hub.InferenceClient.text_generation", autospec=True) as mock_text_generation:
-        mock_response = Mock()
+        mock_response = Mock(spec=TextGenerationOutput)
         mock_response.generated_text = "I'm fine, thanks."
         details = Mock()
         details.finish_reason = MagicMock(field1="value")
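
`Mock(spec=TextGenerationOutput)` makes the fixture stricter: accessing an attribute that does not exist on the real class now raises instead of silently returning a child Mock. A small self-contained sketch of the effect (using a stand-in class, not the real huggingface_hub type):

from unittest.mock import Mock

class FakeOutput:  # stand-in for TextGenerationOutput
    generated_text = ""

m = Mock(spec=FakeOutput)
m.generated_text = "I'm fine, thanks."  # fine: the attribute exists on the spec
try:
    m.generated_txt  # typo: raises AttributeError thanks to spec
except AttributeError:
    print("spec caught the typo")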
