Skip to content

Commit 22dff10

Browse files
committed
Merge branch 'main' of github.com:deepset-ai/haystack into recursive-csv-splitter
2 parents 18d6e40 + 35788a2 commit 22dff10

File tree

3 files changed

+177
-35
lines changed

3 files changed

+177
-35
lines changed

Diff for: haystack/components/preprocessors/csv_document_cleaner.py

+97-35
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,9 @@
22
#
33
# SPDX-License-Identifier: Apache-2.0
44

5+
from copy import deepcopy
56
from io import StringIO
6-
from typing import Dict, List
7+
from typing import Dict, List, Optional
78

89
from haystack import Document, component, logging
910
from haystack.lazy_imports import LazyImport
@@ -21,21 +22,36 @@ class CSVDocumentCleaner:
2122
2223
This component processes CSV content stored in Documents, allowing
2324
for the optional ignoring of a specified number of rows and columns before performing
24-
the cleaning operation.
25+
the cleaning operation. Additionally, it provides options to keep document IDs and
26+
control whether empty rows and columns should be removed.
2527
"""
2628

27-
def __init__(self, ignore_rows: int = 0, ignore_columns: int = 0) -> None:
29+
def __init__(
30+
self,
31+
*,
32+
ignore_rows: int = 0,
33+
ignore_columns: int = 0,
34+
remove_empty_rows: bool = True,
35+
remove_empty_columns: bool = True,
36+
keep_id: bool = False,
37+
) -> None:
2838
"""
2939
Initializes the CSVDocumentCleaner component.
3040
3141
:param ignore_rows: Number of rows to ignore from the top of the CSV table before processing.
3242
:param ignore_columns: Number of columns to ignore from the left of the CSV table before processing.
43+
:param remove_empty_rows: Whether to remove rows that are entirely empty.
44+
:param remove_empty_columns: Whether to remove columns that are entirely empty.
45+
:param keep_id: Whether to retain the original document ID in the output document.
3346
3447
Rows and columns ignored using these parameters are preserved in the final output, meaning
3548
they are not considered when removing empty rows and columns.
3649
"""
3750
self.ignore_rows = ignore_rows
3851
self.ignore_columns = ignore_columns
52+
self.remove_empty_rows = remove_empty_rows
53+
self.remove_empty_columns = remove_empty_columns
54+
self.keep_id = keep_id
3955
pandas_import.check()
4056

4157
@component.output_types(documents=List[Document])
@@ -44,14 +60,20 @@ def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
4460
Cleans CSV documents by removing empty rows and columns while preserving specified ignored rows and columns.
4561
4662
:param documents: List of Documents containing CSV-formatted content.
63+
:return: A dictionary with a list of cleaned Documents under the key "documents".
4764
4865
Processing steps:
4966
1. Reads each document's content as a CSV table.
5067
2. Retains the specified number of `ignore_rows` from the top and `ignore_columns` from the left.
51-
3. Drops any rows and columns that are entirely empty (all NaN values).
68+
3. Drops any rows and columns that are entirely empty (if enabled by `remove_empty_rows` and
69+
`remove_empty_columns`).
5270
4. Reattaches the ignored rows and columns to maintain their original positions.
53-
5. Returns the cleaned CSV content as a new `Document` object.
71+
5. Returns the cleaned CSV content as a new `Document` object, with an option to retain the original
72+
document ID.
5473
"""
74+
if len(documents) == 0:
75+
return {"documents": []}
76+
5577
ignore_rows = self.ignore_rows
5678
ignore_columns = self.ignore_columns
5779

@@ -82,35 +104,75 @@ def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
82104
cleaned_documents.append(document)
83105
continue
84106

85-
# Save ignored rows
86-
ignored_rows = None
87-
if ignore_rows > 0:
88-
ignored_rows = df.iloc[:ignore_rows, :]
89-
90-
# Save ignored columns
91-
ignored_columns = None
92-
if ignore_columns > 0:
93-
ignored_columns = df.iloc[:, :ignore_columns]
94-
95-
# Drop rows and columns that are entirely empty
96-
remaining_df = df.iloc[ignore_rows:, ignore_columns:]
97-
final_df = remaining_df.dropna(axis=0, how="all").dropna(axis=1, how="all")
98-
99-
# Reattach ignored rows
100-
if ignore_rows > 0 and ignored_rows is not None:
101-
# Keep only relevant columns
102-
ignored_rows = ignored_rows.loc[:, final_df.columns]
103-
final_df = pd.concat([ignored_rows, final_df], axis=0)
104-
105-
# Reattach ignored columns
106-
if ignore_columns > 0 and ignored_columns is not None:
107-
# Keep only relevant rows
108-
ignored_columns = ignored_columns.loc[final_df.index, :]
109-
final_df = pd.concat([ignored_columns, final_df], axis=1)
110-
111-
cleaned_documents.append(
112-
Document(
113-
content=final_df.to_csv(index=False, header=False, lineterminator="\n"), meta=document.meta.copy()
114-
)
107+
final_df = self._clean_df(df=df, ignore_rows=ignore_rows, ignore_columns=ignore_columns)
108+
109+
clean_doc = Document(
110+
id=document.id if self.keep_id else "",
111+
content=final_df.to_csv(index=False, header=False, lineterminator="\n"),
112+
blob=document.blob,
113+
meta=deepcopy(document.meta),
114+
score=document.score,
115+
embedding=document.embedding,
116+
sparse_embedding=document.sparse_embedding,
115117
)
118+
cleaned_documents.append(clean_doc)
116119
return {"documents": cleaned_documents}
120+
121+
def _clean_df(self, df: "pd.DataFrame", ignore_rows: int, ignore_columns: int) -> "pd.DataFrame":
122+
"""
123+
Cleans a DataFrame by removing empty rows and columns while preserving ignored sections.
124+
125+
:param df: The input DataFrame representing the CSV data.
126+
:param ignore_rows: Number of top rows to ignore.
127+
:param ignore_columns: Number of left columns to ignore.
128+
"""
129+
# Get ignored rows and columns
130+
ignored_rows = self._get_ignored_rows(df=df, ignore_rows=ignore_rows)
131+
ignored_columns = self._get_ignored_columns(df=df, ignore_columns=ignore_columns)
132+
final_df = df.iloc[ignore_rows:, ignore_columns:]
133+
134+
# Drop rows that are entirely empty
135+
if self.remove_empty_rows:
136+
final_df = final_df.dropna(axis=0, how="all")
137+
138+
# Drop columns that are entirely empty
139+
if self.remove_empty_columns:
140+
final_df = final_df.dropna(axis=1, how="all")
141+
142+
# Reattach ignored rows
143+
if ignore_rows > 0 and ignored_rows is not None:
144+
# Keep only relevant columns
145+
ignored_rows = ignored_rows.loc[:, final_df.columns]
146+
final_df = pd.concat([ignored_rows, final_df], axis=0)
147+
148+
# Reattach ignored columns
149+
if ignore_columns > 0 and ignored_columns is not None:
150+
# Keep only relevant rows
151+
ignored_columns = ignored_columns.loc[final_df.index, :]
152+
final_df = pd.concat([ignored_columns, final_df], axis=1)
153+
154+
return final_df
155+
156+
@staticmethod
157+
def _get_ignored_rows(df: "pd.DataFrame", ignore_rows: int) -> Optional["pd.DataFrame"]:
158+
"""
159+
Extracts the rows to be ignored from the DataFrame.
160+
161+
:param df: The input DataFrame.
162+
:param ignore_rows: Number of rows to extract from the top.
163+
"""
164+
if ignore_rows > 0:
165+
return df.iloc[:ignore_rows, :]
166+
return None
167+
168+
@staticmethod
169+
def _get_ignored_columns(df: "pd.DataFrame", ignore_columns: int) -> Optional["pd.DataFrame"]:
170+
"""
171+
Extracts the columns to be ignored from the DataFrame.
172+
173+
:param df: The input DataFrame.
174+
:param ignore_columns: Number of columns to extract from the left.
175+
"""
176+
if ignore_columns > 0:
177+
return df.iloc[:, :ignore_columns]
178+
return None
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
enhancements:
3+
- |
4+
For the CSVDocumentCleaner, added `remove_empty_rows` & `remove_empty_columns` to control whether entirely empty rows and columns are removed (both default to `True`).
5+
Also added `keep_id` to optionally keep the original document ID in the cleaned document.

Diff for: test/components/preprocessors/test_csv_document_cleaner.py

+75
Original file line numberDiff line numberDiff line change
@@ -144,3 +144,78 @@ def test_zero_ignore_rows_and_columns() -> None:
144144
result = csv_document_cleaner.run([csv_document])
145145
cleaned_document = result["documents"][0]
146146
assert cleaned_document.content == ",A,B,C\n1,item,s,\n2,item2,fd,\n"
147+
148+
149+
def test_empty_document() -> None:
150+
csv_document = Document(content="")
151+
csv_document_cleaner = CSVDocumentCleaner()
152+
result = csv_document_cleaner.run([csv_document])
153+
cleaned_document = result["documents"][0]
154+
assert cleaned_document.content == ""
155+
assert cleaned_document.meta == {}
156+
157+
158+
def test_empty_documents() -> None:
159+
csv_document_cleaner = CSVDocumentCleaner()
160+
result = csv_document_cleaner.run([])
161+
assert result["documents"] == []
162+
163+
164+
def test_keep_id() -> None:
165+
csv_content = """,A,B,C
166+
1,item,s,
167+
"""
168+
csv_document = Document(id="123", content=csv_content)
169+
csv_document_cleaner = CSVDocumentCleaner(keep_id=True)
170+
result = csv_document_cleaner.run([csv_document])
171+
cleaned_document = result["documents"][0]
172+
assert cleaned_document.id == "123"
173+
assert cleaned_document.content == ",A,B,C\n1,item,s,\n"
174+
175+
176+
def test_id_not_none() -> None:
177+
csv_content = """,A,B,C
178+
1,item,s,
179+
"""
180+
csv_document = Document(content=csv_content)
181+
csv_document_cleaner = CSVDocumentCleaner()
182+
result = csv_document_cleaner.run([csv_document])
183+
cleaned_document = result["documents"][0]
184+
assert cleaned_document.id != ""
185+
assert cleaned_document.content == ",A,B,C\n1,item,s,\n"
186+
187+
188+
def test_remove_empty_rows_false() -> None:
189+
csv_content = """,B,C
190+
,,
191+
,5,6
192+
"""
193+
csv_document = Document(content=csv_content)
194+
csv_document_cleaner = CSVDocumentCleaner(remove_empty_rows=False)
195+
result = csv_document_cleaner.run([csv_document])
196+
cleaned_document = result["documents"][0]
197+
assert cleaned_document.content == "B,C\n,\n5,6\n"
198+
199+
200+
def test_remove_empty_columns_false() -> None:
201+
csv_content = """,B,C
202+
,,
203+
,,4
204+
"""
205+
csv_document = Document(content=csv_content)
206+
csv_document_cleaner = CSVDocumentCleaner(remove_empty_columns=False)
207+
result = csv_document_cleaner.run([csv_document])
208+
cleaned_document = result["documents"][0]
209+
assert cleaned_document.content == ",B,C\n,,4\n"
210+
211+
212+
def test_remove_empty_rows_and_columns_false() -> None:
213+
csv_content = """,B,C
214+
,,4
215+
,,
216+
"""
217+
csv_document = Document(content=csv_content)
218+
csv_document_cleaner = CSVDocumentCleaner(remove_empty_rows=False, remove_empty_columns=False)
219+
result = csv_document_cleaner.run([csv_document])
220+
cleaned_document = result["documents"][0]
221+
assert cleaned_document.content == ",B,C\n,,4\n,,\n"

0 commit comments

Comments
 (0)