
Commit 1785ea6

sjrl and davidsbatista authored
feat: Add component CSVDocumentCleaner for removing empty rows and columns (#8816)
* Initial commit for csv cleaner
* Add release notes
* Update lineterminator
* Update releasenotes/notes/csv-document-cleaner-8eca67e884684c56.yaml
  Co-authored-by: David S. Batista <[email protected]>
* alphabetize
* Use lazy import
* Some refactoring
* Some refactoring

---------

Co-authored-by: David S. Batista <[email protected]>
1 parent 1f25794 commit 1785ea6

File tree

5 files changed: +271 -2 lines changed


docs/pydoc/config/preprocessors_api.yml

Lines changed: 1 addition & 1 deletion

@@ -1,7 +1,7 @@
 loaders:
   - type: haystack_pydoc_tools.loaders.CustomPythonLoader
     search_path: [../../../haystack/components/preprocessors]
-    modules: ["document_cleaner", "document_splitter", "recursive_splitter", "text_cleaner"]
+    modules: ["csv_document_cleaner", "document_cleaner", "document_splitter", "recursive_splitter", "text_cleaner"]
     ignore_when_discovered: ["__init__"]
 processors:
   - type: filter

haystack/components/preprocessors/__init__.py

Lines changed: 2 additions & 1 deletion

@@ -2,9 +2,10 @@
 #
 # SPDX-License-Identifier: Apache-2.0

+from .csv_document_cleaner import CSVDocumentCleaner
 from .document_cleaner import DocumentCleaner
 from .document_splitter import DocumentSplitter
 from .recursive_splitter import RecursiveDocumentSplitter
 from .text_cleaner import TextCleaner

-__all__ = ["DocumentSplitter", "DocumentCleaner", "RecursiveDocumentSplitter", "TextCleaner"]
+__all__ = ["CSVDocumentCleaner", "DocumentCleaner", "DocumentSplitter", "RecursiveDocumentSplitter", "TextCleaner"]
haystack/components/preprocessors/csv_document_cleaner.py

Lines changed: 116 additions & 0 deletions
# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0

from io import StringIO
from typing import Dict, List

from haystack import Document, component, logging
from haystack.lazy_imports import LazyImport

with LazyImport("Run 'pip install pandas'") as pandas_import:
    import pandas as pd

logger = logging.getLogger(__name__)


@component
class CSVDocumentCleaner:
    """
    A component for cleaning CSV documents by removing empty rows and columns.

    This component processes CSV content stored in Documents, allowing
    for the optional ignoring of a specified number of rows and columns before performing
    the cleaning operation.
    """

    def __init__(self, ignore_rows: int = 0, ignore_columns: int = 0) -> None:
        """
        Initializes the CSVDocumentCleaner component.

        :param ignore_rows: Number of rows to ignore from the top of the CSV table before processing.
        :param ignore_columns: Number of columns to ignore from the left of the CSV table before processing.

        Rows and columns ignored using these parameters are preserved in the final output, meaning
        they are not considered when removing empty rows and columns.
        """
        self.ignore_rows = ignore_rows
        self.ignore_columns = ignore_columns
        pandas_import.check()

    @component.output_types(documents=List[Document])
    def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
        """
        Cleans CSV documents by removing empty rows and columns while preserving specified ignored rows and columns.

        :param documents: List of Documents containing CSV-formatted content.

        Processing steps:
        1. Reads each document's content as a CSV table.
        2. Retains the specified number of `ignore_rows` from the top and `ignore_columns` from the left.
        3. Drops any rows and columns that are entirely empty (all NaN values).
        4. Reattaches the ignored rows and columns to maintain their original positions.
        5. Returns the cleaned CSV content as a new `Document` object.
        """
        ignore_rows = self.ignore_rows
        ignore_columns = self.ignore_columns

        cleaned_documents = []
        for document in documents:
            try:
                df = pd.read_csv(StringIO(document.content), header=None, dtype=object)  # type: ignore
            except Exception as e:
                logger.error(
                    "Error processing document {id}. Keeping it, but skipping cleaning. Error: {error}",
                    id=document.id,
                    error=e,
                )
                cleaned_documents.append(document)
                continue

            if ignore_rows > df.shape[0] or ignore_columns > df.shape[1]:
                logger.warning(
                    "Document {id} has fewer rows {df_rows} or columns {df_cols} "
                    "than the number of rows {rows} or columns {cols} to ignore. "
                    "Keeping the entire document.",
                    id=document.id,
                    df_rows=df.shape[0],
                    df_cols=df.shape[1],
                    rows=ignore_rows,
                    cols=ignore_columns,
                )
                cleaned_documents.append(document)
                continue

            # Save ignored rows
            ignored_rows = None
            if ignore_rows > 0:
                ignored_rows = df.iloc[:ignore_rows, :]

            # Save ignored columns
            ignored_columns = None
            if ignore_columns > 0:
                ignored_columns = df.iloc[:, :ignore_columns]

            # Drop rows and columns that are entirely empty
            remaining_df = df.iloc[ignore_rows:, ignore_columns:]
            final_df = remaining_df.dropna(axis=0, how="all").dropna(axis=1, how="all")

            # Reattach ignored rows
            if ignore_rows > 0 and ignored_rows is not None:
                # Keep only relevant columns
                ignored_rows = ignored_rows.loc[:, final_df.columns]
                final_df = pd.concat([ignored_rows, final_df], axis=0)

            # Reattach ignored columns
            if ignore_columns > 0 and ignored_columns is not None:
                # Keep only relevant rows
                ignored_columns = ignored_columns.loc[final_df.index, :]
                final_df = pd.concat([ignored_columns, final_df], axis=1)

            cleaned_documents.append(
                Document(
                    content=final_df.to_csv(index=False, header=False, lineterminator="\n"), meta=document.meta.copy()
                )
            )
        return {"documents": cleaned_documents}
releasenotes/notes/csv-document-cleaner-8eca67e884684c56.yaml

Lines changed: 6 additions & 0 deletions

---
features:
  - |
    Introduced `CSVDocumentCleaner` component for cleaning CSV documents.
    - Removes empty rows and columns, while preserving specified ignored rows and columns.
    - Customizable number of rows and columns to ignore during processing.
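
As the release note says, the number of ignored rows and columns is configurable. A short sketch of that option, mirroring one of the tests below (sample data is illustrative):

from haystack import Document
from haystack.components.preprocessors import CSVDocumentCleaner

# The first row and first column are preserved as-is; only the remaining area
# is checked for fully empty rows/columns. Here the last column is empty in
# that area, so it is dropped together with its header "C".
csv_content = ",A,B,C\n1,item,s,\n2,item2,fd,\n"

cleaner = CSVDocumentCleaner(ignore_rows=1, ignore_columns=1)
result = cleaner.run([Document(content=csv_content)])

print(result["documents"][0].content)  # -> ",A,B\n1,item,s\n2,item2,fd\n"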
Lines changed: 146 additions & 0 deletions

# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0

from haystack import Document

from haystack.components.preprocessors.csv_document_cleaner import CSVDocumentCleaner


def test_empty_column() -> None:
    csv_content = """,A,B,C
,1,2,3
,4,5,6
"""
    csv_document = Document(content=csv_content)
    csv_document_cleaner = CSVDocumentCleaner()
    result = csv_document_cleaner.run([csv_document])
    cleaned_document = result["documents"][0]
    assert cleaned_document.content == "A,B,C\n1,2,3\n4,5,6\n"


def test_empty_row() -> None:
    csv_content = """A,B,C
1,2,3
,,
4,5,6
"""
    csv_document = Document(content=csv_content)
    csv_document_cleaner = CSVDocumentCleaner()
    result = csv_document_cleaner.run([csv_document])
    cleaned_document = result["documents"][0]
    assert cleaned_document.content == "A,B,C\n1,2,3\n4,5,6\n"


def test_empty_column_and_row() -> None:
    csv_content = """,A,B,C
,1,2,3
,,,
,4,5,6
"""
    csv_document = Document(content=csv_content)
    csv_document_cleaner = CSVDocumentCleaner()
    result = csv_document_cleaner.run([csv_document])
    cleaned_document = result["documents"][0]
    assert cleaned_document.content == "A,B,C\n1,2,3\n4,5,6\n"


def test_ignore_rows() -> None:
    csv_content = """,,
A,B,C
4,5,6
7,8,9
"""
    csv_document = Document(content=csv_content, meta={"name": "test.csv"})
    csv_document_cleaner = CSVDocumentCleaner(ignore_rows=1)
    result = csv_document_cleaner.run([csv_document])
    cleaned_document = result["documents"][0]
    assert cleaned_document.content == ",,\nA,B,C\n4,5,6\n7,8,9\n"
    assert cleaned_document.meta == {"name": "test.csv"}


def test_ignore_rows_2() -> None:
    csv_content = """A,B,C
,,
4,5,6
7,8,9
"""
    csv_document = Document(content=csv_content, meta={"name": "test.csv"})
    csv_document_cleaner = CSVDocumentCleaner(ignore_rows=1)
    result = csv_document_cleaner.run([csv_document])
    cleaned_document = result["documents"][0]
    assert cleaned_document.content == "A,B,C\n4,5,6\n7,8,9\n"
    assert cleaned_document.meta == {"name": "test.csv"}


def test_ignore_rows_3() -> None:
    csv_content = """A,B,C
4,,6
7,,9
"""
    csv_document = Document(content=csv_content, meta={"name": "test.csv"})
    csv_document_cleaner = CSVDocumentCleaner(ignore_rows=1)
    result = csv_document_cleaner.run([csv_document])
    cleaned_document = result["documents"][0]
    assert cleaned_document.content == "A,C\n4,6\n7,9\n"
    assert cleaned_document.meta == {"name": "test.csv"}


def test_ignore_columns() -> None:
    csv_content = """,,A,B
,2,3,4
,7,8,9
"""
    csv_document = Document(content=csv_content)
    csv_document_cleaner = CSVDocumentCleaner(ignore_columns=1)
    result = csv_document_cleaner.run([csv_document])
    cleaned_document = result["documents"][0]
    assert cleaned_document.content == ",,A,B\n,2,3,4\n,7,8,9\n"


def test_too_many_ignore_rows() -> None:
    csv_content = """,,
A,B,C
4,5,6
"""
    csv_document = Document(content=csv_content)
    csv_document_cleaner = CSVDocumentCleaner(ignore_rows=4)
    result = csv_document_cleaner.run([csv_document])
    cleaned_document = result["documents"][0]
    assert cleaned_document.content == ",,\nA,B,C\n4,5,6\n"


def test_too_many_ignore_columns() -> None:
    csv_content = """,,
A,B,C
4,5,6
"""
    csv_document = Document(content=csv_content)
    csv_document_cleaner = CSVDocumentCleaner(ignore_columns=4)
    result = csv_document_cleaner.run([csv_document])
    cleaned_document = result["documents"][0]
    assert cleaned_document.content == ",,\nA,B,C\n4,5,6\n"


def test_ignore_rows_and_columns() -> None:
    csv_content = """,A,B,C
1,item,s,
2,item2,fd,
"""
    csv_document = Document(content=csv_content)
    csv_document_cleaner = CSVDocumentCleaner(ignore_columns=1, ignore_rows=1)
    result = csv_document_cleaner.run([csv_document])
    cleaned_document = result["documents"][0]
    assert cleaned_document.content == ",A,B\n1,item,s\n2,item2,fd\n"


def test_zero_ignore_rows_and_columns() -> None:
    csv_content = """,A,B,C
1,item,s,
2,item2,fd,
"""
    csv_document = Document(content=csv_content)
    csv_document_cleaner = CSVDocumentCleaner(ignore_columns=0, ignore_rows=0)
    result = csv_document_cleaner.run([csv_document])
    cleaned_document = result["documents"][0]
    assert cleaned_document.content == ",A,B,C\n1,item,s,\n2,item2,fd,\n"
