2
2
#
3
3
# SPDX-License-Identifier: Apache-2.0
4
4
5
+ from copy import deepcopy
5
6
from io import StringIO
6
- from typing import Dict , List
7
+ from typing import Dict , List , Optional
7
8
8
9
from haystack import Document , component , logging
9
10
from haystack .lazy_imports import LazyImport
@@ -21,21 +22,36 @@ class CSVDocumentCleaner:
21
22
22
23
This component processes CSV content stored in Documents, allowing
23
24
for the optional ignoring of a specified number of rows and columns before performing
24
- the cleaning operation.
25
+ the cleaning operation. Additionally, it provides options to keep document IDs and
26
+ control whether empty rows and columns should be removed.
25
27
"""
26
28
27
def __init__(
    self,
    *,
    ignore_rows: int = 0,
    ignore_columns: int = 0,
    remove_empty_rows: bool = True,
    remove_empty_columns: bool = True,
    keep_id: bool = False,
) -> None:
    """
    Set up a CSVDocumentCleaner. Every argument is keyword-only.

    :param ignore_rows: Count of rows, taken from the top of the CSV table, that are set aside
        before cleaning.
    :param ignore_columns: Count of columns, taken from the left of the CSV table, that are set
        aside before cleaning.
    :param remove_empty_rows: If True, rows that are entirely empty are dropped.
    :param remove_empty_columns: If True, columns that are entirely empty are dropped.
    :param keep_id: If True, the output document keeps the ID of the original document.

    The rows and columns set aside through `ignore_rows` / `ignore_columns` are kept in the final
    output; they are not taken into account when empty rows and columns are removed.
    """
    # Switch that controls how the output document is identified.
    self.keep_id = keep_id

    # Switches that control which empty parts of the table are dropped.
    self.remove_empty_columns = remove_empty_columns
    self.remove_empty_rows = remove_empty_rows

    # Size of the protected top/left region of the table.
    self.ignore_columns = ignore_columns
    self.ignore_rows = ignore_rows

    # NOTE(review): presumably raises when the lazily imported pandas is unavailable -- confirm
    # against the LazyImport helper.
    pandas_import.check()
40
56
41
57
@component .output_types (documents = List [Document ])
@@ -44,14 +60,20 @@ def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
44
60
Cleans CSV documents by removing empty rows and columns while preserving specified ignored rows and columns.
45
61
46
62
:param documents: List of Documents containing CSV-formatted content.
63
+ :return: A dictionary with a list of cleaned Documents under the key "documents".
47
64
48
65
Processing steps:
49
66
1. Reads each document's content as a CSV table.
50
67
2. Retains the specified number of `ignore_rows` from the top and `ignore_columns` from the left.
51
- 3. Drops any rows and columns that are entirely empty (all NaN values).
68
+ 3. Drops any rows and columns that are entirely empty (if enabled by `remove_empty_rows` and
69
+ `remove_empty_columns`).
52
70
4. Reattaches the ignored rows and columns to maintain their original positions.
53
- 5. Returns the cleaned CSV content as a new `Document` object.
71
+ 5. Returns the cleaned CSV content as a new `Document` object, with an option to retain the original
72
+ document ID.
54
73
"""
74
+ if len (documents ) == 0 :
75
+ return {"documents" : []}
76
+
55
77
ignore_rows = self .ignore_rows
56
78
ignore_columns = self .ignore_columns
57
79
@@ -82,35 +104,75 @@ def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
82
104
cleaned_documents .append (document )
83
105
continue
84
106
85
- # Save ignored rows
86
- ignored_rows = None
87
- if ignore_rows > 0 :
88
- ignored_rows = df .iloc [:ignore_rows , :]
89
-
90
- # Save ignored columns
91
- ignored_columns = None
92
- if ignore_columns > 0 :
93
- ignored_columns = df .iloc [:, :ignore_columns ]
94
-
95
- # Drop rows and columns that are entirely empty
96
- remaining_df = df .iloc [ignore_rows :, ignore_columns :]
97
- final_df = remaining_df .dropna (axis = 0 , how = "all" ).dropna (axis = 1 , how = "all" )
98
-
99
- # Reattach ignored rows
100
- if ignore_rows > 0 and ignored_rows is not None :
101
- # Keep only relevant columns
102
- ignored_rows = ignored_rows .loc [:, final_df .columns ]
103
- final_df = pd .concat ([ignored_rows , final_df ], axis = 0 )
104
-
105
- # Reattach ignored columns
106
- if ignore_columns > 0 and ignored_columns is not None :
107
- # Keep only relevant rows
108
- ignored_columns = ignored_columns .loc [final_df .index , :]
109
- final_df = pd .concat ([ignored_columns , final_df ], axis = 1 )
110
-
111
- cleaned_documents .append (
112
- Document (
113
- content = final_df .to_csv (index = False , header = False , lineterminator = "\n " ), meta = document .meta .copy ()
114
- )
107
+ final_df = self ._clean_df (df = df , ignore_rows = ignore_rows , ignore_columns = ignore_columns )
108
+
109
+ clean_doc = Document (
110
+ id = document .id if self .keep_id else "" ,
111
+ content = final_df .to_csv (index = False , header = False , lineterminator = "\n " ),
112
+ blob = document .blob ,
113
+ meta = deepcopy (document .meta ),
114
+ score = document .score ,
115
+ embedding = document .embedding ,
116
+ sparse_embedding = document .sparse_embedding ,
115
117
)
118
+ cleaned_documents .append (clean_doc )
116
119
return {"documents" : cleaned_documents }
120
+
121
+ def _clean_df (self , df : "pd.DataFrame" , ignore_rows : int , ignore_columns : int ) -> "pd.DataFrame" :
122
+ """
123
+ Cleans a DataFrame by removing empty rows and columns while preserving ignored sections.
124
+
125
+ :param df: The input DataFrame representing the CSV data.
126
+ :param ignore_rows: Number of top rows to ignore.
127
+ :param ignore_columns: Number of left columns to ignore.
128
+ """
129
+ # Get ignored rows and columns
130
+ ignored_rows = self ._get_ignored_rows (df = df , ignore_rows = ignore_rows )
131
+ ignored_columns = self ._get_ignored_columns (df = df , ignore_columns = ignore_columns )
132
+ final_df = df .iloc [ignore_rows :, ignore_columns :]
133
+
134
+ # Drop rows that are entirely empty
135
+ if self .remove_empty_rows :
136
+ final_df = final_df .dropna (axis = 0 , how = "all" )
137
+
138
+ # Drop columns that are entirely empty
139
+ if self .remove_empty_columns :
140
+ final_df = final_df .dropna (axis = 1 , how = "all" )
141
+
142
+ # Reattach ignored rows
143
+ if ignore_rows > 0 and ignored_rows is not None :
144
+ # Keep only relevant columns
145
+ ignored_rows = ignored_rows .loc [:, final_df .columns ]
146
+ final_df = pd .concat ([ignored_rows , final_df ], axis = 0 )
147
+
148
+ # Reattach ignored columns
149
+ if ignore_columns > 0 and ignored_columns is not None :
150
+ # Keep only relevant rows
151
+ ignored_columns = ignored_columns .loc [final_df .index , :]
152
+ final_df = pd .concat ([ignored_columns , final_df ], axis = 1 )
153
+
154
+ return final_df
155
+
156
+ @staticmethod
157
+ def _get_ignored_rows (df : "pd.DataFrame" , ignore_rows : int ) -> Optional ["pd.DataFrame" ]:
158
+ """
159
+ Extracts the rows to be ignored from the DataFrame.
160
+
161
+ :param df: The input DataFrame.
162
+ :param ignore_rows: Number of rows to extract from the top.
163
+ """
164
+ if ignore_rows > 0 :
165
+ return df .iloc [:ignore_rows , :]
166
+ return None
167
+
168
+ @staticmethod
169
+ def _get_ignored_columns (df : "pd.DataFrame" , ignore_columns : int ) -> Optional ["pd.DataFrame" ]:
170
+ """
171
+ Extracts the columns to be ignored from the DataFrame.
172
+
173
+ :param df: The input DataFrame.
174
+ :param ignore_columns: Number of columns to extract from the left.
175
+ """
176
+ if ignore_columns > 0 :
177
+ return df .iloc [:, :ignore_columns ]
178
+ return None
0 commit comments