@@ -107,26 +107,37 @@ def test_single_table_no_split(self, splitter: CSVDocumentSplitter) -> None:
107
107
1,2,3
108
108
4,5,6
109
109
"""
110
- doc = Document (content = csv_content )
110
+ doc = Document (content = csv_content , id = "test_id" )
111
111
result = splitter .run ([doc ])["documents" ]
112
112
assert len (result ) == 1
113
113
assert result [0 ].content == csv_content
114
+ assert result [0 ].meta == {"source_id" : "test_id" , "row_idx_start" : 0 , "col_idx_start" : 0 , "split_id" : 0 }
114
115
115
116
def test_row_split (self , splitter : CSVDocumentSplitter , two_tables_sep_by_two_empty_rows : str ) -> None :
116
- doc = Document (content = two_tables_sep_by_two_empty_rows )
117
+ doc = Document (content = two_tables_sep_by_two_empty_rows , id = "test_id" )
117
118
result = splitter .run ([doc ])["documents" ]
118
119
assert len (result ) == 2
119
120
expected_tables = ["A,B,C\n 1,2,3\n " , "X,Y,Z\n 7,8,9\n " ]
121
+ expected_meta = [
122
+ {"source_id" : "test_id" , "row_idx_start" : 0 , "col_idx_start" : 0 , "split_id" : 0 },
123
+ {"source_id" : "test_id" , "row_idx_start" : 4 , "col_idx_start" : 0 , "split_id" : 1 },
124
+ ]
120
125
for i , table in enumerate (result ):
121
126
assert table .content == expected_tables [i ]
127
+ assert table .meta == expected_meta [i ]
122
128
123
129
def test_column_split (self , splitter : CSVDocumentSplitter , two_tables_sep_by_two_empty_columns : str ) -> None :
124
- doc = Document (content = two_tables_sep_by_two_empty_columns )
130
+ doc = Document (content = two_tables_sep_by_two_empty_columns , id = "test_id" )
125
131
result = splitter .run ([doc ])["documents" ]
126
132
assert len (result ) == 2
127
133
expected_tables = ["A,B\n 1,2\n 3,4\n " , "X,Y\n 7,8\n 9,10\n " ]
134
+ expected_meta = [
135
+ {"source_id" : "test_id" , "row_idx_start" : 0 , "col_idx_start" : 0 , "split_id" : 0 },
136
+ {"source_id" : "test_id" , "row_idx_start" : 0 , "col_idx_start" : 4 , "split_id" : 1 },
137
+ ]
128
138
for i , table in enumerate (result ):
129
139
assert table .content == expected_tables [i ]
140
+ assert table .meta == expected_meta [i ]
130
141
131
142
def test_recursive_split_one_level (self , splitter : CSVDocumentSplitter ) -> None :
132
143
csv_content = """A,B,,,X,Y
@@ -136,12 +147,19 @@ def test_recursive_split_one_level(self, splitter: CSVDocumentSplitter) -> None:
136
147
P,Q,,,M,N
137
148
3,4,,,9,10
138
149
"""
139
- doc = Document (content = csv_content )
150
+ doc = Document (content = csv_content , id = "test_id" )
140
151
result = splitter .run ([doc ])["documents" ]
141
152
assert len (result ) == 4
142
153
expected_tables = ["A,B\n 1,2\n " , "X,Y\n 7,8\n " , "P,Q\n 3,4\n " , "M,N\n 9,10\n " ]
154
+ expected_meta = [
155
+ {"source_id" : "test_id" , "row_idx_start" : 0 , "col_idx_start" : 0 , "split_id" : 0 },
156
+ {"source_id" : "test_id" , "row_idx_start" : 0 , "col_idx_start" : 4 , "split_id" : 1 },
157
+ {"source_id" : "test_id" , "row_idx_start" : 4 , "col_idx_start" : 0 , "split_id" : 2 },
158
+ {"source_id" : "test_id" , "row_idx_start" : 4 , "col_idx_start" : 4 , "split_id" : 3 },
159
+ ]
143
160
for i , table in enumerate (result ):
144
161
assert table .content == expected_tables [i ]
162
+ assert table .meta == expected_meta [i ]
145
163
146
164
def test_recursive_split_two_levels (self , splitter : CSVDocumentSplitter ) -> None :
147
165
csv_content = """A,B,,,X,Y
@@ -151,12 +169,18 @@ def test_recursive_split_two_levels(self, splitter: CSVDocumentSplitter) -> None
151
169
P,Q,,,,
152
170
3,4,,,,
153
171
"""
154
- doc = Document (content = csv_content )
172
+ doc = Document (content = csv_content , id = "test_id" )
155
173
result = splitter .run ([doc ])["documents" ]
156
174
assert len (result ) == 3
157
- expected_tables = ["A,B\n 1,2\n " , "P,Q\n 3,4\n " , "X,Y\n 7,8\n M,N\n 9,10\n " ]
175
+ expected_tables = ["A,B\n 1,2\n " , "X,Y\n 7,8\n M,N\n 9,10\n " , "P,Q\n 3,4\n " ]
176
+ expected_meta = [
177
+ {"source_id" : "test_id" , "row_idx_start" : 0 , "col_idx_start" : 0 , "split_id" : 0 },
178
+ {"source_id" : "test_id" , "row_idx_start" : 0 , "col_idx_start" : 4 , "split_id" : 1 },
179
+ {"source_id" : "test_id" , "row_idx_start" : 4 , "col_idx_start" : 0 , "split_id" : 2 },
180
+ ]
158
181
for i , table in enumerate (result ):
159
182
assert table .content == expected_tables [i ]
183
+ assert table .meta == expected_meta [i ]
160
184
161
185
def test_threshold_no_effect (self , two_tables_sep_by_two_empty_rows : str ) -> None :
162
186
splitter = CSVDocumentSplitter (row_split_threshold = 3 )
0 commit comments