@@ -44,16 +44,19 @@ def test_non_text_document(self):
44
44
ValueError , match = "DocumentSplitter only works with text documents but content for document ID"
45
45
):
46
46
splitter = DocumentSplitter ()
47
+ splitter .warm_up ()
47
48
splitter .run (documents = [Document ()])
48
49
assert "DocumentSplitter only works with text documents but content for document ID" in caplog .text
49
50
50
51
def test_single_doc (self ):
51
52
with pytest .raises (TypeError , match = "DocumentSplitter expects a List of Documents as input." ):
52
53
splitter = DocumentSplitter ()
54
+ splitter .warm_up ()
53
55
splitter .run (documents = Document ())
54
56
55
57
def test_empty_list (self ):
56
58
splitter = DocumentSplitter ()
59
+ splitter .warm_up ()
57
60
res = splitter .run (documents = [])
58
61
assert res == {"documents" : []}
59
62
@@ -76,6 +79,7 @@ def test_unsupported_split_overlap(self):
76
79
def test_split_by_word (self ):
77
80
splitter = DocumentSplitter (split_by = "word" , split_length = 10 )
78
81
text = "This is a text with some words. There is a second sentence. And there is a third sentence."
82
+ splitter .warm_up ()
79
83
result = splitter .run (documents = [Document (content = text )])
80
84
docs = result ["documents" ]
81
85
assert len (docs ) == 2
@@ -88,6 +92,7 @@ def test_split_by_word(self):
88
92
89
93
def test_split_by_word_with_threshold (self ):
90
94
splitter = DocumentSplitter (split_by = "word" , split_length = 15 , split_threshold = 10 )
95
+ splitter .warm_up ()
91
96
result = splitter .run (
92
97
documents = [
93
98
Document (
@@ -105,6 +110,7 @@ def test_split_by_word_multiple_input_docs(self):
105
110
splitter = DocumentSplitter (split_by = "word" , split_length = 10 )
106
111
text1 = "This is a text with some words. There is a second sentence. And there is a third sentence."
107
112
text2 = "This is a different text with some words. There is a second sentence. And there is a third sentence. And there is a fourth sentence."
113
+ splitter .warm_up ()
108
114
result = splitter .run (documents = [Document (content = text1 ), Document (content = text2 )])
109
115
docs = result ["documents" ]
110
116
assert len (docs ) == 5
@@ -132,6 +138,7 @@ def test_split_by_word_multiple_input_docs(self):
132
138
def test_split_by_period (self ):
133
139
splitter = DocumentSplitter (split_by = "period" , split_length = 1 )
134
140
text = "This is a text with some words. There is a second sentence. And there is a third sentence."
141
+ splitter .warm_up ()
135
142
result = splitter .run (documents = [Document (content = text )])
136
143
docs = result ["documents" ]
137
144
assert len (docs ) == 3
@@ -148,6 +155,7 @@ def test_split_by_period(self):
148
155
def test_split_by_passage (self ):
149
156
splitter = DocumentSplitter (split_by = "passage" , split_length = 1 )
150
157
text = "This is a text with some words. There is a second sentence.\n \n And there is a third sentence.\n \n And another passage."
158
+ splitter .warm_up ()
151
159
result = splitter .run (documents = [Document (content = text )])
152
160
docs = result ["documents" ]
153
161
assert len (docs ) == 3
@@ -164,6 +172,7 @@ def test_split_by_passage(self):
164
172
def test_split_by_page (self ):
165
173
splitter = DocumentSplitter (split_by = "page" , split_length = 1 )
166
174
text = "This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
175
+ splitter .warm_up ()
167
176
result = splitter .run (documents = [Document (content = text )])
168
177
docs = result ["documents" ]
169
178
assert len (docs ) == 3
@@ -183,6 +192,7 @@ def test_split_by_page(self):
183
192
def test_split_by_function (self ):
184
193
splitting_function = lambda s : s .split ("." )
185
194
splitter = DocumentSplitter (split_by = "function" , splitting_function = splitting_function )
195
+ splitter .warm_up ()
186
196
text = "This.Is.A.Test"
187
197
result = splitter .run (documents = [Document (id = "1" , content = text , meta = {"key" : "value" })])
188
198
docs = result ["documents" ]
@@ -200,6 +210,7 @@ def test_split_by_function(self):
200
210
splitting_function = lambda s : re .split (r"[\s]{2,}" , s )
201
211
splitter = DocumentSplitter (split_by = "function" , splitting_function = splitting_function )
202
212
text = "This Is\n A Test"
213
+ splitter .warm_up ()
203
214
result = splitter .run (documents = [Document (id = "1" , content = text , meta = {"key" : "value" })])
204
215
docs = result ["documents" ]
205
216
assert len (docs ) == 4
@@ -215,6 +226,7 @@ def test_split_by_function(self):
215
226
def test_split_by_word_with_overlap (self ):
216
227
splitter = DocumentSplitter (split_by = "word" , split_length = 10 , split_overlap = 2 )
217
228
text = "This is a text with some words. There is a second sentence. And there is a third sentence."
229
+ splitter .warm_up ()
218
230
result = splitter .run (documents = [Document (content = text )])
219
231
docs = result ["documents" ]
220
232
assert len (docs ) == 2
@@ -234,6 +246,7 @@ def test_split_by_word_with_overlap(self):
234
246
def test_split_by_line (self ):
235
247
splitter = DocumentSplitter (split_by = "line" , split_length = 1 )
236
248
text = "This is a text with some words.\n There is a second sentence.\n And there is a third sentence."
249
+ splitter .warm_up ()
237
250
result = splitter .run (documents = [Document (content = text )])
238
251
docs = result ["documents" ]
239
252
@@ -252,6 +265,7 @@ def test_source_id_stored_in_metadata(self):
252
265
splitter = DocumentSplitter (split_by = "word" , split_length = 10 )
253
266
doc1 = Document (content = "This is a text with some words." )
254
267
doc2 = Document (content = "This is a different text with some words." )
268
+ splitter .warm_up ()
255
269
result = splitter .run (documents = [doc1 , doc2 ])
256
270
assert result ["documents" ][0 ].meta ["source_id" ] == doc1 .id
257
271
assert result ["documents" ][1 ].meta ["source_id" ] == doc2 .id
@@ -262,6 +276,7 @@ def test_copy_metadata(self):
262
276
Document (content = "Text." , meta = {"name" : "doc 0" }),
263
277
Document (content = "Text." , meta = {"name" : "doc 1" }),
264
278
]
279
+ splitter .warm_up ()
265
280
result = splitter .run (documents = documents )
266
281
assert len (result ["documents" ]) == 2
267
282
assert result ["documents" ][0 ].id != result ["documents" ][1 ].id
@@ -273,6 +288,7 @@ def test_add_page_number_to_metadata_with_no_overlap_word_split(self):
273
288
splitter = DocumentSplitter (split_by = "word" , split_length = 2 )
274
289
doc1 = Document (content = "This is some text.\f This text is on another page." )
275
290
doc2 = Document (content = "This content has two.\f \f page brakes." )
291
+ splitter .warm_up ()
276
292
result = splitter .run (documents = [doc1 , doc2 ])
277
293
278
294
expected_pages = [1 , 1 , 2 , 2 , 2 , 1 , 1 , 3 ]
@@ -283,6 +299,7 @@ def test_add_page_number_to_metadata_with_no_overlap_period_split(self):
283
299
splitter = DocumentSplitter (split_by = "period" , split_length = 1 )
284
300
doc1 = Document (content = "This is some text.\f This text is on another page." )
285
301
doc2 = Document (content = "This content has two.\f \f page brakes." )
302
+ splitter .warm_up ()
286
303
result = splitter .run (documents = [doc1 , doc2 ])
287
304
288
305
expected_pages = [1 , 1 , 1 , 1 ]
@@ -294,6 +311,7 @@ def test_add_page_number_to_metadata_with_no_overlap_passage_split(self):
294
311
doc1 = Document (
295
312
content = "This is a text with some words.\f There is a second sentence.\n \n And there is a third sentence.\n \n And more passages.\n \n \f And another passage."
296
313
)
314
+ splitter .warm_up ()
297
315
result = splitter .run (documents = [doc1 ])
298
316
299
317
expected_pages = [1 , 2 , 2 , 2 ]
@@ -305,6 +323,7 @@ def test_add_page_number_to_metadata_with_no_overlap_page_split(self):
305
323
doc1 = Document (
306
324
content = "This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
307
325
)
326
+ splitter .warm_up ()
308
327
result = splitter .run (documents = [doc1 ])
309
328
expected_pages = [1 , 2 , 3 ]
310
329
for doc , p in zip (result ["documents" ], expected_pages ):
@@ -314,6 +333,7 @@ def test_add_page_number_to_metadata_with_no_overlap_page_split(self):
314
333
doc1 = Document (
315
334
content = "This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
316
335
)
336
+ splitter .warm_up ()
317
337
result = splitter .run (documents = [doc1 ])
318
338
expected_pages = [1 , 3 ]
319
339
@@ -324,6 +344,7 @@ def test_add_page_number_to_metadata_with_overlap_word_split(self):
324
344
splitter = DocumentSplitter (split_by = "word" , split_length = 3 , split_overlap = 1 )
325
345
doc1 = Document (content = "This is some text. And\f this text is on another page." )
326
346
doc2 = Document (content = "This content has two.\f \f page brakes." )
347
+ splitter .warm_up ()
327
348
result = splitter .run (documents = [doc1 , doc2 ])
328
349
329
350
expected_pages = [1 , 1 , 1 , 2 , 2 , 1 , 1 , 3 ]
@@ -334,6 +355,7 @@ def test_add_page_number_to_metadata_with_overlap_period_split(self):
334
355
splitter = DocumentSplitter (split_by = "period" , split_length = 2 , split_overlap = 1 )
335
356
doc1 = Document (content = "This is some text. And this is more text.\f This text is on another page. End." )
336
357
doc2 = Document (content = "This content has two.\f \f page brakes. More text." )
358
+ splitter .warm_up ()
337
359
result = splitter .run (documents = [doc1 , doc2 ])
338
360
339
361
expected_pages = [1 , 1 , 1 , 2 , 1 , 1 ]
@@ -345,6 +367,7 @@ def test_add_page_number_to_metadata_with_overlap_passage_split(self):
345
367
doc1 = Document (
346
368
content = "This is a text with some words.\f There is a second sentence.\n \n And there is a third sentence.\n \n And more passages.\n \n \f And another passage."
347
369
)
370
+ splitter .warm_up ()
348
371
result = splitter .run (documents = [doc1 ])
349
372
350
373
expected_pages = [1 , 2 , 2 ]
@@ -356,6 +379,7 @@ def test_add_page_number_to_metadata_with_overlap_page_split(self):
356
379
doc1 = Document (
357
380
content = "This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
358
381
)
382
+ splitter .warm_up ()
359
383
result = splitter .run (documents = [doc1 ])
360
384
expected_pages = [1 , 2 , 3 ]
361
385
@@ -366,6 +390,7 @@ def test_add_split_overlap_information(self):
366
390
splitter = DocumentSplitter (split_length = 10 , split_overlap = 5 , split_by = "word" )
367
391
text = "This is a text with some words. There is a second sentence. And a third sentence."
368
392
doc = Document (content = "This is a text with some words. There is a second sentence. And a third sentence." )
393
+ splitter .warm_up ()
369
394
docs = splitter .run (documents = [doc ])["documents" ]
370
395
371
396
# check split_overlap is added to all the documents
@@ -487,6 +512,7 @@ def test_run_empty_document(self):
487
512
"""
488
513
splitter = DocumentSplitter ()
489
514
doc = Document (content = "" )
515
+ splitter .warm_up ()
490
516
results = splitter .run ([doc ])
491
517
assert results ["documents" ] == []
492
518
@@ -496,6 +522,7 @@ def test_run_document_only_whitespaces(self):
496
522
"""
497
523
splitter = DocumentSplitter ()
498
524
doc = Document (content = " " )
525
+ splitter .warm_up ()
499
526
results = splitter .run ([doc ])
500
527
assert results ["documents" ][0 ].content == " "
501
528
@@ -543,6 +570,7 @@ def test_run_split_by_sentence_1(self) -> None:
543
570
"Moonlight shimmered softly, wolves howled nearby, night enveloped everything. It was a dark night ... "
544
571
"The moon was full."
545
572
)
573
+ document_splitter .warm_up ()
546
574
documents = document_splitter .run (documents = [Document (content = text )])["documents" ]
547
575
548
576
assert len (documents ) == 2
@@ -568,6 +596,7 @@ def test_run_split_by_sentence_2(self) -> None:
568
596
"This is another test sentence. (This is a third test sentence.) "
569
597
"This is the last test sentence."
570
598
)
599
+ document_splitter .warm_up ()
571
600
documents = document_splitter .run (documents = [Document (content = text )])["documents" ]
572
601
573
602
assert len (documents ) == 4
@@ -601,6 +630,7 @@ def test_run_split_by_sentence_3(self) -> None:
601
630
use_split_rules = True ,
602
631
extend_abbreviations = True ,
603
632
)
633
+ document_splitter .warm_up ()
604
634
605
635
text = "Sentence on page 1.\f Sentence on page 2. \f Sentence on page 3. \f \f Sentence on page 5."
606
636
documents = document_splitter .run (documents = [Document (content = text )])["documents" ]
@@ -633,6 +663,7 @@ def test_run_split_by_sentence_4(self) -> None:
633
663
use_split_rules = True ,
634
664
extend_abbreviations = True ,
635
665
)
666
+ document_splitter .warm_up ()
636
667
637
668
text = "Sentence on page 1.\f Sentence on page 2. \f Sentence on page 3. \f \f Sentence on page 5."
638
669
documents = document_splitter .run (documents = [Document (content = text )])["documents" ]
@@ -660,6 +691,7 @@ def test_run_split_by_word_respect_sentence_boundary(self) -> None:
660
691
language = "en" ,
661
692
respect_sentence_boundary = True ,
662
693
)
694
+ document_splitter .warm_up ()
663
695
664
696
text = (
665
697
"Moonlight shimmered softly, wolves howled nearby, night enveloped everything. It was a dark night.\f "
@@ -692,6 +724,7 @@ def test_run_split_by_word_respect_sentence_boundary_no_repeats(self) -> None:
692
724
use_split_rules = False ,
693
725
extend_abbreviations = False ,
694
726
)
727
+ document_splitter .warm_up ()
695
728
text = (
696
729
"This is a test sentence with many many words that exceeds the split length and should not be repeated. "
697
730
"This is another test sentence. (This is a third test sentence.) "
@@ -717,6 +750,7 @@ def test_run_split_by_word_respect_sentence_boundary_with_split_overlap_and_page
717
750
extend_abbreviations = True ,
718
751
respect_sentence_boundary = True ,
719
752
)
753
+ document_splitter .warm_up ()
720
754
721
755
text = (
722
756
"Sentence on page 1. Another on page 1.\f Sentence on page 2. Another on page 2.\f "
0 commit comments