
Commit 286061f

fix: Move potential nltk download to warm_up (#8646)

Authored by sjrl and julian-risch.

* Move potential nltk download to warm_up
* Update tests
* Add release notes
* Fix tests
* Uncomment
* Make mypy happy
* Add RuntimeError message
* Update release notes

Co-authored-by: Julian Risch <[email protected]>

1 parent f4d9c2b · commit 286061f

File tree

5 files changed: +90 -17 lines changed


haystack/components/preprocessors/document_splitter.py (+19 -8)

@@ -107,15 +107,10 @@ def __init__( # pylint: disable=too-many-positional-arguments
             splitting_function=splitting_function,
             respect_sentence_boundary=respect_sentence_boundary,
         )
-
-        if split_by == "sentence" or (respect_sentence_boundary and split_by == "word"):
+        self._use_sentence_splitter = split_by == "sentence" or (respect_sentence_boundary and split_by == "word")
+        if self._use_sentence_splitter:
             nltk_imports.check()
-            self.sentence_splitter = SentenceSplitter(
-                language=language,
-                use_split_rules=use_split_rules,
-                extend_abbreviations=extend_abbreviations,
-                keep_white_spaces=True,
-            )
+            self.sentence_splitter = None

         if split_by == "sentence":
             # ToDo: remove this warning in the next major release

@@ -164,6 +159,18 @@ def _init_checks(
             )
             self.respect_sentence_boundary = False

+    def warm_up(self):
+        """
+        Warm up the DocumentSplitter by loading the sentence tokenizer.
+        """
+        if self._use_sentence_splitter and self.sentence_splitter is None:
+            self.sentence_splitter = SentenceSplitter(
+                language=self.language,
+                use_split_rules=self.use_split_rules,
+                extend_abbreviations=self.extend_abbreviations,
+                keep_white_spaces=True,
+            )
+
     @component.output_types(documents=List[Document])
     def run(self, documents: List[Document]):
         """

@@ -182,6 +189,10 @@ def run(self, documents: List[Document]):
         :raises TypeError: if the input is not a list of Documents.
         :raises ValueError: if the content of a document is None.
         """
+        if self._use_sentence_splitter and self.sentence_splitter is None:
+            raise RuntimeError(
+                "The component DocumentSplitter wasn't warmed up. Run 'warm_up()' before calling 'run()'."
+            )

         if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)):
             raise TypeError("DocumentSplitter expects a List of Documents as input.")
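
Taken together, the changes in this file make the splitter lazy: __init__ only records whether sentence splitting is needed, warm_up() builds the SentenceSplitter (which is where the potential NLTK download now happens), and run() fails fast when warm-up was skipped. A minimal sketch of the new standalone contract, using only names visible in this diff (the sample text is illustrative):

    from haystack import Document
    from haystack.components.preprocessors import DocumentSplitter

    splitter = DocumentSplitter(split_by="sentence", split_length=2)

    # run() before warm_up() now raises the RuntimeError added above,
    # because the sentence splitter has not been created yet.
    try:
        splitter.run(documents=[Document(content="One sentence. Another one.")])
    except RuntimeError as err:
        print(err)

    splitter.warm_up()  # builds SentenceSplitter; may download NLTK data once
    result = splitter.run(documents=[Document(content="One sentence. Another one.")])
    print(len(result["documents"]))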

haystack/components/preprocessors/nltk_document_splitter.py (+23 -9)

@@ -77,16 +77,23 @@ def __init__( # pylint: disable=too-many-positional-arguments
         self.respect_sentence_boundary = respect_sentence_boundary
         self.use_split_rules = use_split_rules
         self.extend_abbreviations = extend_abbreviations
-        self.sentence_splitter = SentenceSplitter(
-            language=language,
-            use_split_rules=use_split_rules,
-            extend_abbreviations=extend_abbreviations,
-            keep_white_spaces=True,
-        )
+        self.sentence_splitter = None
         self.language = language

+    def warm_up(self):
+        """
+        Warm up the NLTKDocumentSplitter by loading the sentence tokenizer.
+        """
+        if self.sentence_splitter is None:
+            self.sentence_splitter = SentenceSplitter(
+                language=self.language,
+                use_split_rules=self.use_split_rules,
+                extend_abbreviations=self.extend_abbreviations,
+                keep_white_spaces=True,
+            )
+
     def _split_into_units(
-        self, text: str, split_by: Literal["function", "page", "passage", "sentence", "word", "line"]
+        self, text: str, split_by: Literal["function", "page", "passage", "period", "sentence", "word", "line"]
     ) -> List[str]:
         """
         Splits the text into units based on the specified split_by parameter.

@@ -106,6 +113,7 @@ def _split_into_units(
             # whitespace is preserved while splitting text into sentences when using keep_white_spaces=True
             # so split_at is set to an empty string
             self.split_at = ""
+            assert self.sentence_splitter is not None
             result = self.sentence_splitter.split_sentences(text)
             units = [sentence["sentence"] for sentence in result]
         elif split_by == "word":

@@ -142,6 +150,11 @@ def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
         :raises TypeError: if the input is not a list of Documents.
         :raises ValueError: if the content of a document is None.
         """
+        if self.sentence_splitter is None:
+            raise RuntimeError(
+                "The component NLTKDocumentSplitter wasn't warmed up. Run 'warm_up()' before calling 'run()'."
+            )
+
         if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)):
             raise TypeError("DocumentSplitter expects a List of Documents as input.")

@@ -221,8 +234,9 @@ def _number_of_sentences_to_keep(sentences: List[str], split_length: int, split_
                 break
         return num_sentences_to_keep

+    @staticmethod
     def _concatenate_sentences_based_on_word_amount(
-        self, sentences: List[str], split_length: int, split_overlap: int
+        sentences: List[str], split_length: int, split_overlap: int
     ) -> Tuple[List[str], List[int], List[int]]:
         """
         Groups the sentences into chunks of `split_length` words while respecting sentence boundaries.

@@ -258,7 +272,7 @@ def _concatenate_sentences_based_on_word_amount(
             split_start_indices.append(chunk_start_idx)

             # Get the number of sentences that overlap with the next chunk
-            num_sentences_to_keep = self._number_of_sentences_to_keep(
+            num_sentences_to_keep = NLTKDocumentSplitter._number_of_sentences_to_keep(
                 sentences=current_chunk, split_length=split_length, split_overlap=split_overlap
             )
             # Set up information for the new chunk
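
NLTKDocumentSplitter gets the same treatment, plus a small refactor: _concatenate_sentences_based_on_word_amount becomes a @staticmethod, so the internal call to _number_of_sentences_to_keep is now class-qualified. The shared idea in both files is deferred initialization of an expensive resource. A generic sketch of that pattern, with hypothetical names (ExpensiveTokenizer, LazyComponent) that are not Haystack APIs:

    from typing import Optional


    class ExpensiveTokenizer:
        """Hypothetical stand-in for a resource that may download data on creation."""

        def __init__(self, language: str):
            self.language = language


    class LazyComponent:
        def __init__(self, language: str = "en"):
            # __init__ only stores configuration; no network access happens here.
            self.language = language
            self.tokenizer: Optional[ExpensiveTokenizer] = None

        def warm_up(self) -> None:
            # Idempotent: repeated calls reuse the existing tokenizer.
            if self.tokenizer is None:
                self.tokenizer = ExpensiveTokenizer(self.language)

        def run(self, text: str) -> str:
            if self.tokenizer is None:
                raise RuntimeError("The component wasn't warmed up. Run 'warm_up()' before calling 'run()'.")
            return f"{self.tokenizer.language}: {text}"
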
Release note (new file, +4)

@@ -0,0 +1,4 @@
+---
+fixes:
+  - |
+    Moved the NLTK download of DocumentSplitter and NLTKDocumentSplitter to warm_up(). This prevents a call to an external API during instantiation. If a DocumentSplitter or NLTKDocumentSplitter is used for sentence splitting outside of a pipeline, warm_up() now needs to be called before running the component.
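
As the note says, only standalone usage changes: warm_up() must now precede run(). Inside a pipeline the change should be transparent, since components are warmed up when the pipeline runs. A short sketch of both modes, assuming the standard haystack import paths for these components:

    from haystack import Document, Pipeline
    from haystack.components.preprocessors import DocumentSplitter

    docs = [Document(content="First sentence. Second sentence.")]

    # Standalone: warm_up() is now required before run() when sentence splitting is used.
    splitter = DocumentSplitter(split_by="sentence", split_length=1)
    splitter.warm_up()
    standalone_result = splitter.run(documents=docs)

    # In a pipeline, warm-up happens as part of Pipeline.run(); no manual call is needed.
    pipeline = Pipeline()
    pipeline.add_component("splitter", DocumentSplitter(split_by="sentence", split_length=1))
    pipeline_result = pipeline.run({"splitter": {"documents": docs}})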

test/components/preprocessors/test_document_splitter.py (+34)

@@ -44,16 +44,19 @@ def test_non_text_document(self):
             ValueError, match="DocumentSplitter only works with text documents but content for document ID"
         ):
             splitter = DocumentSplitter()
+            splitter.warm_up()
             splitter.run(documents=[Document()])
         assert "DocumentSplitter only works with text documents but content for document ID" in caplog.text

     def test_single_doc(self):
         with pytest.raises(TypeError, match="DocumentSplitter expects a List of Documents as input."):
             splitter = DocumentSplitter()
+            splitter.warm_up()
             splitter.run(documents=Document())

     def test_empty_list(self):
         splitter = DocumentSplitter()
+        splitter.warm_up()
         res = splitter.run(documents=[])
         assert res == {"documents": []}

@@ -76,6 +79,7 @@ def test_unsupported_split_overlap(self):
     def test_split_by_word(self):
         splitter = DocumentSplitter(split_by="word", split_length=10)
         text = "This is a text with some words. There is a second sentence. And there is a third sentence."
+        splitter.warm_up()
         result = splitter.run(documents=[Document(content=text)])
         docs = result["documents"]
         assert len(docs) == 2

@@ -88,6 +92,7 @@ def test_split_by_word(self):

     def test_split_by_word_with_threshold(self):
         splitter = DocumentSplitter(split_by="word", split_length=15, split_threshold=10)
+        splitter.warm_up()
         result = splitter.run(
             documents=[
                 Document(

@@ -105,6 +110,7 @@ def test_split_by_word_multiple_input_docs(self):
         splitter = DocumentSplitter(split_by="word", split_length=10)
         text1 = "This is a text with some words. There is a second sentence. And there is a third sentence."
         text2 = "This is a different text with some words. There is a second sentence. And there is a third sentence. And there is a fourth sentence."
+        splitter.warm_up()
         result = splitter.run(documents=[Document(content=text1), Document(content=text2)])
         docs = result["documents"]
         assert len(docs) == 5

@@ -132,6 +138,7 @@ def test_split_by_word_multiple_input_docs(self):
     def test_split_by_period(self):
         splitter = DocumentSplitter(split_by="period", split_length=1)
         text = "This is a text with some words. There is a second sentence. And there is a third sentence."
+        splitter.warm_up()
         result = splitter.run(documents=[Document(content=text)])
         docs = result["documents"]
         assert len(docs) == 3

@@ -148,6 +155,7 @@ def test_split_by_period(self):
     def test_split_by_passage(self):
         splitter = DocumentSplitter(split_by="passage", split_length=1)
         text = "This is a text with some words. There is a second sentence.\n\nAnd there is a third sentence.\n\n And another passage."
+        splitter.warm_up()
         result = splitter.run(documents=[Document(content=text)])
         docs = result["documents"]
         assert len(docs) == 3

@@ -164,6 +172,7 @@ def test_split_by_passage(self):
     def test_split_by_page(self):
         splitter = DocumentSplitter(split_by="page", split_length=1)
         text = "This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
+        splitter.warm_up()
         result = splitter.run(documents=[Document(content=text)])
         docs = result["documents"]
         assert len(docs) == 3

@@ -183,6 +192,7 @@ def test_split_by_page(self):
     def test_split_by_function(self):
         splitting_function = lambda s: s.split(".")
         splitter = DocumentSplitter(split_by="function", splitting_function=splitting_function)
+        splitter.warm_up()
         text = "This.Is.A.Test"
         result = splitter.run(documents=[Document(id="1", content=text, meta={"key": "value"})])
         docs = result["documents"]

@@ -200,6 +210,7 @@ def test_split_by_function(self):
         splitting_function = lambda s: re.split(r"[\s]{2,}", s)
         splitter = DocumentSplitter(split_by="function", splitting_function=splitting_function)
         text = "This Is\n A Test"
+        splitter.warm_up()
         result = splitter.run(documents=[Document(id="1", content=text, meta={"key": "value"})])
         docs = result["documents"]
         assert len(docs) == 4

@@ -215,6 +226,7 @@ def test_split_by_function(self):
     def test_split_by_word_with_overlap(self):
         splitter = DocumentSplitter(split_by="word", split_length=10, split_overlap=2)
         text = "This is a text with some words. There is a second sentence. And there is a third sentence."
+        splitter.warm_up()
         result = splitter.run(documents=[Document(content=text)])
         docs = result["documents"]
         assert len(docs) == 2

@@ -234,6 +246,7 @@ def test_split_by_word_with_overlap(self):
     def test_split_by_line(self):
         splitter = DocumentSplitter(split_by="line", split_length=1)
         text = "This is a text with some words.\nThere is a second sentence.\nAnd there is a third sentence."
+        splitter.warm_up()
         result = splitter.run(documents=[Document(content=text)])
         docs = result["documents"]

@@ -252,6 +265,7 @@ def test_source_id_stored_in_metadata(self):
         splitter = DocumentSplitter(split_by="word", split_length=10)
         doc1 = Document(content="This is a text with some words.")
         doc2 = Document(content="This is a different text with some words.")
+        splitter.warm_up()
         result = splitter.run(documents=[doc1, doc2])
         assert result["documents"][0].meta["source_id"] == doc1.id
         assert result["documents"][1].meta["source_id"] == doc2.id

@@ -262,6 +276,7 @@ def test_copy_metadata(self):
             Document(content="Text.", meta={"name": "doc 0"}),
             Document(content="Text.", meta={"name": "doc 1"}),
         ]
+        splitter.warm_up()
         result = splitter.run(documents=documents)
         assert len(result["documents"]) == 2
         assert result["documents"][0].id != result["documents"][1].id

@@ -273,6 +288,7 @@ def test_add_page_number_to_metadata_with_no_overlap_word_split(self):
         splitter = DocumentSplitter(split_by="word", split_length=2)
         doc1 = Document(content="This is some text.\f This text is on another page.")
         doc2 = Document(content="This content has two.\f\f page brakes.")
+        splitter.warm_up()
         result = splitter.run(documents=[doc1, doc2])

         expected_pages = [1, 1, 2, 2, 2, 1, 1, 3]

@@ -283,6 +299,7 @@ def test_add_page_number_to_metadata_with_no_overlap_period_split(self):
         splitter = DocumentSplitter(split_by="period", split_length=1)
         doc1 = Document(content="This is some text.\f This text is on another page.")
         doc2 = Document(content="This content has two.\f\f page brakes.")
+        splitter.warm_up()
         result = splitter.run(documents=[doc1, doc2])

         expected_pages = [1, 1, 1, 1]

@@ -294,6 +311,7 @@ def test_add_page_number_to_metadata_with_no_overlap_passage_split(self):
         doc1 = Document(
             content="This is a text with some words.\f There is a second sentence.\n\nAnd there is a third sentence.\n\nAnd more passages.\n\n\f And another passage."
         )
+        splitter.warm_up()
         result = splitter.run(documents=[doc1])

         expected_pages = [1, 2, 2, 2]

@@ -305,6 +323,7 @@ def test_add_page_number_to_metadata_with_no_overlap_page_split(self):
         doc1 = Document(
             content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
         )
+        splitter.warm_up()
         result = splitter.run(documents=[doc1])
         expected_pages = [1, 2, 3]
         for doc, p in zip(result["documents"], expected_pages):

@@ -314,6 +333,7 @@ def test_add_page_number_to_metadata_with_no_overlap_page_split(self):
         doc1 = Document(
             content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
         )
+        splitter.warm_up()
         result = splitter.run(documents=[doc1])
         expected_pages = [1, 3]

@@ -324,6 +344,7 @@ def test_add_page_number_to_metadata_with_overlap_word_split(self):
         splitter = DocumentSplitter(split_by="word", split_length=3, split_overlap=1)
         doc1 = Document(content="This is some text. And\f this text is on another page.")
         doc2 = Document(content="This content has two.\f\f page brakes.")
+        splitter.warm_up()
         result = splitter.run(documents=[doc1, doc2])

         expected_pages = [1, 1, 1, 2, 2, 1, 1, 3]

@@ -334,6 +355,7 @@ def test_add_page_number_to_metadata_with_overlap_period_split(self):
         splitter = DocumentSplitter(split_by="period", split_length=2, split_overlap=1)
         doc1 = Document(content="This is some text. And this is more text.\f This text is on another page. End.")
         doc2 = Document(content="This content has two.\f\f page brakes. More text.")
+        splitter.warm_up()
         result = splitter.run(documents=[doc1, doc2])

         expected_pages = [1, 1, 1, 2, 1, 1]

@@ -345,6 +367,7 @@ def test_add_page_number_to_metadata_with_overlap_passage_split(self):
         doc1 = Document(
             content="This is a text with some words.\f There is a second sentence.\n\nAnd there is a third sentence.\n\nAnd more passages.\n\n\f And another passage."
         )
+        splitter.warm_up()
         result = splitter.run(documents=[doc1])

         expected_pages = [1, 2, 2]

@@ -356,6 +379,7 @@ def test_add_page_number_to_metadata_with_overlap_page_split(self):
         doc1 = Document(
             content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
         )
+        splitter.warm_up()
         result = splitter.run(documents=[doc1])
         expected_pages = [1, 2, 3]

@@ -366,6 +390,7 @@ def test_add_split_overlap_information(self):
         splitter = DocumentSplitter(split_length=10, split_overlap=5, split_by="word")
         text = "This is a text with some words. There is a second sentence. And a third sentence."
         doc = Document(content="This is a text with some words. There is a second sentence. And a third sentence.")
+        splitter.warm_up()
         docs = splitter.run(documents=[doc])["documents"]

         # check split_overlap is added to all the documents

@@ -487,6 +512,7 @@ def test_run_empty_document(self):
         """
         splitter = DocumentSplitter()
         doc = Document(content="")
+        splitter.warm_up()
         results = splitter.run([doc])
         assert results["documents"] == []

@@ -496,6 +522,7 @@ def test_run_document_only_whitespaces(self):
         """
         splitter = DocumentSplitter()
         doc = Document(content=" ")
+        splitter.warm_up()
         results = splitter.run([doc])
         assert results["documents"][0].content == " "

@@ -543,6 +570,7 @@ def test_run_split_by_sentence_1(self) -> None:
             "Moonlight shimmered softly, wolves howled nearby, night enveloped everything. It was a dark night ... "
             "The moon was full."
         )
+        document_splitter.warm_up()
         documents = document_splitter.run(documents=[Document(content=text)])["documents"]

         assert len(documents) == 2

@@ -568,6 +596,7 @@ def test_run_split_by_sentence_2(self) -> None:
             "This is another test sentence. (This is a third test sentence.) "
             "This is the last test sentence."
         )
+        document_splitter.warm_up()
         documents = document_splitter.run(documents=[Document(content=text)])["documents"]

         assert len(documents) == 4

@@ -601,6 +630,7 @@ def test_run_split_by_sentence_3(self) -> None:
             use_split_rules=True,
             extend_abbreviations=True,
         )
+        document_splitter.warm_up()

         text = "Sentence on page 1.\fSentence on page 2. \fSentence on page 3. \f\f Sentence on page 5."
         documents = document_splitter.run(documents=[Document(content=text)])["documents"]

@@ -633,6 +663,7 @@ def test_run_split_by_sentence_4(self) -> None:
             use_split_rules=True,
             extend_abbreviations=True,
         )
+        document_splitter.warm_up()

         text = "Sentence on page 1.\fSentence on page 2. \fSentence on page 3. \f\f Sentence on page 5."
         documents = document_splitter.run(documents=[Document(content=text)])["documents"]

@@ -660,6 +691,7 @@ def test_run_split_by_word_respect_sentence_boundary(self) -> None:
             language="en",
             respect_sentence_boundary=True,
         )
+        document_splitter.warm_up()

         text = (
             "Moonlight shimmered softly, wolves howled nearby, night enveloped everything. It was a dark night.\f"

@@ -692,6 +724,7 @@ def test_run_split_by_word_respect_sentence_boundary_no_repeats(self) -> None:
             use_split_rules=False,
             extend_abbreviations=False,
         )
+        document_splitter.warm_up()
         text = (
             "This is a test sentence with many many words that exceeds the split length and should not be repeated. "
             "This is another test sentence. (This is a third test sentence.) "

@@ -717,6 +750,7 @@ def test_run_split_by_word_respect_sentence_boundary_with_split_overlap_and_page
             extend_abbreviations=True,
             respect_sentence_boundary=True,
         )
+        document_splitter.warm_up()

         text = (
             "Sentence on page 1. Another on page 1.\fSentence on page 2. Another on page 2.\f"
