Skip to content

Commit a045c0e

Browse files
srini047 and sjrl authored
feat: added split by line to DocumentSplitter (#8525)
* feat: added split by line to DocumentSplitter
* fix: pr review comments

Co-authored-by: Sebastian Husch Lee <[email protected]>

---------

Co-authored-by: Sebastian Husch Lee <[email protected]>
1 parent 0c11c7b commit a045c0e

File tree

3 files changed

+33
-7
lines changed

3 files changed

+33
-7
lines changed

haystack/components/preprocessors/document_splitter.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ class DocumentSplitter:
5050

5151
def __init__( # pylint: disable=too-many-positional-arguments
5252
self,
53-
split_by: Literal["function", "page", "passage", "sentence", "word"] = "word",
53+
split_by: Literal["function", "page", "passage", "sentence", "word", "line"] = "word",
5454
split_length: int = 200,
5555
split_overlap: int = 0,
5656
split_threshold: int = 0,
@@ -61,7 +61,7 @@ def __init__( # pylint: disable=too-many-positional-arguments
6161
6262
:param split_by: The unit for splitting your documents. Choose from `word` for splitting by spaces (" "),
6363
`sentence` for splitting by periods ("."), `page` for splitting by form feed ("\\f"),
64-
or `passage` for splitting by double line breaks ("\\n\\n").
64+
`passage` for splitting by double line breaks ("\\n\\n") or `line` for splitting each line ("\\n").
6565
:param split_length: The maximum number of units in each split.
6666
:param split_overlap: The number of overlapping units for each split.
6767
:param split_threshold: The minimum number of units per split. If a split has fewer units
@@ -72,8 +72,8 @@ def __init__( # pylint: disable=too-many-positional-arguments
7272
"""
7373

7474
self.split_by = split_by
75-
if split_by not in ["function", "page", "passage", "sentence", "word"]:
76-
raise ValueError("split_by must be one of 'word', 'sentence', 'page' or 'passage'.")
75+
if split_by not in ["function", "page", "passage", "sentence", "word", "line"]:
76+
raise ValueError("split_by must be one of 'word', 'sentence', 'page', 'passage' or 'line'.")
7777
if split_by == "function" and splitting_function is None:
7878
raise ValueError("When 'split_by' is set to 'function', a valid 'splitting_function' must be provided.")
7979
if split_length <= 0:
@@ -129,7 +129,7 @@ def run(self, documents: List[Document]):
129129
return {"documents": split_docs}
130130

131131
def _split_into_units(
132-
self, text: str, split_by: Literal["function", "page", "passage", "sentence", "word"]
132+
self, text: str, split_by: Literal["function", "page", "passage", "sentence", "word", "line"]
133133
) -> List[str]:
134134
if split_by == "page":
135135
self.split_at = "\f"
@@ -139,11 +139,14 @@ def _split_into_units(
139139
self.split_at = "."
140140
elif split_by == "word":
141141
self.split_at = " "
142+
elif split_by == "line":
143+
self.split_at = "\n"
142144
elif split_by == "function" and self.splitting_function is not None:
143145
return self.splitting_function(text)
144146
else:
145147
raise NotImplementedError(
146-
"DocumentSplitter only supports 'function', 'page', 'passage', 'sentence' or 'word' split_by options."
148+
"""DocumentSplitter only supports 'function', 'line', 'page',
149+
'passage', 'sentence' or 'word' split_by options."""
147150
)
148151
units = text.split(self.split_at)
149152
# Add the delimiter back to all units except the last one
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
---
2+
enhancements:
3+
- |
4+
Added split by line to DocumentSplitter, which will split the document at \n

test/components/preprocessors/test_document_splitter.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,9 @@ def test_empty_list(self):
5656
assert res == {"documents": []}
5757

5858
def test_unsupported_split_by(self):
59-
with pytest.raises(ValueError, match="split_by must be one of 'word', 'sentence', 'page' or 'passage'."):
59+
with pytest.raises(
60+
ValueError, match="split_by must be one of 'word', 'sentence', 'page', 'passage' or 'line'."
61+
):
6062
DocumentSplitter(split_by="unsupported")
6163

6264
def test_unsupported_split_length(self):
@@ -214,6 +216,23 @@ def test_split_by_word_with_overlap(self):
214216
assert docs[1].meta["_split_overlap"][0]["range"] == (38, 43)
215217
assert docs[0].content[38:43] == "is a "
216218

219+
def test_split_by_line(self):
220+
splitter = DocumentSplitter(split_by="line", split_length=1)
221+
text = "This is a text with some words.\nThere is a second sentence.\nAnd there is a third sentence."
222+
result = splitter.run(documents=[Document(content=text)])
223+
docs = result["documents"]
224+
225+
assert len(docs) == 3
226+
assert docs[0].content == "This is a text with some words.\n"
227+
assert docs[0].meta["split_id"] == 0
228+
assert docs[0].meta["split_idx_start"] == text.index(docs[0].content)
229+
assert docs[1].content == "There is a second sentence.\n"
230+
assert docs[1].meta["split_id"] == 1
231+
assert docs[1].meta["split_idx_start"] == text.index(docs[1].content)
232+
assert docs[2].content == "And there is a third sentence."
233+
assert docs[2].meta["split_id"] == 2
234+
assert docs[2].meta["split_idx_start"] == text.index(docs[2].content)
235+
217236
def test_source_id_stored_in_metadata(self):
218237
splitter = DocumentSplitter(split_by="word", split_length=10)
219238
doc1 = Document(content="This is a text with some words.")

0 commit comments

Comments
 (0)