@@ -50,7 +50,7 @@ class DocumentSplitter:
50
50
51
51
def __init__ ( # pylint: disable=too-many-positional-arguments
52
52
self ,
53
- split_by : Literal ["function" , "page" , "passage" , "sentence" , "word" ] = "word" ,
53
+ split_by : Literal ["function" , "page" , "passage" , "sentence" , "word" , "line" ] = "word" ,
54
54
split_length : int = 200 ,
55
55
split_overlap : int = 0 ,
56
56
split_threshold : int = 0 ,
@@ -61,7 +61,7 @@ def __init__( # pylint: disable=too-many-positional-arguments
61
61
62
62
:param split_by: The unit for splitting your documents. Choose from `word` for splitting by spaces (" "),
63
63
`sentence` for splitting by periods ("."), `page` for splitting by form feed ("\\ f"),
64
- or `passage` for splitting by double line breaks ("\\ n\\ n").
64
+ `passage` for splitting by double line breaks ("\\ n\\ n"), or `line` for splitting at each line break ("\\ n").
65
65
:param split_length: The maximum number of units in each split.
66
66
:param split_overlap: The number of overlapping units for each split.
67
67
:param split_threshold: The minimum number of units per split. If a split has fewer units
@@ -72,8 +72,8 @@ def __init__( # pylint: disable=too-many-positional-arguments
72
72
"""
73
73
74
74
self .split_by = split_by
75
- if split_by not in ["function" , "page" , "passage" , "sentence" , "word" ]:
76
- raise ValueError ("split_by must be one of 'word', 'sentence', 'page' or 'passage '." )
75
+ if split_by not in ["function" , "page" , "passage" , "sentence" , "word" , "line" ]:
76
+ raise ValueError ("split_by must be one of 'word', 'sentence', 'page', 'passage' or 'line '." )
77
77
if split_by == "function" and splitting_function is None :
78
78
raise ValueError ("When 'split_by' is set to 'function', a valid 'splitting_function' must be provided." )
79
79
if split_length <= 0 :
@@ -129,7 +129,7 @@ def run(self, documents: List[Document]):
129
129
return {"documents" : split_docs }
130
130
131
131
def _split_into_units (
132
- self , text : str , split_by : Literal ["function" , "page" , "passage" , "sentence" , "word" ]
132
+ self , text : str , split_by : Literal ["function" , "page" , "passage" , "sentence" , "word" , "line" ]
133
133
) -> List [str ]:
134
134
if split_by == "page" :
135
135
self .split_at = "\f "
@@ -139,11 +139,14 @@ def _split_into_units(
139
139
self .split_at = "."
140
140
elif split_by == "word" :
141
141
self .split_at = " "
142
+ elif split_by == "line" :
143
+ self .split_at = "\n "
142
144
elif split_by == "function" and self .splitting_function is not None :
143
145
return self .splitting_function (text )
144
146
else :
145
147
raise NotImplementedError (
146
- "DocumentSplitter only supports 'function', 'page', 'passage', 'sentence' or 'word' split_by options."
148
+ """DocumentSplitter only supports 'function', 'line', 'page',
149
+ 'passage', 'sentence' or 'word' split_by options."""
147
150
)
148
151
units = text .split (self .split_at )
149
152
# Add the delimiter back to all units except the last one
0 commit comments