@@ -50,7 +50,7 @@ class DocumentSplitter:
5050
5151 def __init__ ( # pylint: disable=too-many-positional-arguments
5252 self ,
53- split_by : Literal ["function" , "page" , "passage" , "sentence" , "word" ] = "word" ,
53+ split_by : Literal ["function" , "page" , "passage" , "sentence" , "word" , "line" ] = "word" ,
5454 split_length : int = 200 ,
5555 split_overlap : int = 0 ,
5656 split_threshold : int = 0 ,
@@ -61,7 +61,7 @@ def __init__( # pylint: disable=too-many-positional-arguments
6161
6262 :param split_by: The unit for splitting your documents. Choose from `word` for splitting by spaces (" "),
6363 `sentence` for splitting by periods ("."), `page` for splitting by form feed ("\\ f"),
64- or `passage` for splitting by double line breaks ("\\ n\\ n").
64+ `passage` for splitting by double line breaks ("\\ n\\ n"), or `line` for splitting by single line breaks ("\\ n").
6565 :param split_length: The maximum number of units in each split.
6666 :param split_overlap: The number of overlapping units for each split.
6767 :param split_threshold: The minimum number of units per split. If a split has fewer units
@@ -72,8 +72,8 @@ def __init__( # pylint: disable=too-many-positional-arguments
7272 """
7373
7474 self .split_by = split_by
75- if split_by not in ["function" , "page" , "passage" , "sentence" , "word" ]:
76- raise ValueError ("split_by must be one of 'word', 'sentence', 'page' or 'passage '." )
75+ if split_by not in ["function" , "page" , "passage" , "sentence" , "word" , "line" ]:
76+ raise ValueError ("split_by must be one of 'word', 'sentence', 'page', 'passage' or 'line '." )
7777 if split_by == "function" and splitting_function is None :
7878 raise ValueError ("When 'split_by' is set to 'function', a valid 'splitting_function' must be provided." )
7979 if split_length <= 0 :
@@ -129,7 +129,7 @@ def run(self, documents: List[Document]):
129129 return {"documents" : split_docs }
130130
131131 def _split_into_units (
132- self , text : str , split_by : Literal ["function" , "page" , "passage" , "sentence" , "word" ]
132+ self , text : str , split_by : Literal ["function" , "page" , "passage" , "sentence" , "word" , "line" ]
133133 ) -> List [str ]:
134134 if split_by == "page" :
135135 self .split_at = "\f "
@@ -139,11 +139,14 @@ def _split_into_units(
139139 self .split_at = "."
140140 elif split_by == "word" :
141141 self .split_at = " "
142+ elif split_by == "line" :
143+ self .split_at = "\n "
142144 elif split_by == "function" and self .splitting_function is not None :
143145 return self .splitting_function (text )
144146 else :
145147 raise NotImplementedError (
146- "DocumentSplitter only supports 'function', 'page', 'passage', 'sentence' or 'word' split_by options."
148+ """DocumentSplitter only supports 'function', 'line', 'page',
149+ 'passage', 'sentence' or 'word' split_by options."""
147150 )
148151 units = text .split (self .split_at )
149152 # Add the delimiter back to all units except the last one
0 commit comments