Skip to content

Commit 542a7f7

Browse files
nickprock and anakin87 authored
fix: update meta data before initializing new Document in DocumentSplitter (#8745)
* updated DocumentSplitter (issue #8741)
* release note
* updated DocumentSplitter: in the _create_docs_from_splits function, initialize a new variable copied_meta instead of overwriting meta
* added test test_duplicate_pages_get_different_doc_id
* fix fmt

---------

Co-authored-by: Stefano Fiorucci <[email protected]>
1 parent 242138c commit 542a7f7

File tree

3 files changed

+17
-5
lines changed

3 files changed

+17
-5
lines changed

haystack/components/preprocessors/document_splitter.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -323,11 +323,11 @@ def _create_docs_from_splits(
323323
documents: List[Document] = []
324324

325325
for i, (txt, split_idx) in enumerate(zip(text_splits, splits_start_idxs)):
326-
meta = deepcopy(meta)
327-
doc = Document(content=txt, meta=meta)
328-
doc.meta["page_number"] = splits_pages[i]
329-
doc.meta["split_id"] = i
330-
doc.meta["split_idx_start"] = split_idx
326+
copied_meta = deepcopy(meta)
327+
copied_meta["page_number"] = splits_pages[i]
328+
copied_meta["split_id"] = i
329+
copied_meta["split_idx_start"] = split_idx
330+
doc = Document(content=txt, meta=copied_meta)
331331
documents.append(doc)
332332

333333
if self.split_overlap <= 0:
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
---
2+
enhancements:
3+
- |
4+
Updated the Document's metadata before initializing the Document in DocumentSplitter, so that splits with identical content (e.g. duplicate pages) get different document IDs, as requested in issue #8741.

test/components/preprocessors/test_document_splitter.py

+8
Original file line numberDiff line numberDiff line change
@@ -827,3 +827,11 @@ def test_respect_sentence_boundary_serialization(self):
827827
assert deserialized.respect_sentence_boundary == True
828828
assert hasattr(deserialized, "sentence_splitter")
829829
assert deserialized.language == "de"
830+
831+
def test_duplicate_pages_get_different_doc_id(self):
832+
splitter = DocumentSplitter(split_by="page", split_length=1)
833+
doc1 = Document(content="This is some text.\fThis is some text.\fThis is some text.\fThis is some text.")
834+
splitter.warm_up()
835+
result = splitter.run(documents=[doc1])
836+
837+
assert len({doc.id for doc in result["documents"]}) == 4

0 commit comments

Comments
 (0)