Skip to content

Commit

Permalink
fix: update meta data before initializing new Document in DocumentSpl…
Browse files Browse the repository at this point in the history
…itter (#8745)

* updated DocumentSplitter

issue #8741

* release note

* updated DocumentSplitter

In the _create_docs_from_splits function, initialize a new variable copied_meta instead of overwriting meta

* added test

test_duplicate_pages_get_different_doc_id

* fix fmt

---------

Co-authored-by: Stefano Fiorucci <[email protected]>
  • Loading branch information
nickprock and anakin87 authored Jan 20, 2025
1 parent 242138c commit 542a7f7
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 5 deletions.
10 changes: 5 additions & 5 deletions haystack/components/preprocessors/document_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,11 +323,11 @@ def _create_docs_from_splits(
documents: List[Document] = []

for i, (txt, split_idx) in enumerate(zip(text_splits, splits_start_idxs)):
meta = deepcopy(meta)
doc = Document(content=txt, meta=meta)
doc.meta["page_number"] = splits_pages[i]
doc.meta["split_id"] = i
doc.meta["split_idx_start"] = split_idx
copied_meta = deepcopy(meta)
copied_meta["page_number"] = splits_pages[i]
copied_meta["split_id"] = i
copied_meta["split_idx_start"] = split_idx
doc = Document(content=txt, meta=copied_meta)
documents.append(doc)

if self.split_overlap <= 0:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
enhancements:
- |
Updated Document's meta data before initializing the Document in DocumentSplitter, as requested in issue #8741
8 changes: 8 additions & 0 deletions test/components/preprocessors/test_document_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -827,3 +827,11 @@ def test_respect_sentence_boundary_serialization(self):
assert deserialized.respect_sentence_boundary == True
assert hasattr(deserialized, "sentence_splitter")
assert deserialized.language == "de"

def test_duplicate_pages_get_different_doc_id(self):
    """Splitting pages with identical text must yield one distinct document id per page."""
    # Four pages separated by form feeds, each carrying exactly the same text.
    source = Document(content="This is some text.\fThis is some text.\fThis is some text.\fThis is some text.")

    page_splitter = DocumentSplitter(split_by="page", split_length=1)
    page_splitter.warm_up()
    output = page_splitter.run(documents=[source])

    # Collect ids into a set: any collision between splits would shrink it below 4.
    unique_ids = set()
    for split_doc in output["documents"]:
        unique_ids.add(split_doc.id)
    assert len(unique_ids) == 4

0 comments on commit 542a7f7

Please sign in to comment.