From 542a7f7ef5638dcec9e53d7a8d66ee04d0e7359c Mon Sep 17 00:00:00 2001 From: Nicola Procopio Date: Mon, 20 Jan 2025 09:51:47 +0100 Subject: [PATCH] fix: update meta data before initializing new Document in DocumentSplitter (#8745) * updated DocumentSplitter issue #8741 * release note * updated DocumentSplitter: in the _create_docs_from_splits function, initialize a new variable copied_meta instead of overwriting meta * added test test_duplicate_pages_get_different_doc_id * fix fmt --------- Co-authored-by: Stefano Fiorucci --- haystack/components/preprocessors/document_splitter.py | 10 +++++----- .../updated-documentsplitter-762c4409cbc296e6.yaml | 4 ++++ .../components/preprocessors/test_document_splitter.py | 8 ++++++++ 3 files changed, 17 insertions(+), 5 deletions(-) create mode 100644 releasenotes/notes/updated-documentsplitter-762c4409cbc296e6.yaml diff --git a/haystack/components/preprocessors/document_splitter.py b/haystack/components/preprocessors/document_splitter.py index d03897b4b6..949f756ae6 100644 --- a/haystack/components/preprocessors/document_splitter.py +++ b/haystack/components/preprocessors/document_splitter.py @@ -323,11 +323,11 @@ def _create_docs_from_splits( documents: List[Document] = [] for i, (txt, split_idx) in enumerate(zip(text_splits, splits_start_idxs)): - meta = deepcopy(meta) - doc = Document(content=txt, meta=meta) - doc.meta["page_number"] = splits_pages[i] - doc.meta["split_id"] = i - doc.meta["split_idx_start"] = split_idx + copied_meta = deepcopy(meta) + copied_meta["page_number"] = splits_pages[i] + copied_meta["split_id"] = i + copied_meta["split_idx_start"] = split_idx + doc = Document(content=txt, meta=copied_meta) documents.append(doc) if self.split_overlap <= 0: diff --git a/releasenotes/notes/updated-documentsplitter-762c4409cbc296e6.yaml b/releasenotes/notes/updated-documentsplitter-762c4409cbc296e6.yaml new file mode 100644 index 0000000000..f0d3a3d68e --- /dev/null +++ 
b/releasenotes/notes/updated-documentsplitter-762c4409cbc296e6.yaml @@ -0,0 +1,4 @@ +--- +enhancements: + - | + Updated Document's meta data before initializing the Document in DocumentSplitter as requested in issue #8741 diff --git a/test/components/preprocessors/test_document_splitter.py b/test/components/preprocessors/test_document_splitter.py index f9096239f2..81e0fa2ae4 100644 --- a/test/components/preprocessors/test_document_splitter.py +++ b/test/components/preprocessors/test_document_splitter.py @@ -827,3 +827,11 @@ def test_respect_sentence_boundary_serialization(self): assert deserialized.respect_sentence_boundary == True assert hasattr(deserialized, "sentence_splitter") assert deserialized.language == "de" + + def test_duplicate_pages_get_different_doc_id(self): + splitter = DocumentSplitter(split_by="page", split_length=1) + doc1 = Document(content="This is some text.\fThis is some text.\fThis is some text.\fThis is some text.") + splitter.warm_up() + result = splitter.run(documents=[doc1]) + + assert len({doc.id for doc in result["documents"]}) == 4