Skip to content

Commit 2828d9e

Browse files
authored
refactor!: DOCXToDocument converter - store DOCX metadata as a dict (#8804)
* DOCXToDocument - store DOCX metadata as a dict
* do not export DOCXMetadata to converters package
1 parent 5ae9488 commit 2828d9e

File tree

4 files changed

+77
-72
lines changed

4 files changed

+77
-72
lines changed

Diff for: haystack/components/converters/__init__.py

+1-2
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@
44

55
from haystack.components.converters.azure import AzureOCRDocumentConverter
66
from haystack.components.converters.csv import CSVToDocument
7-
from haystack.components.converters.docx import DOCXMetadata, DOCXToDocument
7+
from haystack.components.converters.docx import DOCXToDocument
88
from haystack.components.converters.html import HTMLToDocument
99
from haystack.components.converters.json import JSONConverter
1010
from haystack.components.converters.markdown import MarkdownToDocument
@@ -28,7 +28,6 @@
2828
"OpenAPIServiceToFunctions",
2929
"OutputAdapter",
3030
"DOCXToDocument",
31-
"DOCXMetadata",
3231
"PPTXToDocument",
3332
"CSVToDocument",
3433
"JSONConverter",

Diff for: haystack/components/converters/docx.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55
import csv
66
import io
77
import os
8-
from dataclasses import dataclass
8+
from dataclasses import asdict, dataclass
99
from enum import Enum
1010
from io import StringIO
1111
from pathlib import Path
@@ -189,7 +189,7 @@ def run(
189189
)
190190
continue
191191

192-
docx_metadata = self._get_docx_metadata(document=docx_document)
192+
docx_metadata = asdict(self._get_docx_metadata(document=docx_document))
193193
merged_metadata = {**bytestream.meta, **metadata, "docx": docx_metadata}
194194

195195
if not self.store_full_path and "file_path" in bytestream.meta:
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,6 @@
1+
---
2+
upgrade:
3+
- |
4+
The `DOCXToDocument` converter now returns a `Document` object with DOCX metadata stored in the `meta` field as a
5+
dictionary under the key `docx`. Previously, the metadata was represented as a `DOCXMetadata` dataclass.
6+
This change does not impact reading from or writing to a Document Store.

Diff for: test/components/converters/test_docx_file_to_document.py

+68-68
Original file line number | Diff line number | Diff line change
@@ -121,23 +121,23 @@ def test_run(self, test_files_path, docx_converter):
121121
assert docs[0].meta.keys() == {"file_path", "docx"}
122122
assert docs[0].meta == {
123123
"file_path": os.path.basename(paths[0]),
124-
"docx": DOCXMetadata(
125-
author="Microsoft Office User",
126-
category="",
127-
comments="",
128-
content_status="",
129-
created="2024-06-09T21:17:00+00:00",
130-
identifier="",
131-
keywords="",
132-
language="",
133-
last_modified_by="Carlos Fernández Lorán",
134-
last_printed=None,
135-
modified="2024-06-09T21:27:00+00:00",
136-
revision=2,
137-
subject="",
138-
title="",
139-
version="",
140-
),
124+
"docx": {
125+
"author": "Microsoft Office User",
126+
"category": "",
127+
"comments": "",
128+
"content_status": "",
129+
"created": "2024-06-09T21:17:00+00:00",
130+
"identifier": "",
131+
"keywords": "",
132+
"language": "",
133+
"last_modified_by": "Carlos Fernández Lorán",
134+
"last_printed": None,
135+
"modified": "2024-06-09T21:27:00+00:00",
136+
"revision": 2,
137+
"subject": "",
138+
"title": "",
139+
"version": "",
140+
},
141141
}
142142

143143
def test_run_with_table(self, test_files_path):
@@ -153,23 +153,23 @@ def test_run_with_table(self, test_files_path):
153153
assert docs[0].meta.keys() == {"file_path", "docx"}
154154
assert docs[0].meta == {
155155
"file_path": os.path.basename(paths[0]),
156-
"docx": DOCXMetadata(
157-
author="Saha, Anirban",
158-
category="",
159-
comments="",
160-
content_status="",
161-
created="2020-07-14T08:14:00+00:00",
162-
identifier="",
163-
keywords="",
164-
language="",
165-
last_modified_by="Saha, Anirban",
166-
last_printed=None,
167-
modified="2020-07-14T08:16:00+00:00",
168-
revision=1,
169-
subject="",
170-
title="",
171-
version="",
172-
),
156+
"docx": {
157+
"author": "Saha, Anirban",
158+
"category": "",
159+
"comments": "",
160+
"content_status": "",
161+
"created": "2020-07-14T08:14:00+00:00",
162+
"identifier": "",
163+
"keywords": "",
164+
"language": "",
165+
"last_modified_by": "Saha, Anirban",
166+
"last_printed": None,
167+
"modified": "2020-07-14T08:16:00+00:00",
168+
"revision": 1,
169+
"subject": "",
170+
"title": "",
171+
"version": "",
172+
},
173173
}
174174
# let's now detect that the table markdown is correctly added and that order of elements is correct
175175
content_parts = docs[0].content.split("\n\n")
@@ -193,23 +193,23 @@ def test_run_with_store_full_path_false(self, test_files_path):
193193
assert docs[0].meta.keys() == {"file_path", "docx"}
194194
assert docs[0].meta == {
195195
"file_path": "sample_docx_1.docx",
196-
"docx": DOCXMetadata(
197-
author="Microsoft Office User",
198-
category="",
199-
comments="",
200-
content_status="",
201-
created="2024-06-09T21:17:00+00:00",
202-
identifier="",
203-
keywords="",
204-
language="",
205-
last_modified_by="Carlos Fernández Lorán",
206-
last_printed=None,
207-
modified="2024-06-09T21:27:00+00:00",
208-
revision=2,
209-
subject="",
210-
title="",
211-
version="",
212-
),
196+
"docx": {
197+
"author": "Microsoft Office User",
198+
"category": "",
199+
"comments": "",
200+
"content_status": "",
201+
"created": "2024-06-09T21:17:00+00:00",
202+
"identifier": "",
203+
"keywords": "",
204+
"language": "",
205+
"last_modified_by": "Carlos Fernández Lorán",
206+
"last_printed": None,
207+
"modified": "2024-06-09T21:27:00+00:00",
208+
"revision": 2,
209+
"subject": "",
210+
"title": "",
211+
"version": "",
212+
},
213213
}
214214

215215
@pytest.mark.parametrize("table_format", ["markdown", "csv"])
@@ -285,23 +285,23 @@ def test_run_with_additional_meta(self, test_files_path, docx_converter):
285285
doc = output["documents"][0]
286286
assert doc.meta == {
287287
"file_path": os.path.basename(paths[0]),
288-
"docx": DOCXMetadata(
289-
author="Microsoft Office User",
290-
category="",
291-
comments="",
292-
content_status="",
293-
created="2024-06-09T21:17:00+00:00",
294-
identifier="",
295-
keywords="",
296-
language="",
297-
last_modified_by="Carlos Fernández Lorán",
298-
last_printed=None,
299-
modified="2024-06-09T21:27:00+00:00",
300-
revision=2,
301-
subject="",
302-
title="",
303-
version="",
304-
),
288+
"docx": {
289+
"author": "Microsoft Office User",
290+
"category": "",
291+
"comments": "",
292+
"content_status": "",
293+
"created": "2024-06-09T21:17:00+00:00",
294+
"identifier": "",
295+
"keywords": "",
296+
"language": "",
297+
"last_modified_by": "Carlos Fernández Lorán",
298+
"last_printed": None,
299+
"modified": "2024-06-09T21:27:00+00:00",
300+
"revision": 2,
301+
"subject": "",
302+
"title": "",
303+
"version": "",
304+
},
305305
"language": "it",
306306
"author": "test_author",
307307
}

0 commit comments

Comments
 (0)