Skip to content

Commit 28ad78c

Browse files
authored
feat: Add XLSXToDocument converter (#8522)
* Add draft of the Excel To Document converter * Add license header * Add release note * Use Union instead of pipe * Add openpyxl as additional dep * Fix zip issue * few updates from Bijay * Update deps * Add markdown test * Adding more example excels and expanding tests * Added more tests * Fix windows test by setting lineterminator * Addressing PR comments * PR comments * Fix linting
1 parent bc30105 commit 28ad78c

File tree

9 files changed

+329
-1
lines changed

9 files changed

+329
-1
lines changed

Diff for: docs/pydoc/config/converters_api.yml

+1
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ loaders:
1616
"pypdf",
1717
"tika",
1818
"txt",
19+
"xlsx",
1920
]
2021
ignore_when_discovered: ["__init__"]
2122
processors:

Diff for: haystack/components/converters/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from haystack.components.converters.pypdf import PyPDFToDocument
1616
from haystack.components.converters.tika import TikaDocumentConverter
1717
from haystack.components.converters.txt import TextFileToDocument
18+
from haystack.components.converters.xlsx import XLSXToDocument
1819

1920
__all__ = [
2021
"TextFileToDocument",
@@ -31,4 +32,5 @@
3132
"PPTXToDocument",
3233
"CSVToDocument",
3334
"JSONConverter",
35+
"XLSXToDocument",
3436
]

Diff for: haystack/components/converters/xlsx.py

+180
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,180 @@
1+
# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]>
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
import io
6+
from pathlib import Path
7+
from typing import Any, Dict, List, Literal, Optional, Tuple, Union
8+
9+
import pandas as pd
10+
11+
from haystack import Document, component, logging
12+
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
13+
from haystack.dataclasses import ByteStream
14+
from haystack.lazy_imports import LazyImport
15+
16+
logger = logging.getLogger(__name__)
17+
18+
with LazyImport("Run 'pip install openpyxl'") as xlsx_import:
19+
import openpyxl # pylint: disable=unused-import # the library is used but not directly referenced
20+
21+
with LazyImport("Run 'pip install tabulate'") as tabulate_import:
22+
from tabulate import tabulate # pylint: disable=unused-import # the library is used but not directly referenced
23+
24+
25+
@component
class XLSXToDocument:
    """
    Converts XLSX (Excel) files into Documents.

    Supports reading data from specific sheets or all sheets in the Excel file. If all sheets are read, a Document is
    created for each sheet. The content of the Document is the table which can be saved in CSV or Markdown format.

    ### Usage example

    ```python
    from datetime import datetime

    from haystack.components.converters.xlsx import XLSXToDocument

    converter = XLSXToDocument()
    results = converter.run(sources=["sample.xlsx"], meta={"date_added": datetime.now().isoformat()})
    documents = results["documents"]
    print(documents[0].content)
    # ,A,B
    # 1,col_a,col_b
    # 2,1.5,test
    ```
    """

    def __init__(
        self,
        table_format: Literal["csv", "markdown"] = "csv",
        sheet_name: Union[str, int, List[Union[str, int]], None] = None,
        read_excel_kwargs: Optional[Dict[str, Any]] = None,
        table_format_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """
        Creates a XLSXToDocument component.

        :param table_format: The format to convert the Excel file to.
        :param sheet_name: The name of the sheet to read. If None, all sheets are read.
        :param read_excel_kwargs: Additional arguments to pass to `pandas.read_excel`.
            See https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html#pandas-read-excel
            The `sheet_name`, `header` and `engine` entries are always set by this component, so values given for
            these keys here are ignored.
        :param table_format_kwargs: Additional keyword arguments to pass to the table format function.
            - If `table_format` is "csv", these arguments are passed to `pandas.DataFrame.to_csv`.
              See https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html#pandas-dataframe-to-csv
            - If `table_format` is "markdown", these arguments are passed to `pandas.DataFrame.to_markdown`.
              See https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_markdown.html#pandas-dataframe-to-markdown
        :raises ValueError: If `table_format` is neither "csv" nor "markdown".
        """
        xlsx_import.check()
        # Validate before storing anything so a rejected value never ends up on the instance.
        if table_format not in ("csv", "markdown"):
            raise ValueError(f"Unsupported export format: {table_format}. Choose either 'csv' or 'markdown'.")
        if table_format == "markdown":
            # tabulate is only needed for the Markdown output (used by DataFrame.to_markdown).
            tabulate_import.check()
        self.table_format = table_format
        self.sheet_name = sheet_name
        self.read_excel_kwargs = read_excel_kwargs or {}
        self.table_format_kwargs = table_format_kwargs or {}

    @component.output_types(documents=List[Document])
    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ) -> Dict[str, List[Document]]:
        """
        Converts XLSX files to Documents, one Document per sheet that is read.

        Sources that cannot be read or converted are skipped and a warning is logged.

        :param sources:
            List of file paths or ByteStream objects.
        :param meta:
            Optional metadata to attach to the documents.
            This value can be either a list of dictionaries or a single dictionary.
            If it's a single dictionary, its content is added to the metadata of all produced documents.
            If it's a list, the length of the list must match the number of sources, because the two lists will
            be zipped.
            If `sources` contains ByteStream objects, their `meta` will be added to the output documents.
        :returns:
            A dictionary with the following keys:
            - `documents`: Created documents
        """
        documents = []

        meta_list = normalize_metadata(meta, sources_count=len(sources))

        for source, metadata in zip(sources, meta_list):
            try:
                bytestream = get_bytestream_from_source(source)
            except Exception as e:
                logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
                continue

            try:
                tables, tables_metadata = self._extract_tables(bytestream)
            except Exception as e:
                logger.warning(
                    "Could not read {source} and convert it to a Document, skipping. Error: {error}",
                    source=source,
                    error=e,
                )
                continue

            # Create one Document per extracted table. On key clashes the later mapping wins:
            # ByteStream meta < user-provided meta < sheet metadata.
            for table, excel_metadata in zip(tables, tables_metadata):
                merged_metadata = {**bytestream.meta, **metadata, **excel_metadata}
                documents.append(Document(content=table, meta=merged_metadata))

        return {"documents": documents}

    @staticmethod
    def _generate_excel_column_names(n_cols: int) -> List[str]:
        """
        Generates the first `n_cols` Excel-style column labels: A, B, ..., Z, AA, AB, ...

        :param n_cols: Number of labels to generate.
        :returns: The labels in column order.
        """
        result = []
        for i in range(n_cols):
            col_name = ""
            num = i
            # Column labels have no zero digit (Z is followed by AA), hence the extra -1 when
            # moving on to the next, more significant letter.
            while num >= 0:
                col_name = chr(num % 26 + 65) + col_name
                num = num // 26 - 1
            result.append(col_name)
        return result

    def _format_table(self, df: "pd.DataFrame") -> str:
        """
        Renders a DataFrame as text in the configured `table_format`.

        Entries of `table_format_kwargs` take precedence over the defaults set here.
        """
        if self.table_format == "csv":
            resolved_kwargs = {"index": True, "header": True, "lineterminator": "\n", **self.table_format_kwargs}
            return df.to_csv(**resolved_kwargs)

        resolved_kwargs = {"index": True, "headers": df.columns, "tablefmt": "pipe", **self.table_format_kwargs}
        # to_markdown uses tabulate
        return df.to_markdown(**resolved_kwargs)

    def _extract_tables(self, bytestream: ByteStream) -> Tuple[List[str], List[Dict[str, Any]]]:
        """
        Extracts the tables from an Excel file.

        :param bytestream: The ByteStream holding the content of the Excel file.
        :returns: A tuple with the rendered tables and, in the same order, one metadata dictionary per table
            holding the key of the sheet it was read from.
        """
        resolved_read_excel_kwargs = {
            **self.read_excel_kwargs,
            "sheet_name": self.sheet_name,
            "header": None,  # Don't assign any pandas column labels
            "engine": "openpyxl",  # Use openpyxl as the engine to read the Excel file
        }
        sheet_to_dataframe = pd.read_excel(io=io.BytesIO(bytestream.data), **resolved_read_excel_kwargs)
        if isinstance(sheet_to_dataframe, pd.DataFrame):
            # A single DataFrame (instead of a dict of them) is returned when only one sheet was requested.
            sheet_to_dataframe = {self.sheet_name: sheet_to_dataframe}

        tables = []
        metadata = []
        for key, df in sheet_to_dataframe.items():
            # Row starts at 1 in Excel
            df.index = df.index + 1
            # Excel column names are Alphabet Characters
            df.columns = self._generate_excel_column_names(df.shape[1])
            tables.append(self._format_table(df))
            # add sheet_name to metadata
            metadata.append({"xlsx": {"sheet_name": key}})
        return tables, metadata

Diff for: pyproject.toml

+3-1
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,9 @@ extra-dependencies = [
106106
"trafilatura", # HTMLToDocument
107107
"python-pptx", # PPTXToDocument
108108
"python-docx", # DocxToDocument
109-
"jq", #JSONConverter
109+
"jq", # JSONConverter
110+
"openpyxl", # XLSXToDocument
111+
"tabulate", # XLSXToDocument
110112

111113
"nltk", # NLTKDocumentSplitter
112114

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
---
2+
features:
3+
- |
4+
Add XLSXToDocument converter that loads an Excel file using pandas and openpyxl and, by default, converts each sheet into a separate Document in CSV format.

Diff for: test/components/converters/test_xlsx_to_document.py

+139
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
import logging
2+
from typing import Union
3+
4+
import pytest
5+
6+
from haystack.components.converters.xlsx import XLSXToDocument
7+
8+
9+
class TestXLSXToDocument:
    """Tests for the XLSXToDocument converter, run against the workbooks in `test_files/xlsx`."""

    def test_init(self) -> None:
        converter = XLSXToDocument()
        assert converter.sheet_name is None
        assert converter.read_excel_kwargs == {}
        assert converter.table_format == "csv"
        assert converter.table_format_kwargs == {}

    def test_init_unsupported_table_format(self) -> None:
        with pytest.raises(ValueError, match="Unsupported export format"):
            XLSXToDocument(table_format="xml")  # type: ignore[arg-type]

    def test_run_basic_tables(self, test_files_path) -> None:
        converter = XLSXToDocument()
        paths = [test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"]
        results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"})
        documents = results["documents"]
        # One Document per sheet
        assert len(documents) == 2
        assert documents[0].content == ",A,B\n1,col_a,col_b\n2,1.5,test\n"
        assert documents[0].meta == {
            "date_added": "2022-01-01T00:00:00",
            "file_path": str(test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"),
            "xlsx": {"sheet_name": "Basic Table"},
        }
        assert documents[1].content == ",A,B\n1,col_c,col_d\n2,True,\n"
        assert documents[1].meta == {
            "date_added": "2022-01-01T00:00:00",
            "file_path": str(test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"),
            "xlsx": {"sheet_name": "Table Missing Value"},
        }

    def test_run_table_empty_rows_and_columns(self, test_files_path) -> None:
        converter = XLSXToDocument()
        paths = [test_files_path / "xlsx" / "table_empty_rows_and_columns.xlsx"]
        results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"})
        documents = results["documents"]
        assert len(documents) == 1
        assert documents[0].content == ",A,B,C\n1,,,\n2,,,\n3,,,\n4,,col_a,col_b\n5,,1.5,test\n"
        assert documents[0].meta == {
            "date_added": "2022-01-01T00:00:00",
            "file_path": str(test_files_path / "xlsx" / "table_empty_rows_and_columns.xlsx"),
            "xlsx": {"sheet_name": "Sheet1"},
        }

    def test_run_multiple_tables_in_one_sheet(self, test_files_path) -> None:
        converter = XLSXToDocument()
        paths = [test_files_path / "xlsx" / "multiple_tables.xlsx"]
        results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"})
        documents = results["documents"]
        assert len(documents) == 1
        assert (
            documents[0].content
            == ",A,B,C,D,E,F\n1,,,,,,\n2,,,,,,\n3,,col_a,col_b,,,\n4,,1.5,test,,col_c,col_d\n5,,,,,3,True\n"
        )
        assert documents[0].meta == {
            "date_added": "2022-01-01T00:00:00",
            "file_path": str(test_files_path / "xlsx" / "multiple_tables.xlsx"),
            "xlsx": {"sheet_name": "Sheet1"},
        }

    def test_run_markdown(self, test_files_path) -> None:
        converter = XLSXToDocument(table_format="markdown")
        paths = [test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"]
        results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"})
        documents = results["documents"]
        assert len(documents) == 2
        # The "pipe" table format pads every cell to its column width, so the header and data rows are
        # exactly as wide as the separator row.
        assert (
            documents[0].content
            == "|    | A     | B     |\n|---:|:------|:------|\n|  1 | col_a | col_b |\n|  2 | 1.5   | test  |"
        )
        assert documents[0].meta == {
            "date_added": "2022-01-01T00:00:00",
            "file_path": str(test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"),
            "xlsx": {"sheet_name": "Basic Table"},
        }
        assert (
            documents[1].content
            == "|    | A     | B     |\n|---:|:------|:------|\n|  1 | col_c | col_d |\n|  2 | True  | nan   |"
        )
        assert documents[1].meta == {
            "date_added": "2022-01-01T00:00:00",
            "file_path": str(test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"),
            "xlsx": {"sheet_name": "Table Missing Value"},
        }

    @pytest.mark.parametrize(
        "sheet_name, expected_sheet_name, expected_content",
        [
            ("Basic Table", "Basic Table", ",A,B\n1,col_a,col_b\n2,1.5,test\n"),
            ("Table Missing Value", "Table Missing Value", ",A,B\n1,col_c,col_d\n2,True,\n"),
            (0, 0, ",A,B\n1,col_a,col_b\n2,1.5,test\n"),
            (1, 1, ",A,B\n1,col_c,col_d\n2,True,\n"),
        ],
    )
    def test_run_sheet_name(
        self,
        sheet_name: Union[int, str],
        expected_sheet_name: Union[int, str],
        expected_content: str,
        test_files_path,
    ) -> None:
        converter = XLSXToDocument(sheet_name=sheet_name)
        paths = [test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"]
        results = converter.run(sources=paths)
        documents = results["documents"]
        assert len(documents) == 1
        assert documents[0].content == expected_content
        # The sheet is reported under the key it was requested with (name or position).
        assert documents[0].meta == {
            "file_path": str(test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"),
            "xlsx": {"sheet_name": expected_sheet_name},
        }

    def test_run_with_read_excel_kwargs(self, test_files_path) -> None:
        converter = XLSXToDocument(sheet_name="Basic Table", read_excel_kwargs={"skiprows": 1})
        paths = [test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"]
        results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"})
        documents = results["documents"]
        assert len(documents) == 1
        assert documents[0].content == ",A,B\n1,1.5,test\n"
        assert documents[0].meta == {
            "date_added": "2022-01-01T00:00:00",
            "file_path": str(test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"),
            "xlsx": {"sheet_name": "Basic Table"},
        }

    def test_run_error_wrong_file_type(self, caplog: pytest.LogCaptureFixture, test_files_path) -> None:
        converter = XLSXToDocument()
        sources = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
        with caplog.at_level(logging.WARNING):
            results = converter.run(sources=sources)
        assert "sample_pdf_1.pdf and convert it" in caplog.text
        assert results["documents"] == []

    def test_run_error_non_existent_file(self, caplog: pytest.LogCaptureFixture) -> None:
        converter = XLSXToDocument()
        paths = ["non_existing_file.docx"]
        with caplog.at_level(logging.WARNING):
            results = converter.run(sources=paths)
        assert "Could not read non_existing_file.docx" in caplog.text
        # An unreadable source is skipped, so nothing is produced for it.
        assert results["documents"] == []

Diff for: test/test_files/xlsx/basic_tables_two_sheets.xlsx

11.4 KB
Binary file not shown.

Diff for: test/test_files/xlsx/multiple_tables.xlsx

4.74 KB
Binary file not shown.
4.63 KB
Binary file not shown.

0 commit comments

Comments
 (0)