-
Notifications
You must be signed in to change notification settings - Fork 53
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add convert_file_to_pdf helper using libreoffice (#670)
* Add doc_to_pdf helper using libreoffice * Some cleanup * Lint * Move to utils package * Add ci_only tests * Revert "Add ci_only tests" This reverts commit 14917d9. * Use shutil to ensure libreoffice is available
- Loading branch information
Showing
4 changed files
with
73 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file added
BIN
+138 KB
lib/sycamore/sycamore/tests/resources/data/docx/aryn_website_sample.docx
Binary file not shown.
27 changes: 27 additions & 0 deletions
27
lib/sycamore/sycamore/tests/unit/utils/test_fileformat_tools.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
import logging | ||
import os | ||
import shutil | ||
from io import BytesIO | ||
|
||
from pypdf import PdfReader | ||
|
||
import sycamore | ||
from sycamore.tests.config import TEST_DIR | ||
from sycamore.utils.fileformat_tools import binary_representation_to_pdf | ||
|
||
|
||
def test_binary_representation_to_pdf():
    """Convert the sample .docx to PDF and check that the result has two pages.

    When ``libreoffice`` is not found on PATH the test logs a warning and returns early;
    it asserts that this never happens when GITHUB_ACTIONS is set in the environment.
    """
    libreoffice_missing = shutil.which("libreoffice") is None
    if libreoffice_missing:
        # A missing LibreOffice is only tolerated outside of GitHub Actions.
        assert "GITHUB_ACTIONS" not in os.environ
        logging.warning("Skipping test ...; /usr/bin/libreoffice is not installed")
        return

    sample_docx = str(TEST_DIR / "resources/data/docx/aryn_website_sample.docx")

    ctx = sycamore.init()
    first_docs = ctx.read.binary(sample_docx, binary_format="docx").take(1)
    converted = binary_representation_to_pdf(first_docs[0])

    # Parse the converted bytes to confirm they form a readable PDF.
    page_count = len(PdfReader(BytesIO(converted.binary_representation)).pages)
    assert page_count == 2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
import subprocess | ||
from pathlib import Path | ||
from tempfile import NamedTemporaryFile | ||
|
||
from sycamore.data import Document | ||
|
||
|
||
def binary_representation_to_pdf(doc: Document) -> Document:
    """
    Convert a document's binary_representation to PDF, using LibreOffice as the conversion engine.

    The bytes are written to a temporary file whose suffix is taken from
    ``doc.properties["path"]`` (no suffix when that property is absent), converted with
    ``libreoffice --headless --convert-to pdf``, and the resulting PDF bytes are stored back
    on the document. The intermediate PDF file is removed afterwards.

    Note: LibreOffice currently requires manual installation based on your platform.

    Args:
        doc: Document whose ``binary_representation`` holds the file contents to convert.

    Returns:
        The same ``doc`` object, modified in place: ``binary_representation`` holds the PDF
        bytes and ``properties["filetype"]`` is set to ``"application/pdf"``.

    Raises:
        AssertionError: if ``doc.binary_representation`` is None.
        subprocess.CalledProcessError: if LibreOffice exits with a non-zero status.
        FileNotFoundError: if the ``libreoffice`` executable cannot be launched, or if
            LibreOffice finished without producing the expected PDF file.
    """
    assert doc.binary_representation is not None

    extension = Path(doc.properties.get("path", "unknown")).suffix

    with NamedTemporaryFile(suffix=extension) as temp_file:
        temp_file.write(doc.binary_representation)
        # Flush so that LibreOffice, a separate process, sees the complete contents on disk.
        temp_file.flush()

        temp_path = Path(temp_file.name)
        # The converted file is expected next to the input, as <outdir>/<input stem>.pdf.
        pdf_path = temp_path.parent / f"{temp_path.stem}.pdf"

        try:
            subprocess.run(
                [
                    "libreoffice",
                    "--headless",
                    "--convert-to",
                    "pdf",
                    str(temp_path),
                    "--outdir",
                    str(temp_path.parent),
                ],
                # Surface conversion failures here instead of as a confusing missing-file error later.
                check=True,
            )
            if not pdf_path.is_file():
                raise FileNotFoundError(f"LibreOffice did not produce the expected output file: {pdf_path}")
            pdf_bytes = pdf_path.read_bytes()
        finally:
            # Do not leave the converted file behind in the temp directory. When the input
            # already has a .pdf suffix the two paths coincide and NamedTemporaryFile owns
            # the cleanup, so leave that file alone.
            if pdf_path != temp_path and pdf_path.exists():
                pdf_path.unlink()

    doc.binary_representation = pdf_bytes
    doc.properties["filetype"] = "application/pdf"

    return doc