Skip to content

Commit

Permalink
Add convert_file_to_pdf helper using libreoffice (#670)
Browse files Browse the repository at this point in the history
* Add doc_to_pdf helper using libreoffice

* Some cleanup

* Lint

* Move to utils package

* Add ci_only tests

* Revert "Add ci_only tests"

This reverts commit 14917d9.

* Use shutil to ensure libreoffice is available
  • Loading branch information
baitsguy authored Aug 13, 2024
1 parent 1dc25f5 commit b28d1a2
Show file tree
Hide file tree
Showing 4 changed files with 73 additions and 2 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/testing.yml
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,8 @@ jobs:
run: df
- name: Update Apt
run: sudo apt-get update
- name: Install poppler and tesseract
run: sudo apt-get install -y poppler-utils tesseract-ocr
- name: Install apt dependencies
run: sudo apt-get install -y poppler-utils tesseract-ocr libreoffice
- name: DF-5
run: df
- name: Run tests
Expand Down
Binary file not shown.
27 changes: 27 additions & 0 deletions lib/sycamore/sycamore/tests/unit/utils/test_fileformat_tools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import logging
import os
import shutil
from io import BytesIO

from pypdf import PdfReader

import sycamore
from sycamore.tests.config import TEST_DIR
from sycamore.utils.fileformat_tools import binary_representation_to_pdf


def test_binary_representation_to_pdf():
    """Convert the sample .docx through LibreOffice and check that the resulting PDF has two pages."""
    libreoffice_missing = shutil.which("libreoffice") is None
    if libreoffice_missing:
        # Skipping is only acceptable on a developer machine; under GitHub Actions
        # a missing LibreOffice must fail the test rather than skip it silently.
        assert "GITHUB_ACTIONS" not in os.environ
        logging.warning("Skipping test ...; /usr/bin/libreoffice is not installed")
        return

    sample_docx = str(TEST_DIR / "resources/data/docx/aryn_website_sample.docx")

    # Load the sample as a single binary document.
    docset = sycamore.init().read.binary(sample_docx, binary_format="docx")
    source_doc = docset.take(1)[0]

    converted = binary_representation_to_pdf(source_doc)

    # Parse the converted bytes back as a PDF to verify the page count.
    reader = PdfReader(BytesIO(converted.binary_representation))
    assert len(reader.pages) == 2
44 changes: 44 additions & 0 deletions lib/sycamore/sycamore/utils/fileformat_tools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import subprocess
from pathlib import Path
from tempfile import NamedTemporaryFile

from sycamore.data import Document


def binary_representation_to_pdf(doc: Document) -> Document:
    """
    Convert a document's binary_representation to PDF, using LibreOffice as the conversion engine.

    The bytes are written to a temporary file whose suffix is taken from ``doc.properties["path"]``
    (no suffix when that property is absent), converted by a headless ``libreoffice`` process, and
    the resulting PDF bytes are stored back on the document.

    Note: LibreOffice currently requires manual installation based on your platform.

    Args:
        doc: Document whose ``binary_representation`` is set.

    Returns:
        The same ``doc`` object, modified in place: ``binary_representation`` holds the PDF bytes
        and ``properties["filetype"]`` is ``"application/pdf"``.

    Raises:
        AssertionError: if ``doc.binary_representation`` is None.
        FileNotFoundError: if the ``libreoffice`` executable cannot be found, or if the
            conversion did not produce a PDF file.
    """

    def run_libreoffice(source_path, output_path):
        # Return the CompletedProcess so the caller can report the exit code on failure.
        return subprocess.run(
            [
                "libreoffice",
                "--headless",
                "--convert-to",
                "pdf",
                source_path,
                "--outdir",
                output_path,
            ]
        )

    assert doc.binary_representation is not None

    # With no "path" property the placeholder "unknown" yields an empty suffix.
    extension = Path(doc.properties.get("path", "unknown")).suffix

    with NamedTemporaryFile(suffix=extension) as temp_file:
        temp_file.write(doc.binary_representation)
        # Flush so the LibreOffice process sees the full contents on disk.
        temp_file.flush()

        temp_path = Path(temp_file.name)
        # LibreOffice writes "<input stem>.pdf" into the requested output directory.
        pdf_path = temp_path.parent / f"{temp_path.stem}.pdf"

        try:
            completed = run_libreoffice(temp_path, temp_path.parent)

            if not pdf_path.is_file():
                # Same exception type a plain open() would raise, but with a usable message.
                raise FileNotFoundError(
                    f"libreoffice (exit code {completed.returncode}) did not produce {pdf_path}"
                )

            doc.binary_representation = pdf_path.read_bytes()
            doc.properties["filetype"] = "application/pdf"
        finally:
            # The converted file is not managed by NamedTemporaryFile, so remove it here to
            # avoid leaking one PDF per call. Skip when the input itself had a .pdf suffix:
            # in that case the paths coincide and the context manager deletes the file.
            if pdf_path != temp_path:
                pdf_path.unlink(missing_ok=True)

    return doc

0 comments on commit b28d1a2

Please sign in to comment.