Skip to content

Commit b28d1a2

Browse files
authored
Add convert_file_to_pdf helper using libreoffice (#670)
* Add doc_to_pdf helper using libreoffice
* Some cleanup
* Lint
* Move to utils package
* Add ci_only tests
* Revert "Add ci_only tests" (this reverts commit 14917d9)
* Use shutil to ensure libreoffice is available
1 parent 1dc25f5 commit b28d1a2

File tree

4 files changed

+73
-2
lines changed

4 files changed

+73
-2
lines changed

.github/workflows/testing.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,8 +63,8 @@ jobs:
6363
run: df
6464
- name: Update Apt
6565
run: sudo apt-get update
66-
- name: Install poppler and tesseract
67-
run: sudo apt-get install -y poppler-utils tesseract-ocr
66+
- name: Install apt dependencies
67+
run: sudo apt-get install -y poppler-utils tesseract-ocr libreoffice
6868
- name: DF-5
6969
run: df
7070
- name: Run tests
Binary file not shown.
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
import logging
2+
import os
3+
import shutil
4+
from io import BytesIO
5+
6+
from pypdf import PdfReader
7+
8+
import sycamore
9+
from sycamore.tests.config import TEST_DIR
10+
from sycamore.utils.fileformat_tools import binary_representation_to_pdf
11+
12+
13+
def test_binary_representation_to_pdf():
    """Convert the sample .docx to PDF and verify the resulting page count.

    The conversion needs the ``libreoffice`` executable on PATH. When it is
    missing the test logs a warning and returns early; on GitHub Actions a
    missing executable is treated as a failure instead.
    """
    libreoffice_missing = shutil.which("libreoffice") is None
    if libreoffice_missing:
        # CI installs libreoffice, so not finding it there is an error.
        assert "GITHUB_ACTIONS" not in os.environ
        logging.warning("Skipping test ...; /usr/bin/libreoffice is not installed")
        return

    docx_path = str(TEST_DIR / "resources/data/docx/aryn_website_sample.docx")

    # Read the sample as a single binary document and convert it.
    docs = sycamore.init().read.binary(docx_path, binary_format="docx").take(1)
    converted = binary_representation_to_pdf(docs[0])

    # Parse the produced bytes as a PDF and check the page count.
    page_count = len(PdfReader(BytesIO(converted.binary_representation)).pages)
    assert page_count == 2
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
import subprocess
2+
from pathlib import Path
3+
from tempfile import NamedTemporaryFile
4+
5+
from sycamore.data import Document
6+
7+
8+
def binary_representation_to_pdf(doc: Document) -> Document:
    """
    Convert a document's binary_representation to PDF using LibreOffice.

    The bytes in ``doc.binary_representation`` are written to a file inside a
    private temporary directory, converted by running ``libreoffice`` in
    headless mode, and the resulting PDF bytes are stored back on the document.
    The temporary directory (input and output files) is removed afterwards.

    Note: LibreOffice currently requires manual installation based on your platform.

    Args:
        doc: Document whose ``binary_representation`` holds the source file
            bytes. The suffix of ``doc.properties["path"]``, when present, is
            used as the extension of the temporary input file.

    Returns:
        The same ``doc`` object, mutated in place: ``binary_representation``
        holds the PDF bytes and ``properties["filetype"]`` is
        ``"application/pdf"``.

    Raises:
        AssertionError: If ``doc.binary_representation`` is None.
        subprocess.CalledProcessError: If LibreOffice exits with a non-zero status.
        FileNotFoundError: If the ``libreoffice`` executable cannot be found, or
            if it finished without producing the expected PDF file.
    """
    # Function-scope import: the module header only imports NamedTemporaryFile.
    from tempfile import TemporaryDirectory

    def run_libreoffice(source_path, output_path):
        # check=True surfaces a failed conversion here instead of as a
        # confusing missing-file error when the output is read back.
        subprocess.run(
            [
                "libreoffice",
                "--headless",
                "--convert-to",
                "pdf",
                str(source_path),
                "--outdir",
                str(output_path),
            ],
            check=True,
        )

    assert doc.binary_representation is not None, "doc.binary_representation must not be None"

    extension = Path(doc.properties.get("path", "unknown")).suffix

    with TemporaryDirectory() as work_dir:
        source_path = Path(work_dir) / f"input{extension}"
        # write_bytes closes the file before LibreOffice opens it.
        source_path.write_bytes(doc.binary_representation)

        # A separate output directory keeps the result from colliding with the
        # input when the source extension is already ".pdf".
        output_dir = Path(work_dir) / "out"
        output_dir.mkdir()

        run_libreoffice(source_path, output_dir)

        # The converted file is expected at <outdir>/<input stem>.pdf.
        pdf_path = output_dir / f"{source_path.stem}.pdf"
        if not pdf_path.is_file():
            raise FileNotFoundError(f"LibreOffice did not produce the expected output file: {pdf_path}")

        doc.binary_representation = pdf_path.read_bytes()
        doc.properties["filetype"] = "application/pdf"

    return doc

0 commit comments

Comments
 (0)