Skip to content

Commit

Permalink
Update fileformattools (#1133)
Browse files Browse the repository at this point in the history
  • Loading branch information
baitsguy authored Jan 27, 2025
1 parent 6c83f5e commit 1904622
Showing 1 changed file with 14 additions and 2 deletions.
16 changes: 14 additions & 2 deletions lib/sycamore/sycamore/utils/fileformat_tools.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
import logging
import os
import subprocess
from pathlib import Path
from urllib.parse import urlparse
from tempfile import NamedTemporaryFile, TemporaryDirectory

from sycamore.data import Document

logger = logging.getLogger(__name__)


def binary_representation_to_pdf(doc: Document) -> Document:
"""
Expand All @@ -29,19 +33,27 @@ def run_libreoffice(source_path, output_path):
)

assert doc.binary_representation is not None
extension = get_file_extension(doc.properties.get("path", "unknown"))
origpath = doc.properties.get("path", "unknown")
extension = get_file_extension(origpath)

with NamedTemporaryFile(suffix=f"{extension}") as temp_file:
temp_file.write(doc.binary_representation)
temp_file.flush()

temp_path = Path(temp_file.name)

pdffile = f"{temp_path.parent}/{temp_path.stem}.pdf"
logger.info(f"Processing {origpath} to {pdffile}")
with open(pdffile + "-path", "w") as pathfile:
pathfile.write(origpath)

run_libreoffice(temp_path, temp_path.parent)

with open(f"{temp_path.parent}/{temp_path.stem}.pdf", "rb") as processed_file:
with open(pdffile, "rb") as processed_file:
doc.binary_representation = processed_file.read()
doc.properties["filetype"] = "application/pdf"
os.unlink(pdffile)
os.unlink(pdffile + "-path")

return doc

Expand Down

0 comments on commit 1904622

Please sign in to comment.