File tree Expand file tree Collapse file tree 4 files changed +73
-2
lines changed Expand file tree Collapse file tree 4 files changed +73
-2
lines changed Original file line number Diff line number Diff line change 63
63
run : df
64
64
- name : Update Apt
65
65
run : sudo apt-get update
66
- - name : Install poppler and tesseract
67
- run : sudo apt-get install -y poppler-utils tesseract-ocr
66
+ - name : Install apt dependencies
67
+ run : sudo apt-get install -y poppler-utils tesseract-ocr libreoffice
68
68
- name : DF-5
69
69
run : df
70
70
- name : Run tests
Original file line number Diff line number Diff line change
1
+ import logging
2
+ import os
3
+ import shutil
4
+ from io import BytesIO
5
+
6
+ from pypdf import PdfReader
7
+
8
+ import sycamore
9
+ from sycamore .tests .config import TEST_DIR
10
+ from sycamore .utils .fileformat_tools import binary_representation_to_pdf
11
+
12
+
13
def test_binary_representation_to_pdf():
    """Convert the sample .docx to PDF and check that the result has two pages.

    When the ``libreoffice`` executable cannot be found on PATH the test logs a
    warning and returns early, unless it is running under GitHub Actions, where
    a missing executable is treated as a failure.
    """
    libreoffice_missing = shutil.which("libreoffice") is None
    if libreoffice_missing:
        # The CI workflow is expected to provide LibreOffice, so skipping is
        # only tolerated outside of GitHub Actions.
        assert "GITHUB_ACTIONS" not in os.environ
        logging.warning("Skipping test ...; /usr/bin/libreoffice is not installed")
        return

    sample_docx = str(TEST_DIR / "resources/data/docx/aryn_website_sample.docx")

    # Read the file as a single binary document and run the conversion on it.
    docs = sycamore.init().read.binary(sample_docx, binary_format="docx").take(1)
    converted = binary_representation_to_pdf(docs[0])

    # Parse the returned bytes as a PDF to verify the page count.
    page_count = len(PdfReader(BytesIO(converted.binary_representation)).pages)
    assert page_count == 2
Original file line number Diff line number Diff line change
1
+ import subprocess
2
+ from pathlib import Path
3
+ from tempfile import NamedTemporaryFile
4
+
5
+ from sycamore .data import Document
6
+
7
+
8
def binary_representation_to_pdf(doc: Document) -> Document:
    """
    Convert a document's binary_representation to PDF, using LibreOffice as the conversion engine.

    The bytes are written to a temporary file whose suffix is taken from ``doc.properties["path"]``
    (no suffix when that property is absent), converted with a headless
    ``libreoffice --convert-to pdf`` run, and the resulting PDF bytes are stored back on the document.

    Note: LibreOffice currently requires manual installation based on your platform.

    Args:
        doc: Document whose ``binary_representation`` holds the file contents to convert.

    Returns:
        The same ``doc`` object, mutated in place: ``binary_representation`` holds the PDF bytes and
        ``properties["filetype"]`` is set to ``"application/pdf"``.

    Raises:
        ValueError: If ``doc.binary_representation`` is None.
        FileNotFoundError: If the ``libreoffice`` executable cannot be found, or if LibreOffice
            finished without producing a PDF file.
        subprocess.CalledProcessError: If LibreOffice exits with a non-zero status.
    """

    def run_libreoffice(source_path, output_path):
        # check=True makes a failed conversion surface here instead of as a
        # confusing missing-file error further down.
        subprocess.run(
            [
                "libreoffice",
                "--headless",
                "--convert-to",
                "pdf",
                str(source_path),
                "--outdir",
                str(output_path),
            ],
            check=True,
        )

    # Explicit check rather than `assert`, which is stripped under `python -O`.
    if doc.binary_representation is None:
        raise ValueError("doc.binary_representation must not be None")

    # LibreOffice picks its import filter from the file extension, so carry over
    # the suffix of the original path when one is known.
    extension = Path(doc.properties.get("path", "unknown")).suffix

    with NamedTemporaryFile(suffix=extension) as temp_file:
        temp_file.write(doc.binary_representation)
        # Flush so the LibreOffice process sees the full contents on disk.
        temp_file.flush()

        temp_path = Path(temp_file.name)
        # LibreOffice writes "<stem>.pdf" into the requested output directory.
        pdf_path = temp_path.parent / f"{temp_path.stem}.pdf"

        try:
            run_libreoffice(temp_path, temp_path.parent)

            # LibreOffice can exit successfully without writing any output.
            if not pdf_path.is_file():
                raise FileNotFoundError(
                    f"LibreOffice did not produce the expected output file: {pdf_path}"
                )
            doc.binary_representation = pdf_path.read_bytes()
        finally:
            # Remove the converted file so repeated calls do not leak PDFs into
            # the temp directory. When the input itself ends in ".pdf" the
            # output path is the temp file, which NamedTemporaryFile removes.
            if pdf_path != temp_path:
                pdf_path.unlink(missing_ok=True)

    doc.properties["filetype"] = "application/pdf"

    return doc
You can’t perform that action at this time.
0 commit comments