1
+ import logging
2
+ import os
1
3
import subprocess
2
4
from pathlib import Path
3
5
from urllib .parse import urlparse
4
6
from tempfile import NamedTemporaryFile , TemporaryDirectory
5
7
6
8
from sycamore .data import Document
7
9
10
+ logger = logging .getLogger (__name__ )
11
+
8
12
9
13
def binary_representation_to_pdf (doc : Document ) -> Document :
10
14
"""
@@ -29,19 +33,27 @@ def run_libreoffice(source_path, output_path):
29
33
)
30
34
31
35
assert doc .binary_representation is not None
32
- extension = get_file_extension (doc .properties .get ("path" , "unknown" ))
36
+ origpath = doc .properties .get ("path" , "unknown" )
37
+ extension = get_file_extension (origpath )
33
38
34
39
with NamedTemporaryFile (suffix = f"{ extension } " ) as temp_file :
35
40
temp_file .write (doc .binary_representation )
36
41
temp_file .flush ()
37
42
38
43
temp_path = Path (temp_file .name )
39
44
45
+ pdffile = f"{ temp_path .parent } /{ temp_path .stem } .pdf"
46
+ logger .info (f"Processing { origpath } to { pdffile } " )
47
+ with open (pdffile + "-path" , "w" ) as pathfile :
48
+ pathfile .write (origpath )
49
+
40
50
run_libreoffice (temp_path , temp_path .parent )
41
51
42
- with open (f" { temp_path . parent } / { temp_path . stem } .pdf" , "rb" ) as processed_file :
52
+ with open (pdffile , "rb" ) as processed_file :
43
53
doc .binary_representation = processed_file .read ()
44
54
doc .properties ["filetype" ] = "application/pdf"
55
+ os .unlink (pdffile )
56
+ os .unlink (pdffile + "-path" )
45
57
46
58
return doc
47
59
0 commit comments