1
+ from contextlib import contextmanager
2
+ import io
1
3
import os
2
4
import sys
3
5
from pathlib import Path
11
13
from .parsers import Lattice
12
14
from .parsers import Stream
13
15
from .utils import TemporaryDirectory
14
- from .utils import download_url
16
+ from .utils import InvalidArguments
17
+ from .utils import get_url_bytes
15
18
from .utils import get_page_layout
16
19
from .utils import get_rotation
17
20
from .utils import get_text_objects
@@ -25,21 +28,36 @@ class PDFHandler:
25
28
26
29
Parameters
27
30
----------
28
- filepath : str
29
- Filepath or URL of the PDF file.
31
+ filepath : str | pathlib.Path, optional (default: None)
32
+ Filepath or URL of the PDF file. Required if `file_bytes` is not given.
30
33
pages : str, optional (default: '1')
31
34
Comma-separated page numbers.
32
35
Example: '1,3,4' or '1,4-end' or 'all'.
33
36
password : str, optional (default: None)
34
37
Password for decryption.
38
+ file_bytes : io.IOBase, optional (default: None)
39
+ A file-like stream. Required if `filepath` is not given.
35
40
36
41
"""
37
42
38
- def __init__ (self , filepath : Union [StrByteType , Path ], pages = "1" , password = None ):
43
+ def __init__ (self , filepath : Union [StrByteType , Path , None ], pages = "1" , password = None , file_bytes = None ):
39
44
if is_url (filepath ):
40
- filepath = download_url (filepath )
45
+ file_bytes = get_url_bytes (filepath )
46
+
47
+ if not filepath and not file_bytes :
48
+ raise InvalidArguments ('Either `filepath` or `file_bytes` is required' )
49
+ if not filepath :
50
+ # filepath must either be passed, or taken from the name attribute
51
+ filepath = getattr (file_bytes , 'name' )
52
+ if not filepath :
53
+ msg = ('Either pass a `filepath`, or give the '
54
+ '`file_bytes` argument a name attribute' )
55
+ raise InvalidArguments (msg )
56
+ self .file_bytes = file_bytes # ok to be None
57
+
58
+ # self.filepath = filepath
59
+ # or
41
60
self .filepath : Union [StrByteType , Path ] = filepath
42
-
43
61
if isinstance (filepath , str ) and not filepath .lower ().endswith (".pdf" ):
44
62
raise NotImplementedError ("File format not supported" )
45
63
@@ -51,6 +69,28 @@ def __init__(self, filepath: Union[StrByteType, Path], pages="1", password=None)
51
69
self .password = self .password .encode ("ascii" )
52
70
self .pages = self ._get_pages (pages )
53
71
72
@contextmanager
def managed_file_context(self):
    """Yield a readable, seekable binary stream for this instance's PDF.

    The stream comes from the `file_bytes` attribute when it is set
    (truthy), otherwise from opening the `filepath` attribute in
    binary mode.

    A stream that cannot seek is read completely into an in-memory
    `io.BytesIO`, which then replaces `self.file_bytes` so that later
    calls can rewind and reuse the same data.

    Yields
    ------
    file_bytes : io.IOBase
        A readable, seekable, file-like object positioned at offset 0.

    Notes
    -----
    Only the handle opened from `filepath` is closed on exit or error.
    A stream supplied through `file_bytes` is left open.

    """
    stream = self.file_bytes
    # Truthiness (rather than `is not None`) mirrors the original check,
    # so a falsy `file_bytes` still falls back to `filepath`.
    if stream:
        # Having a `seek` attribute is not enough: every io.IOBase
        # subclass has one, even pipes, sockets and HTTP responses whose
        # seek() raises. Ask `seekable()` when the object offers it, and
        # only fall back to attribute detection for minimal duck-typed
        # streams.
        seekable = getattr(stream, "seekable", None)
        if callable(seekable):
            can_seek = seekable()
        else:
            can_seek = hasattr(stream, "seek")

        if not can_seek:
            # Buffer once and keep the buffer; the source stream can
            # only be consumed a single time.
            stream = io.BytesIO(stream.read())
            self.file_bytes = stream

        # Always rewind: an earlier use may have left the position
        # anywhere in the stream.
        stream.seek(0)
        yield stream
    else:
        with open(self.filepath, "rb") as stream:
            yield stream
93
+
54
94
def _get_pages (self , pages ):
55
95
"""Converts pages string to list of ints.
56
96
@@ -73,29 +113,30 @@ def _get_pages(self, pages):
73
113
if pages == "1" :
74
114
page_numbers .append ({"start" : 1 , "end" : 1 })
75
115
else :
76
- infile = PdfReader (self .filepath , strict = False )
77
-
78
- if infile .is_encrypted :
79
- infile .decrypt (self .password )
80
-
81
- if pages == "all" :
82
- page_numbers .append ({"start" : 1 , "end" : len (infile .pages )})
83
- else :
84
- for r in pages .split ("," ):
85
- if "-" in r :
86
- a , b = r .split ("-" )
87
- if b == "end" :
88
- b = len (infile .pages )
89
- page_numbers .append ({"start" : int (a ), "end" : int (b )})
90
- else :
91
- page_numbers .append ({"start" : int (r ), "end" : int (r )})
116
+ with self .managed_file_context () as f :
117
+ infile = PdfReader (f , strict = False )
118
+
119
+ if infile .is_encrypted :
120
+ infile .decrypt (self .password )
121
+
122
+ if pages == "all" :
123
+ page_numbers .append ({"start" : 1 , "end" : len (infile .pages )})
124
+ else :
125
+ for r in pages .split ("," ):
126
+ if "-" in r :
127
+ a , b = r .split ("-" )
128
+ if b == "end" :
129
+ b = len (infile .pages )
130
+ page_numbers .append ({"start" : int (a ), "end" : int (b )})
131
+ else :
132
+ page_numbers .append ({"start" : int (r ), "end" : int (r )})
92
133
93
134
result = []
94
135
for p in page_numbers :
95
136
result .extend (range (p ["start" ], p ["end" ] + 1 ))
96
137
return sorted (set (result ))
97
138
98
- def _save_page (self , filepath : Union [StrByteType , Path ], page , temp ):
139
+ def _save_page (self , filepath : Union [StrByteType , Path , None ], page , temp ):
99
140
"""Saves specified page from PDF into a temporary directory.
100
141
101
142
Parameters
@@ -108,39 +149,41 @@ def _save_page(self, filepath: Union[StrByteType, Path], page, temp):
108
149
Tmp directory.
109
150
110
151
"""
111
- infile = PdfReader (filepath , strict = False )
112
- if infile .is_encrypted :
113
- infile .decrypt (self .password )
114
- fpath = os .path .join (temp , f"page-{ page } .pdf" )
115
- froot , fext = os .path .splitext (fpath )
116
- p = infile .pages [page - 1 ]
117
- outfile = PdfWriter ()
118
- outfile .add_page (p )
119
- with open (fpath , "wb" ) as f :
120
- outfile .write (f )
121
- layout , dim = get_page_layout (fpath )
122
- # fix rotated PDF
123
- chars = get_text_objects (layout , ltype = "char" )
124
- horizontal_text = get_text_objects (layout , ltype = "horizontal_text" )
125
- vertical_text = get_text_objects (layout , ltype = "vertical_text" )
126
- rotation = get_rotation (chars , horizontal_text , vertical_text )
127
- if rotation != "" :
128
- fpath_new = "" .join ([froot .replace ("page" , "p" ), "_rotated" , fext ])
129
- os .rename (fpath , fpath_new )
130
- instream = open (fpath_new , "rb" )
131
- infile = PdfReader (instream , strict = False )
152
+
153
+ with self .managed_file_context () as fileobj :
154
+ infile = PdfReader (fileobj , strict = False )
132
155
if infile .is_encrypted :
133
156
infile .decrypt (self .password )
157
+ fpath = os .path .join (temp , f"page-{ page } .pdf" )
158
+ froot , fext = os .path .splitext (fpath )
159
+ p = infile .pages [page - 1 ]
134
160
outfile = PdfWriter ()
135
- p = infile .pages [0 ]
136
- if rotation == "anticlockwise" :
137
- p .rotate (90 )
138
- elif rotation == "clockwise" :
139
- p .rotate (- 90 )
140
161
outfile .add_page (p )
141
162
with open (fpath , "wb" ) as f :
142
163
outfile .write (f )
143
- instream .close ()
164
+ layout , dim = get_page_layout (fpath )
165
+ # fix rotated PDF
166
+ chars = get_text_objects (layout , ltype = "char" )
167
+ horizontal_text = get_text_objects (layout , ltype = "horizontal_text" )
168
+ vertical_text = get_text_objects (layout , ltype = "vertical_text" )
169
+ rotation = get_rotation (chars , horizontal_text , vertical_text )
170
+ if rotation != "" :
171
+ fpath_new = "" .join ([froot .replace ("page" , "p" ), "_rotated" , fext ])
172
+ os .rename (fpath , fpath_new )
173
+ instream = open (fpath_new , "rb" )
174
+ infile = PdfReader (instream , strict = False )
175
+ if infile .is_encrypted :
176
+ infile .decrypt (self .password )
177
+ outfile = PdfWriter ()
178
+ p = infile .pages [0 ]
179
+ if rotation == "anticlockwise" :
180
+ p .rotate (90 )
181
+ elif rotation == "clockwise" :
182
+ p .rotate (- 90 )
183
+ outfile .add_page (p )
184
+ with open (fpath , "wb" ) as f :
185
+ outfile .write (f )
186
+ instream .close ()
144
187
145
188
def parse (
146
189
self , flavor = "lattice" , suppress_stdout = False , layout_kwargs = None , ** kwargs
0 commit comments