22
22
from sycamore .data .element import create_element
23
23
from sycamore .transforms .table_structure .extract import DEFAULT_TABLE_STRUCTURE_EXTRACTOR
24
24
from sycamore .utils import choose_device
25
+ from sycamore .utils .bbox_sort import bbox_sort_page
25
26
from sycamore .utils .cache import Cache , DiskCache
26
27
from sycamore .utils .image_utils import crop_to_bbox , image_to_bytes
27
28
from sycamore .utils .import_utils import requires_modules
29
+ from sycamore .utils .markdown import elements_to_markdown
28
30
from sycamore .utils .memory_debugging import display_top , gc_tensor_dump
29
31
from sycamore .utils .pdf import convert_from_path_streamed_batched
30
32
from sycamore .utils .time_trace import LogTime , timetrace
@@ -71,6 +73,18 @@ def get_page_count(fp: BinaryIO):
71
73
return num_pages
72
74
73
75
76
def text_elem(text: str) -> Element:
    """Wrap a plain string in a single ``Text`` element.

    Args:
        text: The string stored as the element's ``text_representation``.

    Returns:
        An ``Element`` of type ``"Text"`` whose ``page_number`` property is
        fixed at 1.
    """
    # The element is not tied to a real page; the page number is always 1.
    properties = {"page_number": 1}
    payload = {
        "type": "Text",
        "properties": properties,
        "text_representation": text,
    }
    return Element(payload)
86
+
87
+
74
88
class ArynPDFPartitioner :
75
89
"""
76
90
This class contains the implementation of PDF partitioning using a Deformable DETR model.
@@ -146,6 +160,7 @@ def partition_pdf(
146
160
aryn_partitioner_address = DEFAULT_ARYN_PARTITIONER_ADDRESS ,
147
161
use_cache = False ,
148
162
pages_per_call : int = - 1 ,
163
+ output_format : Optional [str ] = None ,
149
164
) -> List [Element ]:
150
165
if use_partitioning_service :
151
166
assert aryn_api_key != ""
@@ -161,6 +176,7 @@ def partition_pdf(
161
176
extract_table_structure = extract_table_structure ,
162
177
extract_images = extract_images ,
163
178
pages_per_call = pages_per_call ,
179
+ output_format = output_format ,
164
180
)
165
181
else :
166
182
if isinstance (threshold , str ):
@@ -180,9 +196,15 @@ def partition_pdf(
180
196
)
181
197
elements = []
182
198
for i , r in enumerate (temp ):
199
+ page = []
183
200
for ele in r :
184
201
ele .properties ["page_number" ] = i + 1
185
- elements .append (ele )
202
+ page .append (ele )
203
+ bbox_sort_page (page )
204
+ elements .extend (page )
205
+ if output_format == "markdown" :
206
+ md = elements_to_markdown (elements )
207
+ return [text_elem (md )]
186
208
return elements
187
209
188
210
@staticmethod
@@ -202,6 +224,7 @@ def _call_remote_partitioner(
202
224
extract_table_structure : bool = False ,
203
225
extract_images : bool = False ,
204
226
selected_pages : list = [],
227
+ output_format : Optional [str ] = None ,
205
228
) -> List [Element ]:
206
229
file .seek (0 )
207
230
options = {
@@ -214,6 +237,8 @@ def _call_remote_partitioner(
214
237
"selected_pages" : selected_pages ,
215
238
"source" : "sycamore" ,
216
239
}
240
+ if output_format :
241
+ options ["output_format" ] = output_format
217
242
218
243
files : Mapping = {"pdf" : file , "options" : json .dumps (options ).encode ("utf-8" )}
219
244
header = {"Authorization" : f"Bearer { aryn_api_key } " }
@@ -288,6 +313,8 @@ def _call_remote_partitioner(
288
313
raise ArynPDFPartitionerException (
289
314
f"Error partway through processing: { response_json ['error' ]} \n Partial Status:\n { status } "
290
315
)
316
+ if (output_format == "markdown" ) and ((md := response_json .get ("markdown" )) is not None ):
317
+ return [text_elem (md )]
291
318
response_json = response_json .get ("elements" , [])
292
319
293
320
elements = []
@@ -311,6 +338,7 @@ def _partition_remote(
311
338
extract_table_structure : bool = False ,
312
339
extract_images : bool = False ,
313
340
pages_per_call : int = - 1 ,
341
+ output_format : Optional [str ] = None ,
314
342
) -> List [Element ]:
315
343
page_count = get_page_count (file )
316
344
@@ -332,6 +360,7 @@ def _partition_remote(
332
360
extract_table_structure = extract_table_structure ,
333
361
extract_images = extract_images ,
334
362
selected_pages = [[low , min (high , page_count )]],
363
+ output_format = output_format ,
335
364
)
336
365
)
337
366
low = high + 1
0 commit comments