Skip to content

Commit ca1d08b

Browse files
authored
Support output_format in ArynPartitioner. (#858)
1 parent 4b3cc08 commit ca1d08b

File tree

2 files changed: 34 additions and 1 deletion.

lib/sycamore/sycamore/transforms/detr_partitioner.py

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,11 @@
2222
from sycamore.data.element import create_element
2323
from sycamore.transforms.table_structure.extract import DEFAULT_TABLE_STRUCTURE_EXTRACTOR
2424
from sycamore.utils import choose_device
25+
from sycamore.utils.bbox_sort import bbox_sort_page
2526
from sycamore.utils.cache import Cache, DiskCache
2627
from sycamore.utils.image_utils import crop_to_bbox, image_to_bytes
2728
from sycamore.utils.import_utils import requires_modules
29+
from sycamore.utils.markdown import elements_to_markdown
2830
from sycamore.utils.memory_debugging import display_top, gc_tensor_dump
2931
from sycamore.utils.pdf import convert_from_path_streamed_batched
3032
from sycamore.utils.time_trace import LogTime, timetrace
@@ -71,6 +73,18 @@ def get_page_count(fp: BinaryIO):
7173
return num_pages
7274

7375

76+
def text_elem(text: str) -> Element:
    """Wrap ``text`` in a single ``Text`` element.

    The element is built with a ``page_number`` property fixed at 1 and with
    ``text`` stored as its ``text_representation``.

    Args:
        text: The string to store as the element's text representation.

    Returns:
        A new ``Element`` of type ``"Text"`` carrying ``text``.
    """
    properties = {"page_number": 1}
    payload = {
        "type": "Text",
        "properties": properties,
        "text_representation": text,
    }
    return Element(payload)
86+
87+
7488
class ArynPDFPartitioner:
7589
"""
7690
This class contains the implementation of PDF partitioning using a Deformable DETR model.
@@ -146,6 +160,7 @@ def partition_pdf(
146160
aryn_partitioner_address=DEFAULT_ARYN_PARTITIONER_ADDRESS,
147161
use_cache=False,
148162
pages_per_call: int = -1,
163+
output_format: Optional[str] = None,
149164
) -> List[Element]:
150165
if use_partitioning_service:
151166
assert aryn_api_key != ""
@@ -161,6 +176,7 @@ def partition_pdf(
161176
extract_table_structure=extract_table_structure,
162177
extract_images=extract_images,
163178
pages_per_call=pages_per_call,
179+
output_format=output_format,
164180
)
165181
else:
166182
if isinstance(threshold, str):
@@ -180,9 +196,15 @@ def partition_pdf(
180196
)
181197
elements = []
182198
for i, r in enumerate(temp):
199+
page = []
183200
for ele in r:
184201
ele.properties["page_number"] = i + 1
185-
elements.append(ele)
202+
page.append(ele)
203+
bbox_sort_page(page)
204+
elements.extend(page)
205+
if output_format == "markdown":
206+
md = elements_to_markdown(elements)
207+
return [text_elem(md)]
186208
return elements
187209

188210
@staticmethod
@@ -202,6 +224,7 @@ def _call_remote_partitioner(
202224
extract_table_structure: bool = False,
203225
extract_images: bool = False,
204226
selected_pages: list = [],
227+
output_format: Optional[str] = None,
205228
) -> List[Element]:
206229
file.seek(0)
207230
options = {
@@ -214,6 +237,8 @@ def _call_remote_partitioner(
214237
"selected_pages": selected_pages,
215238
"source": "sycamore",
216239
}
240+
if output_format:
241+
options["output_format"] = output_format
217242

218243
files: Mapping = {"pdf": file, "options": json.dumps(options).encode("utf-8")}
219244
header = {"Authorization": f"Bearer {aryn_api_key}"}
@@ -288,6 +313,8 @@ def _call_remote_partitioner(
288313
raise ArynPDFPartitionerException(
289314
f"Error partway through processing: {response_json['error']}\nPartial Status:\n{status}"
290315
)
316+
if (output_format == "markdown") and ((md := response_json.get("markdown")) is not None):
317+
return [text_elem(md)]
291318
response_json = response_json.get("elements", [])
292319

293320
elements = []
@@ -311,6 +338,7 @@ def _partition_remote(
311338
extract_table_structure: bool = False,
312339
extract_images: bool = False,
313340
pages_per_call: int = -1,
341+
output_format: Optional[str] = None,
314342
) -> List[Element]:
315343
page_count = get_page_count(file)
316344

@@ -332,6 +360,7 @@ def _partition_remote(
332360
extract_table_structure=extract_table_structure,
333361
extract_images=extract_images,
334362
selected_pages=[[low, min(high, page_count)]],
363+
output_format=output_format,
335364
)
336365
)
337366
low = high + 1

lib/sycamore/sycamore/transforms/partition.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -386,6 +386,7 @@ class ArynPartitioner(Partitioner):
386386
use_cache: Cache results from the partitioner for faster inferences on the same documents in future runs.
387387
pages_per_call: Number of pages to send in a single call to the remote service. Default is -1,
388388
which means send all pages in one call.
389+
output_format: controls output representation: json (default) or markdown.
389390
390391
Example:
391392
The following shows an example of using the ArynPartitioner to partition a PDF and extract
@@ -417,6 +418,7 @@ def __init__(
417418
use_cache=False,
418419
pages_per_call: int = -1,
419420
cache: Optional[Cache] = None,
421+
output_format: Optional[str] = None,
420422
):
421423
if use_partitioning_service:
422424
device = "cpu"
@@ -446,6 +448,7 @@ def __init__(
446448
self._extract_table_structure = extract_table_structure
447449
self._table_structure_extractor = table_structure_extractor
448450
self._extract_images = extract_images
451+
self._output_format = output_format
449452
self._batch_size = batch_size
450453
self._use_partitioning_service = use_partitioning_service
451454
self._aryn_partitioner_address = aryn_partitioner_address
@@ -476,6 +479,7 @@ def partition(self, document: Document) -> Document:
476479
aryn_partitioner_address=self._aryn_partitioner_address,
477480
use_cache=self._use_cache,
478481
pages_per_call=self._pages_per_call,
482+
output_format=self._output_format,
479483
)
480484
except Exception as e:
481485
path = document.properties["path"]

0 commit comments

Comments
 (0)