Skip to content

Commit

Permalink
Support output_format in ArynPartitioner. (#858)
Browse files Browse the repository at this point in the history
  • Loading branch information
alexaryn authored Oct 1, 2024
1 parent 4b3cc08 commit ca1d08b
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 1 deletion.
31 changes: 30 additions & 1 deletion lib/sycamore/sycamore/transforms/detr_partitioner.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,11 @@
from sycamore.data.element import create_element
from sycamore.transforms.table_structure.extract import DEFAULT_TABLE_STRUCTURE_EXTRACTOR
from sycamore.utils import choose_device
from sycamore.utils.bbox_sort import bbox_sort_page
from sycamore.utils.cache import Cache, DiskCache
from sycamore.utils.image_utils import crop_to_bbox, image_to_bytes
from sycamore.utils.import_utils import requires_modules
from sycamore.utils.markdown import elements_to_markdown
from sycamore.utils.memory_debugging import display_top, gc_tensor_dump
from sycamore.utils.pdf import convert_from_path_streamed_batched
from sycamore.utils.time_trace import LogTime, timetrace
Expand Down Expand Up @@ -71,6 +73,18 @@ def get_page_count(fp: BinaryIO):
return num_pages


def text_elem(text: str) -> Element:
    """Wrap a string in a single ``Text`` element.

    Args:
        text: The string stored as the element's text representation.

    Returns:
        An ``Element`` of type ``"Text"`` whose ``page_number`` property is 1
        and whose ``text_representation`` is ``text``.
    """
    properties = {"page_number": 1}
    fields = {
        "type": "Text",
        "properties": properties,
        "text_representation": text,
    }
    return Element(fields)


class ArynPDFPartitioner:
"""
This class contains the implementation of PDF partitioning using a Deformable DETR model.
Expand Down Expand Up @@ -146,6 +160,7 @@ def partition_pdf(
aryn_partitioner_address=DEFAULT_ARYN_PARTITIONER_ADDRESS,
use_cache=False,
pages_per_call: int = -1,
output_format: Optional[str] = None,
) -> List[Element]:
if use_partitioning_service:
assert aryn_api_key != ""
Expand All @@ -161,6 +176,7 @@ def partition_pdf(
extract_table_structure=extract_table_structure,
extract_images=extract_images,
pages_per_call=pages_per_call,
output_format=output_format,
)
else:
if isinstance(threshold, str):
Expand All @@ -180,9 +196,15 @@ def partition_pdf(
)
elements = []
for i, r in enumerate(temp):
page = []
for ele in r:
ele.properties["page_number"] = i + 1
elements.append(ele)
page.append(ele)
bbox_sort_page(page)
elements.extend(page)
if output_format == "markdown":
md = elements_to_markdown(elements)
return [text_elem(md)]
return elements

@staticmethod
Expand All @@ -202,6 +224,7 @@ def _call_remote_partitioner(
extract_table_structure: bool = False,
extract_images: bool = False,
selected_pages: list = [],
output_format: Optional[str] = None,
) -> List[Element]:
file.seek(0)
options = {
Expand All @@ -214,6 +237,8 @@ def _call_remote_partitioner(
"selected_pages": selected_pages,
"source": "sycamore",
}
if output_format:
options["output_format"] = output_format

files: Mapping = {"pdf": file, "options": json.dumps(options).encode("utf-8")}
header = {"Authorization": f"Bearer {aryn_api_key}"}
Expand Down Expand Up @@ -288,6 +313,8 @@ def _call_remote_partitioner(
raise ArynPDFPartitionerException(
f"Error partway through processing: {response_json['error']}\nPartial Status:\n{status}"
)
if (output_format == "markdown") and ((md := response_json.get("markdown")) is not None):
return [text_elem(md)]
response_json = response_json.get("elements", [])

elements = []
Expand All @@ -311,6 +338,7 @@ def _partition_remote(
extract_table_structure: bool = False,
extract_images: bool = False,
pages_per_call: int = -1,
output_format: Optional[str] = None,
) -> List[Element]:
page_count = get_page_count(file)

Expand All @@ -332,6 +360,7 @@ def _partition_remote(
extract_table_structure=extract_table_structure,
extract_images=extract_images,
selected_pages=[[low, min(high, page_count)]],
output_format=output_format,
)
)
low = high + 1
Expand Down
4 changes: 4 additions & 0 deletions lib/sycamore/sycamore/transforms/partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,7 @@ class ArynPartitioner(Partitioner):
use_cache: Cache results from the partitioner for faster inferences on the same documents in future runs.
pages_per_call: Number of pages to send in a single call to the remote service. Default is -1,
which means send all pages in one call.
output_format: controls output representation: json (default) or markdown.
Example:
The following shows an example of using the ArynPartitioner to partition a PDF and extract
Expand Down Expand Up @@ -417,6 +418,7 @@ def __init__(
use_cache=False,
pages_per_call: int = -1,
cache: Optional[Cache] = None,
output_format: Optional[str] = None,
):
if use_partitioning_service:
device = "cpu"
Expand Down Expand Up @@ -446,6 +448,7 @@ def __init__(
self._extract_table_structure = extract_table_structure
self._table_structure_extractor = table_structure_extractor
self._extract_images = extract_images
self._output_format = output_format
self._batch_size = batch_size
self._use_partitioning_service = use_partitioning_service
self._aryn_partitioner_address = aryn_partitioner_address
Expand Down Expand Up @@ -476,6 +479,7 @@ def partition(self, document: Document) -> Document:
aryn_partitioner_address=self._aryn_partitioner_address,
use_cache=self._use_cache,
pages_per_call=self._pages_per_call,
output_format=self._output_format,
)
except Exception as e:
path = document.properties["path"]
Expand Down

0 comments on commit ca1d08b

Please sign in to comment.