diff --git a/lib/sycamore/sycamore/transforms/detr_partitioner.py b/lib/sycamore/sycamore/transforms/detr_partitioner.py index 09a00b887..453a49c50 100644 --- a/lib/sycamore/sycamore/transforms/detr_partitioner.py +++ b/lib/sycamore/sycamore/transforms/detr_partitioner.py @@ -22,9 +22,11 @@ from sycamore.data.element import create_element from sycamore.transforms.table_structure.extract import DEFAULT_TABLE_STRUCTURE_EXTRACTOR from sycamore.utils import choose_device +from sycamore.utils.bbox_sort import bbox_sort_page from sycamore.utils.cache import Cache, DiskCache from sycamore.utils.image_utils import crop_to_bbox, image_to_bytes from sycamore.utils.import_utils import requires_modules +from sycamore.utils.markdown import elements_to_markdown from sycamore.utils.memory_debugging import display_top, gc_tensor_dump from sycamore.utils.pdf import convert_from_path_streamed_batched from sycamore.utils.time_trace import LogTime, timetrace @@ -71,6 +73,18 @@ def get_page_count(fp: BinaryIO): return num_pages +def text_elem(text: str) -> Element: + return Element( + { + "type": "Text", + "properties": { + "page_number": 1, + }, + "text_representation": text, + } + ) + + class ArynPDFPartitioner: """ This class contains the implementation of PDF partitioning using a Deformable DETR model. @@ -146,6 +160,7 @@ def partition_pdf( aryn_partitioner_address=DEFAULT_ARYN_PARTITIONER_ADDRESS, use_cache=False, pages_per_call: int = -1, + output_format: Optional[str] = None, ) -> List[Element]: if use_partitioning_service: assert aryn_api_key != "" @@ -161,6 +176,7 @@ def partition_pdf( extract_table_structure=extract_table_structure, extract_images=extract_images, pages_per_call=pages_per_call, + output_format=output_format, ) else: if isinstance(threshold, str): @@ -180,9 +196,15 @@ def partition_pdf( ) elements = [] for i, r in enumerate(temp): + page = [] for ele in r: ele.properties["page_number"] = i + 1 - elements.append(ele) + page.append(ele) + bbox_sort_page(page) + elements.extend(page) + if output_format == "markdown": + md = elements_to_markdown(elements) + return [text_elem(md)] return elements @staticmethod @@ -202,6 +224,7 @@ def _call_remote_partitioner( extract_table_structure: bool = False, extract_images: bool = False, selected_pages: list = [], + output_format: Optional[str] = None, ) -> List[Element]: file.seek(0) options = { @@ -214,6 +237,8 @@ def _call_remote_partitioner( "selected_pages": selected_pages, "source": "sycamore", } + if output_format: + options["output_format"] = output_format files: Mapping = {"pdf": file, "options": json.dumps(options).encode("utf-8")} header = {"Authorization": f"Bearer {aryn_api_key}"} @@ -288,6 +313,8 @@ def _call_remote_partitioner( raise ArynPDFPartitionerException( f"Error partway through processing: {response_json['error']}\nPartial Status:\n{status}" ) + if (output_format == "markdown") and ((md := response_json.get("markdown")) is not None): + return [text_elem(md)] response_json = response_json.get("elements", []) elements = [] @@ -311,6 +338,7 @@ def _partition_remote( extract_table_structure: bool = False, extract_images: bool = False, pages_per_call: int = -1, + output_format: Optional[str] = None, ) -> List[Element]: page_count = get_page_count(file) @@ -332,6 +360,7 @@ def _partition_remote( extract_table_structure=extract_table_structure, extract_images=extract_images, selected_pages=[[low, min(high, page_count)]], + output_format=output_format, ) ) low = high + 1 diff --git a/lib/sycamore/sycamore/transforms/partition.py b/lib/sycamore/sycamore/transforms/partition.py index 8106776a5..50a318442 100644 --- a/lib/sycamore/sycamore/transforms/partition.py +++ b/lib/sycamore/sycamore/transforms/partition.py @@ -386,6 +386,7 @@ class ArynPartitioner(Partitioner): use_cache: Cache results from the partitioner for faster inferences on the same documents in future runs. pages_per_call: Number of pages to send in a single call to the remote service. Default is -1, which means send all pages in one call. + output_format: controls output representation: json (default) or markdown. Example: The following shows an example of using the ArynPartitioner to partition a PDF and extract @@ -417,6 +418,7 @@ def __init__( use_cache=False, pages_per_call: int = -1, cache: Optional[Cache] = None, + output_format: Optional[str] = None, ): if use_partitioning_service: device = "cpu" @@ -446,6 +448,7 @@ def __init__( self._extract_table_structure = extract_table_structure self._table_structure_extractor = table_structure_extractor self._extract_images = extract_images + self._output_format = output_format self._batch_size = batch_size self._use_partitioning_service = use_partitioning_service self._aryn_partitioner_address = aryn_partitioner_address @@ -476,6 +479,7 @@ def partition(self, document: Document) -> Document: aryn_partitioner_address=self._aryn_partitioner_address, use_cache=self._use_cache, pages_per_call=self._pages_per_call, + output_format=self._output_format, ) except Exception as e: path = document.properties["path"]