diff --git a/lib/aryn-sdk/README.md b/lib/aryn-sdk/README.md index 426884dd3..f3a18a7c3 100644 --- a/lib/aryn-sdk/README.md +++ b/lib/aryn-sdk/README.md @@ -96,3 +96,77 @@ pil_img = convert_image_element(image_elts[0]) jpg_bytes = convert_image_element(image_elts[1], format='JPEG') png_str = convert_image_element(image_elts[2], format="PNG", b64encode=True) ``` + +### Async Aryn DocParse + +#### Single Job Example +```python +import time +from aryn_sdk.partition import partition_file_async_submit, partition_file_async_result + +with open("my-favorite-pdf.pdf", "rb") as f: + response = partition_file_async_submit( + f, + use_ocr=True, + extract_table_structure=True, + ) + +job_id = response["job_id"] + +# Poll for the results +while True: + result = partition_file_async_result(job_id) + if result["status"] != "pending": + break + time.sleep(5) +``` + +Optionally, you can also set a webhook for Aryn to call when your job is completed: + +```python +partition_file_async_submit("path/to/my/file.docx", webhook_url="https://example.com/alert") +``` + +Aryn will POST a request containing a body like the below: +```json +{"done": [{"job_id": "aryn:j-47gpd3604e5tz79z1jro5fc"}]} +``` + +#### Multi-Job Example + +```python +import logging +import time +from aryn_sdk.partition import partition_file_async_submit, partition_file_async_result + +files = [open("file1.pdf", "rb"), open("file2.docx", "rb")] +job_ids = [None] * len(files) +for i, f in enumerate(files): + try: + job_ids[i] = partition_file_async_submit(f)["job_id"] + except Exception as e: + logging.warning(f"Failed to submit {f}: {e}") + +results = [None] * len(files) +for i, job_id in enumerate(job_ids): + while True: + result = partition_file_async_result(job_id) + if result["status"] != "pending": + break + time.sleep(5) + results[i] = result +``` + +#### Cancelling an async job + +```python +from aryn_sdk.partition import partition_file_async_submit, partition_file_async_cancel + job_id = 
partition_file_async_submit( + "path/to/file.pdf", + use_ocr=True, + extract_table_structure=True, + extract_images=True, + )["job_id"] + + partition_file_async_cancel(job_id) +``` diff --git a/lib/aryn-sdk/aryn_sdk/partition/__init__.py b/lib/aryn-sdk/aryn_sdk/partition/__init__.py index 2c2da48db..e91829019 100644 --- a/lib/aryn-sdk/aryn_sdk/partition/__init__.py +++ b/lib/aryn-sdk/aryn_sdk/partition/__init__.py @@ -1,4 +1,14 @@ -from .partition import partition_file, tables_to_pandas, table_elem_to_dataframe, convert_image_element, PartitionError +from .partition import ( + partition_file, + partition_file_async_submit, + partition_file_async_result, + partition_file_async_cancel, + partition_file_async_list, + tables_to_pandas, + table_elem_to_dataframe, + convert_image_element, + PartitionError, +) from .art import draw_with_boxes __all__ = [ @@ -8,4 +18,8 @@ "draw_with_boxes", "convert_image_element", "PartitionError", + "partition_file_async_submit", + "partition_file_async_result", + "partition_file_async_cancel", + "partition_file_async_list", ] diff --git a/lib/aryn-sdk/aryn_sdk/partition/partition.py b/lib/aryn-sdk/aryn_sdk/partition/partition.py index 619c9ee1d..5a525c4cb 100644 --- a/lib/aryn-sdk/aryn_sdk/partition/partition.py +++ b/lib/aryn-sdk/aryn_sdk/partition/partition.py @@ -1,5 +1,6 @@ from os import PathLike from typing import BinaryIO, Literal, Optional, Union, Any +from urllib.parse import urlparse, urlunparse from collections.abc import Mapping from aryn_sdk.config import ArynConfig import requests @@ -29,6 +30,7 @@ def __init__(self, message: str, status_code: int) -> None: def partition_file( file: Union[BinaryIO, str, PathLike], + *, aryn_api_key: Optional[str] = None, aryn_config: Optional[ArynConfig] = None, threshold: Optional[Union[float, Literal["auto"]]] = None, @@ -91,14 +93,21 @@ def partition_file( ssl_verify: verify ssl certificates. In databricks, set this to False to fix ssl imcompatibilities. 
output_format: controls output representation; can be set to "markdown" or "json" default: None (JSON elements) - output_label_options: A dictionary for configuring output label behavior. It supports two options: + output_label_options: A dictionary for configuring output label behavior. It supports three options: promote_title, a boolean specifying whether to pick the largest element by font size on the first page from among the elements on that page that have one of the types specified in title_candidate_elements and promote it to type "Title" if there is no element on the first page of type "Title" already. title_candidate_elements, a list of strings representing the label types allowed to be promoted to a title. + orientation_correction, a boolean specifying whether to pagewise rotate pages to the correct orientation + based off the orientation of text. Pages are rotated by increments of 90 degrees to correct their + orientation. Here is an example set of output label options: - {"promote_title": True, "title_candidate_elements": ["Section-header", "Caption"]} + { + "promote_title": True, + "title_candidate_elements": ["Section-header", "Caption"], + "orientation_correction": True + } default: None (no element is promoted to "Title") @@ -121,18 +130,54 @@ def partition_file( ) elements = data['elements'] """ + return _partition_file_inner( + file=file, + aryn_api_key=aryn_api_key, + aryn_config=aryn_config, + threshold=threshold, + use_ocr=use_ocr, + ocr_images=ocr_images, + extract_table_structure=extract_table_structure, + table_extraction_options=table_extraction_options, + extract_images=extract_images, + selected_pages=selected_pages, + chunking_options=chunking_options, + aps_url=aps_url, + docparse_url=docparse_url, + ssl_verify=ssl_verify, + output_format=output_format, + output_label_options=output_label_options, + ) + + +def _partition_file_inner( + file: Union[BinaryIO, str, PathLike], + *, + aryn_api_key: Optional[str] = None, + aryn_config: 
Optional[ArynConfig] = None, + threshold: Optional[Union[float, Literal["auto"]]] = None, + use_ocr: bool = False, + ocr_images: bool = False, + extract_table_structure: bool = False, + table_extraction_options: dict[str, Any] = {}, + extract_images: bool = False, + selected_pages: Optional[list[Union[list[int], int]]] = None, + chunking_options: Optional[dict[str, Any]] = None, + aps_url: Optional[str] = None, # deprecated in favor of docparse_url + docparse_url: Optional[str] = None, + ssl_verify: bool = True, + output_format: Optional[str] = None, + output_label_options: dict[str, Any] = {}, + webhook_url: Optional[str] = None, +): + """Do not call this function directly. Use partition_file or partition_file_async_submit instead.""" # If you hand me a path for the file, read it in instead of trying to send the path if isinstance(file, (str, PathLike)): with open(file, "rb") as f: file = io.BytesIO(f.read()) - if aryn_api_key is not None: - if aryn_config is not None: - _logger.warning("Both aryn_api_key and aryn_config were provided. Using aryn_api_key") - aryn_config = ArynConfig(aryn_api_key=aryn_api_key) - if aryn_config is None: - aryn_config = ArynConfig() + aryn_config = _process_config(aryn_api_key, aryn_config) if aps_url is not None: if docparse_url is not None: @@ -160,21 +205,13 @@ def partition_file( _logger.debug(f"{options_str}") - # Workaround for vcr. See https://github.com/aryn-ai/sycamore/issues/958 - stream = True - if "vcr" in sys.modules: - ul3 = sys.modules.get("urllib3") - if ul3: - # Look for tell-tale patched method... 
def _set_stream() -> bool:
    """Tell callers whether to pass ``stream=True`` to ``requests``.

    Workaround for vcr. See https://github.com/aryn-ai/sycamore/issues/958

    Returns:
        False only when ``vcr`` has been imported, ``urllib3`` has been imported, and
        ``urllib3.connectionpool.is_connection_dropped`` lives in a module whose name
        contains "mock" (the tell-tale sign that the method has been patched).
        True in every other case.
    """
    if "vcr" not in sys.modules:
        return True

    urllib3_module = sys.modules.get("urllib3")
    if not urllib3_module:
        return True

    # Look for tell-tale patched method...
    origin_module = urllib3_module.connectionpool.is_connection_dropped.__module__
    return "mock" not in origin_module
The resulting POST request will have a + body like: {"done": [{"job_id": "aryn:j-47gpd3604e5tz79z1jro5fc"}]} + async_submit_url: When set, this will override the endpoint the job is submitted to. + + Returns: + A dictionary containing the key "job_id" the value of which can be used with the `partition_file_async_result` + function to get the results and check the status of the async job. + + Single Job Example: + .. code-block:: python + + import time + from aryn_sdk.partition import partition_file_async_submit, partition_file_async_result + + with open("my-favorite-pdf.pdf", "rb") as f: + response = partition_file_async_submit( + f, + use_ocr=True, + extract_table_structure=True, + ) + + job_id = response["job_id"] + + # Poll for the results + while True: + result = partition_file_async_result(job_id) + if result["status"] != "pending": + break + time.sleep(5) + + Multi-Job Example: + .. code-block:: python + + import logging + import time + from aryn_sdk.partition import partition_file_async_submit, partition_file_async_result + + files = [open("file1.pdf", "rb"), open("file2.docx", "rb")] + job_ids = [None] * len(files) + for i, f in enumerate(files): + try: + job_ids[i] = partition_file_async_submit(f)["job_id"] + except Exception as e: + logging.warning(f"Failed to submit {f}: {e}") + + results = [None] * len(files) + for i, job_id in enumerate(job_ids): + while True: + result = partition_file_async_result(job_id) + if result["status"] != "pending": + break + time.sleep(5) + results[i] = result + + """ + + if async_submit_url: + docparse_url = async_submit_url + elif not aps_url and not docparse_url: + docparse_url = _convert_sync_to_async_submit_url(ARYN_DOCPARSE_URL) + else: + if aps_url: + aps_url = _convert_sync_to_async_submit_url(aps_url) + if docparse_url: + docparse_url = _convert_sync_to_async_submit_url(docparse_url) + + return _partition_file_inner( + file=file, + aryn_api_key=aryn_api_key, + aryn_config=aryn_config, + threshold=threshold, + 
def _convert_sync_to_async_submit_url(url: str) -> str:
    """Rewrite a synchronous endpoint URL into its async-submit equivalent.

    The leading ``/v1`` of the path is replaced by ``/v1/async/submit`` and the rest of
    the path is kept, so ``/v1/document/partition`` becomes
    ``/v1/async/submit/document/partition``. Scheme, host, params, query and fragment
    are carried over unchanged. A URL whose path already starts with
    ``/v1/async/submit`` is returned as-is.

    Args:
        url: The URL to convert. Its path must start with ``/v1/``.

    Returns:
        The async-submit URL.

    Raises:
        ValueError: If the path of ``url`` does not start with ``/v1/``.
    """
    parsed_url = urlparse(url)
    path = parsed_url.path

    # Validate explicitly rather than with `assert`: assertions are stripped under
    # `python -O`, which would let an unexpected path be sliced into a bogus URL.
    if not path.startswith("/v1/"):
        raise ValueError(f"Expected a URL whose path starts with '/v1/', got: {url}")

    if path.startswith("/v1/async/submit"):
        return url

    # path[3:] drops the leading "/v1" and keeps everything from the next "/" onward.
    return urlunparse(parsed_url._replace(path=f"/v1/async/submit{path[3:]}"))
def _convert_sync_to_async_url(url: str, prefix: str = "/result") -> str:
    """Rewrite a synchronous endpoint URL into an async endpoint URL.

    The whole path is replaced by ``/v1/async{prefix}``; nothing of the original path
    beyond the ``/v1/`` check is kept. Scheme, host, params, query and fragment are
    carried over unchanged. A URL whose path already starts with ``/v1/async{prefix}``
    is returned as-is.

    Args:
        url: The URL to convert. Its path must start with ``/v1/``.
        prefix: Path segment (with leading slash) appended after ``/v1/async``, for
            example "/result", "/cancel" or "/list".

    Returns:
        The async endpoint URL.

    Raises:
        ValueError: If the path of ``url`` does not start with ``/v1/``.
    """
    parsed_url = urlparse(url)
    path = parsed_url.path

    # Validate explicitly rather than with `assert`: assertions are stripped under
    # `python -O`, which would let any URL be rewritten without being checked.
    if not path.startswith("/v1/"):
        raise ValueError(f"Expected a URL whose path starts with '/v1/', got: {url}")

    async_path = f"/v1/async{prefix}"
    if path.startswith(async_path):
        return url

    return urlunparse(parsed_url._replace(path=async_path))
def partition_file_async_list(
    *,
    aryn_api_key: Optional[str] = None,
    aryn_config: Optional[ArynConfig] = None,
    ssl_verify: bool = True,
    async_list_url: Optional[str] = None,
) -> dict[str, Any]:
    """
    List pending async jobs.

    Args:
        aryn_api_key: API key used to authenticate the request. When given together with `aryn_config`,
            the key takes precedence.
        aryn_config: ArynConfig that supplies the API key when `aryn_api_key` is not given.
        ssl_verify: verify ssl certificates; forwarded to `requests` as `verify`.
        async_list_url: When set, this will override the endpoint the job list is fetched from.
            default: None (the list endpoint is derived from the default DocParse URL)

    Returns:
        A dict containing "jobs" which is a dict containing jobs with their job_id as their key and their value is a
        dict containing the keys "path" and "state".

        {
            "jobs": {
                "aryn:j-sc0v0lglkauo774pioflp4l": {
                    "path": "/v1/document/partition",
                    "state": "run"
                },
                "aryn:j-0eorfmvhaf9skaxm0sagrrl": {
                    "path": "/v1/document/partition",
                    "state": "run"
                },
                "aryn:j-b9xp7ny0eejvqvbazjhg8rn": {
                    "path": "/v1/document/partition",
                    "state": "run"
                }
            }
        }

    Raises:
        requests.exceptions.HTTPError: If the service answers with a non-2xx status code.

    Example:
         .. code-block:: python

            from aryn_sdk.partition import partition_file_async_list
            partition_file_async_list()
    """
    if not async_list_url:
        async_list_url = _convert_sync_to_async_url(ARYN_DOCPARSE_URL, "/list")

    aryn_config = _process_config(aryn_api_key, aryn_config)

    http_header = {"Authorization": f"Bearer {aryn_config.api_key()}"}
    response = requests.get(async_list_url, headers=http_header, stream=_set_stream(), verify=ssl_verify)

    # Fail loudly on error responses, the same way the submit/partition path does, instead of
    # handing an error payload back to the caller as if it were a job listing.
    if response.status_code < 200 or response.status_code >= 300:
        raise requests.exceptions.HTTPError(
            f"Error: status_code: {response.status_code}, reason: {response.text}", response=response
        )

    return response.json()
aryn_call_id=72cb99f9-c127-4a83-88da-a0546cc1bd77", "T+ 0.00: Waiting for scheduling", "T+ 0.00: Preprocessing document", - "T+ 0.01: Done preprocessing document", - "T+ 0.93: completed page 1", + "T+ 0.00: Done preprocessing document", + "T+ 0.53: Completed work on page 1", "" ], + "status_code": 200, "elements": [ { "type": "Page-header", "bbox": [ - 0.09254883710075827, - 0.02588048761541193, - 0.18516457950367646, - 0.036462620821866125 + 0.09259089750402114, + 0.026172181909734554, + 0.18483936982996324, + 0.03624992023814808 ], "properties": { - "score": 0.7105238437652588, + "score": 0.7373364567756653, "page_number": 1 }, "text_representation": "Table of Contents \n" }, { - "type": "Section-header", + "type": "Caption", "bbox": [ - 0.0923248291015625, - 0.0672761327570135, - 0.3043683220358456, - 0.09994891079989347 + 0.09244359633501838, + 0.06721047834916548, + 0.30449961942784926, + 0.10004626187411221 ], "properties": { - "score": 0.4793054163455963, + "score": 0.3883528411388397, "page_number": 1 }, "text_representation": "3M Company and Subsidiaries\nConsolidated Statement of Cash Flow s\nYears ended December 31\n" @@ -44,31 +45,31 @@ { "type": "table", "bbox": [ - 0.0904532668169807, - 0.11156481656161221, - 0.8908106186810661, - 0.6249001242897727 + 0.0905675910500919, + 0.11067503495649858, + 0.8910016946231618, + 0.6249686501242898 ], "properties": { - "score": 0.9139547944068909, + "score": 0.9064249992370605, "title": null, "columns": null, "rows": null, "page_number": 1 }, - "table": null, - "_override_text": "(Millions)\nCash Flows from Operating Activities\nNet income including noncontrolling interest\nAdjustments to reconcile net income including noncontrolling interest to net cash\n 2018\n 2017\n 2016\n $\n 5,363 $\n 4,869 $\n 5,058 \n provided by operating activities\nDepreciation and amortization\nCompany pension and postretirement contributions\nCompany pension and postretirement expense\nStock-based compensation expense\nGain on sale of 
businesses\nDeferred income taxes\nChanges in assets and liabilities\n Accounts receivable\nInventories\nAccounts payable\nAccrued income taxes (current and long-term)\n Other — net\n Net cash provided by (used in) operating activities\n Cash Flows from Investing Activities\nPurchases of property, plant and equipment (PP&E)\nProceeds from sale of PP&E and other assets\nAcquisitions, net of cash acquired\nPurchases of marketable securities and investments\nProceeds from maturities and sale of marketable securities and investments\nProceeds from sale of businesses, net of cash sold\n Other — net\nNet cash provided by (used in) investing activities\n Cash Flows from Financing Activities\nChange in short-term debt — net\nRepayment of debt (maturities greater than 90 days)\nProceeds from debt (maturities greater than 90 days)\nPurchases of treasury stock\nProceeds from issuance of treasury stock pursuant to stock option and benefit plans\nDividends paid to shareholders\nOther — net\nNet cash provided by (used in) financing activities\n Effect of exchange rate changes on cash and cash equivalents\n Net increase (decrease) in cash and cash equivalents\nCash and cash equivalents at beginning of year\nCash and cash equivalents at end of period\n 1,488 \n(370) \n410 \n302 \n(545) \n(57) \n (305) \n(509) \n408 \n134 \n120 \n6,439 \n (1,577) \n262 \n13 \n(1,828) \n2,497 \n 846 \n 9 \n222 \n (284) \n(1,034) \n2,251 \n(4,870) \n485 \n(3,193) \n(56) \n(6,701) \n (160) \n 1,544 \n(967) \n334 \n324 \n(586) \n107 \n (245) \n(387) \n24 \n967 \n256 \n6,240 \n (1,373) \n49 \n(2,023) \n(2,152) \n1,354 \n 1,065 \n(6) \n(3,086) \n 578 \n(962) \n1,987 \n(2,068) \n734 \n(2,803) \n(121) \n(2,655) \n 156 \n (200) \n3,053 \n2,853 $\n 655 \n2,398 \n3,053 $\n $\n 1,474 \n(383) \n250 \n298 \n(111) \n 7 \n (313) \n57 \n148 \n101 \n76 \n6,662 \n (1,420) \n58 \n(16) \n(1,410) \n1,247 \n 142 \n(4) \n(1,403) \n (797) \n(992) \n2,832 \n(3,753) \n804 \n(2,678) \n(42) \n(4,626) \n (33) \n 600 \n1,798 
\n2,398 \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n" + "text_representation": "(Millions)\nCash Flows from Operating Activities\nNet income including noncontrolling interest\nAdjustments to reconcile net income including noncontrolling interest to net cash\n 2018\n 2017\n 2016\n $\n 5,363 $\n 4,869 $\n 5,058 \n provided by operating activities\nDepreciation and amortization\nCompany pension and postretirement contributions\nCompany pension and postretirement expense\nStock-based compensation expense\nGain on sale of businesses\nDeferred income taxes\nChanges in assets and liabilities\n Accounts receivable\nInventories\nAccounts payable\nAccrued income taxes (current and long-term)\n Other \u2014 net\n Net cash provided by (used in) operating activities\n Cash Flows from Investing Activities\nPurchases of property, plant and equipment (PP&E)\nProceeds from sale of PP&E and other assets\nAcquisitions, net of cash acquired\nPurchases of marketable securities and investments\nProceeds from maturities and sale of marketable securities and investments\nProceeds from sale of businesses, net of cash sold\n Other \u2014 net\nNet cash provided by (used in) investing activities\n Cash Flows from Financing Activities\nChange in short-term debt \u2014 net\nRepayment of debt (maturities greater than 90 days)\nProceeds from debt (maturities greater than 90 days)\nPurchases of treasury stock\nProceeds from issuance of treasury stock pursuant to 
stock option and benefit plans\nDividends paid to shareholders\nOther \u2014 net\nNet cash provided by (used in) financing activities\n Effect of exchange rate changes on cash and cash equivalents\n Net increase (decrease) in cash and cash equivalents\nCash and cash equivalents at beginning of year\nCash and cash equivalents at end of period\n 1,488 \n(370) \n410 \n302 \n(545) \n(57) \n (305) \n(509) \n408 \n134 \n120 \n6,439 \n (1,577) \n262 \n13 \n(1,828) \n2,497 \n 846 \n 9 \n222 \n (284) \n(1,034) \n2,251 \n(4,870) \n485 \n(3,193) \n(56) \n(6,701) \n (160) \n 1,544 \n(967) \n334 \n324 \n(586) \n107 \n (245) \n(387) \n24 \n967 \n256 \n6,240 \n (1,373) \n49 \n(2,023) \n(2,152) \n1,354 \n 1,065 \n(6) \n(3,086) \n 578 \n(962) \n1,987 \n(2,068) \n734 \n(2,803) \n(121) \n(2,655) \n 156 \n (200) \n3,053 \n2,853 $\n 655 \n2,398 \n3,053 $\n $\n 1,474 \n(383) \n250 \n298 \n(111) \n 7 \n (313) \n57 \n148 \n101 \n76 \n6,662 \n (1,420) \n58 \n(16) \n(1,410) \n1,247 \n 142 \n(4) \n(1,403) \n (797) \n(992) \n2,832 \n(3,753) \n804 \n(2,678) \n(42) \n(4,626) \n (33) \n 600 \n1,798 \n2,398 \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n", + "table": null }, { "type": "Text", "bbox": [ - 0.09334101957433363, - 0.636026777787642, - 0.6066466567095589, - 0.6459264026988636 + 0.09298765294692096, + 0.6364184015447443, + 0.6064737477022059, + 0.6465563409978693 ], "properties": { - "score": 0.8346976041793823, + "score": 0.4158885180950165, 
"page_number": 1 }, "text_representation": "The accompanying Notes to Consolidated Financial Statements are an integral part of this statement.\n" @@ -76,13 +77,13 @@ { "type": "Page-footer", "bbox": [ - 0.47960643095128674, - 0.6814282781427556, - 0.4929185216567096, - 0.6909636896306818 + 0.47945628446691174, + 0.6812337979403409, + 0.49324290556066175, + 0.6917808393998579 ], "properties": { - "score": 0.8874315023422241, + "score": 0.908870279788971, "page_number": 1 }, "text_representation": "60\n" diff --git a/lib/aryn-sdk/aryn_sdk/test/resources/json/3m_output_ocr_table.json b/lib/aryn-sdk/aryn_sdk/test/resources/json/3m_output_ocr_table.json index 5aa5ef2d0..97d36a454 100644 --- a/lib/aryn-sdk/aryn_sdk/test/resources/json/3m_output_ocr_table.json +++ b/lib/aryn-sdk/aryn_sdk/test/resources/json/3m_output_ocr_table.json @@ -4,58 +4,60 @@ "Until you get a line that matches ' ]\n', you can convert the partial", "output to a json document by appending '\"\"]}' to the partial output.", "", - "T+ 0.00: Server version 0.2024.06.28", - "T+ 0.00: Received request with aryn_call_id=ae2c4bc0-57c9-4fb9-9210-55a4e43f73f0", + "T+ 0.00: Server version aryn-partitioner-0.20250113.175427 Model version 1.4", + "T+ 0.00: Received request with aryn_call_id=de1de753-1907-4404-a019-b6f8311ff1e6", "T+ 0.00: Waiting for scheduling", "T+ 0.00: Preprocessing document", - "T+ 0.01: Done preprocessing document", - "T+ 2.82: completed page 1", + "T+ 0.00: Done preprocessing document", + "T+ 0.82: Completed work on page 1", "" ], + "status_code": 200, "elements": [ { "type": "Page-header", "bbox": [ - 0.09254883710075827, - 0.02588048761541193, - 0.18516457950367646, - 0.036462620821866125 + 0.09259089750402114, + 0.026172181909734554, + 0.18483936982996324, + 0.03624992023814808 ], "properties": { - "score": 0.7105238437652588, + "score": 0.7373364567756653, "page_number": 1 }, - "text_representation": "" + "text_representation": "Table of Contents" }, { - "type": "Section-header", + 
"type": "Caption", "bbox": [ - 0.0923248291015625, - 0.0672761327570135, - 0.3043683220358456, - 0.09994891079989347 + 0.09244359633501838, + 0.06721047834916548, + 0.30449961942784926, + 0.10004626187411221 ], "properties": { - "score": 0.4793054163455963, + "score": 0.3883528411388397, "page_number": 1 }, - "text_representation": "3M Company and Subsidiaries\nConsolidated Statement of Cash Flows\nYears ended December 31\n" + "text_representation": "3M Company and Subsidiaries. Consolidated Statement of Cash Flow s Years ended December 31" }, { "type": "table", "bbox": [ - 0.0904532668169807, - 0.11156481656161221, - 0.8908106186810661, - 0.6249001242897727 + 0.0905675910500919, + 0.11067503495649858, + 0.8910016946231618, + 0.6249686501242898 ], "properties": { - "score": 0.9139547944068909, + "score": 0.9064249992370605, "title": null, - "columns": null, - "rows": null, + "columns": 4, + "rows": 39, "page_number": 1 }, + "text_representation": null, "table": { "cells": [ { @@ -68,10 +70,10 @@ ], "is_header": true, "bbox": { - "x1": 0.0904532668169807, - "y1": 0.11292845292524858, - "x2": 0.5186885609346278, - "y2": 0.12201936201615766 + "x1": 0.09115582634420956, + "y1": 0.11203867132013494, + "x2": 0.5158617086971508, + "y2": 0.1234023076837713 }, "properties": {} }, @@ -85,10 +87,10 @@ ], "is_header": true, "bbox": { - "x1": 0.5992767962287454, - "y1": 0.11292845292524858, - "x2": 0.6622179726993337, - "y2": 0.12201936201615766 + "x1": 0.5611558263442096, + "y1": 0.11203867132013494, + "x2": 0.6635087675206801, + "y2": 0.1234023076837713 }, "properties": {} }, @@ -102,10 +104,10 @@ ], "is_header": true, "bbox": { - "x1": 0.713394443287569, - "y1": 0.11292845292524858, - "x2": 0.7751591491699219, - "y2": 0.12201936201615766 + "x1": 0.6746852381089155, + "y1": 0.11203867132013494, + "x2": 0.7758617086971508, + "y2": 0.1234023076837713 }, "properties": {} }, @@ -119,15 +121,15 @@ ], "is_header": true, "bbox": { - "x1": 0.8239826785816866, - "y1": 
0.11292845292524858, - "x2": 0.8857473844640396, - "y2": 0.12201936201615766 + "x1": 0.7840970028147978, + "y1": 0.11203867132013494, + "x2": 0.8876264145795036, + "y2": 0.1234023076837713 }, "properties": {} }, { - "content": "Cash Flows Operating Activities from", + "content": "Cash Flows from Operating Activities", "rows": [ 1 ], @@ -139,30 +141,78 @@ ], "is_header": false, "bbox": { - "x1": 0.0904532668169807, - "y1": 0.12201936201615766, - "x2": 0.8857473844640396, - "y2": 0.13611027110706675 + "x1": 0.09115582634420956, + "y1": 0.12294776222922585, + "x2": 0.8876264145795036, + "y2": 0.1343113985928622 }, "properties": {} }, { - "content": "Net income including noncontrolling interest 5,363 4.869 5,058", + "content": "Net income including noncontrolling interest.", + "rows": [ + 2 + ], + "cols": [ + 0 + ], + "is_header": false, + "bbox": { + "x1": 0.09115582634420956, + "y1": 0.13476594404740766, + "x2": 0.5158617086971508, + "y2": 0.14703867132013496 + }, + "properties": {} + }, + { + "content": "$ 5,363", + "rows": [ + 2 + ], + "cols": [ + 1 + ], + "is_header": false, + "bbox": { + "x1": 0.5611558263442096, + "y1": 0.13476594404740766, + "x2": 0.6635087675206801, + "y2": 0.14703867132013496 + }, + "properties": {} + }, + { + "content": "$ 4,869", + "rows": [ + 2 + ], + "cols": [ + 2 + ], + "is_header": false, + "bbox": { + "x1": 0.6746852381089155, + "y1": 0.13476594404740766, + "x2": 0.7758617086971508, + "y2": 0.14703867132013496 + }, + "properties": {} + }, + { + "content": "$ 5,058", "rows": [ 2 ], "cols": [ - 0, - 1, - 2, 3 ], "is_header": false, "bbox": { - "x1": 0.0904532668169807, - "y1": 0.13338299837979403, - "x2": 0.8857473844640396, - "y2": 0.14701936201615767 + "x1": 0.7840970028147978, + "y1": 0.13476594404740766, + "x2": 0.8876264145795036, + "y2": 0.14703867132013496 }, "properties": {} }, @@ -172,17 +222,65 @@ 3 ], "cols": [ - 0, - 1, - 2, + 0 + ], + "is_header": false, + "bbox": { + "x1": 0.09115582634420956, + "y1": 0.14703867132013496, + 
"x2": 0.5158617086971508, + "y2": 0.16794776222922586 + }, + "properties": {} + }, + { + "content": "", + "rows": [ 3 ], + "cols": [ + 1 + ], "is_header": false, "bbox": { - "x1": 0.0904532668169807, - "y1": 0.14429208928888496, - "x2": 0.8857473844640396, - "y2": 0.1697466347434304 + "x1": 0.5611558263442096, + "y1": 0.14703867132013496, + "x2": 0.6635087675206801, + "y2": 0.16794776222922586 + }, + "properties": {} + }, + { + "content": "", + "rows": [ + 3 + ], + "cols": [ + 2 + ], + "is_header": false, + "bbox": { + "x1": 0.6746852381089155, + "y1": 0.14703867132013496, + "x2": 0.7758617086971508, + "y2": 0.16794776222922586 + }, + "properties": {} + }, + { + "content": "", + "rows": [ + 3 + ], + "cols": [ + 3 + ], + "is_header": false, + "bbox": { + "x1": 0.7840970028147978, + "y1": 0.14703867132013496, + "x2": 0.8876264145795036, + "y2": 0.16794776222922586 }, "properties": {} }, @@ -196,10 +294,10 @@ ], "is_header": false, "bbox": { - "x1": 0.0904532668169807, - "y1": 0.1688375438343395, - "x2": 0.5186885609346278, - "y2": 0.1806557256525213 + "x1": 0.09115582634420956, + "y1": 0.16840230768377132, + "x2": 0.5158617086971508, + "y2": 0.1815841258655895 }, "properties": {} }, @@ -213,10 +311,10 @@ ], "is_header": false, "bbox": { - "x1": 0.5992767962287454, - "y1": 0.1688375438343395, - "x2": 0.6622179726993337, - "y2": 0.1806557256525213 + "x1": 0.5611558263442096, + "y1": 0.16840230768377132, + "x2": 0.6635087675206801, + "y2": 0.1815841258655895 }, "properties": {} }, @@ -230,10 +328,10 @@ ], "is_header": false, "bbox": { - "x1": 0.713394443287569, - "y1": 0.1688375438343395, - "x2": 0.7751591491699219, - "y2": 0.1806557256525213 + "x1": 0.6746852381089155, + "y1": 0.16840230768377132, + "x2": 0.7758617086971508, + "y2": 0.1815841258655895 }, "properties": {} }, @@ -247,10 +345,10 @@ ], "is_header": false, "bbox": { - "x1": 0.8239826785816866, - "y1": 0.1688375438343395, - "x2": 0.8857473844640396, - "y2": 0.1806557256525213 + "x1": 0.7840970028147978, + 
"y1": 0.16840230768377132, + "x2": 0.8876264145795036, + "y2": 0.1815841258655895 }, "properties": {} }, @@ -264,10 +362,10 @@ ], "is_header": false, "bbox": { - "x1": 0.0904532668169807, - "y1": 0.1797466347434304, - "x2": 0.5186885609346278, - "y2": 0.19247390747070312 + "x1": 0.09115582634420956, + "y1": 0.17976594404740767, + "x2": 0.5158617086971508, + "y2": 0.1934023076837713 }, "properties": {} }, @@ -281,15 +379,15 @@ ], "is_header": false, "bbox": { - "x1": 0.5992767962287454, - "y1": 0.1797466347434304, - "x2": 0.6622179726993337, - "y2": 0.19247390747070312 + "x1": 0.5611558263442096, + "y1": 0.17976594404740767, + "x2": 0.6635087675206801, + "y2": 0.1934023076837713 }, "properties": {} }, { - "content": "967)", + "content": "(967)", "rows": [ 5 ], @@ -298,10 +396,10 @@ ], "is_header": false, "bbox": { - "x1": 0.713394443287569, - "y1": 0.1797466347434304, - "x2": 0.7751591491699219, - "y2": 0.19247390747070312 + "x1": 0.6746852381089155, + "y1": 0.17976594404740767, + "x2": 0.7758617086971508, + "y2": 0.1934023076837713 }, "properties": {} }, @@ -315,10 +413,10 @@ ], "is_header": false, "bbox": { - "x1": 0.8239826785816866, - "y1": 0.1797466347434304, - "x2": 0.8857473844640396, - "y2": 0.19247390747070312 + "x1": 0.7840970028147978, + "y1": 0.17976594404740767, + "x2": 0.8876264145795036, + "y2": 0.1934023076837713 }, "properties": {} }, @@ -332,10 +430,10 @@ ], "is_header": false, "bbox": { - "x1": 0.0904532668169807, - "y1": 0.1906557256525213, - "x2": 0.5186885609346278, - "y2": 0.20429208928888495 + "x1": 0.09115582634420956, + "y1": 0.19158412586558948, + "x2": 0.5158617086971508, + "y2": 0.20385685313831675 }, "properties": {} }, @@ -349,10 +447,10 @@ ], "is_header": false, "bbox": { - "x1": 0.5992767962287454, - "y1": 0.1906557256525213, - "x2": 0.6622179726993337, - "y2": 0.20429208928888495 + "x1": 0.5611558263442096, + "y1": 0.19158412586558948, + "x2": 0.6635087675206801, + "y2": 0.20385685313831675 }, "properties": {} }, @@ -366,10 +464,10 
@@ ], "is_header": false, "bbox": { - "x1": 0.713394443287569, - "y1": 0.1906557256525213, - "x2": 0.7751591491699219, - "y2": 0.20429208928888495 + "x1": 0.6746852381089155, + "y1": 0.19158412586558948, + "x2": 0.7758617086971508, + "y2": 0.20385685313831675 }, "properties": {} }, @@ -383,10 +481,10 @@ ], "is_header": false, "bbox": { - "x1": 0.8239826785816866, - "y1": 0.1906557256525213, - "x2": 0.8857473844640396, - "y2": 0.20429208928888495 + "x1": 0.7840970028147978, + "y1": 0.19158412586558948, + "x2": 0.8876264145795036, + "y2": 0.20385685313831675 }, "properties": {} }, @@ -400,10 +498,10 @@ ], "is_header": false, "bbox": { - "x1": 0.0904532668169807, - "y1": 0.20247390747070312, - "x2": 0.5186885609346278, - "y2": 0.21520118019797585 + "x1": 0.09115582634420956, + "y1": 0.20203867132013495, + "x2": 0.5158617086971508, + "y2": 0.2156750349564986 }, "properties": {} }, @@ -417,10 +515,10 @@ ], "is_header": false, "bbox": { - "x1": 0.5992767962287454, - "y1": 0.20247390747070312, - "x2": 0.6622179726993337, - "y2": 0.21520118019797585 + "x1": 0.5611558263442096, + "y1": 0.20203867132013495, + "x2": 0.6635087675206801, + "y2": 0.2156750349564986 }, "properties": {} }, @@ -434,10 +532,10 @@ ], "is_header": false, "bbox": { - "x1": 0.713394443287569, - "y1": 0.20247390747070312, - "x2": 0.7751591491699219, - "y2": 0.21520118019797585 + "x1": 0.6746852381089155, + "y1": 0.20203867132013495, + "x2": 0.7758617086971508, + "y2": 0.2156750349564986 }, "properties": {} }, @@ -451,10 +549,10 @@ ], "is_header": false, "bbox": { - "x1": 0.8239826785816866, - "y1": 0.20247390747070312, - "x2": 0.8857473844640396, - "y2": 0.21520118019797585 + "x1": 0.7840970028147978, + "y1": 0.20203867132013495, + "x2": 0.8876264145795036, + "y2": 0.2156750349564986 }, "properties": {} }, @@ -468,10 +566,10 @@ ], "is_header": false, "bbox": { - "x1": 0.0904532668169807, - "y1": 0.21338299837979405, - "x2": 0.5186885609346278, - "y2": 0.22838299837979403 + "x1": 0.09115582634420956, + 
"y1": 0.21431139859286222, + "x2": 0.5158617086971508, + "y2": 0.2274932167746804 }, "properties": {} }, @@ -485,10 +583,10 @@ ], "is_header": false, "bbox": { - "x1": 0.5992767962287454, - "y1": 0.21338299837979405, - "x2": 0.6622179726993337, - "y2": 0.22838299837979403 + "x1": 0.5611558263442096, + "y1": 0.21431139859286222, + "x2": 0.6635087675206801, + "y2": 0.2274932167746804 }, "properties": {} }, @@ -502,10 +600,10 @@ ], "is_header": false, "bbox": { - "x1": 0.713394443287569, - "y1": 0.21338299837979405, - "x2": 0.7751591491699219, - "y2": 0.22838299837979403 + "x1": 0.6746852381089155, + "y1": 0.21431139859286222, + "x2": 0.7758617086971508, + "y2": 0.2274932167746804 }, "properties": {} }, @@ -519,10 +617,10 @@ ], "is_header": false, "bbox": { - "x1": 0.8239826785816866, - "y1": 0.21338299837979405, - "x2": 0.8857473844640396, - "y2": 0.22838299837979403 + "x1": 0.7840970028147978, + "y1": 0.21431139859286222, + "x2": 0.8876264145795036, + "y2": 0.2274932167746804 }, "properties": {} }, @@ -536,10 +634,10 @@ ], "is_header": false, "bbox": { - "x1": 0.0904532668169807, - "y1": 0.22520118019797586, - "x2": 0.5186885609346278, - "y2": 0.23701936201615767 + "x1": 0.09115582634420956, + "y1": 0.22612958041104403, + "x2": 0.5158617086971508, + "y2": 0.2384023076837713 }, "properties": {} }, @@ -553,10 +651,10 @@ ], "is_header": false, "bbox": { - "x1": 0.5992767962287454, - "y1": 0.22520118019797586, - "x2": 0.6622179726993337, - "y2": 0.23701936201615767 + "x1": 0.5611558263442096, + "y1": 0.22612958041104403, + "x2": 0.6635087675206801, + "y2": 0.2384023076837713 }, "properties": {} }, @@ -570,15 +668,15 @@ ], "is_header": false, "bbox": { - "x1": 0.713394443287569, - "y1": 0.22520118019797586, - "x2": 0.7751591491699219, - "y2": 0.23701936201615767 + "x1": 0.6746852381089155, + "y1": 0.22612958041104403, + "x2": 0.7758617086971508, + "y2": 0.2384023076837713 }, "properties": {} }, { - "content": "", + "content": "7", "rows": [ 9 ], @@ -587,15 +685,15 @@ ], 
"is_header": false, "bbox": { - "x1": 0.8239826785816866, - "y1": 0.22520118019797586, - "x2": 0.8857473844640396, - "y2": 0.23701936201615767 + "x1": 0.7840970028147978, + "y1": 0.22612958041104403, + "x2": 0.8876264145795036, + "y2": 0.2384023076837713 }, "properties": {} }, { - "content": "Changes in assets and liabilities", + "content": "Changes in assets and liabilitie", "rows": [ 10 ], @@ -607,10 +705,10 @@ ], "is_header": false, "bbox": { - "x1": 0.0904532668169807, - "y1": 0.23611027110706675, - "x2": 0.8857473844640396, - "y2": 0.2502011801979758 + "x1": 0.09115582634420956, + "y1": 0.23794776222922584, + "x2": 0.8876264145795036, + "y2": 0.24703867132013493 }, "properties": {} }, @@ -624,10 +722,10 @@ ], "is_header": false, "bbox": { - "x1": 0.0904532668169807, - "y1": 0.24883754383433948, - "x2": 0.5186885609346278, - "y2": 0.2597466347434304 + "x1": 0.09115582634420956, + "y1": 0.2484023076837713, + "x2": 0.5158617086971508, + "y2": 0.2615841258655895 }, "properties": {} }, @@ -641,10 +739,10 @@ ], "is_header": false, "bbox": { - "x1": 0.5992767962287454, - "y1": 0.24883754383433948, - "x2": 0.6622179726993337, - "y2": 0.2597466347434304 + "x1": 0.5611558263442096, + "y1": 0.2484023076837713, + "x2": 0.6635087675206801, + "y2": 0.2615841258655895 }, "properties": {} }, @@ -658,10 +756,10 @@ ], "is_header": false, "bbox": { - "x1": 0.713394443287569, - "y1": 0.24883754383433948, - "x2": 0.7751591491699219, - "y2": 0.2597466347434304 + "x1": 0.6746852381089155, + "y1": 0.2484023076837713, + "x2": 0.7758617086971508, + "y2": 0.2615841258655895 }, "properties": {} }, @@ -675,10 +773,10 @@ ], "is_header": false, "bbox": { - "x1": 0.8239826785816866, - "y1": 0.24883754383433948, - "x2": 0.8857473844640396, - "y2": 0.2597466347434304 + "x1": 0.7840970028147978, + "y1": 0.2484023076837713, + "x2": 0.8876264145795036, + "y2": 0.2615841258655895 }, "properties": {} }, @@ -692,10 +790,10 @@ ], "is_header": false, "bbox": { - "x1": 0.0904532668169807, - "y1": 
0.2597466347434304, - "x2": 0.5186885609346278, - "y2": 0.27065572565252133 + "x1": 0.09115582634420956, + "y1": 0.25976594404740766, + "x2": 0.5158617086971508, + "y2": 0.27203867132013493 }, "properties": {} }, @@ -709,10 +807,10 @@ ], "is_header": false, "bbox": { - "x1": 0.5992767962287454, - "y1": 0.2597466347434304, - "x2": 0.6622179726993337, - "y2": 0.27065572565252133 + "x1": 0.5611558263442096, + "y1": 0.25976594404740766, + "x2": 0.6635087675206801, + "y2": 0.27203867132013493 }, "properties": {} }, @@ -726,10 +824,10 @@ ], "is_header": false, "bbox": { - "x1": 0.713394443287569, - "y1": 0.2597466347434304, - "x2": 0.7751591491699219, - "y2": 0.27065572565252133 + "x1": 0.6746852381089155, + "y1": 0.25976594404740766, + "x2": 0.7758617086971508, + "y2": 0.27203867132013493 }, "properties": {} }, @@ -743,10 +841,10 @@ ], "is_header": false, "bbox": { - "x1": 0.8239826785816866, - "y1": 0.2597466347434304, - "x2": 0.8857473844640396, - "y2": 0.27065572565252133 + "x1": 0.7840970028147978, + "y1": 0.25976594404740766, + "x2": 0.8876264145795036, + "y2": 0.27203867132013493 }, "properties": {} }, @@ -760,10 +858,10 @@ ], "is_header": false, "bbox": { - "x1": 0.0904532668169807, - "y1": 0.27065572565252133, - "x2": 0.5186885609346278, - "y2": 0.2829284529252486 + "x1": 0.09115582634420956, + "y1": 0.271129580411044, + "x2": 0.5158617086971508, + "y2": 0.2834023076837713 }, "properties": {} }, @@ -777,10 +875,10 @@ ], "is_header": false, "bbox": { - "x1": 0.5992767962287454, - "y1": 0.27065572565252133, - "x2": 0.6622179726993337, - "y2": 0.2829284529252486 + "x1": 0.5611558263442096, + "y1": 0.271129580411044, + "x2": 0.6635087675206801, + "y2": 0.2834023076837713 }, "properties": {} }, @@ -794,10 +892,10 @@ ], "is_header": false, "bbox": { - "x1": 0.713394443287569, - "y1": 0.27065572565252133, - "x2": 0.7751591491699219, - "y2": 0.2829284529252486 + "x1": 0.6746852381089155, + "y1": 0.271129580411044, + "x2": 0.7758617086971508, + "y2": 0.2834023076837713 
}, "properties": {} }, @@ -811,15 +909,15 @@ ], "is_header": false, "bbox": { - "x1": 0.8239826785816866, - "y1": 0.27065572565252133, - "x2": 0.8857473844640396, - "y2": 0.2829284529252486 + "x1": 0.7840970028147978, + "y1": 0.271129580411044, + "x2": 0.8876264145795036, + "y2": 0.2834023076837713 }, "properties": {} }, { - "content": "Accrued income taxes current and term) long-'", + "content": "Accrued income taxes (current and long-term)", "rows": [ 14 ], @@ -828,10 +926,10 @@ ], "is_header": false, "bbox": { - "x1": 0.0904532668169807, - "y1": 0.28073021704121276, - "x2": 0.5186885609346278, - "y2": 0.29649032517292073 + "x1": 0.09115582634420956, + "y1": 0.2824932167746804, + "x2": 0.5158617086971508, + "y2": 0.2947659440474077 }, "properties": {} }, @@ -845,10 +943,10 @@ ], "is_header": false, "bbox": { - "x1": 0.5992767962287454, - "y1": 0.28073021704121276, - "x2": 0.6622179726993337, - "y2": 0.29649032517292073 + "x1": 0.5611558263442096, + "y1": 0.2824932167746804, + "x2": 0.6635087675206801, + "y2": 0.2947659440474077 }, "properties": {} }, @@ -862,10 +960,10 @@ ], "is_header": false, "bbox": { - "x1": 0.713394443287569, - "y1": 0.28073021704121276, - "x2": 0.7751591491699219, - "y2": 0.29649032517292073 + "x1": 0.6746852381089155, + "y1": 0.2824932167746804, + "x2": 0.7758617086971508, + "y2": 0.2947659440474077 }, "properties": {} }, @@ -879,15 +977,15 @@ ], "is_header": false, "bbox": { - "x1": 0.8239826785816866, - "y1": 0.28073021704121276, - "x2": 0.8857473844640396, - "y2": 0.29649032517292073 + "x1": 0.7840970028147978, + "y1": 0.2824932167746804, + "x2": 0.8876264145795036, + "y2": 0.2947659440474077 }, "properties": {} }, { - "content": "Other net", + "content": "Othernet", "rows": [ 15 ], @@ -896,10 +994,10 @@ ], "is_header": false, "bbox": { - "x1": 0.0904532668169807, - "y1": 0.29338299837979404, - "x2": 0.5186885609346278, - "y2": 0.30429208928888496 + "x1": 0.09115582634420956, + "y1": 0.2934023076837713, + "x2": 0.5158617086971508, + 
"y2": 0.3065841258655895 }, "properties": {} }, @@ -913,10 +1011,10 @@ ], "is_header": false, "bbox": { - "x1": 0.5992767962287454, - "y1": 0.29338299837979404, - "x2": 0.6622179726993337, - "y2": 0.30429208928888496 + "x1": 0.5611558263442096, + "y1": 0.2934023076837713, + "x2": 0.6635087675206801, + "y2": 0.3065841258655895 }, "properties": {} }, @@ -930,10 +1028,10 @@ ], "is_header": false, "bbox": { - "x1": 0.713394443287569, - "y1": 0.29338299837979404, - "x2": 0.7751591491699219, - "y2": 0.30429208928888496 + "x1": 0.6746852381089155, + "y1": 0.2934023076837713, + "x2": 0.7758617086971508, + "y2": 0.3065841258655895 }, "properties": {} }, @@ -947,15 +1045,15 @@ ], "is_header": false, "bbox": { - "x1": 0.8239826785816866, - "y1": 0.29338299837979404, - "x2": 0.8857473844640396, - "y2": 0.30429208928888496 + "x1": 0.7840970028147978, + "y1": 0.2934023076837713, + "x2": 0.8876264145795036, + "y2": 0.3065841258655895 }, "properties": {} }, { - "content": "Net cash provided by (used in) operating activities", + "content": "Net cash provided by (used in) operating activities.", "rows": [ 16 ], @@ -964,10 +1062,10 @@ ], "is_header": false, "bbox": { - "x1": 0.0904532668169807, - "y1": 0.3047466347434304, - "x2": 0.5186885609346278, - "y2": 0.3188375438343395 + "x1": 0.09115582634420956, + "y1": 0.3065841258655895, + "x2": 0.5158617086971508, + "y2": 0.3179477622292258 }, "properties": {} }, @@ -981,10 +1079,10 @@ ], "is_header": false, "bbox": { - "x1": 0.5992767962287454, - "y1": 0.3047466347434304, - "x2": 0.6622179726993337, - "y2": 0.3188375438343395 + "x1": 0.5611558263442096, + "y1": 0.3065841258655895, + "x2": 0.6635087675206801, + "y2": 0.3179477622292258 }, "properties": {} }, @@ -998,15 +1096,15 @@ ], "is_header": false, "bbox": { - "x1": 0.713394443287569, - "y1": 0.3047466347434304, - "x2": 0.7751591491699219, - "y2": 0.3188375438343395 + "x1": 0.6746852381089155, + "y1": 0.3065841258655895, + "x2": 0.7758617086971508, + "y2": 0.3179477622292258 }, 
"properties": {} }, { - "content": "66_", + "content": "6,662", "rows": [ 16 ], @@ -1015,15 +1113,15 @@ ], "is_header": false, "bbox": { - "x1": 0.8239826785816866, - "y1": 0.3047466347434304, - "x2": 0.8857473844640396, - "y2": 0.3188375438343395 + "x1": 0.7840970028147978, + "y1": 0.3065841258655895, + "x2": 0.8876264145795036, + "y2": 0.3179477622292258 }, "properties": {} }, { - "content": "Cash Flows from Investing Activities", + "content": "vities", "rows": [ 17 ], @@ -1035,10 +1133,10 @@ ], "is_header": false, "bbox": { - "x1": 0.0904532668169807, - "y1": 0.3279284529252486, - "x2": 0.8857473844640396, - "y2": 0.3424739074707031 + "x1": 0.09115582634420956, + "y1": 0.32976594404740767, + "x2": 0.8876264145795036, + "y2": 0.3374932167746804 }, "properties": {} }, @@ -1052,10 +1150,10 @@ ], "is_header": false, "bbox": { - "x1": 0.0904532668169807, - "y1": 0.3406557256525213, - "x2": 0.5186885609346278, - "y2": 0.35338299837979403 + "x1": 0.09115582634420956, + "y1": 0.3397659440474077, + "x2": 0.5158617086971508, + "y2": 0.3538568531383168 }, "properties": {} }, @@ -1069,10 +1167,10 @@ ], "is_header": false, "bbox": { - "x1": 0.5992767962287454, - "y1": 0.3406557256525213, - "x2": 0.6622179726993337, - "y2": 0.35338299837979403 + "x1": 0.5611558263442096, + "y1": 0.3397659440474077, + "x2": 0.6635087675206801, + "y2": 0.3538568531383168 }, "properties": {} }, @@ -1086,15 +1184,15 @@ ], "is_header": false, "bbox": { - "x1": 0.713394443287569, - "y1": 0.3406557256525213, - "x2": 0.7751591491699219, - "y2": 0.35338299837979403 + "x1": 0.6746852381089155, + "y1": 0.3397659440474077, + "x2": 0.7758617086971508, + "y2": 0.3538568531383168 }, "properties": {} }, { - "content": "1,420)", + "content": "(1,420)", "rows": [ 18 ], @@ -1103,10 +1201,10 @@ ], "is_header": false, "bbox": { - "x1": 0.8239826785816866, - "y1": 0.3406557256525213, - "x2": 0.8857473844640396, - "y2": 0.35338299837979403 + "x1": 0.7840970028147978, + "y1": 0.3397659440474077, + "x2": 
0.8876264145795036, + "y2": 0.3538568531383168 }, "properties": {} }, @@ -1120,10 +1218,10 @@ ], "is_header": false, "bbox": { - "x1": 0.0904532668169807, - "y1": 0.3524739074707031, - "x2": 0.5186885609346278, - "y2": 0.36338299837979404 + "x1": 0.09115582634420956, + "y1": 0.35203867132013494, + "x2": 0.5158617086971508, + "y2": 0.3652204895019531 }, "properties": {} }, @@ -1137,10 +1235,10 @@ ], "is_header": false, "bbox": { - "x1": 0.5992767962287454, - "y1": 0.3524739074707031, - "x2": 0.6622179726993337, - "y2": 0.36338299837979404 + "x1": 0.5611558263442096, + "y1": 0.35203867132013494, + "x2": 0.6635087675206801, + "y2": 0.3652204895019531 }, "properties": {} }, @@ -1154,10 +1252,10 @@ ], "is_header": false, "bbox": { - "x1": 0.713394443287569, - "y1": 0.3524739074707031, - "x2": 0.7751591491699219, - "y2": 0.36338299837979404 + "x1": 0.6746852381089155, + "y1": 0.35203867132013494, + "x2": 0.7758617086971508, + "y2": 0.3652204895019531 }, "properties": {} }, @@ -1171,15 +1269,15 @@ ], "is_header": false, "bbox": { - "x1": 0.8239826785816866, - "y1": 0.3524739074707031, - "x2": 0.8857473844640396, - "y2": 0.36338299837979404 + "x1": 0.7840970028147978, + "y1": 0.35203867132013494, + "x2": 0.8876264145795036, + "y2": 0.3652204895019531 }, "properties": {} }, { - "content": "Acquisitions net of cash acquired", + "content": "Acquisitions, net of cash acquired.", "rows": [ 20 ], @@ -1188,10 +1286,10 @@ ], "is_header": false, "bbox": { - "x1": 0.0904532668169807, - "y1": 0.3624739074707031, - "x2": 0.5186885609346278, - "y2": 0.3756557256525213 + "x1": 0.09115582634420956, + "y1": 0.3638568531383168, + "x2": 0.5158617086971508, + "y2": 0.37612958041104405 }, "properties": {} }, @@ -1205,10 +1303,10 @@ ], "is_header": false, "bbox": { - "x1": 0.5992767962287454, - "y1": 0.3624739074707031, - "x2": 0.6622179726993337, - "y2": 0.3756557256525213 + "x1": 0.5611558263442096, + "y1": 0.3638568531383168, + "x2": 0.6635087675206801, + "y2": 0.37612958041104405 }, 
"properties": {} }, @@ -1222,10 +1320,10 @@ ], "is_header": false, "bbox": { - "x1": 0.713394443287569, - "y1": 0.3624739074707031, - "x2": 0.7751591491699219, - "y2": 0.3756557256525213 + "x1": 0.6746852381089155, + "y1": 0.3638568531383168, + "x2": 0.7758617086971508, + "y2": 0.37612958041104405 }, "properties": {} }, @@ -1239,10 +1337,10 @@ ], "is_header": false, "bbox": { - "x1": 0.8239826785816866, - "y1": 0.3624739074707031, - "x2": 0.8857473844640396, - "y2": 0.3756557256525213 + "x1": 0.7840970028147978, + "y1": 0.3638568531383168, + "x2": 0.8876264145795036, + "y2": 0.37612958041104405 }, "properties": {} }, @@ -1256,10 +1354,10 @@ ], "is_header": false, "bbox": { - "x1": 0.0904532668169807, - "y1": 0.3747466347434304, - "x2": 0.5186885609346278, - "y2": 0.38701936201615766 + "x1": 0.09115582634420956, + "y1": 0.37476594404740765, + "x2": 0.5158617086971508, + "y2": 0.38794776222922583 }, "properties": {} }, @@ -1273,10 +1371,10 @@ ], "is_header": false, "bbox": { - "x1": 0.5992767962287454, - "y1": 0.3747466347434304, - "x2": 0.6622179726993337, - "y2": 0.38701936201615766 + "x1": 0.5611558263442096, + "y1": 0.37476594404740765, + "x2": 0.6635087675206801, + "y2": 0.38794776222922583 }, "properties": {} }, @@ -1290,10 +1388,10 @@ ], "is_header": false, "bbox": { - "x1": 0.713394443287569, - "y1": 0.3747466347434304, - "x2": 0.7751591491699219, - "y2": 0.38701936201615766 + "x1": 0.6746852381089155, + "y1": 0.37476594404740765, + "x2": 0.7758617086971508, + "y2": 0.38794776222922583 }, "properties": {} }, @@ -1307,10 +1405,10 @@ ], "is_header": false, "bbox": { - "x1": 0.8239826785816866, - "y1": 0.3747466347434304, - "x2": 0.8857473844640396, - "y2": 0.38701936201615766 + "x1": 0.7840970028147978, + "y1": 0.37476594404740765, + "x2": 0.8876264145795036, + "y2": 0.38794776222922583 }, "properties": {} }, @@ -1324,10 +1422,10 @@ ], "is_header": false, "bbox": { - "x1": 0.0904532668169807, - "y1": 0.38611027110706675, - "x2": 0.5186885609346278, - "y2": 
0.3979284529252486 + "x1": 0.09115582634420956, + "y1": 0.38612958041104406, + "x2": 0.5158617086971508, + "y2": 0.3997659440474077 }, "properties": {} }, @@ -1341,10 +1439,10 @@ ], "is_header": false, "bbox": { - "x1": 0.5992767962287454, - "y1": 0.38611027110706675, - "x2": 0.6622179726993337, - "y2": 0.3979284529252486 + "x1": 0.5611558263442096, + "y1": 0.38612958041104406, + "x2": 0.6635087675206801, + "y2": 0.3997659440474077 }, "properties": {} }, @@ -1358,10 +1456,10 @@ ], "is_header": false, "bbox": { - "x1": 0.713394443287569, - "y1": 0.38611027110706675, - "x2": 0.7751591491699219, - "y2": 0.3979284529252486 + "x1": 0.6746852381089155, + "y1": 0.38612958041104406, + "x2": 0.7758617086971508, + "y2": 0.3997659440474077 }, "properties": {} }, @@ -1375,10 +1473,10 @@ ], "is_header": false, "bbox": { - "x1": 0.8239826785816866, - "y1": 0.38611027110706675, - "x2": 0.8857473844640396, - "y2": 0.3979284529252486 + "x1": 0.7840970028147978, + "y1": 0.38612958041104406, + "x2": 0.8876264145795036, + "y2": 0.3997659440474077 }, "properties": {} }, @@ -1392,10 +1490,10 @@ ], "is_header": false, "bbox": { - "x1": 0.0904532668169807, - "y1": 0.3974739074707031, - "x2": 0.5186885609346278, - "y2": 0.41429208928888495 + "x1": 0.09115582634420956, + "y1": 0.39840230768377133, + "x2": 0.5158617086971508, + "y2": 0.4147659440474077 }, "properties": {} }, @@ -1409,10 +1507,10 @@ ], "is_header": false, "bbox": { - "x1": 0.5992767962287454, - "y1": 0.3974739074707031, - "x2": 0.6622179726993337, - "y2": 0.41429208928888495 + "x1": 0.5611558263442096, + "y1": 0.39840230768377133, + "x2": 0.6635087675206801, + "y2": 0.4147659440474077 }, "properties": {} }, @@ -1426,10 +1524,10 @@ ], "is_header": false, "bbox": { - "x1": 0.713394443287569, - "y1": 0.3974739074707031, - "x2": 0.7751591491699219, - "y2": 0.41429208928888495 + "x1": 0.6746852381089155, + "y1": 0.39840230768377133, + "x2": 0.7758617086971508, + "y2": 0.4147659440474077 }, "properties": {} }, @@ -1443,15 +1541,15 
@@ ], "is_header": false, "bbox": { - "x1": 0.8239826785816866, - "y1": 0.3974739074707031, - "x2": 0.8857473844640396, - "y2": 0.41429208928888495 + "x1": 0.7840970028147978, + "y1": 0.39840230768377133, + "x2": 0.8876264145795036, + "y2": 0.4147659440474077 }, "properties": {} }, { - "content": "Other net", + "content": "Other-net", "rows": [ 24 ], @@ -1460,15 +1558,15 @@ ], "is_header": false, "bbox": { - "x1": 0.0904532668169807, - "y1": 0.41338299837979403, - "x2": 0.5186885609346278, - "y2": 0.42429208928888495 + "x1": 0.09115582634420956, + "y1": 0.41385685313831677, + "x2": 0.5158617086971508, + "y2": 0.42612958041104404 }, "properties": {} }, { - "content": "", + "content": "9", "rows": [ 24 ], @@ -1477,15 +1575,15 @@ ], "is_header": false, "bbox": { - "x1": 0.5992767962287454, - "y1": 0.41338299837979403, - "x2": 0.6622179726993337, - "y2": 0.42429208928888495 + "x1": 0.5611558263442096, + "y1": 0.41385685313831677, + "x2": 0.6635087675206801, + "y2": 0.42612958041104404 }, "properties": {} }, { - "content": "", + "content": "(6)", "rows": [ 24 ], @@ -1494,15 +1592,15 @@ ], "is_header": false, "bbox": { - "x1": 0.713394443287569, - "y1": 0.41338299837979403, - "x2": 0.7751591491699219, - "y2": 0.42429208928888495 + "x1": 0.6746852381089155, + "y1": 0.41385685313831677, + "x2": 0.7758617086971508, + "y2": 0.42612958041104404 }, "properties": {} }, { - "content": "", + "content": "(4)", "rows": [ 24 ], @@ -1511,10 +1609,10 @@ ], "is_header": false, "bbox": { - "x1": 0.8239826785816866, - "y1": 0.41338299837979403, - "x2": 0.8857473844640396, - "y2": 0.42429208928888495 + "x1": 0.7840970028147978, + "y1": 0.41385685313831677, + "x2": 0.8876264145795036, + "y2": 0.42612958041104404 }, "properties": {} }, @@ -1528,10 +1626,10 @@ ], "is_header": false, "bbox": { - "x1": 0.0904532668169807, - "y1": 0.42429208928888495, - "x2": 0.5186885609346278, - "y2": 0.43792845292524857 + "x1": 0.09115582634420956, + "y1": 0.4252204895019531, + "x2": 0.5158617086971508, + 
"y2": 0.4384023076837713 }, "properties": {} }, @@ -1545,15 +1643,15 @@ ], "is_header": false, "bbox": { - "x1": 0.5992767962287454, - "y1": 0.42429208928888495, - "x2": 0.6622179726993337, - "y2": 0.43792845292524857 + "x1": 0.5611558263442096, + "y1": 0.4252204895019531, + "x2": 0.6635087675206801, + "y2": 0.4384023076837713 }, "properties": {} }, { - "content": "(3,086", + "content": "(3,086)", "rows": [ 25 ], @@ -1562,15 +1660,15 @@ ], "is_header": false, "bbox": { - "x1": 0.713394443287569, - "y1": 0.42429208928888495, - "x2": 0.7751591491699219, - "y2": 0.43792845292524857 + "x1": 0.6746852381089155, + "y1": 0.4252204895019531, + "x2": 0.7758617086971508, + "y2": 0.4384023076837713 }, "properties": {} }, { - "content": "1.403)", + "content": "(1,403)", "rows": [ 25 ], @@ -1579,10 +1677,10 @@ ], "is_header": false, "bbox": { - "x1": 0.8239826785816866, - "y1": 0.42429208928888495, - "x2": 0.8857473844640396, - "y2": 0.43792845292524857 + "x1": 0.7840970028147978, + "y1": 0.4252204895019531, + "x2": 0.8876264145795036, + "y2": 0.4384023076837713 }, "properties": {} }, @@ -1599,15 +1697,15 @@ ], "is_header": false, "bbox": { - "x1": 0.0904532668169807, - "y1": 0.448382998379794, - "x2": 0.8857473844640396, - "y2": 0.4615648165616122 + "x1": 0.09115582634420956, + "y1": 0.44885685313831675, + "x2": 0.8876264145795036, + "y2": 0.4606750349564986 }, "properties": {} }, { - "content": "Change in short-term debt net", + "content": "Change in short-term debt - net", "rows": [ 27 ], @@ -1616,10 +1714,10 @@ ], "is_header": false, "bbox": { - "x1": 0.0904532668169807, - "y1": 0.4597466347434304, - "x2": 0.5186885609346278, - "y2": 0.4738375438343395 + "x1": 0.09115582634420956, + "y1": 0.45931139859286224, + "x2": 0.5158617086971508, + "y2": 0.47385685313831677 }, "properties": {} }, @@ -1633,10 +1731,10 @@ ], "is_header": false, "bbox": { - "x1": 0.5992767962287454, - "y1": 0.4597466347434304, - "x2": 0.6622179726993337, - "y2": 0.4738375438343395 + "x1": 
0.5611558263442096, + "y1": 0.45931139859286224, + "x2": 0.6635087675206801, + "y2": 0.47385685313831677 }, "properties": {} }, @@ -1650,15 +1748,15 @@ ], "is_header": false, "bbox": { - "x1": 0.713394443287569, - "y1": 0.4597466347434304, - "x2": 0.7751591491699219, - "y2": 0.4738375438343395 + "x1": 0.6746852381089155, + "y1": 0.45931139859286224, + "x2": 0.7758617086971508, + "y2": 0.47385685313831677 }, "properties": {} }, { - "content": "797", + "content": "(797)", "rows": [ 27 ], @@ -1667,10 +1765,10 @@ ], "is_header": false, "bbox": { - "x1": 0.8239826785816866, - "y1": 0.4597466347434304, - "x2": 0.8857473844640396, - "y2": 0.4738375438343395 + "x1": 0.7840970028147978, + "y1": 0.45931139859286224, + "x2": 0.8876264145795036, + "y2": 0.47385685313831677 }, "properties": {} }, @@ -1684,10 +1782,10 @@ ], "is_header": false, "bbox": { - "x1": 0.0904532668169807, - "y1": 0.4706557256525213, - "x2": 0.5186885609346278, - "y2": 0.4847466347434304 + "x1": 0.09115582634420956, + "y1": 0.4715841258655895, + "x2": 0.5158617086971508, + "y2": 0.4847659440474077 }, "properties": {} }, @@ -1701,10 +1799,10 @@ ], "is_header": false, "bbox": { - "x1": 0.5992767962287454, - "y1": 0.4706557256525213, - "x2": 0.6622179726993337, - "y2": 0.4847466347434304 + "x1": 0.5611558263442096, + "y1": 0.4715841258655895, + "x2": 0.6635087675206801, + "y2": 0.4847659440474077 }, "properties": {} }, @@ -1718,15 +1816,15 @@ ], "is_header": false, "bbox": { - "x1": 0.713394443287569, - "y1": 0.4706557256525213, - "x2": 0.7751591491699219, - "y2": 0.4847466347434304 + "x1": 0.6746852381089155, + "y1": 0.4715841258655895, + "x2": 0.7758617086971508, + "y2": 0.4847659440474077 }, "properties": {} }, { - "content": "992)", + "content": "(992)", "rows": [ 28 ], @@ -1735,15 +1833,15 @@ ], "is_header": false, "bbox": { - "x1": 0.8239826785816866, - "y1": 0.4706557256525213, - "x2": 0.8857473844640396, - "y2": 0.4847466347434304 + "x1": 0.7840970028147978, + "y1": 0.4715841258655895, + "x2": 
0.8876264145795036, + "y2": 0.4847659440474077 }, "properties": {} }, { - "content": "Proceeds debt (maturities greater than 90 from days)", + "content": "Proceeds from debt (maturities greater than 90 days)", "rows": [ 29 ], @@ -1752,10 +1850,10 @@ ], "is_header": false, "bbox": { - "x1": 0.0904532668169807, - "y1": 0.4821102543713353, - "x2": 0.5186885609346278, - "y2": 0.4964739242064346 + "x1": 0.09115582634420956, + "y1": 0.48294776222922586, + "x2": 0.5158617086971508, + "y2": 0.4965841258655895 }, "properties": {} }, @@ -1769,10 +1867,10 @@ ], "is_header": false, "bbox": { - "x1": 0.5992767962287454, - "y1": 0.4821102543713353, - "x2": 0.6622179726993337, - "y2": 0.4964739242064346 + "x1": 0.5611558263442096, + "y1": 0.48294776222922586, + "x2": 0.6635087675206801, + "y2": 0.4965841258655895 }, "properties": {} }, @@ -1786,10 +1884,10 @@ ], "is_header": false, "bbox": { - "x1": 0.713394443287569, - "y1": 0.4821102543713353, - "x2": 0.7751591491699219, - "y2": 0.4964739242064346 + "x1": 0.6746852381089155, + "y1": 0.48294776222922586, + "x2": 0.7758617086971508, + "y2": 0.4965841258655895 }, "properties": {} }, @@ -1803,10 +1901,10 @@ ], "is_header": false, "bbox": { - "x1": 0.8239826785816866, - "y1": 0.4821102543713353, - "x2": 0.8857473844640396, - "y2": 0.4964739242064346 + "x1": 0.7840970028147978, + "y1": 0.48294776222922586, + "x2": 0.8876264145795036, + "y2": 0.4965841258655895 }, "properties": {} }, @@ -1820,10 +1918,10 @@ ], "is_header": false, "bbox": { - "x1": 0.0904532668169807, - "y1": 0.49338299837979405, - "x2": 0.5186885609346278, - "y2": 0.5070193620161577 + "x1": 0.09115582634420956, + "y1": 0.49476594404740765, + "x2": 0.5158617086971508, + "y2": 0.5079477622292259 }, "properties": {} }, @@ -1837,15 +1935,15 @@ ], "is_header": false, "bbox": { - "x1": 0.5992767962287454, - "y1": 0.49338299837979405, - "x2": 0.6622179726993337, - "y2": 0.5070193620161577 + "x1": 0.5611558263442096, + "y1": 0.49476594404740765, + "x2": 0.6635087675206801, + 
"y2": 0.5079477622292259 }, "properties": {} }, { - "content": "(2,068", + "content": "(2,068)", "rows": [ 30 ], @@ -1854,10 +1952,10 @@ ], "is_header": false, "bbox": { - "x1": 0.713394443287569, - "y1": 0.49338299837979405, - "x2": 0.7751591491699219, - "y2": 0.5070193620161577 + "x1": 0.6746852381089155, + "y1": 0.49476594404740765, + "x2": 0.7758617086971508, + "y2": 0.5079477622292259 }, "properties": {} }, @@ -1871,10 +1969,10 @@ ], "is_header": false, "bbox": { - "x1": 0.8239826785816866, - "y1": 0.49338299837979405, - "x2": 0.8857473844640396, - "y2": 0.5070193620161577 + "x1": 0.7840970028147978, + "y1": 0.49476594404740765, + "x2": 0.8876264145795036, + "y2": 0.5079477622292259 }, "properties": {} }, @@ -1888,10 +1986,10 @@ ], "is_header": false, "bbox": { - "x1": 0.0904532668169807, - "y1": 0.5056557256525213, - "x2": 0.5186885609346278, - "y2": 0.5179284529252486 + "x1": 0.09115582634420956, + "y1": 0.506129580411044, + "x2": 0.5158617086971508, + "y2": 0.5179477622292259 }, "properties": {} }, @@ -1905,10 +2003,10 @@ ], "is_header": false, "bbox": { - "x1": 0.5992767962287454, - "y1": 0.5056557256525213, - "x2": 0.6622179726993337, - "y2": 0.5179284529252486 + "x1": 0.5611558263442096, + "y1": 0.506129580411044, + "x2": 0.6635087675206801, + "y2": 0.5179477622292259 }, "properties": {} }, @@ -1922,10 +2020,10 @@ ], "is_header": false, "bbox": { - "x1": 0.713394443287569, - "y1": 0.5056557256525213, - "x2": 0.7751591491699219, - "y2": 0.5179284529252486 + "x1": 0.6746852381089155, + "y1": 0.506129580411044, + "x2": 0.7758617086971508, + "y2": 0.5179477622292259 }, "properties": {} }, @@ -1939,10 +2037,10 @@ ], "is_header": false, "bbox": { - "x1": 0.8239826785816866, - "y1": 0.5056557256525213, - "x2": 0.8857473844640396, - "y2": 0.5179284529252486 + "x1": 0.7840970028147978, + "y1": 0.506129580411044, + "x2": 0.8876264145795036, + "y2": 0.5179477622292259 }, "properties": {} }, @@ -1956,10 +2054,10 @@ ], "is_header": false, "bbox": { - "x1": 
0.0904532668169807, - "y1": 0.5161102711070668, - "x2": 0.5186885609346278, - "y2": 0.5292920892888849 + "x1": 0.09115582634420956, + "y1": 0.5174932167746804, + "x2": 0.5158617086971508, + "y2": 0.5302204895019531 }, "properties": {} }, @@ -1973,15 +2071,15 @@ ], "is_header": false, "bbox": { - "x1": 0.5992767962287454, - "y1": 0.5161102711070668, - "x2": 0.6622179726993337, - "y2": 0.5292920892888849 + "x1": 0.5611558263442096, + "y1": 0.5174932167746804, + "x2": 0.6635087675206801, + "y2": 0.5302204895019531 }, "properties": {} }, { - "content": "(2,803_", + "content": "(2,803)", "rows": [ 32 ], @@ -1990,15 +2088,15 @@ ], "is_header": false, "bbox": { - "x1": 0.713394443287569, - "y1": 0.5161102711070668, - "x2": 0.7751591491699219, - "y2": 0.5292920892888849 + "x1": 0.6746852381089155, + "y1": 0.5174932167746804, + "x2": 0.7758617086971508, + "y2": 0.5302204895019531 }, "properties": {} }, { - "content": "(2,678", + "content": "(2,678)", "rows": [ 32 ], @@ -2007,15 +2105,15 @@ ], "is_header": false, "bbox": { - "x1": 0.8239826785816866, - "y1": 0.5161102711070668, - "x2": 0.8857473844640396, - "y2": 0.5292920892888849 + "x1": 0.7840970028147978, + "y1": 0.5174932167746804, + "x2": 0.8876264145795036, + "y2": 0.5302204895019531 }, "properties": {} }, { - "content": "Other net", + "content": "Other -net", "rows": [ 33 ], @@ -2024,10 +2122,10 @@ ], "is_header": false, "bbox": { - "x1": 0.0904532668169807, - "y1": 0.5288375438343395, - "x2": 0.5186885609346278, - "y2": 0.5397466347434304 + "x1": 0.09115582634420956, + "y1": 0.5293113985928622, + "x2": 0.5158617086971508, + "y2": 0.5411295804110441 }, "properties": {} }, @@ -2041,10 +2139,10 @@ ], "is_header": false, "bbox": { - "x1": 0.5992767962287454, - "y1": 0.5288375438343395, - "x2": 0.6622179726993337, - "y2": 0.5397466347434304 + "x1": 0.5611558263442096, + "y1": 0.5293113985928622, + "x2": 0.6635087675206801, + "y2": 0.5411295804110441 }, "properties": {} }, @@ -2058,15 +2156,15 @@ ], "is_header": false, 
"bbox": { - "x1": 0.713394443287569, - "y1": 0.5288375438343395, - "x2": 0.7751591491699219, - "y2": 0.5397466347434304 + "x1": 0.6746852381089155, + "y1": 0.5293113985928622, + "x2": 0.7758617086971508, + "y2": 0.5411295804110441 }, "properties": {} }, { - "content": "42)", + "content": "(42)", "rows": [ 33 ], @@ -2075,10 +2173,10 @@ ], "is_header": false, "bbox": { - "x1": 0.8239826785816866, - "y1": 0.5288375438343395, - "x2": 0.8857473844640396, - "y2": 0.5397466347434304 + "x1": 0.7840970028147978, + "y1": 0.5293113985928622, + "x2": 0.8876264145795036, + "y2": 0.5411295804110441 }, "properties": {} }, @@ -2092,15 +2190,15 @@ ], "is_header": false, "bbox": { - "x1": 0.0904532668169807, - "y1": 0.5397466347434304, - "x2": 0.5186885609346278, - "y2": 0.553382998379794 + "x1": 0.09115582634420956, + "y1": 0.5411295804110441, + "x2": 0.5158617086971508, + "y2": 0.5543113985928623 }, "properties": {} }, { - "content": "6,701)", + "content": "(6,701)", "rows": [ 34 ], @@ -2109,15 +2207,15 @@ ], "is_header": false, "bbox": { - "x1": 0.5992767962287454, - "y1": 0.5397466347434304, - "x2": 0.6622179726993337, - "y2": 0.553382998379794 + "x1": 0.5611558263442096, + "y1": 0.5411295804110441, + "x2": 0.6635087675206801, + "y2": 0.5543113985928623 }, "properties": {} }, { - "content": "(2655)", + "content": "(2,655)", "rows": [ 34 ], @@ -2126,15 +2224,15 @@ ], "is_header": false, "bbox": { - "x1": 0.713394443287569, - "y1": 0.5397466347434304, - "x2": 0.7751591491699219, - "y2": 0.553382998379794 + "x1": 0.6746852381089155, + "y1": 0.5411295804110441, + "x2": 0.7758617086971508, + "y2": 0.5543113985928623 }, "properties": {} }, { - "content": "4626)", + "content": "(4,626)", "rows": [ 34 ], @@ -2143,10 +2241,10 @@ ], "is_header": false, "bbox": { - "x1": 0.8239826785816866, - "y1": 0.5397466347434304, - "x2": 0.8857473844640396, - "y2": 0.553382998379794 + "x1": 0.7840970028147978, + "y1": 0.5411295804110441, + "x2": 0.8876264145795036, + "y2": 0.5543113985928623 }, 
"properties": {} }, @@ -2160,10 +2258,10 @@ ], "is_header": false, "bbox": { - "x1": 0.0904532668169807, - "y1": 0.5638375438343395, - "x2": 0.5186885609346278, - "y2": 0.5774739074707032 + "x1": 0.09115582634420956, + "y1": 0.5652204895019531, + "x2": 0.5158617086971508, + "y2": 0.5774932167746804 }, "properties": {} }, @@ -2177,10 +2275,10 @@ ], "is_header": false, "bbox": { - "x1": 0.5992767962287454, - "y1": 0.5638375438343395, - "x2": 0.6622179726993337, - "y2": 0.5774739074707032 + "x1": 0.5611558263442096, + "y1": 0.5652204895019531, + "x2": 0.6635087675206801, + "y2": 0.5774932167746804 }, "properties": {} }, @@ -2194,15 +2292,15 @@ ], "is_header": false, "bbox": { - "x1": 0.713394443287569, - "y1": 0.5638375438343395, - "x2": 0.7751591491699219, - "y2": 0.5774739074707032 + "x1": 0.6746852381089155, + "y1": 0.5652204895019531, + "x2": 0.7758617086971508, + "y2": 0.5774932167746804 }, "properties": {} }, { - "content": "33", + "content": "(33)", "rows": [ 35 ], @@ -2211,15 +2309,15 @@ ], "is_header": false, "bbox": { - "x1": 0.8239826785816866, - "y1": 0.5638375438343395, - "x2": 0.8857473844640396, - "y2": 0.5774739074707032 + "x1": 0.7840970028147978, + "y1": 0.5652204895019531, + "x2": 0.8876264145795036, + "y2": 0.5774932167746804 }, "properties": {} }, { - "content": "Net increase (decrease) in cash and cash equivalents", + "content": "Net increase (decrease) in cash and cash equivalents.", "rows": [ 36 ], @@ -2228,10 +2326,10 @@ ], "is_header": false, "bbox": { - "x1": 0.0904532668169807, - "y1": 0.5870193620161577, - "x2": 0.5186885609346278, - "y2": 0.5997466347434304 + "x1": 0.09115582634420956, + "y1": 0.5884023076837713, + "x2": 0.5158617086971508, + "y2": 0.6002204895019532 }, "properties": {} }, @@ -2245,10 +2343,10 @@ ], "is_header": false, "bbox": { - "x1": 0.5992767962287454, - "y1": 0.5870193620161577, - "x2": 0.6622179726993337, - "y2": 0.5997466347434304 + "x1": 0.5611558263442096, + "y1": 0.5884023076837713, + "x2": 0.6635087675206801, + 
"y2": 0.6002204895019532 }, "properties": {} }, @@ -2262,10 +2360,10 @@ ], "is_header": false, "bbox": { - "x1": 0.713394443287569, - "y1": 0.5870193620161577, - "x2": 0.7751591491699219, - "y2": 0.5997466347434304 + "x1": 0.6746852381089155, + "y1": 0.5884023076837713, + "x2": 0.7758617086971508, + "y2": 0.6002204895019532 }, "properties": {} }, @@ -2279,15 +2377,15 @@ ], "is_header": false, "bbox": { - "x1": 0.8239826785816866, - "y1": 0.5870193620161577, - "x2": 0.8857473844640396, - "y2": 0.5997466347434304 + "x1": 0.7840970028147978, + "y1": 0.5884023076837713, + "x2": 0.8876264145795036, + "y2": 0.6002204895019532 }, "properties": {} }, { - "content": "Cash and cash equivalents at beginning of year", + "content": "Cash and cash equivalents at beginning of year.", "rows": [ 37 ], @@ -2296,10 +2394,10 @@ ], "is_header": false, "bbox": { - "x1": 0.0904532668169807, - "y1": 0.5979284529252485, - "x2": 0.5186885609346278, - "y2": 0.6115648165616122 + "x1": 0.09115582634420956, + "y1": 0.5988568531383167, + "x2": 0.5158617086971508, + "y2": 0.611129580411044 }, "properties": {} }, @@ -2313,15 +2411,15 @@ ], "is_header": false, "bbox": { - "x1": 0.5992767962287454, - "y1": 0.5979284529252485, - "x2": 0.6622179726993337, - "y2": 0.6115648165616122 + "x1": 0.5611558263442096, + "y1": 0.5988568531383167, + "x2": 0.6635087675206801, + "y2": 0.611129580411044 }, "properties": {} }, { - "content": "2.398", + "content": "2,398", "rows": [ 37 ], @@ -2330,10 +2428,10 @@ ], "is_header": false, "bbox": { - "x1": 0.713394443287569, - "y1": 0.5979284529252485, - "x2": 0.7751591491699219, - "y2": 0.6115648165616122 + "x1": 0.6746852381089155, + "y1": 0.5988568531383167, + "x2": 0.7758617086971508, + "y2": 0.611129580411044 }, "properties": {} }, @@ -2347,10 +2445,10 @@ ], "is_header": false, "bbox": { - "x1": 0.8239826785816866, - "y1": 0.5979284529252485, - "x2": 0.8857473844640396, - "y2": 0.6115648165616122 + "x1": 0.7840970028147978, + "y1": 0.5988568531383167, + "x2": 
0.8876264145795036, + "y2": 0.611129580411044 }, "properties": {} }, @@ -2364,15 +2462,15 @@ ], "is_header": false, "bbox": { - "x1": 0.0904532668169807, - "y1": 0.6106557256525214, - "x2": 0.5186885609346278, - "y2": 0.623382998379794 + "x1": 0.09115582634420956, + "y1": 0.6115841258655895, + "x2": 0.5158617086971508, + "y2": 0.6243113985928622 }, "properties": {} }, { - "content": "2,853", + "content": "S 2,853", "rows": [ 38 ], @@ -2381,15 +2479,15 @@ ], "is_header": false, "bbox": { - "x1": 0.5992767962287454, - "y1": 0.6106557256525214, - "x2": 0.6622179726993337, - "y2": 0.623382998379794 + "x1": 0.5611558263442096, + "y1": 0.6115841258655895, + "x2": 0.6635087675206801, + "y2": 0.6243113985928622 }, "properties": {} }, { - "content": "3,053", + "content": "S 3,053", "rows": [ 38 ], @@ -2398,10 +2496,10 @@ ], "is_header": false, "bbox": { - "x1": 0.713394443287569, - "y1": 0.6106557256525214, - "x2": 0.7751591491699219, - "y2": 0.623382998379794 + "x1": 0.6746852381089155, + "y1": 0.6115841258655895, + "x2": 0.7758617086971508, + "y2": 0.6243113985928622 }, "properties": {} }, @@ -2415,46 +2513,52 @@ ], "is_header": false, "bbox": { - "x1": 0.8239826785816866, - "y1": 0.6106557256525214, - "x2": 0.8857473844640396, - "y2": 0.623382998379794 + "x1": 0.7840970028147978, + "y1": 0.6115841258655895, + "x2": 0.8876264145795036, + "y2": 0.6243113985928622 }, "properties": {} } ], "caption": null, "num_rows": 39, - "num_cols": 4 + "num_cols": 4, + "column_headers": [ + "(Millions)", + "2018", + "2017", + "2016" + ] } }, { "type": "Text", "bbox": [ - 0.09334101957433363, - 0.636026777787642, - 0.6066466567095589, - 0.6459264026988636 + 0.09298765294692096, + 0.6364184015447443, + 0.6064737477022059, + 0.6465563409978693 ], "properties": { - "score": 0.8346976041793823, + "score": 0.4158885180950165, "page_number": 1 }, - "text_representation": "The accompanying Notes to Consolidated Financial Statements are an integral part of this statement.\n" + 
"text_representation": "The accompanying Notes to Consolidated Financial Statements are an integral part of this statement" }, { "type": "Page-footer", "bbox": [ - 0.47960643095128674, - 0.6814282781427556, - 0.4929185216567096, - 0.6909636896306818 + 0.47945628446691174, + 0.6812337979403409, + 0.49324290556066175, + 0.6917808393998579 ], "properties": { - "score": 0.8874315023422241, + "score": 0.908870279788971, "page_number": 1 }, - "text_representation": "" + "text_representation": "60" } ] } diff --git a/lib/aryn-sdk/aryn_sdk/test/resources/json/SPsort_output_images_page1.json b/lib/aryn-sdk/aryn_sdk/test/resources/json/SPsort_output_images_page1.json index 67f377f2f..cf30d0ca0 100644 --- a/lib/aryn-sdk/aryn_sdk/test/resources/json/SPsort_output_images_page1.json +++ b/lib/aryn-sdk/aryn_sdk/test/resources/json/SPsort_output_images_page1.json @@ -4,39 +4,54 @@ "Until you get a line that matches ' ]\n', you can convert the partial", "output to a json document by appending '\"\"]}' to the partial output.", "", - "T+ 0.00: Server version 0.2024.06.28", - "T+ 0.00: Received request with aryn_call_id=6a1d30a9-1f0d-4349-8c00-af329d7fbcc6", + "T+ 0.00: Server version aryn-partitioner-0.20250113.175427 Model version 1.4", + "T+ 0.00: Received request with aryn_call_id=a08a78ea-ccc0-4a01-aa4d-dfe44d7af015", "T+ 0.00: Waiting for scheduling", "T+ 0.00: Preprocessing document", "T+ 0.01: Done preprocessing document", - "T+ 2.61: completed page 1", + "T+ 0.49: Completed work on page 1", "" ], + "status_code": 200, "elements": [ { "type": "Section-header", "bbox": [ - 0.25801759607651653, - 0.14799718683416194, - 0.7434876206341912, - 0.16699282559481535 + 0.25692743637982535, + 0.14604660034179687, + 0.7428384937959559, + 0.16785587657581677 ], "properties": { - "score": 0.4984451234340668, + "score": 0.5503925085067749, "page_number": 1 }, "text_representation": "SPsort: How to Sort a Terabyte Quickly\n" }, + { + "type": "Text", + "bbox": [ + 0.32150031594669115, + 
0.19034801136363635, + 0.6775626148897059, + 0.20603222933682527 + ], + "properties": { + "score": 0.4909828007221222, + "page_number": 1 + }, + "text_representation": "Jim Wyllie (wyllie@almaden.ibm.com)\n" + }, { "type": "Section-header", "bbox": [ - 0.43552116842830885, - 0.22252555153586648, - 0.5647090059168198, - 0.2349723399769176 + 0.43477129767922795, + 0.22159348921342328, + 0.5650739602481618, + 0.23526009299538353 ], "properties": { - "score": 0.6633284687995911, + "score": 0.6119033098220825, "page_number": 1 }, "text_representation": "February 4, 1999\n" @@ -44,13 +59,13 @@ { "type": "Section-header", "bbox": [ - 0.1457416399787454, - 0.26848521839488637, - 0.22136041977826287, - 0.28215559525923295 + 0.14695667042451746, + 0.2672817715731534, + 0.22006292006548714, + 0.2824556940252131 ], "properties": { - "score": 0.8485802412033081, + "score": 0.8749006986618042, "page_number": 1 }, "text_representation": "Abstract\n" @@ -58,13 +73,13 @@ { "type": "Text", "bbox": [ - 0.14681899126838235, - 0.30218597412109377, - 0.848895263671875, - 0.41659298983487214 + 0.14688757503733915, + 0.30218639026988636, + 0.84962890625, + 0.4172032304243608 ], "properties": { - "score": 0.9298610091209412, + "score": 0.9318665862083435, "page_number": 1 }, "text_representation": "In December 1998, a 488 node IBM RS/6000 SP* sorted a terabyte of data (10 billion 100 byte records) in\n17 minutes, 37 seconds. This is more than 2.5 times faster than the previous record for a problem of this\nmagnitude. The SPsort program itself was custom-designed for this benchmark, but the cluster, its\ninterconnection hardware, disk subsystem, operating system, file system, communication library, and job\nmanagement software are all IBM products. The system sustained an aggregate data rate of 2.8 GB/s from\nmore than 6 TB of disks managed by the GPFS global shared file system during the sort. 
Simultaneous\nwith these transfers, 1.9 GB/s of local disk I/O and 5.6 GB/s of interprocessor communication were also\nsustained.\n" @@ -72,13 +87,13 @@ { "type": "Section-header", "bbox": [ - 0.14619633394129136, - 0.45649486194957384, - 0.249862258013557, - 0.47087474476207386 + 0.14624366311465992, + 0.4559353915127841, + 0.25027528650620406, + 0.47113031560724433 ], "properties": { - "score": 0.8512004613876343, + "score": 0.8872892260551453, "page_number": 1 }, "text_representation": "Introduction\n" @@ -86,27 +101,27 @@ { "type": "Text", "bbox": [ - 0.14677959666532628, - 0.4913937655362216, - 0.8547881002987132, - 0.64927001953125 + 0.146956984576057, + 0.49155412153764205, + 0.8547768985523897, + 0.6496059903231534 ], "properties": { - "score": 0.9147736430168152, + "score": 0.9315835237503052, "page_number": 1 }, - "text_representation": "The speed of sorting has long been used as a measure of computer systems I/O and communication\nperformance. In 1985, an article in Datamation magazine proposed a sort of one million records of 100\nbytes each, with random 10 bytes keys, as a useful measure of computer systems I/O performance [1]. The\nground rules of that benchmark require that all input must start on disk, all output must end on disk, and\nthat the overhead to start the program and create the output files must be included in the benchmark time.\nInput and output must use operating system files, not raw disk partitions. The first published time for this\nbenchmark was an hour [12]. With constant improvements in computer hardware and sort algorithms, this\ntime diminished to just a few seconds [7]. At that point, variations on the basic theme evolved [6].\n“MinuteSort” [3, 8] measures how much can be sorted in one minute and “PennySort” [5] measures how\nmuch can be sorted for one cent, assuming a particular depreciation period. Recently, several groups\nreported sorting one terabyte of data [8, 9, 10]. 
SPsort improves substantially upon the best of these results.\n" + "text_representation": "The speed of sorting has long been used as a measure of computer systems I/O and communication\nperformance. In 1985, an article in Datamation magazine proposed a sort of one million records of 100\nbytes each, with random 10 bytes keys, as a useful measure of computer systems I/O performance [1]. The\nground rules of that benchmark require that all input must start on disk, all output must end on disk, and\nthat the overhead to start the program and create the output files must be included in the benchmark time.\nInput and output must use operating system files, not raw disk partitions. The first published time for this\nbenchmark was an hour [12]. With constant improvements in computer hardware and sort algorithms, this\ntime diminished to just a few seconds [7]. At that point, variations on the basic theme evolved [6].\n\u201cMinuteSort\u201d [3, 8] measures how much can be sorted in one minute and \u201cPennySort\u201d [5] measures how\nmuch can be sorted for one cent, assuming a particular depreciation period. Recently, several groups\nreported sorting one terabyte of data [8, 9, 10]. 
SPsort improves substantially upon the best of these results.\n" }, { "type": "Section-header", "bbox": [ - 0.14687787224264706, - 0.6754059392755681, - 0.23221774830537684, - 0.6892924915660511 + 0.14693428488338695, + 0.6747088068181818, + 0.23346440932329962, + 0.6892202481356534 ], "properties": { - "score": 0.8540988564491272, + "score": 0.8925763368606567, "page_number": 1 }, "text_representation": "Hardware\n" @@ -114,13 +129,13 @@ { "type": "Text", "bbox": [ - 0.14710727467256435, - 0.7094775945490057, - 0.8550252757352941, - 0.8823457475142046 + 0.14696044024299174, + 0.7098555686257102, + 0.8544048713235294, + 0.882755293412642 ], "properties": { - "score": 0.929555356502533, + "score": 0.9261640906333923, "page_number": 1 }, "text_representation": "The benchmark machine is a 488 node IBM RS/6000 SP, located in the IBM SP system test lab in\nPoughkeepsie, New York. Figure 1 shows the organization of this machine. Each node contains four\n332MHz PowerPC* 604e processors, 1.5 GB of RAM, at least one 32 bit 33 MHz PCI bus, and a 9 GB\nSCSI disk. The nodes communicate with one another through the high-speed SP switch with a bi-\ndirectional link bandwidth to each node of 150 megabytes/second. The switch adapter in each node is\nattached directly to the memory bus, so it does not have to share bandwidth with other devices on the PCI\nbus. Of the 488 nodes, 432 are compute nodes, while the remaining 56 are configured as storage nodes.\nGlobal storage consists of 1680 4.5 GB Serial Storage Architecture (SSA*) disk drives, organized into 336\ntwin-tailed 4+P RAID-5 arrays, for a total of just over 6 TB of user-accessible space attached to the storage\nnodes. Compute nodes are packaged 16 to a rack, while the storage nodes, which have 3 PCI busses and\nconsequently are larger, are packaged 8 to a rack. 
In total, the CPU and switch hardware occupies 34 racks,\nand the global disks require another 18 racks.\n" diff --git a/lib/aryn-sdk/aryn_sdk/test/resources/json/rotated.json b/lib/aryn-sdk/aryn_sdk/test/resources/json/rotated.json new file mode 100644 index 000000000..c901ad57a --- /dev/null +++ b/lib/aryn-sdk/aryn_sdk/test/resources/json/rotated.json @@ -0,0 +1,81 @@ +{ + "status": [ + "Incremental status will be shown here during execution.", + "Until you get a line that matches ' ]\n', you can convert the partial", + "output to a json document by appending '\"\"]}' to the partial output.", + "", + "T+ 0.00: Server version aryn-partitioner-0.20250122.214644 Model version 1.4", + "T+ 0.00: Received request with aryn_call_id=ef874bd3-052d-4889-93ec-5c0aabe52169", + "T+ 0.00: Waiting for scheduling", + "T+ 0.00: Preprocessing document", + "T+ 0.00: Done preprocessing document", + "T+ 0.24: Completed work on page 1", + "T+ 0.31: Completed work on page 2", + "" + ], + "status_code": 200, + "elements": [ + { + "type": "Image", + "bbox": [ + 0.509236894087358, + 0.4050663847081801, + 0.9102112926136363, + 0.5719264849494485 + ], + "properties": { + "score": 0.6749350428581238, + "image_size": null, + "image_mode": null, + "image_format": null, + "page_number": 1 + }, + "text_representation": "Rotation Test \n This is sideways! \n \n" + }, + { + "type": "Image", + "bbox": [ + 0.08446529612821692, + 0.6502498002485796, + 0.6053998161764705, + 0.7817662464488636 + ], + "properties": { + "score": 0.4510251581668854, + "image_size": null, + "image_mode": null, + "image_format": null, + "page_number": 2 + }, + "text_representation": "Rotation Test \n This is upside-Down! 
\n \n" + }, + { + "type": "Text", + "bbox": [ + 0.09905881096335019, + 0.6606139026988637, + 0.20984014174517462, + 0.6747579678622159 + ], + "properties": { + "score": 0.4541931748390198, + "page_number": 2 + }, + "text_representation": "Rotation Test \n" + }, + { + "type": "Text", + "bbox": [ + 0.09752255608053768, + 0.6977392578125, + 0.2741366397633272, + 0.7121780118075284 + ], + "properties": { + "score": 0.4705822765827179, + "page_number": 2 + }, + "text_representation": "This is upside-Down! \n" + } + ] +} diff --git a/lib/aryn-sdk/aryn_sdk/test/resources/pdfs/FR-2002-05-03-TRUNCATED-40.pdf b/lib/aryn-sdk/aryn_sdk/test/resources/pdfs/FR-2002-05-03-TRUNCATED-40.pdf new file mode 100644 index 000000000..e27a8df65 Binary files /dev/null and b/lib/aryn-sdk/aryn_sdk/test/resources/pdfs/FR-2002-05-03-TRUNCATED-40.pdf differ diff --git a/lib/aryn-sdk/aryn_sdk/test/resources/pdfs/rotated.pdf b/lib/aryn-sdk/aryn_sdk/test/resources/pdfs/rotated.pdf new file mode 100644 index 000000000..16a0cddd4 Binary files /dev/null and b/lib/aryn-sdk/aryn_sdk/test/resources/pdfs/rotated.pdf differ diff --git a/lib/aryn-sdk/aryn_sdk/test/test_partition.py b/lib/aryn-sdk/aryn_sdk/test/test_partition.py index 0671b7433..15fcc21f5 100644 --- a/lib/aryn-sdk/aryn_sdk/test/test_partition.py +++ b/lib/aryn-sdk/aryn_sdk/test/test_partition.py @@ -1,12 +1,24 @@ -from aryn_sdk.partition.partition import convert_image_element, tables_to_pandas +from os import PathLike +from typing import BinaryIO, Union +from aryn_sdk.partition.partition import convert_image_element, tables_to_pandas, ARYN_DOCPARSE_URL import pytest import json +import time from pathlib import Path +import logging -from aryn_sdk.partition import partition_file +from aryn_sdk.partition import ( + partition_file, + partition_file_async_submit, + partition_file_async_result, + partition_file_async_cancel, + partition_file_async_list, + PartitionError, +) from requests.exceptions import HTTPError RESOURCE_DIR = 
Path(__file__).parent / "resources" +ASYNC_TIMEOUT = 60 * 5 # 5 minutes in seconds # Unit tests @@ -71,7 +83,7 @@ def test_partition(pdf, kwargs, response, mocker): with open(pdf, "rb") as f: if kwargs.get("selected_pages") == [0]: - with pytest.raises(ValueError) as einfo: + with pytest.raises(PartitionError) as einfo: new_response = partition_file(f, **kwargs) assert "Invalid page number (0)" in str(einfo.value) else: @@ -113,13 +125,17 @@ def test_partition_it(pdf, kwargs, response): assert response_data["elements"] == new_response["elements"] -def test_partition_it_zero_page(): +def test_partition_with_unsupported_file_format(): + with pytest.raises(PartitionError): + with open(RESOURCE_DIR / "image" / "unsupported-format-test-document-image.heic", "rb") as f: + partition_file(f) + - with pytest.raises(ValueError) as einfo: +def test_partition_it_zero_page(): + with pytest.raises(PartitionError) as einfo: with open(RESOURCE_DIR / "pdfs" / "SPsort.pdf", "rb") as f: partition_file(f, selected_pages=[0]) - - assert "Invalid page number (0)" in str(einfo.value) + assert "selected_pages must not have empty or zero terms" in str(einfo.value) def test_partition_it_no_api_key(): @@ -130,6 +146,14 @@ def test_partition_it_no_api_key(): assert einfo.value.response.json().get("detail") == "Not authenticated" +def test_partition_file_auto_rotation(): + expected = json.loads(open(RESOURCE_DIR / "json" / "rotated.json", "r").read()) + actual = partition_file( + RESOURCE_DIR / "pdfs" / "rotated.pdf", output_label_options={"orientation_correction": True} + ) + assert actual["elements"] == expected["elements"] + + def test_data_to_pandas(): with open(RESOURCE_DIR / "json" / "3m_output_ocr_table.json", "r") as f: data = json.load(f) @@ -155,3 +179,162 @@ def test_convert_img(): with open(RESOURCE_DIR / "image" / "pngb64str.txt", "r") as f: real_str = f.read().strip() assert png_str == real_str + + +def test_invalid_job_id(): + response = 
partition_file_async_result("INVALID_JOB_ID") + assert response["status"] == "no_such_job" + + +def test_partition_file_async_submit(mocker): + data = b'{"job_id": "1234"}' + expected_response = json.loads(data.decode()) + + mocked_response = mocker.Mock() + mocked_response.status_code = 202 + mocked_response.iter_content.return_value = (data,) + + mocker.patch("requests.post").return_value = mocked_response + + with open(RESOURCE_DIR / "pdfs" / "3m_table.pdf", "rb") as f: + response = partition_file_async_submit(f) + + assert response == expected_response + + +def test_partiton_file_async_url_forwarding(mocker): + def call_partition_file(base_url: str): + partition_file_async_submit("", docparse_url=base_url) + partition_file_async_submit("", aps_url=base_url) + partition_file_async_submit("", aps_url="https://example.com/v1/document/partition", docparse_url=base_url) + partition_file_async_submit("", aps_url=base_url, docparse_url=base_url) + + standard_async_url = ARYN_DOCPARSE_URL.replace("/v1/", "/v1/async/submit/") + + def check_standard_url( + file: Union[BinaryIO, str, PathLike], + **kwargs, + ) -> None: + url = kwargs.get("docparse_url") or kwargs.get("aps_url") + assert url == standard_async_url + + mocker.patch("aryn_sdk.partition.partition._partition_file_inner", side_effect=check_standard_url) + partition_file_async_submit("") + call_partition_file(ARYN_DOCPARSE_URL) + call_partition_file(standard_async_url) + + nonstandard_url_example = "http://localhost:8000/v1/document/partition" + nonstandard_async_url_example = nonstandard_url_example.replace("/v1/", "/v1/async/submit/") + + def check_nonstandard_url( + file: Union[BinaryIO, str, PathLike], + **kwargs, + ) -> None: + url = kwargs.get("docparse_url") or kwargs.get("aps_url") + assert url == nonstandard_async_url_example + + mocker.patch("aryn_sdk.partition.partition._partition_file_inner", side_effect=check_nonstandard_url) + call_partition_file(nonstandard_url_example) + 
call_partition_file(nonstandard_async_url_example) + + +def test_partition_file_async(): + with open(RESOURCE_DIR / "pdfs" / "3m_table.pdf", "rb") as f: + job_id = partition_file_async_submit(f)["job_id"] + + start = time.time() + while True: + actual_result = partition_file_async_result(job_id) + if actual_result["status"] != "pending" or time.time() - start >= ASYNC_TIMEOUT: + break + time.sleep(1) + assert actual_result["status"] == "done" + + with open(RESOURCE_DIR / "json" / "3m_output.json", "rb") as f: + expected_result = json.load(f) + + assert expected_result["elements"] == actual_result["result"]["elements"] + + +def test_partition_file_async_with_unsupported_file_format(): + with open(RESOURCE_DIR / "image" / "unsupported-format-test-document-image.heic", "rb") as f: + job_id = partition_file_async_submit(f)["job_id"] + + start = time.time() + while True: + actual_result = partition_file_async_result(job_id) + if actual_result["status"] != "pending" or time.time() - start >= ASYNC_TIMEOUT: + break + time.sleep(1) + assert actual_result["status"] == "done" + assert actual_result["result"] is not None + assert actual_result["result"]["status_code"] == 500 + assert actual_result["result"]["error"] == "500: Failed to convert file to pdf" + + +def test_multiple_partition_file_async(): + num_jobs = 4 + job_ids = [] + + before = partition_file_async_list() + logging.info(f"List before:\n{json.dumps(before, indent=4)}") + assert len(before["jobs"]) == 0 + + for i in range(num_jobs): + logging.info(f"Submitting job {i + 1}/{num_jobs}") + job_id = partition_file_async_submit(RESOURCE_DIR / "pdfs" / "FR-2002-05-03-TRUNCATED-40.pdf")["job_id"] + logging.info(f"\tJob ID: {job_id}") + job_ids.append(job_id) + + after = partition_file_async_list() + logging.info(f"List after:\n{json.dumps(after, indent=4)}") + assert len(after["jobs"]) == num_jobs + + for i, job_id in enumerate(job_ids): + logging.info(f"Polling job ({job_id}) {i + 1}/{num_jobs}") + start = time.time() 
+ while True: + actual_result = partition_file_async_result(job_id) + if actual_result["status"] != "pending" or time.time() - start >= ASYNC_TIMEOUT: + break + time.sleep(1) + logging.info(f"\tContinuing to Poll Job {job_id} ({i + 1}/{num_jobs})") + assert actual_result["status"] == "done" + assert len(actual_result["result"]["elements"]) > 1000 + + +def test_partition_file_async_cancel(): + with open(RESOURCE_DIR / "pdfs" / "FR-2002-05-03-TRUNCATED-40.pdf", "rb") as f: + job_id = partition_file_async_submit(f)["job_id"] + + before_cancel_result = partition_file_async_result(job_id) + assert before_cancel_result["status"] == "pending" + assert partition_file_async_cancel(job_id) + + # Cancellation is not reflected in the result immediately + for _ in range(10): + time.sleep(0.1) + after_cancel_result = partition_file_async_result(job_id) + if after_cancel_result["status"] != "pending": + break + assert after_cancel_result["status"] == "pending" + assert after_cancel_result["status"] == "no_such_job" + + +def test_smoke_webhook(mocker): + data = b'{"job_id": "1234"}' + + webhook_url = "TEST" + + mocked_response = mocker.Mock() + mocked_response.status_code = 202 + mocked_response.iter_content.return_value = data.split(sep=b"\n") + + def check_webhook(*args, headers, **kwargs): + assert "X-Aryn-Webhook" in headers + assert headers.get("X-Aryn-Webhook") == webhook_url + return mocked_response + + fake_post = mocker.patch("requests.post", side_effect=check_webhook) + partition_file_async_submit(RESOURCE_DIR / "pdfs" / "3m_table.pdf", webhook_url=webhook_url) + fake_post.assert_called()