Skip to content

Commit

Permalink
ASDK: Prevent excessive memory consumption reading file. (#1174)
Browse files Browse the repository at this point in the history
* ASDK: Prevent excessive memory consumption reading file.

* lint
  • Loading branch information
alexaryn authored and austintlee committed Feb 14, 2025
1 parent 829fe56 commit 502bade
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 11 deletions.
61 changes: 55 additions & 6 deletions lib/aryn-sdk/aryn_sdk/partition/partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ def partition_file(
)
elements = data['elements']
"""
return _partition_file_inner(
return _partition_file_wrapper(
file=file,
aryn_api_key=aryn_api_key,
aryn_config=aryn_config,
Expand All @@ -169,7 +169,7 @@ def partition_file(
)


def _partition_file_inner(
def _partition_file_wrapper(
file: Union[BinaryIO, str, PathLike],
*,
aryn_api_key: Optional[str] = None,
Expand All @@ -193,9 +193,58 @@ def _partition_file_inner(
"""Do not call this function directly. Use partition_file or partition_file_async_submit instead."""

# If you hand me a path for the file, open it (without reading it all into memory) instead of trying to send the path
if isinstance(file, (str, PathLike)):
with open(file, "rb") as f:
file = io.BytesIO(f.read())
should_close = False
try:
if isinstance(file, (str, PathLike)):
file = open(file, "rb")
should_close = True
return _partition_file_inner(
file=file,
aryn_api_key=aryn_api_key,
aryn_config=aryn_config,
threshold=threshold,
use_ocr=use_ocr,
ocr_images=ocr_images,
ocr_language=ocr_language,
extract_table_structure=extract_table_structure,
table_extraction_options=table_extraction_options,
extract_images=extract_images,
selected_pages=selected_pages,
chunking_options=chunking_options,
aps_url=aps_url,
docparse_url=docparse_url,
ssl_verify=ssl_verify,
output_format=output_format,
output_label_options=output_label_options,
webhook_url=webhook_url,
)
finally:
if should_close and isinstance(file, BinaryIO):
file.close()


def _partition_file_inner(
file: BinaryIO,
*,
aryn_api_key: Optional[str] = None,
aryn_config: Optional[ArynConfig] = None,
threshold: Optional[Union[float, Literal["auto"]]] = None,
use_ocr: bool = False,
ocr_images: bool = False,
ocr_language: Optional[str] = None,
extract_table_structure: bool = False,
table_extraction_options: dict[str, Any] = {},
extract_images: bool = False,
selected_pages: Optional[list[Union[list[int], int]]] = None,
chunking_options: Optional[dict[str, Any]] = None,
aps_url: Optional[str] = None, # deprecated in favor of docparse_url
docparse_url: Optional[str] = None,
ssl_verify: bool = True,
output_format: Optional[str] = None,
output_label_options: dict[str, Any] = {},
webhook_url: Optional[str] = None,
):
"""Do not call this function directly. Use partition_file or partition_file_async_submit instead."""

aryn_config = _process_config(aryn_api_key, aryn_config)

Expand Down Expand Up @@ -412,7 +461,7 @@ def partition_file_async_submit(
if docparse_url:
docparse_url = _convert_sync_to_async_url(docparse_url, "/submit", truncate=False)

return _partition_file_inner(
return _partition_file_wrapper(
file=file,
aryn_api_key=aryn_api_key,
aryn_config=aryn_config,
Expand Down
12 changes: 7 additions & 5 deletions lib/aryn-sdk/aryn_sdk/test/test_partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,11 +206,13 @@ def test_partition_file_async_submit(mocker):


def test_partiton_file_async_url_forwarding(mocker):
dummy = open("/dev/null", "rb")

def call_partition_file(base_url: str):
partition_file_async_submit("", docparse_url=base_url)
partition_file_async_submit("", aps_url=base_url)
partition_file_async_submit("", aps_url="https://example.com/v1/document/partition", docparse_url=base_url)
partition_file_async_submit("", aps_url=base_url, docparse_url=base_url)
partition_file_async_submit(dummy, docparse_url=base_url)
partition_file_async_submit(dummy, aps_url=base_url)
partition_file_async_submit(dummy, aps_url="https://example.com/v1/document/partition", docparse_url=base_url)
partition_file_async_submit(dummy, aps_url=base_url, docparse_url=base_url)

standard_async_url = ARYN_DOCPARSE_URL.replace("/v1/", "/v1/async/submit/")

Expand All @@ -222,7 +224,7 @@ def check_standard_url(
assert url == standard_async_url

mocker.patch("aryn_sdk.partition.partition._partition_file_inner", side_effect=check_standard_url)
partition_file_async_submit("")
partition_file_async_submit(dummy)
call_partition_file(ARYN_DOCPARSE_URL)
call_partition_file(standard_async_url)

Expand Down

0 comments on commit 502bade

Please sign in to comment.