Skip to content

Commit

Permalink
merge main
Browse files Browse the repository at this point in the history
  • Loading branch information
karanataryn committed Feb 14, 2025
2 parents c4d1470 + 7f008bf commit f4b3af9
Show file tree
Hide file tree
Showing 18 changed files with 1,883 additions and 1,965 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/testing.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ permissions:

jobs:
sycamore-unit-tests:
runs-on: blacksmith-4vcpu-ubuntu-2204
runs-on: blacksmith-8vcpu-ubuntu-2204
strategy:
matrix:
python-version: ["3.9", "3.10", "3.11", "3.12"]
Expand Down
557 changes: 516 additions & 41 deletions apps/integration/poetry.lock

Large diffs are not rendered by default.

884 changes: 373 additions & 511 deletions apps/jupyter/poetry.lock

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions apps/jupyter/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ repository = "https://github.com/aryn-ai/sycamore.git"

[tool.poetry.dependencies]
python = ">=3.9,<3.13"
sycamore-ai = {extras = ["opensearch"], version = "^0.1.13"}
sycamore-ai = {extras = ["opensearch"], version = "^0.1.30"}

jupyterlab = "^4.0.11"
jupyter-lsp = "^2.2.2"
Expand All @@ -17,7 +17,7 @@ notebook = "^7.1.2"


[tool.poetry.group.dev.dependencies]
sycamore-ai = { path = "../../lib/sycamore", develop = true }
sycamore-ai = { path = "../../lib/sycamore", extras = ["opensearch"], develop = true }

[tool.poetry.group.sycamore_poetry_lock.dependencies]
sycamore-poetry-lock = { path = "../../lib/poetry-lock", develop = true }
894 changes: 384 additions & 510 deletions apps/remote-processor-service/poetry.lock

Large diffs are not rendered by default.

79 changes: 70 additions & 9 deletions lib/aryn-sdk/aryn_sdk/partition/partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
_logger.setLevel(logging.INFO)
_logger.addHandler(logging.StreamHandler(sys.stderr))

g_version = "0.1.13"
g_version = "0.1.14"
g_parameters = {"path_filter": "^/v1/document/partition$"}


Expand Down Expand Up @@ -62,6 +62,7 @@ def partition_file(
ssl_verify: bool = True,
output_format: Optional[str] = None,
output_label_options: dict[str, Any] = {},
trace_id: Optional[str] = None,
) -> dict:
"""
Sends file to Aryn DocParse and returns a dict of its document structure and text
Expand Down Expand Up @@ -127,6 +128,7 @@ def partition_file(
"orientation_correction": True
}
default: None (no element is promoted to "Title")
trace_id: for internal use
Returns:
Expand All @@ -148,7 +150,7 @@ def partition_file(
)
elements = data['elements']
"""
return _partition_file_inner(
return _partition_file_wrapper(
file=file,
aryn_api_key=aryn_api_key,
aryn_config=aryn_config,
Expand All @@ -166,10 +168,11 @@ def partition_file(
ssl_verify=ssl_verify,
output_format=output_format,
output_label_options=output_label_options,
trace_id=trace_id,
)


def _partition_file_inner(
def _partition_file_wrapper(
    file: Union[BinaryIO, str, PathLike],
    *,
    aryn_api_key: Optional[str] = None,
    # NOTE(review): the parameters from aryn_config through ssl_verify were
    # collapsed in the diff view; they are reproduced from the keyword list
    # forwarded below and from _partition_file_inner's signature -- confirm.
    aryn_config: Optional[ArynConfig] = None,
    threshold: Optional[Union[float, Literal["auto"]]] = None,
    use_ocr: bool = False,
    ocr_images: bool = False,
    ocr_language: Optional[str] = None,
    extract_table_structure: bool = False,
    table_extraction_options: dict[str, Any] = {},
    extract_images: bool = False,
    selected_pages: Optional[list[Union[list[int], int]]] = None,
    chunking_options: Optional[dict[str, Any]] = None,
    aps_url: Optional[str] = None,  # deprecated in favor of docparse_url
    docparse_url: Optional[str] = None,
    ssl_verify: bool = True,
    output_format: Optional[str] = None,
    output_label_options: dict[str, Any] = {},
    webhook_url: Optional[str] = None,
    trace_id: Optional[str] = None,
):
    """Do not call this function directly. Use partition_file or partition_file_async_submit instead.

    Resolves ``file`` to an open binary handle and forwards every option,
    unchanged, to ``_partition_file_inner``.

    Args:
        file: an already-open binary file object, or a path (``str`` or
            ``PathLike``). A path is opened here in ``"rb"`` mode and the
            resulting handle is closed before this function returns. A file
            object supplied by the caller is never closed here.
        All keyword arguments are passed through to ``_partition_file_inner``
        under the same names.

    Returns:
        Whatever ``_partition_file_inner`` returns.
    """

    # Track the handle we open ourselves in its own variable. An isinstance
    # check against typing.BinaryIO cannot be used to decide whether to close:
    # objects returned by open() are io.BufferedReader instances, which are not
    # instances of typing.BinaryIO, so such a check is always false and the
    # handle would leak.
    opened_here = None
    try:
        # If you hand me a path for the file, open it instead of trying to send the path
        if isinstance(file, (str, PathLike)):
            opened_here = open(file, "rb")
            file = opened_here
        return _partition_file_inner(
            file=file,
            aryn_api_key=aryn_api_key,
            aryn_config=aryn_config,
            threshold=threshold,
            use_ocr=use_ocr,
            ocr_images=ocr_images,
            ocr_language=ocr_language,
            extract_table_structure=extract_table_structure,
            table_extraction_options=table_extraction_options,
            extract_images=extract_images,
            selected_pages=selected_pages,
            chunking_options=chunking_options,
            aps_url=aps_url,
            docparse_url=docparse_url,
            ssl_verify=ssl_verify,
            output_format=output_format,
            output_label_options=output_label_options,
            trace_id=trace_id,
            webhook_url=webhook_url,
        )
    finally:
        # Only close what this function opened; caller-owned handles stay open.
        if opened_here is not None:
            opened_here.close()


def _partition_file_inner(
file: BinaryIO,
*,
aryn_api_key: Optional[str] = None,
aryn_config: Optional[ArynConfig] = None,
threshold: Optional[Union[float, Literal["auto"]]] = None,
use_ocr: bool = False,
ocr_images: bool = False,
ocr_language: Optional[str] = None,
extract_table_structure: bool = False,
table_extraction_options: dict[str, Any] = {},
extract_images: bool = False,
selected_pages: Optional[list[Union[list[int], int]]] = None,
chunking_options: Optional[dict[str, Any]] = None,
aps_url: Optional[str] = None, # deprecated in favor of docparse_url
docparse_url: Optional[str] = None,
ssl_verify: bool = True,
output_format: Optional[str] = None,
output_label_options: dict[str, Any] = {},
trace_id: Optional[str] = None,
webhook_url: Optional[str] = None,
):
"""Do not call this function directly. Use partition_file or partition_file_async_submit instead."""

aryn_config = _process_config(aryn_api_key, aryn_config)

Expand Down Expand Up @@ -227,7 +282,7 @@ def _partition_file_inner(
_logger.debug(f"{options_str}")

files: Mapping = {"options": options_str.encode("utf-8"), "pdf": file}
headers = _generate_headers(aryn_config.api_key(), webhook_url)
headers = _generate_headers(aryn_config.api_key(), webhook_url, trace_id)
resp = requests.post(docparse_url, files=files, headers=headers, stream=_should_stream(), verify=ssl_verify)

raise_error_on_non_2xx(resp)
Expand Down Expand Up @@ -293,10 +348,14 @@ def _process_config(aryn_api_key: Optional[str] = None, aryn_config: Optional[Ar
return aryn_config


def _generate_headers(aryn_api_key: str, webhook_url: Optional[str] = None) -> dict[str, str]:
def _generate_headers(
    aryn_api_key: str, webhook_url: Optional[str] = None, trace_id: Optional[str] = None
) -> dict[str, str]:
    """Build the HTTP request headers for a call made with this SDK.

    Args:
        aryn_api_key: API key placed in the ``Authorization`` header as a
            bearer token.
        webhook_url: when truthy, sent as the ``X-Aryn-Webhook`` header.
        trace_id: when truthy, sent as the ``X-Aryn-Trace-ID`` header.

    Returns:
        A dict holding ``Authorization`` and ``User-Agent`` (built from the
        module-level ``g_version``), plus whichever optional headers were
        supplied.
    """
    # Optional headers, in the order they are added when present.
    optional_pairs = (
        ("X-Aryn-Webhook", webhook_url),
        ("X-Aryn-Trace-ID", trace_id),
    )
    result = {
        "Authorization": f"Bearer {aryn_api_key}",
        "User-Agent": f"aryn-sdk/{g_version}",
    }
    # Falsy values (None or empty string) are left out entirely.
    result.update({name: value for name, value in optional_pairs if value})
    return result


Expand Down Expand Up @@ -375,6 +434,7 @@ def partition_file_async_submit(
ssl_verify: bool = True,
output_format: Optional[str] = None,
output_label_options: dict[str, Any] = {},
trace_id: Optional[str] = None,
webhook_url: Optional[str] = None,
async_submit_url: Optional[str] = None,
) -> dict[str, Any]:
Expand Down Expand Up @@ -412,7 +472,7 @@ def partition_file_async_submit(
if docparse_url:
docparse_url = _convert_sync_to_async_url(docparse_url, "/submit", truncate=False)

return _partition_file_inner(
return _partition_file_wrapper(
file=file,
aryn_api_key=aryn_api_key,
aryn_config=aryn_config,
Expand All @@ -430,6 +490,7 @@ def partition_file_async_submit(
ssl_verify=ssl_verify,
output_format=output_format,
output_label_options=output_label_options,
trace_id=trace_id,
webhook_url=webhook_url,
)

Expand Down
14 changes: 9 additions & 5 deletions lib/aryn-sdk/aryn_sdk/test/test_partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,11 +206,13 @@ def test_partition_file_async_submit(mocker):


def test_partiton_file_async_url_forwarding(mocker):
dummy = open("/dev/null", "rb")

def call_partition_file(base_url: str):
partition_file_async_submit("", docparse_url=base_url)
partition_file_async_submit("", aps_url=base_url)
partition_file_async_submit("", aps_url="https://example.com/v1/document/partition", docparse_url=base_url)
partition_file_async_submit("", aps_url=base_url, docparse_url=base_url)
partition_file_async_submit(dummy, docparse_url=base_url)
partition_file_async_submit(dummy, aps_url=base_url)
partition_file_async_submit(dummy, aps_url="https://example.com/v1/document/partition", docparse_url=base_url)
partition_file_async_submit(dummy, aps_url=base_url, docparse_url=base_url)

standard_async_url = ARYN_DOCPARSE_URL.replace("/v1/", "/v1/async/submit/")

Expand All @@ -222,7 +224,7 @@ def check_standard_url(
assert url == standard_async_url

mocker.patch("aryn_sdk.partition.partition._partition_file_inner", side_effect=check_standard_url)
partition_file_async_submit("")
partition_file_async_submit(dummy)
call_partition_file(ARYN_DOCPARSE_URL)
call_partition_file(standard_async_url)

Expand All @@ -240,6 +242,8 @@ def check_nonstandard_url(
call_partition_file(nonstandard_url_example)
call_partition_file(nonstandard_async_url_example)

dummy.close()


def test_partition_file_async_with_unsupported_file_format():
with open(RESOURCE_DIR / "image" / "unsupported-format-test-document-image.heic", "rb") as f:
Expand Down
2 changes: 1 addition & 1 deletion lib/aryn-sdk/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "aryn-sdk"
version = "0.1.13"
version = "0.1.14"
description = "The client library for Aryn services"
authors = ["aryn.ai <[email protected]>"]
license = "Apache 2.0"
Expand Down
Loading

0 comments on commit f4b3af9

Please sign in to comment.