merge main

aryn-ai · Feb 25, 2025 · cf9db7c · cf9db7c
2 parents 4e7b101 + 27285f0
commit cf9db7c
Show file tree

Hide file tree

Showing 6 changed files with 80 additions and 47 deletions.
diff --git a/apps/git/git-credential-from-env.py b/apps/git/git-credential-from-env.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env python3
+#
+# Credential helper to enable people to store fine grained access tokens in ssh environment
+# variables for use on a shared instance.
+# git config --global credential.helper ..../git-credential-from-env.py
+# git config --global credential.useHttpPath true
+#
+# To make a fine grained access token:
+# Github console upper right -> Settings -> Developer settings (lower left)
+#   -> Personal access tokens -> Fine-grained tokens -> Generate new token
+#
+# You will likely need to change the resource owner to your organization.
+# After selecting a repository, make sure to set the right repository permissions.
+# To push and pull you will need Contents = Read and write; and Metadata = Read-only
+#
+# Get the token to the remote machine, you can
+# 1. gpg encrypt it, mail it and decrypt it; or
+# 2. type it in manually. To verify correct typing, use a checksum.
+#    linux: sha256sum <file
+#    windows: certutil -hashfile file SHA256
+
+import sys
+import logging
+import os
+
+if len(sys.argv) != 2 or sys.argv[1] != "get":
+    exit(0)
+
+d = {}
+
+for line in sys.stdin:
+    if line == "\n":
+        break
+    p = line.rstrip().split("=")  # Fix the missing split
+    if len(p) != 2:
+        logging.error(f"{__file__}: unable to parse {line}")
+        continue
+    d[p[0]] = p[1]
+
+if d.get("host", "") == "github.com" and "aryn-ai" in d.get("path", ""):
+    assert "ARYN_GITHUB_USER" in os.environ
+    assert "ARYN_GITHUB_KEY" in os.environ
+    print("protocol=https")
+    print("host=github.com")
+    print(f"username={os.environ['ARYN_GITHUB_USER']}")
+    print(f"password={os.environ['ARYN_GITHUB_KEY']}")
+    logging.error(f"git-credentials-from-env helper: Aryn github.com was used for {d['path']}")
+    exit(0)
+
+if "CUSTOMER_USER" in os.environ and "CUSTOMER_KEY" in os.environ:
+    print("protocol=https")
+    print("host=github.com")
+    print(f"username={os.environ['CUSTOMER_USER']}")
+    print(f"password={os.environ['CUSTOMER_KEY']}")
+    logging.error(f"git-credentials-from-env helper: Customer user was used for {d['path']}")
+    exit(0)
+
+logging.error(f"WARNING from {__file__}: Unable to find CUSTOMER_USER and CUSTOMER_KEY in environ.")
+logging.error("WARNING since the helper was enabled, this is probably an error.")
+exit(0)
diff --git a/lib/aryn-sdk/aryn_sdk/partition/partition.py b/lib/aryn-sdk/aryn_sdk/partition/partition.py
@@ -50,7 +50,6 @@ def partition_file(
     aryn_config: Optional[ArynConfig] = None,
     threshold: Optional[Union[float, Literal["auto"]]] = None,
     use_ocr: bool = False,
-    ocr_images: bool = False,
     summarize_images: bool = False,
     ocr_language: Optional[str] = None,
     extract_table_structure: bool = False,
@@ -81,8 +80,6 @@ def partition_file(
             default: None (Aryn DocParse will choose)
         use_ocr: extract text using an OCR model instead of extracting embedded text in PDF.
             default: False
-        ocr_images: attempt to use OCR to generate a text representation of detected images.
-            default: False
         summarize_images: Generate a text summary of detected images using a VLM.
         ocr_language: specify the language to use for OCR. If not set, the language will be english.
             default: English
@@ -158,7 +155,6 @@ def partition_file(
         aryn_config=aryn_config,
         threshold=threshold,
         use_ocr=use_ocr,
-        ocr_images=ocr_images,
         summarize_images=summarize_images,
         ocr_language=ocr_language,
         extract_table_structure=extract_table_structure,
@@ -182,7 +178,6 @@ def _partition_file_wrapper(
     aryn_config: Optional[ArynConfig] = None,
     threshold: Optional[Union[float, Literal["auto"]]] = None,
     use_ocr: bool = False,
-    ocr_images: bool = False,
     summarize_images: bool = False,
     ocr_language: Optional[str] = None,
     extract_table_structure: bool = False,
@@ -212,7 +207,6 @@ def _partition_file_wrapper(
             aryn_config=aryn_config,
             threshold=threshold,
             use_ocr=use_ocr,
-            ocr_images=ocr_images,
             summarize_images=summarize_images,
             ocr_language=ocr_language,
             extract_table_structure=extract_table_structure,
@@ -240,7 +234,6 @@ def _partition_file_inner(
     aryn_config: Optional[ArynConfig] = None,
     threshold: Optional[Union[float, Literal["auto"]]] = None,
     use_ocr: bool = False,
-    ocr_images: bool = False,
     summarize_images: bool = False,
     ocr_language: Optional[str] = None,
     extract_table_structure: bool = False,
@@ -274,7 +267,6 @@ def _partition_file_inner(
     options_str = _json_options(
         threshold=threshold,
         use_ocr=use_ocr,
-        ocr_images=ocr_images,
         summarize_images=summarize_images,
         ocr_language=ocr_language,
         extract_table_structure=extract_table_structure,
@@ -382,7 +374,6 @@ def _should_stream() -> bool:
 def _json_options(
     threshold: Optional[Union[float, Literal["auto"]]] = None,
     use_ocr: bool = False,
-    ocr_images: bool = False,
     summarize_images: bool = False,
     ocr_language: Optional[str] = None,
     extract_table_structure: bool = False,
@@ -399,8 +390,6 @@ def _json_options(
         options["threshold"] = threshold
     if use_ocr:
         options["use_ocr"] = use_ocr
-    if ocr_images:
-        options["ocr_images"] = ocr_images
     if summarize_images:
         options["summarize_images"] = summarize_images
     if ocr_language:
@@ -432,7 +421,6 @@ def partition_file_async_submit(
     aryn_config: Optional[ArynConfig] = None,
     threshold: Optional[Union[float, Literal["auto"]]] = None,
     use_ocr: bool = False,
-    ocr_images: bool = False,
     summarize_images: bool = False,
     ocr_language: Optional[str] = None,
     extract_table_structure: bool = False,
@@ -489,7 +477,6 @@ def partition_file_async_submit(
         aryn_config=aryn_config,
         threshold=threshold,
         use_ocr=use_ocr,
-        ocr_images=ocr_images,
         summarize_images=summarize_images,
         ocr_language=ocr_language,
         extract_table_structure=extract_table_structure,

diff --git a/lib/sycamore/sycamore/query/schema.py b/lib/sycamore/sycamore/query/schema.py
@@ -57,7 +57,11 @@ def get_schema(self) -> OpenSearchSchema:
         logger.debug(f"Getting schema for index {self._index}")
         # Fetch example values.
         query["index"] = self._index
-        query["query"] = {"query": {"match_all": {}}, "size": self.NUM_EXAMPLES}
+        query["query"] = {
+            "query": {"match_all": {}},
+            "size": self.NUM_EXAMPLES,
+            "sort": [{"_script": {"type": "number", "script": {"source": "Math.random()"}}}],
+        }
         random_sample = self._query_executor.query(query)["result"]["hits"]["hits"]
 
         result = OpenSearchSchema(

diff --git a/lib/sycamore/sycamore/tests/integration/test_image_utils.py b/lib/sycamore/sycamore/tests/integration/test_image_utils.py
@@ -2,6 +2,7 @@
 
 import pdf2image
 from PIL import Image
+from copy import deepcopy
 
 
 import sycamore
@@ -27,18 +28,23 @@ def image_boxes() -> list[BoundingBox]:
 
 
 @pytest.fixture(scope="module")
-def source_image() -> Image.Image:
+def source_image_module_scope() -> Image.Image:
     images = pdf2image.convert_from_path(path)
     return images[0].convert(mode="RGBA")
 
 
+@pytest.fixture(scope="function")
+def source_image(source_image_module_scope) -> Image.Image:
+    return deepcopy(source_image_module_scope)
+
+
 # Checks that the image contains blue pixels. This is of course an imperfect check, but
 # it at least will tell us if we drew some bounding boxes on the image. Won't work
 # if the image contains blue pixels to begin with. Image must have mode RGBA.
 def check_image(image: Image.Image, expected_color=(0, 0, 255, 255)) -> None:
     raw_colors = image.getcolors(64_000)
     assert raw_colors is not None, "Image has too many colors to count"
-    assert expected_color in set((color_tup[1] for color_tup in raw_colors))
+    assert expected_color in set((color_tup[1] for color_tup in raw_colors)), "Did not draw boxes"
 
 
 def test_draw_boxes_bbox(source_image, image_boxes) -> None:
@@ -86,11 +92,13 @@ def test_draw_boxes_dict(source_image, image_boxes) -> None:
 
 def test_invalid_list(source_image, image_boxes):
     boxes = [[b.coordinates] for b in image_boxes]
-    with pytest.raises(ValueError):
-        try_draw_boxes(source_image, boxes)
+    output: Image.Image = try_draw_boxes(source_image, boxes)
+    with pytest.raises(AssertionError, match=r".*Did not draw boxes.*"):
+        check_image(output)
 
 
 def test_invalid_dict(source_image, image_boxes):
     boxes = [{"bboxes": b.coordinates} for b in image_boxes]
-    with pytest.raises(ValueError):
-        try_draw_boxes(source_image, boxes)
+    output: Image.Image = try_draw_boxes(source_image, boxes)
+    with pytest.raises(AssertionError, match=r".*Did not draw boxes.*"):
+        check_image(output)
diff --git a/lib/sycamore/sycamore/transforms/detr_partitioner.py b/lib/sycamore/sycamore/transforms/detr_partitioner.py
@@ -140,7 +140,6 @@ def partition_pdf(
         file: BinaryIO,
         threshold: Union[float, Literal["auto"]] = DEFAULT_LOCAL_THRESHOLD,
         use_ocr=False,
-        ocr_images=False,
         ocr_model: Union[str, OcrModel] = "easyocr",
         per_element_ocr=True,
         extract_table_structure=False,
@@ -167,7 +166,6 @@ def partition_pdf(
                 aryn_partitioner_address=aryn_partitioner_address,
                 threshold=threshold,
                 use_ocr=use_ocr,
-                ocr_images=ocr_images,
                 extract_table_structure=extract_table_structure,
                 extract_images=extract_images,
                 pages_per_call=pages_per_call,
@@ -183,7 +181,6 @@ def partition_pdf(
                 file=file,
                 threshold=threshold,
                 use_ocr=use_ocr,
-                ocr_images=ocr_images,
                 ocr_model=ocr_model,
                 per_element_ocr=per_element_ocr,
                 extract_table_structure=extract_table_structure,
@@ -226,7 +223,6 @@ def _call_remote_partitioner(
         aryn_partitioner_address=DEFAULT_ARYN_PARTITIONER_ADDRESS,
         threshold: Union[float, Literal["auto"]] = "auto",
         use_ocr: bool = False,
-        ocr_images: bool = False,
         extract_table_structure: bool = False,
         extract_images: bool = False,
         selected_pages: list = [],
@@ -238,7 +234,6 @@ def _call_remote_partitioner(
         options = {
             "threshold": threshold,
             "use_ocr": use_ocr,
-            "ocr_images": ocr_images,
             "extract_table_structure": extract_table_structure,
             "extract_images": extract_images,
             "selected_pages": selected_pages,
@@ -341,7 +336,6 @@ def _partition_remote(
         aryn_partitioner_address=DEFAULT_ARYN_PARTITIONER_ADDRESS,
         threshold: Union[float, Literal["auto"]] = "auto",
         use_ocr: bool = False,
-        ocr_images: bool = False,
         extract_table_structure: bool = False,
         extract_images: bool = False,
         pages_per_call: int = -1,
@@ -364,7 +358,6 @@ def _partition_remote(
                     aryn_partitioner_address=aryn_partitioner_address,
                     threshold=threshold,
                     use_ocr=use_ocr,
-                    ocr_images=ocr_images,
                     extract_table_structure=extract_table_structure,
                     extract_images=extract_images,
                     selected_pages=[[low, min(high, page_count)]],
@@ -383,7 +376,6 @@ def _partition_pdf_batched(
         file: BinaryIO,
         threshold: float = DEFAULT_LOCAL_THRESHOLD,
         use_ocr: bool = False,
-        ocr_images: bool = False,
         ocr_model: Union[str, OcrModel] = "easyocr",
         per_element_ocr: bool = True,
         extract_table_structure: bool = False,
@@ -410,7 +402,6 @@ def _partition_pdf_batched(
                 file_hash.hexdigest(),
                 threshold,
                 use_ocr,
-                ocr_images,
                 ocr_model,
                 per_element_ocr,
                 extract_table_structure,
@@ -430,7 +421,6 @@ def _partition_pdf_batched_named(
         hash_key: str,
         threshold: float = DEFAULT_LOCAL_THRESHOLD,
         use_ocr: bool = False,
-        ocr_images: bool = False,
         ocr_model: Union[str, OcrModel] = "easyocr",
         per_element_ocr: bool = True,
         extract_table_structure=False,
@@ -472,7 +462,6 @@ def _partition_pdf_batched_named(
                 use_ocr=use_ocr,
                 text_extractor=text_extractor,
                 extractor_inputs=extractor_inputs,
-                ocr_images=ocr_images,
                 ocr_model=ocr_model,
                 per_element_ocr=per_element_ocr,
                 extract_table_structure=extract_table_structure,
@@ -504,7 +493,6 @@ def process_batch(
         text_extractor: TextExtractor,
         extractor_inputs: Any,
         use_ocr: bool,
-        ocr_images: bool,
         ocr_model: Union[str, OcrModel],
         per_element_ocr: bool,
         extract_table_structure: bool,
@@ -526,7 +514,6 @@ def process_batch(
                 extract_ocr(
                     batch,
                     deformable_layout,
-                    ocr_images=ocr_images,
                     ocr_model=ocr_model,
                 )
         else:
@@ -571,12 +558,11 @@ def _run_text_extractor_document(
         hash_key: str,
         use_cache: bool,
         use_ocr: bool,
-        ocr_images: bool,
         text_extractor_model: Union[str, OcrModel],
         text_extraction_options: dict[str, Any],
         images: Optional[list[Image.Image]] = None,
     ):
-        kwargs = {"ocr_images": ocr_images, "images": images}
+        kwargs = {"images": images}
         if isinstance(text_extractor_model, OcrModel):
             model: TextExtractor = text_extractor_model
         else:
@@ -592,7 +578,6 @@ def process_batch_inference(
         use_cache: bool,
         use_ocr: bool,
         ocr_model: Union[str, OcrModel],
-        ocr_images: bool,
         per_element_ocr: bool,
     ) -> Any:
         self._init_model()
@@ -606,7 +591,6 @@ def process_batch_inference(
             extract_ocr(
                 batch,
                 deformable_layout,
-                ocr_images=ocr_images,
                 ocr_model=ocr_model,
             )
         return deformable_layout
@@ -777,7 +761,6 @@ def _get_hash_key(self, image: Image.Image, threshold: float) -> str:
 def extract_ocr(
     images: list[Image.Image],
     elements: list[list[Element]],
-    ocr_images: bool = False,
     ocr_model: Union[str, OcrModel] = "easyocr",
     text_extraction_options: dict[str, Any] = {},
 ) -> list[list[Element]]:
@@ -796,8 +779,6 @@ def extract_ocr(
         for elem in page_elements:
             if elem.bbox is None:
                 continue
-            if elem.type == "Picture" and not ocr_images:
-                continue
             cropped_image = crop_to_bbox(image, elem.bbox, padding=0)
             if 0 in cropped_image.size:
                 elem.text_representation = ""