From faafd115fa07ec240e7b8d12ebe27f15a3d41cf7 Mon Sep 17 00:00:00 2001
From: Eric Anderson <eric@aryn.ai>
Date: Thu, 20 Feb 2025 13:11:24 -0800
Subject: [PATCH 1/4] Helper script for getting git credentials from the
 environment (#1190)

Useful for passing in fine-grained personal access tokens via environment variables when SSHing
into a node so that they don't have to be stored in a shared environment.
---
 apps/git/git-credential-from-env.py | 60 +++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)
 create mode 100755 apps/git/git-credential-from-env.py

diff --git a/apps/git/git-credential-from-env.py b/apps/git/git-credential-from-env.py
new file mode 100755
index 000000000..748f2473f
--- /dev/null
+++ b/apps/git/git-credential-from-env.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env python3
+#
+# Credential helper that lets people keep fine-grained access tokens in ssh environment
+# variables for use on a shared instance. To enable it:
+# git config --global credential.helper ..../git-credential-from-env.py
+# git config --global credential.useHttpPath true
+#
+# To make a fine grained access token:
+# Github console upper right -> Settings -> Developer settings (lower left)
+#   -> Personal access tokens -> Fine-grained tokens -> Generate new token
+#
+# You will likely need to change the resource owner to your organization.
+# After selecting a repository, make sure to set the right repository permissions.
+# To push and pull you will need Contents = Read and write; and Metadata = Read-only
+#
+# To get the token to the remote machine, you can either:
+# 1. gpg encrypt it, mail it and decrypt it; or
+# 2. type it in manually. To verify correct typing, use a checksum.
+#    linux: sha256sum <file
+#    windows: certutil -hashfile file SHA256
+
+import sys
+import logging
+import os
+
+if len(sys.argv) != 2 or sys.argv[1] != "get":
+    sys.exit(0)  # only "get" is handled; any other operation (e.g. store/erase) is a no-op
+
+# Attributes of the credential request, read from stdin as key=value lines up to a blank line.
+d = {}
+for line in sys.stdin:
+    if line == "\n":
+        break
+    p = line.rstrip().split("=", 1)  # split on the first "=" only: values (e.g. a path) may contain "="
+    if len(p) != 2:
+        logging.error(f"{__file__}: unable to parse {line}")
+        continue
+    d[p[0]] = p[1]
+
+if d.get("host", "") == "github.com" and "aryn-ai" in d.get("path", ""):  # Aryn repositories
+    if "ARYN_GITHUB_USER" not in os.environ or "ARYN_GITHUB_KEY" not in os.environ:
+        sys.exit(f"{__file__}: ARYN_GITHUB_USER and ARYN_GITHUB_KEY must be set in the environment")
+    print("protocol=https")
+    print("host=github.com")
+    print(f"username={os.environ['ARYN_GITHUB_USER']}")
+    print(f"password={os.environ['ARYN_GITHUB_KEY']}")
+    logging.error(f"git-credentials-from-env helper: Aryn github.com was used for {d['path']}")
+    sys.exit(0)
+
+if "CUSTOMER_USER" in os.environ and "CUSTOMER_KEY" in os.environ:  # every other request
+    print("protocol=https")
+    print("host=github.com")
+    print(f"username={os.environ['CUSTOMER_USER']}")
+    print(f"password={os.environ['CUSTOMER_KEY']}")
+    logging.error(f"git-credentials-from-env helper: Customer user was used for {d.get('path', '<no path>')}")
+    sys.exit(0)
+
+logging.error(f"WARNING from {__file__}: Unable to find CUSTOMER_USER and CUSTOMER_KEY in environ.")
+logging.error("WARNING since the helper was enabled, this is probably an error.")
+sys.exit(0)

From 2d0f0aa089e2273f905e86c7f47017770b92c0c5 Mon Sep 17 00:00:00 2001
From: Henry Lindeman <hmlindeman@yahoo.com>
Date: Mon, 24 Feb 2025 12:40:04 -0800
Subject: [PATCH 2/4] fix image tests - we don't explode on bad bboxes these
 days (#1193)

Signed-off-by: Henry Lindeman <hmlindeman@yahoo.com>
---
 .../tests/integration/test_image_utils.py     | 20 +++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/lib/sycamore/sycamore/tests/integration/test_image_utils.py b/lib/sycamore/sycamore/tests/integration/test_image_utils.py
index 05c486027..9eb7ea716 100644
--- a/lib/sycamore/sycamore/tests/integration/test_image_utils.py
+++ b/lib/sycamore/sycamore/tests/integration/test_image_utils.py
@@ -2,6 +2,7 @@
 
 import pdf2image
 from PIL import Image
+from copy import deepcopy
 
 
 import sycamore
@@ -27,18 +28,23 @@ def image_boxes() -> list[BoundingBox]:
 
 
 @pytest.fixture(scope="module")
-def source_image() -> Image.Image:
+def source_image_module_scope() -> Image.Image:
     images = pdf2image.convert_from_path(path)
     return images[0].convert(mode="RGBA")
 
 
+@pytest.fixture(scope="function")
+def source_image(source_image_module_scope) -> Image.Image:
+    return deepcopy(source_image_module_scope)  # each test gets its own copy of the module-scoped image
+
+
 # Checks that the image contains blue pixels. This is of course an imperfect check, but
 # it at least will tell us if we drew some bounding boxes. on the image. Won't work
 # if the image contains blue pixels to begin with. Image must have mode RGBA.
 def check_image(image: Image.Image, expected_color=(0, 0, 255, 255)) -> None:
     raw_colors = image.getcolors(64_000)
     assert raw_colors is not None, "Image has too many colors to count"
-    assert expected_color in set((color_tup[1] for color_tup in raw_colors))
+    assert expected_color in set((color_tup[1] for color_tup in raw_colors)), "Did not draw boxes"
 
 
 def test_draw_boxes_bbox(source_image, image_boxes) -> None:
@@ -86,11 +92,13 @@ def test_draw_boxes_dict(source_image, image_boxes) -> None:
 
 def test_invalid_list(source_image, image_boxes):
     boxes = [[b.coordinates] for b in image_boxes]
-    with pytest.raises(ValueError):
-        try_draw_boxes(source_image, boxes)
+    output: Image.Image = try_draw_boxes(source_image, boxes)
+    with pytest.raises(AssertionError, match=r".*Did not draw boxes.*"):
+        check_image(output)
 
 
 def test_invalid_dict(source_image, image_boxes):
     boxes = [{"bboxes": b.coordinates} for b in image_boxes]
-    with pytest.raises(ValueError):
-        try_draw_boxes(source_image, boxes)
+    output: Image.Image = try_draw_boxes(source_image, boxes)
+    with pytest.raises(AssertionError, match=r".*Did not draw boxes.*"):
+        check_image(output)

From 2711b1001dd362fd7c6e7fceaa66f31c2f902c9d Mon Sep 17 00:00:00 2001
From: Dhruv Kaliraman <112497058+dhruvkaliraman7@users.noreply.github.com>
Date: Mon, 24 Feb 2025 18:21:33 -0500
Subject: [PATCH 3/4] Get random hits when filtering properties (#1195)

---
 lib/sycamore/sycamore/query/schema.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/lib/sycamore/sycamore/query/schema.py b/lib/sycamore/sycamore/query/schema.py
index 97aafd7ce..dd1405c58 100644
--- a/lib/sycamore/sycamore/query/schema.py
+++ b/lib/sycamore/sycamore/query/schema.py
@@ -57,7 +57,11 @@ def get_schema(self) -> OpenSearchSchema:
         logger.debug(f"Getting schema for index {self._index}")
         # Fetch example values.
         query["index"] = self._index
-        query["query"] = {"query": {"match_all": {}}, "size": self.NUM_EXAMPLES}
+        query["query"] = {
+            "query": {"match_all": {}},
+            "size": self.NUM_EXAMPLES,
+            "sort": [{"_script": {"type": "number", "script": {"source": "Math.random()"}}}],
+        }
         random_sample = self._query_executor.query(query)["result"]["hits"]["hits"]
 
         result = OpenSearchSchema(

From 27285f0522b410d0ecac977dd3b3ebe07ea9f6db Mon Sep 17 00:00:00 2001
From: Karan Sampath <176953591+karanataryn@users.noreply.github.com>
Date: Tue, 25 Feb 2025 10:01:06 -0800
Subject: [PATCH 4/4] remove ocr images (#1194)

---
 lib/aryn-sdk/aryn_sdk/partition/partition.py  | 13 ------------
 .../sycamore/transforms/detr_partitioner.py   | 21 +------------------
 lib/sycamore/sycamore/transforms/partition.py |  7 -------
 3 files changed, 1 insertion(+), 40 deletions(-)

diff --git a/lib/aryn-sdk/aryn_sdk/partition/partition.py b/lib/aryn-sdk/aryn_sdk/partition/partition.py
index bdd30ae45..73979f91e 100644
--- a/lib/aryn-sdk/aryn_sdk/partition/partition.py
+++ b/lib/aryn-sdk/aryn_sdk/partition/partition.py
@@ -50,7 +50,6 @@ def partition_file(
     aryn_config: Optional[ArynConfig] = None,
     threshold: Optional[Union[float, Literal["auto"]]] = None,
     use_ocr: bool = False,
-    ocr_images: bool = False,
     ocr_language: Optional[str] = None,
     extract_table_structure: bool = False,
     table_extraction_options: dict[str, Any] = {},
@@ -80,8 +79,6 @@ def partition_file(
             default: None (Aryn DocParse will choose)
         use_ocr: extract text using an OCR model instead of extracting embedded text in PDF.
             default: False
-        ocr_images: attempt to use OCR to generate a text representation of detected images.
-            default: False
         ocr_language: specify the language to use for OCR. If not set, the language will be english.
             default: English
         extract_table_structure: extract tables and their structural content.
@@ -156,7 +153,6 @@ def partition_file(
         aryn_config=aryn_config,
         threshold=threshold,
         use_ocr=use_ocr,
-        ocr_images=ocr_images,
         ocr_language=ocr_language,
         extract_table_structure=extract_table_structure,
         table_extraction_options=table_extraction_options,
@@ -179,7 +175,6 @@ def _partition_file_wrapper(
     aryn_config: Optional[ArynConfig] = None,
     threshold: Optional[Union[float, Literal["auto"]]] = None,
     use_ocr: bool = False,
-    ocr_images: bool = False,
     ocr_language: Optional[str] = None,
     extract_table_structure: bool = False,
     table_extraction_options: dict[str, Any] = {},
@@ -208,7 +203,6 @@ def _partition_file_wrapper(
             aryn_config=aryn_config,
             threshold=threshold,
             use_ocr=use_ocr,
-            ocr_images=ocr_images,
             ocr_language=ocr_language,
             extract_table_structure=extract_table_structure,
             table_extraction_options=table_extraction_options,
@@ -235,7 +229,6 @@ def _partition_file_inner(
     aryn_config: Optional[ArynConfig] = None,
     threshold: Optional[Union[float, Literal["auto"]]] = None,
     use_ocr: bool = False,
-    ocr_images: bool = False,
     ocr_language: Optional[str] = None,
     extract_table_structure: bool = False,
     table_extraction_options: dict[str, Any] = {},
@@ -268,7 +261,6 @@ def _partition_file_inner(
     options_str = _json_options(
         threshold=threshold,
         use_ocr=use_ocr,
-        ocr_images=ocr_images,
         ocr_language=ocr_language,
         extract_table_structure=extract_table_structure,
         table_extraction_options=table_extraction_options,
@@ -375,7 +367,6 @@ def _should_stream() -> bool:
 def _json_options(
     threshold: Optional[Union[float, Literal["auto"]]] = None,
     use_ocr: bool = False,
-    ocr_images: bool = False,
     ocr_language: Optional[str] = None,
     extract_table_structure: bool = False,
     table_extraction_options: dict[str, Any] = {},
@@ -391,8 +382,6 @@ def _json_options(
         options["threshold"] = threshold
     if use_ocr:
         options["use_ocr"] = use_ocr
-    if ocr_images:
-        options["ocr_images"] = ocr_images
     if ocr_language:
         options["ocr_language"] = ocr_language
     if extract_images:
@@ -422,7 +411,6 @@ def partition_file_async_submit(
     aryn_config: Optional[ArynConfig] = None,
     threshold: Optional[Union[float, Literal["auto"]]] = None,
     use_ocr: bool = False,
-    ocr_images: bool = False,
     ocr_language: Optional[str] = None,
     extract_table_structure: bool = False,
     table_extraction_options: dict[str, Any] = {},
@@ -478,7 +466,6 @@ def partition_file_async_submit(
         aryn_config=aryn_config,
         threshold=threshold,
         use_ocr=use_ocr,
-        ocr_images=ocr_images,
         ocr_language=ocr_language,
         extract_table_structure=extract_table_structure,
         table_extraction_options=table_extraction_options,
diff --git a/lib/sycamore/sycamore/transforms/detr_partitioner.py b/lib/sycamore/sycamore/transforms/detr_partitioner.py
index cd00c01c1..4e9489ee6 100644
--- a/lib/sycamore/sycamore/transforms/detr_partitioner.py
+++ b/lib/sycamore/sycamore/transforms/detr_partitioner.py
@@ -140,7 +140,6 @@ def partition_pdf(
         file: BinaryIO,
         threshold: Union[float, Literal["auto"]] = DEFAULT_LOCAL_THRESHOLD,
         use_ocr=False,
-        ocr_images=False,
         ocr_model: Union[str, OcrModel] = "easyocr",
         per_element_ocr=True,
         extract_table_structure=False,
@@ -167,7 +166,6 @@ def partition_pdf(
                 aryn_partitioner_address=aryn_partitioner_address,
                 threshold=threshold,
                 use_ocr=use_ocr,
-                ocr_images=ocr_images,
                 extract_table_structure=extract_table_structure,
                 extract_images=extract_images,
                 pages_per_call=pages_per_call,
@@ -183,7 +181,6 @@ def partition_pdf(
                 file=file,
                 threshold=threshold,
                 use_ocr=use_ocr,
-                ocr_images=ocr_images,
                 ocr_model=ocr_model,
                 per_element_ocr=per_element_ocr,
                 extract_table_structure=extract_table_structure,
@@ -226,7 +223,6 @@ def _call_remote_partitioner(
         aryn_partitioner_address=DEFAULT_ARYN_PARTITIONER_ADDRESS,
         threshold: Union[float, Literal["auto"]] = "auto",
         use_ocr: bool = False,
-        ocr_images: bool = False,
         extract_table_structure: bool = False,
         extract_images: bool = False,
         selected_pages: list = [],
@@ -238,7 +234,6 @@ def _call_remote_partitioner(
         options = {
             "threshold": threshold,
             "use_ocr": use_ocr,
-            "ocr_images": ocr_images,
             "extract_table_structure": extract_table_structure,
             "extract_images": extract_images,
             "selected_pages": selected_pages,
@@ -341,7 +336,6 @@ def _partition_remote(
         aryn_partitioner_address=DEFAULT_ARYN_PARTITIONER_ADDRESS,
         threshold: Union[float, Literal["auto"]] = "auto",
         use_ocr: bool = False,
-        ocr_images: bool = False,
         extract_table_structure: bool = False,
         extract_images: bool = False,
         pages_per_call: int = -1,
@@ -364,7 +358,6 @@ def _partition_remote(
                     aryn_partitioner_address=aryn_partitioner_address,
                     threshold=threshold,
                     use_ocr=use_ocr,
-                    ocr_images=ocr_images,
                     extract_table_structure=extract_table_structure,
                     extract_images=extract_images,
                     selected_pages=[[low, min(high, page_count)]],
@@ -383,7 +376,6 @@ def _partition_pdf_batched(
         file: BinaryIO,
         threshold: float = DEFAULT_LOCAL_THRESHOLD,
         use_ocr: bool = False,
-        ocr_images: bool = False,
         ocr_model: Union[str, OcrModel] = "easyocr",
         per_element_ocr: bool = True,
         extract_table_structure: bool = False,
@@ -410,7 +402,6 @@ def _partition_pdf_batched(
                 file_hash.hexdigest(),
                 threshold,
                 use_ocr,
-                ocr_images,
                 ocr_model,
                 per_element_ocr,
                 extract_table_structure,
@@ -430,7 +421,6 @@ def _partition_pdf_batched_named(
         hash_key: str,
         threshold: float = DEFAULT_LOCAL_THRESHOLD,
         use_ocr: bool = False,
-        ocr_images: bool = False,
         ocr_model: Union[str, OcrModel] = "easyocr",
         per_element_ocr: bool = True,
         extract_table_structure=False,
@@ -472,7 +462,6 @@ def _partition_pdf_batched_named(
                 use_ocr=use_ocr,
                 text_extractor=text_extractor,
                 extractor_inputs=extractor_inputs,
-                ocr_images=ocr_images,
                 ocr_model=ocr_model,
                 per_element_ocr=per_element_ocr,
                 extract_table_structure=extract_table_structure,
@@ -504,7 +493,6 @@ def process_batch(
         text_extractor: TextExtractor,
         extractor_inputs: Any,
         use_ocr: bool,
-        ocr_images: bool,
         ocr_model: Union[str, OcrModel],
         per_element_ocr: bool,
         extract_table_structure: bool,
@@ -526,7 +514,6 @@ def process_batch(
                 extract_ocr(
                     batch,
                     deformable_layout,
-                    ocr_images=ocr_images,
                     ocr_model=ocr_model,
                 )
         else:
@@ -571,12 +558,11 @@ def _run_text_extractor_document(
         hash_key: str,
         use_cache: bool,
         use_ocr: bool,
-        ocr_images: bool,
         text_extractor_model: Union[str, OcrModel],
         text_extraction_options: dict[str, Any],
         images: Optional[list[Image.Image]] = None,
     ):
-        kwargs = {"ocr_images": ocr_images, "images": images}
+        kwargs = {"images": images}
         if isinstance(text_extractor_model, OcrModel):
             model: TextExtractor = text_extractor_model
         else:
@@ -592,7 +578,6 @@ def process_batch_inference(
         use_cache: bool,
         use_ocr: bool,
         ocr_model: Union[str, OcrModel],
-        ocr_images: bool,
         per_element_ocr: bool,
     ) -> Any:
         self._init_model()
@@ -606,7 +591,6 @@ def process_batch_inference(
             extract_ocr(
                 batch,
                 deformable_layout,
-                ocr_images=ocr_images,
                 ocr_model=ocr_model,
             )
         return deformable_layout
@@ -777,7 +761,6 @@ def _get_hash_key(self, image: Image.Image, threshold: float) -> str:
 def extract_ocr(
     images: list[Image.Image],
     elements: list[list[Element]],
-    ocr_images: bool = False,
     ocr_model: Union[str, OcrModel] = "easyocr",
     text_extraction_options: dict[str, Any] = {},
 ) -> list[list[Element]]:
@@ -796,8 +779,6 @@ def extract_ocr(
         for elem in page_elements:
             if elem.bbox is None:
                 continue
-            if elem.type == "Picture" and not ocr_images:
-                continue
             cropped_image = crop_to_bbox(image, elem.bbox, padding=0)
             if 0 in cropped_image.size:
                 elem.text_representation = ""
diff --git a/lib/sycamore/sycamore/transforms/partition.py b/lib/sycamore/sycamore/transforms/partition.py
index c39a5c1b2..6b3dbb65c 100644
--- a/lib/sycamore/sycamore/transforms/partition.py
+++ b/lib/sycamore/sycamore/transforms/partition.py
@@ -379,8 +379,6 @@ class ArynPartitioner(Partitioner):
         use_ocr: Whether to use OCR to extract text from the PDF. If false, we will attempt to extract
              the text from the underlying PDF.
             default: False
-        ocr_images: If set with use_ocr, will attempt to OCR regions of the document identified as images.
-            default: False
         ocr_model: model to use for OCR. Choices are "easyocr", "paddle", "tesseract" and "legacy", which
             correspond to EasyOCR, PaddleOCR, and Tesseract respectively, with "legacy" being a combination of
             Tesseract for text and EasyOCR for tables. If you choose paddle make sure to install
@@ -446,7 +444,6 @@ def __init__(
         model_name_or_path=ARYN_DETR_MODEL,
         threshold: Optional[Union[float, Literal["auto"]]] = None,
         use_ocr: bool = False,
-        ocr_images: bool = False,
         ocr_model: str = "easyocr",
         per_element_ocr: bool = True,
         extract_table_structure: bool = False,
@@ -489,7 +486,6 @@ def __init__(
             self._threshold = threshold
 
         self._use_ocr = use_ocr
-        self._ocr_images = ocr_images
         self._ocr_model = ocr_model
         self._per_element_ocr = per_element_ocr
         self._extract_table_structure = extract_table_structure
@@ -519,7 +515,6 @@ def partition(self, document: Document) -> Document:
                 binary,
                 self._threshold,
                 use_ocr=self._use_ocr,
-                ocr_images=self._ocr_images,
                 per_element_ocr=self._per_element_ocr,
                 ocr_model=self._ocr_model,
                 extract_table_structure=self._extract_table_structure,
@@ -561,7 +556,6 @@ def __init__(
         model_name_or_path=ARYN_DETR_MODEL,
         threshold: float = 0.4,
         use_ocr=False,
-        ocr_images=False,
         ocr_tables=False,
         extract_table_structure=False,
         table_structure_extractor=None,
@@ -574,7 +568,6 @@ def __init__(
             model_name_or_path=model_name_or_path,
             threshold=threshold,
             use_ocr=use_ocr,
-            ocr_images=ocr_images,
             extract_table_structure=extract_table_structure,
             extract_images=extract_images,
             device=device,