Skip to content

Commit

Permalink
merge main
Browse files Browse the repository at this point in the history
  • Loading branch information
karanataryn committed Feb 25, 2025
2 parents 4e7b101 + 27285f0 commit cf9db7c
Show file tree
Hide file tree
Showing 6 changed files with 80 additions and 47 deletions.
60 changes: 60 additions & 0 deletions apps/git/git-credential-from-env.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#!/usr/bin/env python3
#
# Credential helper to enable people to store fine grained access tokens in ssh environment
# variables for use on a shared instance.
# git config --global credential.helper ..../git-credential-from-env.py
# git config --global credential.useHttpPath true
#
# To make a fine grained access token:
# Github console upper right -> Settings -> Developer settings (lower left)
# -> Personal access tokens -> Fine-grained tokens -> Generate new token
#
# You will likely need to change the resource owner to your organization.
# After selecting a repository, make sure to set the right repository permissions.
# To push and pull you will need Contents = Read and write; and Metadata = Read-only
#
# Get the token to the remote machine, you can
# 1. gpg encrypt it, mail it and decrypt it; or
# 2. type it in manually. To verify correct typing, use a checksum.
# linux: sha256sum <file
# windows: certutil -hashfile file SHA256

import sys
import logging
import os

def _parse_request(lines):
    """Parse a git credential request into a dict of attributes.

    Git writes one ``key=value`` attribute per line and terminates the request
    with a blank line.  Only the first ``=`` separates the key from the value,
    so values that themselves contain ``=`` (for example a path carrying a
    query string) are kept intact.  Lines without any ``=`` are logged and
    skipped.

    Args:
        lines: iterable of text lines (typically ``sys.stdin``).

    Returns:
        dict mapping attribute names to their string values.
    """
    attrs = {}
    for line in lines:
        if line == "\n":
            # A blank line marks the end of the request.
            break
        key, sep, value = line.rstrip().partition("=")
        if not sep:
            logging.error(f"{__file__}: unable to parse {line}")
            continue
        attrs[key] = value
    return attrs


def _emit_credentials(username, password):
    """Print a github.com https credential in git's helper output format."""
    print("protocol=https")
    print("host=github.com")
    print(f"username={username}")
    print(f"password={password}")


def main(argv=None, stdin=None):
    """Answer a git credential-helper request from environment variables.

    Only the ``get`` operation is handled; every other invocation is a no-op.
    Requests for github.com whose path mentions ``aryn-ai`` are answered with
    ARYN_GITHUB_USER / ARYN_GITHUB_KEY; everything else is answered with
    CUSTOMER_USER / CUSTOMER_KEY when both are set.

    Args:
        argv: command line to inspect; defaults to ``sys.argv``.
        stdin: line iterable holding the request; defaults to ``sys.stdin``.

    Returns:
        Process exit status: 0 normally, 1 when an Aryn repository was
        requested but the Aryn environment variables are missing.
    """
    argv = sys.argv if argv is None else argv
    stdin = sys.stdin if stdin is None else stdin

    if len(argv) != 2 or argv[1] != "get":
        return 0

    request = _parse_request(stdin)
    # "path" is only sent when credential.useHttpPath is enabled, so never
    # index it directly.
    path = request.get("path", "")

    if request.get("host", "") == "github.com" and "aryn-ai" in path:
        # Explicit check instead of ``assert`` so it still runs under ``-O``.
        missing = [name for name in ("ARYN_GITHUB_USER", "ARYN_GITHUB_KEY") if name not in os.environ]
        if missing:
            logging.error(f"{__file__}: missing required environment variable(s): {', '.join(missing)}")
            return 1
        _emit_credentials(os.environ["ARYN_GITHUB_USER"], os.environ["ARYN_GITHUB_KEY"])
        # Logged at error level so the message is visible with the default
        # logging configuration.
        logging.error(f"git-credentials-from-env helper: Aryn github.com was used for {path}")
        return 0

    if "CUSTOMER_USER" in os.environ and "CUSTOMER_KEY" in os.environ:
        # NOTE(review): this branch answers for any requested host, not only
        # github.com -- confirm that handing the customer token to other hosts
        # is intended.
        _emit_credentials(os.environ["CUSTOMER_USER"], os.environ["CUSTOMER_KEY"])
        logging.error(f"git-credentials-from-env helper: Customer user was used for {path}")
        return 0

    logging.error(f"WARNING from {__file__}: Unable to find CUSTOMER_USER and CUSTOMER_KEY in environ.")
    logging.error("WARNING since the helper was enabled, this is probably an error.")
    return 0


if __name__ == "__main__":
    sys.exit(main())
13 changes: 0 additions & 13 deletions lib/aryn-sdk/aryn_sdk/partition/partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@ def partition_file(
aryn_config: Optional[ArynConfig] = None,
threshold: Optional[Union[float, Literal["auto"]]] = None,
use_ocr: bool = False,
ocr_images: bool = False,
summarize_images: bool = False,
ocr_language: Optional[str] = None,
extract_table_structure: bool = False,
Expand Down Expand Up @@ -81,8 +80,6 @@ def partition_file(
default: None (Aryn DocParse will choose)
use_ocr: extract text using an OCR model instead of extracting embedded text in PDF.
default: False
ocr_images: attempt to use OCR to generate a text representation of detected images.
default: False
summarize_images: Generate a text summary of detected images using a VLM.
ocr_language: specify the language to use for OCR. If not set, the language will be english.
default: English
Expand Down Expand Up @@ -158,7 +155,6 @@ def partition_file(
aryn_config=aryn_config,
threshold=threshold,
use_ocr=use_ocr,
ocr_images=ocr_images,
summarize_images=summarize_images,
ocr_language=ocr_language,
extract_table_structure=extract_table_structure,
Expand All @@ -182,7 +178,6 @@ def _partition_file_wrapper(
aryn_config: Optional[ArynConfig] = None,
threshold: Optional[Union[float, Literal["auto"]]] = None,
use_ocr: bool = False,
ocr_images: bool = False,
summarize_images: bool = False,
ocr_language: Optional[str] = None,
extract_table_structure: bool = False,
Expand Down Expand Up @@ -212,7 +207,6 @@ def _partition_file_wrapper(
aryn_config=aryn_config,
threshold=threshold,
use_ocr=use_ocr,
ocr_images=ocr_images,
summarize_images=summarize_images,
ocr_language=ocr_language,
extract_table_structure=extract_table_structure,
Expand Down Expand Up @@ -240,7 +234,6 @@ def _partition_file_inner(
aryn_config: Optional[ArynConfig] = None,
threshold: Optional[Union[float, Literal["auto"]]] = None,
use_ocr: bool = False,
ocr_images: bool = False,
summarize_images: bool = False,
ocr_language: Optional[str] = None,
extract_table_structure: bool = False,
Expand Down Expand Up @@ -274,7 +267,6 @@ def _partition_file_inner(
options_str = _json_options(
threshold=threshold,
use_ocr=use_ocr,
ocr_images=ocr_images,
summarize_images=summarize_images,
ocr_language=ocr_language,
extract_table_structure=extract_table_structure,
Expand Down Expand Up @@ -382,7 +374,6 @@ def _should_stream() -> bool:
def _json_options(
threshold: Optional[Union[float, Literal["auto"]]] = None,
use_ocr: bool = False,
ocr_images: bool = False,
summarize_images: bool = False,
ocr_language: Optional[str] = None,
extract_table_structure: bool = False,
Expand All @@ -399,8 +390,6 @@ def _json_options(
options["threshold"] = threshold
if use_ocr:
options["use_ocr"] = use_ocr
if ocr_images:
options["ocr_images"] = ocr_images
if summarize_images:
options["summarize_images"] = summarize_images
if ocr_language:
Expand Down Expand Up @@ -432,7 +421,6 @@ def partition_file_async_submit(
aryn_config: Optional[ArynConfig] = None,
threshold: Optional[Union[float, Literal["auto"]]] = None,
use_ocr: bool = False,
ocr_images: bool = False,
summarize_images: bool = False,
ocr_language: Optional[str] = None,
extract_table_structure: bool = False,
Expand Down Expand Up @@ -489,7 +477,6 @@ def partition_file_async_submit(
aryn_config=aryn_config,
threshold=threshold,
use_ocr=use_ocr,
ocr_images=ocr_images,
summarize_images=summarize_images,
ocr_language=ocr_language,
extract_table_structure=extract_table_structure,
Expand Down
6 changes: 5 additions & 1 deletion lib/sycamore/sycamore/query/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,11 @@ def get_schema(self) -> OpenSearchSchema:
logger.debug(f"Getting schema for index {self._index}")
# Fetch example values.
query["index"] = self._index
query["query"] = {"query": {"match_all": {}}, "size": self.NUM_EXAMPLES}
query["query"] = {
"query": {"match_all": {}},
"size": self.NUM_EXAMPLES,
"sort": [{"_script": {"type": "number", "script": {"source": "Math.random()"}}}],
}
random_sample = self._query_executor.query(query)["result"]["hits"]["hits"]

result = OpenSearchSchema(
Expand Down
20 changes: 14 additions & 6 deletions lib/sycamore/sycamore/tests/integration/test_image_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import pdf2image
from PIL import Image
from copy import deepcopy


import sycamore
Expand All @@ -27,18 +28,23 @@ def image_boxes() -> list[BoundingBox]:


@pytest.fixture(scope="module")
def source_image() -> Image.Image:
def source_image_module_scope() -> Image.Image:
images = pdf2image.convert_from_path(path)
return images[0].convert(mode="RGBA")


@pytest.fixture(scope="function")
def source_image(source_image_module_scope) -> Image.Image:
    """Return a per-test deep copy of the module-scoped source image."""
    # presumably the copy keeps in-place drawing in one test from leaking into
    # the shared module-scoped image -- confirm against try_draw_boxes.
    private_copy = deepcopy(source_image_module_scope)
    return private_copy


# Checks that the image contains blue pixels. This is of course an imperfect check, but
# it at least will tell us if we drew some bounding boxes on the image. Won't work
# if the image contains blue pixels to begin with. Image must have mode RGBA.
def check_image(image: Image.Image, expected_color=(0, 0, 255, 255)) -> None:
raw_colors = image.getcolors(64_000)
assert raw_colors is not None, "Image has too many colors to count"
assert expected_color in set((color_tup[1] for color_tup in raw_colors))
assert expected_color in set((color_tup[1] for color_tup in raw_colors)), "Did not draw boxes"


def test_draw_boxes_bbox(source_image, image_boxes) -> None:
Expand Down Expand Up @@ -86,11 +92,13 @@ def test_draw_boxes_dict(source_image, image_boxes) -> None:

def test_invalid_list(source_image, image_boxes):
boxes = [[b.coordinates] for b in image_boxes]
with pytest.raises(ValueError):
try_draw_boxes(source_image, boxes)
output: Image.Image = try_draw_boxes(source_image, boxes)
with pytest.raises(AssertionError, match=r".*Did not draw boxes.*"):
check_image(output)


def test_invalid_dict(source_image, image_boxes):
boxes = [{"bboxes": b.coordinates} for b in image_boxes]
with pytest.raises(ValueError):
try_draw_boxes(source_image, boxes)
output: Image.Image = try_draw_boxes(source_image, boxes)
with pytest.raises(AssertionError, match=r".*Did not draw boxes.*"):
check_image(output)
21 changes: 1 addition & 20 deletions lib/sycamore/sycamore/transforms/detr_partitioner.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,6 @@ def partition_pdf(
file: BinaryIO,
threshold: Union[float, Literal["auto"]] = DEFAULT_LOCAL_THRESHOLD,
use_ocr=False,
ocr_images=False,
ocr_model: Union[str, OcrModel] = "easyocr",
per_element_ocr=True,
extract_table_structure=False,
Expand All @@ -167,7 +166,6 @@ def partition_pdf(
aryn_partitioner_address=aryn_partitioner_address,
threshold=threshold,
use_ocr=use_ocr,
ocr_images=ocr_images,
extract_table_structure=extract_table_structure,
extract_images=extract_images,
pages_per_call=pages_per_call,
Expand All @@ -183,7 +181,6 @@ def partition_pdf(
file=file,
threshold=threshold,
use_ocr=use_ocr,
ocr_images=ocr_images,
ocr_model=ocr_model,
per_element_ocr=per_element_ocr,
extract_table_structure=extract_table_structure,
Expand Down Expand Up @@ -226,7 +223,6 @@ def _call_remote_partitioner(
aryn_partitioner_address=DEFAULT_ARYN_PARTITIONER_ADDRESS,
threshold: Union[float, Literal["auto"]] = "auto",
use_ocr: bool = False,
ocr_images: bool = False,
extract_table_structure: bool = False,
extract_images: bool = False,
selected_pages: list = [],
Expand All @@ -238,7 +234,6 @@ def _call_remote_partitioner(
options = {
"threshold": threshold,
"use_ocr": use_ocr,
"ocr_images": ocr_images,
"extract_table_structure": extract_table_structure,
"extract_images": extract_images,
"selected_pages": selected_pages,
Expand Down Expand Up @@ -341,7 +336,6 @@ def _partition_remote(
aryn_partitioner_address=DEFAULT_ARYN_PARTITIONER_ADDRESS,
threshold: Union[float, Literal["auto"]] = "auto",
use_ocr: bool = False,
ocr_images: bool = False,
extract_table_structure: bool = False,
extract_images: bool = False,
pages_per_call: int = -1,
Expand All @@ -364,7 +358,6 @@ def _partition_remote(
aryn_partitioner_address=aryn_partitioner_address,
threshold=threshold,
use_ocr=use_ocr,
ocr_images=ocr_images,
extract_table_structure=extract_table_structure,
extract_images=extract_images,
selected_pages=[[low, min(high, page_count)]],
Expand All @@ -383,7 +376,6 @@ def _partition_pdf_batched(
file: BinaryIO,
threshold: float = DEFAULT_LOCAL_THRESHOLD,
use_ocr: bool = False,
ocr_images: bool = False,
ocr_model: Union[str, OcrModel] = "easyocr",
per_element_ocr: bool = True,
extract_table_structure: bool = False,
Expand All @@ -410,7 +402,6 @@ def _partition_pdf_batched(
file_hash.hexdigest(),
threshold,
use_ocr,
ocr_images,
ocr_model,
per_element_ocr,
extract_table_structure,
Expand All @@ -430,7 +421,6 @@ def _partition_pdf_batched_named(
hash_key: str,
threshold: float = DEFAULT_LOCAL_THRESHOLD,
use_ocr: bool = False,
ocr_images: bool = False,
ocr_model: Union[str, OcrModel] = "easyocr",
per_element_ocr: bool = True,
extract_table_structure=False,
Expand Down Expand Up @@ -472,7 +462,6 @@ def _partition_pdf_batched_named(
use_ocr=use_ocr,
text_extractor=text_extractor,
extractor_inputs=extractor_inputs,
ocr_images=ocr_images,
ocr_model=ocr_model,
per_element_ocr=per_element_ocr,
extract_table_structure=extract_table_structure,
Expand Down Expand Up @@ -504,7 +493,6 @@ def process_batch(
text_extractor: TextExtractor,
extractor_inputs: Any,
use_ocr: bool,
ocr_images: bool,
ocr_model: Union[str, OcrModel],
per_element_ocr: bool,
extract_table_structure: bool,
Expand All @@ -526,7 +514,6 @@ def process_batch(
extract_ocr(
batch,
deformable_layout,
ocr_images=ocr_images,
ocr_model=ocr_model,
)
else:
Expand Down Expand Up @@ -571,12 +558,11 @@ def _run_text_extractor_document(
hash_key: str,
use_cache: bool,
use_ocr: bool,
ocr_images: bool,
text_extractor_model: Union[str, OcrModel],
text_extraction_options: dict[str, Any],
images: Optional[list[Image.Image]] = None,
):
kwargs = {"ocr_images": ocr_images, "images": images}
kwargs = {"images": images}
if isinstance(text_extractor_model, OcrModel):
model: TextExtractor = text_extractor_model
else:
Expand All @@ -592,7 +578,6 @@ def process_batch_inference(
use_cache: bool,
use_ocr: bool,
ocr_model: Union[str, OcrModel],
ocr_images: bool,
per_element_ocr: bool,
) -> Any:
self._init_model()
Expand All @@ -606,7 +591,6 @@ def process_batch_inference(
extract_ocr(
batch,
deformable_layout,
ocr_images=ocr_images,
ocr_model=ocr_model,
)
return deformable_layout
Expand Down Expand Up @@ -777,7 +761,6 @@ def _get_hash_key(self, image: Image.Image, threshold: float) -> str:
def extract_ocr(
images: list[Image.Image],
elements: list[list[Element]],
ocr_images: bool = False,
ocr_model: Union[str, OcrModel] = "easyocr",
text_extraction_options: dict[str, Any] = {},
) -> list[list[Element]]:
Expand All @@ -796,8 +779,6 @@ def extract_ocr(
for elem in page_elements:
if elem.bbox is None:
continue
if elem.type == "Picture" and not ocr_images:
continue
cropped_image = crop_to_bbox(image, elem.bbox, padding=0)
if 0 in cropped_image.size:
elem.text_representation = ""
Expand Down
Loading

0 comments on commit cf9db7c

Please sign in to comment.