From faafd115fa07ec240e7b8d12ebe27f15a3d41cf7 Mon Sep 17 00:00:00 2001 From: Eric Anderson Date: Thu, 20 Feb 2025 13:11:24 -0800 Subject: [PATCH 1/4] Helper script for getting git credentials from the environment (#1190) Useful for passing in fine-grained personal access tokens via environment variables when SSHing into a node so that they don't have to be stored in a shared environment. --- apps/git/git-credential-from-env.py | 60 +++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100755 apps/git/git-credential-from-env.py diff --git a/apps/git/git-credential-from-env.py b/apps/git/git-credential-from-env.py new file mode 100755 index 000000000..748f2473f --- /dev/null +++ b/apps/git/git-credential-from-env.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 +# +# Credential helper to enable people to store fine grained access tokens in ssh environment +# variables for use on a shared instance. +# git config --global credential.helper ..../git-credential-from-env.py +# git config --global credential.useHttpPath true +# +# To make a fine grained access token: +# GitHub console upper right -> Settings -> Developer settings (lower left) +# -> Personal access tokens -> Fine-grained tokens -> Generate new token +# +# You will likely need to change the resource owner to your organization. +# After selecting a repository, make sure to set the right repository permissions +# To push and pull you will need Contents = Read and write; and Metadata = Read-only +# +# To get the token to the remote machine, you can +# 1. gpg encrypt it, mail it and decrypt it; or +# 2. type it in manually. To verify correct typing, use a checksum. 
+# linux: sha256sum Date: Mon, 24 Feb 2025 12:40:04 -0800 Subject: [PATCH 2/4] fix image tests - we don't explode on bad bboxes these days (#1193) Signed-off-by: Henry Lindeman --- .../tests/integration/test_image_utils.py | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/lib/sycamore/sycamore/tests/integration/test_image_utils.py b/lib/sycamore/sycamore/tests/integration/test_image_utils.py index 05c486027..9eb7ea716 100644 --- a/lib/sycamore/sycamore/tests/integration/test_image_utils.py +++ b/lib/sycamore/sycamore/tests/integration/test_image_utils.py @@ -2,6 +2,7 @@ import pdf2image from PIL import Image +from copy import deepcopy import sycamore @@ -27,18 +28,23 @@ def image_boxes() -> list[BoundingBox]: @pytest.fixture(scope="module") -def source_image() -> Image.Image: +def source_image_module_scope() -> Image.Image: images = pdf2image.convert_from_path(path) return images[0].convert(mode="RGBA") +@pytest.fixture(scope="function") +def source_image(source_image_module_scope) -> Image.Image: + return deepcopy(source_image_module_scope) + + # Checks that the image contains blue pixels. This is of course an imperfect check, but # it at least will tell us if we drew some bounding boxes. on the image. Won't work # if the image contains blue pixels to begin with. Image must have mode RGBA. 
def check_image(image: Image.Image, expected_color=(0, 0, 255, 255)) -> None: raw_colors = image.getcolors(64_000) assert raw_colors is not None, "Image has too many colors to count" - assert expected_color in set((color_tup[1] for color_tup in raw_colors)) + assert expected_color in set((color_tup[1] for color_tup in raw_colors)), "Did not draw boxes" def test_draw_boxes_bbox(source_image, image_boxes) -> None: @@ -86,11 +92,13 @@ def test_draw_boxes_dict(source_image, image_boxes) -> None: def test_invalid_list(source_image, image_boxes): boxes = [[b.coordinates] for b in image_boxes] - with pytest.raises(ValueError): - try_draw_boxes(source_image, boxes) + output: Image.Image = try_draw_boxes(source_image, boxes) + with pytest.raises(AssertionError, match=r".*Did not draw boxes.*"): + check_image(output) def test_invalid_dict(source_image, image_boxes): boxes = [{"bboxes": b.coordinates} for b in image_boxes] - with pytest.raises(ValueError): - try_draw_boxes(source_image, boxes) + output: Image.Image = try_draw_boxes(source_image, boxes) + with pytest.raises(AssertionError, match=r".*Did not draw boxes.*"): + check_image(output) From 2711b1001dd362fd7c6e7fceaa66f31c2f902c9d Mon Sep 17 00:00:00 2001 From: Dhruv Kaliraman <112497058+dhruvkaliraman7@users.noreply.github.com> Date: Mon, 24 Feb 2025 18:21:33 -0500 Subject: [PATCH 3/4] Get random hits when filtering properties (#1195) --- lib/sycamore/sycamore/query/schema.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lib/sycamore/sycamore/query/schema.py b/lib/sycamore/sycamore/query/schema.py index 97aafd7ce..dd1405c58 100644 --- a/lib/sycamore/sycamore/query/schema.py +++ b/lib/sycamore/sycamore/query/schema.py @@ -57,7 +57,11 @@ def get_schema(self) -> OpenSearchSchema: logger.debug(f"Getting schema for index {self._index}") # Fetch example values. 
query["index"] = self._index - query["query"] = {"query": {"match_all": {}}, "size": self.NUM_EXAMPLES} + query["query"] = { + "query": {"match_all": {}}, + "size": self.NUM_EXAMPLES, + "sort": [{"_script": {"type": "number", "script": {"source": "Math.random()"}}}], + } random_sample = self._query_executor.query(query)["result"]["hits"]["hits"] result = OpenSearchSchema( From 27285f0522b410d0ecac977dd3b3ebe07ea9f6db Mon Sep 17 00:00:00 2001 From: Karan Sampath <176953591+karanataryn@users.noreply.github.com> Date: Tue, 25 Feb 2025 10:01:06 -0800 Subject: [PATCH 4/4] remove ocr images (#1194) --- lib/aryn-sdk/aryn_sdk/partition/partition.py | 13 ------------ .../sycamore/transforms/detr_partitioner.py | 21 +------------------ lib/sycamore/sycamore/transforms/partition.py | 7 ------- 3 files changed, 1 insertion(+), 40 deletions(-) diff --git a/lib/aryn-sdk/aryn_sdk/partition/partition.py b/lib/aryn-sdk/aryn_sdk/partition/partition.py index bdd30ae45..73979f91e 100644 --- a/lib/aryn-sdk/aryn_sdk/partition/partition.py +++ b/lib/aryn-sdk/aryn_sdk/partition/partition.py @@ -50,7 +50,6 @@ def partition_file( aryn_config: Optional[ArynConfig] = None, threshold: Optional[Union[float, Literal["auto"]]] = None, use_ocr: bool = False, - ocr_images: bool = False, ocr_language: Optional[str] = None, extract_table_structure: bool = False, table_extraction_options: dict[str, Any] = {}, @@ -80,8 +79,6 @@ def partition_file( default: None (Aryn DocParse will choose) use_ocr: extract text using an OCR model instead of extracting embedded text in PDF. default: False - ocr_images: attempt to use OCR to generate a text representation of detected images. - default: False ocr_language: specify the language to use for OCR. If not set, the language will be english. default: English extract_table_structure: extract tables and their structural content. 
@@ -156,7 +153,6 @@ def partition_file( aryn_config=aryn_config, threshold=threshold, use_ocr=use_ocr, - ocr_images=ocr_images, ocr_language=ocr_language, extract_table_structure=extract_table_structure, table_extraction_options=table_extraction_options, @@ -179,7 +175,6 @@ def _partition_file_wrapper( aryn_config: Optional[ArynConfig] = None, threshold: Optional[Union[float, Literal["auto"]]] = None, use_ocr: bool = False, - ocr_images: bool = False, ocr_language: Optional[str] = None, extract_table_structure: bool = False, table_extraction_options: dict[str, Any] = {}, @@ -208,7 +203,6 @@ def _partition_file_wrapper( aryn_config=aryn_config, threshold=threshold, use_ocr=use_ocr, - ocr_images=ocr_images, ocr_language=ocr_language, extract_table_structure=extract_table_structure, table_extraction_options=table_extraction_options, @@ -235,7 +229,6 @@ def _partition_file_inner( aryn_config: Optional[ArynConfig] = None, threshold: Optional[Union[float, Literal["auto"]]] = None, use_ocr: bool = False, - ocr_images: bool = False, ocr_language: Optional[str] = None, extract_table_structure: bool = False, table_extraction_options: dict[str, Any] = {}, @@ -268,7 +261,6 @@ def _partition_file_inner( options_str = _json_options( threshold=threshold, use_ocr=use_ocr, - ocr_images=ocr_images, ocr_language=ocr_language, extract_table_structure=extract_table_structure, table_extraction_options=table_extraction_options, @@ -375,7 +367,6 @@ def _should_stream() -> bool: def _json_options( threshold: Optional[Union[float, Literal["auto"]]] = None, use_ocr: bool = False, - ocr_images: bool = False, ocr_language: Optional[str] = None, extract_table_structure: bool = False, table_extraction_options: dict[str, Any] = {}, @@ -391,8 +382,6 @@ def _json_options( options["threshold"] = threshold if use_ocr: options["use_ocr"] = use_ocr - if ocr_images: - options["ocr_images"] = ocr_images if ocr_language: options["ocr_language"] = ocr_language if extract_images: @@ -422,7 +411,6 @@ def 
partition_file_async_submit( aryn_config: Optional[ArynConfig] = None, threshold: Optional[Union[float, Literal["auto"]]] = None, use_ocr: bool = False, - ocr_images: bool = False, ocr_language: Optional[str] = None, extract_table_structure: bool = False, table_extraction_options: dict[str, Any] = {}, @@ -478,7 +466,6 @@ def partition_file_async_submit( aryn_config=aryn_config, threshold=threshold, use_ocr=use_ocr, - ocr_images=ocr_images, ocr_language=ocr_language, extract_table_structure=extract_table_structure, table_extraction_options=table_extraction_options, diff --git a/lib/sycamore/sycamore/transforms/detr_partitioner.py b/lib/sycamore/sycamore/transforms/detr_partitioner.py index cd00c01c1..4e9489ee6 100644 --- a/lib/sycamore/sycamore/transforms/detr_partitioner.py +++ b/lib/sycamore/sycamore/transforms/detr_partitioner.py @@ -140,7 +140,6 @@ def partition_pdf( file: BinaryIO, threshold: Union[float, Literal["auto"]] = DEFAULT_LOCAL_THRESHOLD, use_ocr=False, - ocr_images=False, ocr_model: Union[str, OcrModel] = "easyocr", per_element_ocr=True, extract_table_structure=False, @@ -167,7 +166,6 @@ def partition_pdf( aryn_partitioner_address=aryn_partitioner_address, threshold=threshold, use_ocr=use_ocr, - ocr_images=ocr_images, extract_table_structure=extract_table_structure, extract_images=extract_images, pages_per_call=pages_per_call, @@ -183,7 +181,6 @@ def partition_pdf( file=file, threshold=threshold, use_ocr=use_ocr, - ocr_images=ocr_images, ocr_model=ocr_model, per_element_ocr=per_element_ocr, extract_table_structure=extract_table_structure, @@ -226,7 +223,6 @@ def _call_remote_partitioner( aryn_partitioner_address=DEFAULT_ARYN_PARTITIONER_ADDRESS, threshold: Union[float, Literal["auto"]] = "auto", use_ocr: bool = False, - ocr_images: bool = False, extract_table_structure: bool = False, extract_images: bool = False, selected_pages: list = [], @@ -238,7 +234,6 @@ def _call_remote_partitioner( options = { "threshold": threshold, "use_ocr": use_ocr, - 
"ocr_images": ocr_images, "extract_table_structure": extract_table_structure, "extract_images": extract_images, "selected_pages": selected_pages, @@ -341,7 +336,6 @@ def _partition_remote( aryn_partitioner_address=DEFAULT_ARYN_PARTITIONER_ADDRESS, threshold: Union[float, Literal["auto"]] = "auto", use_ocr: bool = False, - ocr_images: bool = False, extract_table_structure: bool = False, extract_images: bool = False, pages_per_call: int = -1, @@ -364,7 +358,6 @@ def _partition_remote( aryn_partitioner_address=aryn_partitioner_address, threshold=threshold, use_ocr=use_ocr, - ocr_images=ocr_images, extract_table_structure=extract_table_structure, extract_images=extract_images, selected_pages=[[low, min(high, page_count)]], @@ -383,7 +376,6 @@ def _partition_pdf_batched( file: BinaryIO, threshold: float = DEFAULT_LOCAL_THRESHOLD, use_ocr: bool = False, - ocr_images: bool = False, ocr_model: Union[str, OcrModel] = "easyocr", per_element_ocr: bool = True, extract_table_structure: bool = False, @@ -410,7 +402,6 @@ def _partition_pdf_batched( file_hash.hexdigest(), threshold, use_ocr, - ocr_images, ocr_model, per_element_ocr, extract_table_structure, @@ -430,7 +421,6 @@ def _partition_pdf_batched_named( hash_key: str, threshold: float = DEFAULT_LOCAL_THRESHOLD, use_ocr: bool = False, - ocr_images: bool = False, ocr_model: Union[str, OcrModel] = "easyocr", per_element_ocr: bool = True, extract_table_structure=False, @@ -472,7 +462,6 @@ def _partition_pdf_batched_named( use_ocr=use_ocr, text_extractor=text_extractor, extractor_inputs=extractor_inputs, - ocr_images=ocr_images, ocr_model=ocr_model, per_element_ocr=per_element_ocr, extract_table_structure=extract_table_structure, @@ -504,7 +493,6 @@ def process_batch( text_extractor: TextExtractor, extractor_inputs: Any, use_ocr: bool, - ocr_images: bool, ocr_model: Union[str, OcrModel], per_element_ocr: bool, extract_table_structure: bool, @@ -526,7 +514,6 @@ def process_batch( extract_ocr( batch, deformable_layout, - 
ocr_images=ocr_images, ocr_model=ocr_model, ) else: @@ -571,12 +558,11 @@ def _run_text_extractor_document( hash_key: str, use_cache: bool, use_ocr: bool, - ocr_images: bool, text_extractor_model: Union[str, OcrModel], text_extraction_options: dict[str, Any], images: Optional[list[Image.Image]] = None, ): - kwargs = {"ocr_images": ocr_images, "images": images} + kwargs = {"images": images} if isinstance(text_extractor_model, OcrModel): model: TextExtractor = text_extractor_model else: @@ -592,7 +578,6 @@ def process_batch_inference( use_cache: bool, use_ocr: bool, ocr_model: Union[str, OcrModel], - ocr_images: bool, per_element_ocr: bool, ) -> Any: self._init_model() @@ -606,7 +591,6 @@ def process_batch_inference( extract_ocr( batch, deformable_layout, - ocr_images=ocr_images, ocr_model=ocr_model, ) return deformable_layout @@ -777,7 +761,6 @@ def _get_hash_key(self, image: Image.Image, threshold: float) -> str: def extract_ocr( images: list[Image.Image], elements: list[list[Element]], - ocr_images: bool = False, ocr_model: Union[str, OcrModel] = "easyocr", text_extraction_options: dict[str, Any] = {}, ) -> list[list[Element]]: @@ -796,8 +779,6 @@ def extract_ocr( for elem in page_elements: if elem.bbox is None: continue - if elem.type == "Picture" and not ocr_images: - continue cropped_image = crop_to_bbox(image, elem.bbox, padding=0) if 0 in cropped_image.size: elem.text_representation = "" diff --git a/lib/sycamore/sycamore/transforms/partition.py b/lib/sycamore/sycamore/transforms/partition.py index c39a5c1b2..6b3dbb65c 100644 --- a/lib/sycamore/sycamore/transforms/partition.py +++ b/lib/sycamore/sycamore/transforms/partition.py @@ -379,8 +379,6 @@ class ArynPartitioner(Partitioner): use_ocr: Whether to use OCR to extract text from the PDF. If false, we will attempt to extract the text from the underlying PDF. default: False - ocr_images: If set with use_ocr, will attempt to OCR regions of the document identified as images. 
- default: False ocr_model: model to use for OCR. Choices are "easyocr", "paddle", "tesseract" and "legacy", which correspond to EasyOCR, PaddleOCR, and Tesseract respectively, with "legacy" being a combination of Tesseract for text and EasyOCR for tables. If you choose paddle make sure to install @@ -446,7 +444,6 @@ def __init__( model_name_or_path=ARYN_DETR_MODEL, threshold: Optional[Union[float, Literal["auto"]]] = None, use_ocr: bool = False, - ocr_images: bool = False, ocr_model: str = "easyocr", per_element_ocr: bool = True, extract_table_structure: bool = False, @@ -489,7 +486,6 @@ def __init__( self._threshold = threshold self._use_ocr = use_ocr - self._ocr_images = ocr_images self._ocr_model = ocr_model self._per_element_ocr = per_element_ocr self._extract_table_structure = extract_table_structure @@ -519,7 +515,6 @@ def partition(self, document: Document) -> Document: binary, self._threshold, use_ocr=self._use_ocr, - ocr_images=self._ocr_images, per_element_ocr=self._per_element_ocr, ocr_model=self._ocr_model, extract_table_structure=self._extract_table_structure, @@ -561,7 +556,6 @@ def __init__( model_name_or_path=ARYN_DETR_MODEL, threshold: float = 0.4, use_ocr=False, - ocr_images=False, ocr_tables=False, extract_table_structure=False, table_structure_extractor=None, @@ -574,7 +568,6 @@ def __init__( model_name_or_path=model_name_or_path, threshold=threshold, use_ocr=use_ocr, - ocr_images=ocr_images, extract_table_structure=extract_table_structure, extract_images=extract_images, device=device,