diff --git a/apps/git/git-credential-from-env.py b/apps/git/git-credential-from-env.py new file mode 100755 index 000000000..748f2473f --- /dev/null +++ b/apps/git/git-credential-from-env.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 +# +# Credential helper to enable people to store fine-grained access tokens in ssh environment +# variables for use on a shared instance. +# git config --global credential.helper ..../git-credential-from-env.py +# git config --global credential.useHttpPath true +# +# To make a fine-grained access token: +# GitHub console upper right -> Settings -> Developer settings (lower left) +# -> Personal access tokens -> Fine-grained tokens -> Generate new token +# +# You will likely need to change the resource owner to your organization. +# After selecting a repository, make sure to set the right repository permissions. +# To push and pull you will need Contents = Read and write; and Metadata = Read-only +# +# To get the token to the remote machine, you can: +# 1. gpg encrypt it, mail it and decrypt it; or +# 2. type it in manually. To verify correct typing, use a checksum.
+# linux: sha256sum bool: def _json_options( threshold: Optional[Union[float, Literal["auto"]]] = None, use_ocr: bool = False, - ocr_images: bool = False, summarize_images: bool = False, ocr_language: Optional[str] = None, extract_table_structure: bool = False, @@ -399,8 +390,6 @@ def _json_options( options["threshold"] = threshold if use_ocr: options["use_ocr"] = use_ocr - if ocr_images: - options["ocr_images"] = ocr_images if summarize_images: options["summarize_images"] = summarize_images if ocr_language: @@ -432,7 +421,6 @@ def partition_file_async_submit( aryn_config: Optional[ArynConfig] = None, threshold: Optional[Union[float, Literal["auto"]]] = None, use_ocr: bool = False, - ocr_images: bool = False, summarize_images: bool = False, ocr_language: Optional[str] = None, extract_table_structure: bool = False, @@ -489,7 +477,6 @@ def partition_file_async_submit( aryn_config=aryn_config, threshold=threshold, use_ocr=use_ocr, - ocr_images=ocr_images, summarize_images=summarize_images, ocr_language=ocr_language, extract_table_structure=extract_table_structure, diff --git a/lib/sycamore/sycamore/query/schema.py b/lib/sycamore/sycamore/query/schema.py index 97aafd7ce..dd1405c58 100644 --- a/lib/sycamore/sycamore/query/schema.py +++ b/lib/sycamore/sycamore/query/schema.py @@ -57,7 +57,11 @@ def get_schema(self) -> OpenSearchSchema: logger.debug(f"Getting schema for index {self._index}") # Fetch example values. 
query["index"] = self._index - query["query"] = {"query": {"match_all": {}}, "size": self.NUM_EXAMPLES} + query["query"] = { + "query": {"match_all": {}}, + "size": self.NUM_EXAMPLES, + "sort": [{"_script": {"type": "number", "script": {"source": "Math.random()"}}}], + } random_sample = self._query_executor.query(query)["result"]["hits"]["hits"] result = OpenSearchSchema( diff --git a/lib/sycamore/sycamore/tests/integration/test_image_utils.py b/lib/sycamore/sycamore/tests/integration/test_image_utils.py index 05c486027..9eb7ea716 100644 --- a/lib/sycamore/sycamore/tests/integration/test_image_utils.py +++ b/lib/sycamore/sycamore/tests/integration/test_image_utils.py @@ -2,6 +2,7 @@ import pdf2image from PIL import Image +from copy import deepcopy import sycamore @@ -27,18 +28,23 @@ def image_boxes() -> list[BoundingBox]: @pytest.fixture(scope="module") -def source_image() -> Image.Image: +def source_image_module_scope() -> Image.Image: images = pdf2image.convert_from_path(path) return images[0].convert(mode="RGBA") +@pytest.fixture(scope="function") +def source_image(source_image_module_scope) -> Image.Image: + return deepcopy(source_image_module_scope) + + # Checks that the image contains blue pixels. This is of course an imperfect check, but # it at least will tell us if we drew some bounding boxes. on the image. Won't work # if the image contains blue pixels to begin with. Image must have mode RGBA. 
def check_image(image: Image.Image, expected_color=(0, 0, 255, 255)) -> None: raw_colors = image.getcolors(64_000) assert raw_colors is not None, "Image has too many colors to count" - assert expected_color in set((color_tup[1] for color_tup in raw_colors)) + assert expected_color in set((color_tup[1] for color_tup in raw_colors)), "Did not draw boxes" def test_draw_boxes_bbox(source_image, image_boxes) -> None: @@ -86,11 +92,13 @@ def test_draw_boxes_dict(source_image, image_boxes) -> None: def test_invalid_list(source_image, image_boxes): boxes = [[b.coordinates] for b in image_boxes] - with pytest.raises(ValueError): - try_draw_boxes(source_image, boxes) + output: Image.Image = try_draw_boxes(source_image, boxes) + with pytest.raises(AssertionError, match=r".*Did not draw boxes.*"): + check_image(output) def test_invalid_dict(source_image, image_boxes): boxes = [{"bboxes": b.coordinates} for b in image_boxes] - with pytest.raises(ValueError): - try_draw_boxes(source_image, boxes) + output: Image.Image = try_draw_boxes(source_image, boxes) + with pytest.raises(AssertionError, match=r".*Did not draw boxes.*"): + check_image(output) diff --git a/lib/sycamore/sycamore/transforms/detr_partitioner.py b/lib/sycamore/sycamore/transforms/detr_partitioner.py index cd00c01c1..4e9489ee6 100644 --- a/lib/sycamore/sycamore/transforms/detr_partitioner.py +++ b/lib/sycamore/sycamore/transforms/detr_partitioner.py @@ -140,7 +140,6 @@ def partition_pdf( file: BinaryIO, threshold: Union[float, Literal["auto"]] = DEFAULT_LOCAL_THRESHOLD, use_ocr=False, - ocr_images=False, ocr_model: Union[str, OcrModel] = "easyocr", per_element_ocr=True, extract_table_structure=False, @@ -167,7 +166,6 @@ def partition_pdf( aryn_partitioner_address=aryn_partitioner_address, threshold=threshold, use_ocr=use_ocr, - ocr_images=ocr_images, extract_table_structure=extract_table_structure, extract_images=extract_images, pages_per_call=pages_per_call, @@ -183,7 +181,6 @@ def partition_pdf( file=file, 
threshold=threshold, use_ocr=use_ocr, - ocr_images=ocr_images, ocr_model=ocr_model, per_element_ocr=per_element_ocr, extract_table_structure=extract_table_structure, @@ -226,7 +223,6 @@ def _call_remote_partitioner( aryn_partitioner_address=DEFAULT_ARYN_PARTITIONER_ADDRESS, threshold: Union[float, Literal["auto"]] = "auto", use_ocr: bool = False, - ocr_images: bool = False, extract_table_structure: bool = False, extract_images: bool = False, selected_pages: list = [], @@ -238,7 +234,6 @@ def _call_remote_partitioner( options = { "threshold": threshold, "use_ocr": use_ocr, - "ocr_images": ocr_images, "extract_table_structure": extract_table_structure, "extract_images": extract_images, "selected_pages": selected_pages, @@ -341,7 +336,6 @@ def _partition_remote( aryn_partitioner_address=DEFAULT_ARYN_PARTITIONER_ADDRESS, threshold: Union[float, Literal["auto"]] = "auto", use_ocr: bool = False, - ocr_images: bool = False, extract_table_structure: bool = False, extract_images: bool = False, pages_per_call: int = -1, @@ -364,7 +358,6 @@ def _partition_remote( aryn_partitioner_address=aryn_partitioner_address, threshold=threshold, use_ocr=use_ocr, - ocr_images=ocr_images, extract_table_structure=extract_table_structure, extract_images=extract_images, selected_pages=[[low, min(high, page_count)]], @@ -383,7 +376,6 @@ def _partition_pdf_batched( file: BinaryIO, threshold: float = DEFAULT_LOCAL_THRESHOLD, use_ocr: bool = False, - ocr_images: bool = False, ocr_model: Union[str, OcrModel] = "easyocr", per_element_ocr: bool = True, extract_table_structure: bool = False, @@ -410,7 +402,6 @@ def _partition_pdf_batched( file_hash.hexdigest(), threshold, use_ocr, - ocr_images, ocr_model, per_element_ocr, extract_table_structure, @@ -430,7 +421,6 @@ def _partition_pdf_batched_named( hash_key: str, threshold: float = DEFAULT_LOCAL_THRESHOLD, use_ocr: bool = False, - ocr_images: bool = False, ocr_model: Union[str, OcrModel] = "easyocr", per_element_ocr: bool = True, 
extract_table_structure=False, @@ -472,7 +462,6 @@ def _partition_pdf_batched_named( use_ocr=use_ocr, text_extractor=text_extractor, extractor_inputs=extractor_inputs, - ocr_images=ocr_images, ocr_model=ocr_model, per_element_ocr=per_element_ocr, extract_table_structure=extract_table_structure, @@ -504,7 +493,6 @@ def process_batch( text_extractor: TextExtractor, extractor_inputs: Any, use_ocr: bool, - ocr_images: bool, ocr_model: Union[str, OcrModel], per_element_ocr: bool, extract_table_structure: bool, @@ -526,7 +514,6 @@ def process_batch( extract_ocr( batch, deformable_layout, - ocr_images=ocr_images, ocr_model=ocr_model, ) else: @@ -571,12 +558,11 @@ def _run_text_extractor_document( hash_key: str, use_cache: bool, use_ocr: bool, - ocr_images: bool, text_extractor_model: Union[str, OcrModel], text_extraction_options: dict[str, Any], images: Optional[list[Image.Image]] = None, ): - kwargs = {"ocr_images": ocr_images, "images": images} + kwargs = {"images": images} if isinstance(text_extractor_model, OcrModel): model: TextExtractor = text_extractor_model else: @@ -592,7 +578,6 @@ def process_batch_inference( use_cache: bool, use_ocr: bool, ocr_model: Union[str, OcrModel], - ocr_images: bool, per_element_ocr: bool, ) -> Any: self._init_model() @@ -606,7 +591,6 @@ def process_batch_inference( extract_ocr( batch, deformable_layout, - ocr_images=ocr_images, ocr_model=ocr_model, ) return deformable_layout @@ -777,7 +761,6 @@ def _get_hash_key(self, image: Image.Image, threshold: float) -> str: def extract_ocr( images: list[Image.Image], elements: list[list[Element]], - ocr_images: bool = False, ocr_model: Union[str, OcrModel] = "easyocr", text_extraction_options: dict[str, Any] = {}, ) -> list[list[Element]]: @@ -796,8 +779,6 @@ def extract_ocr( for elem in page_elements: if elem.bbox is None: continue - if elem.type == "Picture" and not ocr_images: - continue cropped_image = crop_to_bbox(image, elem.bbox, padding=0) if 0 in cropped_image.size: 
elem.text_representation = "" diff --git a/lib/sycamore/sycamore/transforms/partition.py b/lib/sycamore/sycamore/transforms/partition.py index c39a5c1b2..6b3dbb65c 100644 --- a/lib/sycamore/sycamore/transforms/partition.py +++ b/lib/sycamore/sycamore/transforms/partition.py @@ -379,8 +379,6 @@ class ArynPartitioner(Partitioner): use_ocr: Whether to use OCR to extract text from the PDF. If false, we will attempt to extract the text from the underlying PDF. default: False - ocr_images: If set with use_ocr, will attempt to OCR regions of the document identified as images. - default: False ocr_model: model to use for OCR. Choices are "easyocr", "paddle", "tesseract" and "legacy", which correspond to EasyOCR, PaddleOCR, and Tesseract respectively, with "legacy" being a combination of Tesseract for text and EasyOCR for tables. If you choose paddle make sure to install @@ -446,7 +444,6 @@ def __init__( model_name_or_path=ARYN_DETR_MODEL, threshold: Optional[Union[float, Literal["auto"]]] = None, use_ocr: bool = False, - ocr_images: bool = False, ocr_model: str = "easyocr", per_element_ocr: bool = True, extract_table_structure: bool = False, @@ -489,7 +486,6 @@ def __init__( self._threshold = threshold self._use_ocr = use_ocr - self._ocr_images = ocr_images self._ocr_model = ocr_model self._per_element_ocr = per_element_ocr self._extract_table_structure = extract_table_structure @@ -519,7 +515,6 @@ def partition(self, document: Document) -> Document: binary, self._threshold, use_ocr=self._use_ocr, - ocr_images=self._ocr_images, per_element_ocr=self._per_element_ocr, ocr_model=self._ocr_model, extract_table_structure=self._extract_table_structure, @@ -561,7 +556,6 @@ def __init__( model_name_or_path=ARYN_DETR_MODEL, threshold: float = 0.4, use_ocr=False, - ocr_images=False, ocr_tables=False, extract_table_structure=False, table_structure_extractor=None, @@ -574,7 +568,6 @@ def __init__( model_name_or_path=model_name_or_path, threshold=threshold, use_ocr=use_ocr, - 
ocr_images=ocr_images, extract_table_structure=extract_table_structure, extract_images=extract_images, device=device,