From 0700b116b5500c3b2f4ecb150d3b860e0717bbb9 Mon Sep 17 00:00:00 2001 From: Henry Lindeman Date: Wed, 4 Dec 2024 14:40:30 -0800 Subject: [PATCH 1/6] factor around deformable detr loading/lockfile management for use with deformable table extractor Signed-off-by: Henry Lindeman --- .../sycamore/transforms/detr_partitioner.py | 16 +++------ .../transforms/table_structure/extract.py | 4 +-- lib/sycamore/sycamore/utils/model_load.py | 33 +++++++++++++++++++ 3 files changed, 39 insertions(+), 14 deletions(-) create mode 100644 lib/sycamore/sycamore/utils/model_load.py diff --git a/lib/sycamore/sycamore/transforms/detr_partitioner.py b/lib/sycamore/sycamore/transforms/detr_partitioner.py index 1e7e65ca4..c2db46b5c 100644 --- a/lib/sycamore/sycamore/transforms/detr_partitioner.py +++ b/lib/sycamore/sycamore/transforms/detr_partitioner.py @@ -14,7 +14,6 @@ from tenacity import retry, retry_if_exception, wait_exponential, stop_after_delay import base64 from PIL import Image -import fasteners from pypdf import PdfReader from sycamore.data import Element, BoundingBox, ImageElement, TableElement @@ -683,18 +682,11 @@ def __init__(self, model_name_or_path, device=None, cache: Optional[Cache] = Non self._model_name_or_path = model_name_or_path self.cache = cache - from sycamore.utils.pytorch_dir import get_pytorch_build_directory + from transformers import AutoImageProcessor + from sycamore.utils.model_load import load_deformable_detr - with fasteners.InterProcessLock(_DETR_LOCK_FILE): - lockfile = Path(get_pytorch_build_directory("MultiScaleDeformableAttention", False)) / "lock" - lockfile.unlink(missing_ok=True) - - from transformers import AutoImageProcessor, DeformableDetrForObjectDetection - - LogTime("loading_model", point=True) - with LogTime("load_model", log_start=True): - self.processor = AutoImageProcessor.from_pretrained(model_name_or_path) - self.model = DeformableDetrForObjectDetection.from_pretrained(model_name_or_path).to(self._get_device()) + self.processor = 
AutoImageProcessor.from_pretrained(model_name_or_path) + self.model = load_deformable_detr(model_name_or_path).to(self._get_device()) # Note: We wrap this in a function so that we can execute on both the leader and the workers # to account for heterogeneous systems. Currently, if you pass in an explicit device parameter diff --git a/lib/sycamore/sycamore/transforms/table_structure/extract.py b/lib/sycamore/sycamore/transforms/table_structure/extract.py index bde614ce7..b17f93280 100644 --- a/lib/sycamore/sycamore/transforms/table_structure/extract.py +++ b/lib/sycamore/sycamore/transforms/table_structure/extract.py @@ -205,9 +205,9 @@ def __init__(self, model: str, device=None): super().__init__(model, device) def _init_structure_model(self): - from transformers import DeformableDetrForObjectDetection + from sycamore.utils.model_load import load_deformable_detr - self.structure_model = DeformableDetrForObjectDetection.from_pretrained(self.model).to(self._get_device()) + self.structure_model = load_deformable_detr(self.model).to(self._get_device()) def extract( self, element: TableElement, doc_image: Image.Image, union_tokens=False, apply_thresholds=True diff --git a/lib/sycamore/sycamore/utils/model_load.py b/lib/sycamore/sycamore/utils/model_load.py new file mode 100644 index 000000000..b96295b52 --- /dev/null +++ b/lib/sycamore/sycamore/utils/model_load.py @@ -0,0 +1,33 @@ +from sycamore.utils.import_utils import requires_modules +from sycamore.utils.time_trace import LogTime +import fasteners +from pathlib import Path + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from transformers import DeformableDetrForObjectDetection + +_DETR_LOCK_FILE = f"{Path.home()}/.cache/Aryn-Detr.lock" + + +@requires_modules("transformers", "local_inference") +def load_deformable_detr(model_name_or_path) -> "DeformableDetrForObjectDetection": + """Load deformable detr without getting concurrency issues in + jit-ing the deformable attention kernel. 
+ + Refactored out of: + https://github.com/aryn-ai/sycamore/blob/7e6b62639ce9b8f63d56cb35a32837d1c97e711e/lib/sycamore/sycamore/transforms/detr_partitioner.py#L686 + """ + from sycamore.utils.pytorch_dir import get_pytorch_build_directory + + with fasteners.InterProcessLock(_DETR_LOCK_FILE): + lockfile = Path(get_pytorch_build_directory("MultiScaleDeformableAttention", False)) / "lock" + lockfile.unlink(missing_ok=True) + + from transformers import DeformableDetrForObjectDetection + + LogTime("loading_model", point=True) + with LogTime("loading_model", log_start=True): + model = DeformableDetrForObjectDetection.from_pretrained(model_name_or_path) + return model From 9b5a9e9efbe6068cd0c8fe40d1d9e6ba95b6d1ca Mon Sep 17 00:00:00 2001 From: Henry Lindeman Date: Wed, 4 Dec 2024 14:56:32 -0800 Subject: [PATCH 2/6] remove unused global variable Signed-off-by: Henry Lindeman --- lib/sycamore/sycamore/transforms/detr_partitioner.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/lib/sycamore/sycamore/transforms/detr_partitioner.py b/lib/sycamore/sycamore/transforms/detr_partitioner.py index c2db46b5c..6f20ec270 100644 --- a/lib/sycamore/sycamore/transforms/detr_partitioner.py +++ b/lib/sycamore/sycamore/transforms/detr_partitioner.py @@ -6,7 +6,6 @@ from abc import ABC, abstractmethod from collections.abc import Mapping from typing import Any, BinaryIO, Literal, Union, Optional -from pathlib import Path from itertools import repeat import requests @@ -33,7 +32,6 @@ from sycamore.transforms.text_extraction.pdf_miner import PdfMinerExtractor logger = logging.getLogger(__name__) -_DETR_LOCK_FILE = f"{Path.home()}/.cache/Aryn-Detr.lock" _VERSION = "0.2024.07.24" From ea27bcfb0cde3a6f6c0c37f9f0d36517fd52d335 Mon Sep 17 00:00:00 2001 From: Henry Lindeman Date: Wed, 4 Dec 2024 15:02:12 -0800 Subject: [PATCH 3/6] move .to(device) inside the lock Signed-off-by: Henry Lindeman --- lib/sycamore/sycamore/transforms/detr_partitioner.py | 2 +-
lib/sycamore/sycamore/transforms/table_structure/extract.py | 2 +- lib/sycamore/sycamore/utils/model_load.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/sycamore/sycamore/transforms/detr_partitioner.py b/lib/sycamore/sycamore/transforms/detr_partitioner.py index 6f20ec270..cc0cdc7c4 100644 --- a/lib/sycamore/sycamore/transforms/detr_partitioner.py +++ b/lib/sycamore/sycamore/transforms/detr_partitioner.py @@ -684,7 +684,7 @@ def __init__(self, model_name_or_path, device=None, cache: Optional[Cache] = Non from sycamore.utils.model_load import load_deformable_detr self.processor = AutoImageProcessor.from_pretrained(model_name_or_path) - self.model = load_deformable_detr(model_name_or_path).to(self._get_device()) + self.model = load_deformable_detr(model_name_or_path, self._get_device()) # Note: We wrap this in a function so that we can execute on both the leader and the workers # to account for heterogeneous systems. Currently, if you pass in an explicit device parameter diff --git a/lib/sycamore/sycamore/transforms/table_structure/extract.py b/lib/sycamore/sycamore/transforms/table_structure/extract.py index b17f93280..c7510748e 100644 --- a/lib/sycamore/sycamore/transforms/table_structure/extract.py +++ b/lib/sycamore/sycamore/transforms/table_structure/extract.py @@ -207,7 +207,7 @@ def __init__(self, model: str, device=None): def _init_structure_model(self): from sycamore.utils.model_load import load_deformable_detr - self.structure_model = load_deformable_detr(self.model).to(self._get_device()) + self.structure_model = load_deformable_detr(self.model, self._get_device()) def extract( self, element: TableElement, doc_image: Image.Image, union_tokens=False, apply_thresholds=True diff --git a/lib/sycamore/sycamore/utils/model_load.py b/lib/sycamore/sycamore/utils/model_load.py index b96295b52..e1d1ace1a 100644 --- a/lib/sycamore/sycamore/utils/model_load.py +++ b/lib/sycamore/sycamore/utils/model_load.py @@ -12,7 +12,7 @@ 
@requires_modules("transformers", "local_inference") -def load_deformable_detr(model_name_or_path) -> "DeformableDetrForObjectDetection": +def load_deformable_detr(model_name_or_path, device) -> "DeformableDetrForObjectDetection": """Load deformable detr without getting concurrency issues in jit-ing the deformable attention kernel. @@ -29,5 +29,5 @@ def load_deformable_detr(model_name_or_path) -> "DeformableDetrForObjectDetectio LogTime("loading_model", point=True) with LogTime("loading_model", log_start=True): - model = DeformableDetrForObjectDetection.from_pretrained(model_name_or_path) + model = DeformableDetrForObjectDetection.from_pretrained(model_name_or_path).to(device) return model From 0a8a2b1646f357845a03c97b72bbd7b4ffabefb0 Mon Sep 17 00:00:00 2001 From: Henry Lindeman Date: Wed, 4 Dec 2024 15:44:22 -0800 Subject: [PATCH 4/6] jitpick Signed-off-by: Henry Lindeman --- lib/sycamore/sycamore/utils/model_load.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/sycamore/sycamore/utils/model_load.py b/lib/sycamore/sycamore/utils/model_load.py index e1d1ace1a..e8d05861e 100644 --- a/lib/sycamore/sycamore/utils/model_load.py +++ b/lib/sycamore/sycamore/utils/model_load.py @@ -14,7 +14,7 @@ @requires_modules("transformers", "local_inference") def load_deformable_detr(model_name_or_path, device) -> "DeformableDetrForObjectDetection": """Load deformable detr without getting concurrency issues in - jit-ing the deformable attention kernel. + jitc-ing the deformable attention kernel. 
Refactored out of: https://github.com/aryn-ai/sycamore/blob/7e6b62639ce9b8f63d56cb35a32837d1c97e711e/lib/sycamore/sycamore/transforms/detr_partitioner.py#L686 From 4f49ede403660239fa9f4f16e863fd344badcaa9 Mon Sep 17 00:00:00 2001 From: Henry Lindeman Date: Wed, 4 Dec 2024 16:17:27 -0800 Subject: [PATCH 5/6] set deformable table extractor choose_device detr=True Signed-off-by: Henry Lindeman --- lib/sycamore/sycamore/transforms/table_structure/extract.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/sycamore/sycamore/transforms/table_structure/extract.py b/lib/sycamore/sycamore/transforms/table_structure/extract.py index c7510748e..926695b1e 100644 --- a/lib/sycamore/sycamore/transforms/table_structure/extract.py +++ b/lib/sycamore/sycamore/transforms/table_structure/extract.py @@ -209,6 +209,9 @@ def _init_structure_model(self): self.structure_model = load_deformable_detr(self.model, self._get_device()) + def _get_device(self) -> str: + return choose_device(self.device, detr=True) + def extract( self, element: TableElement, doc_image: Image.Image, union_tokens=False, apply_thresholds=True ) -> TableElement: From 7c3ad398411b8eb2a3e4bc6b4bb039c37091d211 Mon Sep 17 00:00:00 2001 From: Henry Lindeman Date: Tue, 17 Dec 2024 10:47:09 -0800 Subject: [PATCH 6/6] Misc table transformers post-processing (#1077) * misc postprocessing tweaks Signed-off-by: Henry Lindeman * typo Signed-off-by: Henry Lindeman --------- Signed-off-by: Henry Lindeman --- .../table_structure/table_transformers.py | 45 +++++++++++++++---- 1 file changed, 37 insertions(+), 8 deletions(-) diff --git a/lib/sycamore/sycamore/transforms/table_structure/table_transformers.py b/lib/sycamore/sycamore/transforms/table_structure/table_transformers.py index 71cd8da75..c271978bf 100644 --- a/lib/sycamore/sycamore/transforms/table_structure/table_transformers.py +++ b/lib/sycamore/sycamore/transforms/table_structure/table_transformers.py @@ -53,6 +53,23 @@ def apply_class_thresholds(bboxes, labels, 
scores, class_names, class_thresholds return bboxes, scores, labels +def apply_class_thresholds_or_take_best(bboxes, labels, scores, class_names, class_thresholds, epsilon=0.05): + """ + Filter out bounding boxes whose confidence is below the confidence threshold for its + associated class threshold, defining the threshold as whichever is lower between what + is written in the class_thresholds dict and the highest score for the class minus epsilon + """ + new_class_thresholds = {k: v for k, v in class_thresholds.items()} + max_row_score = max(sc for (sc, lbl) in zip(scores, labels) if class_names[lbl] == "table row") + max_col_score = max(sc for (sc, lbl) in zip(scores, labels) if class_names[lbl] == "table column") + if max_row_score - epsilon < class_thresholds["table row"]: + new_class_thresholds["table row"] = max_row_score - epsilon + if max_col_score - epsilon < class_thresholds["table column"]: + new_class_thresholds["table column"] = max_col_score - epsilon + new_class_thresholds["table"] = 0.0 + return apply_class_thresholds(bboxes, labels, scores, class_names, new_class_thresholds) + + def iob(coords1, coords2) -> float: return BoundingBox(*coords1).iob(BoundingBox(*coords2)) @@ -83,7 +100,7 @@ def outputs_to_objects(outputs, img_size, id2label, apply_thresholds: bool = Fal pred_bboxes = [elem.tolist() for elem in rescale_bboxes(pred_bboxes, img_size)] if apply_thresholds: - pred_bboxes, pred_scores, pred_labels = apply_class_thresholds( + pred_bboxes, pred_scores, pred_labels = apply_class_thresholds_or_take_best( pred_bboxes, pred_labels, pred_scores, id2label, DEFAULT_STRUCTURE_CLASS_THRESHOLDS ) @@ -277,20 +294,32 @@ def slot_into_containers( # If the container starts after the package ends, break if not _early_exit_vertical and container["bbox"][0] > package["bbox"][2]: if len(match_scores) == 0: - match_scores.append({"container": container, "container_num": container_num, "score": 0}) + match_scores.append( + {"container": container, 
"container_num": container_num, "score": 0, "score_2": 0} + ) break elif _early_exit_vertical and container["bbox"][1] > package["bbox"][3]: if len(match_scores) == 0: - match_scores.append({"container": container, "container_num": container_num, "score": 0}) + match_scores.append( + {"container": container, "container_num": container_num, "score": 0, "score_2": 0} + ) break container_rect = BoundingBox(*container["bbox"]) intersect_area = container_rect.intersect(package_rect).area overlap_fraction = intersect_area / package_area - match_scores.append({"container": container, "container_num": container_num, "score": overlap_fraction}) + opposite_overlap_fraction = intersect_area / (container_rect.area or 1) + match_scores.append( + { + "container": container, + "container_num": container_num, + "score": overlap_fraction, + "score_2": opposite_overlap_fraction, + } + ) # Don't sort if you don't have to if unique_assignment: - sorted_match_scores = [max(match_scores, key=lambda x: x["score"])] + sorted_match_scores = [max(match_scores, key=lambda x: (x["score"], x["score_2"]))] else: sorted_match_scores = sort_objects_by_score(match_scores) @@ -320,7 +349,7 @@ def sort_objects_by_score(objects, reverse=True): sign = -1 else: sign = 1 - return sorted(objects, key=lambda k: sign * k["score"]) + return sorted(objects, key=lambda k: (sign * k["score"], sign * k.get("score_2", 0))) def remove_objects_without_content(page_spans, objects): @@ -911,10 +940,10 @@ def objects_to_structures(objects, tokens, class_thresholds): if len(tables) == 0: return {} if len(tables) > 1: - tables.sort(key=lambda x: x["score"], reverse=True) + tables.sort(key=lambda x: BoundingBox(*x["bbox"]).area, reverse=True) import logging - logging.warning("Got multiple tables in document. Using only the highest-scoring one") + logging.warning("Got multiple tables in document. Using only the biggest one") table = tables[0] structure = {}