Skip to content

Add LazyObject to lazy load pdf to image conversion #321

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 66 additions & 1 deletion textractor/textractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import boto3
import logging
import uuid
from functools import partial
from PIL import Image
from copy import deepcopy
from typing import List, Union
Expand All @@ -36,7 +37,7 @@
from textractcaller.t_call import Textract_Call_Mode, Textract_API, get_full_json

try:
from pdf2image import convert_from_bytes, convert_from_path
from pdf2image import convert_from_bytes, convert_from_path, pdfinfo_from_bytes, pdfinfo_from_path

IS_PDF2IMAGE_INSTALLED = True
except ImportError:
Expand All @@ -51,6 +52,7 @@
from textractor.entities.lazy_document import LazyDocument
from textractor.parsers import response_parser
from textractor.utils.s3_utils import upload_to_s3, s3_path_to_bucket_and_prefix
from textractor.utils.lazy_object import LazyObject
from textractor.exceptions import (
InputError,
RegionMismatchError,
Expand Down Expand Up @@ -150,6 +152,69 @@ def _get_document_images_from_path(self, filepath: str) -> List[Image.Image]:

return images

def _get_document_lazy_images_from_path(self, filepath: str) -> List[Union[LazyObject, Image.Image]]:
    """
    Same as :code:`_get_document_images_from_path` but returns LazyObjects instead to defer the evaluation.

    For a PDF, one :code:`LazyObject` is created per page; each one renders only its own page the first
    time it is used. Any other file is opened with :code:`Image.open` immediately and returned as-is.

    :param filepath: filepath to the document stored locally or on an S3 bucket.
    :type filepath: str, required
    :return: Returns a list of LazyObject or possibly images if the filepath is an image
    :rtype: List[Union[LazyObject, Image.Image]]
    :raises MissingDependencyException: if the document is a PDF and pdf2image is not installed.
    :raises UnhandledCaseException: if no image could be obtained from the document.
    """

    def render_single_page(convert, source, page_number):
        # pdf2image converters return a list even for a one-page range; unwrap it so the
        # LazyObject stands in for the page image itself rather than for a 1-element list.
        return convert(source, first_page=page_number, last_page=page_number)[0]

    def pdf_pages(source, from_bytes):
        if not IS_PDF2IMAGE_INSTALLED:
            raise MissingDependencyException(
                "pdf2image is not installed. If you do not plan on using visualizations you can skip image generation using save_image=False in your function call."
            )
        # The pdf2image names are only looked up after the check above, since they may be
        # undefined when the optional import failed.
        if from_bytes:
            convert, read_info = convert_from_bytes, pdfinfo_from_bytes
        else:
            convert, read_info = convert_from_path, pdfinfo_from_path

        pdf_info = read_info(source)
        if "Pages" in pdf_info:
            # pdf2image page numbers are 1-based.
            return [
                LazyObject(partial(render_single_page, convert, source, page_number))
                for page_number in range(1, pdf_info["Pages"] + 1)
            ]
        # pdfinfo failed, let's try to get the pages directly
        return convert(source)

    is_pdf = filepath.lower().endswith(".pdf")

    if "s3://" in filepath:
        edit_filepath = filepath.replace("s3://", "")
        bucket = edit_filepath.split("/")[0]
        key = edit_filepath[edit_filepath.index("/") + 1 :]

        s3_client = (
            boto3.session.Session(profile_name=self.profile_name).client("s3")
            if self.profile_name is not None
            else boto3.session.Session(region_name=self.region_name).client("s3")
        )
        # Copy the downloaded payload once and share that single buffer between every lazy
        # page, instead of duplicating the whole document for each page.
        file_bytes = bytearray(s3_client.get_object(Bucket=bucket, Key=key).get("Body").read())
        if is_pdf:
            images = pdf_pages(file_bytes, from_bytes=True)
        else:
            images = [Image.open(io.BytesIO(file_bytes))]
    elif is_pdf:
        images = pdf_pages(filepath, from_bytes=False)
    else:
        images = [Image.open(filepath)]

    if not images:
        raise UnhandledCaseException(f"Could not get any images from {filepath}")

    return images

def detect_document_text(
self, file_source, s3_output_path: str = "", save_image: bool = True
) -> Document:
Expand Down
25 changes: 25 additions & 0 deletions textractor/utils/lazy_object.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from typing import Callable

class LazyObject:
    """
    Proxy that defers the creation of an object until one of its attributes is first used.

    This is useful to handle cases where the consumer will only touch a handful of objects
    (such as images) and we want to preserve the appearance of swift processing.

    Attribute reads and writes are forwarded to the wrapped object, which is built at most once.
    Double-underscore names are not forwarded, so protocols such as copy and pickle treat the proxy
    as a plain object and do not force the wrapped object to be created. Operators and built-ins
    that look special methods up on the type (``len()``, indexing, ...) are not forwarded either.
    """

    # Names stored on the proxy itself; they are never forwarded to the wrapped object.
    _OWN_ATTRS = frozenset({"get_obj_func", "obj", "_loaded"})

    def __init__(self, get_obj_func: Callable):
        """
        Store the factory without calling it.

        :param get_obj_func: function to call to get the object, should be pre-parametrized
        :type get_obj_func: Callable
        """
        # object.__setattr__ bypasses the forwarding __setattr__ below; going through it here
        # would try to build the wrapped object before the proxy is even initialised.
        object.__setattr__(self, "get_obj_func", get_obj_func)
        object.__setattr__(self, "obj", None)
        # Explicit flag rather than a truthiness test on obj, so that a falsy result
        # (None, empty container, ...) is still created only once.
        object.__setattr__(self, "_loaded", False)

    def _resolve(self):
        """
        Return the wrapped object, calling the factory on first use.

        :return: the object produced by :code:`get_obj_func`
        """
        if not self._loaded:
            object.__setattr__(self, "obj", self.get_obj_func())
            object.__setattr__(self, "_loaded", True)
        return self.obj

    def __getattr__(self, name: str):
        """
        Forward an attribute read to the wrapped object.

        Python only calls this when normal lookup on the proxy fails.

        :param name: attribute name
        :type name: str
        :return: the attribute of the wrapped object
        :raises AttributeError: for double-underscore names, for the proxy's own storage names
            when they are missing, or when the wrapped object has no such attribute
        """
        # Reaching this point for one of the proxy's own names means __init__ has not run
        # (e.g. while being copied or unpickled); fail instead of recursing through _resolve.
        if name in LazyObject._OWN_ATTRS or (name.startswith("__") and name.endswith("__")):
            raise AttributeError(name)
        return getattr(self._resolve(), name)

    def __setattr__(self, name: str, value) -> None:
        """
        Forward an attribute write to the wrapped object, creating it first if needed.

        :param name: attribute name
        :type name: str
        :param value: value to assign on the wrapped object
        """
        setattr(self._resolve(), name, value)