textractor/textractor.py

"""
:class:`Textractor` is the main class associated with this package. It needs to be instantiated before using any of the functionalities
the package provides. The main use of this class is to make calls to the Textract API and create Python objects for all the
document entities that are returned in the JSON output of the API. The response received is implicitly parsed and a :class:`Document` type 
object is returned containing all the document entities, their associated relationships and metadata.

The Textract API and Textractor method mapping is as below. Use these wrappers to make calls and parse the responses
in one step.

* (SYNC) DetectDocumentText : detect_document_text
* (SYNC) AnalyzeDocument : analyze_document
* (SYNC) AnalyzeID : analyze_id
* (SYNC) AnalyzeExpense : analyze_expense
* (ASYNC) StartDocumentTextDetection : start_document_text_detection
* (ASYNC) StartDocumentAnalysis : start_document_analysis
* (ASYNC) StartExpenseAnalysis : start_expense_analysis

"""

import io
import os
import boto3
import logging
import uuid
from functools import partial
from PIL import Image
from copy import deepcopy
from typing import List, Union
from textractcaller import (
    call_textract,
    call_textract_analyzeid,
    call_textract_expense,
    OutputConfig,
    Query,
    QueriesConfig,
)
from textractcaller.t_call import Textract_Call_Mode, Textract_API, get_full_json

try:
    from pdf2image import convert_from_bytes, convert_from_path, pdfinfo_from_bytes, pdfinfo_from_path

    IS_PDF2IMAGE_INSTALLED = True
except ImportError:
    IS_PDF2IMAGE_INSTALLED = False
    logging.info("pdf2image is not installed, client-side PDF rasterizing is disabled")

from textractor.data.constants import (
    TextractAPI,
    TextractFeatures,
)
from textractor.entities.document import Document
from textractor.entities.lazy_document import LazyDocument
from textractor.parsers import response_parser
from textractor.utils.s3_utils import upload_to_s3, s3_path_to_bucket_and_prefix
from textractor.utils.lazy_object import LazyObject
from textractor.exceptions import (
    InputError,
    RegionMismatchError,
    IncorrectMethodException,
    MissingDependencyException,
    UnhandledCaseException,
)


class Textractor:
    """
    Initializes the customer credentials needed to make calls to Textract using boto3 package internally.

    :param profile_name: Customer's profile name as set in the ~/.aws/config file. This profile typically contains this format.
                                :code:`[default]
                                region = us-west-2
                                output=json`
    :type profile_name: str
    :param region_name: If AWSCLI isn't setup, the user can pass region to let boto3 pick up credentials from the system.
    :param region_name: str
    :type profile_name: str, optional
    :param kms_key_id: Customer's AWS KMS key (cryptographic key)
    :type kms_key_id: str, optional
    """

    def __init__(
        self,
        profile_name: str = None,
        region_name: str = None,
        kms_key_id: str = "",
    ):
        self.profile_name = profile_name
        self.region_name = region_name
        self.kms_key_id = kms_key_id

        if self.profile_name is not None:
            self.session = boto3.session.Session(profile_name=self.profile_name)
        elif self.region_name is not None:
            self.session = boto3.session.Session(region_name=self.region_name)
        else:
            raise InputError(
                "Unable to initiate Textractor. Either profile_name or region requires an input parameter."
            )
        if self.region_name is not None:
            self.textract_client = self.session.client(
                "textract", region_name=self.region_name
            )
        else:
            self.textract_client = self.session.client("textract")
        self.s3_client = self.session.client("s3")

    def _get_document_images_from_path(self, filepath: str) -> List[Image.Image]:
        """
        Converts the every page in the document to an image. It supports pdfs and image formats that can be opened by
        PIL package. Documents can be stored in the local computer or on an S3 Bucket.

        :param filepath: filepath to the document stored locally or on an S3 bucket.
        :type filepath: str, required
        :return: Returns a list of PIL Images, one for each page of the document
        :rtype: List[Image]
        """
        images = []
        if "s3://" in filepath:
            edit_filepath = filepath.replace("s3://", "")
            bucket = edit_filepath.split("/")[0]
            key = edit_filepath[edit_filepath.index("/") + 1 :]

            s3_client = (
                boto3.session.Session(profile_name=self.profile_name).client("s3")
                if self.profile_name is not None
                else boto3.session.Session(region_name=self.region_name).client("s3")
            )
            file_obj = s3_client.get_object(Bucket=bucket, Key=key).get("Body").read()
            if filepath.lower().endswith(".pdf"):
                if IS_PDF2IMAGE_INSTALLED:
                    images = convert_from_bytes(bytearray(file_obj))
                else:
                    raise MissingDependencyException(
                        "pdf2image is not installed. If you do not plan on using visualizations you can skip image generation using save_image=False in your function call."
                    )
            else:
                images = [Image.open(io.BytesIO(bytearray(file_obj)))]

        else:
            if filepath.lower().endswith(".pdf"):
                if IS_PDF2IMAGE_INSTALLED:
                    images = convert_from_path(filepath)
                else:
                    raise MissingDependencyException(
                        "pdf2image is not installed. If you do not plan on using visualizations you can skip image generation using save_image=False in your function call."
                    )
            else:
                images = [Image.open(filepath)]

        if not images:
            raise UnhandledCaseException(f"Could not get any images from {filepath}")

        return images

    def _get_document_lazy_images_from_path(self, filepath: str) -> List[Union[LazyObject, Image.Image]]:
        """
        Same as :code:`_get_document_images_from_path` but returns LazyObjects instead to defer the evaluation.

        :param filepath: filepath to the document stored locally or on an S3 bucket.
        :type filepath: str, required
        :return: Returns a list of LazyObject or possibly images if the filepath is an image
        :rtype: List[Union[LazyObject, Image.Image]]
        """
        images = []
        if "s3://" in filepath:
            edit_filepath = filepath.replace("s3://", "")
            bucket = edit_filepath.split("/")[0]
            key = edit_filepath[edit_filepath.index("/") + 1 :]

            s3_client = (
                boto3.session.Session(profile_name=self.profile_name).client("s3")
                if self.profile_name is not None
                else boto3.session.Session(region_name=self.region_name).client("s3")
            )
            file_obj = s3_client.get_object(Bucket=bucket, Key=key).get("Body").read()
            if filepath.lower().endswith(".pdf"):
                if IS_PDF2IMAGE_INSTALLED:
                    pdf_info = pdfinfo_from_bytes(bytearray(file_obj))
                    if "Pages" in pdf_info:
                        images = [
                            LazyObject(partial(convert_from_bytes, bytearray(file_obj), first_page=i+1, last_page=i+1))
                            for i in range(pdf_info["Pages"])
                        ]
                    # pdfinfo failed, let's try to get the pages directly
                    else:
                        images = convert_from_bytes(bytearray(file_obj))
                else:
                    raise MissingDependencyException(
                        "pdf2image is not installed. If you do not plan on using visualizations you can skip image generation using save_image=False in your function call."
                    )
            else:
                images = [Image.open(io.BytesIO(bytearray(file_obj)))]

        else:
            if filepath.lower().endswith(".pdf"):
                if IS_PDF2IMAGE_INSTALLED:
                    pdf_info = pdfinfo_from_path(filepath)
                    if "Pages" in pdf_info:
                        images = [
                            LazyObject(partial(convert_from_path, filepath, first_page=i+1, last_page=i+1))
                            for i in range(pdf_info["Pages"])
                        ]
                    # pdfinfo failed, let's try to get the pages directly
                    else:
                        images = convert_from_path(filepath)
                else:
                    raise MissingDependencyException(
                        "pdf2image is not installed. If you do not plan on using visualizations you can skip image generation using save_image=False in your function call."
                    )
            else:
                images = [Image.open(filepath)]

        if not images:
            raise UnhandledCaseException(f"Could not get any images from {filepath}")

        return images 

    def detect_document_text(
        self, file_source, s3_output_path: str = "", save_image: bool = True
    ) -> Document:
        """
        Make a call to the SYNC DetectDocumentText API, implicitly parses the response and produces a :class:`Document` object.
        This function is ideal for single page PDFs or single images.

        :param file_source: Path to a file stored locally, on an S3 bucket or PIL Image
        :type file_source: str or PIL.Image, required
        :param s3_output_path: S3 path to store the output.
        :type s3_output_path: str, optional
        :param save_image: Flag to indicate if document images are to be stored within the Document object. This is optional
                            and necessary only if the customer wants to visualize bounding boxes for their document entities.
        :type save_image: bool

        :return: Returns a Document object containing all the entities, relationships and metadata extracted by the Textract
                 DetectDocumentText API stored within it.
        :rtype: Document
        """

        if isinstance(file_source, list) and len(file_source) > 1:
            raise IncorrectMethodException(
                "List contains more than 1 image. Call start_document_text_detection instead."
            )

        elif isinstance(file_source, str):
            logging.debug("Filepath given.")
            images = self._get_document_images_from_path(file_source)
            if len(images) > 1:
                raise IncorrectMethodException(
                    "Input contains more than 1 page. Call start_document_text_detection instead."
                )
            file_source = _image_to_byte_array(images[0])

        elif isinstance(file_source, Image.Image):
            logging.debug("PIL Image given.")
            images = [file_source]
            file_source = _image_to_byte_array(file_source)

        elif isinstance(file_source, list) and isinstance(file_source[0], Image.Image):
            logging.debug("List of PIL Image given.")
            images = deepcopy(file_source)
            file_source = _image_to_byte_array(images[0])

        else:
            images = []
            raise InputError("Input file_source format not supported.")

        if not s3_output_path:
            output_config = None
        else:
            bucket, prefix = s3_path_to_bucket_and_prefix(s3_output_path)
            output_config = OutputConfig(s3_bucket=bucket, s3_prefix=prefix)

        try:
            response = call_textract(
                input_document=file_source,
                features=[],
                queries_config=None,  # not supported yet
                output_config=output_config,
                kms_key_id=self.kms_key_id,
                job_tag="",
                notification_channel=None,  # not supported yet
                client_request_token="",
                return_job_id=False,
                force_async_api=False,
                call_mode=Textract_Call_Mode.FORCE_SYNC,
                boto3_textract_client=self.textract_client,
                job_done_polling_interval=0,
            )
        except Exception as exception:
            if exception.__class__.__name__ == "InvalidS3ObjectException":
                raise RegionMismatchError(
                    "Region passed in the profile_name and S3 bucket do not match. Ensure the regions are the same."
                )
            raise exception

        document = response_parser.parse(response)
        document.response = response
        if save_image:
            for page in document.pages:
                page.image = images[document.pages.index(page)]
        return document

    def start_document_text_detection(
        self,
        file_source: Union[str, bytes, Image.Image],
        s3_output_path: str = "",
        s3_upload_path: str = "",
        client_request_token: str = "",
        job_tag: str = "",
        save_image: bool = True,
    ):
        """
        Make a call to the ASYNC StartDocumentTextDetection API.

        :param file_source: File bytes, path to a file stored locally or in an S3 bucket
        :type file_source: Union[str, bytes, Image.Image], required
        :param s3_output_path: Prefix to store the output on the S3 bucket (passed as param to Textractor).
        :type s3_output_path: str
        :param s3_upload_path: If given, will automatically upload the document to the given S3 prefix before calling Textract. Files are uploaded
                                    under a uuid. If not given the data is expected to be already in s3
        :type s3_upload_path: str, optional
        :param client_request_token: The idempotent token that's used to identify the start request. If you use the same. token
                                    with multiple StartDocumentTextDetection requests, the same. JobId is returned. Use ClientRequestToken
                                    to prevent the same. job from being accidentally started more than once.
        :type client_request_token: str, optional
        :param job_tag: An identifier that you specify that's included in the completion notification published to the Amazon SNS topic.
        :type job_tag: str, optional

        :return: Returns a job id which can be used to fetch the results
        :rtype: str
        """

        original_file_source = file_source

        if not isinstance(file_source, (str, bytes, Image.Image)):
            raise InputError(
                f"file_source needs to be of type str, bytes or PIL Image, not {type(file_source)}"
            )

        # If the file is not already in S3
        if not isinstance(file_source, str) or not file_source.startswith("s3://"):
            # Check if the user has given us a bucket to upload to
            if not s3_upload_path:
                raise InputError(
                    "For files not in S3, an S3 upload path must be provided"
                )

            s3_file_path = os.path.join(s3_upload_path, str(uuid.uuid4()))
            upload_to_s3(self.s3_client, s3_file_path, file_source)
            file_source = s3_file_path

        output_config = None
        if s3_output_path:
            s3_bucket, s3_prefix = s3_path_to_bucket_and_prefix(s3_output_path)
            output_config = OutputConfig(s3_bucket=s3_bucket, s3_prefix=s3_prefix)

        try:
            response = call_textract(
                input_document=file_source,
                features=[],
                queries_config=None,  # not supported yet
                output_config=output_config,
                kms_key_id=self.kms_key_id,
                job_tag=job_tag,
                notification_channel=None,  # not supported yet
                client_request_token=client_request_token,
                return_job_id=True,
                force_async_api=True,
                call_mode=Textract_Call_Mode.FORCE_ASYNC,
                boto3_textract_client=self.textract_client,
                job_done_polling_interval=1,
            )
        except Exception as exception:
            if exception.__class__.__name__ == "InvalidS3ObjectException":
                raise RegionMismatchError(
                    "Region passed in the profile_name and S3 bucket do not match. Ensure the regions are the same."
                )
            raise exception

        images = None
        if save_image:
            if isinstance(original_file_source, Image.Image):
                images = [original_file_source]
            elif (
                isinstance(original_file_source, list)
                and len(original_file_source)
                and isinstance(original_file_source[0], Image.Image)
            ):
                images = original_file_source
            else:
                images = self._get_document_images_from_path(original_file_source)

        return LazyDocument(
            response["JobId"],
            TextractAPI.DETECT_TEXT,
            textract_client=self.textract_client,
            images=images,
        )

    def analyze_document(
        self,
        file_source,
        features,
        queries: Union[QueriesConfig, List[Query], List[str]] = None,
        s3_output_path: str = "",
        save_image: bool = True,
    ) -> Document:
        """
        Make a call to the SYNC AnalyzeDocument API, implicitly parses the response and produces a :class:`Document` object.
        This function is ideal for single page PDFs or single images.

        :param file_source: Path to a file stored locally, on an S3 bucket or PIL Image
        :type file_source: str or PIL.Image, required
        :param features: List of TextractFeatures to be extracted from the Document by the TextractAPI
        :type features: list, required
        :param queries: Queries to run on the document
        :type features: Union[QueriesConfig, List[Query], List[str]]
        :param s3_output_path: Prefix to store the output on the S3 bucket (passed as param to Textractor).
        :type s3_output_path: str, optional
        :param save_image: Flag to indicate if document images are to be stored within the Document object. This is optional
                            and necessary only if the customer wants to visualize bounding boxes for their document entities.
        :type save_image: bool

        :return: Returns a Document object containing all the entities, relationships and metadata extracted by the Textract
                 AnalyzeDocument API stored within it.
        :rtype: Document
        """
        if isinstance(file_source, list) and len(file_source) > 1:
            raise IncorrectMethodException(
                "List contains more than 1 image. Call start_document_analysis() instead."
            )

        elif isinstance(file_source, str):
            logging.debug("Filepath given.")
            images = self._get_document_images_from_path(file_source)
            if len(images) > 1:
                raise IncorrectMethodException(
                    "Input contains more than 1 page. Call start_document_analysis() instead."
                )
            file_source = _image_to_byte_array(images[0])

        elif isinstance(file_source, Image.Image):
            logging.debug("PIL Image given.")
            images = [file_source]
            file_source = _image_to_byte_array(file_source)

        elif isinstance(file_source, list) and isinstance(file_source[0], Image.Image):
            logging.debug("List of PIL Image given.")
            images = deepcopy(file_source)
            file_source = _image_to_byte_array(images[0])

        else:
            images = []
            raise InputError("Input file_source format not supported.")

        if not s3_output_path:
            output_config = None
        else:
            bucket, prefix = s3_path_to_bucket_and_prefix(s3_output_path)
            output_config = OutputConfig(s3_bucket=bucket, s3_prefix=prefix)

        if not isinstance(features, list):
            features = [features]

        if queries and TextractFeatures.QUERIES not in features:
            raise InputError(
                "Queries were given as a parameter but QUERIES is not enabled in the feature set"
            )

        if queries and not isinstance(queries, QueriesConfig):
            if not isinstance(queries, List):
                raise InputError(
                    f"Queries must be of type QueriesConfig, List[Query] or List[str], not {type(queries)}"
                )
            if isinstance(queries[0], Query):
                queries_config = QueriesConfig(queries)
                queries = queries_config
            elif isinstance(queries[0], str):
                queries_config = QueriesConfig([Query(query) for query in queries])
                queries = queries_config
            else:
                raise InputError(
                    f"Queries must be of type QueriesConfig, List[Query] or List[str], not {type(queries)}"
                )

        try:
            response = call_textract(
                input_document=file_source,
                features=features,
                queries_config=queries,  # not supported yet
                output_config=output_config,
                kms_key_id=self.kms_key_id,
                job_tag="",
                notification_channel=None,  # not supported yet
                client_request_token="",
                return_job_id=False,
                force_async_api=False,
                call_mode=Textract_Call_Mode.FORCE_SYNC,
                boto3_textract_client=self.textract_client,
                job_done_polling_interval=0,
            )
        except Exception as exception:
            if exception.__class__.__name__ == "InvalidS3ObjectException":
                raise RegionMismatchError(
                    "Region passed in the profile_name and S3 bucket do not match. Ensure the regions are the same."
                )
            raise exception

        document = response_parser.parse(response)
        document.response = response
        if save_image:
            for page in document.pages:
                page.image = images[document.pages.index(page)]
        return document

    def start_document_analysis(
        self,
        file_source: Union[str, bytes, Image.Image],
        features,
        s3_output_path: str = "",
        s3_upload_path: str = "",
        queries: Union[QueriesConfig, List[Query], List[str]] = None,
        client_request_token: str = "",
        job_tag: str = "",
        save_image: bool = True,
    ) -> LazyDocument:
        """
        Make a call to the ASYNC StartDocumentAnalysis API, implicitly parses the response and produces a :class:`Document` object.
        This function is ideal for multipage PDFs or an image.

        :param file_source: Path to a file stored locally, on an S3 bucket or a PIL Image
        :type file_source: Union[str, bytes, Image.Image], required
        :param features: List of TextractFeatures to be extracted from the Document by the TextractAPI
        :type features: list, required
        :param s3_output_path: Path to store the output on the S3 bucket (passed as param to Textractor).
        :type s3_output_path: str
        :param s3_upload_path: If given, will automatically upload the document to the given S3 prefix before calling Textract. Files are uploaded
                               under a uuid. If not given the data is expected to be already in s3
        :type s3_upload_path: str, optional
        :param client_request_token: The idempotent token that's used to identify the start request. If you use the same. token
                                    with multiple StartDocumentTextDetection requests, the same. JobId is returned. Use ClientRequestToken
                                    to prevent the same. job from being accidentally started more than once.
        :type client_request_token: str, optional
        :param job_tag: An identifier that you specify that's included in the completion notification published to the Amazon SNS topic.
        :type job_tag: str, optional
        :param save_image: Flag to indicate if document images are to be stored within the Document object. This is optional
                            and necessary only if the customer wants to visualize bounding boxes for their document entities.
        :type save_image: bool

        :return: Returns a Document object containing all the entities, relationships and metadata extracted by the Textract
                 StartDocumentAnalysis API stored within it.
        :rtype: Document
        """

        original_file_source = file_source

        if not isinstance(file_source, (str, bytes, Image.Image)):
            raise InputError(
                f"file_source needs to be of type str, bytes or PIL Image, not {type(file_source)}"
            )

        # If the file is not already in S3
        if not isinstance(file_source, str) or not file_source.startswith("s3://"):
            # Check if the user has given us a bucket to upload to
            if not s3_upload_path:
                raise InputError(
                    f"For files not in S3, an S3 upload path must be provided"
                )

            s3_file_path = os.path.join(s3_upload_path, str(uuid.uuid4()))
            upload_to_s3(self.s3_client, s3_file_path, file_source)
            file_source = s3_file_path

        output_config = None
        if s3_output_path:
            s3_bucket, s3_prefix = s3_path_to_bucket_and_prefix(s3_output_path)
            output_config = OutputConfig(s3_bucket=s3_bucket, s3_prefix=s3_prefix)

        if not isinstance(features, list):
            features = [features]

        if queries and TextractFeatures.QUERIES not in features:
            raise InputError(
                "Queries were given as a parameter but QUERIES is not enabled in the feature set"
            )

        if queries and not isinstance(queries, QueriesConfig):
            if not isinstance(queries, List):
                raise InputError(
                    f"Queries must be of type QueriesConfig, List[Query] or List[str], not {type(queries)}"
                )
            if isinstance(queries[0], Query):
                queries_config = QueriesConfig(queries)
                queries = queries_config
            elif isinstance(queries[0], str):
                queries_config = QueriesConfig([Query(query) for query in queries])
                queries = queries_config
            else:
                raise InputError(
                    f"Queries must be of type QueriesConfig, List[Query] or List[str], not {type(queries)}"
                )

        try:
            response = call_textract(
                input_document=file_source,
                features=features,
                queries_config=queries,  # not supported yet
                output_config=output_config,
                kms_key_id=self.kms_key_id,
                job_tag=job_tag,
                notification_channel=None,  # not supported yet
                client_request_token=client_request_token,
                return_job_id=True,
                force_async_api=True,
                call_mode=Textract_Call_Mode.FORCE_ASYNC,
                boto3_textract_client=self.textract_client,
                job_done_polling_interval=1,
            )
        except Exception as exception:
            if exception.__class__.__name__ == "InvalidS3ObjectException":
                raise RegionMismatchError(
                    "Region passed in the profile_name and S3 bucket do not match. Ensure the regions are the same."
                )
            raise exception

        images = None
        if save_image:
            if isinstance(original_file_source, Image.Image):
                images = [original_file_source]
            elif (
                isinstance(original_file_source, list)
                and len(original_file_source)
                and isinstance(original_file_source[0], Image.Image)
            ):
                images = original_file_source
            else:
                images = self._get_document_images_from_path(original_file_source)

        return LazyDocument(
            response["JobId"],
            TextractAPI.ANALYZE,
            textract_client=self.textract_client,
            images=images,
            output_config=output_config,
        )

    def analyze_id(
        self,
        file_source: Union[str, List[Image.Image], List[str]],
        save_image: bool = True,
    ) -> Document:
        """AnalyzeID parses identity documents such as passports and driver's license and
        returns the result as a dictionary of standardized fields. See https://docs.aws.amazon.com/textract/latest/dg/identitydocumentfields.html
        for a complete list.

        :param file_source: Path to a file stored locally, on an S3 bucket or list of PIL Images
        :type file_source: Union[str, List[Image.Image], List[str]]
        :param save_image: Saves the images in the returned Document object for visualizing the results, defaults to False
        :type save_image: bool, optional
        :raises InputError: Raised when the file_source could not be parsed
        :raises RegionMismatchError: Raised when the S3 object passed as file source is in a region that does not match the one used to create the Textractor object.
        :raises exception: Raised when the Textract call fails
        :return: Document
        :rtype: Document
        """
        if isinstance(file_source, str):
            logging.debug("Filepath given.")
            images = self._get_document_images_from_path(file_source)
        elif isinstance(file_source, Image.Image):
            logging.debug("PIL Image given.")
            images = [file_source]
        elif isinstance(file_source, list) and isinstance(file_source[0], Image.Image):
            logging.debug("List of PIL Image given.")
            # FIXME: Is this needed?
            images = deepcopy(file_source)
        else:
            images = []
            raise InputError("Input file_source format not supported.")

        images_bytes = [_image_to_byte_array(image) for image in images]

        try:
            response = call_textract_analyzeid(
                document_pages=images_bytes,
                boto3_textract_client=self.textract_client,
            )
        except Exception as exception:
            if exception.__class__.__name__ == "InvalidS3ObjectException":
                raise RegionMismatchError(
                    "Region passed in the profile_name and S3 bucket do not match. Ensure the regions are the same."
                )
            raise exception

        document = response_parser.parse(response)
        document.response = response
        if save_image:
            for page in document.pages:
                page.image = images[document.pages.index(page)]
        return document

    def analyze_expense(
        self,
        file_source: Union[str, List[Image.Image], List[str]],
        save_image: bool = True,
    ):
        """Make a call to the SYNC AnalyzeExpense API, implicitly parses the response and produces a :class:`Document` object.
        This function is ideal for multipage PDFs or list of images.

        :param file_source: Path to a file stored locally, on an S3 bucket or PIL Image
        :type file_source: Union[str, List[Image.Image], List[str]]
        :param save_image: Whether to keep the file source as PIL Images inside the returned Document object, defaults to False
        :type save_image: bool, optional
        :raises IncorrectMethodException: Raised when the file source type is incompatible with the Textract API being called
        :raises InputError: Raised when the file source type is invalid
        :raises RegionMismatchError: Raised when the file source region is different the API region.
        :raises exception: Raised if the Textract API call fails
        :return: Document
        :rtype: Document
        """
        if isinstance(file_source, list) and len(file_source) > 1:
            raise IncorrectMethodException(
                "List contains more than 1 image. Call start_expense_analysis instead."
            )

        elif isinstance(file_source, str):
            logging.debug("Filepath given.")
            images = self._get_document_images_from_path(file_source)
            if len(images) > 1:
                raise IncorrectMethodException(
                    "Input contains more than 1 page. Call start_expense_analysis instead."
                )
            file_source = _image_to_byte_array(images[0])

        elif isinstance(file_source, Image.Image):
            logging.debug("PIL Image given.")
            images = [file_source.copy()]
            file_source = _image_to_byte_array(file_source)

        elif isinstance(file_source, list) and isinstance(file_source[0], Image.Image):
            logging.debug("List of PIL Image given.")
            images = deepcopy(file_source)
            file_source = _image_to_byte_array(images[0])

        else:
            images = []
            raise InputError("Input file_source format not supported.")

        output_config = None

        try:
            response = call_textract_expense(
                input_document=file_source,
                output_config=output_config,
                kms_key_id=self.kms_key_id,
                job_tag="",
                notification_channel=None,  # not supported yet
                client_request_token="",
                return_job_id=False,
                force_async_api=False,
                boto3_textract_client=self.textract_client,
                job_done_polling_interval=0,
            )
        except Exception as exception:
            if exception.__class__.__name__ == "InvalidS3ObjectException":
                raise RegionMismatchError(
                    "Region passed in the profile_name and S3 bucket do not match. Ensure the regions are the same."
                )
            raise exception

        document = response_parser.parse(response)
        document.response = response
        if save_image:
            for page in document.pages:
                page.image = images[document.pages.index(page)]
        return document

    def start_expense_analysis(
        self,
        file_source: Union[str, bytes, Image.Image],
        s3_output_path: str = "",
        s3_upload_path: str = "",
        client_request_token: str = "",
        job_tag: str = "",
        save_image: bool = True,
    ) -> LazyDocument:
        """Make a call to the ASYNC StartExpenseAnalysis API, implicitly parses the response and produces a :class:`Document` object.
        This function is ideal for multipage PDFs or an image.

        :param file_source: Path to a file stored locally, on an S3 bucket or a PIL Image
        :type file_source: Union[str, bytes, Image.Image]
        :param s3_output_path: Path to store the output on the S3 bucket (passed as param to Textractor).
        :type s3_output_path: str
        :param s3_upload_path: If given, will automatically upload the document to the given S3 prefix before calling Textract. Files are uploaded
                               under a uuid. If not given the data is expected to be already in s3
        :type s3_upload_path: str, optional
        :param client_request_token: The idempotent token that's used to identify the start request. If you use the same. token
                                    with multiple StartDocumentTextDetection requests, the same. JobId is returned. Use ClientRequestToken
                                    to prevent the same. job from being accidentally started more than once.
        :type client_request_token: str, optional
        :param job_tag: An identifier that you specify that's included in the completion notification published to the Amazon SNS topic.
        :type job_tag: str, optional
        :param save_image: Flag to indicate if document images are to be stored within the Document object. This is optional
                            and necessary only if the customer wants to visualize bounding boxes for their document entities.
        :type save_image: bool
        :raises InputError: Raised when the file source type is invalid
        :raises RegionMismatchError: Raised when the file source region is different the API region.
        :raises exception: Raised if the Textract API call fails
        :return: Lazy-loaded Document object
        :rtype: LazyDocument
        """

        original_file_source = file_source

        if not isinstance(file_source, (str, bytes, Image.Image)):
            raise InputError(
                f"file_source needs to be of type str, bytes or PIL Image, not {type(file_source)}"
            )

        # If the file is not already in S3
        if not isinstance(file_source, str) or not file_source.startswith("s3://"):
            # Check if the user has given us a bucket to upload to
            if not s3_upload_path:
                raise InputError(
                    f"For files not in S3, an S3 upload path must be provided"
                )

            s3_file_path = os.path.join(s3_upload_path, str(uuid.uuid4()))
            upload_to_s3(self.s3_client, s3_file_path, file_source)
            file_source = s3_file_path

        output_config = None
        if s3_output_path:
            s3_bucket, s3_prefix = s3_path_to_bucket_and_prefix(s3_output_path)
            output_config = OutputConfig(s3_bucket=s3_bucket, s3_prefix=s3_prefix)

        try:
            response = call_textract_expense(
                input_document=file_source,
                output_config=output_config,
                kms_key_id=self.kms_key_id,
                job_tag=job_tag,
                notification_channel=None,  # not supported yet
                client_request_token=client_request_token,
                return_job_id=True,
                force_async_api=True,
                boto3_textract_client=self.textract_client,
                job_done_polling_interval=1,
            )
        except Exception as exception:
            if exception.__class__.__name__ == "InvalidS3ObjectException":
                raise RegionMismatchError(
                    "Region passed in the profile_name and S3 bucket do not match. Ensure the regions are the same."
                )
            raise exception

        images = None
        if save_image:
            if isinstance(original_file_source, Image.Image):
                images = [original_file_source]
            elif (
                isinstance(original_file_source, list)
                and len(original_file_source)
                and isinstance(original_file_source[0], Image.Image)
            ):
                images = original_file_source
            else:
                images = self._get_document_images_from_path(original_file_source)

        return LazyDocument(
            response["JobId"],
            TextractAPI.EXPENSE,
            textract_client=self.textract_client,
            images=images,
        )

    def get_result(
        self, job_id: str, api: Union[TextractAPI, Textract_API]
    ) -> Document:
        """
        Retrieves Textract API output for a given job id.
        :param job_id: Textract API JobID
        :type job_id: str, required
        :return: Returns a Document object
        :rtype: Document
        """

        response = get_full_json(
            job_id,
            TextractAPI.TextractAPI_to_Textract_API(api)
            if isinstance(api, TextractAPI)
            else api,
            boto3_textract_client=self.textract_client,
            job_done_polling_interval=1,
        )

        document = response_parser.parse(response)
        document.response = response

        return document


def _image_to_byte_array(image: Image) -> bytes:
    """
    Function to convert PIL.Image to bytearray for processing Document using Textract service.
    :param image: Image to be converted to bytearray
    :type image: PIL.Image, required
    :return: Returns a bytearray of the input image
    :rtype: bytes
    """
    img_byte_arr = io.BytesIO()
    image.convert("RGB").save(img_byte_arr, format="JPEG")
    img_byte_arr = img_byte_arr.getvalue()
    return img_byte_arr