From 78f378b34d5a65e4799bf284cdac13f7764b5c8e Mon Sep 17 00:00:00 2001 From: jlonge4 <91354480+jlonge4@users.noreply.github.com> Date: Wed, 16 Oct 2024 06:38:49 -0400 Subject: [PATCH] implement additional mime types (#8446) * implement additional mime types * correct typo * reduce complexity * add optional * add missing release note * yamllint * yamllint * Update file-router-additional-mime-types-47fe57e6816b83da.yaml minor reno change for consistency --------- Co-authored-by: Vladimir Blagojevic --- .../components/routers/file_type_router.py | 10 ++++++++- ...dditional-mime-types-47fe57e6816b83da.yaml | 22 +++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) create mode 100644 releasenotes/notes/file-router-additional-mime-types-47fe57e6816b83da.yaml diff --git a/haystack/components/routers/file_type_router.py b/haystack/components/routers/file_type_router.py index 43f61840c0..df3935cf0c 100644 --- a/haystack/components/routers/file_type_router.py +++ b/haystack/components/routers/file_type_router.py @@ -54,16 +54,24 @@ class FileTypeRouter: :param mime_types: A list of MIME types or regex patterns to classify the input files or byte streams. """ - def __init__(self, mime_types: List[str]): + def __init__(self, mime_types: List[str], additional_mimetypes: Optional[Dict[str, str]] = None): """ Initialize the FileTypeRouter component. :param mime_types: A list of MIME types or regex patterns to classify the input files or byte streams. (for example: `["text/plain", "audio/x-wav", "image/jpeg"]`). + + :param additional_mimetypes: A dictionary mapping MIME types to file extensions, registered with the + `mimetypes` module so that file types it does not know natively are not left unclassified. + (for example: `{"application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx"}`). 
""" if not mime_types: raise ValueError("The list of mime types cannot be empty.") + if additional_mimetypes: + for mime, ext in additional_mimetypes.items(): + mimetypes.add_type(mime, ext) + self.mime_type_patterns = [] for mime_type in mime_types: if not self._is_valid_mime_type_format(mime_type): diff --git a/releasenotes/notes/file-router-additional-mime-types-47fe57e6816b83da.yaml b/releasenotes/notes/file-router-additional-mime-types-47fe57e6816b83da.yaml new file mode 100644 index 0000000000..5d5b8a9d99 --- /dev/null +++ b/releasenotes/notes/file-router-additional-mime-types-47fe57e6816b83da.yaml @@ -0,0 +1,22 @@ +--- +features: + - | + Added a new parameter `additional_mimetypes` to the FileTypeRouter + component. + + This allows users to specify additional MIME type mappings, ensuring + correct + + file classification across different runtime environments and Python + versions. +enhancements: + - | + Improved file type detection in FileTypeRouter, particularly for Microsoft + Office file formats like .docx and .pptx. This enhancement ensures more + consistent behavior across different environments, including AWS Lambda + functions and systems without pre-installed office suites. +fixes: + - | + Addressed an issue where certain file types (e.g., .docx, .pptx) were + incorrectly classified as 'unclassified' in environments with limited + MIME type definitions, such as AWS Lambda functions.