Skip to content

Commit

Permalink
Add back MSG parsing (#1993)
Browse files Browse the repository at this point in the history
* Add back MSG parsing

* Fix init

* Fix msg parsing
  • Loading branch information
NolanTrem authored Feb 20, 2025
1 parent cd3f750 commit ad001fa
Show file tree
Hide file tree
Showing 8 changed files with 74 additions and 62 deletions.
2 changes: 1 addition & 1 deletion py/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@
"EMLParser",
"EPUBParser",
"JSONParser",
# "MSGParser",
"MSGParser",
"ORGParser",
"P7SParser",
"RSTParser",
Expand Down
2 changes: 1 addition & 1 deletion py/core/parsers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
"EMLParser",
"EPUBParser",
"JSONParser",
# "MSGParser",
"MSGParser",
"ORGParser",
"P7SParser",
"RSTParser",
Expand Down
5 changes: 2 additions & 3 deletions py/core/parsers/structured/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@
from .eml_parser import EMLParser
from .epub_parser import EPUBParser
from .json_parser import JSONParser

# from .msg_parser import MSGParser
from .msg_parser import MSGParser
from .org_parser import ORGParser
from .p7s_parser import P7SParser
from .rst_parser import RSTParser
Expand All @@ -19,7 +18,7 @@
"EMLParser",
"EPUBParser",
"JSONParser",
# "MSGParser",
"MSGParser",
"ORGParser",
"P7SParser",
"RSTParser",
Expand Down
108 changes: 53 additions & 55 deletions py/core/parsers/structured/msg_parser.py
Original file line number Diff line number Diff line change
@@ -1,67 +1,65 @@
# # type: ignore
# from typing import AsyncGenerator
# type: ignore
import os
import tempfile
from typing import AsyncGenerator

# import extract_msg
from msg_parser import MsOxMessage

# from core.base.parsers.base_parser import AsyncParser
# from core.base.providers import (
# CompletionProvider,
# DatabaseProvider,
# IngestionConfig,
# )
from core.base.parsers.base_parser import AsyncParser
from core.base.providers import (
CompletionProvider,
DatabaseProvider,
IngestionConfig,
)

# class MSGParser(AsyncParser[str | bytes]):
# """Parser for MSG (Outlook Message) files."""

# def __init__(
# self,
# config: IngestionConfig,
# database_provider: DatabaseProvider,
# llm_provider: CompletionProvider,
# ):
# self.database_provider = database_provider
# self.llm_provider = llm_provider
# self.config = config
# self.extract_msg = extract_msg
class MSGParser(AsyncParser[str | bytes]):
    """Parser for MSG (Outlook Message) files using msg_parser."""

    def __init__(
        self,
        config: IngestionConfig,
        database_provider: DatabaseProvider,
        llm_provider: CompletionProvider,
    ):
        """Store the ingestion config and the two providers on the instance."""
        self.database_provider = database_provider
        self.llm_provider = llm_provider
        self.config = config

    async def ingest(
        self, data: str | bytes, **kwargs
    ) -> AsyncGenerator[str, None]:
        """Ingest MSG data and yield email content.

        Yields, in order: one chunk with the header lines that are present
        (Subject / From / To / Date, newline-joined), the stripped body if
        any, and one chunk per attachment that has a filename.

        Args:
            data: Raw bytes of the .msg file. A ``str`` is rejected.
            **kwargs: Accepted for interface compatibility; not used here.

        Raises:
            ValueError: If ``data`` is a ``str``, or if writing, parsing or
                reading the message fails (the original error is chained).
        """
        if isinstance(data, str):
            raise ValueError("MSG data must be in bytes format.")

        # Everything is extracted (and the temp file removed) before the
        # first yield, so a consumer that stops iterating early cannot leave
        # the temporary file behind on disk.
        for chunk in self._extract_chunks(data):
            yield chunk

    @staticmethod
    def _extract_chunks(data: bytes) -> list[str]:
        """Parse ``data`` through a temporary .msg file and return the chunks.

        MsOxMessage is given a file path, so the bytes are spilled to a
        named temporary file that is always removed before returning.
        """
        tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".msg")
        try:
            # The context manager closes the handle even when write() fails,
            # so the os.remove() below never runs against an open file.
            with tmp_file:
                tmp_file.write(data)

            msg = MsOxMessage(tmp_file.name)

            chunks: list[str] = []
            metadata = []

            if msg.subject:
                metadata.append(f"Subject: {msg.subject}")
            if msg.sender:
                metadata.append(f"From: {msg.sender}")
            if msg.to:
                recipients = msg.to
                # NOTE(review): guard against `to` being a plain string;
                # joining a str would put ", " between every character.
                if not isinstance(recipients, str):
                    recipients = ", ".join(str(r) for r in recipients)
                metadata.append(f"To: {recipients}")
            if msg.sent_date:
                metadata.append(f"Date: {msg.sent_date}")

            if metadata:
                chunks.append("\n".join(metadata))
            if msg.body:
                chunks.append(msg.body.strip())

            chunks.extend(
                f"\nAttachment: {attachment.Filename}"
                for attachment in msg.attachments
                if attachment.Filename
            )
            return chunks
        except Exception as e:
            raise ValueError(f"Error processing MSG file: {str(e)}") from e
        finally:
            os.remove(tmp_file.name)
2 changes: 1 addition & 1 deletion py/core/providers/ingestion/r2r/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ class R2RIngestionProvider(IngestionProvider):
DocumentType.HTM: parsers.HTMLParser,
DocumentType.ODT: parsers.ODTParser,
DocumentType.JSON: parsers.JSONParser,
# DocumentType.MSG: parsers.MSGParser,
DocumentType.MSG: parsers.MSGParser,
DocumentType.ORG: parsers.ORGParser,
DocumentType.MD: parsers.MDParser,
DocumentType.PDF: parsers.BasicPDFParser,
Expand Down
1 change: 1 addition & 0 deletions py/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ core = [
"hatchet-sdk ==0.47.0",
"litellm >=1.58.2,<2.0.0",
"markdown >=3.6,<4.0",
"msg-parser>=1.2.0",
"networkx >=3.3,<4.0",
"numpy >=1.22.4,<1.29.0",
"olefile >=0.47,<0.48",
Expand Down
2 changes: 1 addition & 1 deletion py/tests/integration/test_ingestion.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ def client(config):
("jpeg", "core/examples/supported_file_types/jpeg.jpeg"),
("jpg", "core/examples/supported_file_types/jpg.jpg"),
("md", "core/examples/supported_file_types/md.md"),
# ("msg", "core/examples/supported_file_types/msg.msg"),
("msg", "core/examples/supported_file_types/msg.msg"),
("odt", "core/examples/supported_file_types/odt.odt"),
("org", "core/examples/supported_file_types/org.org"),
("p7s", "core/examples/supported_file_types/p7s.p7s"),
Expand Down
14 changes: 14 additions & 0 deletions py/uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit ad001fa

Please sign in to comment.