From 3772ef5f08ff8222b6d59223ff7540726210afb2 Mon Sep 17 00:00:00 2001 From: Nolan Tremelling <34580718+NolanTrem@users.noreply.github.com> Date: Thu, 20 Feb 2025 17:53:02 -0800 Subject: [PATCH] Fix bug on unstructured fallback parsing (#1995) * Fix bug on unstructured fallback parsing * Bump release --- .../providers/ingestion/unstructured/base.py | 50 ++++++++++++------- py/pyproject.toml | 2 +- 2 files changed, 32 insertions(+), 20 deletions(-) diff --git a/py/core/providers/ingestion/unstructured/base.py b/py/core/providers/ingestion/unstructured/base.py index 20fb129d1..6dc59a337 100644 --- a/py/core/providers/ingestion/unstructured/base.py +++ b/py/core/providers/ingestion/unstructured/base.py @@ -203,35 +203,47 @@ async def parse_fallback( ingestion_config: dict, parser_name: str, ) -> AsyncGenerator[FallbackElement, None]: - context = "" - async for text in self.parsers[parser_name].ingest( + contents = [] + async for chunk in self.parsers[parser_name].ingest( file_content, **ingestion_config ): # type: ignore - if text is not None: - context += text + "\n\n" - logging.info(f"Fallback ingestion with config = {ingestion_config}") + if isinstance(chunk, dict) and chunk.get("content"): + contents.append(chunk) + elif chunk: # Handle string output for backward compatibility + contents.append({"content": chunk}) - if not context.strip(): + if not contents: logging.warning( "No valid text content was extracted during parsing" ) return - loop = asyncio.get_event_loop() - splitter = RecursiveCharacterTextSplitter( - chunk_size=ingestion_config["new_after_n_chars"], - chunk_overlap=ingestion_config["overlap"], - ) - chunks = await loop.run_in_executor( - None, splitter.create_documents, [context] - ) + logging.info(f"Fallback ingestion with config = {ingestion_config}") + + iteration = 0 + for content_item in contents: + text = content_item["content"] - for chunk_id, text_chunk in enumerate(chunks): - yield FallbackElement( - text=text_chunk.page_content, - metadata={"chunk_id": chunk_id}, + loop = asyncio.get_event_loop() + splitter = RecursiveCharacterTextSplitter( + chunk_size=ingestion_config["new_after_n_chars"], + chunk_overlap=ingestion_config["overlap"], ) - await asyncio.sleep(0) + chunks = await loop.run_in_executor( + None, splitter.create_documents, [text] + ) + + for text_chunk in chunks: + metadata = {"chunk_id": iteration} + if "page_number" in content_item: + metadata["page_number"] = content_item["page_number"] + + yield FallbackElement( + text=text_chunk.page_content, + metadata=metadata, + ) + iteration += 1 + await asyncio.sleep(0) async def parse( self, diff --git a/py/pyproject.toml b/py/pyproject.toml index 5e662861c..bc087faab 100644 --- a/py/pyproject.toml +++ b/py/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "r2r" -version = "3.4.1" +version = "3.4.2" description = "SciPhi R2R" readme = "README.md" license = {text = "MIT"}