1
1
from __future__ import annotations

import logging
import re
from datetime import date
from pathlib import Path
from typing import Dict, List, Literal, Optional, Tuple, Union

from markitdown import MarkItDown
@@ -13,10 +12,109 @@ class DocumentParser:
13
12
14
13
SUPPORTED_FORMATS = {'.pdf' , '.docx' , '.pptx' , '.xlsx' , '.html' , '.txt' , '.json' , '.xml' , '.zip' }
15
14
16
- def __init__ (self , use_ocr : bool = False , llm_client : Optional [object ] = None ):
15
+ def __init__ (self ,
16
+ use_ocr : bool = False ,
17
+ llm_client : Optional [object ] = None ,
18
+ chunk_size : int = 1000 ,
19
+ chunk_overlap : int = 200 ,
20
+ use_tokens : bool = True ):
21
+ """
22
+ Initialize the MarkItDown document parser.
23
+
24
+ Args:
25
+ use_ocr: Whether to use OCR for document parsing
26
+ llm_client: Optional LLM client for enhanced parsing
27
+ chunk_size: Maximum size of text chunks (in characters or tokens)
28
+ chunk_overlap: Number of characters or tokens to overlap between chunks
29
+ use_tokens: If True, measures length in tokens; if False, uses characters
30
+ """
17
31
self .parser = MarkItDown (llm_client = llm_client ) if llm_client else MarkItDown ()
18
32
self .use_ocr = use_ocr
19
33
self .logger = logging .getLogger (__name__ )
34
+ self .chunk_size = chunk_size
35
+ self .chunk_overlap = chunk_overlap
36
+ self .use_tokens = use_tokens
37
+
38
+ def split_text_with_overlap (self , text : str ) -> List [str ]:
39
+ """
40
+ Splits text into chunks based on paragraphs, respecting max length and overlap.
41
+
42
+ Args:
43
+ text: The input text to split
44
+
45
+ Returns:
46
+ List of text chunks
47
+ """
48
+ # Normalize newlines for consistent splitting
49
+ normalized_text = text .replace ('\r \n ' , '\n ' ).replace ('\n +' , '\n \n ' ).strip ()
50
+
51
+ # Split into paragraphs based on double newlines
52
+ paragraphs = normalized_text .split ('\n \n ' )
53
+
54
+ chunks = []
55
+ current_chunk = ''
56
+
57
+ # Helper function to measure length (characters or tokens)
58
+ def get_length (s : str ) -> int :
59
+ if self .use_tokens :
60
+ # Rough token count: split by whitespace and filter out empty strings
61
+ return len ([word for word in s .split () if word ])
62
+ return len (s ) # Character count
63
+
64
+ # Helper function to get the last N characters or tokens for overlap
65
+ def get_overlap_segment (s : str , size : int ) -> str :
66
+ if self .use_tokens :
67
+ words = [word for word in s .split () if word ]
68
+ overlap_words = words [- min (size , len (words )):]
69
+ return ' ' .join (overlap_words )
70
+ return s [- min (size , len (s )):]
71
+
72
+ for paragraph in paragraphs :
73
+ paragraph_length = get_length (paragraph )
74
+
75
+ # If paragraph fits in current chunk
76
+ if get_length (current_chunk ) + paragraph_length <= self .chunk_size :
77
+ current_chunk += ('\n \n ' if current_chunk else '' ) + paragraph
78
+ else :
79
+ # If current chunk isn't empty, append it and start a new one with overlap
80
+ if current_chunk :
81
+ chunks .append (current_chunk )
82
+ overlap_text = get_overlap_segment (current_chunk , self .chunk_overlap )
83
+ current_chunk = overlap_text + '\n \n ' + paragraph
84
+ else :
85
+ # If paragraph alone exceeds max_length, split it further
86
+ remaining = paragraph
87
+ while get_length (remaining ) > self .chunk_size :
88
+ if self .use_tokens :
89
+ # Find approximate token boundary
90
+ words = [word for word in remaining .split () if word ]
91
+ token_count = 0
92
+ char_count = 0
93
+ for i , word in enumerate (words ):
94
+ token_count += 1
95
+ char_count += len (word ) + (1 if i > 0 else 0 ) # Add space
96
+ if token_count >= self .chunk_size :
97
+ split_point = char_count
98
+ break
99
+ else :
100
+ split_point = len (remaining )
101
+ else :
102
+ # Find last space before max_length for clean character split
103
+ split_point = remaining .rfind (' ' , 0 , self .chunk_size )
104
+ if split_point == - 1 :
105
+ split_point = self .chunk_size
106
+
107
+ chunk = remaining [:split_point ].strip ()
108
+ chunks .append (chunk )
109
+ overlap_text = get_overlap_segment (chunk , self .chunk_overlap )
110
+ remaining = overlap_text + ' ' + remaining [split_point :].strip ()
111
+ current_chunk = remaining
112
+
113
+ # Append the final chunk if it exists
114
+ if current_chunk :
115
+ chunks .append (current_chunk )
116
+
117
+ return chunks
20
118
21
119
def parse_batch (self , files : List [Path ], batch_size : int = 1 ) -> List [Tuple [List [Node ], FileMetadata ]]:
22
120
"""Process multiple files in batches."""
@@ -46,10 +144,13 @@ def _get_metadata(self, result, file_path: Path) -> Dict:
46
144
}
47
145
48
146
def _text_to_nodes (self , text : str , start_page : int = 1 ) -> List [Node ]:
49
- """Convert text content to nodes."""
147
+ """Convert text content to nodes with RAG-based chunking ."""
50
148
nodes = []
51
149
if text and len (text .strip ()) > 0 :
52
- chunks = [text [i :i + 1000 ] for i in range (0 , len (text ), 1000 )]
150
+ # Apply RAG-based chunking
151
+ chunks = self .split_text_with_overlap (text )
152
+
153
+ self .logger .debug (f"Split text into { len (chunks )} chunks using RAG-based chunking" )
53
154
54
155
for i , chunk in enumerate (chunks , start_page ):
55
156
if chunk .strip ():
0 commit comments