
Commit cb49eec

Author: Michael Mooring (committed)
AS-176 open-parse markitdown chunking options
1 parent 2af8163 commit cb49eec

File tree

4 files changed: +201 −11 lines changed


.gitignore

+2 −0

@@ -22,6 +22,8 @@ venv
 docs.zip
 archive.zip
 *.egg-info
+.ropeproject
+.qodo

 # macOS
 .DS_Store

src/openparse/doc_parser.py

+25 −6

@@ -80,6 +80,9 @@ def __init__(
         table_args=None,
         use_markitdown: bool = False,
         llm_client: Optional[object] = None,
+        chunk_size: int = 1000,
+        chunk_overlap: int = 200,
+        use_tokens: bool = True,
         verbose: bool = False,
         **kwargs
     ):
@@ -97,12 +100,30 @@ def __init__(
         # Set pipeline verbosity
         self.processing_pipeline.verbose = self._verbose

-        # Initialize parsers and args
-        self.table_args = table_args
+        """
+        Initialize the document parser.
+
+        Args:
+            use_markitdown: Whether to use MarkItDown for document parsing
+            use_ocr: Whether to use OCR for document parsing
+            table_args: Arguments for table extraction
+            processing_pipeline: Pipeline for processing extracted nodes
+            llm_client: Optional LLM client for enhanced parsing
+            chunk_size: Maximum size of text chunks (in characters or tokens)
+            chunk_overlap: Number of characters or tokens to overlap between chunks
+            use_tokens: If True, measures length in tokens; if False, uses characters
+        """
         self.use_markitdown = use_markitdown
+        self.table_args = table_args
+
         if use_markitdown:
-            self.markitdown_parser = MarkItDownParser(llm_client=llm_client)
-
+            self.markitdown_parser = MarkItDownParser(
+                llm_client=llm_client,
+                chunk_size=chunk_size,
+                chunk_overlap=chunk_overlap,
+                use_tokens=use_tokens
+            )
     def _process_directory(
         self,
         files: List[Path],
@@ -233,8 +254,6 @@ def _get_s3_client(self, endpoint_url: Optional[str] = None) -> boto3.client:
         # Log the configuration being used
         logger.info("🟠☁️ Creating S3 client with configuration:")
         logger.info(f"Endpoint URL: {endpoint_url}")
-        logger.info(f"Access Key ID present: {bool(os.getenv('R2_ACCESS_KEY_ID'))}")
-        logger.info(f"Secret Access Key present: {bool(os.getenv('R2_SECRET_ACCESS_KEY'))}")

         client_kwargs = {
             'service_name': 's3',
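For context, the new constructor options can be exercised like this. A minimal sketch, not part of the commit: the `from openparse import DocumentParser` import path is an assumption based on how the test file below constructs the parser.

```python
# Hypothetical usage sketch of the new chunking options (import path assumed).
from openparse import DocumentParser

parser = DocumentParser(
    use_markitdown=True,  # route parsing through MarkItDown
    chunk_size=1000,      # maximum chunk length (tokens here, since use_tokens=True)
    chunk_overlap=200,    # tail shared between consecutive chunks
    use_tokens=True,      # measure length in whitespace tokens, not characters
)
```

Note that `use_tokens` counts whitespace-delimited words, a rough proxy for model tokens rather than a real tokenizer count.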

src/openparse/processing/markitdown_doc_parser.py

+106 −5
@@ -1,8 +1,8 @@
 from __future__ import annotations
-from typing import List, Union, Tuple
+from typing import List, Union, Tuple, Dict, Optional, Literal
+import re
 from pathlib import Path
 from datetime import date
-from typing import Literal
 import logging

 from markitdown import MarkItDown
@@ -13,10 +12,109 @@ class DocumentParser:

     SUPPORTED_FORMATS = {'.pdf', '.docx', '.pptx', '.xlsx', '.html', '.txt', '.json', '.xml', '.zip'}

-    def __init__(self, use_ocr: bool = False, llm_client: Optional[object] = None):
+    def __init__(self,
+                 use_ocr: bool = False,
+                 llm_client: Optional[object] = None,
+                 chunk_size: int = 1000,
+                 chunk_overlap: int = 200,
+                 use_tokens: bool = True):
+        """
+        Initialize the MarkItDown document parser.
+
+        Args:
+            use_ocr: Whether to use OCR for document parsing
+            llm_client: Optional LLM client for enhanced parsing
+            chunk_size: Maximum size of text chunks (in characters or tokens)
+            chunk_overlap: Number of characters or tokens to overlap between chunks
+            use_tokens: If True, measures length in tokens; if False, uses characters
+        """
         self.parser = MarkItDown(llm_client=llm_client) if llm_client else MarkItDown()
         self.use_ocr = use_ocr
         self.logger = logging.getLogger(__name__)
+        self.chunk_size = chunk_size
+        self.chunk_overlap = chunk_overlap
+        self.use_tokens = use_tokens
+
+    def split_text_with_overlap(self, text: str) -> List[str]:
+        """
+        Splits text into chunks based on paragraphs, respecting max length and overlap.
+
+        Args:
+            text: The input text to split
+
+        Returns:
+            List of text chunks
+        """
+        # Normalize newlines for consistent splitting: collapse CRLF and runs
+        # of newlines into paragraph breaks (via re.sub, since str.replace
+        # treats '\n+' as a literal string, not a regex pattern)
+        normalized_text = re.sub(r'\n+', '\n\n', text.replace('\r\n', '\n')).strip()
+
+        # Split into paragraphs based on double newlines
+        paragraphs = normalized_text.split('\n\n')
+
+        chunks = []
+        current_chunk = ''
+
+        # Helper function to measure length (characters or tokens)
+        def get_length(s: str) -> int:
+            if self.use_tokens:
+                # Rough token count: split by whitespace and filter out empty strings
+                return len([word for word in s.split() if word])
+            return len(s)  # Character count
+
+        # Helper function to get the last N characters or tokens for overlap
+        def get_overlap_segment(s: str, size: int) -> str:
+            if self.use_tokens:
+                words = [word for word in s.split() if word]
+                overlap_words = words[-min(size, len(words)):]
+                return ' '.join(overlap_words)
+            return s[-min(size, len(s)):]
+
+        for paragraph in paragraphs:
+            paragraph_length = get_length(paragraph)
+
+            # If the paragraph fits in the current chunk
+            if get_length(current_chunk) + paragraph_length <= self.chunk_size:
+                current_chunk += ('\n\n' if current_chunk else '') + paragraph
+            else:
+                # If the current chunk isn't empty, append it and start a new one with overlap
+                if current_chunk:
+                    chunks.append(current_chunk)
+                    overlap_text = get_overlap_segment(current_chunk, self.chunk_overlap)
+                    current_chunk = overlap_text + '\n\n' + paragraph
+                else:
+                    # If the paragraph alone exceeds chunk_size, split it further
+                    remaining = paragraph
+                    while get_length(remaining) > self.chunk_size:
+                        if self.use_tokens:
+                            # Find the approximate token boundary
+                            words = [word for word in remaining.split() if word]
+                            token_count = 0
+                            char_count = 0
+                            for i, word in enumerate(words):
+                                token_count += 1
+                                char_count += len(word) + (1 if i > 0 else 0)  # Add space
+                                if token_count >= self.chunk_size:
+                                    split_point = char_count
+                                    break
+                            else:
+                                # for-else: no break, keep the whole remainder
+                                split_point = len(remaining)
+                        else:
+                            # Find the last space before chunk_size for a clean character split
+                            split_point = remaining.rfind(' ', 0, self.chunk_size)
+                            if split_point == -1:
+                                split_point = self.chunk_size
+
+                        chunk = remaining[:split_point].strip()
+                        chunks.append(chunk)
+                        overlap_text = get_overlap_segment(chunk, self.chunk_overlap)
+                        remaining = overlap_text + ' ' + remaining[split_point:].strip()
+                    current_chunk = remaining

+        # Append the final chunk if it exists
+        if current_chunk:
+            chunks.append(current_chunk)
+
+        return chunks

     def parse_batch(self, files: List[Path], batch_size: int = 1) -> List[Tuple[List[Node], FileMetadata]]:
         """Process multiple files in batches."""
@@ -46,10 +144,13 @@ def _get_metadata(self, result, file_path: Path) -> Dict:
         }

     def _text_to_nodes(self, text: str, start_page: int = 1) -> List[Node]:
-        """Convert text content to nodes."""
+        """Convert text content to nodes with RAG-based chunking."""
         nodes = []
         if text and len(text.strip()) > 0:
-            chunks = [text[i:i+1000] for i in range(0, len(text), 1000)]
+            # Apply RAG-based chunking
+            chunks = self.split_text_with_overlap(text)
+
+            self.logger.debug(f"Split text into {len(chunks)} chunks using RAG-based chunking")

             for i, chunk in enumerate(chunks, start_page):
                 if chunk.strip():
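To make the overlap rule concrete, here is a self-contained sketch of the tail extraction that `get_overlap_segment` performs; the `tail` helper and sample strings are invented for illustration.

```python
# Standalone sketch of the overlap rule in split_text_with_overlap: the last
# `size` tokens (or characters) of each emitted chunk are carried into the
# next chunk so consecutive chunks share context. Hypothetical helper.
def tail(s: str, size: int, use_tokens: bool) -> str:
    if use_tokens:
        words = [w for w in s.split() if w]
        return ' '.join(words[-min(size, len(words)):])
    return s[-min(size, len(s)):]

print(tail("alpha beta gamma delta", 2, use_tokens=True))   # gamma delta
print(tail("alpha beta gamma delta", 5, use_tokens=False))  # delta
```

With the defaults (`chunk_size=1000`, `chunk_overlap=200`), each chunk repeats roughly the last fifth of its predecessor, which helps retrieval keep context across chunk boundaries in a RAG pipeline.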

src/tests/test_docparser_0_7_2.py

+68 −0

@@ -98,6 +98,53 @@ def test_metadata_extraction(self, tmp_path):
         assert "file_type" in metadata
         assert metadata["file_type"] == ".txt"

+    def test_split_text_with_overlap(self):
+        """Test text chunking with overlap."""
+        parser = DocumentParser(
+            use_markitdown=True,
+            chunk_size=50,
+            chunk_overlap=10,
+            use_tokens=False
+        )
+
+        # Test with a long text that needs chunking
+        long_text = "This is a test " * 20  # Creates text longer than chunk_size
+        chunks = parser.markitdown_parser.split_text_with_overlap(long_text)
+
+        # Verify we got multiple chunks
+        assert len(chunks) > 1
+
+        # Verify chunk size constraints
+        for chunk in chunks:
+            assert len(chunk) <= 50 + 10  # Allow for the overlap
+
+        # Verify overlap exists between chunks
+        if len(chunks) >= 2:
+            # The end of the first chunk should appear in the second chunk
+            overlap = chunks[0][-10:]
+            assert overlap in chunks[1]
+
+    def test_token_based_chunking(self):
+        """Test token-based text chunking."""
+        parser = DocumentParser(
+            use_markitdown=True,
+            chunk_size=10,  # 10 tokens
+            chunk_overlap=2,
+            use_tokens=True
+        )
+
+        # Create text with exactly 25 tokens
+        text = "one two three four five six seven eight nine ten " * 2 + "one two three four five"
+        chunks = parser.markitdown_parser.split_text_with_overlap(text)
+
+        # Should create at least 2 chunks with these settings
+        assert len(chunks) >= 2
+
+        # Check token counts in each chunk
+        for chunk in chunks:
+            token_count = len([word for word in chunk.split() if word])
+            assert token_count <= 12  # chunk_size + overlap
+
     def test_batch_processing(self, tmp_path):
         """Test batch processing of multiple files."""
         # Create test files
@@ -167,6 +214,27 @@ def test_different_file_types(self, tmp_path, file_type):
         assert isinstance(metadata, dict)
         assert metadata["file_type"] == file_type

+    def test_llm_client_initialization(self):
+        """Test initialization with LLM client."""
+        mock_llm = MagicMock()
+        parser = DocumentParser(use_markitdown=True, llm_client=mock_llm)
+
+        # Verify the LLM client was passed to MarkItDown
+        assert parser.markitdown_parser.parser is not None
+
+    def test_custom_chunking_parameters(self):
+        """Test initialization with custom chunking parameters."""
+        parser = DocumentParser(
+            use_markitdown=True,
+            chunk_size=2000,
+            chunk_overlap=300,
+            use_tokens=False
+        )
+
+        assert parser.markitdown_parser.chunk_size == 2000
+        assert parser.markitdown_parser.chunk_overlap == 300
+        assert parser.markitdown_parser.use_tokens is False
+
 class TestDocumentParser:
     def test_init_default(self):
         """Test default initialization."""
