Skip to content

Commit 643b8ec

Browse files
committed
Performance improvements
1 parent 4a5b8aa commit 643b8ec

File tree

3 files changed

+33
-15
lines changed

3 files changed

+33
-15
lines changed

pdftext/pdf/chars.py

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -85,8 +85,13 @@ def word_break():
8585
word_break()
8686
continue
8787

88-
# we break on any change in font info
89-
if any(char['font'][k] != word['font'][k] for k in ['name', 'flags', 'size', 'weight']):
88+
# we break on any change in font info - optimized comparison
89+
char_font = char['font']
90+
word_font = word['font']
91+
if (char_font['name'] != word_font['name'] or
92+
char_font['flags'] != word_font['flags'] or
93+
char_font['size'] != word_font['size'] or
94+
char_font['weight'] != word_font['weight']):
9095
word_break()
9196
continue
9297

@@ -99,17 +104,19 @@ def word_break():
99104
word['bbox'] = word['bbox'].merge(char['bbox'])
100105
word['chars'].append(char)
101106

102-
# deduplicate words
103-
seen = {}
107+
# deduplicate words - use tuple keys instead of strings
108+
seen = set()
104109
deduped = []
105110
for word in words:
106111
# Round the bbox coordinates
107112
bbox = word['bbox'].bbox
108-
bbox = [round(x, 0) for x in bbox]
113+
bbox_rounded = tuple(round(x, 0) for x in bbox)
109114

110-
key = f"{bbox}-{word['text']}-{word['rotation']}-{word['font']['name']}-{word['font']['flags']}-{word['font']['size']}-{word['font']['weight']}"
115+
key = (bbox_rounded, word['text'], word['rotation'],
116+
word['font']['name'], word['font']['flags'],
117+
word['font']['size'], word['font']['weight'])
111118
if key not in seen:
112-
seen[key] = True
119+
seen.add(key)
113120
deduped.append(word)
114121

115122
return [char for word in deduped for char in word['chars']]

pdftext/pdf/pages.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -132,8 +132,11 @@ def span_break():
132132
def get_lines(spans: Spans) -> Lines:
133133
lines: Lines = []
134134
line: Line = None
135+
line_text: str = ""
135136

136137
def line_break():
138+
global line_text
139+
line_text = ""
137140
lines.append({"spans": [span], "bbox": span["bbox"], "rotation": span["rotation"]})
138141

139142
for span in spans:
@@ -144,8 +147,13 @@ def line_break():
144147
line_break()
145148
continue
146149

147-
# we break if the previous span ends with a linebreak or hyphenation
148-
if any(line["spans"][-1]["text"].endswith(suffix) for suffix in ["\n", "\x02"]):
150+
# we break if the previous span ends with a linebreak, and the line has text
151+
if line["spans"][-1]["text"].endswith("\n") and line_text.strip():
152+
line_break()
153+
continue
154+
155+
# we break if the current line ends with a hyphen
156+
if line["spans"][-1]["text"].endswith("\x02"):
149157
line_break()
150158
continue
151159

@@ -159,6 +167,7 @@ def line_break():
159167
continue
160168

161169
line["spans"].append(span)
170+
line_text += span["text"]
162171
line["bbox"] = line["bbox"].merge(span["bbox"])
163172

164173
return lines

pdftext/schema.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -56,12 +56,14 @@ def y_end(self):
5656
return self.bbox[3]
5757

5858
def merge(self, other: Bbox) -> Bbox:
# Build and return a new Bbox from `self` and `other`: the smaller value is
# kept at indices 0 and 1 and the larger value at indices 2 and 3 of the
# underlying `bbox` sequences. Neither input is modified.
#
# This span is a rendered diff: code following an "NN-" marker is the removed
# implementation (comparisons through the x_start/y_start/x_end/y_end
# accessors); code following an "NN+" marker is its replacement (min/max
# directly on the `bbox` sequences).
59-
x_start = self.x_start if self.x_start < other.x_start else other.x_start
60-
y_start = self.y_start if self.y_start < other.y_start else other.y_start
61-
x_end = self.x_end if self.x_end > other.x_end else other.x_end
62-
y_end = self.y_end if self.y_end > other.y_end else other.y_end
63-
64-
return Bbox([x_start, y_start, x_end, y_end])
59+
# Bind both coordinate sequences once instead of going through an accessor for every comparison.
self_bbox = self.bbox
60+
other_bbox = other.bbox
61+
return Bbox([
62+
min(self_bbox[0], other_bbox[0]),
63+
min(self_bbox[1], other_bbox[1]),
64+
max(self_bbox[2], other_bbox[2]),
65+
max(self_bbox[3], other_bbox[3])
66+
])
6567

6668
def overlap_x(self, other: Bbox):
6769
return max(0, min(self.bbox[2], other.bbox[2]) - max(self.bbox[0], other.bbox[0]))

0 commit comments

Comments
 (0)