Skip to content

Commit 6e1fba9

Browse files
authored
Merge pull request #1664 from myhloli/dev
feat(pdf_parse): improve OCR processing and contrast filtering
2 parents 9bb2d58 + 5561ac9 commit 6e1fba9

File tree

6 files changed

+65
-20
lines changed

6 files changed

+65
-20
lines changed

magic_pdf/filter/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def classify(pdf_bytes: bytes) -> SupportedPdfParseMethod:
2323
pdf_meta['image_info_per_page'],
2424
pdf_meta['text_len_per_page'],
2525
pdf_meta['imgs_per_page'],
26-
pdf_meta['text_layout_per_page'],
26+
# pdf_meta['text_layout_per_page'],
2727
pdf_meta['invalid_chars'],
2828
)
2929
if is_text_pdf:

magic_pdf/filter/pdf_classify_by_type.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -305,7 +305,8 @@ def is_narrow_strip(img):
305305

306306

307307
def classify(total_page: int, page_width, page_height, img_sz_list: list, text_len_list: list, img_num_list: list,
308-
text_layout_list: list, invalid_chars: bool):
308+
# text_layout_list: list,
309+
invalid_chars: bool):
309310
"""
310311
这里的图片和页面长度单位是pts
311312
:param total_page:
@@ -321,7 +322,7 @@ def classify(total_page: int, page_width, page_height, img_sz_list: list, text_l
321322
'by_text_len': classify_by_text_len(text_len_list, total_page),
322323
'by_avg_words': classify_by_avg_words(text_len_list),
323324
'by_img_num': classify_by_img_num(img_sz_list, img_num_list),
324-
'by_text_layout': classify_by_text_layout(text_layout_list),
325+
# 'by_text_layout': classify_by_text_layout(text_layout_list),
325326
'by_img_narrow_strips': classify_by_img_narrow_strips(page_width, page_height, img_sz_list),
326327
'by_invalid_chars': invalid_chars,
327328
}
@@ -332,9 +333,10 @@ def classify(total_page: int, page_width, page_height, img_sz_list: list, text_l
332333
return False, results
333334
else:
334335
logger.warning(
335-
f"pdf is not classified by area and text_len, by_image_area: {results['by_image_area']},"
336+
f"OCR needed based on classification result, by_image_area: {results['by_image_area']},"
336337
f" by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']},"
337-
f" by_text_layout: {results['by_text_layout']}, by_img_narrow_strips: {results['by_img_narrow_strips']},"
338+
# f" by_text_layout: {results['by_text_layout']},"
339+
f" by_img_narrow_strips: {results['by_img_narrow_strips']},"
338340
f" by_invalid_chars: {results['by_invalid_chars']}",
339341
file=sys.stderr) # 利用这种情况可以快速找出来哪些pdf比较特殊,针对性修正分类算法
340342
return False, results

magic_pdf/filter/pdf_meta_scan.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -356,9 +356,9 @@ def pdf_meta_scan(pdf_bytes: bytes):
356356
# logger.info(f"image_info_per_page: {image_info_per_page}, junk_img_bojids: {junk_img_bojids}")
357357
text_len_per_page = get_pdf_textlen_per_page(doc)
358358
# logger.info(f"text_len_per_page: {text_len_per_page}")
359-
text_layout_per_page = get_pdf_text_layout_per_page(doc)
359+
# text_layout_per_page = get_pdf_text_layout_per_page(doc)
360360
# logger.info(f"text_layout_per_page: {text_layout_per_page}")
361-
text_language = get_language(doc)
361+
# text_language = get_language(doc)
362362
# logger.info(f"text_language: {text_language}")
363363
invalid_chars = check_invalid_chars(pdf_bytes)
364364
# logger.info(f"invalid_chars: {invalid_chars}")
@@ -372,8 +372,8 @@ def pdf_meta_scan(pdf_bytes: bytes):
372372
'page_height_pts': int(page_height_pts),
373373
'image_info_per_page': image_info_per_page,
374374
'text_len_per_page': text_len_per_page,
375-
'text_layout_per_page': text_layout_per_page,
376-
'text_language': text_language,
375+
# 'text_layout_per_page': text_layout_per_page,
376+
# 'text_language': text_language,
377377
# "svgs_per_page": svgs_per_page,
378378
'imgs_per_page': imgs_per_page, # 增加每页img数量list
379379
'junk_img_bojids': junk_img_bojids, # 增加垃圾图片的bojid list

magic_pdf/libs/pdf_check.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import re
55
from io import BytesIO
66
from pdfminer.high_level import extract_text
7+
from pdfminer.layout import LAParams
78

89

910
def calculate_sample_count(total_page: int):
@@ -41,7 +42,16 @@ def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
4142
sample_docs = extract_pages(src_pdf_bytes)
4243
sample_pdf_bytes = sample_docs.tobytes()
4344
sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
44-
text = extract_text(sample_pdf_file_like_object)
45+
laparams = LAParams(
46+
line_overlap=0.5,
47+
char_margin=2.0,
48+
line_margin=0.5,
49+
word_margin=0.1,
50+
boxes_flow=None,
51+
detect_vertical=False,
52+
all_texts=False,
53+
)
54+
text = extract_text(pdf_file=sample_pdf_file_like_object, laparams=laparams)
4555
text = text.replace("\n", "")
4656
# logger.info(text)
4757
'''乱码文本用pdfminer提取出来的文本特征是(cid:xxx)'''

magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
# Copyright (c) Opendatalab. All rights reserved.
2+
import time
23
from collections import Counter
34
from uuid import uuid4
45

@@ -102,9 +103,9 @@ def do_detect(self, images: list):
102103
temp_images = split_images(image)
103104
for temp_image in temp_images:
104105
all_images.append(resize_images_to_224(temp_image))
105-
106-
images_lang_res = self.batch_predict(all_images, batch_size=8)
107-
# logger.info(f"images_lang_res: {images_lang_res}")
106+
# langdetect_start = time.time()
107+
images_lang_res = self.batch_predict(all_images, batch_size=256)
108+
# logger.info(f"image number of langdetect: {len(images_lang_res)}, langdetect time: {round(time.time() - langdetect_start, 2)}")
108109
if len(images_lang_res) > 0:
109110
count_dict = Counter(images_lang_res)
110111
language = max(count_dict, key=count_dict.get)

magic_pdf/pdf_parse_union_core_v2.py

Lines changed: 39 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,10 @@
66
import time
77
from typing import List
88

9+
import cv2
910
import fitz
1011
import torch
12+
import numpy as np
1113
from loguru import logger
1214

1315
from magic_pdf.config.enums import SupportedPdfParseMethod
@@ -127,16 +129,15 @@ def fill_char_in_spans(spans, all_chars):
127129
span['chars'].append(char)
128130
break
129131

130-
empty_spans = []
131-
132+
need_ocr_spans = []
132133
for span in spans:
133134
chars_to_content(span)
134135
# 有的span中虽然没有字但有一两个空的占位符,用宽高和content长度过滤
135136
if len(span['content']) * span['height'] < span['width'] * 0.5:
136137
# logger.info(f"maybe empty span: {len(span['content'])}, {span['height']}, {span['width']}")
137-
empty_spans.append(span)
138+
need_ocr_spans.append(span)
138139
del span['height'], span['width']
139-
return empty_spans
140+
return need_ocr_spans
140141

141142

142143
# 使用鲁棒性更强的中心点坐标判断
@@ -190,6 +191,31 @@ def remove_tilted_line(text_blocks):
190191
block['lines'].remove(line)
191192

192193

194+
def calculate_contrast(img, img_mode) -> float:
    """Return the contrast of an image.

    Contrast is defined here as the standard deviation of the grayscale
    pixel values divided by their mean (with a small epsilon added to
    avoid division by zero), rounded to two decimal places.

    :param img: image as a ``numpy.ndarray``
    :param img_mode: color channel order of ``img``, either 'rgb' or 'bgr'
    :return: contrast value of the image
    :raises ValueError: if ``img_mode`` is neither 'rgb' nor 'bgr'
    """
    # Pick the grayscale conversion matching the channel order.
    if img_mode == 'rgb':
        conversion_code = cv2.COLOR_RGB2GRAY
    elif img_mode == 'bgr':
        conversion_code = cv2.COLOR_BGR2GRAY
    else:
        raise ValueError("Invalid image mode. Please provide 'rgb' or 'bgr'.")

    gray = cv2.cvtColor(img, conversion_code)

    # std / mean; the 1e-6 epsilon guards against an all-black image
    # whose mean would otherwise be zero.
    mean_value = np.mean(gray)
    std_dev = np.std(gray)
    # logger.info(f"contrast: {std_dev / (mean_value + 1e-6)}")
    return round(std_dev / (mean_value + 1e-6), 2)
217+
218+
193219
def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang):
194220
# cid用0xfffd表示,连字符拆开
195221
# text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']
@@ -274,9 +300,9 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
274300
span['chars'] = []
275301
new_spans.append(span)
276302

277-
empty_spans = fill_char_in_spans(new_spans, all_pymu_chars)
303+
need_ocr_spans = fill_char_in_spans(new_spans, all_pymu_chars)
278304

279-
if len(empty_spans) > 0:
305+
if len(need_ocr_spans) > 0:
280306

281307
# 初始化ocr模型
282308
atom_model_manager = AtomModelSingleton()
@@ -287,9 +313,15 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
287313
lang=lang
288314
)
289315

290-
for span in empty_spans:
316+
for span in need_ocr_spans:
291317
# 对span的bbox截图再ocr
292318
span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode='cv2')
319+
320+
# 计算span的对比度,低于0.20的span不进行ocr
321+
if calculate_contrast(span_img, img_mode='bgr') <= 0.20:
322+
spans.remove(span)
323+
continue
324+
293325
ocr_res = ocr_model.ocr(span_img, det=False)
294326
if ocr_res and len(ocr_res) > 0:
295327
if len(ocr_res[0]) > 0:

0 commit comments

Comments
 (0)