6
6
import time
7
7
from typing import List
8
8
9
+ import cv2
9
10
import fitz
10
11
import torch
12
+ import numpy as np
11
13
from loguru import logger
12
14
13
15
from magic_pdf .config .enums import SupportedPdfParseMethod
@@ -127,16 +129,15 @@ def fill_char_in_spans(spans, all_chars):
127
129
span ['chars' ].append (char )
128
130
break
129
131
130
- empty_spans = []
131
-
132
+ need_ocr_spans = []
132
133
for span in spans :
133
134
chars_to_content (span )
134
135
# 有的span中虽然没有字但有一两个空的占位符,用宽高和content长度过滤
135
136
if len (span ['content' ]) * span ['height' ] < span ['width' ] * 0.5 :
136
137
# logger.info(f"maybe empty span: {len(span['content'])}, {span['height']}, {span['width']}")
137
- empty_spans .append (span )
138
+ need_ocr_spans .append (span )
138
139
del span ['height' ], span ['width' ]
139
- return empty_spans
140
+ return need_ocr_spans
140
141
141
142
142
143
# 使用鲁棒性更强的中心点坐标判断
@@ -190,6 +191,31 @@ def remove_tilted_line(text_blocks):
190
191
block ['lines' ].remove (line )
191
192
192
193
194
def calculate_contrast(img, img_mode) -> float:
    """
    Compute a contrast score for an image.

    The score is the standard deviation of the grayscale pixel values
    divided by their mean, rounded to two decimals.

    :param img: image as a numpy.ndarray with three colour channels
    :param img_mode: channel order of ``img``, either 'rgb' or 'bgr'
    :return: contrast score as a builtin float; 0.0 for an empty image
    :raises ValueError: if ``img_mode`` is neither 'rgb' nor 'bgr'
    """
    # Validate the mode first so a bad argument is always reported,
    # regardless of the image contents.
    if img_mode not in ('rgb', 'bgr'):
        raise ValueError("Invalid image mode. Please provide 'rgb' or 'bgr'.")

    # A zero-size crop has no pixels to measure and would make
    # cv2.cvtColor raise; report it as having no contrast instead.
    if getattr(img, 'size', None) == 0:
        return 0.0

    # Convert to grayscale using the conversion that matches the channel order.
    if img_mode == 'rgb':
        gray_img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    else:
        gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Mean and standard deviation of the grayscale intensities.
    mean_value = np.mean(gray_img)
    std_dev = np.std(gray_img)
    # Contrast is std / mean; the small constant avoids division by zero
    # on an all-black image.
    contrast = std_dev / (mean_value + 1e-6)
    # logger.info(f"contrast: {contrast}")
    # Convert the NumPy scalar so the result matches the annotated return type.
    return round(float(contrast), 2)
217
+
218
+
193
219
def txt_spans_extract_v2 (pdf_page , spans , all_bboxes , all_discarded_blocks , lang ):
194
220
# cid用0xfffd表示,连字符拆开
195
221
# text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']
@@ -274,9 +300,9 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
274
300
span ['chars' ] = []
275
301
new_spans .append (span )
276
302
277
- empty_spans = fill_char_in_spans (new_spans , all_pymu_chars )
303
+ need_ocr_spans = fill_char_in_spans (new_spans , all_pymu_chars )
278
304
279
- if len (empty_spans ) > 0 :
305
+ if len (need_ocr_spans ) > 0 :
280
306
281
307
# 初始化ocr模型
282
308
atom_model_manager = AtomModelSingleton ()
@@ -287,9 +313,15 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
287
313
lang = lang
288
314
)
289
315
290
- for span in empty_spans :
316
+ for span in need_ocr_spans :
291
317
# 对span的bbox截图再ocr
292
318
span_img = cut_image_to_pil_image (span ['bbox' ], pdf_page , mode = 'cv2' )
319
+
320
+ # 计算span的对比度,低于0.20的span不进行ocr
321
+ if calculate_contrast (span_img , img_mode = 'bgr' ) <= 0.20 :
322
+ spans .remove (span )
323
+ continue
324
+
293
325
ocr_res = ocr_model .ocr (span_img , det = False )
294
326
if ocr_res and len (ocr_res ) > 0 :
295
327
if len (ocr_res [0 ]) > 0 :
0 commit comments