From 5aa809ff144ffc192cd66a2bbb194400e4aee1f4 Mon Sep 17 00:00:00 2001 From: myhloli Date: Sat, 8 Feb 2025 18:58:53 +0800 Subject: [PATCH 1/6] refactor(magic_pdf): update invalid character detection logic - Uncomment detect_invalid_chars_by_pymupdf function call - Comment out detect_invalid_chars function call --- magic_pdf/filter/pdf_meta_scan.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/magic_pdf/filter/pdf_meta_scan.py b/magic_pdf/filter/pdf_meta_scan.py index eecad515..3982daa2 100644 --- a/magic_pdf/filter/pdf_meta_scan.py +++ b/magic_pdf/filter/pdf_meta_scan.py @@ -323,8 +323,8 @@ def get_language(doc: fitz.Document): def check_invalid_chars(pdf_bytes): """乱码检测.""" - # return detect_invalid_chars_by_pymupdf(pdf_bytes) - return detect_invalid_chars(pdf_bytes) + return detect_invalid_chars_by_pymupdf(pdf_bytes) + # return detect_invalid_chars(pdf_bytes) def pdf_meta_scan(pdf_bytes: bytes): From 9f18ca20197cf41bb8396540d00ec400b28a21ed Mon Sep 17 00:00:00 2001 From: myhloli Date: Sat, 8 Feb 2025 18:59:23 +0800 Subject: [PATCH 2/6] feat(pdf_parse): improve OCR processing and contrast filtering - Rename empty_spans to need_ocr_spans for better clarity - Add calculate_contrast function to measure image contrast - Filter out low-contrast spans to improve OCR accuracy - Update OCR processing workflow to use new filtering method --- magic_pdf/pdf_parse_union_core_v2.py | 44 +++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 8 deletions(-) diff --git a/magic_pdf/pdf_parse_union_core_v2.py b/magic_pdf/pdf_parse_union_core_v2.py index 999166f9..2caf074c 100644 --- a/magic_pdf/pdf_parse_union_core_v2.py +++ b/magic_pdf/pdf_parse_union_core_v2.py @@ -6,8 +6,10 @@ import time from typing import List +import cv2 import fitz import torch +import numpy as np from loguru import logger from magic_pdf.config.enums import SupportedPdfParseMethod @@ -127,17 +129,15 @@ def fill_char_in_spans(spans, all_chars): span['chars'].append(char) break - empty_spans = [] - + need_ocr_spans = [] for span in spans: chars_to_content(span) # 有的span中虽然没有字但有一两个空的占位符,用宽高和content长度过滤 if len(span['content']) * span['height'] < span['width'] * 0.5: # logger.info(f"maybe empty span: {len(span['content'])}, {span['height']}, {span['width']}") - empty_spans.append(span) + need_ocr_spans.append(span) del span['height'], span['width'] - return empty_spans - + return need_ocr_spans # 使用鲁棒性更强的中心点坐标判断 def calculate_char_in_span(char_bbox, span_bbox, char, span_height_radio=0.33): @@ -190,6 +190,28 @@ def remove_tilted_line(text_blocks): block['lines'].remove(line) +def calculate_contrast(img) -> float: + """ + 计算给定BGR图像的对比度。 + + :param img: BGR格式的图像,类型为numpy.ndarray + :return: 图像的对比度值 + """ + # 将BGR图像转换为灰度图 + gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + + # 计算均值和标准差 + mean_value = np.mean(gray_img) + std_dev = np.std(gray_img) + + # 对比度定义为标准差除以平均值(加上小常数避免除零错误) + contrast = std_dev / (mean_value + 1e-6) + + # logger.info(f"contrast: {contrast}") + + return round(contrast, 2) + + def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang): # cid用0xfffd表示,连字符拆开 # text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks'] @@ -274,9 +296,9 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang span['chars'] = [] new_spans.append(span) - empty_spans = fill_char_in_spans(new_spans, all_pymu_chars) + need_ocr_spans = fill_char_in_spans(new_spans, all_pymu_chars) - if len(empty_spans) > 0: + if len(need_ocr_spans) > 0: # 初始化ocr模型 atom_model_manager = AtomModelSingleton() @@ -287,9 +309,15 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang lang=lang ) - for span in empty_spans: + for span in need_ocr_spans: # 对span的bbox截图再ocr span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode='cv2') + + # 计算span的对比度,低于0.20的span不进行ocr + if calculate_contrast(span_img) <= 0.20: + spans.remove(span) + continue + ocr_res = ocr_model.ocr(span_img, det=False) if ocr_res and len(ocr_res) > 0: if len(ocr_res[0]) > 0: From f35a6c08a8a195e33afb620e7b9ba2e97c45a75a Mon Sep 17 00:00:00 2001 From: myhloli Date: Sun, 9 Feb 2025 19:00:31 +0800 Subject: [PATCH 3/6] refactor(filter): remove unused text layout analysis for PDF classification --- magic_pdf/filter/__init__.py | 2 +- magic_pdf/filter/pdf_classify_by_type.py | 10 ++++++---- magic_pdf/filter/pdf_meta_scan.py | 8 ++++---- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/magic_pdf/filter/__init__.py b/magic_pdf/filter/__init__.py index 07a79b88..28015635 100644 --- a/magic_pdf/filter/__init__.py +++ b/magic_pdf/filter/__init__.py @@ -23,7 +23,7 @@ def classify(pdf_bytes: bytes) -> SupportedPdfParseMethod: pdf_meta['image_info_per_page'], pdf_meta['text_len_per_page'], pdf_meta['imgs_per_page'], - pdf_meta['text_layout_per_page'], + # pdf_meta['text_layout_per_page'], pdf_meta['invalid_chars'], ) if is_text_pdf: diff --git a/magic_pdf/filter/pdf_classify_by_type.py b/magic_pdf/filter/pdf_classify_by_type.py index 319fb3fb..50665737 100644 --- a/magic_pdf/filter/pdf_classify_by_type.py +++ b/magic_pdf/filter/pdf_classify_by_type.py @@ -305,7 +305,8 @@ def is_narrow_strip(img): def classify(total_page: int, page_width, page_height, img_sz_list: list, text_len_list: list, img_num_list: list, - text_layout_list: list, invalid_chars: bool): + # text_layout_list: list, + invalid_chars: bool): """ 这里的图片和页面长度单位是pts :param total_page: @@ -321,7 +322,7 @@ def classify(total_page: int, page_width, page_height, img_sz_list: list, text_l 'by_text_len': classify_by_text_len(text_len_list, total_page), 'by_avg_words': classify_by_avg_words(text_len_list), 'by_img_num': classify_by_img_num(img_sz_list, img_num_list), - 'by_text_layout': classify_by_text_layout(text_layout_list), + # 'by_text_layout': classify_by_text_layout(text_layout_list), 'by_img_narrow_strips': classify_by_img_narrow_strips(page_width, page_height, img_sz_list), 'by_invalid_chars': invalid_chars, } @@ -332,9 +333,10 @@ def classify(total_page: int, page_width, page_height, img_sz_list: list, text_l return False, results else: logger.warning( - f"pdf is not classified by area and text_len, by_image_area: {results['by_image_area']}," + f"OCR needed based on classification result, by_image_area: {results['by_image_area']}," f" by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']}," - f" by_text_layout: {results['by_text_layout']}, by_img_narrow_strips: {results['by_img_narrow_strips']}," + # f" by_text_layout: {results['by_text_layout']}," + f" by_img_narrow_strips: {results['by_img_narrow_strips']}," f" by_invalid_chars: {results['by_invalid_chars']}", file=sys.stderr) # 利用这种情况可以快速找出来哪些pdf比较特殊,针对性修正分类算法 return False, results diff --git a/magic_pdf/filter/pdf_meta_scan.py b/magic_pdf/filter/pdf_meta_scan.py index 3982daa2..9fd870d0 100644 --- a/magic_pdf/filter/pdf_meta_scan.py +++ b/magic_pdf/filter/pdf_meta_scan.py @@ -356,9 +356,9 @@ def pdf_meta_scan(pdf_bytes: bytes): # logger.info(f"image_info_per_page: {image_info_per_page}, junk_img_bojids: {junk_img_bojids}") text_len_per_page = get_pdf_textlen_per_page(doc) # logger.info(f"text_len_per_page: {text_len_per_page}") - text_layout_per_page = get_pdf_text_layout_per_page(doc) + # text_layout_per_page = get_pdf_text_layout_per_page(doc) # logger.info(f"text_layout_per_page: {text_layout_per_page}") - text_language = get_language(doc) + # text_language = get_language(doc) # logger.info(f"text_language: {text_language}") invalid_chars = check_invalid_chars(pdf_bytes) # logger.info(f"invalid_chars: {invalid_chars}") @@ -372,8 +372,8 @@ def pdf_meta_scan(pdf_bytes: bytes): 'page_height_pts': int(page_height_pts), 'image_info_per_page': image_info_per_page, 'text_len_per_page': text_len_per_page, - 'text_layout_per_page': text_layout_per_page, - 'text_language': text_language, + # 'text_layout_per_page': text_layout_per_page, + # 'text_language': text_language, # "svgs_per_page": svgs_per_page, 'imgs_per_page': imgs_per_page, # 增加每页img数量list 'junk_img_bojids': junk_img_bojids, # 增加垃圾图片的bojid list From a5342950f6cfe04e908f0f7668849e72be15dbd5 Mon Sep 17 00:00:00 2001 From: myhloli Date: Sun, 9 Feb 2025 19:24:31 +0800 Subject: [PATCH 4/6] fix(filter): toggle invalid character detection method --- magic_pdf/filter/pdf_meta_scan.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/magic_pdf/filter/pdf_meta_scan.py b/magic_pdf/filter/pdf_meta_scan.py index 9fd870d0..67e56315 100644 --- a/magic_pdf/filter/pdf_meta_scan.py +++ b/magic_pdf/filter/pdf_meta_scan.py @@ -323,8 +323,8 @@ def get_language(doc: fitz.Document): def check_invalid_chars(pdf_bytes): """乱码检测.""" - return detect_invalid_chars_by_pymupdf(pdf_bytes) - # return detect_invalid_chars(pdf_bytes) + # return detect_invalid_chars_by_pymupdf(pdf_bytes) + return detect_invalid_chars(pdf_bytes) def pdf_meta_scan(pdf_bytes: bytes): From e4e4eef1f86462b2d6ce7eea452a7519cc2ea411 Mon Sep 17 00:00:00 2001 From: myhloli Date: Sun, 9 Feb 2025 20:09:32 +0800 Subject: [PATCH 5/6] perf(language_detection): optimize batch size for language detection model - Increase batch size from 8 to 256 for language detection inference - Add timing measurement for language detection process --- magic_pdf/libs/pdf_check.py | 12 +++++++++++- .../language_detection/yolov11/YOLOv11.py | 7 ++++--- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/magic_pdf/libs/pdf_check.py b/magic_pdf/libs/pdf_check.py index d4d3c24b..98402b38 100644 --- a/magic_pdf/libs/pdf_check.py +++ b/magic_pdf/libs/pdf_check.py @@ -4,6 +4,7 @@ import re from io import BytesIO from pdfminer.high_level import extract_text +from pdfminer.layout import LAParams def calculate_sample_count(total_page: int): @@ -41,7 +42,16 @@ def detect_invalid_chars(src_pdf_bytes: bytes) -> bool: sample_docs = extract_pages(src_pdf_bytes) sample_pdf_bytes = sample_docs.tobytes() sample_pdf_file_like_object = BytesIO(sample_pdf_bytes) - text = extract_text(sample_pdf_file_like_object) + laparams = LAParams( + line_overlap=0.5, + char_margin=2.0, + line_margin=0.5, + word_margin=0.1, + boxes_flow=None, + detect_vertical=False, + all_texts=False, + ) + text = extract_text(pdf_file=sample_pdf_file_like_object, laparams=laparams) text = text.replace("\n", "") # logger.info(text) '''乱码文本用pdfminer提取出来的文本特征是(cid:xxx)''' diff --git a/magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py b/magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py index a37177d4..992925af 100644 --- a/magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py +++ b/magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py @@ -1,4 +1,5 @@ # Copyright (c) Opendatalab. All rights reserved. +import time from collections import Counter from uuid import uuid4 @@ -102,9 +103,9 @@ def do_detect(self, images: list): temp_images = split_images(image) for temp_image in temp_images: all_images.append(resize_images_to_224(temp_image)) - - images_lang_res = self.batch_predict(all_images, batch_size=8) - # logger.info(f"images_lang_res: {images_lang_res}") + # langdetect_start = time.time() + images_lang_res = self.batch_predict(all_images, batch_size=256) + # logger.info(f"image number of langdetect: {len(images_lang_res)}, langdetect time: {round(time.time() - langdetect_start, 2)}") if len(images_lang_res) > 0: count_dict = Counter(images_lang_res) language = max(count_dict, key=count_dict.get) From 5561ac9555b2f10e17d8b2f91805be15dab8aa52 Mon Sep 17 00:00:00 2001 From: myhloli Date: Sun, 9 Feb 2025 20:25:30 +0800 Subject: [PATCH 6/6] fix(pdf_parse): improve image processing and OCR accuracy - Update calculate_contrast function to support both RGB and BGR image modes - Add input validation for image mode in calculate_contrast function - Modify usage of calculate_contrast function in OCR processing to specify image mode --- magic_pdf/pdf_parse_union_core_v2.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/magic_pdf/pdf_parse_union_core_v2.py b/magic_pdf/pdf_parse_union_core_v2.py index 2caf074c..62e267ce 100644 --- a/magic_pdf/pdf_parse_union_core_v2.py +++ b/magic_pdf/pdf_parse_union_core_v2.py @@ -139,6 +139,7 @@ def fill_char_in_spans(spans, all_chars): del span['height'], span['width'] return need_ocr_spans + # 使用鲁棒性更强的中心点坐标判断 def calculate_char_in_span(char_bbox, span_bbox, char, span_height_radio=0.33): char_center_x = (char_bbox[0] + char_bbox[2]) / 2 @@ -190,25 +191,28 @@ def remove_tilted_line(text_blocks): block['lines'].remove(line) -def calculate_contrast(img) -> float: +def calculate_contrast(img, img_mode) -> float: """ - 计算给定BGR图像的对比度。 - - :param img: BGR格式的图像,类型为numpy.ndarray + 计算给定图像的对比度。 + :param img: 图像,类型为numpy.ndarray + :Param img_mode = 图像的色彩通道,'rgb' 或 'bgr' :return: 图像的对比度值 """ - # 将BGR图像转换为灰度图 - gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + if img_mode == 'rgb': + # 将RGB图像转换为灰度图 + gray_img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) + elif img_mode == 'bgr': + # 将BGR图像转换为灰度图 + gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + else: + raise ValueError("Invalid image mode. Please provide 'rgb' or 'bgr'.") # 计算均值和标准差 mean_value = np.mean(gray_img) std_dev = np.std(gray_img) - # 对比度定义为标准差除以平均值(加上小常数避免除零错误) contrast = std_dev / (mean_value + 1e-6) - # logger.info(f"contrast: {contrast}") - return round(contrast, 2) @@ -314,7 +318,7 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode='cv2') # 计算span的对比度,低于0.20的span不进行ocr - if calculate_contrast(span_img) <= 0.20: + if calculate_contrast(span_img, img_mode='bgr') <= 0.20: spans.remove(span) continue