Skip to content

Commit 8cf7d27

Browse files
author
David Huggins-Daines
committed
feat: allow disabling OCR in hi_res mode (fixes: Unstructured-IO#2467)
1 parent eca4d42 commit 8cf7d27

File tree

3 files changed, with 45 additions and 22 deletions.

3 files changed, with 45 additions and 22 deletions.

test_unstructured/partition/pdf_image/test_pdf.py

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -602,6 +602,22 @@ def test_partition_pdf_hi_res_ocr_mode_with_table_extraction(ocr_mode):
602602
assert "Layouts of scanned US newspapers from the 20th century" in table[0]
603603

604604

605+
def test_partition_pdf_hi_res_ocr_mode_none():
606+
filename = example_doc_path("pdf/layout-parser-paper.pdf")
607+
elements = pdf.partition_pdf(
608+
filename=filename,
609+
ocr_mode="none",
610+
strategy=PartitionStrategy.HI_RES,
611+
# FIXME: table structure still requires OCR for no good reason
612+
infer_table_structure=False,
613+
)
614+
fast_elements = pdf.partition_pdf(
615+
filename=filename,
616+
strategy=PartitionStrategy.FAST,
617+
)
618+
assert elements != fast_elements
619+
620+
605621
def test_partition_pdf_with_copy_protection():
606622
filename = example_doc_path("pdf/copy-protected.pdf")
607623
elements = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.HI_RES)

unstructured/partition/pdf.py

Lines changed: 28 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -625,17 +625,20 @@ def _partition_pdf_or_image_local(
625625
hi_res_model_name=hi_res_model_name,
626626
)
627627

628-
final_document_layout = process_file_with_ocr(
629-
filename,
630-
merged_document_layout,
631-
extracted_layout=extracted_layout,
632-
is_image=is_image,
633-
infer_table_structure=infer_table_structure,
634-
ocr_languages=ocr_languages,
635-
ocr_mode=ocr_mode,
636-
pdf_image_dpi=pdf_image_dpi,
637-
ocr_layout_dumper=ocr_layout_dumper,
638-
)
628+
if ocr_mode == OCRMode.NONE.value:
629+
final_document_layout = merged_document_layout
630+
else:
631+
final_document_layout = process_file_with_ocr(
632+
filename,
633+
merged_document_layout,
634+
extracted_layout=extracted_layout,
635+
is_image=is_image,
636+
infer_table_structure=infer_table_structure,
637+
ocr_languages=ocr_languages,
638+
ocr_mode=ocr_mode,
639+
pdf_image_dpi=pdf_image_dpi,
640+
ocr_layout_dumper=ocr_layout_dumper,
641+
)
639642
else:
640643
inferred_document_layout = process_data_with_model(
641644
file,
@@ -680,17 +683,20 @@ def _partition_pdf_or_image_local(
680683

681684
if hasattr(file, "seek"):
682685
file.seek(0)
683-
final_document_layout = process_data_with_ocr(
684-
file,
685-
merged_document_layout,
686-
extracted_layout=extracted_layout,
687-
is_image=is_image,
688-
infer_table_structure=infer_table_structure,
689-
ocr_languages=ocr_languages,
690-
ocr_mode=ocr_mode,
691-
pdf_image_dpi=pdf_image_dpi,
692-
ocr_layout_dumper=ocr_layout_dumper,
693-
)
686+
if ocr_mode == OCRMode.NONE.value:
687+
final_document_layout = merged_document_layout
688+
else:
689+
final_document_layout = process_data_with_ocr(
690+
file,
691+
merged_document_layout,
692+
extracted_layout=extracted_layout,
693+
is_image=is_image,
694+
infer_table_structure=infer_table_structure,
695+
ocr_languages=ocr_languages,
696+
ocr_mode=ocr_mode,
697+
pdf_image_dpi=pdf_image_dpi,
698+
ocr_layout_dumper=ocr_layout_dumper,
699+
)
694700

695701
final_document_layout = clean_pdfminer_inner_elements(final_document_layout)
696702

unstructured/partition/utils/constants.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@ class Source(Enum):
1212
class OCRMode(Enum):
1313
INDIVIDUAL_BLOCKS = "individual_blocks"
1414
FULL_PAGE = "entire_page"
15+
NONE = "none"
1516

1617

1718
class PartitionStrategy:

0 commit comments

Comments
 (0)