Skip to content

Commit 623cf32

Browse files
committed
Made hOCR parsing more robust to errors; added a test for the preprocess flag
1 parent eca8469 commit 623cf32

File tree

5 files changed

+16
-2
lines changed

5 files changed

+16
-2
lines changed

.gitignore

+2
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
11
*.pyc
22
.*
33
*~
4+
*.hocr
5+
*.jpg

pypdfocr/pypdfocr_pdf.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -273,7 +273,13 @@ def add_text_layer(self,pdf, hocrfile, page_num,height, dpi):
273273
274274
"""
275275
hocr = ElementTree()
276-
hocr.parse(hocrfile)
276+
try:
277+
# Tesseract may have failed and written garbage to this hocr file, so catch any exception raised while parsing it
278+
hocr.parse(hocrfile)
279+
except Exception:
280+
logging.info("Error loading hocr, not adding any text")
281+
return
282+
277283
logging.debug(xml.etree.ElementTree.tostring(hocr.getroot()))
278284
for c in hocr.getroot(): # Find the <body> tag
279285
if c.tag != 'body':

test/pdfs/test_recipe.pdf

3.01 KB
Binary file not shown.

test/test_option_parsing.py

+4
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,10 @@ def test_standalone(self):
2121
self.p.get_options(opts)
2222
assert(self.p.verbose)
2323

24+
opts.append('--preprocess')
25+
self.p.get_options(opts)
26+
assert(not self.p.skip_preprocess)
27+
2428
assert(not self.p.enable_filing)
2529
assert(self.p.config == {})
2630

test/test_pypdfocr.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,8 @@ def _iter_pdf(self, filename):
4242
]),
4343
(".", os.path.join("temp","target","recipe"), os.path.join("..","test", "pdfs", "1.pdf"), [ ["Simply","Recipes"],
4444
]),
45+
(".", os.path.join("temp","target","recipe"), os.path.join("..","test", "pdfs", "test_recipe_sideways.pdf"), [ ["Simply","Recipes", 'spinach'],
46+
]),
4547
]
4648

4749
#@pytest.mark.skipif(True, reason="Just testing")
@@ -93,7 +95,7 @@ def test_standalone_email(self, dirname, tgt_folder, filename, expected):
9395
with patch("smtplib.SMTP") as mock_smtp:
9496
cwd = os.getcwd()
9597
os.chdir(dirname)
96-
opts = [filename, "--config=test_pypdfocr_config.yaml", "-m"]
98+
opts = [filename, "--preprocess", "--config=test_pypdfocr_config.yaml", "-m"]
9799
self.p.go(opts)
98100

99101
out_filename = filename.replace(".pdf", "_ocr.pdf")

0 commit comments

Comments
 (0)