Skip to content

Commit 9eef347

Browse files
committed
Fixed bug with original pdfs with /Rotate directive
1 parent f70af0e commit 9eef347

File tree

3 files changed

+38
-5
lines changed

3 files changed

+38
-5
lines changed

pypdfocr/pypdfocr.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -312,7 +312,8 @@ def run_conversion(self, pdf_filename):
312312
ocr_pdf_filename = self.pdf.overlay_hocr_pages(img_dpi, hocr_filenames, pdf_filename)
313313

314314
# Clean up the files
315-
self._clean_up_files(itertools.chain(*hocr_filenames))
315+
if not self.debug:
316+
self._clean_up_files(itertools.chain(*hocr_filenames))
316317

317318
print ("Completed conversion successfully to %s" % ocr_pdf_filename)
318319
return ocr_pdf_filename

pypdfocr/pypdfocr_pdf.py

+35-3
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
import cStringIO
3535
import base64
3636
import zlib
37+
import math
3738

3839
# Pkg to read multiple image tiffs
3940
from PIL import Image
@@ -44,7 +45,7 @@
4445
import xml.etree
4546

4647
# Import Pypdf2
47-
from PyPDF2 import PdfFileMerger, PdfFileReader, PdfFileWriter
48+
from PyPDF2 import PdfFileMerger, PdfFileReader, PdfFileWriter, utils
4849

4950
class PyPdf(object):
5051
"""Class to create pdfs from images"""
@@ -54,6 +55,27 @@ def __init__(self, gs):
5455
self.gs = gs # Pointer to ghostscript object
5556
pass
5657

58+
def mergeRotateAroundPointPage(self,page, page2, rotation, tx, ty):
# Merge page2 onto page after rotating page2 by `rotation` degrees about the point (tx, ty).
# The transform is composed as: translate by (-tx, -ty), rotate, translate back by (tx, ty).
# Returns the result of page.mergeTransformedPage().
59+
# Code taken from here:
60+
# http://stackoverflow.com/questions/6041244/how-to-merge-two-landscape-pdf-pages-using-pypdf/17392824#17392824
61+
# Unclear why PyPDF2 builtin page rotation functions don't work
62+
# 3x3 matrix with the offset in the last row: moves (tx, ty) to the origin.
translation = [[1, 0, 0],
63+
[0, 1, 0],
64+
[-tx,-ty,1]]
65+
# The caller passes degrees; math.cos/math.sin below need radians.
# Note this rebinds the `rotation` parameter.
rotation = math.radians(rotation)
66+
# Rotation about the origin, in the same last-row-offset layout as above.
rotating = [[math.cos(rotation), math.sin(rotation),0],
67+
[-math.sin(rotation),math.cos(rotation), 0],
68+
[0, 0, 1]]
69+
# Inverse of `translation`: moves the origin back to (tx, ty).
rtranslation = [[1, 0, 0],
70+
[0, 1, 0],
71+
[tx,ty,1]]
72+
# Compose translate -> rotate -> translate back.
# NOTE(review): presumably utils.matrixMultiply is a plain matrix product -- confirm against PyPDF2.
ctm = utils.matrixMultiply(translation, rotating)
73+
ctm = utils.matrixMultiply(ctm, rtranslation)
74+
75+
# Only the first two columns of each row are passed on, as a flat 6-element list.
return page.mergeTransformedPage(page2, [ctm[0][0], ctm[0][1],
76+
ctm[1][0], ctm[1][1],
77+
ctm[2][0], ctm[2][1]])
78+
5779
def overlay_hocr_pages(self, dpi, hocr_filenames, orig_pdf_filename):
5880

5981
logging.debug("Going to overlay following files onto %s" % orig_pdf_filename)
@@ -75,9 +97,19 @@ def overlay_hocr_pages(self, dpi, hocr_filenames, orig_pdf_filename):
7597
text_file = open(text_pg_filename, 'rb')
7698
text_files.append(text_file) # Save this to close after we write the final pdf
7799
text_pg = self.iter_pdf_page(text_file).next()
78-
orig_pg.mergePage(text_pg)
100+
orig_rotation_angle = int(orig_pg.get('/Rotate', 0))
101+
102+
if orig_rotation_angle != 0:
103+
logging.info("Original Rotation: %s" % orig_pg.get("/Rotate", 0))
104+
self.mergeRotateAroundPointPage(orig_pg, text_pg, orig_rotation_angle, text_pg.mediaBox.getWidth()/2, text_pg.mediaBox.getWidth()/2)
105+
106+
# None of these commands worked for me:
107+
#orig_pg.rotateCounterClockwise(orig_rotation_angle)
108+
#orig_pg.mergeRotatedPage(text_pg,text_rotation_angle)
109+
else:
110+
orig_pg.mergePage(text_pg)
111+
orig_pg.compressContentStreams()
79112
writer.addPage(orig_pg)
80-
#text_file.close()
81113

82114
pdf_dir, pdf_basename = os.path.split(orig_pdf_filename)
83115
basename = pdf_basename.split('.')[0]

requirements.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
pillow>=2.2
22
reportlab>=2.7
33
watchdog>=0.6.0
4-
pypdf2
4+
pypdf2>=1.23
55
evernote

0 commit comments

Comments
 (0)