Skip to content

Commit 1c7f252

Browse files
committed
Add requirements
1 parent 961a208 commit 1c7f252

File tree

4 files changed

+42
-0
lines changed

4 files changed

+42
-0
lines changed

__pycache__/pdfminer.cpython-36.pyc

1.08 KB
Binary file not shown.

pdfminer.py

+41
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
# Alternate approach using pdfminer
2+
3+
try:
4+
from cStringIO import StringIO
5+
except ImportError:
6+
from io import BytesIO
7+
8+
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
9+
from pdfminer.converter import TextConverter
10+
from pdfminer.layout import LAParams
11+
from pdfminer.pdfpage import PDFPage
12+
import re
13+
14+
def convert(fname, pages=None, out_path="output.txt"):
    """Extract the text of a PDF, print it, and save a whitespace-collapsed copy.

    The raw extracted text is printed to stdout; the copy written to
    ``out_path`` has every run of two or more whitespace characters
    replaced by a single space.

    Args:
        fname: Path of the PDF file to read.
        pages: Optional iterable of page numbers handed to
            ``PDFPage.get_pages``. When it is ``None`` or empty an empty
            set is passed (pdfminer is expected to treat that as "all
            pages" -- confirm against the installed version).
        out_path: Path of the text file to write. Defaults to
            ``"output.txt"``, the location the original script used.
    """
    # Imported locally so the name is always defined: the module-level
    # fallback only imports BytesIO, which left StringIO undefined on
    # Python 3 and made this function fail with NameError.
    from io import StringIO

    pagenums = set(pages) if pages else set()

    with StringIO() as output:
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            # open() replaces the Python 2-only file() builtin, and the
            # with-block closes the PDF even if a page fails to parse.
            with open(fname, "rb") as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        # Read the buffer after the converter is closed (same order as the
        # original) and before the StringIO itself is closed.
        text = output.getvalue()

    print(text)

    # write to .txt
    with open(out_path, "w") as text_file:
        text_file.write(re.sub(r"\s\s+", " ", text))
40+
41+
# Runs the conversion on "input.pdf" as soon as this module is executed or imported.
convert("input.pdf")

ocr.py pyocr.py

File renamed without changes.

requirements.txt

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
pdfminer.six

0 commit comments

Comments
 (0)