import os
#for reading the pdf
from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from nltk.corpus import stopwords
from nltk.collocations import TrigramCollocationFinder
from nltk.collocations import QuadgramCollocationFinder
#for counting the sentences and words
import nltk
import collections
from nltk import word_tokenize
from collections import Counter
#for counting most frequent words
import re
def convert(filename, pages=None):
    # Restrict extraction to the requested page numbers, or all pages if none are given
    if not pages:
        pagenums = set()
    else:
        pagenums = set(pages)
    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)
    infile = open(filename, 'rb')
    for page in PDFPage.get_pages(infile, pagenums):
        interpreter.process_page(page)
    infile.close()
    converter.close()
    text = output.getvalue()
    output.close()
    return text
pdfFiles = []
dir_name = r'C:\Phoenix\Documents from Bryan'
for filename in os.listdir(dir_name):
    # Match .pdf in any letter case instead of enumerating every combination
    if filename.lower().endswith('.pdf'):
        pdfFiles.append(filename)
        text = convert(os.path.join(dir_name, filename))
        sentence_count = len(nltk.tokenize.sent_tokenize(text))
        word_count = len(nltk.tokenize.word_tokenize(text))
        print('\nThe file ', filename, ' has ', word_count, 'words and ', sentence_count, ' sentences in it.\n')
        # use findall for counting most common words, quadgrams, trigrams
        all_text = re.findall(r'\w+', text)
        all_text = [word.lower() for word in all_text]
        filtered_words = list(filter(lambda word: word not in stopwords.words('english') and word.isalpha(), all_text))
        word_counts = Counter(filtered_words).most_common(20)
        print('The 20 most commonly occurring words in this file are: \n\n', word_counts)
        print('\nThe 10 most common 3-word combinations appearing in this file are: \n')
        trigram = TrigramCollocationFinder.from_words(filtered_words)
        print(sorted(trigram.ngram_fd.items(), key=lambda t: (-t[1], t[0]))[:10])
        fourgrams = QuadgramCollocationFinder.from_words(filtered_words)
        print('\nThe 10 most common 4-word combinations appearing in this file are: \n')
        print(sorted(fourgrams.ngram_fd.items(), key=lambda t: (-t[1], t[0]))[:10])
        print('----------------------------------------------------------------------------------------------------')
I encountered the same issue. It seems the latest version does not have the pdfpage module any more.
Try pip3 install pdfminer.six; that worked for me.
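If it helps, a quick way to confirm that the fork on your path is the one providing pdfminer.pdfpage is just to attempt the import the script needs. This is only a sanity-check sketch; the exact version string your install reports may look different:

import pdfminer
# If this import succeeds, the fork that ships pdfminer.pdfpage
# (i.e. pdfminer.six) is the one Python is picking up.
from pdfminer.pdfpage import PDFPage

# pdfminer.six normally exposes a date-style version string (assumption about
# the exact format); fall back gracefully if the attribute is missing.
print(getattr(pdfminer, '__version__', 'version attribute not found'))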
I am using Anaconda and used conda-forge to install pdfminer3k.
Error:
runfile('C:/Phoenix/Python/listpdfsandcountwords.py', wdir='C:/Phoenix/Python')
Traceback (most recent call last):
File "", line 1, in
runfile('C:/Phoenix/Python/listpdfsandcountwords.py', wdir='C:/Phoenix/Python')
File "C:\Work\lib\site-packages\spyder\utils\site\sitecustomize.py", line 710, in runfile
execfile(filename, namespace)
File "C:\Work\lib\site-packages\spyder\utils\site\sitecustomize.py", line 101, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "C:/Phoenix/Python/listpdfsandcountwords.py", line 14, in <module>
from pdfminer.pdfpage import PDFPage
ModuleNotFoundError: No module named 'pdfminer.pdfpage'
Conda Environment:
(C:\Work) C:\Users\dparamanand>conda info
Current conda install:
Code:
# -*- coding: utf-8 -*-
"""
Created on Fri Sep 29 10:43:29 2017
@author: dpar0004
"""
import os
#for reading the pdf
from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from nltk.corpus import stopwords
from nltk.collocations import TrigramCollocationFinder
from nltk.collocations import QuadgramCollocationFinder
#for counting the sentences and words
import nltk
import collections
from nltk import word_tokenize
from collections import Counter
#for counting most frequent words
import re
def convert(filename, pages=None):
    if not pages:
        pagenums = set()
    else:
        pagenums = set(pages)
    # ... rest of convert() is the same as in the script in the original post

pdfFiles = []
dir_name = r'C:\Phoenix\Documents from Bryan'
for filename in os.listdir(dir_name):
    # Match .pdf in any letter case instead of enumerating every combination
    if filename.lower().endswith('.pdf'):
        pdfFiles.append(filename)
        text = convert(os.path.join(dir_name, filename))
        sentence_count = len(nltk.tokenize.sent_tokenize(text))
        word_count = len(nltk.tokenize.word_tokenize(text))
        print('\nThe file ', filename, ' has ', word_count, 'words and ', sentence_count, ' sentences in it.\n')
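If you would rather stay on pdfminer3k than switch to pdfminer.six, note that pdfminer3k does not have pdfminer.pdfpage at all; as far as I recall, pages are iterated through PDFDocument instead. Below is a rough, untested sketch of convert() along those lines; the module locations and method names are from my memory of the old pdfminer API, so please double-check them against the pdfminer3k docs:

from io import StringIO
# In pdfminer3k (and the old pdfminer it was ported from) there is no
# pdfminer.pdfpage; PDFDocument lives in pdfminer.pdfparser instead.
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams

def convert_pdfminer3k(filename):
    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)
    with open(filename, 'rb') as infile:
        parser = PDFParser(infile)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')            # empty password
        for page in doc.get_pages():  # replaces PDFPage.get_pages(...)
            interpreter.process_page(page)
    converter.close()
    text = output.getvalue()
    output.close()
    return text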