-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathweb.py
67 lines (50 loc) · 1.79 KB
/
web.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# require
import sys
import nltk
import os
import operator
import re
# The corpus root is the current directory (originally ~/cs195/web).
# Show the user which text files are available to analyse.
files = os.listdir('.')
print("Available text files")
# The loop variable is deliberately not called `file`, which is a builtin
# in Python 2. os.listdir() returns entries in arbitrary order, so sort
# them to get a stable, readable listing.
for entry in sorted(files):
    if entry.endswith(".txt"):
        print(entry)
# Ask which file to analyse and load it as an nltk.Text.
# Alternative input method (disabled): take both values from the command line.
#file_name = sys.argv[1]
#search_word = sys.argv[2]
file_name = raw_input("\nChoose one of these files: ").strip()
# Sanity check: keep asking until the name refers to an existing file, so a
# typo is caught here instead of surfacing later as an error inside NLTK.
while not os.path.isfile(file_name):
    print("'{0}' is not an existing file.".format(file_name))
    file_name = raw_input("Choose one of these files: ").strip()
print("\nThe file that will be examined is {0}".format(file_name))

from nltk.corpus import PlaintextCorpusReader
corpus_root = '.'
# The file name is passed inside a list on purpose: per the NLTK corpus
# reader documentation a bare string is interpreted as a regular expression
# over file ids, so "a.txt" could also select e.g. "abtxt".
corpus_reader = PlaintextCorpusReader(corpus_root, [file_name])
# search_text is the token sequence every later step works on.
search_text = nltk.Text(corpus_reader.words())
# KWIC (keyword in context) concordance: prompt for a word, then let
# nltk.Text print the matches with width 80, capped at 1000 lines.
search_word = raw_input(
    "Specify a search word for a keyword in context concordance list: ")
search_text.concordance(search_word, width=80, lines=1000)
# Remove stopwords from search_text (which becomes a plain list of words).
# NOTE(review): the 'bible' word list is presumably a custom file placed in
# the stopwords corpus directory noted below -- confirm it is installed.
#/Users/barrybandstra/nltk_data/corpora/stopwords
# Import the corpus under an alias so that `stopwords` can hold the word
# list without rebinding the name of the module it was loaded from.
from nltk.corpus import stopwords as stopwords_corpus
stopwords = stopwords_corpus.words('bible')
# Build the set once: membership is then a constant-time lookup per token
# instead of a scan through the whole stopword list.
stopword_set = set(stopwords)
search_text = [word for word in search_text
               if word.lower() not in stopword_set]
# Write the filtered words to output.txt, one word per line.
# The newline is concatenated onto each word before writing; a trailing
# `, "\n"` after write() only builds a throwaway tuple and writes nothing.
# The with-statement closes the file even if a write raises.
with open("output.txt", "w") as output_file:
    output_file.writelines(word + "\n" for word in search_text)
# Frequency distribution vocabulary list; fd maps each word to its count.
fd = nltk.FreqDist(search_text)

# Ask for the minimum frequency, re-prompting until the answer parses as an
# integer instead of crashing with a ValueError traceback.
fd_freq = None
prompt = "\nFrequency distribution of terms.\nMinimum frequency: "
while fd_freq is None:
    try:
        fd_freq = int(raw_input(prompt))
    except ValueError:
        print("Please enter a whole number.")
        prompt = "Minimum frequency: "

print("Frequency distribution of terms in {0} sorted by frequency".format(file_name))
# http://stackoverflow.com/questions/613183/sort-a-python-dictionary-by-value
# match() anchors at the start of the string, so this recognises any token
# that begins with a digit (chapter/verse numbers).
leading_digits = re.compile(r'\d+')
for key, value in sorted(fd.items(), key=operator.itemgetter(1), reverse=True):
    # Items are in descending order of count, so once one falls below the
    # threshold every remaining item does too.
    if value < fd_freq:
        break
    # Skip chapter/verse numbers.
    if leading_digits.match(key):
        continue
    # %-formatting keeps working whether the token is a byte string or a
    # unicode string under Python 2.
    print("%s %s" % (key, value))
# Vocabulary: the distinct words remaining in search_text, in sorted order.
vocab = list(set(search_text))
vocab.sort()
# The label (which already ends in a space) and the list are joined with a
# single space, so the printed line has two spaces after the colon.
print(" ".join(["Vocabulary: ", str(vocab)]))