-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathscript.py
88 lines (61 loc) · 2.09 KB
/
script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#! /usr/bin/python
import os
import sys
import json
import re
import markovgenerator
# Debug switch; not referenced by any code in the visible part of this file.
DEBUG = True
# Directory that main() hands to process_archive() to search for tweet files.
PATH = "."
# File that collects the cleaned tweet text; parse_files() reuses it when it
# already exists, and main() passes it to markovgenerator.markov_it().
TWEETLIST = "justTheTweets.txt"
def process_archive(path):
    ''' Walks a Twitter archive directory and returns the list of files
    containing tweets.

    Args:
        path: directory that is searched recursively.

    Returns:
        A list with one path per file whose name ends in ``YYYY_MM.js``
        (four digits, underscore, two digits), each joined with the
        directory it was found in.

    Raises:
        SystemExit: with status 1 when ``path`` is not a directory.
    '''
    if not os.path.isdir(path):
        print("Path '%s' not found." % path)
        sys.exit(1)

    # Raw string, escaped dot and end anchor: names such as "2013_01.json"
    # or "2013_01xjs" must not be mistaken for monthly tweet files.
    month_file = re.compile(r'\d{4}_\d{2}\.js$')

    tweetfiles = []
    # index the files & ignore non-.js files
    for (root, _, files) in os.walk(path):
        for filename in files:
            if month_file.search(filename):
                # Two spaces keep the output identical to the original
                # comma-separated print statement.
                print("joining  %s %s" % (root, filename))
                tweetfiles.append(os.path.join(root, filename))

    # Count the files actually collected, not the contents of whichever
    # directory os.walk happened to visit last.
    print("* Found %s months of tweets." % len(tweetfiles))
    return tweetfiles
def parse_files(files):
    ''' Writes the text of the tweets in all given files to TWEETLIST.

    If TWEETLIST already exists it is reused and nothing is written.

    Args:
        files: list of paths to tweet files. The first line of each file
            is skipped and the remainder is parsed by parse_tweets().
    '''
    if os.path.isfile(TWEETLIST):
        print("* Using existing file %s." % TWEETLIST)
        return

    if not files:
        # Nothing to write; avoid leaving an empty TWEETLIST behind that a
        # later run would pick up as an "existing file".
        return

    print("* Writing tweets to %s." % TWEETLIST)
    # Open the output once, outside the loop: reopening it with 'w' for
    # every input file truncated it each time and kept only the last file.
    with open(TWEETLIST, 'w') as out:
        for filename in files:
            with open(filename, 'r') as src:
                src.readline()  # ignore the first line; it's gibberish
                text = parse_tweets(src)
            if text:
                # The trailing newline keeps the last tweet of one file
                # from running into the first tweet of the next.
                out.write(text + "\n")
def clean_tweets(line):
    ''' Removes mentions, replies, RTs, and links from a tweet.

    The text is split on any run of whitespace (not only single spaces),
    so a mention or link that follows a newline or tab is recognised as
    its own token instead of staying glued to the previous word. As a
    consequence the result is always a single line with single spaces.

    Args:
        line: text of one tweet.

    Returns:
        The remaining tokens joined by single spaces; an empty string
        when nothing is left.
    '''
    kept = [
        token for token in line.split()
        # split() never yields empty tokens, so token[0] is always safe.
        if not (token[0] == '@' or token == 'RT'
                or re.search(r"https?:", token))
    ]
    return ' '.join(kept)
def parse_tweets(f):
    ''' Returns the cleaned text of all tweets in a given file object.

    Args:
        f: open file object positioned at the start of a JSON array of
            tweet objects, each of which has a 'text' entry.

    Returns:
        The cleaned tweets joined by newlines. Tweets that are empty
        after cleaning are left out so the result has no blank lines.
    '''
    tweets = []
    for tweet in json.load(f):
        # Drop non-ASCII characters, then decode back to text so that
        # clean_tweets() receives a string on Python 3 as well (encode()
        # alone returns bytes there).
        payload = tweet['text'].encode('ascii', 'ignore').decode('ascii')
        payload = clean_tweets(payload)
        if payload.strip():
            tweets.append(payload)
    return "\n".join(tweets)
def main():
    ''' Command-line entry point.

    Expects exactly one argument, the number of tweets. It collects the
    tweet files below PATH, writes their text to TWEETLIST and then hands
    TWEETLIST and the requested number to markovgenerator.markov_it().

    Raises:
        SystemExit: with status 1 (after printing the usage line) when
            the argument is missing, superfluous or not an integer.
    '''
    usage = 'usage: $ python %s <number_of_tweets>' % sys.argv[0]
    if len(sys.argv) != 2:
        print(usage)
        sys.exit(1)

    # Validate the count before doing any file work so that a typo gives
    # the usage message instead of a ValueError traceback at the very end.
    try:
        count = int(sys.argv[1])
    except ValueError:
        print(usage)
        sys.exit(1)

    filelist = process_archive(PATH)
    parse_files(filelist)
    markovgenerator.markov_it(TWEETLIST, count)


# Run only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()