-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtwitter_mecab.py
97 lines (71 loc) · 2.33 KB
/
twitter_mecab.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# coding: utf-8
import json;
import urllib;
import MeCab;
import re;
import twitter_gender;
# Endpoint queried once per account to fetch that account's recent tweets.
# NOTE(review): this looks like a v1 REST endpoint -- confirm it is still served.
url = "http://api.twitter.com/1/statuses/user_timeline.json"

# Morphological analyser shared by my_mecab().
tagger = MeCab.Tagger('-Ochasen')

# Output: one "surface,feature,trend" row per distinct token, written at the
# end of the run.
f = open("twitter_mecab.csv", "w")
f.write("original,hinsi,trend\n")

# Output: one row per account with its accumulated phrase polarity.
pf = open("twitter_phrase.csv", "w")
pf.write("id,pol\n")

# Phrase -> polarity lookup built from the first two comma-separated columns
# of the dictionary file. The value is stored verbatim (it may still carry a
# trailing newline); the consumer strips it before converting to int.
phrase_dic = {}
with open("../phrasePolDic.csv", "r") as phrase_file:
    for line in phrase_file:
        fields = line.split(',')
        if len(fields) < 2:
            # Blank or comma-less row: there is no value column to store.
            continue
        phrase_dic[fields[0]] = fields[1]

# Mapping iterated later as (account id, gender) pairs.
# NOTE(review): gender is presumably "m" or "f", since it indexes the
# counters below -- confirm against twitter_gender.get_genderlist().
twitter_list = twitter_gender.get_genderlist()

# Total number of tokens seen per gender.
count = {"m": 0, "f": 0}

# Per-gender token frequencies, plus the derived "trend" table filled in
# after all accounts have been processed.
word_count = {"m": {}, "f": {}, "trend": {}}
def my_mecab(text):
    """Tokenise *text* with the module-level MeCab ``tagger``.

    The text is UTF-8 encoded before it is parsed. Every node whose surface
    is non-empty contributes one ``"surface,field"`` string, where *field* is
    the first comma-separated field of the node's feature string (the output
    CSV header labels this column "hinsi").

    Returns the list of those strings, in node order.
    """
    def _walk(head):
        # Follow the linked list of nodes until it runs out.
        while head:
            yield head
            head = head.next

    utf8_text = text.encode('utf-8')
    tokens = []
    for item in _walk(tagger.parseToNode(utf8_text)):
        raw_surface = item.surface
        if raw_surface == "":
            # Nodes without a surface carry no token text; skip them.
            continue
        raw_feature = item.feature
        shown = raw_surface.decode('utf-8')
        leading_field = raw_feature.decode('utf-8').split(',')[0]
        tokens.append("%s,%s" % (shown, leading_field))
    return tokens
# Main pass: fetch each account's timeline, tally tokens per gender, and
# accumulate a per-account phrase polarity. Afterwards derive the trend
# score of every token and write it out.
# The try/finally guarantees both output files are closed (and flushed)
# even if a request or a parse step fails part-way through.
try:
    for tid, gender in twitter_list.items():
        print(tid)
        param = {"screen_name": tid, "count": "200"}
        res = urllib.urlopen(url + "?" + urllib.urlencode(param))
        try:
            tweet_json_list = json.load(res)
        finally:
            # Release the HTTP connection as soon as the body is parsed.
            res.close()

        # Polarity is summed over all of this account's tweets.
        phrase_pol = 0
        for tweet_json in tweet_json_list:
            tweet = tweet_json["text"]
            if "http" in tweet:
                # Tweets containing a link are excluded from the analysis.
                continue
            for word in my_mecab(tweet):
                count[gender] += 1
                word_count[gender][word] = word_count[gender].get(word, 0) + 1
                # word is "surface,feature"; the dictionary is keyed by the
                # UTF-8 encoded surface form.
                origin = word.split(",")[0].encode('utf-8')
                if origin in phrase_dic:
                    phrase_pol += int(phrase_dic[origin].rstrip('\n'))
        pf.write("%s,%d\n" % (tid, phrase_pol))

    # trend = (relative frequency among "f") - (relative frequency among "m").
    # A token seen for only one gender has relative frequency 0 for the
    # other, so "m"-only tokens come out negative.
    trend = word_count["trend"]
    for key, value in word_count["f"].items():
        trend[key] = float(value) / count["f"]
    for key, value in word_count["m"].items():
        trend[key] = trend.get(key, 0.0) - float(value) / count["m"]

    # Emit tokens ordered by ascending trend score.
    for key, value in sorted(trend.items(), key=lambda x: x[1]):
        f.write(("%s,%f\n" % (key, value)).encode("utf-8"))
finally:
    f.close()
    pf.close()