-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathLiwc.py
158 lines (127 loc) · 4.12 KB
/
Liwc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
# -*- coding: utf-8 -*-
#### Class to provide data and methods to read LIWC dictionary
####
#### Author: Pedro Paulo Balage Filho
#### Version: 1.0
#### Date: 05/12/12
import codecs
# Class to provide data and methods to read LIWC dictionary
class LiwcReader(dict):
    """Reader for a LIWC ``.dic`` lexicon file.

    The instance itself is a ``dict`` that maps every lexicon entry (a word,
    or a stem ending in the wildcard ``*``) to the list of integer category
    codes listed for that entry in the file.

    Dictionary format:
    %
    1   funct
    2   pronoun
    ...
    125 affect
    126 posemo
    127 negemo
    128 anx
    129 anger
    130 sad
    ...
    %
    a   1   2   3   6   7   9   10  17  121 131 138 252 463
    aba 146
    abafa   125 127 129
    abafad* 125 127 129
    abafada 125 127 129
    abafadas    125 127 129
    abafado 125 127 129
    abafados    125 127 129
    """

    # Category codes that polarity() treats as positive / negative emotion.
    POSEMO_CODE = 126
    NEGEMO_CODE = 127

    def __init__(self, dict_file='Dictionaries/LIWC/LIWC2007_Portugues_win.dic'):
        """Load the lexicon stored in *dict_file*.

        Args:
            dict_file: path to the dictionary file; it is decoded as
                iso8859-15.

        Raises:
            ValueError: if the file does not start with ``%``, if the
                category table is malformed or never closed by a second
                ``%``, or if a word entry lists a non-integer category.
            KeyError: if a word entry uses a category code that the category
                table did not declare.
        """
        super(LiwcReader, self).__init__()
        # category name -> set of lexicon entries tagged with that category
        self._inverted_index = {}
        # category code -> category name
        self.meta = {}
        # ``with`` guarantees the handle is closed even when parsing fails.
        with codecs.open(dict_file, 'r', 'iso8859-15') as handle:
            lines = iter(handle)
            if next(lines, '').strip() != '%':
                raise ValueError('Dictionary file must start with %')
            self._read_meta(lines)
            self._read_entries(lines)
        self.sorted_dict = sorted(self)

    def _read_meta(self, lines):
        """Consume the category table up to and including the closing ``%``."""
        for line in lines:
            stripped = line.strip()
            if stripped == '%':
                return
            if not stripped:
                continue
            try:
                # Both the unpacking and int() signal bad input with
                # ValueError, so this is the exception to translate.
                code, gloss = line.split()
                self.meta[int(code)] = gloss
            except ValueError:
                raise ValueError('.dic file is not in LIWC format')
            self._inverted_index[gloss] = set()
        # The file ended before the category table was closed.
        raise ValueError('.dic file is not in LIWC format')

    def _read_entries(self, lines):
        """Consume the word section, filling the dict and the inverted index."""
        for line in lines:
            items = line.split()
            if not items:
                # Blank lines (typically a trailing one) carry no entry.
                continue
            word = items[0]
            try:
                categories = [int(x) for x in items[1:]]
            except ValueError:
                raise ValueError('.dic file is not in LIWC format.')
            self[word] = categories
            for cat in categories:
                self._inverted_index[self.meta[cat]].add(word)

    @property
    def inverted_index(self):
        """Dict mapping each category name to the set of entries tagged with it.

        Exposed as a property: the instance attribute of the same name used to
        shadow the accessor method, which made the method unreachable.
        """
        return self._inverted_index

    @inverted_index.setter
    def inverted_index(self, value):
        # Kept so code that assigned the attribute directly keeps working.
        self._inverted_index = value

    def vocabulary(self):
        """Return the set of all lexicon entries."""
        return set(self)

    def vocabulary_polar(self):
        """Return the set of lexicon entries whose polarity is not 0."""
        return set(key for key in self if self.polarity(key) != 0)

    def meta_table(self):
        """Return the category table as a dict of code -> category name."""
        return self.meta

    def polarity(self, word):
        """Return the polarity of *word*.

        Returns:
            1 if the matching entry lists the positive-emotion category,
            -1 if it lists the negative-emotion category (and not the
            positive one), 0 if it lists neither, and None when *word* has no
            matching entry in the lexicon.
        """
        entry = self.find_word(word)
        if entry is None:
            return None
        categories = self[entry]
        if self.POSEMO_CODE in categories:
            return 1
        if self.NEGEMO_CODE in categories:
            return -1
        return 0

    def sorted_dictionary(self):
        """Return the list of lexicon entries sorted at load time."""
        return self.sorted_dict

    def find_word(self, word):
        """Find the lexicon entry that matches *word*.

        An exact entry wins. Otherwise the longest wildcard stem (an entry
        ending in ``*``) that is a prefix of *word* is returned, with the
        wildcard included.

        Returns:
            The matching lexicon entry, or None if nothing matches.
        """
        if not word:
            return None
        if word in self:
            return word
        # Probe every prefix, longest first. Looking only at the sorted
        # neighbour of the word misses a stem whenever a longer literal entry
        # sorts between the stem and the word.
        for end in range(len(word), 0, -1):
            candidate = word[:end] + '*'
            if candidate in self:
                return candidate
        return None

    def print_statistics(self):
        """Print entry counts per category and the posemo/negemo overlap."""
        # Single-argument print() calls behave the same on Python 2 and 3.
        print(':: Meta Categories e number of entries ::')
        for cat in sorted(self._inverted_index):
            padding = (20 - len(cat)) * ' '
            print('%s %s %s' % (cat, padding, len(self._inverted_index[cat])))
        print('')
        posemo = self._inverted_index.get('posemo', set())
        negemo = self._inverted_index.get('negemo', set())
        overlap = posemo.intersection(negemo)
        print(len(overlap))
        print('posemo e negemo interserction: ')
        for word in sorted(overlap):
            print(word)

    def get_name(self):
        """Return the dictionary name, 'Liwc'."""
        return 'Liwc'