# Hugging Face tokenizer and sequence-classification model for the checkpoint
# named by MODEL (presumably defined earlier in this file — not visible in
# this chunk; confirm).
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model_bert = AutoModelForSequenceClassification.from_pretrained(MODEL)
# Silence the Hugging Face tokenizers fork-parallelism warning.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Emoticons that must be kept intact during lexicon processing; terms
# containing these are handled without word boundaries elsewhere in this file.
EMOJIS_TO_PRESERVE = {
    "(:", "(;", "):", "/:", ":(", ":)", ":/", ";)"
}
25
28
26
29
# Check if embeddings exist
27
30
def check_embeddings (chat_data : pd .DataFrame , vect_path : str , bert_path : str , need_sentence : bool ,
@@ -86,25 +89,11 @@ def check_embeddings(chat_data: pd.DataFrame, vect_path: str, bert_path: str, ne
86
89
# Read in the lexicons (helper function for generating the pickle file)
87
90
def read_in_lexicons(directory, lexicons_dict):
    """
    Read every lexicon file in a directory into a dictionary of regexes.

    For each non-hidden file in ``directory``, builds a single "|"-joined
    regular expression (via ``sort_words``) from the file's terms and stores
    it in ``lexicons_dict`` under the filename without its ``.txt`` extension.

    :param directory: Path-like directory containing the lexicon .txt files
    :param lexicons_dict: Mutable mapping updated in place (name -> regex)
    :return: None; ``lexicons_dict`` is mutated
    """
    for filename in os.listdir(directory):
        # Skip hidden files such as .DS_Store
        if filename.startswith("."):
            continue
        with open(directory / filename, encoding="mac_roman") as lexicons:
            # Strip only a literal trailing ".txt"; the previous
            # re.sub('.txt', ...) treated '.' as a wildcard and could mangle
            # any filename containing e.g. "atxt".
            clean_name = re.sub(r'\.txt$', '', filename)
            lexicons_dict[clean_name] = sort_words(lexicons)
108
97
109
98
def generate_lexicon_pkl ():
110
99
"""
@@ -172,38 +161,80 @@ def fix_abbreviations(dicTerm: str) -> str:
172
161
else :
173
162
return dicTerm
174
163
175
def is_valid_term(dicTerm: str, emojis=None) -> bool:
    """
    Check if a dictionary term is valid.

    This function returns True if the term matches the regex pattern and
    False otherwise.

    The regex pattern matches:
    - Alphanumeric characters (a-zA-Z0-9)
    - Valid symbols: -, ', *, /
    - The * symbol can only appear once, at the end of a word
    - Emojis are valid only when they appear alone
    - The / symbol can only appear once, after alphanumeric characters
    - Spaces are allowed between valid words

    :param dicTerm: The dictionary term to validate
    :type dicTerm: str
    :param emojis: Emoticons accepted as stand-alone terms; defaults to the
        module-level EMOJIS_TO_PRESERVE when None (backward compatible).
    :type emojis: set, optional

    :return: True if the term is valid, False otherwise
    :rtype: bool
    """
    # Fall back to the module-level emoji set so existing call sites are
    # unchanged.
    if emojis is None:
        emojis = EMOJIS_TO_PRESERVE
    emoji_pattern = '|'.join(re.escape(emoji) for emoji in emojis)
    # One "word" is either alphanumerics (plus - and ') optionally ending in
    # '*' or a '/'-suffix, or a lone emoji optionally followed by '*'; words
    # may repeat, separated by single spaces.
    alphanumeric_pattern = (
        fr"^([a-zA-Z0-9\-']+(\*|\/[a-zA-Z0-9\*]*)?|({emoji_pattern})\*?)( [a-zA-Z0-9\-']+(\*|\/[a-zA-Z0-9\*]*)?)*$"
    )
    return bool(re.match(alphanumeric_pattern, dicTerm))
206
191
192
def sort_words(lexicons: list) -> str:
    """
    Compile an iterable of lexicon terms into one "|"-joined regex string.

    Each term is escaped and wrapped in word boundaries (emoji-containing
    terms are escaped but get no boundaries), then the terms are sorted by
    their original length in descending order, with hyphenated terms placed
    before non-hyphenated ones, so longer/more specific patterns match first.

    :param lexicons: Iterable of dictionary terms (e.g. lines of a file)
    :type lexicons: list

    :return: A single regular expression alternating over all terms
    :rtype: str
    """
    hyphenated_words = []
    non_hyphenated_words = []
    for lexicon in lexicons:
        lexicon = lexicon.strip()
        lexicon = lexicon.replace("\n", "")
        if lexicon == '':
            continue
        # Sort key: length of the raw term, before any regex escaping.
        length = len(lexicon)
        # no word boundaries for emojis
        if any(emoji in lexicon for emoji in EMOJIS_TO_PRESERVE):
            lexicon = lexicon.replace('(', r'\(').replace(')', r'\)')  # .replace('/', r'\/')
        else:
            lexicon = lexicon.replace('(', r'\(').replace(')', r'\)')
            word_boundaries = r"\b", r"\b"
            if lexicon[-1] == "*":
                # Collapse runs of '*' (e.g. '**') into one wildcard marker.
                pattern = re.compile(r'\*+')
                lexicon = pattern.sub('*', lexicon)
                # NOTE(review): a term that is just "*" would make this
                # lexicon[-2] access raise IndexError — confirm inputs are
                # pre-validated.
                if not lexicon[-2].isalnum():
                    # \b does not work next to non-word chars; use lookarounds.
                    word_boundaries = r"(?<!\w)", r"(?!\w)"
                lexicon = lexicon.replace("*", r"\S*")
            elif not lexicon[-1].isalnum():
                word_boundaries = r"(?<!\w)", r"(?!\w)"
            # str.join on a 2-tuple: prefix + lexicon + suffix.
            lexicon = lexicon.join(word_boundaries)
        if '-' in lexicon:
            hyphenated_words.append((lexicon, length))
        else:
            non_hyphenated_words.append((lexicon, length))
    # Longest-first within each group; hyphenated group goes first overall.
    hyphenated_words.sort(key=lambda x: x[1], reverse=True)
    non_hyphenated_words.sort(key=lambda x: x[1], reverse=True)
    sorted_words = hyphenated_words + non_hyphenated_words
    sorted_words = [lexicon for lexicon, _ in sorted_words]
    return '|'.join(sorted_words)
237
+
207
238
def load_liwc_dict(dicText: str) -> dict:
    """
    Loads up a dictionary that is in the LIWC 2007/2015 format.

    This function reads the content of a LIWC dictionary file in the official
    format, and converts it to a dictionary in lexicon: regular expression
    format. We assume the dicText has two parts: the header, which maps
    numbers to "category names," and the body, which maps words in the
    lexicon to different category numbers, separated by '%'.
    Below is an example:
    '''
    %
    1   function
    2   pronoun
    3   ppron
    %
    again   1 2
    against 1 2 3
    '''
    Note that the elements in each line are separated by tab characters.

    :param dicText: The content of a .dic file
    :type dicText: str

    :return: Mapping of category name -> "|"-joined regex over its lexicons
    :rtype: dict

    :raises ValueError: If the file lacks two '%' separators or a header
        line is not "<number>\\t<name>".
    """
    dicSplit = dicText.split('%', 2)
    # A valid file has exactly two '%' separators (header sits between them).
    if len(dicSplit) != 3:
        raise ValueError("Invalid dictionary file.")
    dicHeader, dicBody = dicSplit[1], dicSplit[2]

    # Read the header: each non-empty line maps a category number to a name.
    catNameNumberMap = {}
    for line in dicHeader.splitlines():
        if line.strip() == '':
            continue
        lineSplit = line.strip().split('\t')
        # Expected header format: "1<TAB>function"
        if len(lineSplit) != 2 or not lineSplit[0].isdigit():
            raise ValueError("Invalid dictionary file.")
        catNameNumberMap[lineSplit[0]] = lineSplit[1]

    # Read the body: each line is a term followed by its category numbers.
    dicCategories = {}
    for line in dicBody.splitlines():
        lineSplit = line.strip().split('\t')
        # Skip malformed lines (a term with no category numbers). Fully empty
        # lines (lineSplit == ['']) fall through and are dropped below, as in
        # the original control flow.
        if lineSplit != [''] and len(lineSplit) < 2:
            continue
        catNums = lineSplit[1:]
        # Normalize internal whitespace and case, then expand abbreviations.
        lexicon = fix_abbreviations(dicTerm=' '.join(lineSplit[0].lower().strip().split()))
        lexicon = lexicon.strip()
        if lexicon == '':
            continue
        if not is_valid_term(lexicon):
            warnings.warn(f"WARNING: invalid lexicon: {lexicon}, skipped")
            continue

        for catNum in catNums:
            # NOTE(review): an unknown category number raises KeyError here —
            # confirm this is the intended failure mode for malformed bodies.
            cat = catNameNumberMap[catNum]
            # setdefault replaces the original if/else append dance.
            dicCategories.setdefault(cat, []).append(lexicon)

    # Compile each category's term list into a single regex alternation.
    for cat, lexicons in dicCategories.items():
        dicCategories[cat] = sort_words(lexicons)
    return dicCategories
261
306
262
307
def generate_certainty_pkl ():
0 commit comments