# clean_tweet.py
from nltk.tokenize import TweetTokenizer
from nltk.stem.lancaster import LancasterStemmer
from nltk.corpus import stopwords
import re

# This file contains the helper functions that strip useless information from tweets.


# Remove the URLs inside a line of text.
def remove_urls(vTEXT):
    # Strip anything that looks like an http/https URL.
    vTEXT = re.sub(r'https?://\S+', '', vTEXT, flags=re.MULTILINE)
    return vTEXT
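
# A quick illustrative check (hypothetical input, not from the original project):
#   remove_urls('see https://t.co/abc now')  ->  'see  now'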

# Tokenize each tweet; returns a list of token lists, one per tweet.
def tokenizate(tweets):
    tknzer = TweetTokenizer(strip_handles=True, reduce_len=True, preserve_case=True)
    tokens = []
    for tweet in tweets:
        tweet = remove_urls(tweet)
        token = tknzer.tokenize(tweet)
        if token:  # tokenize() returns a list; skip tweets that produce no tokens
            tokens.append(token)
    return tokens
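
# Illustrative call, mirroring the TweetTokenizer example from the NLTK docs
# (the input string is hypothetical here):
#   tokenizate(['@remy: This is waaaaayyyy too much for you!!!!!!'])
#   -> [[':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!']]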

# Remove English stop words and punctuation tokens from a tokenized sentence.
def stop_words_removal(sentence):
    english_stopwords = stopwords.words('english')
    english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%',
                            '-', '_', '__', '___', '"', '/', '...', '∞', "'", 'ö', '\\']
    content = []
    for word in sentence:
        if word.lower() not in english_stopwords and word not in english_punctuations:
            word = re.sub(r'[^\w]', '', word)  # drop non-word characters inside the token
            word = re.sub(r'\w*\d\w*', '', word).strip()  # drop any token that contains a digit
            word = word.encode('ascii', 'ignore').decode('ascii')  # drop non-ASCII characters
            if word and not word.isdigit():
                content.append(word)
    return content
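
# Rough example (the exact output depends on your NLTK stopword list):
#   stop_words_removal(['This', 'is', 'waaayyy', 'too', 'much', '!'])
#   -> ['waaayyy', 'much']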

# Stem every word in a sentence, e.g. 'loving' and 'love' both stem to 'lov'.
def stemming(sentence):
    st = LancasterStemmer()
    content = []
    for word in sentence:
        content.append(st.stem(word))
    return content
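
# Example, matching the comment above:
#   stemming(['loving', 'love'])  ->  ['lov', 'lov']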

# Clean each tokenized tweet: remove its stop words, then stem what remains.
# The final result is called tweet_clean.
def clean_tweet(tweets_tokenize):
    tweet_clean = []
    for tweet in tweets_tokenize:
        # remove all the stop words in this tweet
        tweet_svr = stop_words_removal(tweet)
        # stem each remaining word
        tweet_s = stemming(tweet_svr)
        # keep only tweets that are non-empty after cleaning
        if tweet_s:
            tweet_clean.append(tweet_s)
    return tweet_clean
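
# Example (hypothetical token list):
#   clean_tweet([['I', 'am', 'loving', 'this', '!']])  ->  [['lov']]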

# Join the words of each cleaned tweet back into a single string per tweet.
# This representation is the input for TF-IDF.
def combine(clean_tweets):
    tweets_combine = []
    for tweet_list in clean_tweets:
        tweets_combine.append(' '.join(tweet_list))
    return tweets_combine
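
# A minimal end-to-end sketch with made-up tweets (not part of the original
# project). It assumes the NLTK 'stopwords' corpus is available, e.g. after
# running nltk.download('stopwords').
if __name__ == '__main__':
    sample_tweets = [
        '@remy: This is waaaaayyyy too much for you!!!!!!',
        'Loving the new release! Notes at https://example.com/notes',
    ]
    cleaned = clean_tweet(tokenizate(sample_tweets))
    print(cleaned)           # token lists per tweet; exact stems depend on the Lancaster rules
    print(combine(cleaned))  # one space-joined string per tweet, ready for TF-IDF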