-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcluster.py
64 lines (59 loc) · 2.75 KB
/
cluster.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import csv
import pandas as pd
from tqdm import tqdm
# This file handles the clustering of all the tweets.
# generate_cluster groups the tweets by cluster and writes one CSV file per cluster. tweets_clean is the tweet data
# we already have, cluster_number is the number of clusters we need for that data, cluster_labels is a list holding
# the cluster label of each tweet, and name is the prefix used for the names of the generated files.
def generate_cluster(tweets_clean, cluster_number, cluster_labels, name):
    """Group the tweets by cluster label and write one CSV file per cluster.

    Args:
        tweets_clean: Indexable collection of tweet records; record ``k`` is
            an iterable of fields and belongs to ``cluster_labels[k]``.
        cluster_number: Number of clusters, i.e. how many groups (and files)
            are produced.
        cluster_labels: Cluster label of every tweet. A tweet labelled ``c``
            is stored in group ``c - 1``, so labels are presumably 1-based
            (1..cluster_number) -- TODO confirm against the caller. Note that
            a label of 0 would wrap around to the last group.
        name: Prefix of the output file names.

    Returns:
        A list of ``cluster_number`` lists; entry ``k`` holds the rows (each a
        list of fields) of the tweets whose label is ``k + 1``.

    Raises:
        IndexError: If a label falls outside the available groups or
            ``tweets_clean`` has fewer records than ``cluster_labels``.
    """
    groups = [[] for _ in range(cluster_number)]
    for position, label in enumerate(cluster_labels):
        # Copy the record so the returned rows are independent plain lists.
        groups[label - 1].append(list(tweets_clean[position]))

    for number, rows in enumerate(tqdm(groups), start=1):
        # NOTE(review): ':' is not a valid file-name character on Windows;
        # the name is kept as-is because other code may rely on it.
        cluster_name = name + ': cluster_' + str(number) + '.csv'
        # newline='' is required by the csv module (otherwise blank rows show
        # up on Windows); UTF-8 keeps non-ASCII tweet text (emoji, accents)
        # writable regardless of the platform's default encoding.
        with open(cluster_name, 'w', newline='', encoding='utf-8') as f:
            csv.writer(f).writerows(rows)
    return groups
# This function generates a CSV file that holds the ID of each tweet and the name of its cluster (the common word).
def generate_node(most_words, cluster_label, name):
    """Build the node table (tweet ID and cluster name) and save it as CSV.

    Args:
        most_words: Sequence of cluster names (the common word of each
            cluster); the name for label ``c`` is read from ``most_words[c - 1]``.
        cluster_label: Cluster label of every tweet, in tweet order.
        name: Output path without extension; ``'.csv'`` is appended.

    Returns:
        The written ``pandas.DataFrame`` with columns ``ID`` (1-based tweet
        position) and ``Label`` (cluster name of that tweet).
    """
    node_ids = range(1, len(cluster_label) + 1)
    # Translate every tweet's cluster label into the word naming that cluster.
    node_labels = [most_words[cluster - 1] for cluster in tqdm(cluster_label)]
    nodes = pd.DataFrame({'ID': node_ids, 'Label': node_labels})
    nodes.to_csv(name + '.csv', mode='w', index=False)
    return nodes
# This function generates the edges (from a source node to a target node in the same cluster). It uses the KMeans
# consensus matrix: threshold is compared against the matrix value of each pair of nodes, and if that value is
# greater than the threshold the two nodes are saved as an edge.
def generate_edge(name, consensus_matrix, cluster_label, threshold):
    """Build the edge list between strongly co-clustered tweets and save it as CSV.

    An edge ``(i, j)`` with ``i < j`` is emitted when
    ``consensus_matrix[i, j] > threshold`` and both positions carry the same
    cluster label. Labels may take any values; no fixed number of clusters is
    assumed. Positions that have no entry in ``cluster_label`` never produce
    an edge.

    Args:
        name: Output path without extension; ``'.csv'`` is appended.
        consensus_matrix: 2-D array-like exposing ``.shape`` and ``[i, j]``
            indexing (e.g. a NumPy array).
        cluster_label: Cluster label of every tweet, in the same order as the
            rows/columns of ``consensus_matrix``.
        threshold: Strict lower bound a consensus value must exceed.

    Returns:
        The written ``pandas.DataFrame`` with columns ``Source`` and
        ``Target`` holding 1-based node IDs (position + 1), ordered by source
        and then by target.
    """
    labels = list(cluster_label)
    label_count = len(labels)
    row_count = consensus_matrix.shape[0]
    # Columns beyond the labelled positions cannot share a cluster with anything.
    column_stop = min(consensus_matrix.shape[1], label_count)

    source = []
    target = []
    for i in tqdm(range(row_count)):
        if i >= label_count:
            continue
        label_i = labels[i]
        # Starting at i + 1 visits each unordered pair once (i < j) and skips
        # the diagonal.
        for j in range(i + 1, column_stop):
            # Comparing the two labels directly replaces scanning per-cluster
            # index lists and works for any label values.
            if consensus_matrix[i, j] > threshold and labels[j] == label_i:
                source.append(i + 1)
                target.append(j + 1)

    df = pd.DataFrame({'Source': source, 'Target': target})
    df.to_csv(name + '.csv', mode='w', index=False)
    return df