-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
102 lines (78 loc) · 3.32 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
from preprocess import contents2count, contents2count_v2
from utils import prototypeClustering, hierarchicalClustering
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import wordpunct_tokenize
import numpy as np
def do(payloads, vec_size=512, win_size=4, th=0.6, th2=0.6, min_count=3):
    """Cluster payloads and map the cluster labels back onto every input position.

    Identical payloads are vectorised and clustered only once; the label
    obtained for a unique payload is then copied to every position at which
    that payload occurs in ``payloads``.

    Args:
        payloads: Sized iterable of hashable payloads (``len()`` is taken and
            each payload is used as a dict key).
        vec_size: Forwarded to ``contents2count``.
        win_size: Forwarded to ``contents2count``.
        th: Threshold forwarded to ``prototypeClustering``.
        th2: Threshold forwarded to ``hierarchicalClustering``.
        min_count: A unique payload left unclustered (label ``-1``) is
            promoted to a fresh label of its own when it occurs at least this
            many times in ``payloads``; otherwise it keeps ``-1``.

    Returns:
        A ``(real_label_list, clusters)`` tuple.  ``real_label_list`` holds one
        label per element of ``payloads``.  ``clusters`` maps each label in the
        clustering output (including ``-1``) to the list of rows of ``X`` that
        carry it.  Note that ``clusters`` is keyed by the raw clustering
        labels: the fresh labels handed out to frequent unclustered payloads
        appear only in ``real_label_list``.

        For empty ``payloads`` the result is ``([], {})`` and none of the
        clustering helpers is called.
    """
    # Empty input used to fail further down (max() of an empty label list, or
    # the old `del label, payload` on never-bound loop variables).  len() is
    # used instead of truthiness so array-like inputs are handled as well.
    if len(payloads) == 0:
        return [], {}

    # payload -> every index at which it occurs, in first-seen order.
    payloads_indices = {}
    for idx, payload in enumerate(payloads):
        payloads_indices.setdefault(payload, []).append(idx)
    unique_payloads = list(payloads_indices)

    # NOTE(review): assumes contents2count yields one feature row per unique
    # payload and that label_list is aligned with unique_payloads -- confirm
    # against preprocess/utils.
    X = contents2count(unique_payloads, vec_size, win_size)
    prev_label_list = prototypeClustering(X, th)
    label_list = hierarchicalClustering(X, prev_label_list, th2)

    real_label_list = [-1] * len(payloads)
    # Fresh labels start above every label the clustering already produced.
    nxt_label = max(label_list) + 1
    for payload, label in zip(unique_payloads, label_list):
        indices = payloads_indices[payload]
        if label != -1:
            assigned = label
        elif len(indices) >= min_count:
            # Frequent but unclustered payload: give it a cluster of its own.
            assigned = nxt_label
            nxt_label += 1
        else:
            assigned = -1
        for idx in indices:
            real_label_list[idx] = assigned

    # Group the feature rows by the raw clustering label.
    clusters = {}
    for label, vector in zip(label_list, X):
        clusters.setdefault(label, []).append(vector)
    return real_label_list, clusters
def do_v2(payloads,vec_size=512,win_size=4 ,th = 0.6 ,th2 = 0.6,min_count=3):
    """TF-IDF variant of the payload clustering pipeline.

    Unique payloads are vectorised with a ``TfidfVectorizer`` (tokenised by
    ``wordpunct_tokenize``, capped at ``vec_size`` features), clustered with
    ``prototypeClustering`` followed by ``hierarchicalClustering``, and the
    resulting labels are copied back to every position of ``payloads``.

    Args:
        payloads: Sized iterable of hashable text payloads.
        vec_size: Passed as ``max_features`` to the vectoriser.
        win_size: Not referenced in this function's body.
        th: Threshold forwarded to ``prototypeClustering``.
        th2: Threshold forwarded to ``hierarchicalClustering``.
        min_count: A unique payload labelled ``-1`` receives a fresh label of
            its own when it occurs at least this many times; otherwise it
            stays ``-1``.

    Returns:
        ``(real_label_list, clusters, tf)``: one label per input payload, a
        dict mapping each raw clustering label (including ``-1``) to the list
        of rows of ``X`` carrying it, and the fitted vectoriser.
    """
    # Record, for each distinct payload, every position where it appears.
    positions_by_payload = {}
    for position, text in enumerate(payloads):
        positions_by_payload.setdefault(text, []).append(position)
    unique_payloads = list(positions_by_payload)

    tf = TfidfVectorizer(tokenizer=wordpunct_tokenize, max_features=vec_size)
    sparse_rows = tf.fit_transform(unique_payloads)
    X = np.array(sparse_rows.todense())

    # NOTE(review): label_list is assumed to be aligned with unique_payloads
    # (one label per row of X) -- confirm against utils.
    coarse_labels = prototypeClustering(X, th)
    label_list = hierarchicalClustering(X, coarse_labels, th2)

    real_label_list = [-1] * len(payloads)
    # New labels are numbered after the largest label already in use.
    nxt_label = max(label_list) + 1
    for row_no, text in enumerate(unique_payloads):
        cluster_id = label_list[row_no]
        positions = positions_by_payload[text]
        if cluster_id != -1:
            assigned = cluster_id
        elif len(positions) < min_count:
            assigned = -1
        else:
            # Unclustered but frequent: promote to its own label.
            assigned = nxt_label
            nxt_label += 1
        for position in positions:
            real_label_list[position] = assigned

    # Bucket the feature rows by raw clustering label.
    clusters = {}
    for cluster_id, row in zip(label_list, X):
        if cluster_id in clusters:
            clusters[cluster_id].append(row)
        else:
            clusters[cluster_id] = [row]
    return real_label_list, clusters, tf