import numpy as np

# Entity tags in IOB2 format: "B-" marks the first token of an entity span,
# "I-" marks subsequent tokens inside it, and "O" marks tokens outside any entity.
entities = [
    "O",
    "B-MISC",
    "I-MISC",
    "B-PER",
    "I-PER",
    "B-ORG",
    "I-ORG",
    "B-LOC",
    "I-LOC",
]
entity_map = {entity: i for i, entity in enumerate(entities)}
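
# Illustrative sketch (not from the original file): class indices produced by
# the functions below can be decoded back to tag names through `entities`.
#
#     pred_ids = [3, 4, 0]                         # hypothetical predictions
#     pred_tags = [entities[i] for i in pred_ids]  # ["B-PER", "I-PER", "O"]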


def readfile(filepath, sep=" "):
    """
    Reads a file in CoNLL format (IOB2) and returns, per sentence, the list of
    words and the matching list of label indices (mapped via `entity_map`).
    """
    data, sentence, label = [], [], []
    with open(filepath) as lines:
        for line in lines:
            # Blank lines and "-DOCSTART" markers separate sentences.
            if line.strip() == "" or line.startswith("-DOCSTART"):
                if len(sentence) > 0:
                    data.append((sentence, label))
                    sentence, label = [], []
                continue
            splits = line.split(sep)
            word = splits[0]
            # Normalize all-caps words (e.g. "EU" -> "Eu") to title case.
            if len(word) > 0 and word[0].isalpha() and word.isupper():
                word = word[0] + word[1:].lower()
            sentence.append(word)
            # The label is the last field; strip the trailing newline.
            label.append(entity_map[splits[-1].rstrip()])
    if len(sentence) > 0:
        data.append((sentence, label))
    given_words = [d[0] for d in data]
    given_labels = [d[1] for d in data]
    return given_words, given_labels
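
# Usage sketch: "data/train.txt" is a hypothetical path to a CoNLL-format
# (IOB2) file, e.g. from the CoNLL-2003 dataset.
#
#     sentences, labels = readfile("data/train.txt")
#     sentences[0]  # words of the first sentence
#     labels[0]     # matching class indices into `entities`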


def get_probs(pipe, sentence):
    """
    @parameter pipe: a HuggingFace token-classification pipeline
    @parameter sentence: string
    @return probs: np.array of shape (n, m)
        where n is the number of tokens in the sentence and m is the number of
        classes. probs[i][j] is the probability that the i'th token belongs to
        entity j. The first and last rows are excluded because the first and
        last tokens are always [CLS] and [SEP], which mark the start and end
        of the sentence, respectively.
    """

    def softmax(logit):
        # Shift by the max logit for numerical stability before exponentiating.
        exp = np.exp(logit - np.max(logit))
        return exp / np.sum(exp)

    forward = pipe.forward(pipe.preprocess(sentence))
    logits = forward["logits"][0].numpy()
    probs = np.array([softmax(logit) for logit in logits])
    # Drop the [CLS] and [SEP] positions so probs aligns with the real tokens.
    probs = probs[1:-1]
    return probs
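
# Usage sketch, assuming the `transformers` library is installed; the model
# name is an example whose label order matches `entities` above.
#
#     from transformers import pipeline
#     pipe = pipeline("token-classification", model="dslim/bert-base-NER")
#     probs = get_probs(pipe, "George Washington lived in Virginia .")
#     probs.shape  # (number of subword tokens, number of classes)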


def get_pred_probs(scores, tokens, given_token, weighted=False):
    """
    Obtain `pred_probs` for one particular sentence. Maps and reduces subword-level
    tokens to the given word-level tokens in the original dataset.

    Parameters
    ----------
    scores: np.array
        np.array with shape `(N', K)`, where N' is the number of tokens of the
        sentence generated by the tokenizer, and K is the number of classes of
        the model prediction. `scores[i][j]` indicates the model-predicted
        probability that the i'th token belongs to class j.
    tokens: list
        list of tokens with length N' generated by the tokenizer.
    given_token: list
        list of given tokens with length N, where N is the number of tokens of
        the sentence from the original dataset.
    weighted: bool, default=False
        whether to merge the probabilities using a weighted average (instead of
        an unweighted average). The weight is proportional to the length of the
        subword-level token.

    Returns
    -------
    pred_probs: np.array
        np.array with shape `(N, K)`, where `pred_probs[i][j]` is the
        model-predicted probability that the i'th token belongs to class j
        after processing (reducing subwords to words, and splitting words
        merged by the tokenizer).
    """
    i, j = 0, 0
    pred_probs = []
    for token in given_token:
        i_new, j_new = i, j
        acc = 0  # number of characters of `token` matched so far
        weights = []  # per-subword character counts for the weighted average
        while acc != len(token):
            token_len = len(tokens[i_new][j_new:])
            remain = len(token) - acc
            weights.append(min(remain, token_len))
            if token_len > remain:
                # The word ends inside this subword token.
                acc += remain
                j_new += remain
            else:
                # Consume the rest of this subword token and move to the next one.
                acc += token_len
                i_new += 1
                j_new = 0
        # Include a trailing partially-consumed subword (j_new > 0) so the
        # averaged scores line up with `weights`.
        end = i_new + 1 if j_new > 0 else i_new
        if end > i + 1:
            probs = np.average(
                scores[i:end], axis=0, weights=weights if weighted else None
            )
        else:
            probs = scores[i]
        i, j = i_new, j_new
        pred_probs.append(probs)
    return np.array(pred_probs)
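

# End-to-end sketch (not part of the original module). It assumes the
# `transformers` library is available and uses "dslim/bert-base-NER" as an
# example model; any token-classification model whose label order matches
# `entities` above would work the same way.
if __name__ == "__main__":
    from transformers import pipeline

    pipe = pipeline("token-classification", model="dslim/bert-base-NER")
    sentence = "George Washington lived in Virginia ."
    given_tokens = sentence.split()

    # Subword-level probabilities, aligned with the tokenizer's tokens.
    probs = get_probs(pipe, sentence)
    # Strip BERT-style "##" continuation prefixes so character lengths line up
    # with the original words (an assumption about the tokenizer in use).
    tokens = [t.replace("##", "") for t in pipe.tokenizer.tokenize(sentence)]

    pred_probs = get_pred_probs(probs, tokens, given_tokens)
    print(pred_probs.shape)  # (len(given_tokens), number of classes)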