# utils.py

import json
import logging
import re

import pandas as pd


def mergeIntervals(intervals):
    """Merge overlapping (start, end, label) spans; when two overlapping
    spans carry different labels, keep whichever span reaches further."""
    sorted_by_lower_bound = sorted(intervals, key=lambda tup: tup[0])
    merged = []
    for higher in sorted_by_lower_bound:
        if not merged:
            merged.append(higher)
        else:
            lower = merged[-1]
            if higher[0] <= lower[1]:
                # Compare labels with == rather than `is`: identity only
                # works by accident of string interning.
                if lower[2] == higher[2]:
                    # Same label: extend the previous span to cover both.
                    upper_bound = max(lower[1], higher[1])
                    merged[-1] = (lower[0], upper_bound, lower[2])
                else:
                    # Different labels: keep the longer-reaching span.
                    if lower[1] > higher[1]:
                        merged[-1] = lower
                    else:
                        merged[-1] = (lower[0], higher[1], higher[2])
            else:
                merged.append(higher)
    return merged
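
# A quick illustration of mergeIntervals (example values, not from the
# dataset): overlapping spans with the same label collapse into one span
# covering both, and non-overlapping spans pass through untouched.
#   mergeIntervals([(0, 5, 'Name'), (3, 9, 'Name'), (12, 20, 'Skills')])
#   -> [(0, 9, 'Name'), (12, 20, 'Skills')]

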
def get_entities(df):
    """Collect merged (start, end, label) spans from each row's
    'annotation' column."""
    entities = []
    for i in range(len(df)):
        entity = []
        for annot in df['annotation'][i]:
            try:
                ent = annot['label'][0]
                start = annot['points'][0]['start']
                end = annot['points'][0]['end'] + 1  # make the end exclusive
                entity.append((start, end, ent))
            except (KeyError, IndexError, TypeError):
                # Skip malformed annotations instead of aborting the row.
                continue
        entity = mergeIntervals(entity)
        entities.append(entity)
    return entities


def read_dataset():
    """Load the line-delimited Dataturks export and attach merged entity spans."""
    df_data = pd.read_json("ner.json", lines=True)
    df_data = df_data.drop(['extras'], axis=1)
    df_data['content'] = df_data['content'].str.replace("\n", " ")
    df_data['entities'] = get_entities(df_data)
    return df_data
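
# For reference, each line of "ner.json" is expected to look roughly like
# the sketch below, inferred from the fields accessed above (the values are
# illustrative, not a real record):
#   {"content": "John Doe Python developer ...",
#    "annotation": [{"label": ["Name"],
#                    "points": [{"start": 0, "end": 7, "text": "John Doe"}]}],
#    "extras": null}

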
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    """Convert a line-delimited Dataturks JSON export into spaCy's
    (text, {"entities": [(start, end, label), ...]}) training format."""
    try:
        training_data = []
        with open(dataturks_JSON_FilePath, 'r') as f:
            lines = f.readlines()

        for line in lines:
            data = json.loads(line)
            text = data['content'].replace("\n", " ")
            entities = []
            data_annotations = data['annotation']
            if data_annotations is not None:
                for annotation in data_annotations:
                    # Only a single point per text annotation.
                    point = annotation['points'][0]
                    labels = annotation['label']
                    # Handle both a list of labels and a single label.
                    if not isinstance(labels, list):
                        labels = [labels]
                    for label in labels:
                        point_start = point['start']
                        point_end = point['end']
                        point_text = point['text']

                        # Shrink the span by any leading/trailing whitespace
                        # included in the annotated text.
                        lstrip_diff = len(point_text) - len(point_text.lstrip())
                        rstrip_diff = len(point_text) - len(point_text.rstrip())
                        if lstrip_diff != 0:
                            point_start = point_start + lstrip_diff
                        if rstrip_diff != 0:
                            point_end = point_end - rstrip_diff
                        # Dataturks ends are inclusive; spaCy expects exclusive.
                        entities.append((point_start, point_end + 1, label))
            training_data.append((text, {"entities": entities}))
        return training_data
    except Exception as e:
        logging.exception("Unable to process " + dataturks_JSON_FilePath +
                          "\n" + "error = " + str(e))
        return None
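
# Example call (the path and record values are illustrative; any Dataturks
# line-delimited JSON export should work):
#   TRAIN_DATA = convert_dataturks_to_spacy("ner.json")
#   TRAIN_DATA[0] -> ("John Doe Python developer ...",
#                     {"entities": [(0, 8, "Name")]})

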
def trim_entity_spans(data: list) -> list:
    """Removes leading and trailing white space from entity spans.

    Args:
        data (list): The data to be cleaned in spaCy JSON format.

    Returns:
        list: The cleaned data.
    """
    invalid_span_tokens = re.compile(r'\s')
    cleaned_data = []
    for text, annotations in data:
        entities = annotations['entities']
        valid_entities = []
        for start, end, label in entities:
            valid_start = start
            valid_end = end
            # Advance the start index past any leading whitespace.
            while valid_start < len(text) and invalid_span_tokens.match(
                    text[valid_start]):
                valid_start += 1
            # Pull the end index back past any trailing whitespace.
            while valid_end > 1 and invalid_span_tokens.match(
                    text[valid_end - 1]):
                valid_end -= 1
            valid_entities.append([valid_start, valid_end, label])
        cleaned_data.append([text, {'entities': valid_entities}])
    return cleaned_data
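
# Illustrative before/after (made-up values): a span that includes a trailing
# space, ("John Doe ", {'entities': [(0, 9, 'Name')]}), is trimmed to
# [(0, 8, 'Name')], so that text[0:8] == "John Doe".

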
def clean_dataset(data):
    """Turn (text, {'entities': [...]}) pairs into per-word label lists,
    one row per document, with "Empty" for unlabelled words."""
    cleanedDF = pd.DataFrame(columns=["setences_cleaned"])
    sum1 = 0
    for i in range(len(data)):
        start = 0
        emptyList = ["Empty"] * len(data[i][0].split())
        numberOfWords = 0
        lenOfString = len(data[i][0])
        strData = data[i][0]
        strDictData = data[i][1]
        lastIndexOfSpace = strData.rfind(' ')
        # Use a distinct loop variable: the original shadowed the outer `i`.
        for pos in range(lenOfString):
            # A space followed by a non-space marks the end of a word; the
            # bounds check avoids an IndexError when the text ends in a space.
            if pos + 1 < lenOfString and strData[pos] == " " and strData[pos + 1] != " ":
                for entList in reversed(strDictData['entities']):
                    if start >= int(entList[0]) and pos <= int(entList[1]):
                        emptyList[numberOfWords] = entList[2]
                        break
                start = pos + 1
                numberOfWords += 1
            if pos == lastIndexOfSpace:
                # Label the final word, which has no trailing space.
                for entList in reversed(strDictData['entities']):
                    if lastIndexOfSpace >= int(entList[0]) and lenOfString <= int(entList[1]):
                        emptyList[numberOfWords] = entList[2]
                numberOfWords += 1
        # DataFrame.append was removed in pandas 2.0; concat a one-row frame.
        row = pd.DataFrame({"setences_cleaned": [emptyList]})
        cleanedDF = pd.concat([cleanedDF, row], ignore_index=True)
        sum1 = sum1 + numberOfWords  # running word count (unused by callers)
    return cleanedDF
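

if __name__ == "__main__":
    # Minimal end-to-end sketch, assuming a "ner.json" export sits next to
    # this file: convert to spaCy format, trim whitespace from the spans,
    # then build the per-word label frame.
    train_data = trim_entity_spans(convert_dataturks_to_spacy("ner.json"))
    labels_df = clean_dataset(train_data)
    print(labels_df.head())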