Commit b2bcebd

Add files via upload

1 parent edbf3c3 commit b2bcebd

1 file changed: +310 −0 lines changed

@@ -0,0 +1,310 @@
# Copyright (C) 2017-2023 Cleanlab Inc.
# This file is part of cleanlab.
#
# cleanlab is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# cleanlab is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with cleanlab. If not, see <https://www.gnu.org/licenses/>.

"""
Text classification with fastText models that are compatible with cleanlab.
This module allows you to easily find label issues in your text datasets.

You must have fastText installed: ``pip install "fasttext==0.9.2"`` or lower.
Version 0.9.3 has a regression bug and the official package has been archived on GitHub.

Tips:

* Check out our example using this class: `fasttext_amazon_reviews <https://github.com/cleanlab/examples/blob/master/fasttext_amazon_reviews/fasttext_amazon_reviews.ipynb>`_
* Our `unit tests <https://github.com/cleanlab/cleanlab/blob/master/tests/test_frameworks.py>`_ also provide basic usage examples.

"""

import time
import os
import copy
import numpy as np
from sklearn.base import BaseEstimator
from fasttext import train_supervised, load_model


LABEL = "__label__"
NEWLINE = " __newline__ "


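# For reference, a fastText-formatted data file stores one example per line,
# with the label token prepended to the text, e.g. (hypothetical contents):
#
#     __label__positive This product works great
#     __label__negative Broke after two days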
def data_loader(
    fn=None,
    indices=None,
    label=LABEL,
    batch_size=1000,
):
    """Returns a generator, yielding two lists containing
    [labels], [text]. Items are always returned in the order
    they appear in the file, regardless of whether indices are provided."""

    def _split_labels_and_text(batch):
        l, t = [list(t) for t in zip(*(z.split(" ", 1) for z in batch))]
        return l, t

    # Prepare a stack of indices
    if indices is not None:
        stack_indices = sorted(indices, reverse=True)
        stack_idx = stack_indices.pop()

    with open(fn, "r") as f:
        len_label = len(label)
        idx = 0
        batch_counter = 0
        prev = f.readline()
        batch = []
        while True:
            try:
                line = f.readline()
                if line[:len_label] == label or line == "":
                    if indices is None or stack_idx == idx:
                        # Write out prev line and reset prev
                        batch.append(prev.strip().replace("\n", NEWLINE))
                        batch_counter += 1

                        if indices is not None:
                            if len(stack_indices):
                                stack_idx = stack_indices.pop()
                            else:  # No more data in indices, quit loading data.
                                yield _split_labels_and_text(batch)
                                break
                    prev = ""
                    idx += 1
                    if batch_counter == batch_size:
                        yield _split_labels_and_text(batch)
                        # Reset batch
                        batch_counter = 0
                        batch = []
                prev += line
                if line == "":
                    if len(batch) > 0:
                        yield _split_labels_and_text(batch)
                    break
            except EOFError:
                if indices is None or stack_idx == idx:
                    # Write out prev line and reset prev
                    batch.append(prev.strip().replace("\n", NEWLINE))
                    batch_counter += 1
                yield _split_labels_and_text(batch)
                break
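

# Example of consuming the loader (file name and indices are hypothetical):
#
#     for labels, texts in data_loader(fn="data_train.txt", indices=[0, 2]):
#         ...  # labels like ['__label__positive', ...], texts as plain strings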


class FastTextClassifier(BaseEstimator):  # Inherits sklearn base classifier
    """Instantiate a fastText classifier that is compatible with :py:class:`CleanLearning <cleanlab.classification.CleanLearning>`.

    Parameters
    ----------
    train_data_fn: str
      File name of the training data in the format compatible with fastText.

    test_data_fn: str, optional
      File name of the test data in the format compatible with fastText.
    """

    def __init__(
        self,
        train_data_fn,
        test_data_fn=None,
        labels=None,
        tmp_dir="",
        label=LABEL,
        del_intermediate_data=True,
        kwargs_train_supervised={},
        p_at_k=1,
        batch_size=1000,
    ):
        self.train_data_fn = train_data_fn
        self.test_data_fn = test_data_fn
        self.tmp_dir = tmp_dir
        self.label = label
        self.del_intermediate_data = del_intermediate_data
        self.kwargs_train_supervised = kwargs_train_supervised
        self.p_at_k = p_at_k
        self.batch_size = batch_size
        self.clf = None
        self.labels = labels

        if labels is None:
            # Find all class labels across the train and test set (if provided)
            unique_labels = set([])
            for batch_labels, _ in data_loader(fn=train_data_fn, batch_size=batch_size):
                unique_labels = unique_labels.union(set(batch_labels))
            if test_data_fn is not None:
                for batch_labels, _ in data_loader(fn=test_data_fn, batch_size=batch_size):
                    unique_labels = unique_labels.union(set(batch_labels))
        else:
            # Prepend labels with self.label token (e.g. '__label__').
            unique_labels = [label + str(l) for l in labels]
        # Create maps: label strings <-> integers when label strings are used
        unique_labels = sorted(list(unique_labels))
        self.label2num = dict(zip(unique_labels, range(len(unique_labels))))
        self.num2label = dict((y, x) for x, y in self.label2num.items())
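
        # For instance (hypothetical labels): a training file containing classes
        # '__label__0' and '__label__1' produces
        #     self.label2num == {'__label__0': 0, '__label__1': 1}
        #     self.num2label == {0: '__label__0', 1: '__label__1'}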

    def _create_train_data(self, data_indices):
        """Returns the filename of the masked fastText data file.
        Items are written in the order they appear in the file,
        regardless of whether indices are provided."""

        # If X indexes all training data, no need to rewrite the file.
        if data_indices is None:
            self.masked_data_was_created = False
            return self.train_data_fn
        # Mask training data by data_indices
        else:
            len_label = len(LABEL)
            data_indices = sorted(data_indices, reverse=True)
            masked_fn = "fastTextClf_" + str(int(time.time())) + ".txt"
            open(masked_fn, "w").close()
            # Read in training data one line at a time
            with open(self.train_data_fn, "r") as rf:
                idx = 0
                data_idx = data_indices.pop()
                for line in rf:
                    # Mask by data_indices
                    if idx == data_idx:
                        with open(masked_fn, "a") as wf:
                            wf.write(line.strip().replace("\n", NEWLINE) + "\n")
                        if line[:len_label] == LABEL:
                            if len(data_indices):
                                data_idx = data_indices.pop()
                            else:
                                break
                    # Increment data index if the line starts with __label__.
                    # This enables support for text data containing '\n'.
                    if line[:len_label] == LABEL:
                        idx += 1
            self.masked_data_was_created = True

        return masked_fn

    def _remove_masked_data(self, fn):
        """Deletes intermediate data files."""

        if self.del_intermediate_data and self.masked_data_was_created:
            os.remove(fn)

    def __deepcopy__(self, memo):
        if self.clf is None:
            self_clf_copy = None
        else:
            fn = "tmp_{}.fasttext.model".format(int(time.time()))
            self.clf.save_model(fn)
            self_clf_copy = load_model(fn)
            os.remove(fn)
        # Store self.clf
        params = self.__dict__
        clf = params.pop("clf")
        # Copy params without self.clf (it can't be copied)
        params_copy = copy.deepcopy(params)
        # Add clf back to self.clf
        self.clf = clf
        # Create copy to return
        clf_copy = FastTextClassifier(self.train_data_fn)
        params_copy["clf"] = self_clf_copy
        clf_copy.__dict__ = params_copy
        return clf_copy

    def fit(self, X=None, y=None, sample_weight=None):
        """Trains the fastText classifier.
        Typical usage requires NO parameters: just ``clf.fit()``.

        Parameters
        ----------
        X : iterable, e.g. list, numpy array (default None)
          The list of indices of the data to use.
          When in doubt, set as None. None defaults to range(len(data)).
        y : None
          Leave this as None. It's a filler to suit sklearn's requirements.
        sample_weight : None
          Leave this as None. It's a filler to suit sklearn's requirements."""

        train_fn = self._create_train_data(data_indices=X)
        self.clf = train_supervised(train_fn, **self.kwargs_train_supervised)
        self._remove_masked_data(train_fn)
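
    # fastText training hyperparameters can be passed through at construction
    # (the values here are hypothetical; ``epoch`` and ``dim`` are standard
    # fasttext.train_supervised keyword arguments):
    #     clf = FastTextClassifier("data_train.txt",
    #                              kwargs_train_supervised={"epoch": 5, "dim": 100})
    #     clf.fit()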

    def predict_proba(self, X=None, train_data=True, return_labels=False):
        """Produces a probability matrix with examples on rows and
        classes on columns, where each row sums to 1 and captures the
        probability of the example belonging to each class."""

        fn = self.train_data_fn if train_data else self.test_data_fn
        pred_probs_list = []
        if return_labels:
            labels_list = []
        for labels, text in data_loader(fn=fn, indices=X, batch_size=self.batch_size):
            pred = self.clf.predict(text=text, k=len(self.clf.get_labels()))
            # Get p(label = k | x) matrix of shape (N x K) of pred probs for each x
            pred_probs = [
                [p for _, p in sorted(list(zip(*l)), key=lambda x: x[0])]
                for l in list(zip(*pred))
            ]
            pred_probs_list.append(np.array(pred_probs))
            if return_labels:
                labels_list.append(labels)
        pred_probs = np.concatenate(pred_probs_list, axis=0)
        if return_labels:
            gold_labels = [self.label2num[z] for l in labels_list for z in l]
            return (pred_probs, np.array(gold_labels))
        else:
            return pred_probs
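
    # The returned pred_probs can be fed straight into cleanlab, e.g. (sketch):
    #     from cleanlab.filter import find_label_issues
    #     pred_probs, noisy_labels = clf.predict_proba(return_labels=True)
    #     issues = find_label_issues(labels=noisy_labels, pred_probs=pred_probs)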

    def predict(self, X=None, train_data=True, return_labels=False):
        """Predicts integer class labels for the examples indexed by X."""

        fn = self.train_data_fn if train_data else self.test_data_fn
        pred_list = []
        if return_labels:
            labels_list = []
        for labels, text in data_loader(fn=fn, indices=X, batch_size=self.batch_size):
            pred = [self.label2num[z[0]] for z in self.clf.predict(text)[0]]
            pred_list.append(pred)
            if return_labels:
                labels_list.append(labels)
        pred = np.array([z for l in pred_list for z in l])
        if return_labels:
            gold_labels = [self.label2num[z] for l in labels_list for z in l]
            return (pred, np.array(gold_labels))
        else:
            return pred

    def score(self, X=None, y=None, sample_weight=None, k=None):
        """Computes the average precision @ k (single label) of the
        labels predicted for X against the true labels given by y.
        Per the sklearn API, `score` accepts a `y` argument; here `y` holds the
        noisy labels, and if it is None the labels are read from the test data file."""

        # Set the k for precision@k.
        # For a single label: 1 if the label is in the top k predictions, else 0.
        if k is None:
            k = self.p_at_k

        fn = self.test_data_fn
        pred_list = []
        if y is None:
            labels_list = []
        for labels, text in data_loader(fn=fn, indices=X, batch_size=self.batch_size):
            pred = self.clf.predict(text, k=k)[0]
            pred_list.append(pred)
            if y is None:
                labels_list.append(labels)
        pred = np.array([z for l in pred_list for z in l])
        if y is None:
            y = [z for l in labels_list for z in l]
        else:
            y = [self.num2label[z] for z in y]

        apk = np.mean([y[i] in l for i, l in enumerate(pred)])

        return apk
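
    # Worked example (hypothetical numbers): with k=2 and true labels
    # ['a', 'b', 'c'], if the top-2 predictions per example are
    # [('a', 'b'), ('c', 'a'), ('c', 'b')], then precision@2 = (1 + 0 + 1) / 3.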
