Add files via upload

jwmueller · web-flow · commit b2bcebd79a13 · 2024-07-08T21:48:05.000-07:00
diff --git a/fasttext_amazon_reviews/fasttext_wrapper.py b/fasttext_amazon_reviews/fasttext_wrapper.py
@@ -0,0 +1,310 @@
+# Copyright (C) 2017-2023  Cleanlab Inc.
+# This file is part of cleanlab.
+#
+# cleanlab is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# cleanlab is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with cleanlab.  If not, see <https://www.gnu.org/licenses/>.
+
+"""
+Text classification with fastText models that are compatible with cleanlab.
+This module allows you to easily find label issues in your text datasets.
+
+You must have fastText installed: ``pip install "fasttext==0.9.2"`` or lower.
+Version 0.9.3 has a regression bug and the official package has been archived on GitHub.
+
+Tips:
+
+* Check out our example using this class: `fasttext_amazon_reviews <https://github.com/cleanlab/examples/blob/master/fasttext_amazon_reviews/fasttext_amazon_reviews.ipynb>`_
+* Our `unit tests <https://github.com/cleanlab/cleanlab/blob/master/tests/test_frameworks.py>`_ also provide basic usage examples.
+
+"""
+
+import time
+import os
+import copy
+import numpy as np
+from sklearn.base import BaseEstimator
+from fasttext import train_supervised, load_model
+
+
+# Prefix that marks the label token at the start of each example line;
+# data_loader uses it to detect where a new example begins.
+LABEL = "__label__"
+# Token substituted for newline characters embedded inside one example's text.
+NEWLINE = " __newline__ "
+
+
+def data_loader(
+    fn=None,
+    indices=None,
+    label=LABEL,
+    batch_size=1000,
+):
+    """Returns a generator, yielding two lists containing
+    [labels], [text]. Items are always returned in the
+    order in the file, regardless if indices are provided."""
+
+    def _split_labels_and_text(batch):
+        l, t = [list(t) for t in zip(*(z.split(" ", 1) for z in batch))]
+        return l, t
+
+    # Prepare a stack of indices
+    if indices is not None:
+        stack_indices = sorted(indices, reverse=True)
+        stack_idx = stack_indices.pop()
+
+    with open(fn, "r") as f:
+        len_label = len(label)
+        idx = 0
+        batch_counter = 0
+        prev = f.readline()
+        batch = []
+        while True:
+            try:
+                line = f.readline()
+                line = line
+                if line[:len_label] == label or line == "":
+                    if indices is None or stack_idx == idx:
+                        # Write out prev line and reset prev
+                        batch.append(prev.strip().replace("\n", NEWLINE))
+                        batch_counter += 1
+
+                        if indices is not None:
+                            if len(stack_indices):
+                                stack_idx = stack_indices.pop()
+                            else:  # No more data in indices, quit loading data.
+                                yield _split_labels_and_text(batch)
+                                break
+                    prev = ""
+                    idx += 1
+                    if batch_counter == batch_size:
+                        yield _split_labels_and_text(batch)
+                        # Reset batch
+                        batch_counter = 0
+                        batch = []
+                prev += line
+                if line == "":
+                    if len(batch) > 0:
+                        yield _split_labels_and_text(batch)
+                    break
+            except EOFError:
+                if indices is None or stack_idx == idx:
+                    # Write out prev line and reset prev
+                    batch.append(prev.strip().replace("\n", NEWLINE))
+                    batch_counter += 1
+                    yield _split_labels_and_text(batch)
+                break
+
+
+class FastTextClassifier(BaseEstimator):  # Inherits sklearn base classifier
+    """Instantiate a fastText classifier that is compatible with :py:class:`CleanLearning <cleanlab.classification.CleanLearning>`.
+
+    Parameters
+    ----------
+    train_data_fn: str
+        File name of the training data in the format compatible with fastText.
+
+    test_data_fn: str, optional
+        File name of the test data in the format compatible with fastText.
+    """
+
+    def __init__(
+        self,
+        train_data_fn,
+        test_data_fn=None,
+        labels=None,
+        tmp_dir="",
+        label=LABEL,
+        del_intermediate_data=True,
+        kwargs_train_supervised={},
+        p_at_k=1,
+        batch_size=1000,
+    ):
+        self.train_data_fn = train_data_fn
+        self.test_data_fn = test_data_fn
+        self.tmp_dir = tmp_dir
+        self.label = label
+        self.del_intermediate_data = del_intermediate_data
+        self.kwargs_train_supervised = kwargs_train_supervised
+        self.p_at_k = p_at_k
+        self.batch_size = batch_size
+        self.clf = None
+        self.labels = labels
+
+        if labels is None:
+            # Find all class labels across the train and test set (if provided)
+            unique_labels = set([])
+            for labels, _ in data_loader(fn=train_data_fn, batch_size=batch_size):
+                unique_labels = unique_labels.union(set(labels))
+            if test_data_fn is not None:
+                for labels, _ in data_loader(fn=test_data_fn, batch_size=batch_size):
+                    unique_labels = unique_labels.union(set(labels))
+        else:
+            # Prepend labels with self.label token (e.g. '__label__').
+            unique_labels = [label + str(l) for l in labels]
+        # Create maps: label strings <-> integers when label strings are used
+        unique_labels = sorted(list(unique_labels))
+        self.label2num = dict(zip(unique_labels, range(len(unique_labels))))
+        self.num2label = dict((y, x) for x, y in self.label2num.items())
+
+    def _create_train_data(self, data_indices):
+        """Returns filename of the masked fasttext data file.
+        Items are written in the order they are in the file,
+        regardless if indices are provided."""
+
+        # If X indexes all training data, no need to rewrite the file.
+        if data_indices is None:
+            self.masked_data_was_created = False
+            return self.train_data_fn
+        # Mask training data by data_indices
+        else:
+            len_label = len(LABEL)
+            data_indices = sorted(data_indices, reverse=True)
+            masked_fn = "fastTextClf_" + str(int(time.time())) + ".txt"
+            open(masked_fn, "w").close()
+            # Read in training data one line at a time
+            with open(self.train_data_fn, "r") as rf:
+                idx = 0
+                data_idx = data_indices.pop()
+                for line in rf:
+                    # Mask by data_indices
+                    if idx == data_idx:
+                        with open(masked_fn, "a") as wf:
+                            wf.write(line.strip().replace("\n", NEWLINE) + "\n")
+                        if line[:len_label] == LABEL:
+                            if len(data_indices):
+                                data_idx = data_indices.pop()
+                            else:
+                                break
+                    # Increment data index if starts with __label__
+                    # This enables support for text data containing '\n'.
+                    if line[:len_label] == LABEL:
+                        idx += 1
+            self.masked_data_was_created = True
+
+        return masked_fn
+
+    def _remove_masked_data(self, fn):
+        """Deletes intermediate data files."""
+
+        if self.del_intermediate_data and self.masked_data_was_created:
+            os.remove(fn)
+
+    def __deepcopy__(self, memo):
+        if self.clf is None:
+            self_clf_copy = None
+        else:
+            fn = "tmp_{}.fasttext.model".format(int(time.time()))
+            self.clf.save_model(fn)
+            self_clf_copy = load_model(fn)
+            os.remove(fn)
+        # Store self.clf
+        params = self.__dict__
+        clf = params.pop("clf")
+        # Copy params without self.clf (it can't be copied)
+        params_copy = copy.deepcopy(params)
+        # Add clf back to self.clf
+        self.clf = clf
+        # Create copy to return
+        clf_copy = FastTextClassifier(self.train_data_fn)
+        params_copy["clf"] = self_clf_copy
+        clf_copy.__dict__ = params_copy
+        return clf_copy
+
+    def fit(self, X=None, y=None, sample_weight=None):
+        """Trains the fast text classifier.
+        Typical usage requires NO parameters,
+        just clf.fit()  # No params.
+
+        Parameters
+        ----------
+        X : iterable, e.g. list, numpy array (default None)
+          The list of indices of the data to use.
+          When in doubt, set as None. None defaults to range(len(data)).
+        y : None
+          Leave this as None. It's a filler to suit sklearns reqs.
+        sample_weight : None
+          Leave this as None. It's a filler to suit sklearns reqs."""
+
+        train_fn = self._create_train_data(data_indices=X)
+        self.clf = train_supervised(train_fn, **self.kwargs_train_supervised)
+        self._remove_masked_data(train_fn)
+
+    def predict_proba(self, X=None, train_data=True, return_labels=False):
+        """Produces a probability matrix with examples on rows and
+        classes on columns, where each row sums to 1 and captures the
+        probability of the example belonging to each class."""
+
+        fn = self.train_data_fn if train_data else self.test_data_fn
+        pred_probs_list = []
+        if return_labels:
+            labels_list = []
+        for labels, text in data_loader(fn=fn, indices=X, batch_size=self.batch_size):
+            pred = self.clf.predict(text=text, k=len(self.clf.get_labels()))
+            # Get p(label = k | x) matrix of shape (N x K) of pred probs for each x
+            pred_probs = [
+                [p for _, p in sorted(list(zip(*l)), key=lambda x: x[0])] for l in list(zip(*pred))
+            ]
+            pred_probs_list.append(np.array(pred_probs))
+            if return_labels:
+                labels_list.append(labels)
+        pred_probs = np.concatenate(pred_probs_list, axis=0)
+        if return_labels:
+            gold_labels = [self.label2num[z] for l in labels_list for z in l]
+            return (pred_probs, np.array(gold_labels))
+        else:
+            return pred_probs
+
+    def predict(self, X=None, train_data=True, return_labels=False):
+        """Predict labels of X"""
+
+        fn = self.train_data_fn if train_data else self.test_data_fn
+        pred_list = []
+        if return_labels:
+            labels_list = []
+        for labels, text in data_loader(fn=fn, indices=X, batch_size=self.batch_size):
+            pred = [self.label2num[z[0]] for z in self.clf.predict(text)[0]]
+            pred_list.append(pred)
+            if return_labels:
+                labels_list.append(labels)
+        pred = np.array([z for l in pred_list for z in l])
+        if return_labels:
+            gold_labels = [self.label2num[z] for l in labels_list for z in l]
+            return (pred, np.array(gold_labels))
+        else:
+            return pred
+
+    def score(self, X=None, y=None, sample_weight=None, k=None):
+        """Compute the average precision @ k (single label) of the
+        labels predicted from X and the true labels given by y.
+        score expects a `y` variable. In this case, `y` is the noisy labels."""
+
+        # Set the k for precision@k.
+        # For single label: 1 if label is in top k, else 0
+        if k is None:
+            k = self.p_at_k
+
+        fn = self.test_data_fn
+        pred_list = []
+        if y is None:
+            labels_list = []
+        for labels, text in data_loader(fn=fn, indices=X, batch_size=self.batch_size):
+            pred = self.clf.predict(text, k=k)[0]
+            pred_list.append(pred)
+            if y is None:
+                labels_list.append(labels)
+        pred = np.array([z for l in pred_list for z in l])
+        if y is None:
+            y = [z for l in labels_list for z in l]
+        else:
+            y = [self.num2label[z] for z in y]
+
+        apk = np.mean([y[i] in l for i, l in enumerate(pred)])
+
+        return apk