Commit 8d67540

Add current version from GitLab (#1)
* add lib files
* add notebooks
* remove scripts dir
* extend gitignore
* add tests
* Update pyproject.toml
* update changelog
* Update README.md
* add short documentation to readme
* ignore unused imports in init files
* install all required dependencies for tests
* update imports formatting
* fix imports in notebooks
* ignore E402 errors
* fix dependencies
* decrease required precision in test
* fix doctest
* update line separators to LF
* update date in changelog
1 parent fa7d9c2 commit 8d67540

File tree: 65 files changed (+12129, −13 lines)


.ci/test.sh

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
 set -ex
 pip install poetry
-poetry install
+poetry install --all-extras --with test
 poetry run pytest

.flake8

Lines changed: 5 additions & 0 deletions
@@ -3,6 +3,11 @@ max-line-length = 120
 # W503: we prefer line breaks _before_ operators (as changed in PEP8 in 2016).
 # E203: whitespace before : , black is right here: https://github.com/psf/black/issues/315
 ignore = W503,E203
+# Ignore `F401` (unused imports) in all `__init__.py` files.
+# Ignore `E402` (import not at top of file) in all notebooks. `# flake8-noqa-cell-E402` doesn't work.
+per-file-ignores =
+    __init__.py: F401
+    notebooks/*: E402
 show-source = True
 statistics = True
 exclude =

.gitignore

Lines changed: 7 additions & 0 deletions
@@ -7,3 +7,10 @@
 .vscode
 venv
 .venv
+tmp
+/models
+lightning_logs
+
+# generated package files
+mim_nlp.egg-info
+/dist

CHANGELOG.md

Lines changed: 14 additions & 0 deletions
@@ -1,4 +1,18 @@
 # Changelog
 
+## 0.2.0 April 9, 2024
+* Moved files from GitLab project.
+* Classification
+  * Neural Network
+  * SVM
+* Regression
+  * Neural Network
+* Seq2Seq
+  * Summarization
+* Preprocessing
+  * Text cleaning
+  * Lemmatization
+  * Deduplication
+
 ## 0.1.0 April 2, 2024
 * Project created.

README.md

Lines changed: 58 additions & 6 deletions
@@ -1,25 +1,73 @@
 # MIM NLP
+With this package you can easily use pre-trained models and fine-tune them,
+as well as create and train your own neural networks.
 
-## Project goal
+Below, we list the NLP tasks and models that are available:
+* Classification
+  * Neural Network
+  * SVM
+* Regression
+  * Neural Network
+* Seq2Seq
+  * Summarization (Neural Network)
+
+It comes with utilities for text pre-processing such as:
+* Text cleaning
+* Lemmatization
+* Deduplication
 
 ## Installation
 
+### TODO PyPI package
+The package comes with the following extras (optional dependencies for given modules):
+- `svm` - simple SVM model for classification
+- `classifier` - classification models: SVM, neural networks
+- `regressor` - regression models
+- `preprocessing` - text cleaning, lemmatization and deduplication
+- `seq2seq` - `Seq2Seq` and `Summarizer` models
+
 ## Usage
+Examples can be found in the [notebooks directory](/notebooks).
 
-## Development
+### Model classes
+* `classifier.nn.NNClassifier` - Neural Network Classifier
+* `classifier.svm.SVMClassifier` - Support Vector Machine Classifier
+* `classifier.svm.SVMClassifierWithFeatureSelection` - `SVMClassifier` with an additional feature-selection step
+* `regressor.AutoRegressor` - regressor based on transformers' Auto Classes
+* `regressor.NNRegressor` - Neural Network Regressor
+* `seq2seq.AutoSummarizer` - summarizer based on transformers' Auto Classes
+
+### Interface
+All the model classes have a common interface:
+* `fit`
+* `predict`
+* `save`
+* `load`
 
+and specific additional methods.
+
+### Text pre-processing
+* `preprocessing.TextCleaner` - define a pipeline for text cleaning, supports concurrent processing
+* `preprocessing.lemmatize` - lemmatize text in Polish with Morfeusz
+* `preprocessing.Deduplicator` - find near-duplicate texts (depending on `threshold`) with the Jaccard index for n-grams
+
+## Development
 Remember to use a separate environment for each project.
 Run commands below inside the project's environment.
 
 ### Dependencies
-
 We use `poetry` for dependency management.
 If you have never used it, consult
 [poetry documentation](https://python-poetry.org/docs/)
 for installation guidelines and basic usage instructions.
 
 ```sh
-poetry install
+poetry install --with dev
+```
+
+To fix the `Failed to unlock the collection!` error or stuck package installation, execute the command below:
+```sh
+export PYTHON_KEYRING_BACKEND=keyring.backends.null.Keyring
 ```
 
 ### Git hooks
@@ -37,13 +85,11 @@ Fails if any changes are made, so you have to run `git add` and `git commit` onc
 * _Strip notebooks_ – produces _stripped_ versions of notebooks in `stripped` directory.
 
 ### Tests
-
 ```sh
 pytest
 ```
 
 ### Linting
-
 We use `isort` and `flake8` along with `nbqa` to ensure code quality.
 The appropriate options are set in configuration files.
 You can run them with:
@@ -62,3 +108,9 @@ You can run black to format code (including notebooks):
 ```sh
 black .
 ```
+
+### New version release
+To release the next version of the package to PyPI, follow these steps:
+- First, increment the package version in `pyproject.toml`.
+- Then build the new version: run `poetry build` in the root directory.
+- Finally, upload to PyPI: `poetry publish` (uploads the two newly created files).
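
For illustration, a minimal sketch of the common `fit`/`predict`/`save`/`load` interface described in the README above. The constructor arguments of `SVMClassifier`, the path argument to `save`/`load`, and `load` being a class method are assumptions not shown in this commit:

```python
# Minimal sketch of the common fit/predict/save/load interface.
# Assumptions (not shown in this commit): SVMClassifier constructs with
# defaults, save/load take a path, and load is a class method.
from mim_nlp.classifier.svm import SVMClassifier

x_train = ["I love this movie", "Terrible film", "Great acting", "Awful plot"]
y_train = [1, 0, 1, 0]

model = SVMClassifier()                       # assumed default construction
model.fit(x_train, y_train)                   # train on raw texts and labels
print(model.predict(["What a great story"]))  # predicted labels

model.save("models/svm_sentiment")                  # persist (assumed path arg)
model = SVMClassifier.load("models/svm_sentiment")  # restore (assumed class method)
```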
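
The `preprocessing.Deduplicator` entry above relies on the Jaccard index over n-grams. The standalone sketch below shows only the underlying metric, using character trigrams; the library's actual shingling and API may differ:

```python
def ngrams(text: str, n: int = 3) -> set[str]:
    """Set of character n-grams of `text`."""
    return {text[i : i + n] for i in range(len(text) - n + 1)}


def jaccard(a: str, b: str, n: int = 3) -> float:
    """Jaccard index |A intersect B| / |A union B| of two texts' n-gram sets."""
    x, y = ngrams(a, n), ngrams(b, n)
    return len(x & y) / len(x | y) if x | y else 1.0


# Near-duplicates score close to 1.0, unrelated texts close to 0.0;
# a Deduplicator-style `threshold` would cut between the two.
print(jaccard("the quick brown fox", "the quick brown foxes"))  # high
print(jaccard("the quick brown fox", "lorem ipsum dolor sit"))  # low
```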

mim_nlp/classifier/nn/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+from .nn_classifier import NNClassifier

mim_nlp/classifier/nn/nn_classifier.py

Lines changed: 172 additions & 0 deletions

@@ -0,0 +1,172 @@
+from __future__ import annotations
+
+from typing import Any, Callable, Optional, Union
+
+import numpy as np
+import torch.nn as nn
+from numpy._typing import NDArray, _ArrayLikeInt_co, _ArrayLikeStr_co
+from pytorch_lightning.callbacks import Callback
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.pipeline import Pipeline
+from torch import Tensor
+from torch.nn.modules.loss import _Loss
+from torch.optim import Adam, Optimizer
+from torchmetrics import Metric
+from transformers import PreTrainedTokenizerBase
+
+from mim_nlp.models import Classifier
+from mim_nlp.neural_network import NNModelMixin
+
+
+class NNClassifier(NNModelMixin, Classifier):
+    """Neural Network Classifier
+
+    The `input_size` parameter denotes the length of a tokenized text.
+    This should be equal to the size of the input layer in the neural network.
+    In the case of using TF-IDF, the output size is constant and equal to the size of the vocabulary,
+    so the `input_size` has to be set accordingly.
+    When transformers' tokenizer is used,
+    a tokenized text is padded or truncated to a constant size equal to the `input_size`.
+
+    The neural network should omit the activation function and return logits.
+    Take that into consideration when choosing the loss function!
+    We use Sigmoid / Softmax internally to get predictions.
+
+    The `loss_function` is by default set to BCEWithLogitsLoss,
+    which combines a Sigmoid layer and the BCELoss in one single class.
+    For multiclass classification, use Cross Entropy Loss. Both losses take logits, as stated above.
+
+    Callables in `metrics_dict` take predictions (as probabilities) and targets, in that order! Callables can't be
+    lambda functions because they are not pickleable and it would cause problems with saving the model.
+
+    Tips:
+    - Change every lambda function to a function.
+    - Set every argument in the function via `functools.partial`.
+
+    Example:
+        >>> def accuracy_binary(y_pred, y_target):
+        ...     y_pred = y_pred > 0.5
+        ...     return torch.sum(y_target == y_pred) / len(y_target)
+
+    The `device` parameter can have the following values:
+    - `"cpu"` - The model will be loaded on the CPU.
+    - `"cuda"` - The model will be loaded on a single GPU.
+    - `"cuda:i"` - The model will be loaded on the specific GPU with the index `i`.
+
+    It is also possible to use multiple GPUs. To do this:
+    - Set `device` to `"cuda"`.
+    - Set `many_gpus` to `True`.
+    - As default, it will use all of them.
+
+    To use only selected GPUs - set the environmental variable `CUDA_VISIBLE_DEVICES`.
+    """
+
+    def __init__(
+        self,
+        batch_size: int,
+        epochs: int,
+        input_size: int,
+        tokenizer: Optional[Union[PreTrainedTokenizerBase, Pipeline, TfidfVectorizer]],
+        neural_network: nn.Module,
+        loss_function: Union[_Loss, Callable[[Any, Any], Any]] = nn.BCEWithLogitsLoss(),
+        optimizer: type[Optimizer] = Adam,
+        optimizer_params: Optional[dict[str, Any]] = None,
+        train_metrics_dict: Optional[dict[str, Union[Metric, Callable[[Tensor, Tensor], Any]]]] = None,
+        eval_metrics_dict: Optional[dict[str, Union[Metric, Callable[[Tensor, Tensor], Any]]]] = None,
+        callbacks: Optional[Union[Callback, list[Callback]]] = None,
+        device: str = "cuda:0",
+        many_gpus: bool = False,
+    ):
+        super().__init__(
+            batch_size=batch_size,
+            epochs=epochs,
+            input_size=input_size,
+            tokenizer=tokenizer,
+            neural_network=neural_network,
+            loss_function=loss_function,
+            optimizer=optimizer,
+            optimizer_params=optimizer_params,
+            train_metrics_dict=train_metrics_dict,
+            eval_metrics_dict=eval_metrics_dict,
+            callbacks=callbacks,
+            device=device,
+            many_gpus=many_gpus,
+        )
+
+    def fit(self, x_train: _ArrayLikeStr_co, y_train: _ArrayLikeInt_co, fit_tokenizer: bool = False) -> None:
+        """For multiclass classifications `y_train` labels should be encoded as categorical, i.e. integers."""
+        is_multiclass = False
+        # check if multiclass
+        if any(y >= 2 for y in y_train):
+            y_train = Tensor(y_train).long()
+            is_multiclass = True
+        else:
+            y_train = Tensor(y_train).float()
+        super()._fit(
+            x_train,
+            y_train,
+            x_eval=None,
+            y_eval=None,
+            fit_tokenizer=fit_tokenizer,
+            is_classification=True,
+            is_multiclass=is_multiclass,
+        )
+
+    def fit_eval(
+        self,
+        x_train: _ArrayLikeStr_co,
+        y_train: _ArrayLikeInt_co,
+        x_eval: _ArrayLikeStr_co,
+        y_eval: _ArrayLikeInt_co,
+        fit_tokenizer: bool = False,
+    ) -> None:
+        """For multiclass classifications `y` labels should be encoded as categorical, i.e. integers."""
+        is_multiclass = False
+        # check if multiclass
+        if any(y >= 2 for y in y_train):
+            y_train = Tensor(y_train).long()
+            y_eval = Tensor(y_eval).long()
+            is_multiclass = True
+        else:
+            y_train = Tensor(y_train).float()
+            y_eval = Tensor(y_eval).float()
+        super()._fit(
+            x_train,
+            y_train,
+            x_eval,
+            y_eval,
+            fit_tokenizer=fit_tokenizer,
+            is_classification=True,
+            is_multiclass=is_multiclass,
+        )
+
+    def fit_tokenizer(self, x_train: _ArrayLikeStr_co, y_train: Optional[_ArrayLikeInt_co] = None) -> None:
+        super().fit_tokenizer(x_train, y_train)
+
+    def predict(
+        self, x: _ArrayLikeStr_co, batch_size: Optional[int] = None, score_threshold: float = 0.5
+    ) -> NDArray[np.int64]:
+        predictions = self._get_predictions(x, batch_size)
+        if predictions.shape[1] > 1:
+            # multiclass classification
+            return np.array(np.argmax(predictions, axis=1), dtype=np.int64)
+        return np.array(predictions.flatten() > score_threshold, dtype=np.int64)
+
+    def predict_scores(self, x: _ArrayLikeStr_co, batch_size: Optional[int] = None) -> NDArray[np.float64]:
+        predictions = self._get_predictions(x, batch_size)
+        if predictions.shape[1] == 1:
+            predictions = predictions.flatten()
+        return np.array(predictions, dtype=np.float64)
+
+    def test(
+        self,
+        x: _ArrayLikeStr_co,
+        y_test: _ArrayLikeInt_co,
+        batch_size: Optional[int] = None,
+        test_metrics_dict: Optional[dict[str, Union[Metric, Callable[[Tensor, Tensor], Any]]]] = None,
+    ) -> dict[str, Any]:
+        if self.nn_module.is_multiclass:
+            y_test = Tensor(y_test).long()
+        else:
+            y_test = Tensor(y_test).float()
+        return super()._test(x, y_test, batch_size, test_metrics_dict)
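
For reference, a hedged usage sketch assembled from the signature and docstring above: the model returns logits, the default `BCEWithLogitsLoss` fits binary labels, and metrics are named functions bound via `functools.partial`. The `SimpleNN` module, the TF-IDF settings, and the toy data are illustrative assumptions, not part of this commit:

```python
# Usage sketch for NNClassifier; SimpleNN, the data, and hyperparameters are
# assumptions for illustration only.
from functools import partial

import torch
import torch.nn as nn
from sklearn.feature_extraction.text import TfidfVectorizer

from mim_nlp.classifier.nn import NNClassifier

# With TF-IDF, `input_size` must equal the fitted vocabulary size;
# the 4 distinct tokens below yield exactly 4 features.
VOCAB_SIZE = 4


class SimpleNN(nn.Module):
    """Tiny binary classifier; no final activation, it returns logits."""

    def __init__(self, input_size: int):
        super().__init__()
        self.layers = nn.Sequential(nn.Linear(input_size, 8), nn.ReLU(), nn.Linear(8, 1))

    def forward(self, x):
        return self.layers(x)


def accuracy_binary(y_pred, y_target, threshold=0.5):
    # Metrics receive probabilities and targets, in that order; a named
    # function (not a lambda) keeps the model pickleable for `save`.
    return torch.sum(y_target == (y_pred > threshold)) / len(y_target)


clf = NNClassifier(
    batch_size=2,
    epochs=2,
    input_size=VOCAB_SIZE,
    tokenizer=TfidfVectorizer(),
    neural_network=SimpleNN(VOCAB_SIZE),
    # default loss (BCEWithLogitsLoss) matches the binary labels below
    train_metrics_dict={"accuracy": partial(accuracy_binary, threshold=0.5)},
    device="cpu",
)
clf.fit(["good", "bad", "great", "awful"], [1, 0, 1, 0], fit_tokenizer=True)
print(clf.predict(["really good"]))         # hard 0/1 labels via 0.5 threshold
print(clf.predict_scores(["really good"]))  # probabilities
```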

mim_nlp/classifier/svm/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+from .svm import SVMClassifier, SVMClassifierWithFeatureSelection
