From 3d195e23d88884ad72699e59ad67ca708addfc05 Mon Sep 17 00:00:00 2001
From: Peter Lombaers
Date: Thu, 28 Mar 2024 15:13:05 +0100
Subject: [PATCH] Linting and formatting

---
 asreviewcontrib/datatools/__init__.py   |  2 +-
 asreviewcontrib/datatools/compose.py    | 43 +++++++++++++++----------
 asreviewcontrib/datatools/describe.py   | 23 +++++++++----
 asreviewcontrib/datatools/entrypoint.py | 10 +++---
 asreviewcontrib/datatools/snowball.py   | 10 +++---
 asreviewcontrib/datatools/stack.py      | 10 +++---
 tests/test_compose.py                   |  3 +-
 tests/test_describe.py                  |  1 -
 8 files changed, 64 insertions(+), 38 deletions(-)

diff --git a/asreviewcontrib/datatools/__init__.py b/asreviewcontrib/datatools/__init__.py
index 8eefaf1..0213423 100644
--- a/asreviewcontrib/datatools/__init__.py
+++ b/asreviewcontrib/datatools/__init__.py
@@ -3,4 +3,4 @@
     from asreviewcontrib.datatools._version import __version_tuple__
 except ImportError:
     __version__ = "0.0.0"
-    __version_tuple__ = (0, 0, 0)
\ No newline at end of file
+    __version_tuple__ = (0, 0, 0)
diff --git a/asreviewcontrib/datatools/compose.py b/asreviewcontrib/datatools/compose.py
index 261b389..6ebc0ab 100644
--- a/asreviewcontrib/datatools/compose.py
+++ b/asreviewcontrib/datatools/compose.py
@@ -17,7 +17,8 @@ def _check_order_arg(order):
         return order
     else:
         raise ValueError(
-            f"hierarchy '{order}' not found, should be one of the following: {allowed_orders}"
+            f"hierarchy '{order}' not found, should be one of the"
+            f" following: {allowed_orders}"
         )
 
 
@@ -48,8 +49,8 @@ def _check_suffix(input_files, output_file):
     if len(set(suffixes)) > 1:
         if not (set_suffixes.issubset(set_ris) or set_suffixes.issubset(set_tabular)):
             raise ValueError(
-                "Files with different file types were; all input files, as well as the output file, should be of the "
-                "same type. "
+                "Files with different file types were given; all input files, as"
+                " well as the output file, should be of the same type."
             )
 
 
@@ -57,9 +58,11 @@ def _check_label_errors(as_lab, path_lab):
     if as_lab is not None:
         if as_lab.labels is None:
             warnings.warn(
-                f"'{path_lab}' was passed as a labeled dataset but no labels were found, continuing with its records "
-                f"marked as unlabeled. If this is not correct, check if your data format complies with: "
-                f"https://asreview.readthedocs.io/en/latest/data_format.html"
+                f"'{path_lab}' was passed as a labeled dataset but no labels were"
+                " found, continuing with its records marked as unlabeled. If this is"
+                " not correct, check if your data format complies with:"
+                " https://asreview.readthedocs.io/en/latest/data_format.html",
+                stacklevel=1,
             )
 
 
@@ -83,8 +86,8 @@ def _concat_label(list_df, label, pid="doi"):
         n_total_dedup = n_total - len(df_all)
 
         print(
-            f"Detected {n_total} records with label '{label}', from which {n_total_dedup} duplicate records with the "
-            f"same label were removed."
+            f"Detected {n_total} records with label '{label}', from which"
+            f" {n_total_dedup} duplicate records with the same label were removed."
         )
     else:
         df_all = pd.DataFrame()
@@ -104,9 +107,9 @@ def create_composition(
     # load all input files and URLs into ASReviewData objects, fill with None
     # if input was not specified
     input_files = [rel_path, irr_path, lab_path, unl_path]
-    as_rel, as_irr, as_lab, as_unl = [
-        load_data(item) if item is not None else None for item in input_files
-    ]
+    as_rel, as_irr, as_lab, as_unl = (
+        load_data(item) if item is not None else None for item in input_files
+    )
 
     # check whether input files are correctly labeled
     _check_label_errors(as_lab, lab_path)
@@ -185,10 +188,11 @@ def create_composition(
            "left",
        ):
            print(
-                f"\nSome records have inconsistent labels in the input files. This may be intentional because you are "
-                f"trying to overwrite labels in an input file with labels from another input file. However, "
-                f"it may also be because some records are unintentionally labeled inconsistently.\n\n"
-                f"The following records have inconsistent labels in the input files:\n"
+                "\nSome records have inconsistent labels in the input files. This may"
+                " be intentional because you are trying to overwrite labels in an"
+                " input file with labels from another input file. However, it may"
+                " also be because some records are unintentionally labeled"
+                " inconsistently.\n\n"
+                "The following records have inconsistent labels in the input files:\n"
                f"{df_info_conflicts}\n"
            )
 
@@ -197,14 +201,19 @@ def create_composition(
 
    elif resolve == "keep_one":
        warnings.warn(
-            f"Continuing, keeping one label for records with inconsistent labels, resolving conflicts using the "
-            f"following hierarchy:\n1. {dict_terms[order[0]]}\n2. {dict_terms[order[1]]}\n3. {dict_terms[order[2]]}"
+            "Continuing, keeping one label for records with inconsistent labels,"
+            " resolving conflicts using the following hierarchy:"
+            f"\n1. {dict_terms[order[0]]}\n2. {dict_terms[order[1]]}"
+            f"\n3. {dict_terms[order[2]]}",
+            stacklevel=1,
        )
        df_composed = as_conflict.drop_duplicates(pid=pid).reset_index(drop=True)
 
    elif resolve == "keep_all":
        warnings.warn(
-            f"Continuing, keeping all labels for duplicate records with inconsistent labels."
+            "Continuing, keeping all labels for duplicate records with inconsistent"
+            " labels.",
+            stacklevel=1,
        )
        df_composed = as_conflict.df
diff --git a/asreviewcontrib/datatools/describe.py b/asreviewcontrib/datatools/describe.py
index bcd38bb..40336b1 100644
--- a/asreviewcontrib/datatools/describe.py
+++ b/asreviewcontrib/datatools/describe.py
@@ -1,16 +1,20 @@
 import argparse
 import json
-from pathlib import Path
 
 import asreview
 from asreview.data import load_data
-from asreview.data.statistics import *  # noqa
+from asreview.data.statistics import n_duplicates
+from asreview.data.statistics import n_irrelevant
+from asreview.data.statistics import n_missing_abstract
+from asreview.data.statistics import n_missing_title
+from asreview.data.statistics import n_records
+from asreview.data.statistics import n_relevant
+from asreview.data.statistics import n_unlabeled
 
 from asreviewcontrib.datatools import __version__
 
 
 def describe(input_path, output_path=None):
-
     # read data in ASReview data object
     asdata = load_data(input_path)
 
@@ -47,19 +51,26 @@ def describe(input_path, output_path=None):
        {
            "id": "n_missing_title",
            "title": "Number of records with missing title",
-            "description": "The number of records in the dataset with missing title.",
+            "description": (
+                "The number of records in the dataset with missing title."
+            ),
            "value": n_missing_title(asdata)[0],
        },
        {
            "id": "n_missing_abstract",
            "title": "Number of records with missing abstract",
-            "description": "The number of records in the dataset with missing abstract.",
+            "description": (
+                "The number of records in the dataset with missing abstract."
+            ),
            "value": n_missing_abstract(asdata)[0],
        },
        {
            "id": "n_duplicates",
            "title": "Number of duplicate records (basic algorithm)",
-            "description": "The number of duplicate records in the dataset based on similar text.",
+            "description": (
+                "The number of duplicate records in the dataset based on"
+                " similar text."
+            ),
            "value": n_duplicates(asdata),
        },
    ]
diff --git a/asreviewcontrib/datatools/entrypoint.py b/asreviewcontrib/datatools/entrypoint.py
index 86c83a0..562bea7 100644
--- a/asreviewcontrib/datatools/entrypoint.py
+++ b/asreviewcontrib/datatools/entrypoint.py
@@ -3,6 +3,7 @@
 from asreview.data import load_data
 from asreview.entry_points import BaseEntryPoint
 
+from asreviewcontrib.datatools import __version__
 from asreviewcontrib.datatools.compose import _parse_arguments_compose
 from asreviewcontrib.datatools.compose import compose
 from asreviewcontrib.datatools.convert import _parse_arguments_convert
@@ -24,7 +25,7 @@ class DataEntryPoint(BaseEntryPoint):
     def __init__(self):
         from asreviewcontrib.datatools.__init__ import __version__
 
-        super(DataEntryPoint, self).__init__()
+        super().__init__()
         self.version = __version__
 
 
@@ -78,11 +79,13 @@ def execute(self, argv):
            if args_dedup.output_path:
                asdata.to_file(args_dedup.output_path)
                print(
-                    f"Removed {n_dup} duplicates from dataset with {initial_length} records."
+                    f"Removed {n_dup} duplicates from dataset with"
+                    f" {initial_length} records."
                )
            else:
                print(
-                    f"Found {n_dup} duplicates in dataset with {initial_length} records."
+                    f"Found {n_dup} duplicates in dataset with"
+                    f" {initial_length} records."
                )
        if argv[0] == "compose":
            args_compose_parser = _parse_arguments_compose()
@@ -108,7 +111,6 @@ def execute(self, argv):
 
        # Print help message if subcommand not given or incorrect
        else:
-
            parser = argparse.ArgumentParser(
                prog="asreview data",
                formatter_class=argparse.RawTextHelpFormatter,
diff --git a/asreviewcontrib/datatools/snowball.py b/asreviewcontrib/datatools/snowball.py
index 3108695..d56114f 100644
--- a/asreviewcontrib/datatools/snowball.py
+++ b/asreviewcontrib/datatools/snowball.py
@@ -211,7 +211,7 @@ def snowball(
         raise ValueError("At least one of 'forward' or 'backward' should be True.")
 
     data = load_data(input_path)
-    if (use_all or (data.included is None)):
+    if use_all or (data.included is None):
         data = data.df
     else:
         data = data.df.loc[data.included.astype(bool)]
@@ -236,9 +236,11 @@ def snowball(
             " records. Performing snowballing for those records."
         )
         data["openalex_id"] = None
-        data.loc[data.doi.notna(), "openalex_id"] = data.loc[
-            data.doi.notna(), "doi"
-        ].str.removeprefix(DOI_PREFIX).apply(lambda doi: id_mapping[doi])
+        data.loc[data.doi.notna(), "openalex_id"] = (
+            data.loc[data.doi.notna(), "doi"]
+            .str.removeprefix(DOI_PREFIX)
+            .apply(lambda doi: id_mapping[doi])
+        )
 
     identifiers = data["openalex_id"].dropna().to_list()
diff --git a/asreviewcontrib/datatools/stack.py b/asreviewcontrib/datatools/stack.py
index ec54005..d7dc813 100644
--- a/asreviewcontrib/datatools/stack.py
+++ b/asreviewcontrib/datatools/stack.py
@@ -1,5 +1,4 @@
 import argparse
-import warnings
 from pathlib import Path
 
 import pandas as pd
@@ -19,8 +18,8 @@ def _check_suffix(input_files, output_file):
     if len(set(suffixes)) > 1:
         if not (set_suffixes.issubset(set_ris) or set_suffixes.issubset(set_tabular)):
             raise ValueError(
-                "• Several file types were given; All input files, as well as the output file should be of the same "
-                "type. "
+                "• Several file types were given; all input files, as well as the"
+                " output file, should be of the same type."
             )
 
 
@@ -38,7 +37,10 @@ def _parse_arguments_vstack():
     parser = argparse.ArgumentParser(prog="asreview data vstack")
     parser.add_argument("output_path", type=str, help="The output file path.")
     parser.add_argument(
-        "datasets", type=str, nargs="+", help="Any number of datasets to stack vertically."
+        "datasets",
+        type=str,
+        nargs="+",
+        help="Any number of datasets to stack vertically.",
     )
     return parser
diff --git a/tests/test_compose.py b/tests/test_compose.py
index abefe09..30698ef 100644
--- a/tests/test_compose.py
+++ b/tests/test_compose.py
@@ -55,7 +55,8 @@ def test_label_prioritization():
     df_3 = create_composition(*input_files_1, order="uri")
     assert df_3["included"].value_counts()[-1] == len(df_3)
 
-    # input different datasets with some identical records, combining as labeled and unlabeled data
+    # input different datasets with some identical records, combining as labeled and
+    # unlabeled data
     df_4 = create_composition(*input_files_2, order="riu")
     df_4_counts = df_4["included"].value_counts()
     assert df_4_counts[-1] == 7 and df_4_counts[0] == 3 and df_4_counts[1] == 1
diff --git a/tests/test_describe.py b/tests/test_describe.py
index a3016eb..10e0147 100644
--- a/tests/test_describe.py
+++ b/tests/test_describe.py
@@ -2,5 +2,4 @@
 
 
 def test_describe():
-
     subprocess.run(["asreview", "data-describe", "benchmark:van_de_schoot2017"])