From 3d195e23d88884ad72699e59ad67ca708addfc05 Mon Sep 17 00:00:00 2001
From: Peter Lombaers
Date: Thu, 28 Mar 2024 15:13:05 +0100
Subject: [PATCH] Linting and formatting

---
 asreviewcontrib/datatools/__init__.py   |  2 +-
 asreviewcontrib/datatools/compose.py    | 43 +++++++++++++++----------
 asreviewcontrib/datatools/describe.py   | 23 +++++++++----
 asreviewcontrib/datatools/entrypoint.py | 10 +++---
 asreviewcontrib/datatools/snowball.py   | 10 +++---
 asreviewcontrib/datatools/stack.py      | 10 +++---
 tests/test_compose.py                   |  3 +-
 tests/test_describe.py                  |  1 -
 8 files changed, 64 insertions(+), 38 deletions(-)

diff --git a/asreviewcontrib/datatools/__init__.py b/asreviewcontrib/datatools/__init__.py
index 8eefaf1..0213423 100644
--- a/asreviewcontrib/datatools/__init__.py
+++ b/asreviewcontrib/datatools/__init__.py
@@ -3,4 +3,4 @@
     from asreviewcontrib.datatools._version import __version_tuple__
 except ImportError:
     __version__ = "0.0.0"
-    __version_tuple__ = (0, 0, 0)
\ No newline at end of file
+    __version_tuple__ = (0, 0, 0)
diff --git a/asreviewcontrib/datatools/compose.py b/asreviewcontrib/datatools/compose.py
index 261b389..6ebc0ab 100644
--- a/asreviewcontrib/datatools/compose.py
+++ b/asreviewcontrib/datatools/compose.py
@@ -17,7 +17,8 @@ def _check_order_arg(order):
         return order
     else:
         raise ValueError(
-            f"hierarchy '{order}' not found, should be one of the following: {allowed_orders}"
+            f"hierarchy '{order}' not found, should be one of the"
+            f" following: {allowed_orders}"
         )
 
 
@@ -48,8 +49,8 @@ def _check_suffix(input_files, output_file):
     if len(set(suffixes)) > 1:
         if not (set_suffixes.issubset(set_ris) or set_suffixes.issubset(set_tabular)):
             raise ValueError(
-                "Files with different file types were; all input files, as well as the output file, should be of the "
-                "same type. "
+                "Files with different file types were given; all input files, as"
+                " well as the output file, should be of the same type."
             )
 
 
@@ -57,9 +58,11 @@ def _check_label_errors(as_lab, path_lab):
     if as_lab is not None:
         if as_lab.labels is None:
             warnings.warn(
-                f"'{path_lab}' was passed as a labeled dataset but no labels were found, continuing with its records "
-                f"marked as unlabeled. If this is not correct, check if your data format complies with: "
-                f"https://asreview.readthedocs.io/en/latest/data_format.html"
+                f"'{path_lab}' was passed as a labeled dataset but no labels were"
+                " found, continuing with its records marked as unlabeled. If this is"
+                " not correct, check if your data format complies with:"
+                " https://asreview.readthedocs.io/en/latest/data_format.html",
+                stacklevel=1,
             )
 
 
@@ -83,8 +86,8 @@ def _concat_label(list_df, label, pid="doi"):
         n_total_dedup = n_total - len(df_all)
 
         print(
-            f"Detected {n_total} records with label '{label}', from which {n_total_dedup} duplicate records with the "
-            f"same label were removed."
+            f"Detected {n_total} records with label '{label}', from which"
+            f" {n_total_dedup} duplicate records with the same label were removed."
         )
     else:
         df_all = pd.DataFrame()
@@ -104,9 +107,9 @@ def create_composition(
     # load all input files and URLs into ASReviewData objects, fill with None
     # if input was not specified
     input_files = [rel_path, irr_path, lab_path, unl_path]
-    as_rel, as_irr, as_lab, as_unl = [
-        load_data(item) if item is not None else None for item in input_files
-    ]
+    as_rel, as_irr, as_lab, as_unl = (
+        load_data(item) if item is not None else None for item in input_files
+    )
 
     # check whether input files are correctly labeled
     _check_label_errors(as_lab, lab_path)
@@ -185,10 +188,11 @@ def create_composition(
            "left",
        ):
            print(
-                f"\nSome records have inconsistent labels in the input files. This may be intentional because you are "
-                f"trying to overwrite labels in an input file with labels from another input file. However, "
-                f"it may also be because some records are unintentionally labeled inconsistently.\n\n"
-                f"The following records have inconsistent labels in the input files:\n"
+                "\nSome records have inconsistent labels in the input files. This may"
+                " be intentional because you are trying to overwrite labels in an"
+                " input file with labels from another input file. However, it may"
+                " also be because some records are unintentionally labeled"
+                " inconsistently.\n\n"
+                "The following records have inconsistent labels in the input files:\n"
                f"{df_info_conflicts}\n"
            )
 
@@ -197,14 +201,19 @@ def create_composition(
 
    elif resolve == "keep_one":
        warnings.warn(
-            f"Continuing, keeping one label for records with inconsistent labels, resolving conflicts using the "
-            f"following hierarchy:\n1. {dict_terms[order[0]]}\n2. {dict_terms[order[1]]}\n3. {dict_terms[order[2]]}"
+            "Continuing, keeping one label for records with inconsistent labels,"
+            " resolving conflicts using the following hierarchy:"
+            f"\n1. {dict_terms[order[0]]}\n2. {dict_terms[order[1]]}"
+            f"\n3. {dict_terms[order[2]]}",
+            stacklevel=1,
        )
        df_composed = as_conflict.drop_duplicates(pid=pid).reset_index(drop=True)
 
    elif resolve == "keep_all":
        warnings.warn(
-            f"Continuing, keeping all labels for duplicate records with inconsistent labels."
+            "Continuing, keeping all labels for duplicate records with inconsistent"
+            " labels.",
+            stacklevel=1,
        )
        df_composed = as_conflict.df
diff --git a/asreviewcontrib/datatools/describe.py b/asreviewcontrib/datatools/describe.py
index bcd38bb..40336b1 100644
--- a/asreviewcontrib/datatools/describe.py
+++ b/asreviewcontrib/datatools/describe.py
@@ -1,16 +1,20 @@
 import argparse
 import json
-from pathlib import Path
 
 import asreview
 from asreview.data import load_data
-from asreview.data.statistics import *  # noqa
+from asreview.data.statistics import n_duplicates
+from asreview.data.statistics import n_irrelevant
+from asreview.data.statistics import n_missing_abstract
+from asreview.data.statistics import n_missing_title
+from asreview.data.statistics import n_records
+from asreview.data.statistics import n_relevant
+from asreview.data.statistics import n_unlabeled
 
 from asreviewcontrib.datatools import __version__
 
 
 def describe(input_path, output_path=None):
-
     # read data in ASReview data object
     asdata = load_data(input_path)
 
@@ -47,19 +51,26 @@ def describe(input_path, output_path=None):
        {
            "id": "n_missing_title",
            "title": "Number of records with missing title",
-            "description": "The number of records in the dataset with missing title.",
+            "description": (
+                "The number of records in the dataset with missing title."
+            ),
            "value": n_missing_title(asdata)[0],
        },
        {
            "id": "n_missing_abstract",
            "title": "Number of records with missing abstract",
-            "description": "The number of records in the dataset with missing abstract.",
+            "description": (
+                "The number of records in the dataset with missing abstract."
+            ),
            "value": n_missing_abstract(asdata)[0],
        },
        {
            "id": "n_duplicates",
            "title": "Number of duplicate records (basic algorithm)",
-            "description": "The number of duplicate records in the dataset based on similar text.",
+            "description": (
+                "The number of duplicate records in the dataset based on"
+                " similar text."
+            ),
            "value": n_duplicates(asdata),
        },
    ]
diff --git a/asreviewcontrib/datatools/entrypoint.py b/asreviewcontrib/datatools/entrypoint.py
index 86c83a0..562bea7 100644
--- a/asreviewcontrib/datatools/entrypoint.py
+++ b/asreviewcontrib/datatools/entrypoint.py
@@ -3,6 +3,7 @@
 from asreview.data import load_data
 from asreview.entry_points import BaseEntryPoint
 
+from asreviewcontrib.datatools import __version__
 from asreviewcontrib.datatools.compose import _parse_arguments_compose
 from asreviewcontrib.datatools.compose import compose
 from asreviewcontrib.datatools.convert import _parse_arguments_convert
@@ -24,7 +25,7 @@ class DataEntryPoint(BaseEntryPoint):
     def __init__(self):
         from asreviewcontrib.datatools.__init__ import __version__
 
-        super(DataEntryPoint, self).__init__()
+        super().__init__()
         self.version = __version__
 
 
@@ -78,11 +79,13 @@ def execute(self, argv):
            if args_dedup.output_path:
                asdata.to_file(args_dedup.output_path)
                print(
-                    f"Removed {n_dup} duplicates from dataset with {initial_length} records."
+                    f"Removed {n_dup} duplicates from dataset with"
+                    f" {initial_length} records."
                )
            else:
                print(
-                    f"Found {n_dup} duplicates in dataset with {initial_length} records."
+                    f"Found {n_dup} duplicates in dataset with"
+                    f" {initial_length} records."
                )
        if argv[0] == "compose":
            args_compose_parser = _parse_arguments_compose()
@@ -108,7 +111,6 @@ def execute(self, argv):
 
        # Print help message if subcommand not given or incorrect
        else:
-
            parser = argparse.ArgumentParser(
                prog="asreview data",
                formatter_class=argparse.RawTextHelpFormatter,
diff --git a/asreviewcontrib/datatools/snowball.py b/asreviewcontrib/datatools/snowball.py
index 3108695..d56114f 100644
--- a/asreviewcontrib/datatools/snowball.py
+++ b/asreviewcontrib/datatools/snowball.py
@@ -211,7 +211,7 @@ def snowball(
         raise ValueError("At least one of 'forward' or 'backward' should be True.")
 
     data = load_data(input_path)
-    if (use_all or (data.included is None)):
+    if use_all or (data.included is None):
         data = data.df
     else:
         data = data.df.loc[data.included.astype(bool)]
@@ -236,9 +236,11 @@ def snowball(
             " records. Performing snowballing for those records."
         )
         data["openalex_id"] = None
-        data.loc[data.doi.notna(), "openalex_id"] = data.loc[
-            data.doi.notna(), "doi"
-        ].str.removeprefix(DOI_PREFIX).apply(lambda doi: id_mapping[doi])
+        data.loc[data.doi.notna(), "openalex_id"] = (
+            data.loc[data.doi.notna(), "doi"]
+            .str.removeprefix(DOI_PREFIX)
+            .apply(lambda doi: id_mapping[doi])
+        )
 
     identifiers = data["openalex_id"].dropna().to_list()
diff --git a/asreviewcontrib/datatools/stack.py b/asreviewcontrib/datatools/stack.py
index ec54005..d7dc813 100644
--- a/asreviewcontrib/datatools/stack.py
+++ b/asreviewcontrib/datatools/stack.py
@@ -1,5 +1,4 @@
 import argparse
-import warnings
 from pathlib import Path
 
 import pandas as pd
@@ -19,8 +18,8 @@ def _check_suffix(input_files, output_file):
     if len(set(suffixes)) > 1:
         if not (set_suffixes.issubset(set_ris) or set_suffixes.issubset(set_tabular)):
             raise ValueError(
-                "• Several file types were given; All input files, as well as the output file should be of the same "
-                "type. "
+                "• Several file types were given; all input files, as well as the"
+                " output file, should be of the same type."
             )
 
 
@@ -38,7 +37,10 @@ def _parse_arguments_vstack():
     parser = argparse.ArgumentParser(prog="asreview data vstack")
     parser.add_argument("output_path", type=str, help="The output file path.")
     parser.add_argument(
-        "datasets", type=str, nargs="+", help="Any number of datasets to stack vertically."
+        "datasets",
+        type=str,
+        nargs="+",
+        help="Any number of datasets to stack vertically.",
     )
     return parser
diff --git a/tests/test_compose.py b/tests/test_compose.py
index abefe09..30698ef 100644
--- a/tests/test_compose.py
+++ b/tests/test_compose.py
@@ -55,7 +55,8 @@ def test_label_prioritization():
     df_3 = create_composition(*input_files_1, order="uri")
     assert df_3["included"].value_counts()[-1] == len(df_3)
 
-    # input different datasets with some identical records, combining as labeled and unlabeled data
+    # input different datasets with some identical records, combining as labeled and
+    # unlabeled data
     df_4 = create_composition(*input_files_2, order="riu")
     df_4_counts = df_4["included"].value_counts()
     assert df_4_counts[-1] == 7 and df_4_counts[0] == 3 and df_4_counts[1] == 1
diff --git a/tests/test_describe.py b/tests/test_describe.py
index a3016eb..10e0147 100644
--- a/tests/test_describe.py
+++ b/tests/test_describe.py
@@ -2,5 +2,4 @@
 
 
 def test_describe():
-
     subprocess.run(["asreview", "data-describe", "benchmark:van_de_schoot2017"])