Skip to content

Commit

Permalink
Linting and formatting
Browse files Browse the repository at this point in the history
  • Loading branch information
PeterLombaers committed Mar 28, 2024
1 parent d4cfdfe commit 3d195e2
Show file tree
Hide file tree
Showing 8 changed files with 64 additions and 38 deletions.
2 changes: 1 addition & 1 deletion asreviewcontrib/datatools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@
from asreviewcontrib.datatools._version import __version_tuple__
except ImportError:
__version__ = "0.0.0"
__version_tuple__ = (0, 0, 0)
__version_tuple__ = (0, 0, 0)
43 changes: 26 additions & 17 deletions asreviewcontrib/datatools/compose.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ def _check_order_arg(order):
return order
else:
raise ValueError(
f"hierarchy '{order}' not found, should be one of the following: {allowed_orders}"
f"hierarchy '{order}' not found, should be one of the"
f" following: {allowed_orders}"
)


Expand Down Expand Up @@ -48,18 +49,20 @@ def _check_suffix(input_files, output_file):
if len(set(suffixes)) > 1:
if not (set_suffixes.issubset(set_ris) or set_suffixes.issubset(set_tabular)):
raise ValueError(
"Files with different file types were; all input files, as well as the output file, should be of the "
"same type. "
                "Files with different file types were given; all input files, as well as the"
                " output file, should be of the same type. "
)


def _check_label_errors(as_lab, path_lab):
if as_lab is not None:
if as_lab.labels is None:
warnings.warn(
f"'{path_lab}' was passed as a labeled dataset but no labels were found, continuing with its records "
f"marked as unlabeled. If this is not correct, check if your data format complies with: "
f"https://asreview.readthedocs.io/en/latest/data_format.html"
f"'{path_lab}' was passed as a labeled dataset but no labels were"
" found, continuing with its records marked as unlabeled. If this is"
" not correct, check if your data format complies with:"
" https://asreview.readthedocs.io/en/latest/data_format.html",
stacklevel=1,
)


Expand All @@ -83,8 +86,8 @@ def _concat_label(list_df, label, pid="doi"):

n_total_dedup = n_total - len(df_all)
print(
f"Detected {n_total} records with label '{label}', from which {n_total_dedup} duplicate records with the "
f"same label were removed."
f"Detected {n_total} records with label '{label}', from which"
f" {n_total_dedup} duplicate records with the same label were removed."
)
else:
df_all = pd.DataFrame()
Expand All @@ -104,9 +107,9 @@ def create_composition(
# load all input files and URLs into ASReviewData objects, fill with None
# if input was not specified
input_files = [rel_path, irr_path, lab_path, unl_path]
as_rel, as_irr, as_lab, as_unl = [
as_rel, as_irr, as_lab, as_unl = (
load_data(item) if item is not None else None for item in input_files
]
)

# check whether input files are correctly labeled
_check_label_errors(as_lab, lab_path)
Expand Down Expand Up @@ -185,10 +188,11 @@ def create_composition(
"left",
):
print(
f"\nSome records have inconsistent labels in the input files. This may be intentional because you are "
f"trying to overwrite labels in an input file with labels from another input file. However, "
f"it may also be because some records are unintentionally labeled inconsistently.\n\n"
f"The following records have inconsistent labels in the input files:\n"
f"\nSome records have inconsistent labels in the input files. This may"
" be intentional because you are trying to overwrite labels in an input"
" file with labels from another input file. However, it may also be"
" because some records are unintentionally labeled inconsistently.\n\n"
"The following records have inconsistent labels in the input files:\n"
f"{df_info_conflicts}\n"
)

Expand All @@ -197,14 +201,19 @@ def create_composition(

elif resolve == "keep_one":
warnings.warn(
f"Continuing, keeping one label for records with inconsistent labels, resolving conflicts using the "
f"following hierarchy:\n1. {dict_terms[order[0]]}\n2. {dict_terms[order[1]]}\n3. {dict_terms[order[2]]}"
f"Continuing, keeping one label for records with inconsistent labels,"
" resolving conflicts using the following hierarchy:"
f"\n1. {dict_terms[order[0]]}\n2. {dict_terms[order[1]]}"
f"\n3. {dict_terms[order[2]]}",
stacklevel=1,
)
df_composed = as_conflict.drop_duplicates(pid=pid).reset_index(drop=True)

elif resolve == "keep_all":
warnings.warn(
f"Continuing, keeping all labels for duplicate records with inconsistent labels."
"Continuing, keeping all labels for duplicate records with inconsistent"
" labels.",
stacklevel=1,
)
df_composed = as_conflict.df

Expand Down
23 changes: 17 additions & 6 deletions asreviewcontrib/datatools/describe.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,20 @@
import argparse
import json
from pathlib import Path

import asreview
from asreview.data import load_data
from asreview.data.statistics import * # noqa
from asreview.data.statistics import n_duplicates
from asreview.data.statistics import n_irrelevant
from asreview.data.statistics import n_missing_abstract
from asreview.data.statistics import n_missing_title
from asreview.data.statistics import n_records
from asreview.data.statistics import n_relevant
from asreview.data.statistics import n_unlabeled

from asreviewcontrib.datatools import __version__


def describe(input_path, output_path=None):

# read data in ASReview data object
asdata = load_data(input_path)

Expand Down Expand Up @@ -47,19 +51,26 @@ def describe(input_path, output_path=None):
{
"id": "n_missing_title",
"title": "Number of records with missing title",
"description": "The number of records in the dataset with missing title.",
"description": (
"The number of records in the dataset with missing title."
),
"value": n_missing_title(asdata)[0],
},
{
"id": "n_missing_abstract",
"title": "Number of records with missing abstract",
"description": "The number of records in the dataset with missing abstract.",
"description": (
"The number of records in the dataset with missing abstract."
),
"value": n_missing_abstract(asdata)[0],
},
{
"id": "n_duplicates",
"title": "Number of duplicate records (basic algorithm)",
"description": "The number of duplicate records in the dataset based on similar text.",
"description": (
"The number of duplicate records in the dataset based on"
" similar text."
),
"value": n_duplicates(asdata),
},
]
Expand Down
10 changes: 6 additions & 4 deletions asreviewcontrib/datatools/entrypoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from asreview.data import load_data
from asreview.entry_points import BaseEntryPoint

from asreviewcontrib.datatools import __version__
from asreviewcontrib.datatools.compose import _parse_arguments_compose
from asreviewcontrib.datatools.compose import compose
from asreviewcontrib.datatools.convert import _parse_arguments_convert
Expand All @@ -24,7 +25,7 @@ class DataEntryPoint(BaseEntryPoint):
def __init__(self):
from asreviewcontrib.datatools.__init__ import __version__

super(DataEntryPoint, self).__init__()
super().__init__()

self.version = __version__

Expand Down Expand Up @@ -78,11 +79,13 @@ def execute(self, argv):
if args_dedup.output_path:
asdata.to_file(args_dedup.output_path)
print(
f"Removed {n_dup} duplicates from dataset with {initial_length} records."
f"Removed {n_dup} duplicates from dataset with"
f" {initial_length} records."
)
else:
print(
f"Found {n_dup} duplicates in dataset with {initial_length} records."
f"Found {n_dup} duplicates in dataset with"
f" {initial_length} records."
)
if argv[0] == "compose":
args_compose_parser = _parse_arguments_compose()
Expand All @@ -108,7 +111,6 @@ def execute(self, argv):

# Print help message if subcommand not given or incorrect
else:

parser = argparse.ArgumentParser(
prog="asreview data",
formatter_class=argparse.RawTextHelpFormatter,
Expand Down
10 changes: 6 additions & 4 deletions asreviewcontrib/datatools/snowball.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@ def snowball(
raise ValueError("At least one of 'forward' or 'backward' should be True.")

data = load_data(input_path)
if (use_all or (data.included is None)):
if use_all or (data.included is None):
data = data.df
else:
data = data.df.loc[data.included.astype(bool)]
Expand All @@ -236,9 +236,11 @@ def snowball(
" records. Performing snowballing for those records."
)
data["openalex_id"] = None
data.loc[data.doi.notna(), "openalex_id"] = data.loc[
data.doi.notna(), "doi"
].str.removeprefix(DOI_PREFIX).apply(lambda doi: id_mapping[doi])
data.loc[data.doi.notna(), "openalex_id"] = (
data.loc[data.doi.notna(), "doi"]
.str.removeprefix(DOI_PREFIX)
.apply(lambda doi: id_mapping[doi])
)

identifiers = data["openalex_id"].dropna().to_list()

Expand Down
10 changes: 6 additions & 4 deletions asreviewcontrib/datatools/stack.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import argparse
import warnings
from pathlib import Path

import pandas as pd
Expand All @@ -19,8 +18,8 @@ def _check_suffix(input_files, output_file):
if len(set(suffixes)) > 1:
if not (set_suffixes.issubset(set_ris) or set_suffixes.issubset(set_tabular)):
raise ValueError(
"• Several file types were given; All input files, as well as the output file should be of the same "
"type. "
"• Several file types were given; All input files, as well as the"
" output file should be of the same type. "
)


Expand All @@ -38,7 +37,10 @@ def _parse_arguments_vstack():
parser = argparse.ArgumentParser(prog="asreview data vstack")
parser.add_argument("output_path", type=str, help="The output file path.")
parser.add_argument(
"datasets", type=str, nargs="+", help="Any number of datasets to stack vertically."
"datasets",
type=str,
nargs="+",
help="Any number of datasets to stack vertically.",
)

return parser
3 changes: 2 additions & 1 deletion tests/test_compose.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,8 @@ def test_label_prioritization():
df_3 = create_composition(*input_files_1, order="uri")
assert df_3["included"].value_counts()[-1] == len(df_3)

# input different datasets with some identical records, combining as labeled and unlabeled data
# input different datasets with some identical records, combining as labeled and
# unlabeled data
df_4 = create_composition(*input_files_2, order="riu")
df_4_counts = df_4["included"].value_counts()
assert df_4_counts[-1] == 7 and df_4_counts[0] == 3 and df_4_counts[1] == 1
1 change: 0 additions & 1 deletion tests/test_describe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,4 @@


def test_describe():

subprocess.run(["asreview", "data-describe", "benchmark:van_de_schoot2017"])

0 comments on commit 3d195e2

Please sign in to comment.