Address qc-duplicate-exact-synonym-no-abbrev failures in mondo #751

Merged · 10 commits · Feb 5, 2025
12 changes: 11 additions & 1 deletion src/ontology/mondo-ingest.Makefile
@@ -583,7 +583,17 @@ $(SYN_SYNC_DIR):
mkdir -p $@

.PHONY: sync-synonyms
sync-synonyms: $(SYN_SYNC_DIR)/synonym_sync_combined_cases.robot.tsv $(SYN_SYNC_DIR)/sync-synonyms.added.robot.tsv $(SYN_SYNC_DIR)/sync-synonyms.confirmed.robot.tsv $(SYN_SYNC_DIR)/sync-synonyms.updated.robot.tsv
sync-synonyms: $(SYN_SYNC_DIR)/review-qc-duplicate-exact-synonym-no-abbrev.tsv

# Side effects: mutates the .robot.tsv files, filtering out certain cases, which are instead written to the review-*.tsv.
$(SYN_SYNC_DIR)/review-qc-duplicate-exact-synonym-no-abbrev.tsv: $(SYN_SYNC_DIR)/synonym_sync_combined_cases.robot.tsv $(SYN_SYNC_DIR)/sync-synonyms.added.robot.tsv $(SYN_SYNC_DIR)/sync-synonyms.confirmed.robot.tsv $(SYN_SYNC_DIR)/sync-synonyms.updated.robot.tsv tmp/mondo-synonyms-scope-type-xref.tsv $(TMPDIR)/mondo.db
python3 $(SCRIPTSDIR)/sync_synonym_curation_filtering.py \
--added-path reports/sync-synonym/sync-synonyms.added.robot.tsv \
--confirmed-path reports/sync-synonym/sync-synonyms.confirmed.robot.tsv \
--updated-path reports/sync-synonym/sync-synonyms.updated.robot.tsv \
--mondo-synonyms-path tmp/mondo-synonyms-scope-type-xref.tsv \
--mondo-db-path $(TMPDIR)/mondo.db \
--outpath reports/sync-synonym/review-qc-duplicate-exact-synonym-no-abbrev.tsv

tmp/mondo-synonyms-scope-type-xref.tsv: $(TMPDIR)/mondo.owl
$(ROBOT) query -i tmp/mondo.owl --query ../sparql/synonyms-scope-type-xref.sparql $@
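For reference, the new review target boils down to a single call into the new script. A rough Python equivalent of the recipe above, assuming $(TMPDIR) expands to tmp/ and the command runs from src/ontology/ (as the relative reports/ paths suggest):

from src.scripts.sync_synonym_curation_filtering import sync_synonyms_curation_filtering

# Paths mirror the Makefile recipe; tmp/mondo.db is an assumed expansion of $(TMPDIR)/mondo.db.
sync_synonyms_curation_filtering(
    added_path='reports/sync-synonym/sync-synonyms.added.robot.tsv',
    confirmed_path='reports/sync-synonym/sync-synonyms.confirmed.robot.tsv',
    updated_path='reports/sync-synonym/sync-synonyms.updated.robot.tsv',
    mondo_synonyms_path='tmp/mondo-synonyms-scope-type-xref.tsv',
    mondo_db_path='tmp/mondo.db',
    outpath='reports/sync-synonym/review-qc-duplicate-exact-synonym-no-abbrev.tsv',
)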
6 changes: 3 additions & 3 deletions src/scripts/sync_synonym.py
@@ -156,7 +156,7 @@ def lower_and_strip(x: str) -> str:

def _common_operations(
df: pd.DataFrame, outpath: Union[Path, str], order_cols: List[str] = list(HEADERS_TO_ROBOT_SUBHEADERS.keys()),
sort_cols: List[str] = SORT_COLS, mondo_exclusions_df=pd.DataFrame(), save=True, df_is_combined=False
sort_cols: List[str] = SORT_COLS, mondo_exclusions_df=pd.DataFrame(), save=True, dont_make_scope_cols=False
) -> pd.DataFrame:
"""Merges synonym types, filters exclusions, does some formatting, and optionally saves.

@@ -174,7 +174,7 @@ def _common_operations(
df = _filter_a_by_not_in_b(df, mondo_exclusions_df, ['mondo_id', 'synonym_scope', 'synonym_join'])

# Format
if not df_is_combined:
if not dont_make_scope_cols:
# - Add ROBOT columns for each synonym scope
synonym_scopes = ['exact', 'broad', 'narrow', 'related']
for scope in synonym_scopes:
@@ -424,7 +424,7 @@ def sync_synonyms(
# Write outputs
combined_cases_df = pd.concat([confirmed_df, added_df, updated_df, deleted_df], ignore_index=True)\
.fillna('')
combined_cases_df = _common_operations(combined_cases_df, outpath_combined, df_is_combined=True)
combined_cases_df = _common_operations(combined_cases_df, outpath_combined, dont_make_scope_cols=True)
combined_cases_df['source'] = source_name
combined_cases_df = pd.concat([pd.DataFrame([HEADERS_TO_ROBOT_SUBHEADERS]), combined_cases_df])
combined_cases_df.to_csv(outpath_combined, sep='\t', index=False)
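The rename from df_is_combined to dont_make_scope_cols describes what the flag actually controls: whether _common_operations fans each row's synonym out into one ROBOT column per scope (exact, broad, narrow, related). A minimal sketch of that fan-out on made-up rows; the real column names come from HEADERS_TO_ROBOT_SUBHEADERS, so the ones used here are only illustrative:

import pandas as pd

# Hypothetical rows; the point is the per-scope fan-out that dont_make_scope_cols=True suppresses.
df = pd.DataFrame({
    'synonym': ['alpha syndrome', 'beta disease'],
    'synonym_scope': ['oio:hasExactSynonym', 'oio:hasBroadSynonym'],
})
for scope in ['exact', 'broad', 'narrow', 'related']:
    df[f'{scope}_synonym'] = df.apply(
        lambda row: row['synonym'] if row['synonym_scope'] == f'oio:has{scope.capitalize()}Synonym' else '',
        axis=1)
print(df)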
148 changes: 148 additions & 0 deletions src/scripts/sync_synonym_curation_filtering.py
@@ -0,0 +1,148 @@
"""Filter out cases where curation is needed"""
import os
import shutil
import sys
from argparse import ArgumentParser
from datetime import datetime
from pathlib import Path
from typing import Union, List

import pandas as pd
from oaklib import get_adapter
from oaklib.types import CURIE, URI

from src.scripts.utils import remove_angle_brackets

HERE = Path(os.path.abspath(os.path.dirname(__file__)))
SRC_DIR = HERE.parent
PROJECT_ROOT = SRC_DIR.parent
sys.path.insert(0, str(PROJECT_ROOT))
from src.scripts.sync_synonym import _common_operations, _read_sparql_output_tsv


def _read_synonym_file(path: Union[Path, str], case: str) -> pd.DataFrame:
"""Does special operations for reading in a synonym sync robot.tsv in this context"""
df = pd.read_csv(path, sep='\t').rename(columns={'synonym_scope_source': 'synonym_scope'}) \
.drop(0)[['synonym', 'synonym_scope', 'mondo_id', 'source_id', 'synonym_type']].fillna('')
df['synonym_type'] = df['synonym_type'].apply(
lambda x: x.replace('http://purl.obolibrary.org/obo/mondo#ABBREVIATION', 'MONDO:ABBREVIATION'))
df['case'] = case
return df


def sync_synonyms_curation_filtering(
added_path: Union[Path, str], confirmed_path: Union[Path, str], updated_path: Union[Path, str],
mondo_synonyms_path: Union[Path, str], mondo_db_path: Union[Path, str], outpath: Union[Path, str]
):
"""Filter out cases where curation is needed"""
# todo temp: backup temporarily for development; remove when done
tmp_dir = SRC_DIR / 'ontology' / 'tmp'
t0 = str(datetime.now())
for file in [added_path, confirmed_path, updated_path]:
shutil.copy(file, tmp_dir / f"{Path(file).name.replace('.robot.tsv', '')}_{t0}.robot.tsv")

# Read -added & -updated
df_add: pd.DataFrame = _read_synonym_file(added_path, 'added')
df_upd: pd.DataFrame = _read_synonym_file(updated_path, 'updated')

# Read -confirmed & ascertain unconfirmed
df_conf = pd.read_csv(confirmed_path, sep='\t').drop(0).fillna('') # Not de-duped. Used for informational purposes.
df_conf['case'] = 'confirmed'
df_conf['synonym_scope'] = df_conf['synonym_scope_source']
df_mondo_syns = _read_sparql_output_tsv(mondo_synonyms_path).fillna('').rename(columns={
'cls_id': 'mondo_id', 'cls_label': 'mondo_label', 'synonym_type': 'synonym_type_mondo', 'dbXref': 'source_id'})
df_mondo_syns = df_mondo_syns[~df_mondo_syns['mondo_label'].str.startswith('obsolete ')]
merge_columns = ['synonym', 'synonym_scope', 'mondo_id']
df_mondo_conf = df_mondo_syns.merge(
df_conf[merge_columns + ['case']], on=merge_columns, how='left', suffixes=('', '_conf'))
df_mondo_syns['case'] = df_mondo_conf['case'].fillna('unconfirmed')

# Group all sources of synonyms & labels cases together
df_all = pd.concat([df_add, df_upd, df_mondo_syns], ignore_index=True).fillna('')
df_all['synonym_type'] = df_all.apply(
lambda row: row['synonym_type'] if row['synonym_type'] else row['synonym_type_mondo'], axis=1)

# Discover review cases: exactSynonym appears in multiple terms
# - find all duplicative cases where scope+synonym has >1 instance
df_all_dupes = df_all[df_all.groupby(['synonym', 'synonym_scope']).transform('size') > 1]\
.sort_values(['synonym', 'synonym_scope', 'mondo_id', 'source_id'])
# - find all cases where, among these multiple synonym+scope instances, there is >1 associated mondo_id
df_review_syns = df_all_dupes[
df_all_dupes.groupby(['synonym', 'synonym_scope'])['mondo_id'].transform('nunique') > 1]
df_review_syns = df_review_syns.sort_values(['synonym', 'synonym_scope', 'mondo_id'])
# - leave only exactMatch cases
df_review_syns = df_review_syns[df_review_syns['synonym_scope'] == 'oio:hasExactSynonym']
# - before filtering abbreviations: handle edge cases of missing synonym_type for 1 of the duplicates
df_review_syns['synonym_type'] = df_review_syns.groupby(
'synonym')['synonym_type'].transform(lambda x: '|'.join(filter(None, x)))
# - remove abbreviations; we aren't bothered by duplicates of this type
df_review_syns = df_review_syns[~df_review_syns['synonym_type'].str.contains('MONDO:ABBREVIATION')]

# Discover review cases: exactSynonym appears as label in another term
# - get mondo labels
    # todo: this (reading Mondo & filtering to only its terms (and possibly labels)) should be a utility func somewhere
oi = get_adapter(mondo_db_path)
ids_all: List[Union[CURIE, URI]] = [x for x in oi.entities(filter_obsoletes=False)]
ids_all = remove_angle_brackets(ids_all)
id_labels_all: List[tuple] = [x for x in oi.labels(ids_all)]
df_mondo_labs = pd.DataFrame(id_labels_all, columns=['mondo_id', 'mondo_label']).fillna('')
df_mondo_labs['prefix'] = df_mondo_labs['mondo_id'].str.split(':', expand=True)[0]
df_mondo_labs = df_mondo_labs[df_mondo_labs['prefix'] == 'MONDO']
del df_mondo_labs['prefix']
# - get relevant synonym sync cases to filter
df_sync = pd.concat([df_add, df_upd], ignore_index=True)
df_sync = df_sync[df_sync['synonym_scope'] == 'oio:hasExactSynonym']
# - filter to keep only rows where mondo_ids are different
df_review_labs = df_sync.merge(df_mondo_labs, left_on=['synonym'], right_on=['mondo_label'], how='inner')
df_review_labs = df_review_labs[df_review_labs['mondo_id_x'] != df_review_labs['mondo_id_y']].rename(columns={
'mondo_id_x': 'mondo_id', 'mondo_id_y': 'filtered_because_this_mondo_id_already_has_this_synonym_as_its_label'})
del df_review_labs['mondo_label'] # this is left over from the merge; not useful information
# - remove abbreviations; we aren't bothered by duplicates of this type
df_review_labs = df_review_labs.merge(df_mondo_syns[['mondo_id', 'synonym', 'synonym_scope', 'synonym_type_mondo']],
on=['mondo_id', 'synonym', 'synonym_scope'], how='left').fillna('')
df_review_labs['synonym_type'] = df_review_labs.apply(
lambda row: row['synonym_type'] if row['synonym_type'] else row['synonym_type_mondo'], axis=1)
del df_review_labs['synonym_type_mondo']
df_review_labs['synonym_type'] = df_review_labs.groupby(
'synonym')['synonym_type'].transform(lambda x: '|'.join(filter(None, x)))
df_review_labs = df_review_labs[~df_review_labs['synonym_type'].str.contains('MONDO:ABBREVIATION')]

# Generate outputs & save
df_review = pd.concat([df_review_syns, df_review_labs], ignore_index=True)[['synonym', 'mondo_id', 'source_id',
'case', 'synonym_type', 'filtered_because_this_mondo_id_already_has_this_synonym_as_its_label']]\
.sort_values(['synonym', 'mondo_id'])
df_review.to_csv(outpath, sep='\t', index=False)

df_filtered = df_all[~df_all.index.isin(df_review.index)]
_common_operations(df_filtered[df_filtered['case'] == 'added'], added_path, dont_make_scope_cols=True)
_common_operations(df_filtered[df_filtered['case'] == 'updated'], updated_path, dont_make_scope_cols=True)


def cli():
"""Command line interface."""
parser = ArgumentParser(
prog='sync-synonyms-curation-filtering',
description='Filter out cases where curation is needed')
parser.add_argument(
'-a', '--added-path', required=True,
help='Path to ROBOT template TSV containing synonyms that aren\'t yet integrated into Mondo.')
parser.add_argument(
'-c', '--confirmed-path', required=True,
help='Path to ROBOT template TSV containing synonym confirmations.')
parser.add_argument(
'-u', '--updated-path', required=True,
help='Path to ROBOT template TSV containing updates to synonym scope.')
parser.add_argument(
'-m', '--mondo-synonyms-path', required=True,
help='Path to a TSV containing information about Mondo synonyms. Columns: ?mondo_id, ?dbXref, ?synonym_scope, '
'?synonym, synonym_type.')
parser.add_argument(
'-M', '--mondo-db-path', required=True, help='Path to Mondo SemanticSQL DB.')
parser.add_argument(
'-o', '--outpath', required=True,
help='Path to curation file for review.')
sync_synonyms_curation_filtering(**vars(parser.parse_args()))


if __name__ == '__main__':
cli()
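The heart of the duplicate check in sync_synonyms_curation_filtering is the two-step groupby near the top of the function. On toy data (Mondo IDs and synonym strings made up), it behaves like this:

import pandas as pd

# Step 1: keep synonym+scope pairs that occur more than once.
# Step 2: of those, keep only pairs that span more than one mondo_id.
df_all = pd.DataFrame({
    'synonym': ['ABC', 'ABC', 'ABC', 'XYZ', 'XYZ'],
    'synonym_scope': ['oio:hasExactSynonym'] * 5,
    'mondo_id': ['MONDO:0000001', 'MONDO:0000002', 'MONDO:0000001',
                 'MONDO:0000003', 'MONDO:0000003'],
})
dupes = df_all[df_all.groupby(['synonym', 'synonym_scope']).transform('size') > 1]
review = dupes[dupes.groupby(['synonym', 'synonym_scope'])['mondo_id'].transform('nunique') > 1]
print(review)  # only the 'ABC' rows remain: one exact synonym attached to two different Mondo terms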
1 change: 1 addition & 0 deletions src/sparql/synonyms-scope-type-xref.sparql
@@ -1,4 +1,5 @@
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX oio: <http://www.geneontology.org/formats/oboInOwl#>
PREFIX oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>