diff --git a/scripts/download_files.py b/scripts/download_files.py new file mode 100644 index 00000000..d5b1d233 --- /dev/null +++ b/scripts/download_files.py @@ -0,0 +1,522 @@ +import base64 +import datetime +import logging +import os +import re +from typing import NamedTuple +import requests +from pathlib import Path +import boto3 +from botocore.exceptions import ClientError +from wags_tails.base_source import DataSource, UnversionedDataSource, RemoteDataError +from wags_tails.utils.storage import get_latest_local_file +from wags_tails.utils.downloads import HTTPS_REQUEST_TIMEOUT, download_http, handle_zip +from wags_tails.utils.versioning import DATE_VERSION_PATTERN, parse_file_version + +from tqdm import tqdm + +_logger = logging.getLogger(__name__) + + +def download_s3(uri: str, outfile_path: Path, tqdm_params: dict | None = None) -> None: + if not tqdm_params: + tqdm_params = {} + _logger.info("Downloading %s from %s...", outfile_path.name, uri) + + bucket, key = uri.removeprefix("s3://").split("/", 1) + + s3 = boto3.client("s3") + try: + response = s3.head_object(Bucket=bucket, Key=key) + except ClientError as e: + _logger.error("Encountered ClientError downloading %s: %s", uri, e.response) + raise e + + file_size = response["ContentLength"] + + with tqdm(total=file_size, **tqdm_params) as progress_bar: + s3.download_file( + Bucket=bucket, + Key=key, + Filename=outfile_path, + Callback=lambda bytes_amount: progress_bar.update(bytes_amount), + ) + + +class UnversionedS3Data(UnversionedDataSource): + _datatype = "claims" + _filetype = "tsv" # most of this data is TSV, can manually set otherwise + + def _download_data(self, version: str, outfile: Path) -> None: + download_s3( + f"s3://nch-igm-wagner-lab/dgidb/source_data/{self._src_name}/{self._src_name}_{self._datatype}.{self._filetype}", + outfile, + self._tqdm_params, + ) + + def get_latest( + self, from_local: bool = False, force_refresh: bool = False + ) -> tuple[Path, str]: + """Get path to data file + + 
:param from_local: if True, use latest available local file + :param force_refresh: if True, fetch and return data from remote regardless of + whether a local copy is present + :return: Path to location of data, and version value of it + :raise ValueError: if both ``force_refresh`` and ``from_local`` are True + """ + if force_refresh and from_local: + msg = "Cannot set both `force_refresh` and `from_local`" + raise ValueError(msg) + + filename = f"{self._src_name}_{self._datatype}.{self._filetype}" # one fixed name per source; no version component + if from_local: + return get_latest_local_file(self.data_dir, filename), "" # the version is always reported as "" + + file_path = self.data_dir / filename + if (not force_refresh) and file_path.exists(): + _logger.debug( + "Found existing file, %s (unversioned).", + file_path.name, + ) + return file_path, "" + self._download_data("", file_path) # the version argument is not used by the S3 download + return file_path, "" + + +class BaderLab(UnversionedS3Data): + _src_name = "bader_lab" + + +class CancerCommons(UnversionedS3Data): + _src_name = "cancer_commons" + + +class Caris(UnversionedS3Data): + _src_name = "caris_molecular_intelligence" + + +class Cgi(UnversionedS3Data): + _src_name = "cgi" + + +class ClearityFoundationBiomarkers(UnversionedS3Data): + _src_name = "clearity_foundation" + _datatype = "biomarkers_claims" + + +class ClearityFoundationClinicalTrial(UnversionedS3Data): + _src_name = "clearity_foundation" + _datatype = "clinical_trial_claims" + + +class Cosmic(UnversionedS3Data): + _src_name = "cosmic" + _filetype = "csv" + + +class Dgene(UnversionedS3Data): + _src_name = "dgene" + + +class DocmDrugClaims(UnversionedS3Data): + _src_name = "docm" + _datatype = "drug_claims" + _filetype = "csv" + + +class DocmGeneClaims(UnversionedS3Data): + _src_name = "docm" + _datatype = "gene_claims" + _filetype = "csv" + + +class DocmInteractionClaims(UnversionedS3Data): + _src_name = "docm" + _datatype = "interaction_claims" + _filetype = "csv" + + +class DocmInteractionClaimAttributes(UnversionedS3Data): + _src_name = "docm" + _datatype = 
"interaction_claim_attributes" + _filetype = "csv" + + +class DocmInteractionClaimPublications(UnversionedS3Data): + _src_name = "docm" + _datatype = "interaction_claim_publications" + _filetype = "csv" + + +class DrugbankProtected(DataSource): + _src_name = "drugbank" + _filetype = "xml" + + @staticmethod + def _get_latest_version() -> tuple[str, str]: + releases_url = "https://go.drugbank.com/releases/latest.json" + r = requests.get(releases_url, timeout=HTTPS_REQUEST_TIMEOUT) + r.raise_for_status() + try: + latest = r.json()[0] + url = latest["url"] + version = ( + re.match( + r"https:\/\/go.drugbank.com\/releases\/(.*)\/downloads\/all-full-database", + url, + ) + .groups()[0] + .replace("-", ".") + ) + return version, url + except (KeyError, IndexError) as e: + msg = "Unable to parse latest DrugBank version number from releases API endpoint" + raise RemoteDataError(msg) from e + + def _get_latest_local_file(self, glob: str) -> tuple[Path, str]: + """Get most recent locally-available file. DrugBank uses versioning that isn't + easily sortable by default so we have to use some extra magic. 
+ """ + _logger.debug("Getting local match against pattern %s...", glob) + file_version_pairs = [] + for file in self.data_dir.glob(glob): + version = parse_file_version(file, r"drugbank_([\d\.]+).csv") + formatted_version = [int(digits) for digits in version.split(".")] + file_version_pairs.append((file, version, formatted_version)) + files = sorted(file_version_pairs, key=lambda p: p[2]) + if not files: + msg = "No source data found for DrugBank" + raise FileNotFoundError(msg) + latest = files[-1] + _logger.debug("Returning %s as most recent locally-available file.", latest[0]) + return latest[0], latest[1] + + def _download_data(self, url: str, outfile: Path) -> None: + email = os.environ.get("DRUGBANK_EMAIL") + password = os.environ.get("DRUGBANK_PASSWORD") + if not (email and password): + msg = "Unable to download DrugBank dataset -- must provide email and password under env vars DRUGBANK_EMAIL and DRUGBANK_PASSWORD" + raise RemoteDataError(msg) + + encoded_credentials = base64.b64encode( + f"{email}:{password}".encode("utf-8") + ).decode("utf-8") + headers = {"Authorization": f"Basic {encoded_credentials}"} + download_http( + url, + outfile, + handler=handle_zip, + tqdm_params=self._tqdm_params, + headers=headers, + ) + + def get_latest( + self, from_local: bool = False, force_refresh: bool = False + ) -> tuple[Path, str]: + """Get path to latest version of data, and its version value + + :param from_local: if True, use latest available local file + :param force_refresh: if True, fetch and return data from remote regardless of + whether a local copy is present + :return: Path to location of data, and version value of it + :raise ValueError: if both ``force_refresh`` and ``from_local`` are True + """ + if force_refresh and from_local: + msg = "Cannot set both `force_refresh` and `from_local`" + raise ValueError(msg) + + if from_local: + file_path, version = self._get_latest_local_file("drugbank_*.xml") + return file_path, version + + latest_version, latest_url 
= self._get_latest_version() + latest_file = self.data_dir / f"drugbank_{latest_version}.xml" + if (not force_refresh) and latest_file.exists(): + _logger.debug( + "Found existing file, %s, matching latest version %s.", + latest_file.name, + latest_version, + ) + return latest_file, latest_version + self._download_data(latest_url, latest_file) # reached when forcing a refresh or when the latest version is not on disk + return latest_file, latest_version + + +class Dtc(UnversionedS3Data): + _src_name = "dtc" + _filetype = "csv" + + +class Fda(UnversionedS3Data): + _src_name = "fda" + + +class FoundationOneGenes(UnversionedS3Data): + _src_name = "foundation_one_genes" + + +class GtoPPaths(NamedTuple): + """Container for GuideToPharmacology file paths.""" + + interactions: Path + targets_and_families: Path + + +class GToPInteractionData(DataSource): + """Provide access to Guide to Pharmacology data.""" + + _src_name = "guidetopharmacology" + _filetype = "tsv" + + @staticmethod + def _get_latest_version() -> str: + r = requests.get( + "https://www.guidetopharmacology.org/", timeout=HTTPS_REQUEST_TIMEOUT + ) + r.raise_for_status() + r_text = r.text.split("\n") # scan the homepage HTML line by line + pattern = re.compile(r"Current Release Version (\d{4}\.\d) \(.*\)") # captures the "<4 digits>.<1 digit>" version token + for line in r_text: + if "Current Release Version" in line: + matches = re.findall(pattern, line.strip()) + if matches: + return matches[0] + else: # for/else: reached only when the loop ends without returning a version + msg = "Unable to parse latest Guide to Pharmacology version number homepage HTML." 
+ raise RemoteDataError(msg) + + def _download_data(self, file_paths: GtoPPaths) -> None: # both remote URLs are unversioned; the version appears only in the local file names + download_http( + "https://www.guidetopharmacology.org/DATA/interactions.tsv", + file_paths.interactions, + tqdm_params=self._tqdm_params, + ) + download_http( + "https://www.guidetopharmacology.org/DATA/targets_and_families.tsv", + file_paths.targets_and_families, + tqdm_params=self._tqdm_params, + ) + + def get_latest( + self, from_local: bool = False, force_refresh: bool = False + ) -> tuple[GtoPPaths, str]: + """Get path to latest version of data, and its version value + + :param from_local: if True, use latest available local file + :param force_refresh: if True, fetch and return data from remote regardless of + whether a local copy is present + :return: Paths to data, and version value of it + :raise ValueError: if both ``force_refresh`` and ``from_local`` are True + """ + if force_refresh and from_local: + msg = "Cannot set both `force_refresh` and `from_local`" + raise ValueError(msg) + + if from_local: # NOTE: the two files are looked up independently; their versions are not checked against each other + interactions_path = get_latest_local_file( + self.data_dir, "gtop_interactions_*.tsv" + ) + targets_and_families_path = get_latest_local_file( + self.data_dir, "gtop_targets_and_families_*.tsv" + ) + file_paths = GtoPPaths( + interactions=interactions_path, + targets_and_families=targets_and_families_path, + ) + return file_paths, parse_file_version( # the reported version is read from the interactions file name only + interactions_path, r"gtop_interactions_(\d{4}\.\d+).tsv" + ) + + latest_version = self._get_latest_version() + interactions_path = self.data_dir / f"gtop_interactions_{latest_version}.tsv" + targets_and_families_path = ( + self.data_dir / f"gtop_targets_and_families_{latest_version}.tsv" + ) + file_paths = GtoPPaths( + interactions=interactions_path, + targets_and_families=targets_and_families_path, + ) + if not force_refresh: # local copies are reused only when both files for this version exist + if interactions_path.exists() and targets_and_families_path.exists(): + _logger.debug( + "Found existing files, %s, matching latest version %s.", + file_paths, + latest_version, + ) + return 
file_paths, latest_version + if interactions_path.exists() or targets_and_families_path.exists(): + _logger.warning( + "Existing files, %s, not all available -- attempting full download.", + file_paths, + ) + self._download_data(file_paths) + return file_paths, latest_version + + +class HingoraniCasas(UnversionedS3Data): + _src_name = "hingorani_casas" + + +class HopkinsGroom(UnversionedS3Data): + _src_name = "hopkins_groom" + + +class HumanProteinAtlas(UnversionedS3Data): + _src_name = "human_protein_atlas" + + +class Idg(UnversionedS3Data): + _src_name = "idg" + + +class MskImpact(UnversionedS3Data): + _src_name = "msk_impact" + + +class MyCancerGenome(UnversionedS3Data): + _src_name = "my_cancer_genome" + + +class MyCancerGenomeClinicalTrial(UnversionedS3Data): + _src_name = "my_cancer_genome" + _datatype = "clinical_trial_claims" + + +class Nci(UnversionedS3Data): + _src_name = "nci" + + +class PharmGkbRelations(DataSource): + _src_name = "pharmgkb" + _filetype = "tsv" + + @staticmethod + def _get_latest_version() -> str: + return ( + datetime.datetime.now() + .replace(tzinfo=datetime.UTC) + .strftime(DATE_VERSION_PATTERN) + ) + + def _download_data(self, version: str, outfile: Path) -> None: + download_http( + "https://api.pharmgkb.org/v1/download/file/data/relationships.zip", + outfile, + handler=handle_zip, + tqdm_params=self._tqdm_params, + ) + + +class OncoKbDrugClaims(UnversionedS3Data): + _src_name = "oncokb" + _datatype = "drug_claims" + _filetype = "csv" + + +class OncoKbGeneClaims(UnversionedS3Data): + _src_name = "oncokb" + _datatype = "gene_claims" + _filetype = "csv" + + +class OncoKbGeneClaimAliases(UnversionedS3Data): + _src_name = "oncokb" + _datatype = "gene_claim_aliases" + _filetype = "csv" + + +class OncoKbInteractionClaims(UnversionedS3Data): + _src_name = "oncokb" + _datatype = "interaction_claims" + _filetype = "csv" + + +class OncoKbInteractionClaimLinks(UnversionedS3Data): + _src_name = "oncokb" + _datatype = "interaction_claim_links" + 
_filetype = "csv" + + +class OncoKbInteractionClaimAttributes(UnversionedS3Data): + _src_name = "oncokb" + _datatype = "interaction_claim_attributes" + _filetype = "csv" + + +class Oncomine(UnversionedS3Data): + _src_name = "oncomine" + + +class RussLampel(UnversionedS3Data): + _src_name = "russ_lampel" + + +class Talc(UnversionedS3Data): + _src_name = "talc" + + +class Tdg(UnversionedS3Data): + _src_name = "tdg_clinical_trial" + + +class Tempus(UnversionedS3Data): + _src_name = "tempus" + + +class Tend(UnversionedS3Data): + _src_name = "tend" + + +class Ttd(UnversionedS3Data): + _src_name = "ttd" + _filetype = "csv" + + +for SourceClass in [ + BaderLab, + CancerCommons, + Caris, + Cgi, + ClearityFoundationBiomarkers, + ClearityFoundationClinicalTrial, + Cosmic, + Dgene, + DocmDrugClaims, + DocmGeneClaims, + DocmInteractionClaims, + DocmInteractionClaimAttributes, + DocmInteractionClaimPublications, + DrugbankProtected, + Dtc, + Fda, + FoundationOneGenes, + GToPInteractionData, + HingoraniCasas, + HopkinsGroom, + HumanProteinAtlas, + Idg, + MskImpact, + MyCancerGenome, + MyCancerGenomeClinicalTrial, + Nci, + OncoKbDrugClaims, + OncoKbGeneClaims, + OncoKbGeneClaimAliases, + OncoKbInteractionClaims, + OncoKbInteractionClaimLinks, + OncoKbInteractionClaimAttributes, + Oncomine, + PharmGkbRelations, + RussLampel, + Talc, + Tdg, + Tempus, + Tend, + Ttd, +]: + try: + SourceClass(silent=False).get_latest() + except Exception as e: + print(SourceClass) + print(e) diff --git a/server/lib/genome/importers/base.rb b/server/lib/genome/importers/base.rb index 423378f0..ca6dca66 100644 --- a/server/lib/genome/importers/base.rb +++ b/server/lib/genome/importers/base.rb @@ -40,11 +40,15 @@ def default_filename 'claims' end + def default_data_dir + "#{Dir.home}/.local/share/wags_tails" + end + def handle_file_location(file_path) return file_path unless file_path.nil? 
- dir_name = self.class.name.split('::')[-2].underscore - "lib/data/#{dir_name}/#{default_filename}.#{default_filetype}" + src_name = self.class.name.split('::')[-2].underscore # second-to-last namespace segment of the importer class, snake_cased + "#{default_data_dir}/#{src_name}/#{src_name}_#{default_filename}.#{default_filetype}" end def remove_existing_source diff --git a/server/lib/genome/importers/file_importers/chembl.rb b/server/lib/genome/importers/file_importers/chembl.rb index 0d627562..708f9ca3 100644 --- a/server/lib/genome/importers/file_importers/chembl.rb +++ b/server/lib/genome/importers/file_importers/chembl.rb @@ -10,12 +10,11 @@ def initialize(file_path) @source_db_name = "ChEMBL" end - def default_filetype - 'db' - end + def handle_file_location(file_path) + return file_path unless file_path.nil? - def default_filename - 'chembl' + directory = "#{default_data_dir}/chembl/" # the highest numeric chembl_<N>.db suffix wins; names that do not match count as 0; nil when the glob is empty + Dir.glob(File.join(directory, 'chembl_*.db')).max_by { |file| file.match(/chembl_(\d+)\.db/)[1].to_i rescue 0 } end def create_claims diff --git a/server/lib/genome/importers/file_importers/clearity_foundation_biomarkers.rb b/server/lib/genome/importers/file_importers/clearity_foundation_biomarkers.rb index 7e5c5874..09473534 100644 --- a/server/lib/genome/importers/file_importers/clearity_foundation_biomarkers.rb +++ b/server/lib/genome/importers/file_importers/clearity_foundation_biomarkers.rb @@ -31,6 +31,12 @@ def create_new_source @source.save end + def handle_file_location(file_path) + return file_path unless file_path.nil? 
+ + "#{default_data_dir}/clearity_foundation/clearity_foundation_biomarkers_claims.tsv" # shared wags_tails root from the base importer instead of a second hard-coded copy + end + def create_interaction_claims CSV.foreach(file_path, headers: true, col_sep: "\t") do |row| gene_claim = create_gene_claim(row['gene_name'].upcase) diff --git a/server/lib/genome/importers/file_importers/clearity_foundation_clinical_trial.rb b/server/lib/genome/importers/file_importers/clearity_foundation_clinical_trial.rb index dc474891..19e8f5b5 100644 --- a/server/lib/genome/importers/file_importers/clearity_foundation_clinical_trial.rb +++ b/server/lib/genome/importers/file_importers/clearity_foundation_clinical_trial.rb @@ -31,6 +31,12 @@ def create_new_source @source.save end + def handle_file_location(file_path) + return file_path unless file_path.nil? + + "#{default_data_dir}/clearity_foundation/clearity_foundation_clinical_trial_claims.tsv" # shared wags_tails root from the base importer instead of a second hard-coded copy + end + def create_interaction_claims CSV.foreach(file_path, headers: true, col_sep: "\t") do |row| next if row['Entrez Gene Name'] == 'N/A' || row['Pubchem name'] == 'N/A' diff --git a/server/lib/genome/importers/file_importers/docm.rb b/server/lib/genome/importers/file_importers/docm.rb index 695d1912..042aea08 100644 --- a/server/lib/genome/importers/file_importers/docm.rb +++ b/server/lib/genome/importers/file_importers/docm.rb @@ -7,7 +7,7 @@ class Importer < Genome::Importers::Base def initialize(tsv_root_path) @tsv_root = if tsv_root_path.nil? 
- 'lib/data/docm/' + "#{default_data_dir}/docm/" else tsv_root_path end @@ -49,21 +49,21 @@ def create_new_source def create_drug_claims - CSV.foreach("#{@tsv_root}drug_claim.csv", headers: true, col_sep: ',') do |row| + CSV.foreach("#{@tsv_root}docm_drug_claims.csv", headers: true, col_sep: ',') do |row| dc = create_drug_claim(row[0]) @drug_claims[row[0]] = dc end end def create_gene_claims - CSV.foreach("#{@tsv_root}gene_claim.csv", headers: true, col_sep: ',') do |row| + CSV.foreach("#{@tsv_root}docm_gene_claims.csv", headers: true, col_sep: ',') do |row| gc = create_gene_claim(row[0], GeneNomenclature::NCBI_NAME) @gene_claims[row[0]] = gc end end def create_interaction_claims - CSV.foreach("#{@tsv_root}interaction_claim.csv", headers: true, col_sep: ',') do |row| + CSV.foreach("#{@tsv_root}docm_interaction_claims.csv", headers: true, col_sep: ',') do |row| gc = @gene_claims[row[1]] dc = @drug_claims[row[0]] next if gc.nil? || dc.nil? @@ -71,7 +71,7 @@ def create_interaction_claims ic = create_interaction_claim(gc, dc) @interaction_claims[[gc, dc]] = ic end - CSV.foreach("#{@tsv_root}interaction_claim_attributes.csv", headers: true, col_sep: ',') do |row| + CSV.foreach("#{@tsv_root}docm_interaction_claim_attributes.csv", headers: true, col_sep: ',') do |row| gc = @gene_claims[row[3]] dc = @drug_claims[row[2]] next if gc.nil? || dc.nil? @@ -79,7 +79,7 @@ def create_interaction_claims ic = @interaction_claims[[gc, dc]] create_interaction_claim_attribute(ic, row[0], row[1]) end - CSV.foreach("#{@tsv_root}interaction_claim_publications.csv", headers: true, col_sep: ',') do |row| + CSV.foreach("#{@tsv_root}docm_interaction_claim_publications.csv", headers: true, col_sep: ',') do |row| gc = @gene_claims[row[3]] dc = @drug_claims[row[2]] next if gc.nil? || dc.nil? 
diff --git a/server/lib/genome/importers/file_importers/drugbank.rb b/server/lib/genome/importers/file_importers/drugbank.rb index a200c0c3..7be9e9b1 100644 --- a/server/lib/genome/importers/file_importers/drugbank.rb +++ b/server/lib/genome/importers/file_importers/drugbank.rb @@ -11,8 +11,15 @@ def initialize(file_path) @source_db_name = 'DrugBank' end - def default_filetype - 'xml' + def handle_file_location(file_path) + return file_path unless file_path.nil? + + directory = "#{default_data_dir}/drugbank/" # choose the file whose three-part version is highest; nil when the glob is empty + Dir.glob(File.join(directory, 'drugbank_*.xml')) + .max_by do |file| + match = file.match(/drugbank_(\d+)\.(\d+)\.(\d+)\.xml/) # three dot-separated numeric components + match ? [match[1].to_i, match[2].to_i, match[3].to_i] : [0, 0, 0] + end end def run_parser diff --git a/server/lib/genome/importers/file_importers/guide_to_pharmacology.rb b/server/lib/genome/importers/file_importers/guide_to_pharmacology.rb index e6d159e1..814828bb 100644 --- a/server/lib/genome/importers/file_importers/guide_to_pharmacology.rb +++ b/server/lib/genome/importers/file_importers/guide_to_pharmacology.rb @@ -1,14 +1,14 @@ include ActionView::Helpers::SanitizeHelper module Genome; module Importers; module FileImporters; module GuideToPharmacology; - # gene_file_path should point to `targets_and_families.csv` - # interaction_file_path should point to `interactions.csv` + # gene_file_path should point to `gtop_targets_and_families_*.tsv` + # interaction_file_path should point to `gtop_interactions_*.tsv` class Importer < Genome::Importers::Base attr_reader :interaction_file_path, :gene_file_path, :target_to_entrez def initialize(interaction_file_path, gene_file_path) - @interaction_file_path = interaction_file_path - @gene_file_path = gene_file_path + @interaction_file_path = handle_gtop_file_location(interaction_file_path, "interactions") # a nil argument is resolved under the wags_tails data directory + @gene_file_path = handle_gtop_file_location(gene_file_path, "targets_and_families") @target_to_entrez = {} @source_db_name = 'GuideToPharmacology' end @@ -20,6 +20,16 @@ def 
create_claims private + def handle_gtop_file_location(file_path, datatype) + return file_path unless file_path.nil? + + raise "Unrecognized GtoP datatype: #{datatype}" unless %w[targets_and_families interactions].include? datatype + + directory = "#{default_data_dir}/guidetopharmacology/" + Dir.glob(File.join(directory, "gtop_#{datatype}_*.tsv")) + .max_by { |file| file.match(/gtop_#{datatype}_(\d+)\.db/)[1].to_i rescue 0 } + end + def get_version version = '' File.open(@interaction_file_path, 'r') do |file| @@ -57,7 +67,7 @@ def create_new_source def import_gene_claims refseq_id_pattern = /^((AC|AP|NC|NG|NM|NP|NR|NT|NW|XM|XP|XR|YP|ZP)_\d+|(NZ\_[A-Z]{4}\d+))(\.\d+)?$/ - CSV.foreach(gene_file_path, headers: true, skip_lines: /GtoPdb Version/) do |line| + CSV.foreach(gene_file_path, headers: true, skip_lines: /GtoPdb Version/, col_sep: "\t") do |line| gene_lui = line['Human Entrez Gene'] next if blank?(gene_lui) || gene_lui.include?('|') @@ -111,7 +121,7 @@ def import_gene_claims end def import_interaction_claims - CSV.foreach(interaction_file_path, headers: true, skip_lines: /GtoPdb Version/) do |line| + CSV.foreach(interaction_file_path, headers: true, skip_lines: /GtoPdb Version/, col_sep: "\t") do |line| next unless valid_interaction_line?(line) gene_claim = create_gene_claim("NCBIGENE:#{line['Target ID']}", GeneNomenclature::NCBI_ID) diff --git a/server/lib/genome/importers/file_importers/my_cancer_genome_clinical_trial.rb b/server/lib/genome/importers/file_importers/my_cancer_genome_clinical_trial.rb index 3734ceeb..9b6eed54 100644 --- a/server/lib/genome/importers/file_importers/my_cancer_genome_clinical_trial.rb +++ b/server/lib/genome/importers/file_importers/my_cancer_genome_clinical_trial.rb @@ -12,6 +12,17 @@ def create_claims end private + def default_filename + 'clinical_trial_claims' + end + + def handle_file_location(file_path) + return file_path unless file_path.nil? 
+ + src_name = "my_cancer_genome" + "#{default_data_dir}/#{src_name}/#{src_name}_#{default_filename}.#{default_filetype}" + end + def create_new_source @source ||= Source.create( { diff --git a/server/lib/genome/importers/file_importers/oncokb.rb b/server/lib/genome/importers/file_importers/oncokb.rb index 7e9a7491..864d086e 100644 --- a/server/lib/genome/importers/file_importers/oncokb.rb +++ b/server/lib/genome/importers/file_importers/oncokb.rb @@ -1,11 +1,11 @@ module Genome; module Importers; module FileImporters; module Oncokb; class Importer < Genome::Importers::Base def initialize(tsv_root_path) - if tsv_root_path.nil? - @tsv_root = 'lib/data/oncokb/' - else - @tsv_root = tsv_root_path - end + @tsv_root = if tsv_root_path.nil? + "#{default_data_dir}/oncokb/" + else + tsv_root_path + end @source_db_name = 'OncoKB' @drug_claims = {} @gene_claims = {} diff --git a/server/lib/genome/importers/file_importers/pharmgkb.rb b/server/lib/genome/importers/file_importers/pharmgkb.rb index 88dbfdec..8d1930e8 100644 --- a/server/lib/genome/importers/file_importers/pharmgkb.rb +++ b/server/lib/genome/importers/file_importers/pharmgkb.rb @@ -13,6 +13,20 @@ def create_claims private + def handle_file_location(file_path) + return file_path unless file_path.nil? + + directory = "#{default_data_dir}/pharmgkb/" + Dir.glob(File.join(directory, 'pharmgkb_*.tsv')).max_by { |file| file.match(/chembl_(\d+)\.db/)[1].to_i rescue 0 } + end + + + def get_version + match = @file_path.match(/(\d{8})/) + match ? 
match[1] : nil # nil when the path contains no run of 8 digits + end + + + def create_new_source @source ||= Source.create( { @@ -23,7 +37,7 @@ def create_new_source pmid: '34216021', pmcid: 'PMC8457105', doi: '10.1002/cpt.2350', - source_db_version: '2024-04-05', # using static file, see issue #420 + source_db_version: get_version, source_db_name: source_db_name, full_name: 'PharmGKB - The Pharmacogenomics Knowledgebase', license: License::CC_BY_SA_4_0, diff --git a/server/lib/tasks/importers.rake b/server/lib/tasks/importers.rake index 4f2bc4a6..44243c3f 100644 --- a/server/lib/tasks/importers.rake +++ b/server/lib/tasks/importers.rake @@ -72,8 +72,8 @@ namespace :dgidb do # Guide to Pharmacology is a special case because it needs two input files def run_gtop_import(args) args.with_defaults( - interaction_file_path: 'lib/data/guide_to_pharmacology/interactions.csv', - gene_file_path: 'lib/data/guide_to_pharmacology/targets_and_families.csv', + interaction_file_path: nil, # nil: presumably the importer then locates the file itself; confirm against the rest of this task + gene_file_path: nil, gene_group: false, drug_group: false )