From b2936f8097376a227a19a4099842129c468ba38c Mon Sep 17 00:00:00 2001
From: Jennifer Chang
Date: Tue, 10 Dec 2024 16:16:58 -0800
Subject: [PATCH] Ingest: Derive URL column during ingest

---
 ingest/defaults/config.yaml |  3 +++
 ingest/rules/curate.smk     | 24 +++++++++++++++++++++++-
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml
index 83f992d..1c7cf66 100644
--- a/ingest/defaults/config.yaml
+++ b/ingest/defaults/config.yaml
@@ -102,6 +102,8 @@ curate:
   output_id_field: "accession"
   # The field in the NDJSON record that contains the actual genomic sequence
   output_sequence_field: "sequence"
+  # The metadata TSV column that contains the GenBank accession; used to derive the "url" column
+  genbank_accession: "accession"
   # The list of metadata columns to keep in the final output of the curation pipeline.
   metadata_columns: [
     "accession",
@@ -121,4 +123,5 @@ curate:
     "authors",
     "full_authors",
     "institution",
+    "url",
   ]
diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk
index 4b6ce7c..57d820b 100644
--- a/ingest/rules/curate.smk
+++ b/ingest/rules/curate.smk
@@ -116,10 +116,32 @@ rule curate:
                 --output-seq-field {params.sequence_field} ) 2>> {log}
         """
 
+rule add_metadata_columns:
+    """Add derived columns to the metadata TSV.
+
+    Notable columns:
+    - url: URL linking to the NCBI GenBank record ('https://www.ncbi.nlm.nih.gov/nuccore/*').
+    """
+    input:
+        metadata="data/all_metadata.tsv",
+    output:
+        metadata=temp("data/all_metadata_added.tsv"),
+    params:
+        accession=config["curate"]["genbank_accession"],
+    # Snakemake renders '${params.accession}' as '$<column name>', which is
+    # how a csvtk mutate2 expression refers to a column by name.
+    shell:
+        """
+        csvtk mutate2 -t \
+          -n url \
+          -e '"https://www.ncbi.nlm.nih.gov/nuccore/" + ${params.accession}' \
+          {input.metadata} \
+        > {output.metadata}
+        """
 
 rule subset_metadata:
     input:
-        metadata="data/all_metadata.tsv",
+        metadata="data/all_metadata_added.tsv",
     output:
         subset_metadata="data/subset_metadata.tsv",
     params: