From a587f18e613029070d6c1c5ad4320bb13bda7e4f Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Thu, 12 Sep 2024 15:07:42 -0700 Subject: [PATCH 1/5] Update RKI_INDEX_COL since SEQUENCE.ID renamed to igs_id --- bin/transform-rki-data-to-ndjson | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/transform-rki-data-to-ndjson b/bin/transform-rki-data-to-ndjson index e860a126..7d943686 100755 --- a/bin/transform-rki-data-to-ndjson +++ b/bin/transform-rki-data-to-ndjson @@ -5,7 +5,7 @@ Turn RKI files into ndjson format import typer -RKI_INDEX_COL = "SEQUENCE.ID" +RKI_INDEX_COL = "igs_id" def main( input_rki_sequences: str = typer.Option(..., help="Input file"), From f86c4faa9175932c2fe1bac84bd5d2bca60598f8 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Thu, 12 Sep 2024 15:51:14 -0700 Subject: [PATCH 2/5] Update COLUMN_MAP for RKI data Based on changes at: https://github.com/robert-koch-institut/SARS-CoV-2-Sequenzdaten_aus_Deutschland/commit/255ebfe6574d8dad8a1b1762c05c1af5a20ea794?diff=split&w=0#diff-1550ec65ac92f65817fc28928dfef526912b5f52356ff43651369bae92f56031L106 --- bin/transform-rki | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/bin/transform-rki b/bin/transform-rki index c91ab370..3e632dd9 100755 --- a/bin/transform-rki +++ b/bin/transform-rki @@ -29,12 +29,12 @@ from lib.utils.transformpipeline.transforms import (AddHardcodedMetadataRki, UserProvidedAnnotations) COLUMN_MAP = { - "SEQUENCE.DATE_OF_SAMPLING": "date", - "SEQUENCE.PUSHED_TO_DWH": "date_submitted", - "DL.ID": "originating_lab", - "SL.ID": "submitting_lab", - "PANGOLIN.LINEAGE_LATEST": "pango_lineage", - "SEQUENCE.SEQUENCING_REASON": "sampling_strategy", + "date_of_sampling": "date", + "date_of_submission": "date_submitted", + "prime_diagnostic_lab.demis_lab_id": "originating_lab", + "sequencing_lab.demis_lab_id": "submitting_lab", + "lineages": "pango_lineage", + "sequencing_reason": "sampling_strategy", } From 2df0af9a46539d005f81d7695db5fc8d48400d6e Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Thu, 12 Sep 2024 15:55:01 -0700 Subject: [PATCH 3/5] Add new RKI date format %Y-%m-%dT%H:%M:%S to expected date_formats --- lib/utils/transformpipeline/transforms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/utils/transformpipeline/transforms.py b/lib/utils/transformpipeline/transforms.py index 681e7ca0..4faf5228 100644 --- a/lib/utils/transformpipeline/transforms.py +++ b/lib/utils/transformpipeline/transforms.py @@ -299,7 +299,7 @@ def transform_value(self, entry: dict) -> dict: # Standardize date format to ISO 8601 date date_columns = {'date', 'date_submitted'} - date_formats = {'%Y-%m-%d', '%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%d %H:%M:%S %z'} + date_formats = {'%Y-%m-%d', '%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%d %H:%M:%S %z', '%Y-%m-%dT%H:%M:%S'} for column in date_columns: entry[column] = format_date(entry[column], date_formats) From 7767c76a38b14b6f665a680ab2e42a2683463976 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Thu, 12 Sep 2024 16:51:24 -0700 Subject: [PATCH 4/5] Parse new RKI JSON blob for pango_lineage This is currently taking the first lineage value from the JSON blob. We may need to revisit this to loop and extract the latest lineage value. --- lib/utils/transformpipeline/transforms.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/utils/transformpipeline/transforms.py b/lib/utils/transformpipeline/transforms.py index 4faf5228..4b2a820f 100644 --- a/lib/utils/transformpipeline/transforms.py +++ b/lib/utils/transformpipeline/transforms.py @@ -1,6 +1,7 @@ import csv import re import unicodedata +import json from collections import defaultdict from typing import Any, Collection, List, MutableMapping, Sequence, Tuple , Dict , Union import pandas as pd @@ -287,6 +288,8 @@ def __init__(self): def transform_value(self, entry: dict) -> dict: entry['sequence'] = entry['sequence'].replace('\n', '') entry['length'] = len(entry['sequence']) + lineage_dict = json.loads(entry['pango_lineage']) + entry['pango_lineage'] = lineage_dict[0]['lineage'] # Normalize all string data to Unicode Normalization Form C, for # consistent, predictable string comparisons. From e57f3c0ff53ba134e54b7dca0d9b759e1c653658 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Fri, 13 Sep 2024 11:36:22 -0700 Subject: [PATCH 5/5] fixup: Parse new RKI JSON blob for pango_lineage --- lib/utils/transformpipeline/transforms.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/lib/utils/transformpipeline/transforms.py b/lib/utils/transformpipeline/transforms.py index 4b2a820f..2e2024d0 100644 --- a/lib/utils/transformpipeline/transforms.py +++ b/lib/utils/transformpipeline/transforms.py @@ -288,8 +288,13 @@ def __init__(self): def transform_value(self, entry: dict) -> dict: entry['sequence'] = entry['sequence'].replace('\n', '') entry['length'] = len(entry['sequence']) - lineage_dict = json.loads(entry['pango_lineage']) - entry['pango_lineage'] = lineage_dict[0]['lineage'] + + # Pull out latest pango lineage from json blob + # Currently this pulls the first entry, but we've added an assert statement to see if there are ever more than one entry + # At that time, we can loop over the json blob to find the latest pango lineage assignment + lineage_json_blob = json.loads(entry['pango_lineage']) + entry['pango_lineage'] = lineage_json_blob[0]['lineage'] + assert len(lineage_json_blob)==1, f"RKI pango_lineage unexpectedly had more than one entry. rki_accession: {entry['rki_accession']}" # Normalize all string data to Unicode Normalization Form C, for # consistent, predictable string comparisons.