Skip to content

Commit

Permalink
feat(deposition): Automate metadata revision of ENA submissions (#3673)
Browse files Browse the repository at this point in the history
* refactor: add explicit manifest_fields_mapping to defaults.yaml

* fix manifest value mapping: map consensusSequenceSoftware to PROGRAM and sequencingInstrument to PLATFORM.

* add revise option

* revise edge case: error if assembly cannot be manually revised (e.g. if manifest modified) 

* revise edge case: make sure biosample and bioassembly cannot be changed

* revise edge case: make sure older version cannot be uploaded

* add docs
  • Loading branch information
anna-parker authored Feb 24, 2025
1 parent 57936bc commit 345218d
Show file tree
Hide file tree
Showing 19 changed files with 595 additions and 239 deletions.
1 change: 1 addition & 0 deletions .github/workflows/ena-submission-workflow-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ name: ena-submission-workflow-tests
on:
pull_request:
paths:
- "ena-submission/scripts/test_ena_submission_integration.py"
- ".github/workflows/ena-submission-workflow-tests.yml"
push:
branches:
Expand Down
11 changes: 11 additions & 0 deletions ena-submission/ENA_submission.md
Original file line number Diff line number Diff line change
Expand Up @@ -310,6 +310,17 @@ When processing is finished the response should look like:
]
```

# Revising Submissions to ENA

## 1. [Revising Studies (Projects) and Samples](https://ena-docs.readthedocs.io/en/latest/update/metadata/programmatic-study.html)
Revisions to a study or sample should be submitted the same way as the original submission, except that the `ADD` action in the submission request must be changed to `MODIFY`. In addition, either the alias must BE THE SAME as in the previous version, or the assigned accession number must be included, so that the correct sample/study is updated.

## 2. [Revising Assemblies](https://ena-docs.readthedocs.io/en/latest/update/assembly.html)
It appears that all fields that were explicitly set via the manifest can only be updated by sending an email to ENA. This includes changes to any field listed under `manifest_fields_mapping` in `defaults.yaml`. Additionally, the study and sample references must stay the same, and chromosome names cannot be changed (but new ones can be added).
However, unlike the alias, a NEW `ASSEMBLYNAME` is required (it cannot be the same as the `ASSEMBLYNAME` of the previous version).

Currently we automate revision of studies and assemblies. If a manifest update is required, the pipeline will not update the assembly but will instead set the state of the assembly submission to `HAS_ERRORS` and document the reason for the error in the database. We will then receive a Slack notification and will have to manually send an email to ENA to update the manifest.

## Promises made to ENA

- "I confirm that the data submitted through this account is NOT sensitive, restricted-access or human-identifiable." -> We will want to mirror this into Pathoplexus submissions, at least the sensitive and human-identifiable parts.
Expand Down
16 changes: 16 additions & 0 deletions ena-submission/config/defaults.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,22 @@ min_between_github_requests: 2
min_between_ena_checks: 5
log_level: DEBUG
#ena_checklist: ERC000033 - do not use until all fields are mapped to ENA accepted options
manifest_fields_mapping:
authors:
loculus_fields: [authors]
function: reformat_authors
platform:
loculus_fields: [sequencingInstrument]
default: "Unknown"
program:
loculus_fields: [consensusSequenceSoftwareName, consensusSequenceSoftwareVersion]
default: "Unknown"
coverage:
loculus_fields: [depthOfCoverage]
type: int
default: 1
run_ref:
loculus_fields: [insdcRawReadsAccession]
metadata_mapping:
'subject exposure':
loculus_fields: [exposureEvent]
Expand Down
11 changes: 9 additions & 2 deletions ena-submission/scripts/deposition_dry_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,12 @@
type=click.Choice(["project", "sample", "assembly"]),
)
@click.option("--center-name", required=False, type=str, default="CENTER_NAME")
@click.option(
"--revision",
required=False,
type=bool,
default=False,
)
@click.option(
"--log-level",
default="INFO",
Expand All @@ -52,6 +58,7 @@ def local_ena_submission_generator(
data_to_submit,
center_name,
mode,
revision,
log_level,
):
"""
Expand Down Expand Up @@ -107,7 +114,7 @@ def local_ena_submission_generator(
if mode == "sample":
entry["center_name"] = center_name
sample_set = construct_sample_set_object(config, entry, entry)
sample_xml = get_sample_xml(sample_set)
sample_xml = get_sample_xml(sample_set, revision=revision)

directory = "sample"
os.makedirs(directory, exist_ok=True)
Expand Down Expand Up @@ -139,7 +146,7 @@ def local_ena_submission_generator(
logger.info(f"Writing results to {directory}")

manifest_object = create_manifest_object(
config, dummy_sample_dict, dummy_project_dict, entry, entry, dir=directory
config, dummy_sample_dict, dummy_project_dict, entry, dir=directory
)
create_manifest(manifest_object, is_broker=config.is_broker, dir=directory)
logger.info(
Expand Down
58 changes: 34 additions & 24 deletions ena-submission/scripts/test_ena_submission.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
dataclass_to_xml,
get_chromsome_accessions,
get_ena_analysis_process,
get_sample_xml,
reformat_authors_from_loculus_to_embl_style,
)
from ena_deposition.ena_types import default_project_type, default_sample_type
Expand Down Expand Up @@ -51,10 +52,12 @@ def mock_config():
}
config.organisms = {"Test organism": {"enaDeposition": metadata_dict}}
config.metadata_mapping = defaults["metadata_mapping"]
config.manifest_fields_mapping = defaults["manifest_fields_mapping"]
config.metadata_mapping_mandatory_field_defaults = defaults[
"metadata_mapping_mandatory_field_defaults"
]
config.ena_checklist = "ERC000033"
config.set_alias_suffix = None
return config


Expand All @@ -68,6 +71,7 @@ def mock_config():

test_sample_xml_request = Path("test/test_sample_request.xml").read_text(encoding="utf-8")
test_sample_xml_response = Path("test/test_sample_response.xml").read_text(encoding="utf-8")
revision_submission_xml_request = Path("test/test_revision_submission_request.xml").read_text(encoding="utf-8")
process_response_text = Path("test/get_ena_analysis_process_response.json").read_text(
encoding="utf-8"
)
Expand All @@ -78,9 +82,9 @@ def mock_config():
open("test/approved_ena_submission_list_test.json", encoding="utf-8")
)
sample_data_in_submission_table = {
"accession": "test_accession",
"version": "test_version",
"group_id": 1,
"accession": "LOC_0001TLY",
"version": "1",
"group_id": 2,
"organism": "Test organism",
"metadata": loculus_sample["LOC_0001TLY.1"]["metadata"],
"unaligned_nucleotide_sequences": {
Expand All @@ -90,10 +94,10 @@ def mock_config():
},
"center_name": "Fake center name",
}
project_table_entry = {"group_id": "1", "organism": "Test organism"}
project_table_entry = {"group_id": "2", "organism": "Test organism"}
sample_table_entry = {
"accession": "test_accession",
"version": "test_version",
"accession": "LOC_0001TLY",
"version": "1",
}


Expand Down Expand Up @@ -178,6 +182,20 @@ def test_sample_set_construction(self):
xmltodict.parse(test_sample_xml_request),
)

def test_sample_revision(self):
    """Compare the SUBMISSION entry produced for a sample revision with the stored fixture.

    Builds a sample set from the module-level test entries, requests the
    XML files with ``revision=True`` and checks that the parsed
    ``SUBMISSION`` document equals the parsed revision request fixture.
    """
    sample_set = construct_sample_set_object(
        mock_config(), sample_data_in_submission_table, sample_table_entry
    )
    submission_xml = get_sample_xml(sample_set, revision=True)["SUBMISSION"]
    # Compare parsed structures rather than raw XML strings.
    actual = xmltodict.parse(submission_xml)
    expected = xmltodict.parse(revision_submission_xml_request)
    self.assertEqual(actual, expected)


class AssemblyCreationTests(unittest.TestCase):
def setUp(self):
Expand All @@ -187,7 +205,7 @@ def setUp(self):
self.unaligned_sequences = {
"main": "CTTAACTTTGAGAGAGTGAATT",
}
self.seq_key = {"accession": "test_accession", "version": "test_version"}
self.seq_key = {"accession": "LOC_0001TLY", "version": "1"}

def test_format_authors(self):
authors = "Xi,L.;Smith, Anna Maria; Perez Gonzalez, Anthony J.;Doe,;von Doe, John"
Expand All @@ -206,7 +224,7 @@ def test_create_chromosome_list_multi_segment(self):

self.assertEqual(
content,
b"test_accession_seg2\tseg2\tcircular-segmented\ntest_accession_seg3\tseg3\tcircular-segmented\n",
b"LOC_0001TLY_seg2\tseg2\tcircular-segmented\nLOC_0001TLY_seg3\tseg3\tcircular-segmented\n",
)

def test_create_chromosome_list(self):
Expand All @@ -218,7 +236,7 @@ def test_create_chromosome_list(self):

self.assertEqual(
content,
b"test_accession\tgenome\tlinear-monopartite\n",
b"LOC_0001TLY\tgenome\tlinear-monopartite\n",
)

def test_create_fasta_multi(self):
Expand All @@ -231,7 +249,7 @@ def test_create_fasta_multi(self):
content = gz.read()
self.assertEqual(
content,
b">test_accession_seg2\nGCGGCACGTCAGTACGTAAGTGTATCTCAAAGAAATACTTAACTTTGAGAGAGTGAATT\n>test_accession_seg3\nCTTAACTTTGAGAGAGTGAATT\n",
b">LOC_0001TLY_seg2\nGCGGCACGTCAGTACGTAAGTGTATCTCAAAGAAATACTTAACTTTGAGAGAGTGAATT\n>LOC_0001TLY_seg3\nCTTAACTTTGAGAGAGTGAATT\n",
)

def test_create_fasta(self):
Expand All @@ -242,26 +260,18 @@ def test_create_fasta(self):
content = gz.read()
self.assertEqual(
content,
b">test_accession\nCTTAACTTTGAGAGAGTGAATT\n",
b">LOC_0001TLY\nCTTAACTTTGAGAGAGTGAATT\n",
)

def test_create_manifest(self):
config = mock_config()
study_accession = "Test Study Accession"
sample_accession = "Test Sample Accession"
results_in_sample_table = {"result": {"ena_sample_accession": sample_accession}}
results_in_project_table = {
"result": {"bioproject_accession": study_accession},
"center_name": "generic_center_name",
"group_id": 1,
"organism": "Test organism",
}
manifest = create_manifest_object(
config,
results_in_sample_table,
results_in_project_table,
sample_accession,
study_accession,
sample_data_in_submission_table,
self.seq_key,
)
manifest_file_name = create_manifest(manifest)
data = {}
Expand All @@ -278,12 +288,12 @@ def test_create_manifest(self):
expected_data = {
"STUDY": study_accession,
"SAMPLE": sample_accession,
"ASSEMBLYNAME": "test_accession",
"ASSEMBLYNAME": "LOC_0001TLY",
"ASSEMBLY_TYPE": "isolate",
"COVERAGE": "1",
"PROGRAM": "Unknown",
"PROGRAM": "Ivar",
"PLATFORM": "Illumina",
"DESCRIPTION": "Original sequence submitted to Loculus with accession: test_accession, version: test_version",
"DESCRIPTION": "Original sequence submitted to Loculus with accession: LOC_0001TLY, version: 1",
"MOLECULETYPE": "genomic RNA",
}

Expand Down
Loading

0 comments on commit 345218d

Please sign in to comment.