feat(ingest): Add basic ingest pipeline, deploy for ebola-zaire (#1399)
1 parent f3ad919 · commit f405c8b

Showing 16 changed files with 1,098 additions and 0 deletions.
.github/workflows/ingest.yml (new file, +82)

```yaml
name: ingest

on:
  push:
  workflow_dispatch:
    inputs:
      build_arm:
        type: boolean
        description: "Build for ARM as well"
        default: false
        required: false

env:
  DOCKER_IMAGE_NAME: ghcr.io/loculus-project/ingest
  # Env values are always strings, so downstream checks compare with == 'true'
  BUILD_ARM: ${{ github.ref == 'refs/heads/main' || github.event.inputs.build_arm }}

concurrency:
  group: ci-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}-ingest
  cancel-in-progress: true

jobs:
  dockerImage:
    name: Build ingest Docker Image # Don't change: Referenced by .github/workflows/update-argocd-metadata.yml
    runs-on: ubuntu-latest
    timeout-minutes: 15
    permissions:
      contents: read
      packages: write
      checks: read
    steps:
      - uses: actions/checkout@v4

      - name: Generate files hash
        id: files-hash
        run: |
          DIR_HASH=$(echo -n ${{ hashFiles('ingest/**', '.github/workflows/ingest.yml') }})
          echo "DIR_HASH=$DIR_HASH${{ env.BUILD_ARM == 'true' && '-arm' || '' }}" >> $GITHUB_ENV

      - name: Setup Docker metadata
        id: dockerMetadata
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.DOCKER_IMAGE_NAME }}
          tags: |
            type=raw,value=${{ env.DIR_HASH }}
            type=raw,value=latest,enable=${{ github.ref == 'refs/heads/main' }}
            type=ref,event=branch
            type=sha,prefix=commit-

      - name: Login to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Check if image exists
        id: check-image
        run: |
          EXISTS=$(docker manifest inspect ${{ env.DOCKER_IMAGE_NAME }}:${{ env.DIR_HASH }} > /dev/null 2>&1 && echo "true" || echo "false")
          echo "CACHE_HIT=$EXISTS" >> $GITHUB_ENV

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Build and push image if input files changed
        if: env.CACHE_HIT == 'false'
        uses: docker/build-push-action@v5
        with:
          context: ./ingest
          push: true
          tags: ${{ steps.dockerMetadata.outputs.tags }}
          cache-from: type=gha,scope=ingest-${{ github.ref }}
          cache-to: type=gha,mode=max,scope=ingest-${{ github.ref }}
          platforms: ${{ env.BUILD_ARM == 'true' && 'linux/amd64,linux/arm64' || 'linux/amd64' }}

      - name: Retag and push existing image if cache hit
        if: env.CACHE_HIT == 'true'
        run: |
          TAGS=(${{ steps.dockerMetadata.outputs.tags }})
          for TAG in "${TAGS[@]}"; do
            docker buildx imagetools create --tag $TAG ${{ env.DOCKER_IMAGE_NAME }}:${{ env.DIR_HASH }}
          done
```
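Pushes build on every branch; the build_arm input only matters for manual runs, since BUILD_ARM is always true on main. A sketch of a manual dispatch with the GitHub CLI, assuming gh is authenticated with access to the repository:

```bash
# Manually trigger the ingest workflow with the ARM build enabled
gh workflow run ingest.yml --ref main -f build_arm=true

# Follow the resulting run's progress in the terminal
gh run watch
```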
ingest/.dockerignore (new file, +8)

```
.snakemake/
.git/
data/
results/
result/
.DS_Store
.ruff_cache
config/config.yaml
```
ingest/.gitignore (new file, +5)

```
.snakemake/
data/
results/
.DS_Store
.ruff_cache
```
ingest/.mambarc (new file, +6)

```yaml
channels:
  - conda-forge
  - bioconda
repodata_use_zst: true
channel_priority: strict
download_threads: 20
```
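The Dockerfile below passes this rc file to micromamba via --rc-file. To reproduce the container's environment locally, the same file can be reused — a sketch, assuming micromamba is installed and that ingest/environment.yml (part of this commit, not shown above) lists the pipeline's dependencies:

```bash
cd ingest
# Create a local environment with the same channels and settings as the image
micromamba create -n loculus-ingest -f environment.yml --rc-file .mambarc
micromamba activate loculus-ingest
```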
ingest/Dockerfile (new file, +15)

```dockerfile
FROM mambaorg/micromamba:1.5.7

COPY --chown=$MAMBA_USER:$MAMBA_USER environment.yml /tmp/env.yaml
COPY --chown=$MAMBA_USER:$MAMBA_USER .mambarc /tmp/.mambarc

RUN micromamba config set extract_threads 1 \
    && micromamba install -y -n base -f /tmp/env.yaml --rc-file /tmp/.mambarc \
    && micromamba clean --all --yes

# Set the environment variable to activate the conda environment
ARG MAMBA_DOCKERFILE_ACTIVATE=1

COPY --chown=$MAMBA_USER:$MAMBA_USER . /package

WORKDIR /package
```
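A local build-and-run sketch; the mount path and the snakemake entrypoint are assumptions (environment.yml would need to provide snakemake), and the config is mounted at runtime because .dockerignore keeps config/config.yaml out of the image:

```bash
# Build the ingest image from the repository root
docker build -t ingest-local ./ingest

# Run the pipeline with a runtime config mounted into /package
docker run --rm \
  -v "$(pwd)/my-config.yaml:/package/config/config.yaml" \
  ingest-local snakemake --cores 2
```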
ingest/README.md (new file, +12)

```markdown
# Pipeline to ingest data from INSDC into Loculus

## Overview

1. Download data from INSDC
2. Filter
3. Turn into FASTA/metadata
4. Upload to Loculus

## Deployment

The pipeline is packaged as a Docker container that takes a config file as input.
```
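For local development outside the container, a dry run shows how the steps above map onto the Snakefile rules that follow — a sketch, assuming the conda environment from environment.yml is active and config/config.yaml is in place:

```bash
cd ingest
# List the planned jobs without executing anything
snakemake --dry-run
# Execute the full pipeline
snakemake --cores 4
```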
ingest/Snakefile (new file, +124)

```python
# Ingest pipeline: download sequences and metadata for the configured taxon
# from NCBI (INSDC), reformat them, and submit them to Loculus.

TAXON_ID = config["taxon_id"]
ALL_FIELDS = ",".join(config["all_fields"])
COLUMN_MAPPING = config["column_mapping"]
LOG_LEVEL = config.get("log_level", "INFO")


def rename_columns(input_file, output_file):
    """Rewrite the TSV header according to COLUMN_MAPPING; copy rows as-is."""
    with open(input_file, "r") as f:
        header = f.readline().strip().split("\t")
        header = [COLUMN_MAPPING.get(h, h) for h in header]
        with open(output_file, "w") as g:
            g.write("\t".join(header) + "\n")
            for line in f:
                g.write(line)


rule all:
    input:
        "data/sequences.fasta",
        "data/metadata.tsv",


rule fetch_ncbi_dataset_package:
    output:
        dataset_package="results/ncbi_dataset.zip",
    retries: 5
    shell:
        """
        datasets download virus genome taxon {TAXON_ID} \
            --no-progressbar \
            --filename {output.dataset_package}
        """


rule extract_ncbi_dataset_sequences:
    input:
        dataset_package="results/ncbi_dataset.zip",
    output:
        ncbi_dataset_sequences="results/sequences.fasta",
    shell:
        """
        unzip -jp {input.dataset_package} \
            ncbi_dataset/data/genomic.fna \
            | seqkit seq -i -w0 \
            > {output.ncbi_dataset_sequences}
        """


rule format_ncbi_dataset_report:
    input:
        dataset_package="results/ncbi_dataset.zip",
    output:
        ncbi_dataset_tsv="results/metadata_post_extract.tsv",
    params:
        fields_to_include=ALL_FIELDS,
    shell:
        """
        dataformat tsv virus-genome \
            --package {input.dataset_package} \
            --fields {params.fields_to_include:q} \
            > {output.ncbi_dataset_tsv}
        """


rule rename_columns:
    input:
        ncbi_dataset_tsv="results/metadata_post_extract.tsv",
    output:
        ncbi_dataset_tsv="results/metadata_post_rename.tsv",
    run:
        rename_columns(input.ncbi_dataset_tsv, output.ncbi_dataset_tsv)


rule prepare_metadata:
    input:
        metadata="results/metadata_post_rename.tsv",
        config="config/config.yaml",
    output:
        metadata="results/metadata_post_prepare.tsv",
    params:
        log_level=LOG_LEVEL,
    shell:
        """
        python scripts/prepare_metadata.py \
            --config-file {input.config} \
            --input {input.metadata} \
            --output {output.metadata} \
            --log-level {params.log_level}
        """


rule submit_to_loculus:
    input:
        metadata="results/metadata_post_prepare.tsv",
        sequences="results/sequences.fasta",
        config="config/config.yaml",
    output:
        submitted=touch("results/submitted"),
    params:
        log_level=LOG_LEVEL,
    shell:
        """
        python scripts/submit_to_loculus.py \
            --mode submit \
            --metadata {input.metadata} \
            --sequences {input.sequences} \
            --config-file {input.config} \
            --log-level {params.log_level}
        """


rule approve:
    input:
        submitted="results/submitted",
        config="config/config.yaml",
    params:
        log_level=LOG_LEVEL,
    shell:
        """
        python scripts/submit_to_loculus.py \
            --mode approve \
            --config-file {input.config} \
            --log-level {params.log_level}
        """
```
(The diff for the remaining nine changed files did not load.)