feat(ingest): Add basic ingest pipeline, deploy for ebola-zaire (#1399)
1 parent f3ad919 · commit f405c8b

Showing 16 changed files with 1,098 additions and 0 deletions.
.github/workflows/ingest.yml (new file, +82)

```yaml
name: ingest

on:
  push:
  workflow_dispatch:
    inputs:
      build_arm:
        type: boolean
        description: "Build for ARM as well"
        default: false
        required: false

env:
  DOCKER_IMAGE_NAME: ghcr.io/loculus-project/ingest
  # Env values are always strings, so downstream checks compare with == 'true'
  BUILD_ARM: ${{ github.ref == 'refs/heads/main' || github.event.inputs.build_arm }}

concurrency:
  group: ci-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}-ingest
  cancel-in-progress: true

jobs:
  dockerImage:
    name: Build ingest Docker Image # Don't change: Referenced by .github/workflows/update-argocd-metadata.yml
    runs-on: ubuntu-latest
    timeout-minutes: 15
    permissions:
      contents: read
      packages: write
      checks: read
    steps:
      - uses: actions/checkout@v4

      - name: Generate files hash
        id: files-hash
        run: |
          DIR_HASH=$(echo -n ${{ hashFiles('ingest/**', '.github/workflows/ingest.yml') }})
          echo "DIR_HASH=$DIR_HASH${{ env.BUILD_ARM == 'true' && '-arm' || '' }}" >> $GITHUB_ENV

      - name: Setup Docker metadata
        id: dockerMetadata
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.DOCKER_IMAGE_NAME }}
          tags: |
            type=raw,value=${{ env.DIR_HASH }}
            type=raw,value=latest,enable=${{ github.ref == 'refs/heads/main' }}
            type=ref,event=branch
            type=sha,prefix=commit-

      - name: Login to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Check if image exists
        id: check-image
        run: |
          EXISTS=$(docker manifest inspect ${{ env.DOCKER_IMAGE_NAME }}:${{ env.DIR_HASH }} > /dev/null 2>&1 && echo "true" || echo "false")
          echo "CACHE_HIT=$EXISTS" >> $GITHUB_ENV

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Build and push image if input files changed
        if: env.CACHE_HIT == 'false'
        uses: docker/build-push-action@v5
        with:
          context: ./ingest
          push: true
          tags: ${{ steps.dockerMetadata.outputs.tags }}
          cache-from: type=gha,scope=ingest-${{ github.ref }}
          cache-to: type=gha,mode=max,scope=ingest-${{ github.ref }}
          platforms: ${{ env.BUILD_ARM == 'true' && 'linux/amd64,linux/arm64' || 'linux/amd64' }}

      - name: Retag and push existing image if cache hit
        if: env.CACHE_HIT == 'true'
        run: |
          TAGS=(${{ steps.dockerMetadata.outputs.tags }})
          for TAG in "${TAGS[@]}"; do
            docker buildx imagetools create --tag $TAG ${{ env.DOCKER_IMAGE_NAME }}:${{ env.DIR_HASH }}
          done
```
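Pushes build on every branch; the build_arm input only matters for manual runs, since BUILD_ARM is always true on main. A sketch of a manual dispatch with the GitHub CLI, assuming gh is authenticated with access to the repository:

```bash
# Manually trigger the ingest workflow with the ARM build enabled
gh workflow run ingest.yml --ref main -f build_arm=true

# Follow the resulting run's progress in the terminal
gh run watch
```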
ingest/.dockerignore (new file, +8)

```
.snakemake/
.git/
data/
results/
result/
.DS_Store
.ruff_cache
config/config.yaml
```
ingest/.gitignore (new file, +5)

```
.snakemake/
data/
results/
.DS_Store
.ruff_cache
```
ingest/.mambarc (new file, +6)

```yaml
channels:
  - conda-forge
  - bioconda
repodata_use_zst: true
channel_priority: strict
download_threads: 20
```
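The Dockerfile below passes this rc file to micromamba via --rc-file. To reproduce the container's environment locally, the same file can be reused — a sketch, assuming micromamba is installed and that ingest/environment.yml (part of this commit, not shown above) lists the pipeline's dependencies:

```bash
cd ingest
# Create a local environment with the same channels and settings as the image
micromamba create -n loculus-ingest -f environment.yml --rc-file .mambarc
micromamba activate loculus-ingest
```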
ingest/Dockerfile (new file, +15)

```dockerfile
FROM mambaorg/micromamba:1.5.7

COPY --chown=$MAMBA_USER:$MAMBA_USER environment.yml /tmp/env.yaml
COPY --chown=$MAMBA_USER:$MAMBA_USER .mambarc /tmp/.mambarc

RUN micromamba config set extract_threads 1 \
    && micromamba install -y -n base -f /tmp/env.yaml --rc-file /tmp/.mambarc \
    && micromamba clean --all --yes

# Set the environment variable to activate the conda environment
ARG MAMBA_DOCKERFILE_ACTIVATE=1

COPY --chown=$MAMBA_USER:$MAMBA_USER . /package

WORKDIR /package
```
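A local build-and-run sketch; the mount path and the snakemake entrypoint are assumptions (environment.yml would need to provide snakemake), and the config is mounted at runtime because .dockerignore keeps config/config.yaml out of the image:

```bash
# Build the ingest image from the repository root
docker build -t ingest-local ./ingest

# Run the pipeline with a runtime config mounted into /package
docker run --rm \
  -v "$(pwd)/my-config.yaml:/package/config/config.yaml" \
  ingest-local snakemake --cores 2
```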
ingest/README.md (new file, +12)

```markdown
# Pipeline to ingest data from INSDC into Loculus

## Overview

1. Download data from INSDC
2. Filter
3. Turn into FASTA/metadata
4. Upload to Loculus

## Deployment

The pipeline is packaged as a Docker container that takes a config file as input.
```
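For local development outside the container, a dry run shows how the steps above map onto the Snakefile rules that follow — a sketch, assuming the conda environment from environment.yml is active and config/config.yaml is in place:

```bash
cd ingest
# List the planned jobs without executing anything
snakemake --dry-run
# Execute the full pipeline
snakemake --cores 4
```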
ingest/Snakefile (new file, +124)

```python
# Ingest pipeline: download sequences and metadata for the configured taxon
# from NCBI (INSDC), reformat them, and submit them to Loculus.

TAXON_ID = config["taxon_id"]
ALL_FIELDS = ",".join(config["all_fields"])
COLUMN_MAPPING = config["column_mapping"]
LOG_LEVEL = config.get("log_level", "INFO")


def rename_columns(input_file, output_file):
    """Rewrite the TSV header according to COLUMN_MAPPING; copy rows as-is."""
    with open(input_file, "r") as f:
        header = f.readline().strip().split("\t")
        header = [COLUMN_MAPPING.get(h, h) for h in header]
        with open(output_file, "w") as g:
            g.write("\t".join(header) + "\n")
            for line in f:
                g.write(line)


rule all:
    input:
        "data/sequences.fasta",
        "data/metadata.tsv",


rule fetch_ncbi_dataset_package:
    output:
        dataset_package="results/ncbi_dataset.zip",
    retries: 5
    shell:
        """
        datasets download virus genome taxon {TAXON_ID} \
            --no-progressbar \
            --filename {output.dataset_package}
        """


rule extract_ncbi_dataset_sequences:
    input:
        dataset_package="results/ncbi_dataset.zip",
    output:
        ncbi_dataset_sequences="results/sequences.fasta",
    shell:
        """
        unzip -jp {input.dataset_package} \
            ncbi_dataset/data/genomic.fna \
            | seqkit seq -i -w0 \
            > {output.ncbi_dataset_sequences}
        """


rule format_ncbi_dataset_report:
    input:
        dataset_package="results/ncbi_dataset.zip",
    output:
        ncbi_dataset_tsv="results/metadata_post_extract.tsv",
    params:
        fields_to_include=ALL_FIELDS,
    shell:
        """
        dataformat tsv virus-genome \
            --package {input.dataset_package} \
            --fields {params.fields_to_include:q} \
            > {output.ncbi_dataset_tsv}
        """


rule rename_columns:
    input:
        ncbi_dataset_tsv="results/metadata_post_extract.tsv",
    output:
        ncbi_dataset_tsv="results/metadata_post_rename.tsv",
    run:
        rename_columns(input.ncbi_dataset_tsv, output.ncbi_dataset_tsv)


rule prepare_metadata:
    input:
        metadata="results/metadata_post_rename.tsv",
        config="config/config.yaml",
    output:
        metadata="results/metadata_post_prepare.tsv",
    params:
        log_level=LOG_LEVEL,
    shell:
        """
        python scripts/prepare_metadata.py \
            --config-file {input.config} \
            --input {input.metadata} \
            --output {output.metadata} \
            --log-level {params.log_level}
        """


rule submit_to_loculus:
    input:
        metadata="results/metadata_post_prepare.tsv",
        sequences="results/sequences.fasta",
        config="config/config.yaml",
    output:
        submitted=touch("results/submitted"),
    params:
        log_level=LOG_LEVEL,
    shell:
        """
        python scripts/submit_to_loculus.py \
            --mode submit \
            --metadata {input.metadata} \
            --sequences {input.sequences} \
            --config-file {input.config} \
            --log-level {params.log_level}
        """


rule approve:
    input:
        submitted="results/submitted",
        config="config/config.yaml",
    params:
        log_level=LOG_LEVEL,
    shell:
        """
        python scripts/submit_to_loculus.py \
            --mode approve \
            --config-file {input.config} \
            --log-level {params.log_level}
        """
```
(The diff for the remaining nine changed files did not load.)