Hierarchical timepoints #2

Open · wants to merge 3 commits into main
1 change: 1 addition & 0 deletions .gitignore
@@ -4,6 +4,7 @@ logs/
 results/
 benchmarks/
 sequence-counts/
+aggregated-counts/
 mlr-estimates/

 # Specific Auspice JSONs
8 changes: 7 additions & 1 deletion defaults/config.yaml
@@ -26,8 +26,14 @@ datasets:
   - h3n2_clades_2022-23
   - h3n2_clades_2023-24

-# H3N2 uses a 2-year analysis window and slides 1-year forward
+analyses:
+  - sarscov2_clades
+
+sarscov2_clades:
+  pivot: "19B"
+  generation_time: 3.2
+
+# H3N2 uses a 2-year analysis window and slides 1-year forward
 h3n2_clades_2016-17:
   local_metadata: "data/h3n2_subset_metadata.tsv.zst"
   seq_count_options: >-
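As context for the rules below: per-timepoint datasets are named "<analysis>_<timepoint>", and the new aggregation rule matches datasets to an analysis by that prefix. A minimal sketch of the convention (the sarscov2 dataset name here is hypothetical; the config above only lists h3n2 timepoints):

# Prefix convention linking per-timepoint datasets to an analysis;
# mirrors the startswith() filter used by rule aggregate_sequence_counts.
datasets = ["h3n2_clades_2016-17", "h3n2_clades_2022-23", "sarscov2_clades_2021-22"]
analyses = ["h3n2_clades", "sarscov2_clades"]

for analysis in analyses:
    members = [d for d in datasets if d.startswith(f"{analysis}_")]
    print(analysis, "->", members)
# h3n2_clades -> ['h3n2_clades_2016-17', 'h3n2_clades_2022-23']
# sarscov2_clades -> ['sarscov2_clades_2021-22']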
2 changes: 1 addition & 1 deletion defaults/mlr-config.yaml
@@ -6,7 +6,7 @@ data:

 settings:
   fit: true # Fit the model?
-  save: true # Save model state?
+  save: false # Save model state?
   load: false # Load old model?
   export_json: true # Export model results as json
   ps: [0.5, 0.8, 0.95] # HPDI intervals to be exported
15 changes: 6 additions & 9 deletions rules/mlr_estimates.smk
@@ -4,13 +4,10 @@ This part of the workflow runs the model scripts.

 def _get_models_option(wildcards, option_name):
     """
-    Return the option for model from the config based on the
-    wildcards.data_provenance, wildcards.variant_classification and the wildcards.geo_resolution values.
-
-    If the *option* exists as a key within config['models'][wildcard.data_provenance][wildcard.variant_classification][wildcard.geo_resolution]
+    If the *option* exists as a key within config['analyses']
     then return as "--{option-name} {option_value}". Or else return an empty string.
     """
-    option_value = config.get(wildcards.dataset, {}) \
+    option_value = config.get(wildcards.analysis, {}) \
         .get(option_name)

     if option_value is not None:
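A standalone sketch of the renamed lookup: the config dict mirrors the analyses block added in defaults/config.yaml, and the hyphenated flag formatting is an assumption taken from the docstring's "--{option-name} {option_value}" description, not from code shown in this diff:

# Stand-in for _get_models_option without Snakemake wildcards; the
# replace('_', '-') step is an assumption based on the docstring above.
config = {"sarscov2_clades": {"pivot": "19B", "generation_time": 3.2}}

def get_models_option(analysis, option_name):
    option_value = config.get(analysis, {}).get(option_name)
    if option_value is not None:
        return f"--{option_name.replace('_', '-')} {option_value}"
    return ""

print(get_models_option("sarscov2_clades", "pivot"))            # --pivot 19B
print(get_models_option("sarscov2_clades", "generation_time"))  # --generation-time 3.2
print(get_models_option("missing_analysis", "pivot"))           # "" (option not configured)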
@@ -22,18 +19,18 @@ def _get_models_option(wildcards, option_name):

 rule mlr_model:
     input:
-        sequence_counts = "sequence-counts/{dataset}/collapsed_seq_counts.tsv"
+        sequence_counts = "aggregated-counts/{analysis}/aggregated_sequence_counts.tsv"
     output:
         # Note this output is not used in the shell command because it is one of the many
         # files generated and output to the export path.
         # We are listing this specific file as the output file because it is
         # the final output of the model script.
-        results = "mlr-estimates/{dataset}/mlr_results.json"
+        results = "mlr-estimates/{analysis}/mlr_results.json"
     log:
-        "logs/{dataset}/mlr_model.txt"
+        "logs/{analysis}/mlr_model.txt"
     params:
         model_config = config.get("mlr_config"),
-        export_path = lambda w: f"mlr-estimates/{w.dataset}",
+        export_path = lambda w: f"mlr-estimates/{w.analysis}",
         pivot = lambda wildcards: _get_models_option(wildcards, 'pivot'),
         generation_time = lambda wildcards: _get_models_option(wildcards, 'generation_time')
     resources:
38 changes: 38 additions & 0 deletions rules/sequence_counts.smk
@@ -100,3 +100,41 @@ rule collapse_sequence_counts:
             {params.collapse_threshold} \
             --output-seq-counts {output.sequence_counts} 2>&1 | tee {log}
         """
+
+rule annotate_sequence_counts:
+    input:
+        sequence_counts = "sequence-counts/{dataset}/collapsed_seq_counts.tsv"
+    output:
+        sequence_counts = "sequence-counts/{dataset}/annotated_seq_counts.tsv"
+    shell:
+        """
+        dataset_suffix=$(echo "{wildcards.dataset}" | awk -F'_' '{{print $NF}}')
+        python scripts/annotate_sequence_counts.py \
+            --input {input.sequence_counts} \
+            --output {output.sequence_counts} \
+            --dataset-suffix "$dataset_suffix"
+        """

+import os
+
+# Rule to combine annotated sequence counts for each analysis
+rule aggregate_sequence_counts:
+    input:
+        lambda wildcards: expand(
+            "sequence-counts/{dataset}/annotated_seq_counts.tsv",
+            dataset=[d for d in config["datasets"] if d.startswith(f"{wildcards.analysis}_")]
+        )
+    output:
+        combined = "aggregated-counts/{analysis}/aggregated_sequence_counts.tsv"
+    run:
+        # Ensure the output directory exists
+        os.makedirs(os.path.dirname(output.combined), exist_ok=True)
+
+        # Handle the case where no input files are present
+        if len(input) == 0:
+            with open(output.combined, 'w') as out_file:
+                out_file.write("location\tvariant\tdate\tsequences\n")  # Example header
+        else:
+            shell("""
+                (head -n 1 {input[0]} && tail -n +2 -q {input}) > {output.combined}
+            """)
30 changes: 30 additions & 0 deletions scripts/annotate_sequence_counts.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python3
+
+import argparse
+import pandas as pd
+
+def annotate_sequence_counts(input_file, output_file, dataset_suffix):
+    # Read the TSV input file
+    df = pd.read_csv(input_file, sep='\t')
+
+    # Append dataset_suffix to the 'location' column
+    df['location'] = df['location'] + f"_{dataset_suffix}"
+
+    # Modify 'variant' where the value is 'other'
+    df.loc[df['variant'] == 'other', 'variant'] = df['variant'] + f"_{dataset_suffix}"
+
+    # Write the annotated DataFrame to the output file
+    df.to_csv(output_file, sep='\t', index=False)
+
+def main():
+    parser = argparse.ArgumentParser(description="Annotate sequence counts with dataset suffix.")
+    parser.add_argument('--input', required=True, help='Path to the input TSV file')
+    parser.add_argument('--output', required=True, help='Path to the output TSV file')
+    parser.add_argument('--dataset-suffix', required=True, help='Suffix to append to location and variant (if "other")')
+
+    args = parser.parse_args()
+
+    annotate_sequence_counts(args.input, args.output, args.dataset_suffix)
+
+if __name__ == "__main__":
+    main()
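For context, a worked example of the annotation with hypothetical rows; the columns match the header written by the aggregate rule (location, variant, date, sequences):

import pandas as pd

suffix = "2016-17"  # as extracted by the rule from a dataset name like h3n2_clades_2016-17

df = pd.DataFrame({
    "location":  ["USA", "USA"],
    "variant":   ["3C.2a", "other"],
    "date":      ["2016-10-01", "2016-10-01"],
    "sequences": [12, 3],
})
df["location"] = df["location"] + f"_{suffix}"
df.loc[df["variant"] == "other", "variant"] = df["variant"] + f"_{suffix}"
# Every location becomes "USA_2016-17"; only the "other" variant is
# renamed to "other_2016-17", while named clades keep their original labels.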