Hierarchical timepoints #2

Open · wants to merge 3 commits into main
1 change: 1 addition & 0 deletions .gitignore
@@ -4,6 +4,7 @@ logs/
 results/
 benchmarks/
 sequence-counts/
+aggregated-counts/
 mlr-estimates/

 # Specific Auspice JSONs
8 changes: 7 additions & 1 deletion defaults/config.yaml
@@ -26,8 +26,14 @@ datasets:
   - h3n2_clades_2022-23
   - h3n2_clades_2023-24

-# H3N2 uses a 2-year analysis window and slides 1-year forward
+analyses:
+  - sarscov2_clades
+
+sarscov2_clades:
+  pivot: "19B"
+  generation_time: 3.2
+
+# H3N2 uses a 2-year analysis window and slides 1-year forward
 h3n2_clades_2016-17:
   local_metadata: "data/h3n2_subset_metadata.tsv.zst"
   seq_count_options: >-
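As context for the rules below: per-timepoint datasets are named "<analysis>_<timepoint>", and the new aggregation rule matches datasets to an analysis by that prefix. A minimal sketch of the convention (the sarscov2 dataset name here is hypothetical; the config above only lists h3n2 timepoints):

# Prefix convention linking per-timepoint datasets to an analysis;
# mirrors the startswith() filter used by rule aggregate_sequence_counts.
datasets = ["h3n2_clades_2016-17", "h3n2_clades_2022-23", "sarscov2_clades_2021-22"]
analyses = ["h3n2_clades", "sarscov2_clades"]

for analysis in analyses:
    members = [d for d in datasets if d.startswith(f"{analysis}_")]
    print(analysis, "->", members)
# h3n2_clades -> ['h3n2_clades_2016-17', 'h3n2_clades_2022-23']
# sarscov2_clades -> ['sarscov2_clades_2021-22']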
2 changes: 1 addition & 1 deletion defaults/mlr-config.yaml
@@ -6,7 +6,7 @@ data:

 settings:
   fit: true # Fit the model?
-  save: true # Save model state?
+  save: false # Save model state?
   load: false # Load old model?
   export_json: true # Export model results as json
   ps: [0.5, 0.8, 0.95] # HPDI intervals to be exported
15 changes: 6 additions & 9 deletions rules/mlr_estimates.smk
@@ -4,13 +4,10 @@ This part of the workflow runs the model scripts.

 def _get_models_option(wildcards, option_name):
     """
-    Return the option for model from the config based on the
-    wildcards.data_provenance, wildcards.variant_classification and the wildcards.geo_resolution values.
-
-    If the *option* exists as a key within config['models'][wildcard.data_provenance][wildcard.variant_classification][wildcard.geo_resolution]
+    If the *option* exists as a key within config['analyses']
     then return as "--{option-name} {option_value}". Or else return an empty string.
     """
-    option_value = config.get(wildcards.dataset, {}) \
+    option_value = config.get(wildcards.analysis, {}) \
         .get(option_name)

     if option_value is not None:
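A standalone sketch of the renamed lookup: the config dict mirrors the analyses block added in defaults/config.yaml, and the hyphenated flag formatting is an assumption taken from the docstring's "--{option-name} {option_value}" description, not from code shown in this diff:

# Stand-in for _get_models_option without Snakemake wildcards; the
# replace('_', '-') step is an assumption based on the docstring above.
config = {"sarscov2_clades": {"pivot": "19B", "generation_time": 3.2}}

def get_models_option(analysis, option_name):
    option_value = config.get(analysis, {}).get(option_name)
    if option_value is not None:
        return f"--{option_name.replace('_', '-')} {option_value}"
    return ""

print(get_models_option("sarscov2_clades", "pivot"))            # --pivot 19B
print(get_models_option("sarscov2_clades", "generation_time"))  # --generation-time 3.2
print(get_models_option("missing_analysis", "pivot"))           # "" (option not configured)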
@@ -22,18 +19,18 @@ def _get_models_option(wildcards, option_name):

 rule mlr_model:
     input:
-        sequence_counts = "sequence-counts/{dataset}/collapsed_seq_counts.tsv"
+        sequence_counts = "aggregated-counts/{analysis}/aggregated_sequence_counts.tsv"
     output:
         # Note this output is not used in the shell command because it is one of the many
         # files generated and output to the export path.
         # We are listing this specific file as the output file because it is
         # the final output of the model script.
-        results = "mlr-estimates/{dataset}/mlr_results.json"
+        results = "mlr-estimates/{analysis}/mlr_results.json"
     log:
-        "logs/{dataset}/mlr_model.txt"
+        "logs/{analysis}/mlr_model.txt"
     params:
         model_config = config.get("mlr_config"),
-        export_path = lambda w: f"mlr-estimates/{w.dataset}",
+        export_path = lambda w: f"mlr-estimates/{w.analysis}",
         pivot = lambda wildcards: _get_models_option(wildcards, 'pivot'),
         generation_time = lambda wildcards: _get_models_option(wildcards, 'generation_time')
     resources:
38 changes: 38 additions & 0 deletions rules/sequence_counts.smk
@@ -100,3 +100,41 @@ rule collapse_sequence_counts:
             {params.collapse_threshold} \
             --output-seq-counts {output.sequence_counts} 2>&1 | tee {log}
         """
+
+rule annotate_sequence_counts:
+    input:
+        sequence_counts = "sequence-counts/{dataset}/collapsed_seq_counts.tsv"
+    output:
+        sequence_counts = "sequence-counts/{dataset}/annotated_seq_counts.tsv"
+    shell:
+        """
+        dataset_suffix=$(echo "{wildcards.dataset}" | awk -F'_' '{{print $NF}}')
+        python scripts/annotate_sequence_counts.py \
+            --input {input.sequence_counts} \
+            --output {output.sequence_counts} \
+            --dataset-suffix "$dataset_suffix"
+        """

+import os
+
+# Rule to combine annotated sequence counts for each analysis
+rule aggregate_sequence_counts:
+    input:
+        lambda wildcards: expand(
+            "sequence-counts/{dataset}/annotated_seq_counts.tsv",
+            dataset=[d for d in config["datasets"] if d.startswith(f"{wildcards.analysis}_")]
+        )
+    output:
+        combined = "aggregated-counts/{analysis}/aggregated_sequence_counts.tsv"
+    run:
+        # Ensure the output directory exists
+        os.makedirs(os.path.dirname(output.combined), exist_ok=True)
+
+        # Handle the case where no input files are present
+        if len(input) == 0:
+            with open(output.combined, 'w') as out_file:
+                out_file.write("location\tvariant\tdate\tsequences\n")  # Example header
+        else:
+            shell("""
+                (head -n 1 {input[0]} && tail -n +2 -q {input}) > {output.combined}
+            """)
30 changes: 30 additions & 0 deletions scripts/annotate_sequence_counts.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python3
+
+import argparse
+import pandas as pd
+
+def annotate_sequence_counts(input_file, output_file, dataset_suffix):
+    # Read the TSV input file
+    df = pd.read_csv(input_file, sep='\t')
+
+    # Append dataset_suffix to the 'location' column
+    df['location'] = df['location'] + f"_{dataset_suffix}"
+
+    # Modify 'variant' where the value is 'other'
+    df.loc[df['variant'] == 'other', 'variant'] = df['variant'] + f"_{dataset_suffix}"
+
+    # Write the annotated DataFrame to the output file
+    df.to_csv(output_file, sep='\t', index=False)
+
+def main():
+    parser = argparse.ArgumentParser(description="Annotate sequence counts with dataset suffix.")
+    parser.add_argument('--input', required=True, help='Path to the input TSV file')
+    parser.add_argument('--output', required=True, help='Path to the output TSV file')
+    parser.add_argument('--dataset-suffix', required=True, help='Suffix to append to location and variant (if "other")')
+
+    args = parser.parse_args()
+
+    annotate_sequence_counts(args.input, args.output, args.dataset_suffix)
+
+if __name__ == "__main__":
+    main()
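For context, a worked example of the annotation with hypothetical rows; the columns match the header written by the aggregate rule (location, variant, date, sequences):

import pandas as pd

suffix = "2016-17"  # as extracted by the rule from a dataset name like h3n2_clades_2016-17

df = pd.DataFrame({
    "location":  ["USA", "USA"],
    "variant":   ["3C.2a", "other"],
    "date":      ["2016-10-01", "2016-10-01"],
    "sequences": [12, 3],
})
df["location"] = df["location"] + f"_{suffix}"
df.loc[df["variant"] == "other", "variant"] = df["variant"] + f"_{suffix}"
# Every location becomes "USA_2016-17"; only the "other" variant is
# renamed to "other_2016-17", while named clades keep their original labels.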