Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
996 changes: 0 additions & 996 deletions artic/align_trim.py

This file was deleted.

19 changes: 13 additions & 6 deletions artic/artic_mqc.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
import re
import sys

from artic.utils import read_bed_file
from primalbedtools.scheme import Scheme
from primalbedtools.bedfiles import BedLine, merge_primers
from primalbedtools.amplicons import Amplicon, create_amplicons

# Alignment_Length_Threshold drops binned reads that are <X% of amplicon length)
Alignment_Length_Threshold = 0.95
Expand Down Expand Up @@ -71,13 +73,18 @@ def getSchemeAmplicons(schemeFile):
A dict of amplicon names -> zeroed counter
"""
amplicons = {}
primer_scheme = read_bed_file(schemeFile)
for primer in primer_scheme:

scheme = Scheme.from_file(schemeFile)

# Merge the primers
scheme.bedlines = merge_primers(scheme.bedlines)

for primer in scheme.bedlines:
amplicon = ""
if primer["direction"] == "+":
amplicon = primer["Primer_ID"].split("_LEFT")[0]
if primer.strand == "+":
amplicon = primer.primername.split("_LEFT")[0]
else:
amplicon = primer["Primer_ID"].split("_RIGHT")[0]
amplicon = primer.primername.split("_RIGHT")[0]
if amplicon not in amplicons:
amplicons[amplicon] = 0
amplicons[amplicon] += 1
Expand Down
33 changes: 19 additions & 14 deletions artic/mask.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,26 @@
from Bio import SeqIO
from cyvcf2 import VCF
import argparse
import pandas as pd
import csv


def read_3col_bed(fn):
# read the primer scheme into a pandas dataframe and run type, length and null checks
bedfile = pd.read_csv(
fn,
sep="\t",
header=None,
names=["chrom", "start", "end"],
dtype={"chrom": str, "start": int, "end": int},
usecols=(0, 1, 2),
skiprows=0,
)
return bedfile
with open(fn, "rt") as f:
reader = csv.DictReader(f, fieldnames=["chrom", "start", "end"], delimiter="\t")

bedlines = []
for row in reader:
try:
row["start"] = int(row["start"])
row["end"] = int(row["end"])
except ValueError:
raise ValueError(
"The depth mask bedfile appears to be malformed, the start or end position cannot be converted to an integer"
)
bedlines.append(row)

return bedlines


def go(args):
Expand All @@ -27,9 +32,9 @@ def go(args):
cons[k] = list(seqs[k].seq)

bedfile = read_3col_bed(args.maskfile)
for _, region in bedfile.iterrows():
for n in range(region["start"], region["end"]):
cons[region["chrom"]][n] = "N"
for bedline in bedfile:
for n in range(bedline["start"], bedline["end"]):
cons[bedline["chrom"]][n] = "N"

vcf_reader = VCF(args.maskvcf)
for record in vcf_reader:
Expand Down
20 changes: 11 additions & 9 deletions artic/minion.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
import os
import sys
import time
from artic.utils import read_bed_file, get_scheme, choose_model
from artic.utils import get_scheme, choose_model
from primalbedtools.scheme import Scheme


def run(parser, args):
Expand Down Expand Up @@ -121,7 +122,8 @@ def run(parser, args):
raise SystemExit(1)

## collect the primer pools
pools = set([row["PoolName"] for row in read_bed_file(bed)] + ["unmatched"])
scheme = Scheme.from_file(bed)
pools = set([row.pool for row in scheme.bedlines] + ["unmatched"])

## create a holder to keep the pipeline commands in
cmds = []
Expand All @@ -141,26 +143,26 @@ def run(parser, args):
normalise_string = f"--normalise {args.normalise}" if args.normalise else ""

incorrect_pairs_string = (
"--remove-incorrect-pairs" if not args.allow_mismatched_primers else ""
"--allow-incorrect-pairs" if args.allow_mismatched_primers else ""
)

cmds.append(
f"align_trim {normalise_string} {bed} --primer-match-threshold {args.primer_match_threshold} {incorrect_pairs_string} --min-mapq {args.min_mapq} --report {args.sample}.alignreport.csv < {args.sample}.sorted.bam > {args.sample}.trimmed.rg.sam"
f"align_trim {normalise_string} {bed} --no-trim-primers --primer-match-threshold {args.primer_match_threshold} {incorrect_pairs_string} --min-mapq {args.min_mapq} --report {args.sample}.alignreport.tsv --samfile {args.sample}.sorted.bam -o {args.sample}.trimmed.rg.bam"
)

cmds.append(
f"samtools sort -T {args.sample} {args.sample}.trimmed.rg.sam -o {args.sample}.trimmed.rg.sorted.bam"
f"samtools sort -T {args.sample} {args.sample}.trimmed.rg.bam -o {args.sample}.trimmed.rg.sorted.bam"
)
cmds.append(f"rm {args.sample}.trimmed.rg.sam")
cmds.append(f"rm {args.sample}.trimmed.rg.bam")

cmds.append(
f"align_trim {normalise_string} {bed} --primer-match-threshold {args.primer_match_threshold} --min-mapq {args.min_mapq} {incorrect_pairs_string} --trim-primers --report {args.sample}.alignreport.csv --amp-depth-report {args.sample}.amplicon_depths.tsv < {args.sample}.sorted.bam > {args.sample}.primertrimmed.rg.sam"
f"align_trim {normalise_string} {bed} --primer-match-threshold {args.primer_match_threshold} --min-mapq {args.min_mapq} {incorrect_pairs_string} --report {args.sample}.alignreport.tsv --amp-depth-report {args.sample}.amplicon_depths.tsv --samfile {args.sample}.sorted.bam -o {args.sample}.primertrimmed.rg.bam"
)

cmds.append(
f"samtools sort -T {args.sample} {args.sample}.primertrimmed.rg.sam -o {args.sample}.primertrimmed.rg.sorted.bam"
f"samtools sort -T {args.sample} {args.sample}.primertrimmed.rg.bam -o {args.sample}.primertrimmed.rg.sorted.bam"
)
cmds.append(f"rm {args.sample}.primertrimmed.rg.sam")
cmds.append(f"rm {args.sample}.primertrimmed.rg.bam")

cmds.append(f"samtools index {args.sample}.trimmed.rg.sorted.bam")
cmds.append(f"samtools index {args.sample}.primertrimmed.rg.sorted.bam")
Expand Down
137 changes: 0 additions & 137 deletions artic/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,143 +296,6 @@ def identify_bed_file(bed_file):
return version


def read_bed_file(fn):
"""Parses a V2/V3 bed file and collapses primers into canonical primer sites

Parameters
----------
fn : str
The bedfile to parse

Returns
-------
list
A list of dictionaries, where each dictionary contains a row of the parsed bedfile.
The available dictionary keys are - Primer_ID, direction, start, end
"""

# read the primer scheme into a pandas dataframe and run type, length and null checks
version = identify_bed_file(fn)

if version in (1, 2):
return read_bed_file_legacy(fn)

primers = pd.read_csv(
fn,
sep="\t",
comment="#",
header=None,
names=["chrom", "start", "end", "Primer_ID", "PoolName", "direction"],
dtype={
"chrom": str,
"start": int,
"end": int,
"Primer_ID": str,
"PoolName": str,
},
usecols=(0, 1, 2, 3, 4, 5),
skiprows=0,
)
if len(primers.index) < 1:
print("primer scheme file is empty", file=sys.stderr)
raise SystemExit(1)
if primers.isnull().sum().sum():
print("malformed primer scheme file", file=sys.stderr)
raise SystemExit(1)

canonical_primers = {}
for _, row in primers.iterrows():
scheme_name, primer_id, direction, primer_n = row["Primer_ID"].split("_")

if (row["chrom"], primer_id, direction) not in canonical_primers:
canonical_primers[(row["chrom"], primer_id, direction)] = row.to_dict()
continue

canonical_primers[(row["chrom"], primer_id, direction)] = merge_sites(
canonical_primers[(row["chrom"], primer_id, direction)], row
)

# return the bedFile as a list
return [value for value in canonical_primers.values()]


def read_bed_file_legacy(fn):
"""Parses a bed file and collapses alts into canonical primer sites

Parameters
----------
fn : str
The bedfile to parse

Returns
-------
list
A list of dictionaries, where each dictionary contains a row of the parsed bedfile.
The available dictionary keys are - Primer_ID, direction, start, end
"""

# read the primer scheme into a pandas dataframe and run type, length and null checks
primers = pd.read_csv(
fn,
sep="\t",
header=None,
names=["chrom", "start", "end", "Primer_ID", "PoolName", "direction"],
dtype={
"chrom": str,
"start": int,
"end": int,
"Primer_ID": str,
"PoolName": str,
},
usecols=(0, 1, 2, 3, 4, 5),
skiprows=0,
)
if len(primers.index) < 1:
print("primer scheme file is empty", file=sys.stderr)
raise SystemExit(1)
if primers.isnull().sum().sum():
print("malformed primer scheme file", file=sys.stderr)
raise SystemExit(1)

# separate alt primers into a new dataframe
altFilter = primers["Primer_ID"].str.contains("_alt")
alts = pd.DataFrame(
columns=("chrom", "start", "end", "Primer_ID", "PoolName", "direction")
)
alts = pd.concat([alts, primers[altFilter]])
primers = primers.drop(primers[altFilter].index.values)

# convert the primers dataframe to dictionary, indexed by Primer_ID
# - verify_integrity is used to prevent duplicate Primer_IDs being processed
bedFile = primers.set_index(
"Primer_ID", drop=False, verify_integrity=True
).T.to_dict()

# if there were no alts, return the bedfile as a list of dicts
if len(alts.index) == 0:
return list(bedFile.values())

# merge alts
for _, row in alts.iterrows():
primerID = row["Primer_ID"].split("_alt")[0]

# check the bedFile if another version of this primer exists
if primerID not in bedFile:

# add to the bed file and continue
bedFile[primerID] = row.to_dict()
continue

# otherwise, we've got a primer ID we've already seen so merge the alt
mergedSite = merge_sites(bedFile[primerID], row)

# update the bedFile
bedFile[primerID] = mergedSite

# return the bedFile as a list
return [value for value in bedFile.values()]


def overlaps(coords, pos):
for v in coords:
if pos >= v["start"] and pos <= v["end"]:
Expand Down
28 changes: 0 additions & 28 deletions artic/vcf_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,34 +43,6 @@ def check_filter(self, v):
return True


# class MedakaFilter:
# def __init__(self, no_frameshifts, min_depth):
# self.no_frameshifts = no_frameshifts
# self.min_depth = min_depth
# self.min_variant_quality = 20

# def check_filter(self, v, min_depth):
# try:
# # We don't really care about the depth here, just skip it if it isn't there
# depth = v.INFO["DP"]
# except KeyError:
# depth = v.format("DP")[0][0]

# if depth < min_depth:
# return False

# if self.no_frameshifts and not in_frame(v):
# return False

# if v.num_het:
# return False

# if v.QUAL < self.min_variant_quality:
# return False

# return True


def go(args):
vcf_reader = VCF(args.inputvcf)
vcf_writer = Writer(args.output_pass_vcf, vcf_reader, "w")
Expand Down
16 changes: 11 additions & 5 deletions artic/vcf_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,23 @@
import sys
from operator import attrgetter
from collections import defaultdict
from artic.utils import read_bed_file

from primalbedtools.scheme import Scheme
from primalbedtools.bedfiles import merge_primers


def vcf_merge(args):
bed = read_bed_file(args.bedfile)

scheme = Scheme.from_file(args.bedfile)

# Merge the primers
scheme.bedlines = merge_primers(scheme.bedlines)

primer_map = defaultdict(dict)

for p in bed:
for n in range(p["start"], p["end"] + 1):
primer_map[p["PoolName"]][n] = p["Primer_ID"]
for p in scheme.bedlines:
for n in range(p.start, p.end + 1):
primer_map[p.pool][n] = p.primername

template_vcf = None

Expand Down
6 changes: 4 additions & 2 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,11 @@ dependencies:
- pandas
- pip
- pysam
- pytest
- pytest
- cyvcf2
- requests
- samtools
- tqdm
- seqtk
- seqtk
- align_trim
- primalbedtools>=0.10.1
1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ content-type = "text/markdown"

[project.scripts]
artic = "artic.pipeline:main"
align_trim = "artic.align_trim:main"
margin_cons = "artic.margin_cons:main"
vcfextract = "artic.vcfextract:main"
artic_vcf_merge = "artic.vcf_merge:main"
Expand Down
Loading
Loading