artic-network · BioWilko · Aug 26, 2025 · Aug 26, 2025 · Aug 26, 2025 · Aug 26, 2025
diff --git a/artic/align_trim.py b/artic/align_trim.py
diff --git a/artic/artic_mqc.py b/artic/artic_mqc.py
@@ -3,7 +3,9 @@
 import re
 import sys
 
-from artic.utils import read_bed_file
+from primalbedtools.scheme import Scheme
+from primalbedtools.bedfiles import BedLine, merge_primers
+from primalbedtools.amplicons import Amplicon, create_amplicons
 
 # Alignment_Length_Threshold drops binned reads that are <X% of amplicon length)
 Alignment_Length_Threshold = 0.95
@@ -71,13 +73,18 @@ def getSchemeAmplicons(schemeFile):
         A dict of amplicon names -> zeroed counter
     """
     amplicons = {}
-    primer_scheme = read_bed_file(schemeFile)
-    for primer in primer_scheme:
+
+    scheme = Scheme.from_file(schemeFile)
+
+    # Merge the primers
+    scheme.bedlines = merge_primers(scheme.bedlines)
+
+    for primer in scheme.bedlines:
         amplicon = ""
-        if primer["direction"] == "+":
-            amplicon = primer["Primer_ID"].split("_LEFT")[0]
+        if primer.strand == "+":
+            amplicon = primer.primername.split("_LEFT")[0]
         else:
-            amplicon = primer["Primer_ID"].split("_RIGHT")[0]
+            amplicon = primer.primername.split("_RIGHT")[0]
         if amplicon not in amplicons:
             amplicons[amplicon] = 0
         amplicons[amplicon] += 1

diff --git a/artic/mask.py b/artic/mask.py
@@ -3,21 +3,26 @@
 from Bio import SeqIO
 from cyvcf2 import VCF
 import argparse
-import pandas as pd
+import csv
 
 
 def read_3col_bed(fn):
     # read the primer scheme into a pandas dataframe and run type, length and null checks
-    bedfile = pd.read_csv(
-        fn,
-        sep="\t",
-        header=None,
-        names=["chrom", "start", "end"],
-        dtype={"chrom": str, "start": int, "end": int},
-        usecols=(0, 1, 2),
-        skiprows=0,
-    )
-    return bedfile
+    with open(fn, "rt") as f:
+        reader = csv.DictReader(f, fieldnames=["chrom", "start", "end"], delimiter="\t")
+
+        bedlines = []
+        for row in reader:
+            try:
+                row["start"] = int(row["start"])
+                row["end"] = int(row["end"])
+            except ValueError:
+                raise ValueError(
+                    "The depth mask bedfile appears to be malformed, the start or end position cannot be converted to an integer"
+                )
+            bedlines.append(row)
+
+    return bedlines
 
 
 def go(args):
@@ -27,9 +32,9 @@ def go(args):
         cons[k] = list(seqs[k].seq)
 
     bedfile = read_3col_bed(args.maskfile)
-    for _, region in bedfile.iterrows():
-        for n in range(region["start"], region["end"]):
-            cons[region["chrom"]][n] = "N"
+    for bedline in bedfile:
+        for n in range(bedline["start"], bedline["end"]):
+            cons[bedline["chrom"]][n] = "N"
 
     vcf_reader = VCF(args.maskvcf)
     for record in vcf_reader:

diff --git a/artic/minion.py b/artic/minion.py
@@ -4,7 +4,8 @@
 import os
 import sys
 import time
-from artic.utils import read_bed_file, get_scheme, choose_model
+from artic.utils import get_scheme, choose_model
+from primalbedtools.scheme import Scheme
 
 
 def run(parser, args):
@@ -121,7 +122,8 @@ def run(parser, args):
         raise SystemExit(1)
 
     ## collect the primer pools
-    pools = set([row["PoolName"] for row in read_bed_file(bed)] + ["unmatched"])
+    scheme = Scheme.from_file(bed)
+    pools = set([row.pool for row in scheme.bedlines] + ["unmatched"])
 
     ## create a holder to keep the pipeline commands in
     cmds = []
@@ -141,26 +143,26 @@ def run(parser, args):
     normalise_string = f"--normalise {args.normalise}" if args.normalise else ""
 
     incorrect_pairs_string = (
-        "--remove-incorrect-pairs" if not args.allow_mismatched_primers else ""
+        "--allow-incorrect-pairs" if args.allow_mismatched_primers else ""
     )
 
     cmds.append(
-        f"align_trim {normalise_string} {bed} --primer-match-threshold {args.primer_match_threshold} {incorrect_pairs_string} --min-mapq {args.min_mapq} --report {args.sample}.alignreport.csv < {args.sample}.sorted.bam > {args.sample}.trimmed.rg.sam"
+        f"align_trim {normalise_string} {bed} --no-trim-primers --primer-match-threshold {args.primer_match_threshold} {incorrect_pairs_string} --min-mapq {args.min_mapq} --report {args.sample}.alignreport.tsv --samfile {args.sample}.sorted.bam -o {args.sample}.trimmed.rg.bam"
     )
 
     cmds.append(
-        f"samtools sort -T {args.sample} {args.sample}.trimmed.rg.sam -o {args.sample}.trimmed.rg.sorted.bam"
+        f"samtools sort -T {args.sample} {args.sample}.trimmed.rg.bam -o {args.sample}.trimmed.rg.sorted.bam"
     )
-    cmds.append(f"rm {args.sample}.trimmed.rg.sam")
+    cmds.append(f"rm {args.sample}.trimmed.rg.bam")
 
     cmds.append(
-        f"align_trim {normalise_string} {bed} --primer-match-threshold {args.primer_match_threshold} --min-mapq {args.min_mapq} {incorrect_pairs_string} --trim-primers --report {args.sample}.alignreport.csv --amp-depth-report {args.sample}.amplicon_depths.tsv < {args.sample}.sorted.bam > {args.sample}.primertrimmed.rg.sam"
+        f"align_trim {normalise_string} {bed} --primer-match-threshold {args.primer_match_threshold} --min-mapq {args.min_mapq} {incorrect_pairs_string} --report {args.sample}.alignreport.tsv --amp-depth-report {args.sample}.amplicon_depths.tsv --samfile {args.sample}.sorted.bam -o {args.sample}.primertrimmed.rg.bam"
     )
 
     cmds.append(
-        f"samtools sort -T {args.sample} {args.sample}.primertrimmed.rg.sam -o {args.sample}.primertrimmed.rg.sorted.bam"
+        f"samtools sort -T {args.sample} {args.sample}.primertrimmed.rg.bam -o {args.sample}.primertrimmed.rg.sorted.bam"
     )
-    cmds.append(f"rm {args.sample}.primertrimmed.rg.sam")
+    cmds.append(f"rm {args.sample}.primertrimmed.rg.bam")
 
     cmds.append(f"samtools index {args.sample}.trimmed.rg.sorted.bam")
     cmds.append(f"samtools index {args.sample}.primertrimmed.rg.sorted.bam")

diff --git a/artic/utils.py b/artic/utils.py
@@ -296,143 +296,6 @@ def identify_bed_file(bed_file):
         return version
 
 
-def read_bed_file(fn):
-    """Parses a V2/V3 bed file and collapses primers into canonical primer sites
-
-    Parameters
-    ----------
-    fn : str
-        The bedfile to parse
-
-    Returns
-    -------
-    list
-        A list of dictionaries, where each dictionary contains a row of the parsed bedfile.
-        The available dictionary keys are - Primer_ID, direction, start, end
-    """
-
-    # read the primer scheme into a pandas dataframe and run type, length and null checks
-    version = identify_bed_file(fn)
-
-    if version in (1, 2):
-        return read_bed_file_legacy(fn)
-
-    primers = pd.read_csv(
-        fn,
-        sep="\t",
-        comment="#",
-        header=None,
-        names=["chrom", "start", "end", "Primer_ID", "PoolName", "direction"],
-        dtype={
-            "chrom": str,
-            "start": int,
-            "end": int,
-            "Primer_ID": str,
-            "PoolName": str,
-        },
-        usecols=(0, 1, 2, 3, 4, 5),
-        skiprows=0,
-    )
-    if len(primers.index) < 1:
-        print("primer scheme file is empty", file=sys.stderr)
-        raise SystemExit(1)
-    if primers.isnull().sum().sum():
-        print("malformed primer scheme file", file=sys.stderr)
-        raise SystemExit(1)
-
-    canonical_primers = {}
-    for _, row in primers.iterrows():
-        scheme_name, primer_id, direction, primer_n = row["Primer_ID"].split("_")
-
-        if (row["chrom"], primer_id, direction) not in canonical_primers:
-            canonical_primers[(row["chrom"], primer_id, direction)] = row.to_dict()
-            continue
-
-        canonical_primers[(row["chrom"], primer_id, direction)] = merge_sites(
-            canonical_primers[(row["chrom"], primer_id, direction)], row
-        )
-
-    # return the bedFile as a list
-    return [value for value in canonical_primers.values()]
-
-
-def read_bed_file_legacy(fn):
-    """Parses a bed file and collapses alts into canonical primer sites
-
-    Parameters
-    ----------
-    fn : str
-        The bedfile to parse
-
-    Returns
-    -------
-    list
-        A list of dictionaries, where each dictionary contains a row of the parsed bedfile.
-        The available dictionary keys are - Primer_ID, direction, start, end
-    """
-
-    # read the primer scheme into a pandas dataframe and run type, length and null checks
-    primers = pd.read_csv(
-        fn,
-        sep="\t",
-        header=None,
-        names=["chrom", "start", "end", "Primer_ID", "PoolName", "direction"],
-        dtype={
-            "chrom": str,
-            "start": int,
-            "end": int,
-            "Primer_ID": str,
-            "PoolName": str,
-        },
-        usecols=(0, 1, 2, 3, 4, 5),
-        skiprows=0,
-    )
-    if len(primers.index) < 1:
-        print("primer scheme file is empty", file=sys.stderr)
-        raise SystemExit(1)
-    if primers.isnull().sum().sum():
-        print("malformed primer scheme file", file=sys.stderr)
-        raise SystemExit(1)
-
-    # separate alt primers into a new dataframe
-    altFilter = primers["Primer_ID"].str.contains("_alt")
-    alts = pd.DataFrame(
-        columns=("chrom", "start", "end", "Primer_ID", "PoolName", "direction")
-    )
-    alts = pd.concat([alts, primers[altFilter]])
-    primers = primers.drop(primers[altFilter].index.values)
-
-    # convert the primers dataframe to dictionary, indexed by Primer_ID
-    #  - verify_integrity is used to prevent duplicate Primer_IDs being processed
-    bedFile = primers.set_index(
-        "Primer_ID", drop=False, verify_integrity=True
-    ).T.to_dict()
-
-    # if there were no alts, return the bedfile as a list of dicts
-    if len(alts.index) == 0:
-        return list(bedFile.values())
-
-    # merge alts
-    for _, row in alts.iterrows():
-        primerID = row["Primer_ID"].split("_alt")[0]
-
-        # check the bedFile if another version of this primer exists
-        if primerID not in bedFile:
-
-            # add to the bed file and continue
-            bedFile[primerID] = row.to_dict()
-            continue
-
-        # otherwise, we've got a primer ID we've already seen so merge the alt
-        mergedSite = merge_sites(bedFile[primerID], row)
-
-        # update the bedFile
-        bedFile[primerID] = mergedSite
-
-    # return the bedFile as a list
-    return [value for value in bedFile.values()]
-
-
 def overlaps(coords, pos):
     for v in coords:
         if pos >= v["start"] and pos <= v["end"]:

diff --git a/artic/vcf_filter.py b/artic/vcf_filter.py
@@ -43,34 +43,6 @@ def check_filter(self, v):
         return True
 
 
-# class MedakaFilter:
-#     def __init__(self, no_frameshifts, min_depth):
-#         self.no_frameshifts = no_frameshifts
-#         self.min_depth = min_depth
-#         self.min_variant_quality = 20
-
-#     def check_filter(self, v, min_depth):
-#         try:
-#             # We don't really care about the depth here, just skip it if it isn't there
-#             depth = v.INFO["DP"]
-#         except KeyError:
-#             depth = v.format("DP")[0][0]
-
-#         if depth < min_depth:
-#             return False
-
-#         if self.no_frameshifts and not in_frame(v):
-#             return False
-
-#         if v.num_het:
-#             return False
-
-#         if v.QUAL < self.min_variant_quality:
-#             return False
-
-#         return True
-
-
 def go(args):
     vcf_reader = VCF(args.inputvcf)
     vcf_writer = Writer(args.output_pass_vcf, vcf_reader, "w")

diff --git a/artic/vcf_merge.py b/artic/vcf_merge.py
@@ -2,17 +2,23 @@
 import sys
 from operator import attrgetter
 from collections import defaultdict
-from artic.utils import read_bed_file
+
+from primalbedtools.scheme import Scheme
+from primalbedtools.bedfiles import merge_primers
 
 
 def vcf_merge(args):
-    bed = read_bed_file(args.bedfile)
+
+    scheme = Scheme.from_file(args.bedfile)
+
+    # Merge the primers
+    scheme.bedlines = merge_primers(scheme.bedlines)
 
     primer_map = defaultdict(dict)
 
-    for p in bed:
-        for n in range(p["start"], p["end"] + 1):
-            primer_map[p["PoolName"]][n] = p["Primer_ID"]
+    for p in scheme.bedlines:
+        for n in range(p.start, p.end + 1):
+            primer_map[p.pool][n] = p.primername
 
     template_vcf = None
 

diff --git a/environment.yml b/environment.yml
@@ -16,9 +16,11 @@ dependencies:
   - pandas
   - pip
   - pysam
-  - pytest 
+  - pytest
   - cyvcf2
   - requests
   - samtools
   - tqdm
-  - seqtk
+  - seqtk
+  - align_trim
+  - primalbedtools>=0.10.1
diff --git a/pyproject.toml b/pyproject.toml
@@ -29,7 +29,6 @@ content-type = "text/markdown"
 
 [project.scripts]
 artic = "artic.pipeline:main"
-align_trim = "artic.align_trim:main"
 margin_cons = "artic.margin_cons:main"
 vcfextract = "artic.vcfextract:main"
 artic_vcf_merge = "artic.vcf_merge:main"