-Fixes #17

kwade4 · kwade4 · commit 8c805e1d27b0 · 2019-07-31T17:11:04.000-04:00
-Implemented 'encode' (#22) -Added unit test -Fixes #19
diff --git a/poplars/riplike.py b/poplars/riplike.py
@@ -1,64 +1,49 @@
 # TODO: consistent reference coordinates across outputs
 
+import sys
+import subprocess
 import random
 import argparse
+import os
 
 from poplars.common import convert_fasta
 from poplars.mafft import align
-import numpy as np
 
-# subset of HIV-1 group M subtype references curated by LANL
-with open('../poplars/ref_genomes/HIV1_Mgroup.fasta') as handle:
-    reference = convert_fasta(handle)
 
-
-def pdistance(seq1, seq2):
+def hamming(fasta):
     """
-    Calculate p-distance between two aligned sequences
-    :param seq1: First sequence
-    :param seq2: Second sequence
-    :return: <ndiff> is the number of differences, <denom> is the number of valid positions
+    Convert list of lists into per-site outcomes (difference between query and reference)
+    :param fasta: list of [header, sequence] pairs; must contain 'query' and 'CON_OF_CONS'
+    :return: dictionary keyed by reference label of lists of 0 (match), 1 (mismatch) or None (gap)
     """
-    denom = 0.  # number of valid columns
-    ndiff = 0
-
-    seqs = np.char.asarray([seq1, seq2])       # Convert sequences to numpy arrays
-
-    # Stack 2-D numpy arrays and find columns not containing '-'
-    # Gives an array containing True and False
-    denoms = np.where(np.all(np.isin(seqs, '-', invert=True), 0))[0]
-    denom = denoms.shape[0]
-
-    # From the valid positions, find where the sequences contain different nucleotides
-    ndiff = np.sum(seqs[0, :][denoms] != seqs[1, :][denoms])
+    aln = dict(fasta)
+    assert "query" in aln, "Argument <fasta> must contain 'query' entry"
+    query = aln.pop('query')
+    _ = aln.pop('CON_OF_CONS')
+
+    # iterate over remaining sequences as references
+    results = {}
+    for h, s in aln.items():
+        result = []
+        for i, nt1 in enumerate(query):
+            nt2 = s[i]
+            if nt1 == '-' or nt2 == '-':
+                result.append(None)
+                continue
+            result.append(int(nt1 != nt2))
+        results.update({h: result})
 
-    return ndiff, denom
+    return results
 
 
-def bootstrap(s1, s2, reps=100):
+def update_alignment(seq, reference):
     """
-    Sample positions at random with replacement.
-    :param s1:  first sequence
-    :param s2:  second sequence (must be aligned to s1)
-    :param reps:  number of replicates to generate
-
-    :yield: tuples of sequences generated by bootstrap resampling
+    Append query sequence <seq> to reference alignment and remove insertions relative to
+    global consensus sequence.
+    :param seq: the query sequence
+    :param reference: the reference alignment to which the query is appended
+    :return: a list of [header, sequence] lists
     """
-    seqlen = len(s1)
-    assert len(s2) == seqlen, "s1 and s2 must be of same length in bootstrap()"
-
-    # Convert sequences to numpy arrays
-    s1_np = np.char.asarray(list(s1))
-    s2_np = np.char.asarray(list(s2))
-
-    for rep in range(reps):
-        bootstrap = np.random.randint(0, seqlen, seqlen)
-        b1 = s1_np[bootstrap]
-        b2 = s2_np[bootstrap]
-        yield b1, b2
-
-
-def update_alignment(seq):
     # append query sequence to reference alignment
     fasta = align(seq, reference)
 
@@ -78,38 +63,53 @@ def update_alignment(seq):
     return fasta2
 
 
-def riplike(seq, outfile, window=400, step=5, nrep=100):
+def encode(sequence):
+    """
+    Encodes each nucleotide in a sequence as a 4-bit integer code
+    :param sequence: the sequence
+    :return: list of integers, one 4-bit code per character of the sequence
+    """
+    seq = []
+    binary_nt = {'A': 0B0001, 'T': 0B0010, 'C': 0B0011, 'G': 0B0100, ' ': 0B0000, '-': 0B1111}
+    for nt in sequence:
+        seq.append(binary_nt[nt])
+    return seq
+
+
+def riplike(seq, reference, window=400, step=5, nrep=100):
     """
     :param seq:  query sequence
-    :param outfile:  open file stream in write mode for results
+    :param reference: the alignment background
     :param window:  width of sliding window in nucleotides
     :param step:  step size of sliding window in nucleotides
     :param nrep:  number of replicates for nonparametric bootstrap sampling
+    :return: list of result dictionaries in order of window position
     """
+
     results = []
 
-    fasta = update_alignment(seq)
+    fasta = update_alignment(seq, reference)
     query = dict(fasta)['query']  # aligned query
     seqlen = len(query)
+    ham = hamming(fasta)
 
-    for center in range(window//2, seqlen - (window//2), step):
+    for centre in range(window // 2, seqlen - (window // 2), step):
         best_p, second_p = 1., 1.  # maximum p-distance
         best_ref, second_ref = None, None
-        best_seq = ''
-
-        # cut slice from query sequence for this window
-        q1 = query[center-(window//2):center + (window//2)]
+        best_seq = []
 
         # iterate over reference genomes
-        for h, s in fasta:
+        for h, s in ham.items():
             if h == 'query' or h == 'CON_OF_CONS':
                 continue
 
-                # slice window segment from reference
-            s1 = s[center-(window//2):center + (window//2)]
+            # slice window segment from reference
+            s1 = s[centre - (window // 2): centre + (window // 2)]
+            s2 = [x for x in s1 if x is not None]
 
             # calculate p-distance
-            ndiff, denom = pdistance(list(s1), list(q1))
+            ndiff = sum(s2)
+            denom = len(s2)
             if denom == 0:
                 # no overlap!  TODO: require minimum overlap?
                 continue
@@ -121,31 +121,25 @@ def riplike(seq, outfile, window=400, step=5, nrep=100):
                 second_ref = best_ref
                 best_p = pd
                 best_ref = h
-                best_seq = s1
+                best_seq = s2
             elif pd < second_p:
                 # replace second best
                 second_p = pd
                 second_ref = h
 
-        if best_ref is None:
-            outfile.write('{},{},None,,None,,\n'.format(h, center))
-            continue
-
-        result = {'center': center, 'best_ref': best_ref, 'best_p': best_p,
+        result = {'centre': centre, 'best_ref': best_ref, 'best_p': best_p,
                   'second_ref': second_ref, 'second_p': None if second_ref is None else second_p}
 
         quant = None
-        if second_ref is not None:
+        if second_ref is not None and nrep > 0:
             # use nonparametric bootstrap to determine significance
-            boot_dist = []
-            for bs, bq in bootstrap(best_seq, q1, reps=nrep):
-                ndiff, denom = pdistance(bs, bq)
-                if denom > 0:
-                    boot_dist.append(ndiff / denom)
-
-            # how many are closer than second best?
-            quant = list(map(lambda x: x < second_p, boot_dist))
-            quant = sum(quant) / float(len(quant))
+            count = 0.
+            n = len(best_seq)
+            for rep in range(nrep):
+                boot = [best_seq[round(random.random() * (n - 1))] for _ in range(n)]
+                if sum(boot) / len(boot) < second_p:
+                    count += 1
+            quant = count / nrep
 
         result.update({'quant': quant})
         results.append(result)
@@ -168,18 +162,31 @@ def main():
                         help='<optional, int> Window step size.')
     parser.add_argument('-nrep', type=int, default=100,
                         help='<optional, int> Number of bootstrap replicates.')
+    parser.add_argument('-custombg', type=argparse.FileType('r'),
+                        help='<optional> FASTA file to be used as the alignment background')
 
     args = parser.parse_args()
+
+    if args.custombg:
+        ref_seq = args.custombg
+    else:
+        # subset of HIV-1 group M subtype references curated by LANL
+        seq_path = os.path.dirname(os.path.abspath(__file__))
+        ref_seq = os.path.join(seq_path, 'ref_genomes/HIV1_Mgroup.fasta')
+
+    with open(ref_seq) as handle:
+        reference = convert_fasta(handle)
+
     args.outfile.write('qname,pos,rname,pdist,rname2,pdist2,qboot\n')
 
     fasta = convert_fasta(args.infile)
     for h, s in fasta:
         print(h)  # crude progress monitoring
-        results = riplike(s, args.outfile, window=args.window, step=args.step, nrep=args.nrep)
+        results = riplike(s, reference, window=args.window, step=args.step, nrep=args.nrep)
         for result in results:
             args.outfile.write(
-                '{},{center},{best_ref},{best_p},{second_ref},{second_p},{quant}\n'
-                    .format(h, **result)
+                '{},{centre},{best_ref},{best_p},{second_ref},{second_p},{quant}\n'
+                .format(h, **result)
             )
 
     args.outfile.close()
diff --git a/poplars/sequence_locator.py b/poplars/sequence_locator.py
@@ -604,19 +604,6 @@ def retrieve(virus, base, ref_regions, region, outfile=None, start_offset=1, end
         else:
             end = length + 1
 
-        # # Handles global and local start coordinates
-        # if start_offset <= region_start:
-        #     start = region_start
-        # else:
-        #     start = region_start + (start_offset - region_start)
-        #
-        # # Handles global and local end coordinates
-        # if end_offset == 'end' or end_offset > region_end:
-        #     end = region_end
-        # else:
-        #     end = region_end + (region_end - end_offset)
-
-        # Create a GenomeRegion object for the query
         if ref_region.region_name == region:
             query_region = GenomeRegion(region)
             query_region.set_coords([start, end], base)      # Set global coordinates
diff --git a/poplars/tests/test_riplike.py b/poplars/tests/test_riplike.py
@@ -2,59 +2,55 @@
 import os
 from poplars.riplike import *
 
-
 TEST_HIV_GENOME = os.path.join(os.path.dirname(__file__), '../ref_genomes/K03455.fasta')
+TEST_REFERENCE = os.path.join(os.path.dirname(__file__), '../ref_genomes/HIV1_Mgroup.fasta')
+
 
 class testRiplike(unittest.TestCase):
 
     def setUp(self):
-        with open(TEST_HIV_GENOME) as handle:
+        with open(TEST_HIV_GENOME) as handle, open(TEST_REFERENCE) as ref_handle:
             self.hiv_genome = handle.readlines()[1]
-        
-    def testPdistSimple(self):
-        result = pdistance('ACGT', 'ACGC')
-        expected = (1, 4)
-        self.assertEqual(expected, result)
-        
-        result = pdistance('ACGT', 'TGCA')
-        expected = (4, 4)
-        self.assertEqual(expected, result)
-    
-    def testPdistGapped(self):
-        result = pdistance('ACGT', '---T')
-        expected = (0, 1)
-        self.assertEqual(expected, result)
-        
-        result = pdistance('ACGT', 'G---')
-        expected = (1, 1)
+            self.reference = convert_fasta(ref_handle)
+
+    def testHamming(self):
+        test = [['query', 'ACGT'], ['CON_OF_CONS', 'ACGT'], ['A', 'ACGG']]
+        result = hamming(test)
+        expected = {'A': [0, 0, 0, 1]}
         self.assertEqual(expected, result)
-    
-    def testBootstrap(self):
-        pass  #FIXME: not sure how to test this function just yet
-        
+
     def testUpdateAlignment(self):
-        aln = update_alignment(self.hiv_genome)
+        aln = update_alignment(self.hiv_genome, self.reference)
         aln = dict(aln)
-        
+
         # alignment should have original number plus one
         result = len(aln)
-        expected = 11
-        self.assertEqual(expected, result) 
-        
+        expected = 13 + 1
+        self.assertEqual(expected, result)
+
         # alignment should contain entry for query sequence
         result = 'query' in aln
         self.assertTrue(result)
-        
+
         # aligned query should have same number of nucleotides
         result = [aln['query'].count(nt) for nt in 'ACGT']
         expected = [self.hiv_genome.upper().count(nt) for nt in 'ACGT']
         self.assertEqual(expected, result)
-        
+
         # all sequences should have the same length
         result = len(set([len(s) for h, s in aln.items()]))
         expected = 1
         self.assertEqual(expected, result)
-        
+
+    def testEncode(self):
+        s = 'ATGCGC--  T'
+        expected = [0B0001, 0B0010, 0B0100, 0B0011, 0B0100, 0B0011, 0B1111, 0B1111, 0B0000, 0B0000, 0B0010]
+        result = encode(s)
+        self.assertEqual(expected, result)
+
+    def testRiplike(self):
+        result = riplike(self.hiv_genome, self.reference)
+
 
 
 
diff --git a/poplars/tests/test_sequence_locator.py b/poplars/tests/test_sequence_locator.py
@@ -1139,3 +1139,7 @@ def tearDown(self):
         self.siv_default_prot.close()
         self.siv_test_genome.close()
         self.siv_test_prot.close()
+
+
+if __name__ == '__main__':
+    unittest.main()