dpellow
diff --git a/Diff for: ‎classify_fasta.py
+3-3 b/Diff for: ‎classify_fasta.py
+3-3
diff --git a/Diff for: ‎plasclass/__init__.py
+1 b/Diff for: ‎plasclass/__init__.py
+1
diff --git a/Diff for: ‎plasclass/data/m1000
86.3 KB b/Diff for: ‎plasclass/data/m1000
86.3 KB
diff --git a/Diff for: ‎plasclass/data/m10000
86.3 KB b/Diff for: ‎plasclass/data/m10000
86.3 KB
diff --git a/Diff for: ‎plasclass/data/m100000
86.3 KB b/Diff for: ‎plasclass/data/m100000
86.3 KB
diff --git a/Diff for: ‎plasclass/data/m500000
86.3 KB b/Diff for: ‎plasclass/data/m500000
86.3 KB
diff --git a/Diff for: ‎plasclass/data/s1000
257 KB b/Diff for: ‎plasclass/data/s1000
257 KB
diff --git a/Diff for: ‎plasclass/data/s10000
257 KB b/Diff for: ‎plasclass/data/s10000
257 KB
diff --git a/Diff for: ‎plasclass/data/s100000
257 KB b/Diff for: ‎plasclass/data/s100000
257 KB
diff --git a/Diff for: ‎plasclass/data/s500000
257 KB b/Diff for: ‎plasclass/data/s500000
257 KB
diff --git a/Diff for: ‎plasclass/plasclass.py
+130 b/Diff for: ‎plasclass/plasclass.py
+130
diff --git a/Diff for: ‎plasclass/plasclass_utils.py
+132 b/Diff for: ‎plasclass/plasclass_utils.py
+132
diff --git a/Diff for: ‎setup.py
+3-3 b/Diff for: ‎setup.py
+3-3
@@ -2,8 +2,8 @@
 # Provide a command line script to classify sequences in a fasta file
 ###
 
-from classification import classifier_utils as utils
-from classification import classifier
+from plasclass import plasclass_utils as utils
+from plasclass import plasclass
 
 import argparse
 
@@ -38,7 +38,7 @@ def main(args):
     else: outfile = infile + '.probs.out'
     n_procs = args.num_processes
 
-    c = classifier.classifier(n_procs)
+    c = plasclass.plasclass(n_procs)
     seq_names = []
     seqs = []
     print "Reading {} in batches of 100k sequences".format(infile)
 
@@ -0,0 +1 @@
+# Package-level name string for the plasclass package
+name = "plasclass"
@@ -0,0 +1,130 @@
+###
+# Define the classifier class and provide a set of functions to enable classification
+###
+
+import numpy as np
+import os
+from sklearn.linear_model import LogisticRegression
+from sklearn.preprocessing import StandardScaler
+import itertools
+from joblib import load
+
+import multiprocessing as mp
+from multiprocessing import Manager
+
+import plasclass_utils as utils
+
+class plasclass():
+    ''' Multi-scale plasmid classifier
+
+    Loads one classifier ('clf') and one scaler ('scaler') per sequence
+    length scale from the package's data directory, and classifies sequences
+    from their canonical k-mer frequency vectors.
+
+    Progress messages are written with print(...) and a single argument,
+    which prints the same text whether print is a statement or a function.
+    '''
+
+    # Number of sequences processed per batch when classifying a list
+    _BATCH_SIZE = 100000
+
+    def __init__(self,n_procs = 1):
+        ''' n_procs: number of worker processes used to count k-mers when
+        a list of sequences is classified
+        '''
+        self._scales = [1000,10000,100000,500000]
+        self._ks = [3,4,5,6,7]
+        self._n_procs = n_procs
+        self._compute_kmer_inds()
+        self._load_classifiers()
+
+
+    def classify(self,seq):
+        '''Classify the sequence(s), return the probability of the sequence(s) being a plasmid.
+        Assumes seq is either an individual string or a list of strings
+        Returns either an individual plasmid probability for seq or a numpy
+        array of plasmid probabilities, one per sequence in seq and in the
+        same order
+        Raises TypeError if seq is neither a string nor a list
+        '''
+        try:
+            string_types = basestring # Python 2: covers both str and unicode
+        except NameError:
+            string_types = str
+
+        if isinstance(seq, string_types): # single sequence
+            print("Counting k-mers for sequence of length {}".format(len(seq)))
+            kmer_freqs = [0] # count_kmers stores its result in slot 0
+            scale = self._get_scale(len(seq))
+            utils.count_kmers([0, seq, self._ks, self._kmer_inds, self._kmer_count_lens, kmer_freqs])
+            kmer_freqs = np.array(kmer_freqs)
+            standardized_freqs = self._standardize(kmer_freqs, scale)
+            print("Classifying")
+            return self.classifiers[scale]['clf'].predict_proba(standardized_freqs)[0,1]
+
+        elif isinstance(seq, list): # list of sequences
+            print("{} sequences to classify. Classifying in batches of 100k".format(len(seq)))
+            results = []
+            if not seq:
+                # nothing to do: avoid spawning worker processes at all
+                return np.array(results)
+
+            pool = mp.Pool(self._n_procs)
+            try:
+                for seq_ind in range(0, len(seq), self._BATCH_SIZE):
+                    print("Starting new batch")
+                    seq_batch = seq[seq_ind:seq_ind + self._BATCH_SIZE]
+                    results.extend(self._classify_batch(seq_batch, pool))
+            finally:
+                # Always release the worker processes, even if a batch failed;
+                # otherwise every call leaves n_procs idle workers behind
+                pool.close()
+                pool.join()
+            return np.array(results)
+
+        else:
+            raise TypeError('Can only classify strings or lists of strings')
+
+
+    def _classify_batch(self, seq_batch, pool):
+        ''' Classify one batch of sequences using the worker pool
+        Returns a list of plasmid probabilities in the order of seq_batch
+        '''
+        print("Partitioning by length")
+        scales = [self._get_scale(len(s)) for s in seq_batch]
+        # single pass over the batch; order within each partition is preserved
+        scale_partitions = {s: [] for s in self._scales}
+        for s, scale in zip(seq_batch, scales):
+            scale_partitions[scale].append(s)
+
+        partitioned_classifications = {}
+        for scale in self._scales:
+            part_seqs = scale_partitions[scale]
+            if not part_seqs: continue
+            print("Getting kmer frequencies for partition length {}".format(scale))
+            # one slot per sequence, created in a single call to the manager
+            shared_list = Manager().list([0] * len(part_seqs))
+            pool.map(utils.count_kmers, [[ind, s, self._ks, self._kmer_inds, self._kmer_count_lens, shared_list] for ind,s in enumerate(part_seqs)])
+            kmer_freqs_mat = np.array(shared_list)
+            standardized_freqs = self._standardize(kmer_freqs_mat, scale)
+            print("Classifying sequences of length scale {}".format(scale))
+            partitioned_classifications[scale] = self.classifiers[scale]['clf'].predict_proba(standardized_freqs)[:,1]
+
+        # recollate the results into the original order of the batch:
+        scale_inds = {s:0 for s in self._scales}
+        batch_results = []
+        for s in scales:
+            batch_results.append(partitioned_classifications[s][scale_inds[s]])
+            scale_inds[s] += 1
+        return batch_results
+
+
+    def _load_classifiers(self):
+        ''' Load the multi-scale classifiers and scalers
+        For each scale, files 'm<scale>' (classifier) and 's<scale>' (scaler)
+        are loaded from the 'data' directory next to this module
+        '''
+        curr_path = os.path.dirname(os.path.abspath(__file__))
+        data_path = os.path.join(curr_path,'data')
+        self.classifiers = {}
+        for i in self._scales:
+            print("Loading classifier " + str(i))
+            self.classifiers[i] = {'clf': load(os.path.join(data_path,'m'+str(i))), 'scaler': load(os.path.join(data_path,'s'+str(i)))}
+
+
+    def _get_scale(self, length):
+        ''' Choose which length scale to use for the sequence
+        A length is assigned to the smaller of two consecutive scales when it
+        does not exceed their midpoint; lengths beyond the last midpoint use
+        the largest scale
+        '''
+        if length <= self._scales[0]: return self._scales[0]
+        for i,l in enumerate(self._scales[:-1]):
+            if length <= float(l + self._scales[i+1])/2.0:
+                return l
+        return self._scales[-1]
+
+    def _standardize(self, freqs, scale):
+        ''' Use sklearn's standard scaler to standardize
+        Choose the appropriate scaler based on sequence length
+        '''
+        return self.classifiers[scale]['scaler'].transform(freqs)
+
+    def _compute_kmer_inds(self):
+        ''' Compute the indices of each canonical kmer in the kmer count vectors
+        A k-mer and its reverse complement share one index, so
+        self._kmer_count_lens[k] is the number of distinct canonical k-mers
+        '''
+
+        self._kmer_inds = {k: {} for k in self._ks}
+        self._kmer_count_lens = {k: 0 for k in self._ks}
+
+        alphabet = 'ACGT'
+        for k in self._ks:
+            all_kmers = [''.join(kmer) for kmer in itertools.product(alphabet,repeat=k)]
+            all_kmers.sort()
+            ind = 0
+            for kmer in all_kmers:
+                bit_mer = utils.mer2bits(kmer)
+                rc_bit_mer = utils.mer2bits(utils.get_rc(kmer))
+                if rc_bit_mer in self._kmer_inds[k]:
+                    # reverse complement already seen: reuse its index
+                    self._kmer_inds[k][bit_mer] = self._kmer_inds[k][rc_bit_mer]
+                else:
+                    self._kmer_inds[k][bit_mer] = ind
+                    self._kmer_count_lens[k] += 1
+                    ind += 1
@@ -0,0 +1,132 @@
+# Utility functions for the classifier module
+
+# Complement of each nucleotide; used by get_rc to build reverse complements
+complements = {'A':'T', 'C':'G', 'G':'C', 'T':'A'}
+# 2-bit code of each nucleotide; used to pack k-mers into integers
+nt_bits = {'A':0,'C':1,'G':2,'T':3}
+
+import numpy as np
+
+
+def readfq(fp): # this is a generator function
+    ''' Adapted from https://github.com/lh3/readfq
+
+    Iterate over the records of a fasta/fastq file.
+    fp: an iterable of text lines (e.g. an open file)
+    Yields (name, sequence, quality) tuples; name is the header up to the
+    first space, and quality is None for fasta records.
+
+    Only line terminators are stripped from each line, so the last line is
+    not truncated when the file does not end with a newline, and '\\r' from
+    Windows line endings does not end up in the sequences.
+    '''
+    last = None # this is a buffer keeping the last unprocessed line
+    while True: # mimic closure; is it a bad idea?
+        if not last: # the first record or a record following a fastq
+            for l in fp: # search for the start of the next record
+                if l[0] in '>@': # fasta/q header line
+                    last = l.rstrip('\r\n') # save this line
+                    break
+        if not last: break
+        name, seqs, last = last[1:].partition(" ")[0], [], None
+        for l in fp: # read the sequence
+            if l[0] in '@+>':
+                last = l.rstrip('\r\n')
+                break
+            seqs.append(l.rstrip('\r\n'))
+        if not last or last[0] != '+': # this is a fasta record
+            yield name, ''.join(seqs), None # yield a fasta record
+            if not last: break
+        else: # this is a fastq record
+            seq, leng, seqs = ''.join(seqs), 0, []
+            for l in fp: # read the quality
+                qual = l.rstrip('\r\n')
+                seqs.append(qual)
+                leng += len(qual)
+                if leng >= len(seq): # have read enough quality
+                    last = None
+                    yield name, seq, ''.join(seqs) # yield a fastq record
+                    break
+            if last: # reach EOF before reading enough quality
+                yield name, seq, None # yield a fasta record instead
+                break
+
+def get_rc(seq):
+    ''' Build the reverse complement of seq
+    Letters without an entry in complements are kept unchanged
+    '''
+    return "".join(complements.get(base, base) for base in reversed(seq))
+
+
+def mer2bits(kmer):
+    ''' Pack a k-mer into an integer, two bits per letter
+    The first letter ends up in the most significant bits
+    Raises IndexError for an empty k-mer and KeyError for a letter that
+    is not in nt_bits
+    '''
+    encoded = nt_bits[kmer[0]]
+    for base in kmer[1:]:
+        # codes are in 0..3, so this appends two bits to the value
+        encoded = encoded * 4 + nt_bits[base]
+    return encoded
+
+
+def count_kmers(args_array):
+    ''' Count the k-mers in the sequence and store their frequencies
+
+    All arguments are packed into one list so that the function takes a
+    single argument:
+        ret_ind: index of shared_list at which the result is stored
+        seq: the sequence in which k-mers are counted
+        ks: the k-mer lengths to count. Assumes ks is sorted
+        kmer_inds: for each k, maps the bit representation of a k-mer to
+            its index in the count vector of that k
+        vec_lens: for each k, the length of the count vector
+        shared_list: list-like object that receives the result
+
+    Nothing is returned. shared_list[ret_ind] is set to the concatenation
+    of the per-k frequency vectors (in the order of ks); the counts of each
+    k are divided by their sum, and stay all zero if nothing was counted.
+    Windows containing letters outside of nt_bits are skipped.
+
+    NOTE(review): for sequences shorter than the largest k, slices shorter
+    than k are still looked up in kmer_inds as if they were k-mers - confirm
+    that this is intended before changing it.
+    '''
+    ret_ind, seq, ks, kmer_inds, vec_lens, shared_list = args_array
+
+    # Errors that mean "this window cannot be counted": a letter outside of
+    # the alphabet (KeyError) or a read past the end of seq (IndexError).
+    # Anything else (e.g. KeyboardInterrupt) must propagate.
+    skip_errors = (KeyError, IndexError)
+
+    kmer_counts = {k:np.zeros(vec_lens[k]) for k in ks}
+
+    k_masks = [2**(2*k)-1 for k in ks]
+    ind=0
+    bit_mers = [0 for k in ks]
+
+    # get the first set of kmers
+    # Stop at the end of the sequence: past it every slice is empty, so no
+    # window can succeed and searching further would never terminate
+    # (e.g. for an empty sequence or one without any legal letter)
+    while ind < len(seq):
+        found = True
+        for i,k in enumerate(ks):
+            try:
+                bit_mers[i] = mer2bits(seq[ind:ind+k])
+                kmer_counts[k][kmer_inds[k][bit_mers[i]]] += 1.
+            except skip_errors:
+                ind += 1
+                found = False
+                break
+        if found == True:
+            break
+
+    # count all other kmers
+    while ind<len(seq)-ks[-1]: # iterate through sequence until last k-mer for largest k
+        for i,k in enumerate(ks):
+            try:
+                c = nt_bits[seq[ind+k]]
+                # roll the window: shift in the new letter, mask off the oldest
+                bit_mers[i] = ((bit_mers[i]<<2)|c)&k_masks[i]
+                kmer_counts[k][kmer_inds[k][bit_mers[i]]] += 1.
+            except skip_errors: # out of alphabet
+                ind += 2 # pass it and move on to the next
+                # get the next set of legal kmers
+                while ind<=len(seq)-ks[-1]:
+                    found = True
+                    for i2,k2 in enumerate(ks):
+                        try:
+                            bit_mers[i2] = mer2bits(seq[ind:ind+k2])
+                            kmer_counts[k2][kmer_inds[k2][bit_mers[i2]]] += 1.
+                        except skip_errors:
+                            ind += 1
+                            found = False
+                            break
+                    if found == True:
+                        ind -= 1 # in next step increment ind
+                        break
+        ind += 1 # move on to next letter in sequence
+
+    # count the last few kmers
+    # (positions too close to the end for the largest k; one fewer k fits
+    # at each successive position)
+    end = len(ks)-1
+    for i in range(len(seq)-ks[-1]+1,len(seq)-ks[0]+1):
+        for k in ks[:end]:
+            kmer = seq[i:i+k]
+            try:
+                kmer_counts[k][kmer_inds[k][mer2bits(kmer)]] += 1.
+            except skip_errors:
+                pass # window is not countable: skip it
+        end -= 1
+
+    #normalise counts
+    kmer_freqs = np.zeros(sum([vec_lens[k] for k in ks]))
+    ind = 0
+    for k in ks:
+        counts_sum = np.sum(kmer_counts[k])
+        if counts_sum != 0:
+            kmer_counts[k] = kmer_counts[k]/float(counts_sum)
+        kmer_freqs[ind:ind+vec_lens[k]] = kmer_counts[k]
+        ind += vec_lens[k]
+
+    shared_list[ret_ind] = kmer_freqs
@@ -4,15 +4,15 @@
     long_description = fh.read()
 
 setuptools.setup(
-    name="classification-dpellow",
+    name="plasclass-dpellow",
     version="0.1",
     author="David Pellow",
     author_email="[email protected]",
     description="Classification of plasmid sequences",
     long_description=long_description,
     long_description_content_type="text/markdown",
-    url="https://github.com/dpellow/classification",
-    packages=['classification'],
+    url="https://github.com/dpellow/plasclass",
+    packages=['plasclass'],
     classifiers=[
         "Programming Language :: Python :: 2.7",
         "License :: OSI Approved :: MIT License",