-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexercise10.py
67 lines (60 loc) · 2.01 KB
/
exercise10.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
'''Exercise x in Coursera Bioinformatics algorithms(Part1)'''
def hamming(seq1,seq2,dist):
'''Calculates hamming distance between two equally sized sequences'''
hamming=0
for i in range(len(seq1)):
if seq1[i]!=seq2[i]:
hamming+=1
if hamming>dist:return False
return True
def k_mers(seq,pattern):
'''Creates a list of all kmers from sequence with the size of pattern'''
kmers=[]
for i in range(len(seq)-len(pattern)+1):
kmers.append(seq[i:i+len(pattern)])
return kmers
def count(seq,pattern,dist):
'''Calculates the number o hits of a pattern in the sequence allowing less
than dist mismatches'''
kmers=k_mers(seq,pattern)
counter=0
for item in kmers:
if hamming(item,pattern,dist):counter+=1
return counter
def search(seq,dist,size):
dna=['ATCG']
positive_patterns=dict()
from itertools import product
kmers=k_mers(seq,'a'*size)
i=0
for item in product(*(dna*size)):
item="".join(item)
positive_patterns[item]=0
if i%10000==0:
print(i)
item=''.join(item)
for kmer in kmers:
if hamming(item,kmer,dist):
positive_patterns[item]=positive_patterns.get(item,0)+1
if hamming(RevComp(item),kmer,dist):
positive_patterns[item]=positive_patterns.get(item,0)+1
i+=1
return positive_patterns
def RevComp(seq):
"""Takes a seq and return the reverse completment"""
return seq[::-1].translate(str.maketrans('ATCG','TAGC'))
def most_common(seq,dist,size):
maximized=[]
positive_patterns=search(seq,dist,size)
maximum=max(positive_patterns.values())
for key,value in positive_patterns.items():
if value==maximum:
print("".join(key),end=' ')
seq='CTCATCCTCGGATCGTTAAAATCATCCTCAAAGTTCTCAAAAAAATCAAAATCGGGTTGTTATCAAAGGAAAAAAGGCTCGGCTCATCAAAGTTGTTATCAAAGGCTCAAACTCATCGTTGGGTTGTTGGGTTAAAAAAGGGTTATCATCGTTATCGTTGGATCGTTCTCATCAAAGGGGCTCCTCGGGGGTTGTTAAAAAAAAAGGATCCTCCTCCTCGGCTCAAAATCCTCGTT'
dist=3
size=8
from time import clock
def timing(seq,dist,size):
start=clock()
most_common(seq=seq,dist=dist,size=size)
print((clock()-start)/60)