-
Notifications
You must be signed in to change notification settings - Fork 28
/
Copy pathBasics_dna_data_explore.py
78 lines (64 loc) · 2.7 KB
/
Basics_dna_data_explore.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# Requires Biopython: pip install biopython
# Data: https://www.kaggle.com/thomasnelson/examplefasta
from Bio import SeqIO
# Walk through every record of the FASTA file and print its identifier,
# its sequence and its length, each on its own line.
for sequence in SeqIO.parse('example.fa', "fasta"):
    print(sequence.id, sequence.seq, len(sequence), sep="\n")
# Ordinal encoding DNA sequence data
# function to convert a DNA sequence string to a numpy array
# converts to lower case, changes any non 'acgt' characters to 'z'
import numpy as np
import re
def string_to_array(seq_string):
    """Convert a DNA sequence string into a numpy array of single characters.

    The sequence is lower-cased first, and every character other than
    'a', 'c', 'g' or 't' is replaced by the placeholder 'z'.
    """
    normalized = re.sub('[^acgt]', 'z', seq_string.lower())
    return np.array(list(normalized))
# create a label encoder with 'acgtz' alphabet ('z' stands for any non-acgt base)
from sklearn.preprocessing import LabelEncoder
# Fit once on the fixed five-letter alphabet so every sequence is mapped to
# the same integer codes; the encoders below rely on a=0, c=1, g=2, t=3, z=4.
label_encoder = LabelEncoder().fit(np.array(['a', 'c', 'g', 't', 'z']))
# function to encode a DNA sequence string as an ordinal vector
# returns a numpy vector with a=0.25, c=0.50, g=0.75, t=1.00, z (any other base)=0.00
def ordinal_encoder(my_array):
    """Encode an array of DNA characters as an ordinal float vector.

    The module-level ``label_encoder`` turns each character into an integer
    code, and each code is then replaced by a float value:
    0 -> 0.25 (A), 1 -> 0.50 (C), 2 -> 0.75 (G), 3 -> 1.00 (T) and
    4 -> 0.00 (anything else, i.e. 'z').
    """
    codes = label_encoder.transform(my_array).astype(float)
    # Position in the tuple is the integer code; the entry is its float value.
    code_values = (0.25, 0.50, 0.75, 1.00, 0.00)  # A, C, G, T, z
    encoded = codes.copy()
    for code, value in enumerate(code_values):
        # Masks come from the untouched codes, so a value written for one
        # code can never be picked up again by a later code.
        encoded[codes == code] = value
    return encoded
# Demo: ordinal-encode a short test sequence. The result is neither stored
# nor printed, so it is only visible in a notebook/REPL session.
seq_test = 'TTCAGCCAGTG'
ordinal_encoder(string_to_array(seq_test))
# One-hot encoding DNA sequence data
# function to one-hot encode a DNA sequence string
# non 'acgt' bases (encoded as 'z') are 0000
# intended to return an L x 4 numpy array (columns a, c, g, t)
from sklearn.preprocessing import OneHotEncoder
def one_hot_encoder(seq_string):
    """One-hot encode an array of DNA characters.

    Parameters
    ----------
    seq_string : array-like of str
        Characters already normalised to the alphabet known to the
        module-level ``label_encoder`` (e.g. the output of
        ``string_to_array``).

    Returns
    -------
    numpy.ndarray
        Integer array of shape (L, K - 1), where K is the number of classes
        of ``label_encoder``. With the 'acgtz' alphabet this is L x 4 with
        columns a, c, g, t; the placeholder 'z' becomes an all-zero row.
    """
    # Cast to int so the codes are valid indices even if the encoder hands
    # back a non-integer array (as it may for empty input).
    int_encoded = np.asarray(label_encoder.transform(seq_string), dtype=int)
    # Use one column per class of the fixed alphabet. Fitting a one-hot
    # encoder on the sequence itself only creates columns for the bases that
    # actually occur, so removing the "last" column would drop a real base
    # (e.g. 't') whenever the sequence contains no placeholder character.
    n_classes = len(label_encoder.classes_)
    onehot_encoded = np.eye(n_classes, dtype=int)[int_encoded]
    # Drop the final (placeholder) column so unknown bases encode as zeros.
    onehot_encoded = np.delete(onehot_encoded, -1, 1)
    return onehot_encoded
# Demo: one-hot encode a short test sequence. The result is neither stored
# nor printed, so it is only visible in a notebook/REPL session.
seq_test = 'GAATTCTCGAA'
one_hot_encoder(string_to_array(seq_test))
# Treat the DNA sequence as a language (text) and use various "language" processing methods.
def Kmers_funct(seq, size):
    """Return every overlapping k-mer of length ``size`` in ``seq``, lower-cased.

    Windows start at each index from 0 up to ``len(seq) - size``; the result
    is an empty list when ``size`` is larger than the sequence.
    """
    kmers = []
    for start in range(len(seq) - size + 1):
        window = seq[start:start + size]
        kmers.append(window.lower())
    return kmers
# Demo: split one sequence into overlapping k-mers ("words").
mySeq = 'GTGCCCAGGTTCAGTGAGTGACACAGGCAG'
Kmers_funct(mySeq, size=7)  # bare expression: only shown in a notebook/REPL

# Join the 6-mers with spaces so the sequence reads like a sentence.
words = Kmers_funct(mySeq, size=6)
joined_sentence = ' '.join(words)
joined_sentence  # bare expression: only shown in a notebook/REPL

# Two more sequences turned into sentences the same way
# (mySeq2 is the same string as mySeq).
mySeq1 = 'TCTCACACATGTGCCAATCACTGTCACCC'
mySeq2 = 'GTGCCCAGGTTCAGTGAGTGACACAGGCAG'
sentence1, sentence2 = (
    ' '.join(Kmers_funct(dna, size=6)) for dna in (mySeq1, mySeq2)
)
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary from the three k-mer sentences and count how often each
# vocabulary word occurs in each one; X is the dense count array with one row
# per sentence.
cv = CountVectorizer()
X = cv.fit_transform([joined_sentence, sentence1, sentence2]).toarray()