-
Notifications
You must be signed in to change notification settings - Fork 20
/
Copy pathtest-update-gpu.py
executable file
Β·139 lines (113 loc) Β· 5.35 KB
/
test-update-gpu.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#!/usr/bin/env python
# Copyright 2018-2021 John Lees and Nick Croucher
"""Tests for PopPUNK --update-db order"""
import subprocess
import os, sys
import sys
import shutil
import pickle
import numpy as np
from scipy import stats
import h5py
import scipy.sparse
import pp_sketchlib
if os.environ.get("POPPUNK_PYTHON"):
python_cmd = os.environ.get("POPPUNK_PYTHON")
else:
python_cmd = "python"
def run_regression(x, y, threshold = 0.99):
res = stats.linregress(x, y)
print("R^2: " + str(res.rvalue**2))
if res.rvalue**2 < threshold:
sys.stderr.write("Distance matrix order failed!\n")
sys.exit(1)
def compare_sparse_matrices(d1,d2,r1,r2):
d1_pairs = get_seq_tuples(d1.row,d1.col,r1)
d2_pairs = get_seq_tuples(d2.row,d2.col,r2)
d1_dists = []
d2_dists = []
if (len(d1_pairs) != len(d2_pairs)):
sys.stderr.write("Distance matrix number of entries differ!\n")
print(d1_pairs)
print(d2_pairs)
sys.exit(1)
for (pair1,dist1) in zip(d1_pairs,d1.data):
for (pair2,dist2) in zip(d2_pairs,d2.data):
if pair1 == pair2:
d1_dists.append(dist1)
d2_dists.append(dist2)
break
run_regression(np.asarray(d1_dists),np.asarray(d2_dists))
def get_seq_tuples(rows,cols,names):
tuple_list = []
for (i,j) in zip(rows,cols):
sorted_pair = tuple(sorted((names[i],names[j])))
tuple_list.append(sorted_pair)
return tuple_list
def old_get_seq_tuples(rows,cols):
max_seqs = np.maximum(rows,cols)
min_seqs = np.minimum(rows,cols)
concat_seqs = np.vstack((max_seqs,min_seqs))
seq_pairs = concat_seqs.T
seq_tuples = [tuple(row) for row in seq_pairs]
return seq_tuples
# Check distances after one query
# Check that order is the same after doing 1 + 2 with --update-db, as doing all of 1 + 2 together
subprocess.run(python_cmd + " ../poppunk-runner.py --create-db --r-files rfile12.txt --output batch12 --overwrite --gpu-dist", shell=True, check=True)
subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model lineage --ref-db batch12 --ranks 1,2 --gpu-graph", shell=True, check=True)
subprocess.run(python_cmd + " ../poppunk-runner.py --create-db --r-files rfile1.txt --output batch1 --overwrite --gpu-dist", shell=True, check=True)
subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model lineage --ref-db batch1 --ranks 1,2 --gpu-graph", shell=True, check=True)
subprocess.run(python_cmd + " ../poppunk_assign-runner.py --db batch1 --query rfile2.txt --output batch2 --update-db --overwrite --gpu-graph --gpu-dist", shell=True, check=True)
# Load updated distances
X2 = np.load("batch2/batch2.dists.npy")
with open("batch2/batch2.dists.pkl", 'rb') as pickle_file:
rlist2, qlist, self = pickle.load(pickle_file)
# Get same distances from the full database
ref_db = "batch12/batch12"
ref_h5 = h5py.File(ref_db + ".h5", 'r')
db_kmers = sorted(ref_h5['sketches/' + rlist2[0]].attrs['kmers'])
ref_h5.close()
X1 = pp_sketchlib.queryDatabase(ref_db, ref_db, rlist2, rlist2, db_kmers,
True, False, 1, False, 0)
# Check distances match
run_regression(X1[:, 0], X2[:, 0])
run_regression(X1[:, 1], X2[:, 1])
# Check sparse distances after one query
with open("batch12/batch12.dists.pkl", 'rb') as pickle_file:
rlist1, qlist1, self = pickle.load(pickle_file)
S1 = scipy.sparse.load_npz("batch12/batch12_rank_2_fit.npz")
S2 = scipy.sparse.load_npz("batch2/batch2_rank_2_fit.npz")
compare_sparse_matrices(S1,S2,rlist1,rlist2)
# Check rank 1
S3 = scipy.sparse.load_npz("batch12/batch12_rank_1_fit.npz")
S4 = scipy.sparse.load_npz("batch2/batch2_rank_1_fit.npz")
compare_sparse_matrices(S3,S4,rlist1,rlist2)
# Check distances after second query
# Check that order is the same after doing 1 + 2 + 3 with --update-db, as doing all of 1 + 2 + 3 together
subprocess.run(python_cmd + " ../poppunk-runner.py --create-db --r-files rfile123.txt --output batch123 --overwrite --gpu-dist", shell=True, check=True)
subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model lineage --ref-db batch123 --ranks 1,2 --gpu-graph --gpu-dist", shell=True, check=True)
subprocess.run(python_cmd + " ../poppunk_assign-runner.py --db batch2 --query rfile3.txt --output batch3 --update-db --overwrite --gpu-graph --gpu-dist", shell=True, check=True)
# Load updated distances
X2 = np.load("batch3/batch3.dists.npy")
with open("batch3/batch3.dists.pkl", 'rb') as pickle_file:
rlist4, qlist, self = pickle.load(pickle_file)
# Get same distances from the full database
ref_db = "batch123/batch123"
ref_h5 = h5py.File(ref_db + ".h5", 'r')
db_kmers = sorted(ref_h5['sketches/' + rlist4[0]].attrs['kmers'])
ref_h5.close()
X1 = pp_sketchlib.queryDatabase(ref_db, ref_db, rlist4, rlist4, db_kmers,
True, False, 1, False, 0)
# Check distances match
run_regression(X1[:, 0], X2[:, 0])
run_regression(X1[:, 1], X2[:, 1])
# Check sparse distances after second query
with open("batch123/batch123.dists.pkl", 'rb') as pickle_file:
rlist3, qlist, self = pickle.load(pickle_file)
S5 = scipy.sparse.load_npz("batch123/batch123_rank_2_fit.npz")
S6 = scipy.sparse.load_npz("batch3/batch3_rank_2_fit.npz")
compare_sparse_matrices(S5,S6,rlist3,rlist4)
# Check rank 1
S7 = scipy.sparse.load_npz("batch123/batch123_rank_1_fit.npz")
S8 = scipy.sparse.load_npz("batch3/batch3_rank_1_fit.npz")
compare_sparse_matrices(S7,S8,rlist3,rlist4)