-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathbacktrack_swa.py
executable file
·161 lines (131 loc) · 5.96 KB
/
backtrack_swa.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#!/usr/bin/python
from sys import argv
import string
from os import popen, system
from os.path import exists
from glob import glob
# Let's get going!
start_model = 'S_0'
if len( argv ) > 1 :
start_model = argv[1]
assert( start_model[:2] == 'S_' )
def initialize():
fasta_file = glob( '*.fasta' )[0]
sequence = open( fasta_file ).readlines()[1][:-1]
nres = len( sequence )
cols = open( 'README_SETUP' ).readlines()[1].split()
pos = cols.index( '`seq' )
loop_start = int( cols[pos+1] )
loop_end = int( cols[pos+2][:-1] )
rosetta_path = '/home/rhiju/src/rosetta_TRUNK/'
new_cat_outfile_convention = 0
if sequence.count( 'a' ) > 0:
rosetta_path = '/home/rhiju/src/rosetta_protein_rna/'
#new_cat_outfile_convention = 1
return (sequence, loop_start, loop_end, rosetta_path, new_cat_outfile_convention )
# some useful functions
def find_parent_tag_and_source( outfile, model ):
parent_tag = None
source = None
lines = popen( 'grep -A3 SCORE '+outfile ).readlines()
found_model = 0
for line in lines:
cols = string.split( line[:-1] )
if cols[-1] == model:
found_model = 1
if found_model and cols[0] == 'REMARK' and cols[1] == 'PARENT_TAG':
parent_tag = cols[-1]
if found_model and cols[0] == 'REMARK' and cols[1] == 'SOURCE':
source = cols[-1]
break
return (parent_tag, source )
def get_outfile_ready( outfile ):
if not exists( outfile ):
assert( exists( outfile +'.gz' ) )
system( 'gunzip ' + outfile )
def extract_model( outfile, model, start_model_name ):
extract_model_name = '%s.backtrack_%s.pdb' % (outfile.replace('.out',''), start_model_name )
if not exists( extract_model_name ):
print 'EXTRACTING: ', extract_model_name
command = '%s/rosetta_source/bin/extract_pdbs.linuxgccrelease -in:file:silent %s -in:file:silent_struct_type binary -in:file:tags %s -database %s/rosetta_database/' % (rosetta_path, outfile, model , rosetta_path)
print command
system( command )
assert( exists( model+'.pdb' ) )
system( 'mv %s.pdb %s' % ( model, extract_model_name ) )
command = 'renumber_pdb_in_place.py %s ' % extract_model_name
try:
i = int( outfile.split( '_' )[2] )
j = int( outfile.split( '_' )[1] )
for n in range(1,i+1): command += ' %d' % n
for n in range(j,len(sequence)+1): command += ' %d' % n
#print command
system( command )
except:
print outfile
return extract_model_name
def get_previous_model_and_outfile( parent_tag, source, new_cat_outfile_convention ):
prev_outfile = source.split( '/' )[1]
cols = prev_outfile.split( '_' )
outfile = string.join( cols[2:5], '_' )+'_sample.cluster.out'
get_outfile_ready( outfile )
nmodel = 0
cols = parent_tag.split( '_' )
if len( cols ) > 2:
nmodel = int(cols[1])
if new_cat_outfile_convention: nmodel = int( cols[2] )
return (nmodel,outfile)
def figure_out_actual_model( nmodel, outfile ):
# rescore -- this serves two purposes
# first, it gives back the rmsd to the desired model -- this provides a consistency check
# second, it 'resorts' the models in the order in which they would be sorted by glob --
# S_0, S_1, S_10, S_100, etc.
# and that lets us figure out which decoy to track back. Its roundabout but it works.
outfile_calcRMSD = outfile.replace( '.out', '.backtrack_%s.out' % start_model )
if not exists( outfile_calcRMSD ):
i = int( outfile.split( '_' ) [2] )
j = int( outfile.split( '_' ) [1] )
command = '%s/rosetta_source/bin/stepwise_protein_test.linuxgccrelease -in:file:silent %s -calc_rms -out:file:silent %s -native %s -database %s/rosetta_database/ -calc_rms_res %d-%d -working_res 1-%d %d-%d' % (rosetta_path, outfile, outfile_calcRMSD, start_model_pdb_file, rosetta_path, loop_start, loop_end, i, j, len( sequence ) )
print( command )
system( command )
scorefile_calcRMSD = outfile_calcRMSD.replace('.out','.sc')
if not exists( scorefile_calcRMSD ):
command = 'grep SCORE %s | head -n %d > %s' % ( outfile_calcRMSD, nmodel+2, scorefile_calcRMSD )
print command
system( command )
lines = open( scorefile_calcRMSD ).readlines()
cols = lines[0].split()
rms_col = cols.index( 'rms' )
cols = lines[-1][:-1].split()
rms = float( cols[ rms_col ] )
model = cols[-1]
print outfile_calcRMSD, model, rms
extract_model( outfile, model, start_model ) # extract for viewing.
return (model,rms)
# basic initialization -- what's the sequence? the loop boundaries?
(sequence, loop_start, loop_end, rosetta_path, new_cat_outfile_convention) = initialize()
outfiles_backtrack = []
# starting outfile is special -- a loop closure
outfile = 'region_FINAL.out'
outfiles_backtrack.append( (outfile, start_model, 0.0 ) )
( parent_tag, source ) = find_parent_tag_and_source( outfile, start_model )
start_model_pdb_file = extract_model( outfile, start_model, start_model )
# kind of weird -- need to track down model inside chain-closure file.
chain_closure_outfile = source.split( '/' )[0].lower()+'_sample.cluster.out'
outfiles_backtrack.append( (chain_closure_outfile, 'ND', 0.0 ) )
# MAIN LOOP
count = 0
while source:# and len( outfiles_backtrack ) < 2:
(nmodel, outfile ) = get_previous_model_and_outfile( parent_tag, source, new_cat_outfile_convention )
(model,rms) = figure_out_actual_model( nmodel, outfile )
outfiles_backtrack.append( (outfile, model, rms ) )
# OK, let's go to the next one.
( parent_tag, source ) = find_parent_tag_and_source( outfile, model )
print parent_tag, source
# What we found in the backtrack
summary_file = 'backtrack_summary_%s.txt' % start_model
fid = open( summary_file, 'w' )
for n in range( len( outfiles_backtrack )): fid.write( '%s %s %6.2f\n' % outfiles_backtrack[n] )
fid.close()
print
print 'BACKTRACK:'
system( 'cat '+summary_file )