-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreate-biomt-from-header.py
370 lines (290 loc) · 15.5 KB
/
create-biomt-from-header.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
import os, sys
import csv
import gzip
import re
import logging
from os.path import getmtime
from datetime import datetime
from sqlalchemy import *
from sqlalchemy.schema import PrimaryKeyConstraint
from sqlalchemy.engine.url import URL
from sqlalchemy.dialects.postgresql import ARRAY
# register new dialect to write tab-delimited files
csv.register_dialect('tabs', delimiter='\t')
try:
from argparse import ArgumentParser
from getpass import getpass
except ImportError:
print >> sys.stderr, "This program requires Python version 2.7 or later"
sys.exit(1)
else:
parser = ArgumentParser()
parser.add_argument('-u', '--user', dest="dbuser", help="Username on database", default='bernardo')
parser.add_argument('-H', '--host', dest="dbhost", help="Location of the database server", default='bahamut')
parser.add_argument('-p', '--port', dest="dbport", type=int, help="Port of the database server", default=5432)
parser.add_argument('-d', '--db', dest="dbname", help="Name of the database", default='cryst')
parser.add_argument('-s', '--schema', dest="schema", help="Schema to use on the database", default='pdb_dev')
parser.add_argument('-r', '--resume', dest="skip_dump", action='store_true', help="Skip initial dump if file exists.", default=False)
#parser.add_argument('-f', '--force', dest="force", action='store_true', help="Force overwrites", default=False)
parser.add_argument('-v', '--verbose', dest="verbose", action='store_true', help="Echo SQL commands", default=False)
parser.add_argument('-y', '--yes', dest="confirm", action='store_true', help="Auto-confirm", default=False)
parser.add_argument('-L', '--loglevel',dest="loglevel", action='store', choices=['debug','info','warn','error'],
help="Logging level to display (default: info)", default='info')
opt = parser.parse_args()
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s",
datefmt='%m/%d/%Y %I:%M:%S %p', level=getattr(logging, opt.loglevel.upper()))
PDB_MIRROR_DIR = "/tlbnas/mirror/pdb/data/structures/divided/pdb"
BIOMT_DUMP = '/tlbnas/temp/bahamut/biomt.%s' % opt.schema
engine = create_engine(URL(drivername='postgresql+psycopg2', username=opt.dbuser,
password=getpass("DB Password:"), host=opt.dbhost,
port=opt.dbport, database=opt.dbname),
execution_options={'autocommit':True}, echo=opt.verbose)
connection = engine.connect()
metadata = MetaData(bind=engine)
def create_tables(metadata, schema):
"""
"""
rawdata = Table('raw_biomt', metadata,
Column('pdb', String(4), nullable=False),
Column('assembly_serial', Integer),
Column('assembly_type', Text),
Column('determined_by', ARRAY(String)),
Column('pdb_chain_id', String(12)),
Column('operation_serial', Integer),
Column('rotation', ARRAY(Float)),
Column('translation', ARRAY(Float)),
schema=schema, prefixes=['unlogged'])
Index('idx_rawdata_pdb', rawdata.c.pdb, rawdata.c.assembly_serial)
biomt = Table('biomt', metadata,
Column('biomt_id', Integer, nullable=False),
Column('pdb', String(4), nullable=False),
Column('assembly_serial', Integer),
Column('assembly_size', Integer),
Column('assembly_type', Text),
Column('determined_by', ARRAY(String)),
Column('is_monomeric', Boolean(create_constraint=False), DefaultClause('false')),
Column('all_chains_at_identity', Boolean(create_constraint=False), DefaultClause('false')),
schema=schema)
PrimaryKeyConstraint(biomt.c.biomt_id, deferrable=True, initially='deferred')
Index('idx_biomt_assembly', biomt.c.pdb, biomt.c.assembly_serial, unique=True)
biomt_ops = Table('biomt_ops', metadata,
Column('biomt_op_id', Integer, nullable=False),
Column('biomt_id', Integer, nullable=False),
Column('pdb_chain_id', String(12)),
Column('operation_serial', Integer),
Column('rotation', ARRAY(Float)),
Column('translation', ARRAY(Float)),
Column('is_at_identity', Boolean(create_constraint=False), DefaultClause('false'), nullable=False),
schema=schema)
PrimaryKeyConstraint(biomt_ops.c.biomt_op_id, deferrable=True, initially='deferred')
Index('idx_biomt_ops_biomt_id', biomt_ops.c.biomt_id, biomt_ops.c.pdb_chain_id)
metadata.drop_all(checkfirst=True)
metadata.create_all(checkfirst=True)
def get_pdb_files():
"""
"""
# get all PFB files from the mirror recursively
for folder, subs, files in os.walk(PDB_MIRROR_DIR):
for f in files:
# make sure to only include structure files and nothing else
if not f.endswith(".ent.gz"): continue
yield f
def extract_remark350(path):
"""
"""
# the lines of the REMARK 350 section will be appended to this list
remark = []
# the PDB structures are gzipped
with gzip.open(path) as pdbfh:
for line in pdbfh:
if line.startswith("REMARK 350"):
# remove newline and REMARK 350 at the beginning
remark.append(line[11:].rstrip())
# end parsing this file at the SEQRES section
if line.startswith("SEQRES"):
break
# the first line will be an empty string!
return remark[1:]
def extract_bio_oper(remark):
"""
"""
# pattern th extract the chains that the transformation applies to
pchn = re.compile("^(APPLY THE FOLLOWING TO|(?P<cntd>\s+AND)) CHAIN[S]?[:]\s?(?P<chains>.+)$")
op = {}
operation_id = 0
for line in remark:
# a new multimer starts here
if line.startswith('BIOMOLECULE'):
# there might be more than one space between the BIOMOLECULE: start
# and the actual number
try:
biomol = int(line[12:])
except ValueError:
raise ValueError("cannot parse BIOMOLECULE: {}".format(line[12:]))
op.update({biomol:{'chains':{}}})
# attach by whom the quaternary structure was determined
op[biomol]['determined_by'] = []
# try to extract the determined biological unit
elif line.startswith('AUTHOR DETERMINED BIOLOGICAL UNIT'):
try:
op[biomol]['determined_by'].append('author')
op[biomol]['assembly'] = line[35:].lower()
logging.debug("Author determined biological unit {}: {}"
.format(biomol, op[biomol]['assembly']))
# the BIOMOLECULE: 1 line is missing from header
except UnboundLocalError as e:
raise RuntimeError("cannot parse REMARK 350! BIOMOLECULE serial "
"is missing: {}".format(e.message))
# try to extract the determined biological unit
elif line.startswith('SOFTWARE DETERMINED QUATERNARY STRUCTURE'):
try:
op[biomol]['determined_by'].append('software')
op[biomol]['assembly'] = line[42:].lower()
logging.debug("Software determined biological unit {}: {}"
.format(biomol, op[biomol]['assembly']))
# the BIOMOLECULE: 1 line is missing from header
except UnboundLocalError as e:
raise RuntimeError("cannot parse REMARK 350! BIOMOLECULE serial "
"is missing: {}".format(e.message))
# try to extract the list of chains the transformation applies to
elif pchn.match(line):
match = pchn.match(line)
# extract the list of PDB chain identifiers
_chains = [chain for chain in match.group('chains').split(',')]
_chains = [chain.strip() for chain in _chains if chain.strip()]
# add to the existing list of chains if this line continues with chains
if match.group('cntd'): chains.extend(_chains)
# otherwise create new list
else: chains = _chains
for chain in chains:
if len(chain) == 1:
op[biomol]['chains'].update({chain:{}})
else:
logging.warn("Thee chain {} is not valid and was not parsed "
"properly!".format(chain))
logging.debug("Operation for biomolecule {} found: chains {}"
.format(biomol, chains))
# first row of rotations and translations
elif line.startswith(' BIOMT1'):
operation_id = int(line[10:12])
# initialize matrices
rot = map(float, line[13:42].split())
trans = [float(line[46:])]
elif line.startswith(' BIOMT2'):
operation_id = int(line[10:12])
rot.extend(map(float, line[13:42].split()))
trans.append(float(line[46:]))
elif line.startswith(' BIOMT3'):
operation_id = int(line[10:12])
rot.extend(map(float, line[13:42].split()))
trans.append(float(line[46:]))
# add matrices to dictionary for each chain and entity
try:
for chain in chains:
op[biomol]['chains'][chain].update({operation_id:(rot, trans)})
except UnboundLocalError as e:
raise RuntimeError("cannot assign operations to chains: {}"
.format(e.message))
return op
def main():
"""
"""
if os.path.exists(BIOMT_DUMP) and opt.skip_dump:
print >> sys.stderr, "Dump file %s exists. Reusing." % BIOMT_DUMP
most_recent = datetime.fromtimestamp(getmtime(BIOMT_DUMP))
else:
fh = open(BIOMT_DUMP, 'w')
writer = csv.writer(fh, dialect='tabs')
logging.info("Commencing dump.")
most_recent = datetime.fromtimestamp(0)
for f in get_pdb_files():
pdbid = f[3:7]
path = os.path.join(PDB_MIRROR_DIR, pdbid[1:3], f)
logging.info("starting to extract BIOMT of PDB entry %s.", pdbid.upper())
if not os.path.isfile(path):
logging.error("cannot extract BIOMT: path %s does not exist!", path)
continue
# extract the REMARK 350 lines from the PDB header
remark = extract_remark350(path)
# parse the section and turn it into a hierarchical form
try:
biomt = extract_bio_oper(remark)
except StandardError, e:
logging.error("Could not extract operations for PDB entry %s: %s", pdbid.upper(), e.message)
continue
# write transformations
for assembly_serial, biomols in biomt.items():
try: assembly = biomols.pop('assembly')
except KeyError: assembly = '\N'
try:
determined_by = biomols.pop('determined_by')
determined_by = '{' + ','.join(determined_by) + '}'
except KeyError:
determined_by = '\N'
for pdb_chain_id, operation in biomols['chains'].items():
for operation_serial, (rx,tx) in operation.items():
rx = '{' + ','.join(map(str,rx)) + '}'
tx = '{' + ','.join(map(str,tx)) + '}'
fields = [pdbid.upper(), assembly_serial, assembly, determined_by,
pdb_chain_id, operation_serial, rx, tx]
writer.writerow(fields)
pdb_mtime = datetime.fromtimestamp(getmtime(path))
if pdb_mtime > most_recent:
most_recent = pdb_mtime
fh.flush()
fh.close()
if opt.confirm or raw_input("Tables in schema {} are about be dropped and recreated. " \
"Are you sure you want to continue? [y/N] ".format(opt.schema)).strip().lower() == 'y':
logging.info("Creating tables...")
create_tables(metadata, opt.schema)
else:
print >> sys.stderr, "Aborting."
sys.exit(1)
logging.info("Copying dump file to table...")
connection.execute("COPY {}.raw_biomt FROM '{}'".format(opt.schema, BIOMT_DUMP))
logging.info("Inserting into {}.biomt...".format(opt.schema))
connection.execute("""INSERT INTO {schema}.biomt(pdb, assembly_serial, assembly_type, determined_by)
SELECT DISTINCT pdb, assembly_serial, assembly_type, determined_by
FROM {schema}.raw_biomt
ORDER BY 1,2;""".format(schema=opt.schema))
logging.info("Inserting into %s.biomt_ops...", opt.schema)
connection.execute("""INSERT INTO {schema}.biomt_ops(biomt_id, pdb_chain_id, operation_serial, rotation, translation)
SELECT biomt_id, pdb_chain_id, operation_serial, rotation, translation
FROM {schema}.biomt
JOIN {schema}.raw_biomt USING(pdb, assembly_serial)
ORDER BY 1,2,3;""".format(schema=opt.schema))
connection.execute("COMMENT ON TABLE {schema}.biomt IS 'Most recent data: {timestamp}'"\
.format(schema=opt.schema, timestamp=str(most_recent)) )
connection.execute("COMMENT ON TABLE {schema}.biomt_ops IS 'Most recent data: {timestamp}'"\
.format(schema=opt.schema, timestamp=str(most_recent)) )
connection.execute("""UPDATE {schema}.biomt_ops
SET is_at_identity = true
WHERE rotation[1] = 1 and rotation[5] = 1 and rotation[9] = 1 and sum(rotation) = 3
and translation[1] = 0 and translation[2] = 0 and translation[3] = 0;""".format(schema=opt.schema))
connection.execute(""" UPDATE {schema}.biomt b
SET all_chains_at_identity = sq.all_chains_at_identity
FROM (
SELECT biomt_id, bool_and(is_at_identity) as all_chains_at_identity
FROM {schema}.biomt_ops
GROUP BY biomt_id
) sq
WHERE sq.biomt_id = b.biomt_id;""".format(schema=opt.schema))
connection.execute(""" UPDATE {schema}.biomt b
SET assembly_size = sq.assembly_size
FROM (
SELECT biomt_id, count(operation_serial) as assembly_size
FROM {schema}.biomt_ops
GROUP BY biomt_id
) sq
WHERE sq.biomt_id = b.biomt_id""".format(schema=opt.schema))
connection.execute(""" UPDATE {schema}.biomt b
SET is_monomeric = true
FROM (
SELECT pdb
FROM {schema}.biomt
GROUP BY pdb
HAVING bool_and(all_chains_at_identity) = true
AND max(assembly_size) = 1
) sq
WHERE sq.pdb = b.pdb""".format(schema=opt.schema))
main()