Skip to content

Commit c119237

Browse files
author
Burak Han Alver
authored
Merge branch 'develop' into develop
2 parents c076788 + f6980c1 commit c119237

6 files changed

+100
-37
lines changed

Diff for: CHANGELOG

+6
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1+
v0.10.2 (2019-01-30)
2+
3+
- new option to import a "states" file format (a bed file with categorical data, e.g. from chromHMM) and convert it to multivec format.
4+
- while converting a bed file to multivec, each segment can now be a multiple of base_resolution,
5+
rather than having to match the base_resolution exactly
6+
17
v0.10.1 (2019-01-22)
28

39
- Removed a buggy print statement from the conversion script

Diff for: clodius/cli/convert.py

+56-16
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,13 @@
1717
def epilogos_bedline_to_vector(bedline):
1818
'''
1919
Convert a line from an epilogos bedfile to vector format.
20-
20+
2121
Parameters
2222
-----------
2323
parts: [string,....]
2424
A line from a bedfile broken up into its constituent parts
2525
(e.g. ["chr1", "1000", "2000", "[1,2,34,5]"])
26-
26+
2727
Returns
2828
-------
2929
An array containing the values associated with that line
@@ -39,9 +39,40 @@ def epilogos_bedline_to_vector(bedline):
3939
chrom=parts[0]
4040
start=int(parts[1])
4141
end=int(parts[2])
42-
42+
4343
return (chrom, start, end, states)
4444

45+
def states_bedline_to_vector(bedline, states_dic):
    '''
    Convert a line from a bedfile containing categorical states to
    vector format.

    Parameters
    ----------
    bedline: bytes or str
        A raw, tab-separated line from a bedfile whose first four columns
        are chromosome, start, end and state name
        (e.g. "chr1", "1000", "2000", "state"). Both bytes (as produced
        by a gzip file opened in binary mode) and str (as produced by a
        file opened in text mode) are accepted.
    states_dic: dict
        A dictionary mapping each utf8-encoded state name to the position
        of that state in the output vector

    Returns
    -------
    (chrom, start, end, states_vector): tuple
        The chromosome name, the integer start and end coordinates and a
        list with one entry per key of states_dic, holding 1 at the
        position equal to the value of the line's state and 0 elsewhere

    Raises
    ------
    KeyError
        If the state named on the line is not a key of states_dic
    '''
    # binary-mode files yield bytes while text-mode files yield str;
    # only bytes need decoding
    if isinstance(bedline, bytes):
        bedline = bedline.decode('utf8')

    parts = bedline.strip().split('\t')
    chrom = parts[0]
    start = int(parts[1])
    end = int(parts[2])

    # the dictionary is keyed on utf8-encoded names
    state_name = parts[3].encode('utf8')
    if state_name not in states_dic:
        raise KeyError(
            "state {!r} ({}:{}-{}) is not listed in states_dic".format(
                parts[3], chrom, start, end))
    state = states_dic[state_name]

    states_vector = [1 if index == state else 0
                     for index in range(len(states_dic))]

    return (chrom, start, end, states_vector)
75+
4576
@cli.group()
4677
def convert():
4778
'''
@@ -82,6 +113,7 @@ def _bedgraph_to_multivec(
82113
if row_infos_filename is not None:
83114
with open(row_infos_filename, 'r') as fr:
84115
row_infos = [l.strip().encode('utf8') for l in fr]
116+
85117
else:
86118
row_infos = None
87119

@@ -101,10 +133,16 @@ def bedline_to_chrom_start_end_vector(bedline):
101133
return (chrom, start, end, vector)
102134

103135
if format == 'epilogos':
104-
cmv.bedfile_to_multivec(filepath, f_out, epilogos_bedline_to_vector,
136+
cmv.bedfile_to_multivec(filepath, f_out, epilogos_bedline_to_vector,
105137
starting_resolution, has_header, chunk_size);
138+
elif format == 'states':
139+
assert(row_infos != None), "A row_infos file must be provided for --format = 'states' "
140+
states_dic = {row_infos[x]:x for x in range(len(row_infos))}
141+
142+
cmv.bedfile_to_multivec(filepath, f_out, states_bedline_to_vector,
143+
starting_resolution, has_header, chunk_size, states_dic);
106144
else:
107-
cmv.bedfile_to_multivec(filepath, f_out, bedline_to_chrom_start_end_vector,
145+
cmv.bedfile_to_multivec(filepath, f_out, bedline_to_chrom_start_end_vector,
108146
starting_resolution, has_header, chunk_size);
109147

110148
f_out.close()
@@ -124,7 +162,7 @@ def agg(x):
124162
# newshape = (x.shape[2], -1, 2)
125163
# b = x.T.reshape((-1,))
126164

127-
165+
128166
a = x.T.reshape((x.shape[1],-1,2))
129167

130168
# this is going to be an odd way to get rid of nan
@@ -153,7 +191,7 @@ def agg(x):
153191
else:
154192
agg=lambda x: x.T.reshape((x.shape[1],-1,2)).sum(axis=2).T
155193

156-
cmv.create_multivec_multires(f_in,
194+
cmv.create_multivec_multires(f_in,
157195
chromsizes = zip(chrom_names, chrom_sizes),
158196
agg=agg,
159197
starting_resolution=starting_resolution,
@@ -242,7 +280,10 @@ def agg(x):
242280
)
243281
@click.option(
244282
'--format',
245-
type=click.Choice(['default', 'epilogos']),
283+
type=click.Choice(['default', 'epilogos', 'states']),
284+
help= "'default':chr start end state1_value state2_value, etc;"
285+
"'epilogos': chr start end [[state1_value, state1_num],[state2_value, state2_num],[etc]];"
286+
"'states': chr start end state_name",
246287
default='default'
247288
)
248289
@click.option(
@@ -262,15 +303,14 @@ def agg(x):
262303
type=click.Choice(['sum', 'logsumexp']),
263304
default='sum'
264305
)
265-
def bedfile_to_multivec(filepath, output_file, assembly, chromosome_col,
266-
from_pos_col, to_pos_col, value_col, has_header,
306+
def bedfile_to_multivec(filepath, output_file, assembly, chromosome_col,
307+
from_pos_col, to_pos_col, value_col, has_header,
267308
chunk_size, nan_value,
268309
chromsizes_filename,
269-
starting_resolution, num_rows,
310+
starting_resolution, num_rows,
270311
format, row_infos_filename, tile_size, method):
271-
_bedgraph_to_multivec(filepath, output_file, assembly, chromosome_col,
272-
from_pos_col, to_pos_col, value_col, has_header,
273-
chunk_size, nan_value,
274-
chromsizes_filename, starting_resolution, num_rows,
312+
_bedgraph_to_multivec(filepath, output_file, assembly, chromosome_col,
313+
from_pos_col, to_pos_col, value_col, has_header,
314+
chunk_size, nan_value,
315+
chromsizes_filename, starting_resolution, num_rows,
275316
format, row_infos_filename, tile_size, method)
276-

Diff for: clodius/multivec.py

+28-21
Original file line numberDiff line numberDiff line change
@@ -8,17 +8,17 @@
88
import os.path as op
99
import sys
1010

11-
def bedfile_to_multivec(input_filename, f_out,
11+
def bedfile_to_multivec(input_filename, f_out,
1212
bedline_to_chrom_start_end_vector, base_resolution,
13-
has_header, chunk_size):
13+
has_header, chunk_size, row_infos):
1414
'''
1515
Convert a bedfile to multivec format, parsing each line with bedline_to_chrom_start_end_vector.
1616
'''
1717
if op.splitext(input_filename)[1] == '.gz':
1818
f = gzip.open(input_filename, 'r')
1919
else:
2020
f = open(input_filename, 'r')
21-
21+
2222
FILL_VALUE = np.nan
2323

2424
# batch regions because h5py is really bad at writing
@@ -29,7 +29,7 @@ def bedfile_to_multivec(input_filename, f_out,
2929
curr_index = 0
3030
# the start of the batch in the dataset
3131
batch_start_index = 0
32-
32+
3333
if has_header:
3434
f.readline()
3535

@@ -38,19 +38,19 @@ def bedfile_to_multivec(input_filename, f_out,
3838
warned = False
3939

4040
for line in f:
41-
chrom,start,end,vector = bedline_to_chrom_start_end_vector(line)
41+
chrom,start,end,vector = bedline_to_chrom_start_end_vector(line, row_infos)
4242

43-
if end - start != base_resolution and not warned:
44-
print("WARNING: interval length ({}) doesn't match base resolution ({}): {}".
45-
format(end - start, base_resolution, line))
43+
if end % base_resolution != 0 or start % base_resolution != 0 and not warned:
44+
print("WARNING: either the start or end coordinate is not a multiple of the base resolution ({}): {}".
45+
format(base_resolution, line))
4646
warned = True
4747

4848
if prev_chrom is not None and chrom != prev_chrom:
4949
# we've reached a new chromosome so we'll dump all
5050
# the previous values
5151
print("len(batch:", len(batch))
5252
f_out[prev_chrom][batch_start_index:batch_start_index+len(batch)] = np.array(batch)
53-
53+
5454
# we're starting a new chromosome so we start from the beginning
5555
curr_index = 0
5656
batch_start_index = 0
@@ -77,8 +77,15 @@ def bedfile_to_multivec(input_filename, f_out,
7777

7878
assert(curr_index == data_start_index)
7979
#print('vector', vector)
80-
batch += [vector]
81-
curr_index += 1
80+
81+
# When the binsize is not equal to the base_resolution,
82+
# "break down" the bin into bins of the base_resolution size
83+
# and add the values to each bin.
84+
85+
data_end_index = end // base_resolution
86+
while curr_index < data_end_index:
87+
batch += [vector]
88+
curr_index += 1
8289

8390
# fill in empty
8491

@@ -98,14 +105,14 @@ def bedfile_to_multivec(input_filename, f_out,
98105
#print('chrom', chrom)
99106
f_out[chrom][batch_start_index:batch_start_index+len(batch)] = np.array(batch)
100107

101-
def create_multivec_multires(array_data, chromsizes,
108+
def create_multivec_multires(array_data, chromsizes,
102109
agg, starting_resolution=1,
103110
tile_size=1024, output_file='/tmp/my_file.multires',
104111
row_infos=None):
105112
'''
106113
Create a multires file containing the array data
107114
aggregated at multiple resolutions.
108-
115+
109116
Parameters
110117
----------
111118
array_data: {'chrom_key': np.array, }
@@ -131,11 +138,11 @@ def create_multivec_multires(array_data, chromsizes,
131138

132139
# this will be the file that contains our multires data
133140
f = h5py.File(filename, 'w')
134-
141+
135142
# store some metadata
136143
f.create_group('info')
137144
f['info'].attrs['tile-size'] = tile_size
138-
145+
139146
f.create_group('resolutions')
140147
f.create_group('chroms')
141148

@@ -184,15 +191,15 @@ def create_multivec_multires(array_data, chromsizes,
184191
while start < len(chrom_data):
185192
chrom_data[start:start + chunk_size] = array_data[chrom][start:start+chunk_size] # see above section
186193
start += int(min(standard_chunk_size, len(array_data[chrom]) - start))
187-
194+
188195

189196
# the maximum zoom level corresponds to the number of aggregations
190197
# that need to be performed so that the entire extent of
191198
# the dataset fits into one tile
192199
total_length = sum(lengths)
193200
# print("total_length:", total_length, "tile_size:", tile_size, "starting_resolution:", starting_resolution)
194201
max_zoom = math.ceil(math.log(total_length / (tile_size * starting_resolution) ) / math.log(2))
195-
202+
196203
# we're going to go through and create the data for the different
197204
# zoom levels by summing adjacent data points
198205
prev_resolution = curr_resolution
@@ -230,7 +237,7 @@ def create_multivec_multires(array_data, chromsizes,
230237
new_shape[0] = math.ceil(new_shape[0] / 2)
231238
new_shape = tuple(new_shape)
232239

233-
f['resolutions'][str(curr_resolution)]['values'].create_dataset(chrom,
240+
f['resolutions'][str(curr_resolution)]['values'].create_dataset(chrom,
234241
new_shape, compression='gzip')
235242

236243
while start < len(chrom_data):
@@ -239,7 +246,7 @@ def create_multivec_multires(array_data, chromsizes,
239246
#print("prev_resolution:", prev_resolution)
240247
#print("old_data.shape", old_data.shape)
241248

242-
# this is a sort of roundabout way of calculating the
249+
# this is a sort of roundabout way of calculating the
243250
# shape of the aggregated array, but all its doing is
244251
# just halving the first dimension of the previous shape
245252
# without taking into account the other dimensions
@@ -258,8 +265,8 @@ def create_multivec_multires(array_data, chromsizes,
258265
new_data = agg(old_data)
259266

260267
'''
261-
print("zoom_level:", max_zoom - 1 - i,
262-
"resolution:", curr_resolution,
268+
print("zoom_level:", max_zoom - 1 - i,
269+
"resolution:", curr_resolution,
263270
"new_data length", len(new_data))
264271
'''
265272
f['resolutions'][str(curr_resolution)]['values'][chrom][int(start/2):int(start/2+chunk_size/2)] = new_data

Diff for: test/sample_data/states_format_input_testfile.bed.gz

265 Bytes
Binary file not shown.
8.29 MB
Binary file not shown.

Diff for: test/sample_data/states_format_test_row_infos.txt

+10
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
Quies
2+
FaireW
3+
Low
4+
Pol2
5+
Gen3'
6+
Elon
7+
Ctcf
8+
EnhW
9+
EnhWF
10+
ElonW

0 commit comments

Comments
 (0)