8
8
import os .path as op
9
9
import sys
10
10
11
- def bedfile_to_multivec (input_filename , f_out ,
11
+ def bedfile_to_multivec (input_filename , f_out ,
12
12
bedline_to_chrom_start_end_vector , base_resolution ,
13
- has_header , chunk_size ):
13
+ has_header , chunk_size , row_infos ):
14
14
'''
15
15
Convert an epilogos bedfile to multivec format.
16
16
'''
17
17
if op .splitext (input_filename )[1 ] == '.gz' :
18
18
f = gzip .open (input_filename , 'r' )
19
19
else :
20
20
f = open (input_filename , 'r' )
21
-
21
+
22
22
FILL_VALUE = np .nan
23
23
24
24
# batch regions because h5py is really bad at writing
@@ -29,7 +29,7 @@ def bedfile_to_multivec(input_filename, f_out,
29
29
curr_index = 0
30
30
# the start of the batch in the dataset
31
31
batch_start_index = 0
32
-
32
+
33
33
if has_header :
34
34
f .readline ()
35
35
@@ -38,19 +38,19 @@ def bedfile_to_multivec(input_filename, f_out,
38
38
warned = False
39
39
40
40
for line in f :
41
- chrom ,start ,end ,vector = bedline_to_chrom_start_end_vector (line )
41
+ chrom ,start ,end ,vector = bedline_to_chrom_start_end_vector (line , row_infos )
42
42
43
- if end - start != base_resolution and not warned :
44
- print ("WARNING: interval length ({}) doesn't match base resolution ({}): {}" .
45
- format (end - start , base_resolution , line ))
43
+ if end % base_resolution != 0 or start % base_resolution != 0 and not warned :
44
+ print ("WARNING: either the start or end coordinate is not a multiple of the base resolution ({}): {}" .
45
+ format (base_resolution , line ))
46
46
warned = True
47
47
48
48
if prev_chrom is not None and chrom != prev_chrom :
49
49
# we've reached a new chromosome so we'll dump all
50
50
# the previous values
51
51
print ("len(batch:" , len (batch ))
52
52
f_out [prev_chrom ][batch_start_index :batch_start_index + len (batch )] = np .array (batch )
53
-
53
+
54
54
# we're starting a new chromosome so we start from the beginning
55
55
curr_index = 0
56
56
batch_start_index = 0
@@ -77,8 +77,15 @@ def bedfile_to_multivec(input_filename, f_out,
77
77
78
78
assert (curr_index == data_start_index )
79
79
#print('vector', vector)
80
- batch += [vector ]
81
- curr_index += 1
80
+
81
+ #When the binsize is not equal to the base_resolution
82
+ # "break down" the binsize into bins of the base_resolution size
83
+ #and add the values to each bin.
84
+
85
+ data_end_index = end // base_resolution
86
+ while curr_index < data_end_index :
87
+ batch += [vector ]
88
+ curr_index += 1
82
89
83
90
# fill in empty
84
91
@@ -98,14 +105,14 @@ def bedfile_to_multivec(input_filename, f_out,
98
105
#print('chrom', chrom)
99
106
f_out [chrom ][batch_start_index :batch_start_index + len (batch )] = np .array (batch )
100
107
101
- def create_multivec_multires (array_data , chromsizes ,
108
+ def create_multivec_multires (array_data , chromsizes ,
102
109
agg , starting_resolution = 1 ,
103
110
tile_size = 1024 , output_file = '/tmp/my_file.multires' ,
104
111
row_infos = None ):
105
112
'''
106
113
Create a multires file containing the array data
107
114
aggregated at multiple resolutions.
108
-
115
+
109
116
Parameters
110
117
----------
111
118
array_data: {'chrom_key': np.array, }
@@ -131,11 +138,11 @@ def create_multivec_multires(array_data, chromsizes,
131
138
132
139
# this will be the file that contains our multires data
133
140
f = h5py .File (filename , 'w' )
134
-
141
+
135
142
# store some metadata
136
143
f .create_group ('info' )
137
144
f ['info' ].attrs ['tile-size' ] = tile_size
138
-
145
+
139
146
f .create_group ('resolutions' )
140
147
f .create_group ('chroms' )
141
148
@@ -184,15 +191,15 @@ def create_multivec_multires(array_data, chromsizes,
184
191
while start < len (chrom_data ):
185
192
chrom_data [start :start + chunk_size ] = array_data [chrom ][start :start + chunk_size ] # see above section
186
193
start += int (min (standard_chunk_size , len (array_data [chrom ]) - start ))
187
-
194
+
188
195
189
196
# the maximum zoom level corresponds to the number of aggregations
190
197
# that need to be performed so that the entire extent of
191
198
# the dataset fits into one tile
192
199
total_length = sum (lengths )
193
200
# print("total_length:", total_length, "tile_size:", tile_size, "starting_resolution:", starting_resolution)
194
201
max_zoom = math .ceil (math .log (total_length / (tile_size * starting_resolution ) ) / math .log (2 ))
195
-
202
+
196
203
# we're going to go through and create the data for the different
197
204
# zoom levels by summing adjacent data points
198
205
prev_resolution = curr_resolution
@@ -230,7 +237,7 @@ def create_multivec_multires(array_data, chromsizes,
230
237
new_shape [0 ] = math .ceil (new_shape [0 ] / 2 )
231
238
new_shape = tuple (new_shape )
232
239
233
- f ['resolutions' ][str (curr_resolution )]['values' ].create_dataset (chrom ,
240
+ f ['resolutions' ][str (curr_resolution )]['values' ].create_dataset (chrom ,
234
241
new_shape , compression = 'gzip' )
235
242
236
243
while start < len (chrom_data ):
@@ -239,7 +246,7 @@ def create_multivec_multires(array_data, chromsizes,
239
246
#print("prev_resolution:", prev_resolution)
240
247
#print("old_data.shape", old_data.shape)
241
248
242
- # this is a sort of roundabout way of calculating the
249
+ # this is a sort of roundabout way of calculating the
243
250
# shape of the aggregated array, but all it's doing is
244
251
# just halving the first dimension of the previous shape
245
252
# without taking into account the other dimensions
@@ -258,8 +265,8 @@ def create_multivec_multires(array_data, chromsizes,
258
265
new_data = agg (old_data )
259
266
260
267
'''
261
- print("zoom_level:", max_zoom - 1 - i,
262
- "resolution:", curr_resolution,
268
+ print("zoom_level:", max_zoom - 1 - i,
269
+ "resolution:", curr_resolution,
263
270
"new_data length", len(new_data))
264
271
'''
265
272
f ['resolutions' ][str (curr_resolution )]['values' ][chrom ][int (start / 2 ):int (start / 2 + chunk_size / 2 )] = new_data
0 commit comments