Merge branch 'develop' into develop

Burak Han Alver · web-flow · commit c119237c81ec · 2019-01-30T13:35:54.000-05:00
diff --git a/CHANGELOG b/CHANGELOG
@@ -1,3 +1,9 @@
+v0.10.2 (2019-01-30)
+
+- new option to import a "states" file format, a bed file with categorical data, e.g. from chromHMM to multivec format.
+- while converting a bed file to multivec, each segment can now be a multiple of base_resolution,
+rather than exactly match the base_resolution
+
 v0.10.1 (2019-01-22)
 
 - Removed a buggy print statement from the conversion script
diff --git a/clodius/cli/convert.py b/clodius/cli/convert.py
@@ -17,13 +17,13 @@
 def epilogos_bedline_to_vector(bedline):
     '''
     Convert a line from an epilogos bedfile to vector format.
-    
+
     Parameters
     -----------
     parts: [string,....]
         A line from a bedfile broken up into its constituent parts
         (e.g. ["chr1", "1000", "2000", "[1,2,34,5]"])
-    
+
     Returns
     -------
     An array containing the values associated with that line
@@ -39,9 +39,40 @@ def epilogos_bedline_to_vector(bedline):
     chrom=parts[0]
     start=int(parts[1])
     end=int(parts[2])
-    
+
     return (chrom, start, end, states)
 
+def states_bedline_to_vector(bedline,states_dic):
+    '''
+    Convert a line from a bedfile containing states in categorical data to vector format.
+
+    Parameters
+    ----------
+
+    parts: [string,...]
+        A line form a bedfile broken up into its contituent parts
+        (e.g. ["chr1", "1000", "2000", "state"]))
+
+
+    states_dic:
+        A dictionary containing the states in the file with a corresponding value
+
+    Returns
+    -------
+
+    An array containing values associated the state
+
+    '''
+    parts = bedline.decode('utf8').strip().split('\t')
+    chrom=parts[0]
+    start=int(parts[1])
+    end=int(parts[2])
+    state= states_dic[parts[3].encode('utf8')]
+
+    states_vector = [ 1 if index == state else 0 for index in range(len(states_dic))]
+
+    return (chrom, start, end, states_vector)
+
 @cli.group()
 def convert():
     '''
@@ -82,6 +113,7 @@ def _bedgraph_to_multivec(
         if row_infos_filename is not None:
             with open(row_infos_filename, 'r') as fr:
                 row_infos = [l.strip().encode('utf8') for l in fr]
+
         else:
             row_infos = None
 
@@ -101,10 +133,16 @@ def bedline_to_chrom_start_end_vector(bedline):
             return (chrom, start, end, vector)
 
         if format == 'epilogos':
-            cmv.bedfile_to_multivec(filepath, f_out, epilogos_bedline_to_vector, 
+            cmv.bedfile_to_multivec(filepath, f_out, epilogos_bedline_to_vector,
                     starting_resolution, has_header, chunk_size);
+        elif format == 'states':
+            assert(row_infos != None), "A row_infos file must be provided for --format = 'states' "
+            states_dic = {row_infos[x]:x for x in range(len(row_infos))}
+
+            cmv.bedfile_to_multivec(filepath, f_out, states_bedline_to_vector,
+                    starting_resolution, has_header, chunk_size, states_dic);
         else:
-            cmv.bedfile_to_multivec(filepath, f_out, bedline_to_chrom_start_end_vector, 
+            cmv.bedfile_to_multivec(filepath, f_out, bedline_to_chrom_start_end_vector,
                     starting_resolution, has_header, chunk_size);
 
         f_out.close()
@@ -124,7 +162,7 @@ def agg(x):
                 # newshape = (x.shape[2], -1, 2)
                 # b = x.T.reshape((-1,))
 
-                
+
                 a = x.T.reshape((x.shape[1],-1,2))
 
                 # this is going to be an odd way to get rid of nan
@@ -153,7 +191,7 @@ def agg(x):
         else:
             agg=lambda x: x.T.reshape((x.shape[1],-1,2)).sum(axis=2).T
 
-        cmv.create_multivec_multires(f_in, 
+        cmv.create_multivec_multires(f_in,
                 chromsizes = zip(chrom_names, chrom_sizes),
                 agg=agg,
                 starting_resolution=starting_resolution,
@@ -242,7 +280,10 @@ def agg(x):
 )
 @click.option(
     '--format',
-    type=click.Choice(['default', 'epilogos']),
+    type=click.Choice(['default', 'epilogos', 'states']),
+    help= "'default':chr start end state1_value state2_value, etc;"
+     "'epilogos': chr start end [[state1_value, state1_num],[state2_value, state2_num],[etc]];"
+     "'states': chr start end state_name",
     default='default'
     )
 @click.option(
@@ -262,15 +303,14 @@ def agg(x):
     type=click.Choice(['sum', 'logsumexp']),
     default='sum'
 )
-def bedfile_to_multivec(filepath, output_file, assembly, chromosome_col, 
-        from_pos_col, to_pos_col, value_col, has_header, 
+def bedfile_to_multivec(filepath, output_file, assembly, chromosome_col,
+        from_pos_col, to_pos_col, value_col, has_header,
         chunk_size, nan_value,
         chromsizes_filename,
-        starting_resolution, num_rows, 
+        starting_resolution, num_rows,
         format, row_infos_filename, tile_size, method):
-    _bedgraph_to_multivec(filepath, output_file, assembly, chromosome_col, 
-        from_pos_col, to_pos_col, value_col, has_header, 
-        chunk_size, nan_value, 
-        chromsizes_filename, starting_resolution, num_rows, 
+    _bedgraph_to_multivec(filepath, output_file, assembly, chromosome_col,
+        from_pos_col, to_pos_col, value_col, has_header,
+        chunk_size, nan_value,
+        chromsizes_filename, starting_resolution, num_rows,
         format, row_infos_filename, tile_size, method)
-
diff --git a/clodius/multivec.py b/clodius/multivec.py
@@ -8,17 +8,17 @@
 import os.path as op
 import sys
 
-def bedfile_to_multivec(input_filename, f_out, 
+def bedfile_to_multivec(input_filename, f_out,
         bedline_to_chrom_start_end_vector, base_resolution,
-        has_header, chunk_size):
+        has_header, chunk_size, row_infos):
     '''
     Convert an epilogos bedfile to multivec format.
     '''
     if op.splitext(input_filename)[1] == '.gz':
         f = gzip.open(input_filename, 'r')
     else:
         f = open(input_filename, 'r')
-        
+
     FILL_VALUE = np.nan
 
     # batch regions because h5py is really bad at writing
@@ -29,7 +29,7 @@ def bedfile_to_multivec(input_filename, f_out,
     curr_index = 0
     # the start of the batch in the dataset
     batch_start_index = 0
-    
+
     if has_header:
         f.readline()
 
@@ -38,19 +38,19 @@ def bedfile_to_multivec(input_filename, f_out,
     warned = False
 
     for line in f:
-        chrom,start,end,vector = bedline_to_chrom_start_end_vector(line)
+        chrom,start,end,vector = bedline_to_chrom_start_end_vector(line, row_infos)
 
-        if end - start != base_resolution and not warned:
-            print("WARNING: interval length ({}) doesn't match base resolution ({}): {}".
-                    format(end - start, base_resolution, line))
+        if end % base_resolution != 0 or start % base_resolution != 0 and not warned:
+            print("WARNING: either the start or end coordinate is not a multiple of the base resolution ({}): {}".
+                    format(base_resolution, line))
             warned = True
 
         if prev_chrom is not None and chrom != prev_chrom:
             # we've reached a new chromosome so we'll dump all
             # the previous values
             print("len(batch:", len(batch))
             f_out[prev_chrom][batch_start_index:batch_start_index+len(batch)] = np.array(batch)
-            
+
             # we're starting a new chromosome so we start from the beginning
             curr_index = 0
             batch_start_index = 0
@@ -77,8 +77,15 @@ def bedfile_to_multivec(input_filename, f_out,
 
         assert(curr_index == data_start_index)
         #print('vector', vector)
-        batch += [vector]
-        curr_index += 1
+
+        #When the binsize is not equal to the base_resolution
+        # "break down" the binsize into bins of the rbase_esolution size
+        #and add the values to each bin.
+
+        data_end_index = end // base_resolution
+        while curr_index < data_end_index:
+            batch += [vector]
+            curr_index += 1
 
         # fill in empty
 
@@ -98,14 +105,14 @@ def bedfile_to_multivec(input_filename, f_out,
     #print('chrom', chrom)
     f_out[chrom][batch_start_index:batch_start_index+len(batch)] = np.array(batch)
 
-def create_multivec_multires(array_data, chromsizes, 
+def create_multivec_multires(array_data, chromsizes,
                     agg, starting_resolution=1,
                     tile_size=1024, output_file='/tmp/my_file.multires',
                     row_infos=None):
     '''
     Create a multires file containing the array data
     aggregated at multiple resolutions.
-    
+
     Parameters
     ----------
     array_data: {'chrom_key': np.array, }
@@ -131,11 +138,11 @@ def create_multivec_multires(array_data, chromsizes,
 
     # this will be the file that contains our multires data
     f = h5py.File(filename, 'w')
-        
+
     # store some metadata
     f.create_group('info')
     f['info'].attrs['tile-size'] = tile_size
-    
+
     f.create_group('resolutions')
     f.create_group('chroms')
 
@@ -184,15 +191,15 @@ def create_multivec_multires(array_data, chromsizes,
         while start < len(chrom_data):
             chrom_data[start:start + chunk_size] = array_data[chrom][start:start+chunk_size]    # see above section
             start += int(min(standard_chunk_size, len(array_data[chrom]) - start))
-        
+
 
     # the maximum zoom level corresponds to the number of aggregations
     # that need to be performed so that the entire extent of
     # the dataset fits into one tile
     total_length = sum(lengths)
     # print("total_length:", total_length, "tile_size:", tile_size, "starting_resolution:", starting_resolution)
     max_zoom = math.ceil(math.log(total_length / (tile_size * starting_resolution) ) / math.log(2))
-    
+
     # we're going to go through and create the data for the different
     # zoom levels by summing adjacent data points
     prev_resolution = curr_resolution
@@ -230,7 +237,7 @@ def create_multivec_multires(array_data, chromsizes,
             new_shape[0] = math.ceil(new_shape[0] / 2)
             new_shape = tuple(new_shape)
 
-            f['resolutions'][str(curr_resolution)]['values'].create_dataset(chrom, 
+            f['resolutions'][str(curr_resolution)]['values'].create_dataset(chrom,
                                             new_shape, compression='gzip')
 
             while start < len(chrom_data):
@@ -239,7 +246,7 @@ def create_multivec_multires(array_data, chromsizes,
                 #print("prev_resolution:", prev_resolution)
                 #print("old_data.shape", old_data.shape)
 
-                # this is a sort of roundabout way of calculating the 
+                # this is a sort of roundabout way of calculating the
                 # shape of the aggregated array, but all its doing is
                 # just halving the first dimension of the previous shape
                 # without taking into account the other dimensions
@@ -258,8 +265,8 @@ def create_multivec_multires(array_data, chromsizes,
                 new_data = agg(old_data)
 
                 '''
-                print("zoom_level:", max_zoom - 1 - i, 
-                      "resolution:", curr_resolution, 
+                print("zoom_level:", max_zoom - 1 - i,
+                      "resolution:", curr_resolution,
                       "new_data length", len(new_data))
                 '''
                 f['resolutions'][str(curr_resolution)]['values'][chrom][int(start/2):int(start/2+chunk_size/2)] = new_data
diff --git a/test/sample_data/states_format_input_testfile.bed.gz b/test/sample_data/states_format_input_testfile.bed.gz
diff --git a/test/sample_data/states_format_output_test_file.multires.mv5 b/test/sample_data/states_format_output_test_file.multires.mv5
diff --git a/test/sample_data/states_format_test_row_infos.txt b/test/sample_data/states_format_test_row_infos.txt
@@ -0,0 +1,10 @@
+Quies
+FaireW
+Low
+Pol2
+Gen3'
+Elon
+Ctcf
+EnhW
+EnhWF
+ElonW

-Original file line number
+Diff line change
 +Quies
 +FaireW
 +Low
 +Pol2
 +Gen3'
 +Elon
 +Ctcf
 +EnhW
 +EnhWF
 +ElonW