mrmatrix more flexible (#71)

mccalluc · web-flow · commit dafc422715e3 · 2019-05-21T12:17:57.000-04:00
* Use csv module instead of parsing by hand; Fix #63 * better description * Aspirational command-line options * Checkpoint: Need to update test * checkpoint: still buggy [skip ci] * Runs, but we get one less resolution * Fix test * tsv_to_mrmatrix_test.py whitespace * autopep8 * flake8 clean * changelog [skip ci] * start test unlabelled * unlabelled square * Make test more generic * Test aggregation of tall and wide datasets * Test tall and wide aggregation * If source data not power of 2... * Handle first n * click-ify * Un-click-ify: Argparse feels good-enough for now * update changelog [skip ci] * Fix whitespace; one failing test locally * Comments and logs * missing paren
diff --git a/.flake8-ignore b/.flake8-ignore
diff --git a/CHANGELOG b/CHANGELOG
@@ -1,3 +1,7 @@
+in progress
+
+- Make tsv_to_mrmatrix more flexible and add it to the exported scripts.
+
 v0.10.7
 
 - Changed bins_per_dimension in npvector.tileset_info to match the value in
diff --git a/scripts/tsv_to_mrmatrix.py b/scripts/tsv_to_mrmatrix.py
@@ -1,12 +1,10 @@
 #!/usr/bin/python
 
+import csv
 import dask.array as da
 import h5py
 import math
 import numpy as np
-import os
-import os.path as op
-import sys
 import argparse
 import time
 
@@ -17,9 +15,7 @@ def coarsen(f, tile_size=256):
     '''
     grid = f['resolutions']['1']['values']
     top_n = grid.shape[0]
-
     max_zoom = math.ceil(math.log(top_n / tile_size) / math.log(2))
-    max_width = tile_size * 2 ** max_zoom
 
     chunk_size = tile_size * 16
     curr_size = grid.shape
@@ -43,84 +39,115 @@ def coarsen(f, tile_size=256):
         da.store(dask_dset, values)
 
 
-def parse(input_handle, output_hdf5, top_n=None):
-    input_handle
-    first_line = next(input_handle)
-    parts = first_line.strip().split('\t')
-    # TODO: Use the python built-in csv module, instead of parsing by handself.
-    # https://github.com/higlass/clodius/issues/63
+def parse(input_handle, output_hdf5, height, width,
+          delimiter, first_n, is_square, is_labelled):
+    reader = csv.reader(input_handle, delimiter=delimiter)
+    if is_labelled:
+        first_row = next(reader)
+        labels = first_row[1:(first_n + 1) if first_n else None]
+        if is_square:
+            output_hdf5.create_dataset(
+                'labels',
+                data=np.array(labels, dtype=h5py.special_dtype(vlen=str)),
+                compression='lzf')
+        # TODO: Handle non-square labels
+        # https://github.com/higlass/clodius/issues/68
 
-    if top_n is None:
-        top_n = len(parts) - 1
-        # TODO: If it's taller than it is wide, it will be truncated to a square,
-        # unless an explicit top_n is provided.
-        # https://github.com/higlass/clodius/issues/64
-
-    labels = parts[1:top_n + 1]
     tile_size = 256
-    max_zoom = math.ceil(math.log(top_n / tile_size) / math.log(2))
+    limit = max(height, width)
+    max_zoom = math.ceil(math.log(limit / tile_size) / math.log(2))
     max_width = tile_size * 2 ** max_zoom
 
-    labels_dset = output_hdf5.create_dataset('labels', data=np.array(labels, dtype=h5py.special_dtype(vlen=str)),
-                                             compression='lzf')
-
     g = output_hdf5.create_group('resolutions')
     g1 = g.create_group('1')
     ds = g1.create_dataset('values', (max_width, max_width),
                            dtype='f4', compression='lzf', fillvalue=np.nan)
-    ds1 = g1.create_dataset('nan_values', (max_width, max_width),
-                            dtype='f4', compression='lzf', fillvalue=0)
-    # TODO: We don't write to this, and I think we want the nan_values for every resolution.
-    # https://github.com/higlass/clodius/issues/62
+    g1.create_dataset('nan_values', (max_width, max_width),
+                      dtype='f4', compression='lzf', fillvalue=0)
+    # TODO: We don't write to this... Is it necessary?
 
     start_time = time.time()
     counter = 0
-    for line in input_handle:
-        parts = line.strip().split('\t')[1:top_n + 1]
-        x = np.array([float(p) for p in parts])
+    for row in reader:
+        x = np.array([float(p) for p in row[1 if is_labelled else None:]])
         ds[counter, :len(x)] = x
 
         counter += 1
-        if counter == top_n:
+        if counter == first_n:
             break
 
         time_elapsed = time.time() - start_time
         time_per_entry = time_elapsed / counter
 
-        time_remaining = time_per_entry * (top_n - counter)
+        time_remaining = time_per_entry * (height - counter)
         print("counter:", counter, "sum(x):", sum(x),
               "time remaining: {:d} seconds".format(int(time_remaining)))
 
     coarsen(output_hdf5)
     output_hdf5.close()
 
 
-def main():
-    parser = argparse.ArgumentParser(description="""
-
-    python tsv-dense-to-sparse
-""")
+def get_height(input_path, is_labelled=True):
+    '''
+    We need to scan the file once just to see how many lines it contains.
+    If it is tall and narrow, the first tile will need to be larger than just
+    looking at the width of the first row would suggest.
+    '''
+    with open(input_path) as f:
+        for i, l in enumerate(f):
+            pass
+    if is_labelled:
+        return i
+    else:
+        return i + 1
 
-    parser.add_argument('input_file')
-    parser.add_argument('output_file')
-    # parser.add_argument('-o', '--options', default='yo',
-    #					 help="Some option", type='str')
-    # parser.add_argument('-u', '--useless', action='store_true',
-    #					 help='Another useless option')
-    parser.add_argument('-n', '--first-n', type=int, default=None,
-                        help="Only use the first n entries in the matrix")
 
-    args = parser.parse_args()
+def get_width(input_path, is_labelled, delimiter='\t'):
+    '''
+    Assume the number of elements in the first row is the total width.
+    '''
+    with open(input_path, 'r', newline='') as input_handle:
+        reader = csv.reader(input_handle, delimiter=delimiter)
+        len_row = len(next(reader))
+        if is_labelled:
+            return len_row - 1
+        return len_row
 
-    count = 0
-    top_n = args.first_n
 
-    if args.input_file == '-':
-        f_in = sys.stdin
-    else:
-        f_in = open(args.input_file, 'r')
+def main():
+    parser = argparse.ArgumentParser(description='''
+        Given a tab-delimited file, produces an HDF5 file with mrmatrix
+        ("multi-resolution matrix") structure: Under the "resolutions"
+        group are datasets, named with successive powers of 2,
+        which represent successively higher aggregations of the input.
+    ''')
+    parser.add_argument('input_file', help='TSV file path')
+    parser.add_argument('output_file', help='HDF5 file')
+    parser.add_argument('-d', '--delimiter', type=str, default='\t',
+                        metavar='D', help='Delimiter; defaults to tab')
+    parser.add_argument('-n', '--first-n', type=int, default=None, metavar='N',
+                        help='Only read first N columns from first N rows')
+    parser.add_argument('-s', '--square', action='store_true',
+                        help='Row labels are assumed to match column labels')
+    parser.add_argument('-l', '--labelled', action='store_true',
+                        help='TSV Matrix has column and row labels')
+    args = parser.parse_args()
 
-    parse(f_in, h5py.File(args.output_file, 'w'), top_n)
+    height = get_height(args.input_file, is_labelled=args.labelled)
+    width = get_width(args.input_file, is_labelled=args.labelled,
+                      delimiter=args.delimiter)
+    print('height:', height)
+    print('width:', width)
+
+    f_in = open(args.input_file, 'r', newline='')
+
+    parse(f_in,
+          h5py.File(args.output_file, 'w'),
+          height=height, width=width,
+          delimiter=args.delimiter,
+          first_n=args.first_n,
+          is_square=args.square,
+          is_labelled=args.labelled)
 
     f = h5py.File(args.output_file, 'r')
     print("sum1:", np.nansum(f['resolutions']['1']['values'][0]))
diff --git a/setup.py b/setup.py
@@ -37,9 +37,12 @@
     packages=['clodius', 'clodius.cli', 'clodius.tiles'],
     setup_requires=setup_requires,
     install_requires=install_requires,
+    scripts=[
+        'scripts/tsv_to_mrmatrix.py'
+    ],
     entry_points={
         'console_scripts': [
-            'clodius = clodius.cli.aggregate:cli',
+                'clodius = clodius.cli.aggregate:cli'
             ]
         }
 )
diff --git a/test/tsv_to_mrmatrix_test.py b/test/tsv_to_mrmatrix_test.py
@@ -7,7 +7,7 @@
 from numpy.testing import assert_array_equal
 import h5py
 
-from scripts.tsv_to_mrmatrix import coarsen, parse
+from scripts.tsv_to_mrmatrix import coarsen, parse, get_height, get_width
 
 
 class CoarsenTest(unittest.TestCase):
@@ -21,7 +21,8 @@ def test_5_layer_pyramid(self):
             g = hdf5.create_group('resolutions')
             g1 = g.create_group('1')
             ds = g1.create_dataset('values', (max_width, max_width),
-                                   dtype='f4', compression='lzf', fillvalue=np.nan)
+                                   dtype='f4', compression='lzf',
+                                   fillvalue=np.nan)
             for y in range(max_width):
                 a = np.array([float(x) for x in range(max_width)])
                 ds[y, :max_width] = a
@@ -70,7 +71,8 @@ def test_math(self):
             g = hdf5.create_group('resolutions')
             g1 = g.create_group('1')
             ds = g1.create_dataset('values', (max_width, max_width),
-                                   dtype='f4', compression='lzf', fillvalue=np.nan)
+                                   dtype='f4', compression='lzf',
+                                   fillvalue=np.nan)
             for y in range(max_width):
                 a = np.array([float(x) for x in range(max_width)])
                 ds[y, :max_width] = a
@@ -107,7 +109,7 @@ def test_math(self):
 
 
 class ParseTest(unittest.TestCase):
-    def test_parse(self):
+    def test_wide_labelled_square(self):
         with TemporaryDirectory() as tmp_dir:
             csv_path = tmp_dir + '/tmp.csv'
             with open(csv_path, 'w', newline='') as csv_file:
@@ -127,7 +129,11 @@ def test_parse(self):
             hdf5_path = tmp_dir + 'tmp.hdf5'
             hdf5_write_handle = h5py.File(hdf5_path, 'w')
 
-            parse(csv_handle, hdf5_write_handle)
+            height = get_height(csv_path)
+            width = get_width(csv_path, is_labelled=True)
+            parse(csv_handle, hdf5_write_handle, height, width,
+                  delimiter='\t', first_n=None, is_square=True,
+                  is_labelled=True)
 
             hdf5 = h5py.File(hdf5_path, 'r')
             self.assertEqual(list(hdf5.keys()), ['labels', 'resolutions'])
@@ -158,3 +164,118 @@ def test_parse(self):
             assert_array_equal(res_2[4], [0] * 256)
             assert_array_equal(res_2[5], [0] * 256)
             assert_array_equal(res_2[6], [0] * 256)
+            # TODO: We lose nan at higher aggregations.
+            # https://github.com/higlass/clodius/issues/62
+
+    def _assert_unlabelled_roundtrip_lt_256(
+            self, matrix, delimiter, is_square):
+        with TemporaryDirectory() as tmp_dir:
+            csv_path = tmp_dir + '/tmp.csv'
+            with open(csv_path, 'w', newline='') as csv_file:
+                writer = csv.writer(csv_file, delimiter=delimiter)
+                # body:
+                for row in matrix:
+                    writer.writerow(row)
+
+            csv_handle = open(csv_path, 'r')
+            hdf5_path = tmp_dir + 'tmp.hdf5'
+            hdf5_write_handle = h5py.File(hdf5_path, 'w')
+
+            is_labelled = False
+            height = get_height(csv_path, is_labelled=is_labelled)
+            width = get_width(csv_path, is_labelled=is_labelled)
+            parse(csv_handle, hdf5_write_handle, height, width,
+                  first_n=None, is_labelled=is_labelled,
+                  delimiter=delimiter, is_square=is_square)
+
+            hdf5 = h5py.File(hdf5_path, 'r')
+            self.assertEqual(list(hdf5.keys()), ['resolutions'])
+            self.assertEqual(list(hdf5['resolutions'].keys()), ['1'])
+            self.assertEqual(list(hdf5['resolutions']['1'].keys()),
+                             ['nan_values', 'values'])
+            assert_array_equal(
+                hdf5['resolutions']['1']['nan_values'],
+                [[0] * len(matrix[0])] * len(matrix)
+            )
+            assert_array_equal(
+                hdf5['resolutions']['1']['values'],
+                matrix
+            )
+
+    def test_unlabelled_csv_is_square_true(self):
+        self._assert_unlabelled_roundtrip_lt_256(
+            matrix=[[x + y for x in range(4)] for y in range(4)],
+            delimiter=',',
+            is_square=True
+        )
+
+    def test_unlabelled_tsv_is_square_false(self):
+        self._assert_unlabelled_roundtrip_lt_256(
+            matrix=[[x + y for x in range(4)] for y in range(4)],
+            delimiter='\t',
+            is_square=False
+        )
+
+    def _assert_unlabelled_roundtrip_1024(
+            self, matrix, first_row=None, first_col=None, first_n=None):
+        delimiter = '\t'
+        is_square = False
+        with TemporaryDirectory() as tmp_dir:
+            csv_path = tmp_dir + '/tmp.csv'
+            with open(csv_path, 'w', newline='') as csv_file:
+                writer = csv.writer(csv_file, delimiter=delimiter)
+                # body:
+                for row in matrix:
+                    writer.writerow(row)
+
+            csv_handle = open(csv_path, 'r')
+            hdf5_path = tmp_dir + 'tmp.hdf5'
+            hdf5_write_handle = h5py.File(hdf5_path, 'w')
+
+            is_labelled = False
+            height = get_height(csv_path, is_labelled=is_labelled)
+            width = get_width(csv_path, is_labelled=is_labelled)
+            parse(csv_handle, hdf5_write_handle, height, width,
+                  first_n=first_n, is_labelled=is_labelled,
+                  delimiter=delimiter, is_square=is_square)
+
+            hdf5 = h5py.File(hdf5_path, 'r')
+            self.assertEqual(list(hdf5.keys()), ['resolutions'])
+            self.assertEqual(list(hdf5['resolutions'].keys()), ['1', '2', '4'])
+            self.assertEqual(list(hdf5['resolutions']['1'].keys()),
+                             ['nan_values', 'values'])
+            self.assertEqual(list(hdf5['resolutions']['4'].keys()),
+                             ['values'])
+            res_4 = hdf5['resolutions']['4']['values']
+            if first_row:
+                assert_array_equal(res_4[0], first_row)
+            if first_col:
+                assert_array_equal(
+                    [res_4[y][0] for y in range(len(first_col))],
+                    first_col)
+
+    def test_unlabelled_tsv_tall(self):
+        self._assert_unlabelled_roundtrip_1024(
+            matrix=[[1 for x in range(4)] for y in range(1000)],
+            first_col=[16] * 250 + [0] * 6
+        )
+
+    def test_unlabelled_tsv_wide(self):
+        self._assert_unlabelled_roundtrip_1024(
+            matrix=[[1 for x in range(1000)] for y in range(4)],
+            first_row=[16] * 250 + [0] * 6
+        )
+
+    def test_unlabelled_tsv_tall_first_n(self):
+        self._assert_unlabelled_roundtrip_1024(
+            matrix=[[1 for x in range(4)] for y in range(1000)],
+            first_col=[8] + [0] * 255,
+            first_n=2
+        )
+
+    def test_unlabelled_tsv_wide_first_n(self):
+        self._assert_unlabelled_roundtrip_1024(
+            matrix=[[1 for x in range(1000)] for y in range(4)],
+            first_row=[8] * 250 + [0] * 6,
+            first_n=2
+        )
diff --git a/travis_test.sh b/travis_test.sh
@@ -8,7 +8,12 @@ die() { set +v; echo "$*" 1>&2 ; sleep 1; exit 1; }
 # https://github.com/travis-ci/travis-ci/issues/6018
 
 start flake8
-flake8
+# TODO:
+# - Get more files to lint cleanly.
+# - Reduce the number of errors which are ignored everywhere else.
+flake8 --config=.flake8-ignore
+flake8 test/tsv_to_mrmatrix_test.py \
+       scripts/tsv_to_mrmatrix.py
 end flake8
 
 start download

Original file line number	Diff line number	Diff line change
`@@ -37,9 +37,12 @@`
`37`	`37`	`packages=['clodius', 'clodius.cli', 'clodius.tiles'],`
`38`	`38`	`setup_requires=setup_requires,`
`39`	`39`	`install_requires=install_requires,`
	`40`	`+ scripts=[`
	`41`	`+ 'scripts/tsv_to_mrmatrix.py'`
	`42`	`+ ],`
`40`	`43`	`entry_points={`
`41`	`44`	`'console_scripts': [`
`42`		`- 'clodius = clodius.cli.aggregate:cli',`
	`45`	`+ 'clodius = clodius.cli.aggregate:cli'`
`43`	`46`	`]`
`44`	`47`	`}`
`45`	`48`	`)`