Skip to content

Commit dafc422

Browse files
authored
mrmatrix more flexible (#71)
* Use csv module instead of parsing by hand; Fix #63 * better description * Aspirational command-line options * Checkpoint: Need to update test * checkpoint: still buggy [skip ci] * Runs, but we get one less resolution * Fix test * tsv_to_mrmatrix_test.py whitespace * autopep8 * flake8 clean * changelog [skip ci] * start test unlabelled * unlabelled square * Make test more generic * Test aggregation of tall and wide datasets * Test tall and wide aggregation * If source data not power of 2... * Handle first n * click-ify * Un-click-ify: Argparse feels good-enough for now * update changelog [skip ci] * Fix whitespace; one failing test locally * Comments and logs * missing paren
1 parent 1e275f4 commit dafc422

File tree

6 files changed

+219
-59
lines changed

6 files changed

+219
-59
lines changed

Diff for: .flake8 renamed to .flake8-ignore

File renamed without changes.

Diff for: CHANGELOG

+4
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
in progress
2+
3+
- Make tsv_to_mrmatrix more flexible and add it to the exported scripts.
4+
15
v0.10.7
26

37
- Changed bins_per_dimension in npvector.tileset_info to match the value in

Diff for: scripts/tsv_to_mrmatrix.py

+79-52
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,10 @@
11
#!/usr/bin/python
22

3+
import csv
34
import dask.array as da
45
import h5py
56
import math
67
import numpy as np
7-
import os
8-
import os.path as op
9-
import sys
108
import argparse
119
import time
1210

@@ -17,9 +15,7 @@ def coarsen(f, tile_size=256):
1715
'''
1816
grid = f['resolutions']['1']['values']
1917
top_n = grid.shape[0]
20-
2118
max_zoom = math.ceil(math.log(top_n / tile_size) / math.log(2))
22-
max_width = tile_size * 2 ** max_zoom
2319

2420
chunk_size = tile_size * 16
2521
curr_size = grid.shape
@@ -43,84 +39,115 @@ def coarsen(f, tile_size=256):
4339
da.store(dask_dset, values)
4440

4541

46-
def parse(input_handle, output_hdf5, top_n=None):
47-
input_handle
48-
first_line = next(input_handle)
49-
parts = first_line.strip().split('\t')
50-
# TODO: Use the python built-in csv module, instead of parsing by handself.
51-
# https://github.com/higlass/clodius/issues/63
42+
def parse(input_handle, output_hdf5, height, width,
43+
delimiter, first_n, is_square, is_labelled):
44+
reader = csv.reader(input_handle, delimiter=delimiter)
45+
if is_labelled:
46+
first_row = next(reader)
47+
labels = first_row[1:(first_n + 1) if first_n else None]
48+
if is_square:
49+
output_hdf5.create_dataset(
50+
'labels',
51+
data=np.array(labels, dtype=h5py.special_dtype(vlen=str)),
52+
compression='lzf')
53+
# TODO: Handle non-square labels
54+
# https://github.com/higlass/clodius/issues/68
5255

53-
if top_n is None:
54-
top_n = len(parts) - 1
55-
# TODO: If it's taller than it is wide, it will be truncated to a square,
56-
# unless an explicit top_n is provided.
57-
# https://github.com/higlass/clodius/issues/64
58-
59-
labels = parts[1:top_n + 1]
6056
tile_size = 256
61-
max_zoom = math.ceil(math.log(top_n / tile_size) / math.log(2))
57+
limit = max(height, width)
58+
max_zoom = math.ceil(math.log(limit / tile_size) / math.log(2))
6259
max_width = tile_size * 2 ** max_zoom
6360

64-
labels_dset = output_hdf5.create_dataset('labels', data=np.array(labels, dtype=h5py.special_dtype(vlen=str)),
65-
compression='lzf')
66-
6761
g = output_hdf5.create_group('resolutions')
6862
g1 = g.create_group('1')
6963
ds = g1.create_dataset('values', (max_width, max_width),
7064
dtype='f4', compression='lzf', fillvalue=np.nan)
71-
ds1 = g1.create_dataset('nan_values', (max_width, max_width),
72-
dtype='f4', compression='lzf', fillvalue=0)
73-
# TODO: We don't write to this, and I think we want the nan_values for every resolution.
74-
# https://github.com/higlass/clodius/issues/62
65+
g1.create_dataset('nan_values', (max_width, max_width),
66+
dtype='f4', compression='lzf', fillvalue=0)
67+
# TODO: We don't write to this... Is it necessary?
7568

7669
start_time = time.time()
7770
counter = 0
78-
for line in input_handle:
79-
parts = line.strip().split('\t')[1:top_n + 1]
80-
x = np.array([float(p) for p in parts])
71+
for row in reader:
72+
x = np.array([float(p) for p in row[1 if is_labelled else None:]])
8173
ds[counter, :len(x)] = x
8274

8375
counter += 1
84-
if counter == top_n:
76+
if counter == first_n:
8577
break
8678

8779
time_elapsed = time.time() - start_time
8880
time_per_entry = time_elapsed / counter
8981

90-
time_remaining = time_per_entry * (top_n - counter)
82+
time_remaining = time_per_entry * (height - counter)
9183
print("counter:", counter, "sum(x):", sum(x),
9284
"time remaining: {:d} seconds".format(int(time_remaining)))
9385

9486
coarsen(output_hdf5)
9587
output_hdf5.close()
9688

9789

98-
def main():
99-
parser = argparse.ArgumentParser(description="""
100-
101-
python tsv-dense-to-sparse
102-
""")
90+
def get_height(input_path, is_labelled=True):
91+
'''
92+
We need to scan the file once just to see how many lines it contains.
93+
If it is tall and narrow, the first tile will need to be larger than just
94+
looking at the width of the first row would suggest.
95+
'''
96+
with open(input_path) as f:
97+
for i, l in enumerate(f):
98+
pass
99+
if is_labelled:
100+
return i
101+
else:
102+
return i + 1
103103

104-
parser.add_argument('input_file')
105-
parser.add_argument('output_file')
106-
# parser.add_argument('-o', '--options', default='yo',
107-
# help="Some option", type='str')
108-
# parser.add_argument('-u', '--useless', action='store_true',
109-
# help='Another useless option')
110-
parser.add_argument('-n', '--first-n', type=int, default=None,
111-
help="Only use the first n entries in the matrix")
112104

113-
args = parser.parse_args()
105+
def get_width(input_path, is_labelled, delimiter='\t'):
106+
'''
107+
Assume the number of elements in the first row is the total width.
108+
'''
109+
with open(input_path, 'r', newline='') as input_handle:
110+
reader = csv.reader(input_handle, delimiter=delimiter)
111+
len_row = len(next(reader))
112+
if is_labelled:
113+
return len_row - 1
114+
return len_row
114115

115-
count = 0
116-
top_n = args.first_n
117116

118-
if args.input_file == '-':
119-
f_in = sys.stdin
120-
else:
121-
f_in = open(args.input_file, 'r')
117+
def main():
118+
parser = argparse.ArgumentParser(description='''
119+
Given a tab-delimited file, produces an HDF5 file with mrmatrix
120+
("multi-resolution matrix") structure: Under the "resolutions"
121+
group are datasets, named with successive powers of 2,
122+
which represent successively higher aggregations of the input.
123+
''')
124+
parser.add_argument('input_file', help='TSV file path')
125+
parser.add_argument('output_file', help='HDF5 file')
126+
parser.add_argument('-d', '--delimiter', type=str, default='\t',
127+
metavar='D', help='Delimiter; defaults to tab')
128+
parser.add_argument('-n', '--first-n', type=int, default=None, metavar='N',
129+
help='Only read first N columns from first N rows')
130+
parser.add_argument('-s', '--square', action='store_true',
131+
help='Row labels are assumed to match column labels')
132+
parser.add_argument('-l', '--labelled', action='store_true',
133+
help='TSV Matrix has column and row labels')
134+
args = parser.parse_args()
122135

123-
parse(f_in, h5py.File(args.output_file, 'w'), top_n)
136+
height = get_height(args.input_file, is_labelled=args.labelled)
137+
width = get_width(args.input_file, is_labelled=args.labelled,
138+
delimiter=args.delimiter)
139+
print('height:', height)
140+
print('width:', width)
141+
142+
f_in = open(args.input_file, 'r', newline='')
143+
144+
parse(f_in,
145+
h5py.File(args.output_file, 'w'),
146+
height=height, width=width,
147+
delimiter=args.delimiter,
148+
first_n=args.first_n,
149+
is_square=args.square,
150+
is_labelled=args.labelled)
124151

125152
f = h5py.File(args.output_file, 'r')
126153
print("sum1:", np.nansum(f['resolutions']['1']['values'][0]))

Diff for: setup.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -37,9 +37,12 @@
3737
packages=['clodius', 'clodius.cli', 'clodius.tiles'],
3838
setup_requires=setup_requires,
3939
install_requires=install_requires,
40+
scripts=[
41+
'scripts/tsv_to_mrmatrix.py'
42+
],
4043
entry_points={
4144
'console_scripts': [
42-
'clodius = clodius.cli.aggregate:cli',
45+
'clodius = clodius.cli.aggregate:cli'
4346
]
4447
}
4548
)

Diff for: test/tsv_to_mrmatrix_test.py

+126-5
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from numpy.testing import assert_array_equal
88
import h5py
99

10-
from scripts.tsv_to_mrmatrix import coarsen, parse
10+
from scripts.tsv_to_mrmatrix import coarsen, parse, get_height, get_width
1111

1212

1313
class CoarsenTest(unittest.TestCase):
@@ -21,7 +21,8 @@ def test_5_layer_pyramid(self):
2121
g = hdf5.create_group('resolutions')
2222
g1 = g.create_group('1')
2323
ds = g1.create_dataset('values', (max_width, max_width),
24-
dtype='f4', compression='lzf', fillvalue=np.nan)
24+
dtype='f4', compression='lzf',
25+
fillvalue=np.nan)
2526
for y in range(max_width):
2627
a = np.array([float(x) for x in range(max_width)])
2728
ds[y, :max_width] = a
@@ -70,7 +71,8 @@ def test_math(self):
7071
g = hdf5.create_group('resolutions')
7172
g1 = g.create_group('1')
7273
ds = g1.create_dataset('values', (max_width, max_width),
73-
dtype='f4', compression='lzf', fillvalue=np.nan)
74+
dtype='f4', compression='lzf',
75+
fillvalue=np.nan)
7476
for y in range(max_width):
7577
a = np.array([float(x) for x in range(max_width)])
7678
ds[y, :max_width] = a
@@ -107,7 +109,7 @@ def test_math(self):
107109

108110

109111
class ParseTest(unittest.TestCase):
110-
def test_parse(self):
112+
def test_wide_labelled_square(self):
111113
with TemporaryDirectory() as tmp_dir:
112114
csv_path = tmp_dir + '/tmp.csv'
113115
with open(csv_path, 'w', newline='') as csv_file:
@@ -127,7 +129,11 @@ def test_parse(self):
127129
hdf5_path = tmp_dir + 'tmp.hdf5'
128130
hdf5_write_handle = h5py.File(hdf5_path, 'w')
129131

130-
parse(csv_handle, hdf5_write_handle)
132+
height = get_height(csv_path)
133+
width = get_width(csv_path, is_labelled=True)
134+
parse(csv_handle, hdf5_write_handle, height, width,
135+
delimiter='\t', first_n=None, is_square=True,
136+
is_labelled=True)
131137

132138
hdf5 = h5py.File(hdf5_path, 'r')
133139
self.assertEqual(list(hdf5.keys()), ['labels', 'resolutions'])
@@ -158,3 +164,118 @@ def test_parse(self):
158164
assert_array_equal(res_2[4], [0] * 256)
159165
assert_array_equal(res_2[5], [0] * 256)
160166
assert_array_equal(res_2[6], [0] * 256)
167+
# TODO: We lose nan at higher aggregations.
168+
# https://github.com/higlass/clodius/issues/62
169+
170+
def _assert_unlabelled_roundtrip_lt_256(
171+
self, matrix, delimiter, is_square):
172+
with TemporaryDirectory() as tmp_dir:
173+
csv_path = tmp_dir + '/tmp.csv'
174+
with open(csv_path, 'w', newline='') as csv_file:
175+
writer = csv.writer(csv_file, delimiter=delimiter)
176+
# body:
177+
for row in matrix:
178+
writer.writerow(row)
179+
180+
csv_handle = open(csv_path, 'r')
181+
hdf5_path = tmp_dir + 'tmp.hdf5'
182+
hdf5_write_handle = h5py.File(hdf5_path, 'w')
183+
184+
is_labelled = False
185+
height = get_height(csv_path, is_labelled=is_labelled)
186+
width = get_width(csv_path, is_labelled=is_labelled)
187+
parse(csv_handle, hdf5_write_handle, height, width,
188+
first_n=None, is_labelled=is_labelled,
189+
delimiter=delimiter, is_square=is_square)
190+
191+
hdf5 = h5py.File(hdf5_path, 'r')
192+
self.assertEqual(list(hdf5.keys()), ['resolutions'])
193+
self.assertEqual(list(hdf5['resolutions'].keys()), ['1'])
194+
self.assertEqual(list(hdf5['resolutions']['1'].keys()),
195+
['nan_values', 'values'])
196+
assert_array_equal(
197+
hdf5['resolutions']['1']['nan_values'],
198+
[[0] * len(matrix[0])] * len(matrix)
199+
)
200+
assert_array_equal(
201+
hdf5['resolutions']['1']['values'],
202+
matrix
203+
)
204+
205+
def test_unlabelled_csv_is_square_true(self):
206+
self._assert_unlabelled_roundtrip_lt_256(
207+
matrix=[[x + y for x in range(4)] for y in range(4)],
208+
delimiter=',',
209+
is_square=True
210+
)
211+
212+
def test_unlabelled_tsv_is_square_false(self):
213+
self._assert_unlabelled_roundtrip_lt_256(
214+
matrix=[[x + y for x in range(4)] for y in range(4)],
215+
delimiter='\t',
216+
is_square=False
217+
)
218+
219+
def _assert_unlabelled_roundtrip_1024(
220+
self, matrix, first_row=None, first_col=None, first_n=None):
221+
delimiter = '\t'
222+
is_square = False
223+
with TemporaryDirectory() as tmp_dir:
224+
csv_path = tmp_dir + '/tmp.csv'
225+
with open(csv_path, 'w', newline='') as csv_file:
226+
writer = csv.writer(csv_file, delimiter=delimiter)
227+
# body:
228+
for row in matrix:
229+
writer.writerow(row)
230+
231+
csv_handle = open(csv_path, 'r')
232+
hdf5_path = tmp_dir + 'tmp.hdf5'
233+
hdf5_write_handle = h5py.File(hdf5_path, 'w')
234+
235+
is_labelled = False
236+
height = get_height(csv_path, is_labelled=is_labelled)
237+
width = get_width(csv_path, is_labelled=is_labelled)
238+
parse(csv_handle, hdf5_write_handle, height, width,
239+
first_n=first_n, is_labelled=is_labelled,
240+
delimiter=delimiter, is_square=is_square)
241+
242+
hdf5 = h5py.File(hdf5_path, 'r')
243+
self.assertEqual(list(hdf5.keys()), ['resolutions'])
244+
self.assertEqual(list(hdf5['resolutions'].keys()), ['1', '2', '4'])
245+
self.assertEqual(list(hdf5['resolutions']['1'].keys()),
246+
['nan_values', 'values'])
247+
self.assertEqual(list(hdf5['resolutions']['4'].keys()),
248+
['values'])
249+
res_4 = hdf5['resolutions']['4']['values']
250+
if first_row:
251+
assert_array_equal(res_4[0], first_row)
252+
if first_col:
253+
assert_array_equal(
254+
[res_4[y][0] for y in range(len(first_col))],
255+
first_col)
256+
257+
def test_unlabelled_tsv_tall(self):
258+
self._assert_unlabelled_roundtrip_1024(
259+
matrix=[[1 for x in range(4)] for y in range(1000)],
260+
first_col=[16] * 250 + [0] * 6
261+
)
262+
263+
def test_unlabelled_tsv_wide(self):
264+
self._assert_unlabelled_roundtrip_1024(
265+
matrix=[[1 for x in range(1000)] for y in range(4)],
266+
first_row=[16] * 250 + [0] * 6
267+
)
268+
269+
def test_unlabelled_tsv_tall_first_n(self):
270+
self._assert_unlabelled_roundtrip_1024(
271+
matrix=[[1 for x in range(4)] for y in range(1000)],
272+
first_col=[8] + [0] * 255,
273+
first_n=2
274+
)
275+
276+
def test_unlabelled_tsv_wide_first_n(self):
277+
self._assert_unlabelled_roundtrip_1024(
278+
matrix=[[1 for x in range(1000)] for y in range(4)],
279+
first_row=[8] * 250 + [0] * 6,
280+
first_n=2
281+
)

Diff for: travis_test.sh

+6-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,12 @@ die() { set +v; echo "$*" 1>&2 ; sleep 1; exit 1; }
88
# https://github.com/travis-ci/travis-ci/issues/6018
99

1010
start flake8
11-
flake8
11+
# TODO:
12+
# - Get more files to lint cleanly.
13+
# - Reduce the number of errors which are ignored everywhere else.
14+
flake8 --config=.flake8-ignore
15+
flake8 test/tsv_to_mrmatrix_test.py \
16+
scripts/tsv_to_mrmatrix.py
1217
end flake8
1318

1419
start download

0 commit comments

Comments
 (0)