-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsubsample.py
158 lines (130 loc) · 4.94 KB
/
subsample.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#!/usr/bin/env python
from optparse import OptionParser
import sys
import csv
from numpy.random import rand
from numpy.random import seed as randomseed
import common
def main():
    r"""
    DESCRIPTION
    -----------
    Subsample files or stdin and write to stdout. Optionally subsample in the
    space of different values of a KEY_COLUMN. When doing this, every time a
    new key value appears, decide whether or not to keep all rows containing
    this value.
    NOTES
    -----
    Assumes the first row is a header.
    EXAMPLES
    ---------
    Subsample a comma delimited dataset and redirect output to a new file
    $ python subsample.py data.csv > subsampled_data.csv
    Subsample, keeping only 10% of rows
    $ python subsample.py -r 0.1 data.csv
    Subsample, keeping 10% of different values in the 'height' column
    $ python subsample.py -r 0.1 -k height data.csv
    """
    # NOTE: the docstring above doubles as the --help text, so it is written
    # for the command-line user rather than for callers of main().
    usage = "usage: %prog [options] dataset"
    # Docstrings are discarded under `python -OO`; fall back to an empty
    # string so building the usage text cannot raise a TypeError.
    usage += '\n' + (main.__doc__ or '')
    parser = OptionParser(usage=usage)
    parser.add_option(
        "-r", "--subsample_rate",
        help="Subsample subsample_rate, 0 <= r <= 1. E.g. r = 0.1 keeps 10% "
        "of rows. [default: %default] ",
        action="store", dest='subsample_rate', type=float, default=0.01)
    parser.add_option(
        "-d", "--delimiter",
        help="Use DELIMITER as the column delimiter. [default: %default]",
        action="store", dest='delimiter', default=',')
    parser.add_option(
        "-k", "--key_column",
        help="Subsample in the space of values of key_column. "
        "[default: %default]",
        action="store", dest="key_column", default=None)
    parser.add_option(
        "-s", "--seed",
        help="Integer to seed the random number generator with. "
        "[default: %default] ",
        action="store", dest='seed', type=int, default=None)
    parser.add_option(
        "-o", "--outfilename",
        help="Write to this file rather than stdout. [default: %default]",
        action="store", dest='outfilename', default=None)
    (opt, args) = parser.parse_args()

    ### Parse args
    # At most one positional argument (the input file) is accepted. Report
    # misuse through the parser instead of `assert`: asserts are stripped
    # under `python -O` and show the user a traceback rather than the usage.
    if len(args) > 1:
        parser.error(
            "expected at most one dataset argument, got %d" % len(args))
    # If an argument is given, then it is the 'infilename'
    # If no arguments are given, set infilename equal to None
    infilename = args[0] if args else None

    ## Handle the options
    # Accept several spellings of "tab", since a literal tab character is
    # awkward to type on a command line.
    if opt.delimiter in ['t', '\\t', '\t', 'tab']:
        opt.delimiter = '\t'

    ## Get the infile/outfile
    infile, outfile = common.get_inout_files(infilename, opt.outfilename)

    ## Call the function that does the real work
    try:
        subsample(
            infile, outfile, opt.subsample_rate, opt.delimiter,
            opt.key_column, opt.seed)
    finally:
        # Release the files even if subsampling fails part-way through.
        common.close_files(infile, outfile)
def subsample(
        infile, outfile, subsample_rate=0.01, delimiter=',', key_column=None,
        seed=None):
    """
    Subsample infile and write to outfile.

    The header row of infile is always copied to outfile; the remaining rows
    are kept or dropped at random.

    Parameters
    ----------
    infile : File open in read mode
        File should be delimited text and have a header
    outfile : File open in write mode
        Output is written here
    subsample_rate : Real number in the interval [0, 1]
        Keep this fraction of rows/key-values
    delimiter : Single character string
        The delimiter of infile.  Also used for outfile.
    key_column : String
        A column name.
        If given, subsample in the space of values of key_column.
        Otherwise, subsample in the space of rows.
    seed : Integer
        If given (including 0), use this to seed the random number generator.

    Returns
    -------
    None
        If infile is completely empty (no header), nothing is written.
    """
    ## Seed the random number generator for deterministic results.
    # Compare against None explicitly: 0 is a legitimate seed but is falsy,
    # so a plain truthiness test would silently ignore it.
    if seed is not None:
        randomseed(seed)

    ## Get the csv reader and writer.  Use these to read/write the files.
    reader = csv.DictReader(infile, delimiter=delimiter)
    header = reader.fieldnames
    # A completely empty infile has no header row, which leaves fieldnames as
    # None; writing a header from None would raise a TypeError, and there is
    # nothing to subsample anyway.
    if header is None:
        return

    writer = csv.DictWriter(outfile, delimiter=delimiter, fieldnames=header)
    writer.writeheader()

    ## Iterate through the file and print a selection of rows
    if key_column:
        _subsample_using_keys(reader, writer, subsample_rate, key_column)
    else:
        _subsample_without_keys(reader, writer, subsample_rate)
def _subsample_without_keys(reader, writer, subsample_rate):
for row in reader:
if subsample_rate > rand():
writer.writerow(row)
def _subsample_using_keys(reader, writer, subsample_rate, key_column):
"""
Iterate through reader, for every new value in key_column, decide whether
or not to print ALL rows with that value.
"""
keys_to_use = set()
keys_to_not_use = set()
for row in reader:
key_value = row[key_column]
if key_value in keys_to_use:
writer.writerow(row)
elif key_value not in keys_to_not_use:
if subsample_rate > rand():
keys_to_use.add(key_value)
writer.writerow(row)
else:
keys_to_not_use.add(key_value)
# Run the command-line interface only when executed as a script, so that
# importing this module has no side effects.
if __name__ == "__main__":
    main()