Skip to content

Commit 22b4bf6

Browse files
committed
ADD: row_filter
1 parent 02e42be commit 22b4bf6

File tree

1 file changed

+119
-0
lines changed

1 file changed

+119
-0
lines changed

row_filter.py

+119
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
#!/usr/bin/env python
2+
from optparse import OptionParser
3+
import sys
4+
import csv
5+
# Optionally raise the csv maximum field size to 10 million characters
6+
#csv.field_size_limit(10000000)
7+
8+
import jrl_utils.src.common as common
9+
from jrl_utils.src.common import BadDataError
10+
11+
12+
def _cli():
    r"""
    Removes rows in csv file (or stdin) with header where columns don't meet certain
    criteria.

    Examples
    ---------
    Keep rows in curriculum.csv where the subject contains the word 'algebra'
    $ python row_filter.py -n subject -C algebra curriculum.csv

    Keep rows in curriculum.csv where the subject doesn't contain the word 'algebra'
    $ python row_filter.py -n subject -c algebra curriculum.csv

    Keep rows in curriculum.csv where the subject equals the word 'algebra'
    $ python row_filter.py -n subject -E algebra curriculum.csv

    Keep rows in curriculum.csv where the subject doesn't equal the word 'algebra'
    $ python row_filter.py -n subject -e algebra curriculum.csv
    """
    # NOTE: the docstring above doubles as the --help text (it is appended to
    # the usage string below), so its wording is user-facing output.
    usage = "usage: %prog [options] files"
    usage += '\n'+_cli.__doc__
    parser = OptionParser(usage=usage)
    parser.add_option(
        "-d", "--delimiter",
        help="Use DELIMITER as the column delimiter. [default: %default]",
        action="store", dest='delimiter', default=',')
    parser.add_option(
        "-n", "--name",
        help="Name of the columm to filter on. [default: %default]",
        action="store", dest='name', default=None)
    parser.add_option(
        "-C", "--contains",
        help="Column with name = NAME must contain CONTAINS else we kill that row. "
        "[default: %default]",
        action='store', dest='contains', default=None)
    parser.add_option(
        "-E", "--equals",
        help="Column with name = NAME must equal EQUALS else we kill that row. "
        "[default: %default]",
        action='store', dest='equals', default=None)
    parser.add_option(
        "-e", "--notequals",
        help="Column with name = NAME must not equal NOTEQUALS else we kill that row. "
        "[default: %default]",
        action='store', dest='notequals', default=None)
    parser.add_option(
        "-c", "--notcontains",
        help="Column with name = NAME must not contain NOTCONTAINS else we kill that row."
        " [default: %default]",
        action='store', dest='notcontains', default=None)
    parser.add_option(
        "-o", "--outfilename",
        help="Write to this file rather than stdout. [default: %default]",
        action="store", dest='outfilename', default=None)

    (opt, args) = parser.parse_args()

    # Fail fast with a usage message instead of a traceback on the first data
    # row: the filter needs a column name and at least one non-empty criterion.
    if opt.name is None:
        parser.error("a column name is required (use -n/--name)")
    if not (opt.contains or opt.equals or opt.notequals or opt.notcontains):
        parser.error(
            "a filter is required (use one of -C, -E, -e or -c)")

    ### Parse args
    # Only the first positional argument is used as the input file name.
    infilename = args[0] if args else None

    # NOTE(review): get_inout_files presumably falls back to stdin/stdout when
    # a name is None (as the help text implies) -- confirm in jrl_utils.
    infile, outfile = common.get_inout_files(infilename, opt.outfilename, outmode='wb')

    # Always release the files, even when filtering raises part-way through.
    try:
        column_filter(infile, outfile, opt.delimiter, opt)
    finally:
        common.close_files(infile, outfile)
77+
78+
79+
def column_filter(infile, outfile, delimiter, opt):
    """
    Copy the header and the rows that pass the filter from infile to outfile.

    Parameters
    ----------
    infile : iterable of csv lines (e.g. an open file); first line is the header
    outfile : object with a ``write`` method that receives the csv output
    delimiter : single-character column delimiter used for reading and writing
    opt : object with attributes ``name`` (column to test) plus ``equals``,
        ``contains``, ``notequals`` and ``notcontains`` (the filter criteria)

    Raises
    ------
    KeyError
        If ``opt.name`` is not one of the header's column names.  Raised
        before anything is written to outfile.
    """
    ## Get the csv reader.  reader.fieldnames gives you the header
    reader = csv.DictReader(infile, delimiter=delimiter)
    fieldnames = reader.fieldnames

    # Completely empty input: there is no header to echo and no rows to
    # filter, and DictWriter.writeheader() cannot handle fieldnames=None.
    if fieldnames is None:
        return

    # Reject an unknown column up front rather than failing on the first data
    # row after the header has already been emitted.
    if opt.name not in fieldnames:
        raise KeyError(
            "Column %r not found in header %s" % (opt.name, fieldnames))

    writer = csv.DictWriter(outfile, delimiter=delimiter, fieldnames=fieldnames)
    writer.writeheader()

    ## Iterate through the file, writing out the rows that pass the filter
    for row in reader:
        if _shouldwrite(row[opt.name], opt):
            writer.writerow(row)
94+
95+
96+
def _shouldwrite(content, opt):
    """
    Decide whether a row should be kept, given the value of its filter column.

    The first non-empty criterion on ``opt`` wins, checked in this order:
    ``equals``, ``contains``, ``notequals``, ``notcontains``.

    Parameters
    ----------
    content : value of the filter column for this row; may be '' or None
    opt : object with attributes ``equals``, ``contains``, ``notequals`` and
        ``notcontains`` (each a string, or None/'' when unused)

    Returns
    -------
    bool : True if the row should be written.

    Raises
    ------
    ValueError
        If none of the four criteria is set.
    """
    if opt.equals:
        # An empty/missing cell can never equal a non-empty target, so it is
        # simply dropped rather than falling through to the other criteria.
        return content == opt.equals
    if opt.contains:
        # Guard against None (missing cell) before the substring test.
        return bool(content) and opt.contains in content
    if opt.notequals:
        # An empty/missing cell differs from any non-empty target, so it is kept.
        return content != opt.notequals
    if opt.notcontains:
        # An empty/missing cell cannot contain the target, so it is kept.
        return not content or opt.notcontains not in content
    raise ValueError(
        "Unable to determine what to filter. options = %s" % opt.__dict__)
116+
117+
118+
# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":
    _cli()

0 commit comments

Comments
 (0)