|
| 1 | +#!/usr/bin/env python |
| 2 | +from optparse import OptionParser |
| 3 | +import sys |
| 4 | +import csv |
| 5 | +# Optional: raise the csv per-field size limit to 10,000,000 characters (currently disabled) |
| 6 | +#csv.field_size_limit(10000000) |
| 7 | + |
| 8 | +import jrl_utils.src.common as common |
| 9 | +from jrl_utils.src.common import BadDataError |
| 10 | + |
| 11 | + |
def _cli():
    r"""
    Removes rows in csv file (or stdin) with header where columns don't meet certain
    criteria.

    Examples
    ---------
    Keep rows in curriculum.csv where the subject contains the word 'algebra'
    $ python row_filter.py -n subject -C algebra curriculum.csv

    Keep rows in curriculum.csv where the subject doesn't contain the word 'algebra'
    $ python row_filter.py -n subject -c algebra curriculum.csv

    Keep rows in curriculum.csv where the subject equals the word 'algebra'
    $ python row_filter.py -n subject -E algebra curriculum.csv

    Keep rows in curriculum.csv where the subject doesn't equal the word 'algebra'
    $ python row_filter.py -n subject -e algebra curriculum.csv
    """
    # NOTE: the docstring above is appended to the usage text, so it is
    # user-facing output; keep implementation notes in comments instead.
    usage = "usage: %prog [options] files"
    usage += '\n' + _cli.__doc__
    parser = OptionParser(usage=usage)
    parser.add_option(
        "-d", "--delimiter",
        help="Use DELIMITER as the column delimiter.  [default: %default]",
        action="store", dest='delimiter', default=',')
    parser.add_option(
        "-n", "--name",
        help="Name of the column to filter on.  [default: %default]",
        action="store", dest='name', default=None)
    parser.add_option(
        "-C", "--contains",
        help="Column with name = NAME must contain CONTAINS else we kill that row. "
        "[default: %default]",
        action='store', dest='contains', default=None)
    parser.add_option(
        "-E", "--equals",
        help="Column with name = NAME must equal EQUALS else we kill that row. "
        "[default: %default]",
        action='store', dest='equals', default=None)
    parser.add_option(
        "-e", "--notequals",
        help="Column with name = NAME must not equal NOTEQUALS else we kill that row. "
        "[default: %default]",
        action='store', dest='notequals', default=None)
    parser.add_option(
        "-c", "--notcontains",
        help="Column with name = NAME must not contain NOTCONTAINS else we kill that row."
        " [default: %default]",
        action='store', dest='notcontains', default=None)
    parser.add_option(
        "-o", "--outfilename",
        help="Write to this file rather than stdout.  [default: %default]",
        action="store", dest='outfilename', default=None)

    (opt, args) = parser.parse_args()

    # Fail fast with a usage message instead of a traceback on the first
    # data row when the invocation cannot possibly work.
    if opt.name is None:
        parser.error("a column name is required (-n/--name)")
    if not (opt.contains or opt.equals or opt.notequals or opt.notcontains):
        parser.error(
            "one of -C/--contains, -E/--equals, -e/--notequals, "
            "-c/--notcontains is required")

    # No positional argument: None is handed to common.get_inout_files.
    infilename = args[0] if args else None

    # NOTE(review): outmode='wb' is what the Python 2 csv module expects;
    # confirm how common.get_inout_files treats it if this runs on Python 3.
    infile, outfile = common.get_inout_files(infilename, opt.outfilename, outmode='wb')

    # Close the files even when filtering raises part-way through.
    try:
        column_filter(infile, outfile, opt.delimiter, opt)
    finally:
        common.close_files(infile, outfile)
| 77 | + |
| 78 | + |
def column_filter(infile, outfile, delimiter, opt):
    """
    Copy the header and every row that passes the filter from infile to outfile.

    Parameters
    ----------
    infile : file-like
        Open delimited text whose first row is the header.
    outfile : file-like
        Receives the header followed by the rows that are kept.
    delimiter : str
        Column delimiter used for both reading and writing.
    opt : object
        Needs the attribute ``name`` (the column to test) plus the attributes
        read by ``_shouldwrite`` (``equals``, ``contains``, ``notequals``,
        ``notcontains``).

    Raises
    ------
    KeyError
        If a data row has no column called ``opt.name``.
    """
    reader = csv.DictReader(infile, delimiter=delimiter)
    header = reader.fieldnames
    if header is None:
        # Completely empty input: there is no header to echo, and
        # DictWriter.writeheader() would fail on fieldnames=None.
        return

    writer = csv.DictWriter(outfile, delimiter=delimiter, fieldnames=header)
    writer.writeheader()

    for row in reader:
        try:
            content = row[opt.name]
        except KeyError:
            # Same exception type as a bare lookup, but say what went wrong.
            raise KeyError(
                "Column %r not found in header %r" % (opt.name, header))
        if _shouldwrite(content, opt):
            writer.writerow(row)
| 94 | + |
| 95 | + |
| 96 | +def _shouldwrite(content, opt): |
| 97 | + if opt.equals and content: |
| 98 | + shouldwrite = content == opt.equals |
| 99 | + elif opt.contains and content: |
| 100 | + shouldwrite = opt.contains in content |
| 101 | + elif opt.notequals: |
| 102 | + if not content: |
| 103 | + shouldwrite = True |
| 104 | + else: |
| 105 | + shouldwrite = content != opt.notequals |
| 106 | + elif opt.notcontains: |
| 107 | + if not content: |
| 108 | + shouldwrite = True |
| 109 | + else: |
| 110 | + shouldwrite = opt.notcontains not in content |
| 111 | + else: |
| 112 | + raise ValueError( |
| 113 | + "Unable to determine what to filter. options = %s" % opt.__dict__) |
| 114 | + |
| 115 | + return shouldwrite |
| 116 | + |
| 117 | + |
# Run the command-line interface only when executed as a script.
if "__main__" == __name__:
    _cli()
0 commit comments