Add TSV support: load_csv() sniffs the delimiter (comma or tab) when no
dialect is given, and the CLI gains a --format option to override detection.
Note: the post-image blob hashes on the "index" lines of the two csv_diff/
files were not regenerated after the added lines were revised in review.

diff --git a/csv_diff/__init__.py b/csv_diff/__init__.py
index 34ce22b..10da480 100644
--- a/csv_diff/__init__.py
+++ b/csv_diff/__init__.py
@@ -4,8 +4,24 @@
 import hashlib
 
 
-def load_csv(fp, key=None):
-    fp = csv.reader(fp)
+def load_csv(fp, key=None, dialect=None):
+    """Read delimited rows from ``fp``, using the first row as headings.
+
+    ``dialect`` is handed to ``csv.reader``. If it is None and ``fp`` is
+    seekable, up to 1024**2 characters are read to sniff a comma or tab
+    delimiter and ``fp`` is rewound; otherwise, or if sniffing fails,
+    the "excel" dialect is used.
+    """
+    if dialect is None and fp.seekable():
+        # Peek at first 1MB to sniff the delimiter and other dialect details
+        peek = fp.read(1024**2)
+        fp.seek(0)
+        try:
+            dialect = csv.Sniffer().sniff(peek, delimiters=",\t")
+        except csv.Error:
+            # Oh well, we tried. Fall back to the default.
+            pass
+    fp = csv.reader(fp, dialect=(dialect or 'excel'))
     headings = next(fp)
     rows = [dict(zip(headings, line)) for line in fp]
     if key:
diff --git a/csv_diff/cli.py b/csv_diff/cli.py
index 99c916f..bb26f86 100644
--- a/csv_diff/cli.py
+++ b/csv_diff/cli.py
@@ -16,6 +16,9 @@
 @click.option(
     "--key", type=str, default=None, help="Column to use as a unique ID for each row"
 )
+@click.option(
+    "--format", type=click.Choice(["csv", "tsv"]), default=None, help="Explicitly specify input format (csv, tsv) instead of auto-detecting"
+)
 @click.option(
     "--json", type=bool, default=False, help="Output changes as JSON", is_flag=True
 )
@@ -31,9 +34,21 @@
     default=None,
     help="Plural word to use, e.g. 'trees' for '2 trees'",
 )
-def cli(previous, current, key, json, singular, plural):
+def cli(previous, current, key, format, json, singular, plural):
     "Diff two CSV files"
-    diff = compare(load_csv(open(previous), key=key), load_csv(open(current), key=key))
+    # Map the --format choice to a csv module dialect name; a missing
+    # --format yields None, which leaves load_csv to auto-detect.
+    dialect = {
+        "csv": "excel",
+        "tsv": "excel-tab",
+    }
+
+    def load(filename):
+        # Close the file once load_csv has read every row into memory.
+        with open(filename, newline="") as fp:
+            return load_csv(fp, key=key, dialect=dialect.get(format))
+
+    diff = compare(load(previous), load(current))
     if json:
         print(std_json.dumps(diff, indent=4))
     else:
diff --git a/tests/test_csv_diff.py b/tests/test_csv_diff.py
index 41efb0f..f886db1 100644
--- a/tests/test_csv_diff.py
+++ b/tests/test_csv_diff.py
@@ -9,6 +9,10 @@
 1,Cleo,5
 2,Pancakes,2"""
 
+TWO_TSV = """id\tname\tage
+1\tCleo\t5
+2\tPancakes\t2"""
+
 THREE = """id,name,age
 1,Cleo,5"""
 
@@ -86,3 +90,16 @@ def test_columns_changed():
         "columns_added": ["weight"],
         "columns_removed": ["age"],
     } == diff
+
+
+def test_tsv():
+    diff = compare(
+        load_csv(io.StringIO(ONE), key="id"), load_csv(io.StringIO(TWO_TSV), key="id")
+    )
+    assert {
+        "added": [],
+        "removed": [],
+        "changed": [{"key": "1", "changes": {"age": ["4", "5"]}}],
+        "columns_added": [],
+        "columns_removed": [],
+    } == diff