From 6b6e2f25668a2ca1aefc966627a2043ac56084f6 Mon Sep 17 00:00:00 2001 From: Thomas Sibley Date: Wed, 19 Feb 2020 21:43:27 -0800 Subject: [PATCH 1/3] Open files with newline="" The csv module documentation says this is necessary in some cases, such as for parsing embedded newlines in quoted fields. Refer to the footnote on csv.reader in the csv module documentation. --- csv_diff/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csv_diff/cli.py b/csv_diff/cli.py index 99c916f..a3aefca 100644 --- a/csv_diff/cli.py +++ b/csv_diff/cli.py @@ -33,7 +33,7 @@ ) def cli(previous, current, key, json, singular, plural): "Diff two CSV files" - diff = compare(load_csv(open(previous), key=key), load_csv(open(current), key=key)) + diff = compare(load_csv(open(previous, newline=""), key=key), load_csv(open(current, newline=""), key=key)) if json: print(std_json.dumps(diff, indent=4)) else: From a7db89354d658d4fd98d5f66e58cd0cd514c8853 Mon Sep 17 00:00:00 2001 From: Thomas Sibley Date: Wed, 19 Feb 2020 23:05:54 -0800 Subject: [PATCH 2/3] Support TSV as a "dialect" of CSV For seekable streams, the delimiter is sniffed from the first 1MB of data. This should provide enough rows to the sniffer even for datasets with very long rows without blowing up memory usage much. A csv.Dialect may also be specified directly to load_csv() for programmatic usage. --- csv_diff/__init__.py | 13 +++++++++++-- tests/test_csv_diff.py | 17 +++++++++++++++++ 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/csv_diff/__init__.py b/csv_diff/__init__.py index 34ce22b..10da480 100644 --- a/csv_diff/__init__.py +++ b/csv_diff/__init__.py @@ -4,8 +4,17 @@ import hashlib -def load_csv(fp, key=None): - fp = csv.reader(fp) +def load_csv(fp, key=None, dialect=None): + if dialect is None and fp.seekable(): + # Peek at first 1MB to sniff the delimiter and other dialect details + peek = fp.read(1024**2) + fp.seek(0) + try: + dialect = csv.Sniffer().sniff(peek, delimiters=",\t") + except csv.Error: + # Oh well, we tried. 
Fall back to the default. + pass + fp = csv.reader(fp, dialect=(dialect or 'excel')) headings = next(fp) rows = [dict(zip(headings, line)) for line in fp] if key: diff --git a/tests/test_csv_diff.py b/tests/test_csv_diff.py index 41efb0f..f886db1 100644 --- a/tests/test_csv_diff.py +++ b/tests/test_csv_diff.py @@ -9,6 +9,10 @@ 1,Cleo,5 2,Pancakes,2""" +TWO_TSV = """id\tname\tage +1\tCleo\t5 +2\tPancakes\t2""" + THREE = """id,name,age 1,Cleo,5""" @@ -86,3 +90,16 @@ def test_columns_changed(): "columns_added": ["weight"], "columns_removed": ["age"], } == diff + + +def test_tsv(): + diff = compare( + load_csv(io.StringIO(ONE), key="id"), load_csv(io.StringIO(TWO_TSV), key="id") + ) + assert { + "added": [], + "removed": [], + "changed": [{"key": "1", "changes": {"age": ["4", "5"]}}], + "columns_added": [], + "columns_removed": [], + } == diff From a799273caecd53da7c6ce307421d2c95b75fb4c3 Mon Sep 17 00:00:00 2001 From: Thomas Sibley Date: Thu, 20 Feb 2020 09:49:10 -0800 Subject: [PATCH 3/3] Add a --format option to support explicit dialect selection Useful when you want to disable sniffing or when one or both of the files aren't seekable, so sniffing doesn't work. --- csv_diff/cli.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/csv_diff/cli.py b/csv_diff/cli.py index a3aefca..bb26f86 100644 --- a/csv_diff/cli.py +++ b/csv_diff/cli.py @@ -16,6 +16,9 @@ @click.option( "--key", type=str, default=None, help="Column to use as a unique ID for each row" ) +@click.option( + "--format", type=click.Choice(["csv", "tsv"]), default=None, help="Explicitly specify input format (csv, tsv) instead of auto-detecting" +) @click.option( "--json", type=bool, default=False, help="Output changes as JSON", is_flag=True ) @@ -31,9 +34,15 @@ default=None, help="Plural word to use, e.g. 
'trees' for '2 trees'", ) -def cli(previous, current, key, json, singular, plural): +def cli(previous, current, key, format, json, singular, plural): "Diff two CSV files" - diff = compare(load_csv(open(previous, newline=""), key=key), load_csv(open(current, newline=""), key=key)) + dialect = { + "csv": "excel", + "tsv": "excel-tab", + } + def load(filename): + return load_csv(open(filename, newline=""), key=key, dialect=dialect.get(format)) + diff = compare(load(previous), load(current)) if json: print(std_json.dumps(diff, indent=4)) else: