diff --git a/README.md b/README.md index ecd93b9..18123e5 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,10 @@ Consider two CSV files: name: Pancakes age: 2 -The `--key=id` option means that the `id` column should be treated as the unique key, to identify which records have changed. +The `--key=id` option means that the `id` column should be treated as the unique key, to identify which records have changed. To use a combination of columns as the key, separate them with a comma, e.g., `--key=id1,id2`. + +The `--ignore=col` option means that the `col` column will be ignored during the comparison. To ignore multiple columns, separate them with a comma, +e.g., `--ignore=col1,col2`. The tool will automatically detect if your files are comma- or tab-separated. You can over-ride this automatic detection and force the tool to use a specific format using `--format=tsv` or `--format=csv`. diff --git a/csv_diff/__init__.py b/csv_diff/__init__.py index 650966d..e386e63 100644 --- a/csv_diff/__init__.py +++ b/csv_diff/__init__.py @@ -2,9 +2,10 @@ from dictdiffer import diff import json import hashlib +from operator import itemgetter -def load_csv(fp, key=None, dialect=None): +def load_csv(fp, key=None, dialect=None, ignore=None): if dialect is None and fp.seekable(): # Peek at first 1MB to sniff the delimiter and other dialect details peek = fp.read(1024 ** 2) @@ -16,9 +17,10 @@ def load_csv(fp, key=None, dialect=None): pass fp = csv.reader(fp, dialect=(dialect or "excel")) headings = next(fp) - rows = [dict(zip(headings, line)) for line in fp] + ignore = set(ignore.split(',')) if ignore else set() + rows = [dict( (k, v) for k,v in zip(headings, line) if k not in ignore) for line in fp] if key: - keyfn = lambda r: r[key] + keyfn = itemgetter(*key.split(',')) else: keyfn = lambda r: hashlib.sha1( json.dumps(r, sort_keys=True).encode("utf8") @@ -26,14 +28,18 @@ def load_csv(fp, key=None, dialect=None): return {keyfn(r): r for r in rows} -def load_json(fp, key=None): +def 
load_json(fp, key=None, ignore=None): raw_list = json.load(fp) assert isinstance(raw_list, list) + if ignore: + for item in raw_list: + for field in ignore.split(','): + item.pop(field, None) common_keys = set() for item in raw_list: common_keys.update(item.keys()) if key: - keyfn = lambda r: r[key] + keyfn = itemgetter(*key.split(',')) else: keyfn = lambda r: hashlib.sha1( json.dumps(r, sort_keys=True).encode("utf8") diff --git a/csv_diff/cli.py b/csv_diff/cli.py index 919c81e..c72222a 100644 --- a/csv_diff/cli.py +++ b/csv_diff/cli.py @@ -14,7 +14,16 @@ type=click.Path(exists=True, file_okay=True, dir_okay=False, allow_dash=False), ) @click.option( - "--key", type=str, default=None, help="Column to use as a unique ID for each row" + "--key", + type=str, + default=None, + help="Column(s) to use as a unique ID for each row. To use multiple keys, separate them with a comma, e.g., key1,key2" +) +@click.option( + "--ignore", + type=str, + default=None, + help="Column(s) to be ignored. To ignore multiple columns, separate them with a comma, e.g., col1,col2" ) @click.option( "--format", @@ -42,7 +51,7 @@ is_flag=True, help="Show unchanged fields for rows with at least one change", ) -def cli(previous, current, key, format, json, singular, plural, show_unchanged): +def cli(previous, current, key, ignore, format, json, singular, plural, show_unchanged): "Diff two CSV or JSON files" dialect = { "csv": "excel", @@ -51,10 +60,10 @@ def cli(previous, current, key, format, json, singular, plural, show_unchanged): def load(filename): if format == "json": - return load_json(open(filename), key=key) + return load_json(open(filename), key=key, ignore=ignore) else: return load_csv( - open(filename, newline=""), key=key, dialect=dialect.get(format) + open(filename, newline=""), key=key, dialect=dialect.get(format), ignore=ignore ) diff = compare(load(previous), load(current), show_unchanged) diff --git a/tests/test_cli.py b/tests/test_cli.py index eb18d73..459f094 100644 --- 
a/tests/test_cli.py +++ b/tests/test_cli.py @@ -40,6 +40,32 @@ def json_files(tmpdir): return str(one), str(two) +@pytest.fixture +def json_files_two(tmpdir): + one = tmpdir / "one.json" + one.write( + json.dumps( + [ + {"state": "CA", "county": "Yikes", "pop": 100, "extra": 1}, + {"state": "NY", "county": "Beep", "pop": 200, "extra": 2 }, + {"state": "CA", "county": "Zoinks", "pop": 100 }, + {"state": "NY", "county": "Zoinks", "pop": 200 } + ] + ) + ) + two = tmpdir / "two.json" + two.write( + json.dumps( + [ + {"state": "CA", "county": "Yikes", "pop": 100}, + {"state": "NY", "county": "Beep", "pop": 200, "extra": 2 }, + {"state": "CA", "county": "Zoinks", "pop": 300 }, + {"state": "NY", "county": "Zoinks", "pop": 200 } + ] + ) + ) + return str(one), str(two) + def test_human_cli(tmpdir): one = tmpdir / "one.csv" one.write(ONE) @@ -234,3 +260,45 @@ def test_semicolon_delimited(tmpdir): "columns_added": [], "columns_removed": [], } == json.loads(result.output.strip()) + + +def test_multikey(json_files_two): + # https://github.com/simonw/csv-diff/issues/7 + one, two = json_files_two + result = CliRunner().invoke( + cli.cli, + [one, two, "--key", "state,county", "--json", "--format", "json"], + catch_exceptions=False, + ) + assert 0 == result.exit_code + assert { + "added": [], + "removed": [], + "changed": [ + {"key": ["CA", "Yikes"], "changes": {"extra": [1, None]}}, + {"key": ["CA", "Zoinks"], "changes": {"pop": [100, 300]}}, + ], + "columns_added": [], + "columns_removed": [], + } == json.loads(result.output.strip()) + + + +def test_ignore(json_files_two): + # https://github.com/simonw/csv-diff/issues/7 + one, two = json_files_two + result = CliRunner().invoke( + cli.cli, + [one, two, "--key", "state,county", "--ignore", "extra", "--json", "--format", "json"], + catch_exceptions=False, + ) + assert 0 == result.exit_code + assert { + "added": [], + "removed": [], + "changed": [ + {"key": ["CA", "Zoinks"], "changes": {"pop": [100, 300]}}, + ], + "columns_added": 
[], + "columns_removed": [], + } == json.loads(result.output.strip()) \ No newline at end of file diff --git a/tests/test_csv_diff.py b/tests/test_csv_diff.py index 0e3670f..0cb0db2 100644 --- a/tests/test_csv_diff.py +++ b/tests/test_csv_diff.py @@ -51,6 +51,29 @@ 1,Cleo,5 2,Pancakes,3""" +ELEVEN = """state,county,pop +CA,Yikes,100 +NY,Beep,200 +CA,Zoinks,100 +NY,Zoinks,200 +""" + +TWELVE = """state,county,pop +CA,Yikes,100 +NY,Beep,200 +CA,Zoinks,300 +NY,Zoinks,200 +""" + +THIRTEEN = """id,name,age,sex +1,Cleo,5,male +2,Pancakes,4,female +""" + +FOURTEEN = """id,name,age,sex +1,Cleo,5,female +2,Pancakes,3,female +""" def test_row_changed(): diff = compare( @@ -115,3 +138,30 @@ def test_tsv(): "columns_added": [], "columns_removed": [], } == diff + +def test_multikey(): + diff = compare( + load_csv(io.StringIO(ELEVEN), key="state,county"), + load_csv(io.StringIO(TWELVE), key="state,county"), + ) + assert { + "added": [], + "removed": [], + "changed": [{"key": ("CA", "Zoinks"), "changes": {"pop": ["100", "300"]}}], + "columns_added": [], + "columns_removed": [], + } == diff + + +def test_ignore_columns(): + diff = compare( + load_csv(io.StringIO(THIRTEEN), key="id", ignore="sex"), + load_csv(io.StringIO(FOURTEEN), key="id", ignore="sex"), + ) + assert { + "added": [], + "removed": [], + "changed": [{"key": "2", "changes": {"age": ["4", "3"]}}], + "columns_added": [], + "columns_removed": [], + } == diff