Skip to content

Commit b63e116

Browse files
committed
Support multi-column keys and ignoring multiple columns
1 parent 33e0a59 commit b63e116

File tree

5 files changed

+146
-10
lines changed

5 files changed

+146
-10
lines changed

README.md

+4-1
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,10 @@ Consider two CSV files:
4949
name: Pancakes
5050
age: 2
5151

52-
The `--key=id` option means that the `id` column should be treated as the unique key, to identify which records have changed.
52+
The `--key=id` option means that the `id` column should be treated as the unique key, to identify which records have changed. To use a combination of columns as the key, separate them with a comma, e.g., `--key=id1,id2`.
53+
54+
The `--ignore=col` option means that the `col` column will be ignored during the comparison. To ignore multiple columns, separate them with a comma,
55+
e.g., `--ignore=col1,col2`.
5356

5457
The tool will automatically detect if your files are comma- or tab-separated. You can override this automatic detection and force the tool to use a specific format using `--format=tsv` or `--format=csv`.
5558

csv_diff/__init__.py

+11-5
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,10 @@
22
from dictdiffer import diff
33
import json
44
import hashlib
5+
from operator import itemgetter
56

67

7-
def load_csv(fp, key=None, dialect=None):
8+
def load_csv(fp, key=None, dialect=None, ignore=None):
89
if dialect is None and fp.seekable():
910
# Peek at first 1MB to sniff the delimiter and other dialect details
1011
peek = fp.read(1024 ** 2)
@@ -16,24 +17,29 @@ def load_csv(fp, key=None, dialect=None):
1617
pass
1718
fp = csv.reader(fp, dialect=(dialect or "excel"))
1819
headings = next(fp)
19-
rows = [dict(zip(headings, line)) for line in fp]
20+
ignore = set(ignore.split(',')) if ignore else set()
21+
rows = [dict( (k, v) for k,v in dict(zip(headings, line)).items() if k not in ignore) for line in fp]
2022
if key:
21-
keyfn = lambda r: r[key]
23+
keyfn = itemgetter(*key.split(','))
2224
else:
2325
keyfn = lambda r: hashlib.sha1(
2426
json.dumps(r, sort_keys=True).encode("utf8")
2527
).hexdigest()
2628
return {keyfn(r): r for r in rows}
2729

2830

29-
def load_json(fp, key=None):
31+
def load_json(fp, key=None, ignore=None):
3032
raw_list = json.load(fp)
3133
assert isinstance(raw_list, list)
34+
if ignore:
35+
for r in raw_list:
36+
for k in ignore.split(','):
37+
r.pop(k, None)
3238
common_keys = set()
3339
for item in raw_list:
3440
common_keys.update(item.keys())
3541
if key:
36-
keyfn = lambda r: r[key]
42+
keyfn = itemgetter(*key.split(','))
3743
else:
3844
keyfn = lambda r: hashlib.sha1(
3945
json.dumps(r, sort_keys=True).encode("utf8")

csv_diff/cli.py

+13-4
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,16 @@
1414
type=click.Path(exists=True, file_okay=True, dir_okay=False, allow_dash=False),
1515
)
1616
@click.option(
17-
"--key", type=str, default=None, help="Column to use as a unique ID for each row"
17+
"--key",
18+
type=str,
19+
default=None,
20+
help="Column(s) to use as a unique ID for each row. To use multiple keys, separate them with a comma, e.g., key1,key2"
21+
)
22+
@click.option(
23+
"--ignore",
24+
type=str,
25+
default=None,
26+
help="Column(s) to be ignored. To ignore multiple keys, separate them with a comma, e.g., key1,key2"
1827
)
1928
@click.option(
2029
"--format",
@@ -42,7 +51,7 @@
4251
is_flag=True,
4352
help="Show unchanged fields for rows with at least one change",
4453
)
45-
def cli(previous, current, key, format, json, singular, plural, show_unchanged):
54+
def cli(previous, current, key, ignore, format, json, singular, plural, show_unchanged):
4655
"Diff two CSV or JSON files"
4756
dialect = {
4857
"csv": "excel",
@@ -51,10 +60,10 @@ def cli(previous, current, key, format, json, singular, plural, show_unchanged):
5160

5261
def load(filename):
5362
if format == "json":
54-
return load_json(open(filename), key=key)
63+
return load_json(open(filename), key=key, ignore=ignore)
5564
else:
5665
return load_csv(
57-
open(filename, newline=""), key=key, dialect=dialect.get(format)
66+
open(filename, newline=""), key=key, dialect=dialect.get(format), ignore=ignore
5867
)
5968

6069
diff = compare(load(previous), load(current), show_unchanged)

tests/test_cli.py

+68
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,32 @@ def json_files(tmpdir):
4040
return str(one), str(two)
4141

4242

43+
@pytest.fixture
44+
def json_files_two(tmpdir):
45+
one = tmpdir / "one.json"
46+
one.write(
47+
json.dumps(
48+
[
49+
{"state": "CA", "county": "Yikes", "pop": 100, "extra": 1},
50+
{"state": "NY", "county": "Beep", "pop": 200, "extra": 2 },
51+
{"state": "CA", "county": "Zoinks", "pop": 100 },
52+
{"state": "NY", "county": "Zoinks", "pop": 200 }
53+
]
54+
)
55+
)
56+
two = tmpdir / "two.json"
57+
two.write(
58+
json.dumps(
59+
[
60+
{"state": "CA", "county": "Yikes", "pop": 100},
61+
{"state": "NY", "county": "Beep", "pop": 200, "extra": 2 },
62+
{"state": "CA", "county": "Zoinks", "pop": 300 },
63+
{"state": "NY", "county": "Zoinks", "pop": 200 }
64+
]
65+
)
66+
)
67+
return str(one), str(two)
68+
4369
def test_human_cli(tmpdir):
4470
one = tmpdir / "one.csv"
4571
one.write(ONE)
@@ -234,3 +260,45 @@ def test_semicolon_delimited(tmpdir):
234260
"columns_added": [],
235261
"columns_removed": [],
236262
} == json.loads(result.output.strip())
263+
264+
265+
def test_multikey(json_files_two):
266+
# https://github.com/simonw/csv-diff/issues/7
267+
one, two = json_files_two
268+
result = CliRunner().invoke(
269+
cli.cli,
270+
[one, two, "--key", "state,county", "--json", "--format", "json"],
271+
catch_exceptions=False,
272+
)
273+
assert 0 == result.exit_code
274+
assert {
275+
"added": [],
276+
"removed": [],
277+
"changed": [
278+
{"key": ["CA", "Yikes"], "changes": {"extra": [1, None]}},
279+
{"key": ["CA", "Zoinks"], "changes": {"pop": [100, 300]}},
280+
],
281+
"columns_added": [],
282+
"columns_removed": [],
283+
} == json.loads(result.output.strip())
284+
285+
286+
287+
def test_ignore(json_files_two):
288+
# https://github.com/simonw/csv-diff/issues/7
289+
one, two = json_files_two
290+
result = CliRunner().invoke(
291+
cli.cli,
292+
[one, two, "--key", "state,county", "--ignore", "extra", "--json", "--format", "json"],
293+
catch_exceptions=False,
294+
)
295+
assert 0 == result.exit_code
296+
assert {
297+
"added": [],
298+
"removed": [],
299+
"changed": [
300+
{"key": ["CA", "Zoinks"], "changes": {"pop": [100, 300]}},
301+
],
302+
"columns_added": [],
303+
"columns_removed": [],
304+
} == json.loads(result.output.strip())

tests/test_csv_diff.py

+50
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,29 @@
5151
1,Cleo,5
5252
2,Pancakes,3"""
5353

54+
ELEVEN = """state,county,pop
55+
CA,Yikes,100
56+
NY,Beep,200
57+
CA,Zoinks,100
58+
NY,Zoinks,200
59+
"""
60+
61+
TWELVE = """state,county,pop
62+
CA,Yikes,100
63+
NY,Beep,200
64+
CA,Zoinks,300
65+
NY,Zoinks,200
66+
"""
67+
68+
THIRTEEN = """id,name,age,sex
69+
1,Cleo,5,male
70+
2,Pancakes,4,female
71+
"""
72+
73+
FOURTEEN = """id,name,age,sex
74+
1,Cleo,5,female
75+
2,Pancakes,3,female
76+
"""
5477

5578
def test_row_changed():
5679
diff = compare(
@@ -115,3 +138,30 @@ def test_tsv():
115138
"columns_added": [],
116139
"columns_removed": [],
117140
} == diff
141+
142+
def test_multikey():
143+
diff = compare(
144+
load_csv(io.StringIO(ELEVEN), key="state,county"),
145+
load_csv(io.StringIO(TWELVE), key="state,county"),
146+
)
147+
assert {
148+
"added": [],
149+
"removed": [],
150+
"changed": [{"key": ("CA", "Zoinks"), "changes": {"pop": ["100", "300"]}}],
151+
"columns_added": [],
152+
"columns_removed": [],
153+
} == diff
154+
155+
156+
def test_ignore_columns():
157+
diff = compare(
158+
load_csv(io.StringIO(THIRTEEN), key="id", ignore="sex"),
159+
load_csv(io.StringIO(FOURTEEN), key="id", ignore="sex"),
160+
)
161+
assert {
162+
"added": [],
163+
"removed": [],
164+
"changed": [{"key": "2", "changes": {"age": ["4", "3"]}}],
165+
"columns_added": [],
166+
"columns_removed": [],
167+
} == diff

0 commit comments

Comments
 (0)