Support multi-column keys and ignoring multiple columns

gambler147 · gambler147 · commit b63e11671284 · 2021-06-21T22:26:16.000-04:00
diff --git a/README.md b/README.md
@@ -49,7 +49,10 @@ Consider two CSV files:
       name: Pancakes
       age: 2
 
-The `--key=id` option means that the `id` column should be treated as the unique key, to identify which records have changed.
+The `--key=id` option means that the `id` column should be treated as the unique key, to identify which records have changed. To use a combination of columns as the key, separate them with a comma, e.g., `--key=id1,id2`.
+
+The `--ignore=col` option means that the `col` column will be ignored during the comparison. To ignore multiple columns, separate them with commas,
+e.g., `--ignore=col1,col2`.
 
 The tool will automatically detect if your files are comma- or tab-separated. You can over-ride this automatic detection and force the tool to use a specific format using `--format=tsv` or `--format=csv`.
 
diff --git a/csv_diff/__init__.py b/csv_diff/__init__.py
@@ -2,9 +2,10 @@
 from dictdiffer import diff
 import json
 import hashlib
+from operator import itemgetter
 
 
-def load_csv(fp, key=None, dialect=None):
+def load_csv(fp, key=None, dialect=None, ignore=None):
     if dialect is None and fp.seekable():
         # Peek at first 1MB to sniff the delimiter and other dialect details
         peek = fp.read(1024 ** 2)
@@ -16,24 +17,29 @@ def load_csv(fp, key=None, dialect=None):
             pass
     fp = csv.reader(fp, dialect=(dialect or "excel"))
     headings = next(fp)
-    rows = [dict(zip(headings, line)) for line in fp]
+    ignore = set(ignore.split(',')) if ignore else set()
+    rows = [dict( (k, v) for k,v in dict(zip(headings, line)).items() if k not in ignore) for line in fp]
     if key:
-        keyfn = lambda r: r[key]
+        keyfn = itemgetter(*key.split(','))
     else:
         keyfn = lambda r: hashlib.sha1(
             json.dumps(r, sort_keys=True).encode("utf8")
         ).hexdigest()
     return {keyfn(r): r for r in rows}
 
 
-def load_json(fp, key=None):
+def load_json(fp, key=None, ignore=None):
     raw_list = json.load(fp)
     assert isinstance(raw_list, list)
+    if ignore:
+      for r in raw_list:
+        for k in ignore.split(','):
+            r.pop(k, None)
     common_keys = set()
     for item in raw_list:
         common_keys.update(item.keys())
     if key:
-        keyfn = lambda r: r[key]
+        keyfn = itemgetter(*key.split(','))
     else:
         keyfn = lambda r: hashlib.sha1(
             json.dumps(r, sort_keys=True).encode("utf8")
diff --git a/csv_diff/cli.py b/csv_diff/cli.py
@@ -14,7 +14,16 @@
     type=click.Path(exists=True, file_okay=True, dir_okay=False, allow_dash=False),
 )
 @click.option(
-    "--key", type=str, default=None, help="Column to use as a unique ID for each row"
+    "--key", 
+    type=str, 
+    default=None, 
+    help="Column(s) to use as a unique ID for each row. To use multiple keys, separate them with a comma, e.g., key1,key2"
+)
+@click.option(
+    "--ignore",
+    type=str, 
+    default=None, 
+    help="Column(s) to be ignored. To ignore multiple keys, separate them with a comma, e.g., key1,key2"
 )
 @click.option(
     "--format",
@@ -42,7 +51,7 @@
     is_flag=True,
     help="Show unchanged fields for rows with at least one change",
 )
-def cli(previous, current, key, format, json, singular, plural, show_unchanged):
+def cli(previous, current, key, ignore, format, json, singular, plural, show_unchanged):
     "Diff two CSV or JSON files"
     dialect = {
         "csv": "excel",
@@ -51,10 +60,10 @@ def cli(previous, current, key, format, json, singular, plural, show_unchanged):
 
     def load(filename):
         if format == "json":
-            return load_json(open(filename), key=key)
+            return load_json(open(filename), key=key, ignore=ignore)
         else:
             return load_csv(
-                open(filename, newline=""), key=key, dialect=dialect.get(format)
+                open(filename, newline=""), key=key, dialect=dialect.get(format), ignore=ignore
             )
 
     diff = compare(load(previous), load(current), show_unchanged)
diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -40,6 +40,32 @@ def json_files(tmpdir):
     return str(one), str(two)
 
 
+@pytest.fixture
+def json_files_two(tmpdir):
+    one = tmpdir / "one.json"
+    one.write(
+        json.dumps(
+            [
+                {"state": "CA", "county": "Yikes", "pop": 100, "extra": 1},
+                {"state": "NY", "county": "Beep", "pop": 200, "extra": 2 },
+                {"state": "CA", "county": "Zoinks", "pop": 100 },
+                {"state": "NY", "county": "Zoinks", "pop": 200 }
+            ]
+        )
+    )
+    two = tmpdir / "two.json"
+    two.write(
+        json.dumps(
+            [
+                {"state": "CA", "county": "Yikes", "pop": 100},
+                {"state": "NY", "county": "Beep", "pop": 200, "extra": 2 },
+                {"state": "CA", "county": "Zoinks", "pop": 300 },
+                {"state": "NY", "county": "Zoinks", "pop": 200 }
+            ]
+        )
+    )
+    return str(one), str(two)
+
 def test_human_cli(tmpdir):
     one = tmpdir / "one.csv"
     one.write(ONE)
@@ -234,3 +260,45 @@ def test_semicolon_delimited(tmpdir):
         "columns_added": [],
         "columns_removed": [],
     } == json.loads(result.output.strip())
+
+
+def test_multikey(json_files_two):
+    # https://github.com/simonw/csv-diff/issues/7
+    one, two = json_files_two
+    result = CliRunner().invoke(
+        cli.cli,
+        [one, two, "--key", "state,county", "--json", "--format", "json"],
+        catch_exceptions=False,
+    )
+    assert 0 == result.exit_code
+    assert {
+        "added": [],
+        "removed": [],
+        "changed": [
+          {"key": ["CA", "Yikes"], "changes": {"extra": [1, None]}},
+          {"key": ["CA", "Zoinks"], "changes": {"pop": [100, 300]}},
+        ],
+        "columns_added": [],
+        "columns_removed": [],
+    } == json.loads(result.output.strip())
+
+
+
+def test_ignore(json_files_two):
+    # https://github.com/simonw/csv-diff/issues/7
+    one, two = json_files_two
+    result = CliRunner().invoke(
+        cli.cli,
+        [one, two, "--key", "state,county", "--ignore", "extra", "--json", "--format", "json"],
+        catch_exceptions=False,
+    )
+    assert 0 == result.exit_code
+    assert {
+        "added": [],
+        "removed": [],
+        "changed": [
+          {"key": ["CA", "Zoinks"], "changes": {"pop": [100, 300]}},
+        ],
+        "columns_added": [],
+        "columns_removed": [],
+    } == json.loads(result.output.strip())
diff --git a/tests/test_csv_diff.py b/tests/test_csv_diff.py
@@ -51,6 +51,29 @@
 1,Cleo,5
 2,Pancakes,3"""
 
+ELEVEN = """state,county,pop
+CA,Yikes,100
+NY,Beep,200
+CA,Zoinks,100
+NY,Zoinks,200
+"""
+
+TWELVE = """state,county,pop
+CA,Yikes,100
+NY,Beep,200
+CA,Zoinks,300
+NY,Zoinks,200
+"""
+
+THIRTEEN = """id,name,age,sex
+1,Cleo,5,male
+2,Pancakes,4,female
+"""
+
+FOURTEEN = """id,name,age,sex
+1,Cleo,5,female
+2,Pancakes,3,female
+"""
 
 def test_row_changed():
     diff = compare(
@@ -115,3 +138,30 @@ def test_tsv():
         "columns_added": [],
         "columns_removed": [],
     } == diff
+
+def test_multikey():
+    diff = compare(
+        load_csv(io.StringIO(ELEVEN), key="state,county"),
+        load_csv(io.StringIO(TWELVE), key="state,county"),
+    )
+    assert {
+        "added": [],
+        "removed": [],
+        "changed": [{"key": ("CA", "Zoinks"), "changes": {"pop": ["100", "300"]}}],
+        "columns_added": [],
+        "columns_removed": [],
+    } == diff
+
+
+def test_ignore_columns():
+    diff = compare(
+        load_csv(io.StringIO(THIRTEEN), key="id", ignore="sex"),
+        load_csv(io.StringIO(FOURTEEN), key="id", ignore="sex"),
+    )
+    assert {
+        "added": [],
+        "removed": [],
+        "changed": [{"key": "2", "changes": {"age": ["4", "3"]}}],
+        "columns_added": [],
+        "columns_removed": [],
+    } == diff