From 49a894ee3c738f5b0509d5b192a867ed339e5dc8 Mon Sep 17 00:00:00 2001
From: Arijit Sircar <arijit.sircar@gmail.com>
Date: Mon, 29 Jan 2024 21:13:38 +0000
Subject: [PATCH] Added support for multi-keys on csvs

load_csv() now accepts a list or tuple of column names as `key`. The row
key is then built from the values of those columns, taken in the order
given and joined with the new `key_sep` argument (default '-'). Passing a
single column name as `key` behaves exactly as before, and omitting `key`
still falls back to the SHA-1 of the JSON-serialised row.

test_multi_key exercises a two-column key (name + state) with
key_sep='~'. build/* is added to .gitignore.
---
Reviewer notes (ignored when the patch is applied):
 * The composite-key check uses isinstance(key, (list, tuple)) instead of
   type(key) == list, so tuples and list subclasses are accepted too.
   The post-image blob id on the csv_diff/__init__.py index line was
   generated before that one-line change.
 * Column values that themselves contain key_sep can still produce
   colliding composite keys; callers should pick a separator that does
   not occur in the key columns.

 .gitignore             |  1 +
 csv_diff/__init__.py   |  7 +++++--
 tests/test_csv_diff.py | 25 +++++++++++++++++++++++++
 3 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/.gitignore b/.gitignore
index df13fc5..b2cd23c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,3 +9,4 @@ venv
 .DS_Store
 .schema
 .vscode
+build/*
diff --git a/csv_diff/__init__.py b/csv_diff/__init__.py
index 650966d..53a52b9 100644
--- a/csv_diff/__init__.py
+++ b/csv_diff/__init__.py
@@ -4,7 +4,7 @@
 import hashlib
 
 
-def load_csv(fp, key=None, dialect=None):
+def load_csv(fp, key=None, dialect=None, key_sep='-'):
     if dialect is None and fp.seekable():
         # Peek at first 1MB to sniff the delimiter and other dialect details
         peek = fp.read(1024 ** 2)
@@ -18,7 +18,10 @@ def load_csv(fp, key=None, dialect=None):
     headings = next(fp)
     rows = [dict(zip(headings, line)) for line in fp]
     if key:
-        keyfn = lambda r: r[key]
+        if isinstance(key, (list, tuple)):  # composite key: join the listed columns with key_sep - order matters
+            keyfn = lambda r: key_sep.join([r[x] for x in key])
+        else:
+            keyfn = lambda r: r[key]
     else:
         keyfn = lambda r: hashlib.sha1(
             json.dumps(r, sort_keys=True).encode("utf8")
diff --git a/tests/test_csv_diff.py b/tests/test_csv_diff.py
index 0e3670f..3857217 100644
--- a/tests/test_csv_diff.py
+++ b/tests/test_csv_diff.py
@@ -51,6 +51,19 @@
 1,Cleo,5
 2,Pancakes,3"""
 
+ELEVEN = """name, state, age
+Ann, UT, 56
+Ann, NY, 24
+Lisa, CA, 35
+Bill, FL, 33
+Bill, WY, 23"""
+
+TWELVE = """name, state, age
+Ann, UT, 45
+Ann, NY, 24
+Lisa, CA, 35
+Bill, WY, 23"""
+
 
 def test_row_changed():
     diff = compare(
@@ -115,3 +128,15 @@ def test_tsv():
         "columns_added": [],
         "columns_removed": [],
     } == diff
+
+def test_multi_key():
+    diff = compare(
+        load_csv(io.StringIO(ELEVEN), key=["name", "state"], key_sep='~'), load_csv(io.StringIO(TWELVE), key=["name", "state"], key_sep='~'),
+    )
+    assert {
+        'added': [],
+        'removed': [{'name': 'Bill', 'state': 'FL', 'age': '33'}],
+        'changed': [{'key': 'Ann~UT', 'changes': {'age': ['56', '45']}}],
+        'columns_added': [],
+        'columns_removed': [],
+    } == diff
\ No newline at end of file