From 49a894ee3c738f5b0509d5b192a867ed339e5dc8 Mon Sep 17 00:00:00 2001
From: Arijit Sircar <arijit.sircar@gmail.com>
Date: Mon, 29 Jan 2024 21:13:38 +0000
Subject: [PATCH] Add support for multi-column (composite) keys on CSVs

---
 .gitignore             |  1 +
 csv_diff/__init__.py   |  7 +++++--
 tests/test_csv_diff.py | 25 +++++++++++++++++++++++++
 3 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/.gitignore b/.gitignore
index df13fc5..b2cd23c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,3 +9,4 @@ venv
 .DS_Store
 .schema
 .vscode
+build/*
diff --git a/csv_diff/__init__.py b/csv_diff/__init__.py
index 650966d..53a52b9 100644
--- a/csv_diff/__init__.py
+++ b/csv_diff/__init__.py
@@ -4,7 +4,7 @@
 import hashlib
 
 
-def load_csv(fp, key=None, dialect=None):
+def load_csv(fp, key=None, dialect=None, key_sep='-'):
     if dialect is None and fp.seekable():
         # Peek at first 1MB to sniff the delimiter and other dialect details
         peek = fp.read(1024 ** 2)
@@ -18,7 +18,10 @@ def load_csv(fp, key=None, dialect=None):
     headings = next(fp)
     rows = [dict(zip(headings, line)) for line in fp]
     if key:
-        keyfn = lambda r: r[key]
+        if isinstance(key, (list, tuple)):  # composite key; column order matters
+            keyfn = lambda r: key_sep.join(r[col] for col in key)
+        else:  # single column name
+            keyfn = lambda r: r[key]
     else:
         keyfn = lambda r: hashlib.sha1(
             json.dumps(r, sort_keys=True).encode("utf8")
diff --git a/tests/test_csv_diff.py b/tests/test_csv_diff.py
index 0e3670f..3857217 100644
--- a/tests/test_csv_diff.py
+++ b/tests/test_csv_diff.py
@@ -51,6 +51,19 @@
 1,Cleo,5
 2,Pancakes,3"""
 
+ELEVEN = """name,state,age
+Ann,UT,56
+Ann,NY,24
+Lisa,CA,35
+Bill,FL,33
+Bill,WY,23"""  # "name" repeats; only (name, state) identifies a row
+
+TWELVE = """name,state,age
+Ann,UT,45
+Ann,NY,24
+Lisa,CA,35
+Bill,WY,23"""  # vs ELEVEN: Ann/UT age changed, Bill/FL removed
+
 
 def test_row_changed():
     diff = compare(
@@ -115,3 +128,15 @@ def test_tsv():
         "columns_added": [],
         "columns_removed": [],
     } == diff
+
+def test_multi_key():
+    """Rows are matched on the composite key built from the listed columns."""
+    old = load_csv(io.StringIO(ELEVEN), key=["name", "state"], key_sep="~")
+    new = load_csv(io.StringIO(TWELVE), key=["name", "state"], key_sep="~")
+    assert compare(old, new) == {
+        "added": [],
+        "removed": [{"name": "Bill", "state": "FL", "age": "33"}],
+        "changed": [{"key": "Ann~UT", "changes": {"age": ["56", "45"]}}],
+        "columns_added": [],
+        "columns_removed": [],
+    }