Support TSV as a "dialect" of CSV (#4)

tsibley · web-flow · commit 140fe0d21b40 · 2020-02-29T09:43:59.000-08:00
* Open files with newline="": The csv module documentation says this is necessary in some cases, such as for parsing embedded newlines in quoted fields. Refer to <https://docs.python.org/3/library/csv.html#id3>. * Support TSV as a "dialect" of CSV: For seekable streams, the delimiter is sniffed from the first 1 MB of data. This should provide enough rows to the sniffer, even for datasets with very long rows, without blowing up memory usage much. A csv.Dialect may also be specified directly to load_csv() for programmatic usage. * Add a --format option to support explicit dialect selection: Useful when you want to disable sniffing or when one or both of the files aren't seekable, so sniffing doesn't work. Thanks, @tsibley
diff --git a/csv_diff/__init__.py b/csv_diff/__init__.py
@@ -4,8 +4,17 @@
 import hashlib
 
 
-def load_csv(fp, key=None):
-    fp = csv.reader(fp)
+def load_csv(fp, key=None, dialect=None):
+    if dialect is None and fp.seekable():
+        # Peek at first 1MB to sniff the delimiter and other dialect details
+        peek = fp.read(1024**2)
+        fp.seek(0)
+        try:
+            dialect = csv.Sniffer().sniff(peek, delimiters=",\t")
+        except csv.Error:
+            # Oh well, we tried. Fallback to the default.
+            pass
+    fp = csv.reader(fp, dialect=(dialect or 'excel'))
     headings = next(fp)
     rows = [dict(zip(headings, line)) for line in fp]
     if key:
diff --git a/csv_diff/cli.py b/csv_diff/cli.py
@@ -16,6 +16,9 @@
 @click.option(
     "--key", type=str, default=None, help="Column to use as a unique ID for each row"
 )
+@click.option(
+    "--format", type=click.Choice(["csv", "tsv"]), default=None, help="Explicitly specify input format (csv, tsv) instead of auto-detecting"
+)
 @click.option(
     "--json", type=bool, default=False, help="Output changes as JSON", is_flag=True
 )
@@ -31,9 +34,15 @@
     default=None,
     help="Plural word to use, e.g. 'trees' for '2 trees'",
 )
-def cli(previous, current, key, json, singular, plural):
+def cli(previous, current, key, format, json, singular, plural):
     "Diff two CSV files"
-    diff = compare(load_csv(open(previous), key=key), load_csv(open(current), key=key))
+    dialect = {
+        "csv": "excel",
+        "tsv": "excel-tab",
+    }
+    def load(filename):
+        return load_csv(open(filename, newline=""), key=key, dialect=dialect.get(format))
+    diff = compare(load(previous), load(current))
     if json:
         print(std_json.dumps(diff, indent=4))
     else:
diff --git a/tests/test_csv_diff.py b/tests/test_csv_diff.py
@@ -9,6 +9,10 @@
 1,Cleo,5
 2,Pancakes,2"""
 
+TWO_TSV = """id\tname\tage
+1\tCleo\t5
+2\tPancakes\t2"""
+
 THREE = """id,name,age
 1,Cleo,5"""
 
@@ -86,3 +90,16 @@ def test_columns_changed():
         "columns_added": ["weight"],
         "columns_removed": ["age"],
     } == diff
+
+
+def test_tsv():
+    diff = compare(
+        load_csv(io.StringIO(ONE), key="id"), load_csv(io.StringIO(TWO_TSV), key="id")
+    )
+    assert {
+        "added": [],
+        "removed": [],
+        "changed": [{"key": "1", "changes": {"age": ["4", "5"]}}],
+        "columns_added": [],
+        "columns_removed": [],
+    } == diff