Skip to content

Commit 140fe0d

Browse files
authored
Support TSV as a "dialect" of CSV (#4)
* Open files with newline="".
  The csv module documentation says this is necessary in some cases, such as for parsing embedded newlines in quoted fields. Refer to <https://docs.python.org/3/library/csv.html#id3>.
* Support TSV as a "dialect" of CSV.
  For seekable streams, the delimiter is sniffed from the first 1MB of data. This should provide enough rows to the sniffer, even for datasets with very long rows, without blowing up memory usage much. A csv.Dialect may also be specified directly to load_csv() for programmatic usage.
* Add a --format option to support explicit dialect selection.
  Useful when you want to disable sniffing, or when one or both of the files aren't seekable and sniffing therefore doesn't work.
Thanks, @tsibley
1 parent eaa4702 commit 140fe0d

File tree

3 files changed

+39
-4
lines changed

3 files changed

+39
-4
lines changed

Diff for: csv_diff/__init__.py

+11-2
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,17 @@
44
import hashlib
55

66

7-
def load_csv(fp, key=None):
8-
fp = csv.reader(fp)
7+
def load_csv(fp, key=None, dialect=None):
8+
if dialect is None and fp.seekable():
9+
# Peek at first 1MB to sniff the delimiter and other dialect details
10+
peek = fp.read(1024**2)
11+
fp.seek(0)
12+
try:
13+
dialect = csv.Sniffer().sniff(peek, delimiters=",\t")
14+
except csv.Error:
15+
# Oh well, we tried. Fallback to the default.
16+
pass
17+
fp = csv.reader(fp, dialect=(dialect or 'excel'))
918
headings = next(fp)
1019
rows = [dict(zip(headings, line)) for line in fp]
1120
if key:

Diff for: csv_diff/cli.py

+11-2
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,9 @@
1616
@click.option(
1717
"--key", type=str, default=None, help="Column to use as a unique ID for each row"
1818
)
19+
@click.option(
20+
"--format", type=click.Choice(["csv", "tsv"]), default=None, help="Explicitly specify input format (csv, tsv) instead of auto-detecting"
21+
)
1922
@click.option(
2023
"--json", type=bool, default=False, help="Output changes as JSON", is_flag=True
2124
)
@@ -31,9 +34,15 @@
3134
default=None,
3235
help="Plural word to use, e.g. 'trees' for '2 trees'",
3336
)
34-
def cli(previous, current, key, json, singular, plural):
37+
def cli(previous, current, key, format, json, singular, plural):
3538
"Diff two CSV files"
36-
diff = compare(load_csv(open(previous), key=key), load_csv(open(current), key=key))
39+
dialect = {
40+
"csv": "excel",
41+
"tsv": "excel-tab",
42+
}
43+
def load(filename):
44+
return load_csv(open(filename, newline=""), key=key, dialect=dialect.get(format))
45+
diff = compare(load(previous), load(current))
3746
if json:
3847
print(std_json.dumps(diff, indent=4))
3948
else:

Diff for: tests/test_csv_diff.py

+17
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@
99
1,Cleo,5
1010
2,Pancakes,2"""
1111

12+
TWO_TSV = """id\tname\tage
13+
1\tCleo\t5
14+
2\tPancakes\t2"""
15+
1216
THREE = """id,name,age
1317
1,Cleo,5"""
1418

@@ -86,3 +90,16 @@ def test_columns_changed():
8690
"columns_added": ["weight"],
8791
"columns_removed": ["age"],
8892
} == diff
93+
94+
95+
def test_tsv():
96+
diff = compare(
97+
load_csv(io.StringIO(ONE), key="id"), load_csv(io.StringIO(TWO_TSV), key="id")
98+
)
99+
assert {
100+
"added": [],
101+
"removed": [],
102+
"changed": [{"key": "1", "changes": {"age": ["4", "5"]}}],
103+
"columns_added": [],
104+
"columns_removed": [],
105+
} == diff

0 commit comments

Comments
 (0)