diff --git a/csv_diff/cli.py b/csv_diff/cli.py index 919c81e..f96b727 100644 --- a/csv_diff/cli.py +++ b/csv_diff/cli.py @@ -42,7 +42,12 @@ is_flag=True, help="Show unchanged fields for rows with at least one change", ) -def cli(previous, current, key, format, json, singular, plural, show_unchanged): +@click.option( + "--encoding", + default=None, + help="Specify text encoding of the csv files", +) +def cli(previous, current, key, format, json, singular, plural, show_unchanged, encoding): "Diff two CSV or JSON files" dialect = { "csv": "excel", @@ -51,10 +56,10 @@ def cli(previous, current, key, format, json, singular, plural, show_unchanged): def load(filename): if format == "json": - return load_json(open(filename), key=key) + return load_json(open(filename, encoding=encoding), key=key) else: return load_csv( - open(filename, newline=""), key=key, dialect=dialect.get(format) + open(filename, newline="", encoding=encoding), key=key, dialect=dialect.get(format) ) diff = compare(load(previous), load(current), show_unchanged) diff --git a/tests/test_cli.py b/tests/test_cli.py index eb18d73..a18f99f 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -234,3 +234,43 @@ def test_semicolon_delimited(tmpdir): "columns_added": [], "columns_removed": [], } == json.loads(result.output.strip()) + + +def test_human_cli_non_utf8_encoding(tmpdir): + # This test confirms the ability to parse csv files that are not encoded using utf-8. + # The names in the files contain characters that would cause UnicodeDecodeErrors if they + # are encoded using cp1252 and then parsed using utf-8. 
+ encoding = "cp1252" + one = tmpdir / "one.csv" + two = tmpdir / "two.csv" + one.write_binary( + dedent( + """ + id;name + 1;José + """ + ).strip().encode(encoding) + ) + two.write_binary( + dedent( + """ + id;name + 1;Ángela + """ + ).strip().encode(encoding) + ) + result = CliRunner().invoke( + cli.cli, [str(one), str(two), "--key", "id", "--encoding", encoding], catch_exceptions=False + ) + assert 0 == result.exit_code + assert ( + dedent( + """ + 1 row changed + + id: 1 + name: "José" => "Ángela" + """ + ).strip() + == result.output.strip() + )