From bba84e551721c1bda779968b3c09247653af2dbd Mon Sep 17 00:00:00 2001
From: Sebastian Husch Lee
Date: Tue, 28 Jan 2025 01:29:55 -0800
Subject: [PATCH] fix: Fix JSONConverter to properly skip files that are not utf-8 encoded (#8775)

* Small fix

* Add reno

* Trying out license header fix here
---
 haystack/components/converters/json.py        |  1 +
 ...n-converter-non-utf8-3a755df732a8cbd5.yaml |  4 ++++
 test/components/converters/test_json.py       | 19 +++++++++++++++++++
 3 files changed, 24 insertions(+)
 create mode 100644 releasenotes/notes/fix-json-converter-non-utf8-3a755df732a8cbd5.yaml

diff --git a/haystack/components/converters/json.py b/haystack/components/converters/json.py
index 3a8c6f52f0..6d3781e4e9 100644
--- a/haystack/components/converters/json.py
+++ b/haystack/components/converters/json.py
@@ -194,6 +194,7 @@ def _get_content_and_meta(self, source: ByteStream) -> List[Tuple[str, Dict[str,
                 source=source.meta["file_path"],
                 error=exc,
             )
+            return []
 
         meta_fields = self._meta_fields or set()
 
diff --git a/releasenotes/notes/fix-json-converter-non-utf8-3a755df732a8cbd5.yaml b/releasenotes/notes/fix-json-converter-non-utf8-3a755df732a8cbd5.yaml
new file mode 100644
index 0000000000..2c475d201e
--- /dev/null
+++ b/releasenotes/notes/fix-json-converter-non-utf8-3a755df732a8cbd5.yaml
@@ -0,0 +1,4 @@
+---
+fixes:
+  - |
+    Fixed JSONConverter to properly skip converting JSON files that are not utf-8 encoded.
diff --git a/test/components/converters/test_json.py b/test/components/converters/test_json.py
index f9dcf2fa0c..5419fa812a 100644
--- a/test/components/converters/test_json.py
+++ b/test/components/converters/test_json.py
@@ -236,6 +236,25 @@ def test_run_with_bad_filter(tmpdir, caplog):
     assert result == {"documents": []}
 
 
+def test_run_with_bad_encoding(tmpdir, caplog):
+    test_file = Path(tmpdir / "test_file.json")
+    test_file.write_text(json.dumps(test_data[0]), "utf-16")
+
+    sources = [test_file]
+    converter = JSONConverter(".laureates")
+
+    caplog.clear()
+    with caplog.at_level(logging.WARNING):
+        result = converter.run(sources=sources)
+
+    records = caplog.records
+    assert len(records) == 1
+    assert records[0].msg.startswith(
+        f"Failed to extract text from {test_file}. Skipping it. Error: 'utf-8' codec can't decode byte"
+    )
+    assert result == {"documents": []}
+
+
 def test_run_with_single_meta(tmpdir):
     first_test_file = Path(tmpdir / "first_test_file.json")
     second_test_file = Path(tmpdir / "second_test_file.json")