Skip to content

Commit 5e77d3e

Browse files
authored
Merge pull request #488 from Ghabry/better-enc-dec
Improve encoding detection by making the string larger
2 parents: f166025 + 1038fae; commit: 5e77d3e

File tree

1 file changed

+23
-2
lines changed

1 file changed

+23
-2
lines changed

src/reader_util.cpp

Lines changed: 23 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -142,13 +142,32 @@ std::vector<std::string> ReaderUtil::DetectEncodings(StringView string) {
142142
UCharsetDetector* detector = ucsdet_open(&status);
143143

144144
auto s = std::string(string);
145-
ucsdet_setText(detector, s.c_str(), s.length(), &status);
146145

146+
int confidence = 0;
147147
int32_t matches_count;
148-
const UCharsetMatch** matches = ucsdet_detectAll(detector, &matches_count, &status);
148+
const UCharsetMatch** matches = nullptr;
149+
150+
while (true) {
151+
ucsdet_setText(detector, s.c_str(), s.length(), &status);
152+
matches = ucsdet_detectAll(detector, &matches_count, &status);
153+
154+
if (!matches || matches_count < 1) {
155+
break;
156+
}
157+
158+
confidence = ucsdet_getConfidence(matches[0], &status);
159+
160+
if (confidence > 70 || s.length() > 100) {
161+
break;
162+
}
163+
164+
// Concatenating the string to itself increases the confidence (for short strings)
165+
s += s;
166+
}
149167

150168
if (matches != nullptr) {
151169
// Collect all candidates, most confident comes first
170+
152171
for (int i = 0; i < matches_count; ++i) {
153172
std::string encoding = ucsdet_getName(matches[i], &status);
154173

@@ -171,6 +190,8 @@ std::vector<std::string> ReaderUtil::DetectEncodings(StringView string) {
171190
encodings.emplace_back("ibm-5349_P100-1998"); // Greek with Euro
172191
} else if (encoding == "ISO-8859-8" || encoding == "windows-1255") {
173192
encodings.emplace_back("ibm-9447_P100-2002"); // Hebrew with Euro
193+
} else if (encoding == "UTF-16BE" || encoding == "UTF-16LE") {
194+
// ignore encodings that are obviously wrong
174195
} else {
175196
encodings.push_back(encoding);
176197
}

0 commit comments

Comments
 (0)