@@ -142,13 +142,32 @@ std::vector<std::string> ReaderUtil::DetectEncodings(StringView string) {
142
142
UCharsetDetector* detector = ucsdet_open (&status);
143
143
144
144
auto s = std::string (string);
145
- ucsdet_setText (detector, s.c_str (), s.length (), &status);
146
145
146
+ int confidence = 0 ;
147
147
int32_t matches_count;
148
- const UCharsetMatch** matches = ucsdet_detectAll (detector, &matches_count, &status);
148
+ const UCharsetMatch** matches = nullptr ;
149
+
150
+ while (true ) {
151
+ ucsdet_setText (detector, s.c_str (), s.length (), &status);
152
+ matches = ucsdet_detectAll (detector, &matches_count, &status);
153
+
154
+ if (!matches || matches_count < 1 ) {
155
+ break ;
156
+ }
157
+
158
+ confidence = ucsdet_getConfidence (matches[0 ], &status);
159
+
160
+ if (confidence > 70 || s.length () > 100 ) {
161
+ break ;
162
+ }
163
+
164
+ // Concatenating the string to itself increases the confidence (for short strings)
165
+ s += s;
166
+ }
149
167
150
168
if (matches != nullptr ) {
151
169
// Collect all candidates, most confident comes first
170
+
152
171
for (int i = 0 ; i < matches_count; ++i) {
153
172
std::string encoding = ucsdet_getName (matches[i], &status);
154
173
@@ -171,6 +190,8 @@ std::vector<std::string> ReaderUtil::DetectEncodings(StringView string) {
171
190
encodings.emplace_back (" ibm-5349_P100-1998" ); // Greek with Euro
172
191
} else if (encoding == " ISO-8859-8" || encoding == " windows-1255" ) {
173
192
encodings.emplace_back (" ibm-9447_P100-2002" ); // Hebrew with Euro
193
+ } else if (encoding == " UTF-16BE" || encoding == " UTF-16LE" ) {
194
+ // ignore encodings that are obviously wrong
174
195
} else {
175
196
encodings.push_back (encoding);
176
197
}
0 commit comments