13
13
#include " lcf/scope_guard.h"
14
14
#include < cstdio>
15
15
#include < cstdlib>
16
- #include < exception>
17
16
18
17
#if LCF_SUPPORT_ICU
19
18
# include < unicode/ucsdet.h>
20
19
# include < unicode/ucnv.h>
21
20
#else
22
- # ifdef _MSC_VER
23
- # error MSVC builds require ICU
24
- # endif
21
+ # include < cstdint>
25
22
#endif
26
23
27
24
#ifdef _WIN32
28
25
# include < windows.h>
29
26
#else
30
- # if !LCF_SUPPORT_ICU
31
- # include < iconv.h>
32
- # endif
33
27
# include < locale>
34
28
#endif
35
29
@@ -82,12 +76,12 @@ void Encoder::Init() {
82
76
return ;
83
77
}
84
78
85
- #if LCF_SUPPORT_ICU
86
79
auto code_page = atoi (_encoding.c_str ());
87
80
const auto & storage_encoding = code_page > 0
88
81
? ReaderUtil::CodepageToEncoding (code_page)
89
82
: _encoding;
90
83
84
+ #if LCF_SUPPORT_ICU
91
85
auto status = U_ZERO_ERROR;
92
86
constexpr auto runtime_encoding = " UTF-8" ;
93
87
auto conv_runtime = ucnv_open (runtime_encoding, &status);
@@ -111,26 +105,30 @@ void Encoder::Init() {
111
105
_conv_runtime = conv_runtime;
112
106
_conv_storage = conv_storage;
113
107
#else
114
- _conv_runtime = const_cast <char *>(" UTF-8" );
115
- _conv_storage = const_cast <char *>(_encoding.c_str ());
108
+ if (storage_encoding != " windows-1252" ) {
109
+ return ;
110
+ }
111
+
112
+ _conv_runtime = 65001 ;
113
+ _conv_storage = 1252 ;
116
114
#endif
117
115
}
118
116
119
- void Encoder::Reset () {
120
117
#if LCF_SUPPORT_ICU
121
- auto * conv = reinterpret_cast <UConverter*>(_conv_runtime);
122
- if (conv) ucnv_close (conv);
123
- conv = reinterpret_cast <UConverter*>(_conv_storage);
124
- if (conv) ucnv_close (conv);
125
- #endif
126
- }
118
+ void Encoder::Reset () {
119
+ if (_conv_runtime) {
120
+ ucnv_close (_conv_runtime);
121
+ _conv_runtime = nullptr ;
122
+ }
127
123
124
+ if (_conv_storage) {
125
+ ucnv_close (_conv_storage);
126
+ _conv_storage = nullptr ;
127
+ }
128
+ }
128
129
129
- void Encoder::Convert (std::string& str, void * conv_dst_void, void * conv_src_void) {
130
- #if LCF_SUPPORT_ICU
130
+ void Encoder::Convert (std::string& str, UConverter* conv_dst, UConverter* conv_src) {
131
131
const auto & src = str;
132
- auto * conv_dst = reinterpret_cast <UConverter*>(conv_dst_void);
133
- auto * conv_src = reinterpret_cast <UConverter*>(conv_src_void);
134
132
135
133
auto status = U_ZERO_ERROR;
136
134
_buffer.resize (src.size () * 4 );
@@ -151,36 +149,65 @@ void Encoder::Convert(std::string& str, void* conv_dst_void, void* conv_src_void
151
149
}
152
150
153
151
str.assign (_buffer.data (), dst_p);
154
- return ;
152
+ }
155
153
#else
156
- auto * conv_dst = reinterpret_cast <const char *>(conv_dst_void);
157
- auto * conv_src = reinterpret_cast <const char *>(conv_src_void);
158
- iconv_t cd = iconv_open (conv_dst, conv_src);
159
- if (cd == (iconv_t )-1 )
160
- return ;
161
- char *src = &str.front ();
162
- size_t src_left = str.size ();
163
- size_t dst_size = str.size () * 5 + 10 ;
164
- _buffer.resize (dst_size);
165
- char *dst = _buffer.data ();
166
- size_t dst_left = dst_size;
167
- # ifdef ICONV_CONST
168
- char ICONV_CONST *p = src;
169
- # else
170
- char *p = src;
171
- # endif
172
- char *q = dst;
173
- size_t status = iconv (cd, &p, &src_left, &q, &dst_left);
174
- iconv_close (cd);
175
- if (status == (size_t ) -1 || src_left > 0 ) {
176
- str.clear ();
154
+ void Encoder::Convert (std::string& str, int conv_dst, int ) {
155
+ if (str.empty ()) {
177
156
return ;
178
157
}
179
- *q++ = ' \0 ' ;
180
- str.assign (dst, dst_size - dst_left);
181
- return ;
182
- #endif
158
+
159
+ size_t buf_idx = 0 ;
160
+
161
+ if (conv_dst == 65001 ) {
162
+ // From 1252 to UTF-8
163
+ // Based on https://stackoverflow.com/q/4059775/
164
+ _buffer.resize (str.size () * 2 + 1 );
165
+
166
+ for (unsigned char ch: str) {
167
+ if (ch < 0x80 ) {
168
+ _buffer[buf_idx] = static_cast <char >(ch);
169
+ } else {
170
+ _buffer[buf_idx] = static_cast <char >(0xC0 | (ch >> 6 ));
171
+ ++buf_idx;
172
+ _buffer[buf_idx] = static_cast <char >(0x80 | (ch & 0x3F ));
173
+ }
174
+
175
+ ++buf_idx;
176
+ }
177
+ } else {
178
+ // From UTF-8 to 1252
179
+ // Based on https://stackoverflow.com/q/23689733/
180
+ _buffer.resize (str.size () + 1 );
181
+ uint32_t codepoint;
182
+
183
+ for (size_t str_idx = 0 ; str_idx < str.size (); ++str_idx) {
184
+ unsigned char ch = str[str_idx];
185
+ if (ch <= 0x7F ) {
186
+ codepoint = ch;
187
+ } else if (ch <= 0xBF ) {
188
+ codepoint = (codepoint << 6 ) | (ch & 0x3F );
189
+ } else if (ch <= 0xDF ) {
190
+ codepoint = ch & 0x1F ;
191
+ } else if (ch <= 0xEF ) {
192
+ codepoint = ch & 0x0F ;
193
+ } else {
194
+ codepoint = ch & 0x07 ;
195
+ }
196
+ ++str_idx;
197
+ ch = str[str_idx];
198
+ if (((ch & 0xC0 ) != 0x80 ) && (codepoint <= 0x10ffff )) {
199
+ if (codepoint <= 255 ) {
200
+ _buffer[buf_idx] = static_cast <char >(codepoint);
201
+ } else {
202
+ _buffer[buf_idx] = ' ?' ;
203
+ }
204
+ }
205
+ ++buf_idx;
206
+ }
207
+ }
208
+
209
+ str.assign (_buffer.data (), buf_idx);
183
210
}
211
+ #endif
184
212
185
213
} // namespace lcf
186
-
0 commit comments