1313#include " lcf/scope_guard.h"
1414#include < cstdio>
1515#include < cstdlib>
16- #include < exception>
1716
1817#if LCF_SUPPORT_ICU
1918# include < unicode/ucsdet.h>
2019# include < unicode/ucnv.h>
2120#else
22- # ifdef _MSC_VER
23- # error MSVC builds require ICU
24- # endif
21+ # include < cstdint>
2522#endif
2623
2724#ifdef _WIN32
2825# include < windows.h>
2926#else
30- # if !LCF_SUPPORT_ICU
31- # include < iconv.h>
32- # endif
3327# include < locale>
3428#endif
3529
@@ -82,12 +76,12 @@ void Encoder::Init() {
8276 return ;
8377 }
8478
85- #if LCF_SUPPORT_ICU
8679 auto code_page = atoi (_encoding.c_str ());
8780 const auto & storage_encoding = code_page > 0
8881 ? ReaderUtil::CodepageToEncoding (code_page)
8982 : _encoding;
9083
84+ #if LCF_SUPPORT_ICU
9185 auto status = U_ZERO_ERROR ;
9286 constexpr auto runtime_encoding = " UTF-8" ;
9387 auto conv_runtime = ucnv_open (runtime_encoding, &status);
@@ -111,26 +105,30 @@ void Encoder::Init() {
111105 _conv_runtime = conv_runtime;
112106 _conv_storage = conv_storage;
113107#else
114- _conv_runtime = const_cast <char *>(" UTF-8" );
115- _conv_storage = const_cast <char *>(_encoding.c_str ());
108+ if (storage_encoding != " windows-1252" ) {
109+ return ;
110+ }
111+
112+ _conv_runtime = 65001 ;
113+ _conv_storage = 1252 ;
116114#endif
117115}
118116
119- void Encoder::Reset () {
120117#if LCF_SUPPORT_ICU
121- auto * conv = reinterpret_cast <UConverter*>(_conv_runtime);
122- if (conv) ucnv_close (conv);
123- conv = reinterpret_cast <UConverter*>(_conv_storage);
124- if (conv) ucnv_close (conv);
125- #endif
126- }
118+ void Encoder::Reset () {
119+ if (_conv_runtime) {
120+ ucnv_close (_conv_runtime);
121+ _conv_runtime = nullptr ;
122+ }
127123
124+ if (_conv_storage) {
125+ ucnv_close (_conv_storage);
126+ _conv_storage = nullptr ;
127+ }
128+ }
128129
129- void Encoder::Convert (std::string& str, void * conv_dst_void, void * conv_src_void) {
130- #if LCF_SUPPORT_ICU
130+ void Encoder::Convert (std::string& str, UConverter* conv_dst, UConverter* conv_src) {
131131 const auto & src = str;
132- auto * conv_dst = reinterpret_cast <UConverter*>(conv_dst_void);
133- auto * conv_src = reinterpret_cast <UConverter*>(conv_src_void);
134132
135133 auto status = U_ZERO_ERROR ;
136134 _buffer.resize (src.size () * 4 );
@@ -151,36 +149,65 @@ void Encoder::Convert(std::string& str, void* conv_dst_void, void* conv_src_void
151149 }
152150
153151 str.assign (_buffer.data (), dst_p);
154- return ;
152+ }
155153#else
156- auto * conv_dst = reinterpret_cast <const char *>(conv_dst_void);
157- auto * conv_src = reinterpret_cast <const char *>(conv_src_void);
158- iconv_t cd = iconv_open (conv_dst, conv_src);
159- if (cd == (iconv_t )-1 )
160- return ;
161- char *src = &str.front ();
162- size_t src_left = str.size ();
163- size_t dst_size = str.size () * 5 + 10 ;
164- _buffer.resize (dst_size);
165- char *dst = _buffer.data ();
166- size_t dst_left = dst_size;
167- # ifdef ICONV_CONST
168- char ICONV_CONST *p = src;
169- # else
170- char *p = src;
171- # endif
172- char *q = dst;
173- size_t status = iconv (cd, &p, &src_left, &q, &dst_left);
174- iconv_close (cd);
175- if (status == (size_t ) -1 || src_left > 0 ) {
176- str.clear ();
154+ void Encoder::Convert (std::string& str, int conv_dst, int ) {
155+ if (str.empty ()) {
177156 return ;
178157 }
179- *q++ = ' \0 ' ;
180- str.assign (dst, dst_size - dst_left);
181- return ;
182- #endif
158+
159+ size_t buf_idx = 0 ;
160+
161+ if (conv_dst == 65001 ) {
162+ // From 1252 to UTF-8
163+ // Based on https://stackoverflow.com/q/4059775/
164+ _buffer.resize (str.size () * 2 + 1 );
165+
166+ for (unsigned char ch: str) {
167+ if (ch < 0x80 ) {
168+ _buffer[buf_idx] = static_cast <char >(ch);
169+ } else {
170+ _buffer[buf_idx] = static_cast <char >(0xC0 | (ch >> 6 ));
171+ ++buf_idx;
172+ _buffer[buf_idx] = static_cast <char >(0x80 | (ch & 0x3F ));
173+ }
174+
175+ ++buf_idx;
176+ }
177+ } else {
178+ // From UTF-8 to 1252
179+ // Based on https://stackoverflow.com/q/23689733/
180+ _buffer.resize (str.size () + 1 );
181+ uint32_t codepoint;
182+
183+ for (size_t str_idx = 0 ; str_idx < str.size (); ++str_idx) {
184+ unsigned char ch = str[str_idx];
185+ if (ch <= 0x7F ) {
186+ codepoint = ch;
187+ } else if (ch <= 0xBF ) {
188+ codepoint = (codepoint << 6 ) | (ch & 0x3F );
189+ } else if (ch <= 0xDF ) {
190+ codepoint = ch & 0x1F ;
191+ } else if (ch <= 0xEF ) {
192+ codepoint = ch & 0x0F ;
193+ } else {
194+ codepoint = ch & 0x07 ;
195+ }
196+ ++str_idx;
197+ ch = str[str_idx];
198+ if (((ch & 0xC0 ) != 0x80 ) && (codepoint <= 0x10ffff )) {
199+ if (codepoint <= 255 ) {
200+ _buffer[buf_idx] = static_cast <char >(codepoint);
201+ } else {
202+ _buffer[buf_idx] = ' ?' ;
203+ }
204+ }
205+ ++buf_idx;
206+ }
207+ }
208+
209+ str.assign (_buffer.data (), buf_idx);
183210}
211+ #endif
184212
185213} // namespace lcf
186-
0 commit comments