@@ -4057,10 +4057,10 @@ class TextOffset
4057
4057
return start;
4058
4058
}
4059
4059
4060
- // UTF-16LE
4060
+ // UTF-16LE, UTF-16BE
4061
4061
const wchar_t * SkipBOM (const wchar_t * start, const wchar_t * end)
4062
4062
{
4063
- if (end - start > 1 && *start == 0xFEFF )
4063
+ if (end - start > 1 && ( *start == 0xFEFF || *start == 0xFFFE ) )
4064
4064
{
4065
4065
lenBOM = 1 ;
4066
4066
return start + 1 ;
@@ -4129,23 +4129,46 @@ class TextOffset
4129
4129
}
4130
4130
};
4131
4131
4132
// Returns a copy of `str` in which the two bytes of every UTF-16 code unit
// are exchanged (UTF-16LE <-> UTF-16BE). Applying it twice yields the input.
//
// Only the low 16 bits of each element are considered: the swap is done in
// unsigned arithmetic and masked to 16 bits, so the result is the same whether
// wchar_t is 16 bits wide or wider, and no signed value is ever shifted.
static std::wstring utf16Swap(const std::wstring& str)
{
    std::wstring swapped = str;
    for (wchar_t& unit : swapped)
    {
        const unsigned int value = static_cast<unsigned int>(unit) & 0xFFFFu;
        unit                     = static_cast<wchar_t>(((value << 8) | (value >> 8)) & 0xFFFFu);
    }
    return swapped;
}
4141
+
4132
4142
template <typename CharT = char >
4133
- std::basic_string<CharT> ConvertToString (const std::wstring& str, CTextFile::UnicodeType encoding, bool bForceChar, CharT* dummy = NULL )
4143
+ std::basic_string<CharT> ConvertToString (const std::wstring& str, CTextFile::UnicodeType encoding, CharT* dummy = NULL )
4134
4144
{};
4135
4145
4136
4146
template <>
4137
- std::basic_string<char > ConvertToString<char >(const std::wstring& str, CTextFile::UnicodeType encoding, bool bForceChar, char *)
4147
+ std::basic_string<char > ConvertToString<char >(const std::wstring& str, CTextFile::UnicodeType encoding, char *)
4138
4148
{
4139
- if (bForceChar )
4149
+ switch (encoding )
4140
4150
{
4141
- return std::basic_string<char >(reinterpret_cast <const char *>(str.c_str ()), 2 * str.length ());
4151
+ case CTextFile::Unicode_Le:
4152
+ return std::basic_string<char >(reinterpret_cast <const char *>(str.c_str ()), 2 * str.length ());
4153
+ case CTextFile::Unicode_Be:
4154
+ {
4155
+ std::wstring strBe = utf16Swap (str);
4156
+ return std::basic_string<char >(reinterpret_cast <const char *>(strBe.c_str ()), 2 * strBe.length ());
4157
+ }
4158
+ case CTextFile::Ansi:
4159
+ return CUnicodeUtils::StdGetANSI (str);
4160
+ case CTextFile::UTF8:
4161
+ return CUnicodeUtils::StdGetUTF8 (str);
4162
+ default :
4163
+ return " " ;
4142
4164
}
4143
- return (encoding == CTextFile::Ansi) ? CUnicodeUtils::StdGetANSI (str) : CUnicodeUtils::StdGetUTF8 (str);
4144
4165
};
4145
4166
4146
4167
template <>
4147
- std::basic_string<wchar_t > ConvertToString<wchar_t >(const std::wstring& str, CTextFile::UnicodeType, bool , wchar_t *)
4168
+ std::basic_string<wchar_t > ConvertToString<wchar_t >(const std::wstring& str, CTextFile::UnicodeType encoding , wchar_t *)
4148
4169
{
4170
+ if (encoding == CTextFile::Unicode_Be)
4171
+ return utf16Swap (str);
4149
4172
return str;
4150
4173
};
4151
4174
@@ -4169,7 +4192,7 @@ int CSearchDlg::SearchByFilePath(CSearchInfo& sInfo, const std::wstring& searchR
4169
4192
const CharT* end = fBeg + inSize / sizeof (CharT);
4170
4193
4171
4194
TextOffset<CharT> textOffset;
4172
- if ((sInfo .encoding == CTextFile::UTF8) || (sInfo .encoding == CTextFile::Unicode_Le))
4195
+ if ((sInfo .encoding == CTextFile::UTF8) || (sInfo .encoding == CTextFile::Unicode_Le) || ( sInfo . encoding == CTextFile::Unicode_Be) )
4173
4196
{
4174
4197
start = textOffset.SkipBOM (fBeg , end);
4175
4198
}
@@ -4199,9 +4222,7 @@ int CSearchDlg::SearchByFilePath(CSearchInfo& sInfo, const std::wstring& searchR
4199
4222
}
4200
4223
end = reinterpret_cast <const CharT*>(inData + skipSize + workSize);
4201
4224
4202
- bool bTreateWcharAsChars = sizeof (CharT) == 1 && !m_bUseRegex &&
4203
- (sInfo .encoding == CTextFile::Unicode_Le || sInfo .encoding == CTextFile::Binary);
4204
- std::basic_string<CharT> expr = ConvertToString<CharT>(searchExpression, sInfo .encoding , bTreateWcharAsChars);
4225
+ std::basic_string<CharT> expr = ConvertToString<CharT>(searchExpression, sInfo .encoding );
4205
4226
4206
4227
if (!m_bUseRegex && m_bWholeWords)
4207
4228
{
@@ -4255,7 +4276,7 @@ int CSearchDlg::SearchByFilePath(CSearchInfo& sInfo, const std::wstring& searchR
4255
4276
return nFound;
4256
4277
}
4257
4278
4258
- std::basic_string<CharT> repl = ConvertToString<CharT>(replaceExpression, sInfo .encoding , bTreateWcharAsChars );
4279
+ std::basic_string<CharT> repl = ConvertToString<CharT>(replaceExpression, sInfo .encoding );
4259
4280
4260
4281
std::wstring filePathTempW = sInfo .filePath + L" .grepwinreplaced" ;
4261
4282
std::string filePathTempA = filePathA + " .grepwinreplaced" ;
@@ -4368,9 +4389,9 @@ void CSearchDlg::SearchFile(CSearchInfo sInfo, const std::wstring& searchRoot)
4368
4389
sInfo .readError = true ;
4369
4390
nCount = -1 ;
4370
4391
}
4371
- else if (bLoadResult && ((type != CTextFile::Binary) || m_bIncludeBinary)) // loaded
4392
+ else if (bLoadResult && ((type != CTextFile::Binary) || m_bIncludeBinary)) // transcoded
4372
4393
{
4373
- // for unrecognized, only `Binary` returns true
4394
+ // for unrecognized, only `Binary` returns true and treated as UTF-16LE, the same as app internal
4374
4395
try
4375
4396
{
4376
4397
nCount = SearchOnTextFile (sInfo , searchRoot, searchExpression, replaceExpression, syntaxFlags, matchFlags, textFile);
@@ -4384,33 +4405,84 @@ void CSearchDlg::SearchFile(CSearchInfo sInfo, const std::wstring& searchRoot)
4384
4405
else if ((type != CTextFile::Binary) || m_bIncludeBinary || m_bForceBinary)
4385
4406
{
4386
4407
// file is either too big or binary.
4387
- try
4408
+ // types: Ansi, UTF8, Unicode_Le, Unicode_Be and Binary
4409
+ std::vector<CTextFile::UnicodeType> encodingTries;
4410
+ if (!m_bUseRegex || type == CTextFile::Binary)
4388
4411
{
4389
- // Ansi, UTF8, Unicode_Le, Unicode_Be and Binary
4390
- if (type != CTextFile::Unicode_Le)
4412
+ // Treating a multi-byte char as single byte chars:
4413
+ // yields part of it may be matched as a standalone char,
4414
+ // so requires it grouped for repeats to get accurate results.
4415
+ // Unicode_Le and Unicode_Be in Regex mode are turned into wchar_t branch. UTF8 is still here.
4416
+ // Without transcoding the file, transcoding the input to other encoding is a trick, to get a bit more outcome.
4417
+ // It only works for raw data, not escaped sequence, that is pure ASCII char!
4418
+ switch (type)
4391
4419
{
4392
- // Treating a multi-byte char as single byte chars:
4393
- // yields part of it may be matched as a standalone char,
4394
- // so requires it grouped for repeats to get accurate results.
4395
- nCount = SearchByFilePath<char >(sInfo , searchRoot, searchExpression, replaceExpression, syntaxFlags, matchFlags, false );
4420
+ case CTextFile::Binary:
4421
+ {
4422
+ if (m_bUseRegex)
4423
+ encodingTries = {CTextFile::Ansi, CTextFile::UTF8};
4424
+ else
4425
+ encodingTries = {CTextFile::Ansi, CTextFile::UTF8, CTextFile::Unicode_Le, CTextFile::Unicode_Be};
4426
+ }
4427
+ break ;
4428
+ case CTextFile::Ansi:
4429
+ case CTextFile::UTF8:
4430
+ case CTextFile::Unicode_Le:
4431
+ case CTextFile::Unicode_Be:
4432
+ default :
4433
+ encodingTries = {type};
4434
+ break ;
4396
4435
}
4397
- if (type == CTextFile::Unicode_Le || (type == CTextFile::Binary && nCount == 0 ) )
4436
+ for ( auto assumption : encodingTries )
4398
4437
{
4399
- nCount = SearchByFilePath<wchar_t >(sInfo , searchRoot, searchExpression, replaceExpression, syntaxFlags, matchFlags, false );
4400
- if (type == CTextFile::Binary)
4401
- nCount += SearchByFilePath<wchar_t >(sInfo , searchRoot, searchExpression, replaceExpression, syntaxFlags, matchFlags, true );
4438
+ sInfo .encoding = assumption;
4439
+ try
4440
+ {
4441
+ nCount = SearchByFilePath<char >(sInfo , searchRoot, searchExpression, replaceExpression, syntaxFlags, matchFlags, false );
4442
+ }
4443
+ catch (...)
4444
+ {
4445
+ // regex error
4446
+ }
4447
+ if (nCount > 0 )
4448
+ {
4449
+ break ; // try all is consuming
4450
+ }
4402
4451
}
4403
4452
}
4404
- catch (const std::exception& ex)
4405
- {
4406
- sInfo .exception = CUnicodeUtils::StdGetUnicode (ex.what ());
4407
- nCount = 1 ;
4408
- }
4409
- catch (...)
4453
+ if (m_bUseRegex && (type == CTextFile::Unicode_Le || type == CTextFile::Unicode_Be || type == CTextFile::Binary))
4410
4454
{
4411
- nCount = -1 ;
4412
- return ;
4455
+ switch (type)
4456
+ {
4457
+ case CTextFile::Binary:
4458
+ encodingTries = {CTextFile::Unicode_Le, CTextFile::Unicode_Be};
4459
+ break ;
4460
+ case CTextFile::Unicode_Le:
4461
+ case CTextFile::Unicode_Be:
4462
+ default :
4463
+ encodingTries = {type};
4464
+ break ;
4465
+ }
4466
+ for (auto assumption : encodingTries)
4467
+ {
4468
+ sInfo .encoding = assumption;
4469
+ try
4470
+ {
4471
+ nCount = SearchByFilePath<wchar_t >(sInfo , searchRoot, searchExpression, replaceExpression, syntaxFlags, matchFlags, false );
4472
+ if (type == CTextFile::Binary)
4473
+ nCount += SearchByFilePath<wchar_t >(sInfo , searchRoot, searchExpression, replaceExpression, syntaxFlags, matchFlags, true );
4474
+ }
4475
+ catch (...)
4476
+ {
4477
+ // regex error
4478
+ }
4479
+ if (nCount > 0 )
4480
+ {
4481
+ break ; // try all is consuming
4482
+ }
4483
+ }
4413
4484
}
4485
+ // sInfo.encoding = type; // show the matched encoding
4414
4486
}
4415
4487
4416
4488
SendMessage (*this , SEARCH_PROGRESS, (nCount >= 0 ), 0 );
0 commit comments