@@ -4057,10 +4057,10 @@ class TextOffset
40574057 return start;
40584058 }
40594059
4060- // UTF-16LE
4060+ // UTF-16LE, UTF-16BE
40614061 const wchar_t * SkipBOM (const wchar_t * start, const wchar_t * end)
40624062 {
4063- if (end - start > 1 && *start == 0xFEFF )
4063+ if (end - start > 1 && ( *start == 0xFEFF || *start == 0xFFFE ) )
40644064 {
40654065 lenBOM = 1 ;
40664066 return start + 1 ;
@@ -4129,23 +4129,46 @@ class TextOffset
41294129 }
41304130};
41314131
/// Returns a copy of @p str in which the two low-order bytes of every code unit
/// are exchanged, i.e. converts between UTF-16LE and UTF-16BE byte order.
///
/// Only the low 16 bits of each element take part in the swap, so the result is
/// a proper 16-bit value even when wchar_t is wider than 16 bits or signed.
/// An empty input yields an empty result; the length is always preserved.
static std::wstring utf16Swap(const std::wstring& str)
{
    std::wstring swapped = str;
    for (wchar_t& unit : swapped)
    {
        // Work on an unsigned 16-bit view to avoid shifting a signed value
        // and to keep bits above bit 15 out of the result.
        const unsigned int value = static_cast<unsigned int>(unit) & 0xFFFFu;
        unit = static_cast<wchar_t>(((value & 0xFFu) << 8) | (value >> 8));
    }
    return swapped;
}
4141+
41324142template <typename CharT = char >
4133- std::basic_string<CharT> ConvertToString (const std::wstring& str, CTextFile::UnicodeType encoding, bool bForceChar, CharT* dummy = NULL )
4143+ std::basic_string<CharT> ConvertToString (const std::wstring& str, CTextFile::UnicodeType encoding, CharT* dummy = NULL )
41344144{};
41354145
41364146template <>
4137- std::basic_string<char > ConvertToString<char >(const std::wstring& str, CTextFile::UnicodeType encoding, bool bForceChar, char *)
4147+ std::basic_string<char > ConvertToString<char >(const std::wstring& str, CTextFile::UnicodeType encoding, char *)
41384148{
4139- if (bForceChar )
4149+ switch (encoding )
41404150 {
4141- return std::basic_string<char >(reinterpret_cast <const char *>(str.c_str ()), 2 * str.length ());
4151+ case CTextFile::Unicode_Le:
4152+ return std::basic_string<char >(reinterpret_cast <const char *>(str.c_str ()), 2 * str.length ());
4153+ case CTextFile::Unicode_Be:
4154+ {
4155+ std::wstring strBe = utf16Swap (str);
4156+ return std::basic_string<char >(reinterpret_cast <const char *>(strBe.c_str ()), 2 * strBe.length ());
4157+ }
4158+ case CTextFile::Ansi:
4159+ return CUnicodeUtils::StdGetANSI (str);
4160+ case CTextFile::UTF8:
4161+ return CUnicodeUtils::StdGetUTF8 (str);
4162+ default :
4163+ return " " ;
41424164 }
4143- return (encoding == CTextFile::Ansi) ? CUnicodeUtils::StdGetANSI (str) : CUnicodeUtils::StdGetUTF8 (str);
41444165};
41454166
41464167template <>
4147- std::basic_string<wchar_t > ConvertToString<wchar_t >(const std::wstring& str, CTextFile::UnicodeType, bool , wchar_t *)
4168+ std::basic_string<wchar_t > ConvertToString<wchar_t >(const std::wstring& str, CTextFile::UnicodeType encoding , wchar_t *)
41484169{
4170+ if (encoding == CTextFile::Unicode_Be)
4171+ return utf16Swap (str);
41494172 return str;
41504173};
41514174
@@ -4169,7 +4192,7 @@ int CSearchDlg::SearchByFilePath(CSearchInfo& sInfo, const std::wstring& searchR
41694192 const CharT* end = fBeg + inSize / sizeof (CharT);
41704193
41714194 TextOffset<CharT> textOffset;
4172- if ((sInfo .encoding == CTextFile::UTF8) || (sInfo .encoding == CTextFile::Unicode_Le))
4195+ if ((sInfo .encoding == CTextFile::UTF8) || (sInfo .encoding == CTextFile::Unicode_Le) || ( sInfo . encoding == CTextFile::Unicode_Be) )
41734196 {
41744197 start = textOffset.SkipBOM (fBeg , end);
41754198 }
@@ -4199,9 +4222,7 @@ int CSearchDlg::SearchByFilePath(CSearchInfo& sInfo, const std::wstring& searchR
41994222 }
42004223 end = reinterpret_cast <const CharT*>(inData + skipSize + workSize);
42014224
4202- bool bTreateWcharAsChars = sizeof (CharT) == 1 && !m_bUseRegex &&
4203- (sInfo .encoding == CTextFile::Unicode_Le || sInfo .encoding == CTextFile::Binary);
4204- std::basic_string<CharT> expr = ConvertToString<CharT>(searchExpression, sInfo .encoding , bTreateWcharAsChars);
4225+ std::basic_string<CharT> expr = ConvertToString<CharT>(searchExpression, sInfo .encoding );
42054226
42064227 if (!m_bUseRegex && m_bWholeWords)
42074228 {
@@ -4255,7 +4276,7 @@ int CSearchDlg::SearchByFilePath(CSearchInfo& sInfo, const std::wstring& searchR
42554276 return nFound;
42564277 }
42574278
4258- std::basic_string<CharT> repl = ConvertToString<CharT>(replaceExpression, sInfo .encoding , bTreateWcharAsChars );
4279+ std::basic_string<CharT> repl = ConvertToString<CharT>(replaceExpression, sInfo .encoding );
42594280
42604281 std::wstring filePathTempW = sInfo .filePath + L" .grepwinreplaced" ;
42614282 std::string filePathTempA = filePathA + " .grepwinreplaced" ;
@@ -4368,9 +4389,9 @@ void CSearchDlg::SearchFile(CSearchInfo sInfo, const std::wstring& searchRoot)
43684389 sInfo .readError = true ;
43694390 nCount = -1 ;
43704391 }
4371- else if (bLoadResult && ((type != CTextFile::Binary) || m_bIncludeBinary)) // loaded
4392+ else if (bLoadResult && ((type != CTextFile::Binary) || m_bIncludeBinary)) // transcoded
43724393 {
4373- // for unrecognized, only `Binary` returns true
4394+ // For unrecognized content only `Binary` returns true; it is then treated as UTF-16LE, the same as the app uses internally
43744395 try
43754396 {
43764397 nCount = SearchOnTextFile (sInfo , searchRoot, searchExpression, replaceExpression, syntaxFlags, matchFlags, textFile);
@@ -4384,33 +4405,84 @@ void CSearchDlg::SearchFile(CSearchInfo sInfo, const std::wstring& searchRoot)
43844405 else if ((type != CTextFile::Binary) || m_bIncludeBinary || m_bForceBinary)
43854406 {
43864407 // file is either too big or binary.
4387- try
4408+ // types: Ansi, UTF8, Unicode_Le, Unicode_Be and Binary
4409+ std::vector<CTextFile::UnicodeType> encodingTries;
4410+ if (!m_bUseRegex || type == CTextFile::Binary)
43884411 {
4389- // Ansi, UTF8, Unicode_Le, Unicode_Be and Binary
4390- if (type != CTextFile::Unicode_Le)
4412+ // Treating a multi-byte char as a sequence of single-byte chars
4413+ // means that part of it may be matched as a standalone char,
4414+ // so it has to be grouped before repeats are applied to get accurate results.
4415+ // In regex mode Unicode_Le and Unicode_Be are handled by the wchar_t branch instead; UTF8 is still handled here.
4416+ // Since the file itself is not transcoded, transcoding the input to the other encodings is a trick to find a few more matches.
4417+ // It only works for raw data, not for escape sequences, which are pure ASCII chars!
4418+ switch (type)
43914419 {
4392- // Treating a multi-byte char as single byte chars:
4393- // yields part of it may be matched as a standalone char,
4394- // so requires it grouped for repeats to get accurate results.
4395- nCount = SearchByFilePath<char >(sInfo , searchRoot, searchExpression, replaceExpression, syntaxFlags, matchFlags, false );
4420+ case CTextFile::Binary:
4421+ {
4422+ if (m_bUseRegex)
4423+ encodingTries = {CTextFile::Ansi, CTextFile::UTF8};
4424+ else
4425+ encodingTries = {CTextFile::Ansi, CTextFile::UTF8, CTextFile::Unicode_Le, CTextFile::Unicode_Be};
4426+ }
4427+ break ;
4428+ case CTextFile::Ansi:
4429+ case CTextFile::UTF8:
4430+ case CTextFile::Unicode_Le:
4431+ case CTextFile::Unicode_Be:
4432+ default :
4433+ encodingTries = {type};
4434+ break ;
43964435 }
4397- if (type == CTextFile::Unicode_Le || (type == CTextFile::Binary && nCount == 0 ) )
4436+ for ( auto assumption : encodingTries )
43984437 {
4399- nCount = SearchByFilePath<wchar_t >(sInfo , searchRoot, searchExpression, replaceExpression, syntaxFlags, matchFlags, false );
4400- if (type == CTextFile::Binary)
4401- nCount += SearchByFilePath<wchar_t >(sInfo , searchRoot, searchExpression, replaceExpression, syntaxFlags, matchFlags, true );
4438+ sInfo .encoding = assumption;
4439+ try
4440+ {
4441+ nCount = SearchByFilePath<char >(sInfo , searchRoot, searchExpression, replaceExpression, syntaxFlags, matchFlags, false );
4442+ }
4443+ catch (...)
4444+ {
4445+ // regex error
4446+ }
4447+ if (nCount > 0 )
4448+ {
4449+ break ; // try all is consuming
4450+ }
44024451 }
44034452 }
4404- catch (const std::exception& ex)
4405- {
4406- sInfo .exception = CUnicodeUtils::StdGetUnicode (ex.what ());
4407- nCount = 1 ;
4408- }
4409- catch (...)
4453+ if (m_bUseRegex && (type == CTextFile::Unicode_Le || type == CTextFile::Unicode_Be || type == CTextFile::Binary))
44104454 {
4411- nCount = -1 ;
4412- return ;
4455+ switch (type)
4456+ {
4457+ case CTextFile::Binary:
4458+ encodingTries = {CTextFile::Unicode_Le, CTextFile::Unicode_Be};
4459+ break ;
4460+ case CTextFile::Unicode_Le:
4461+ case CTextFile::Unicode_Be:
4462+ default :
4463+ encodingTries = {type};
4464+ break ;
4465+ }
4466+ for (auto assumption : encodingTries)
4467+ {
4468+ sInfo .encoding = assumption;
4469+ try
4470+ {
4471+ nCount = SearchByFilePath<wchar_t >(sInfo , searchRoot, searchExpression, replaceExpression, syntaxFlags, matchFlags, false );
4472+ if (type == CTextFile::Binary)
4473+ nCount += SearchByFilePath<wchar_t >(sInfo , searchRoot, searchExpression, replaceExpression, syntaxFlags, matchFlags, true );
4474+ }
4475+ catch (...)
4476+ {
4477+ // regex error
4478+ }
4479+ if (nCount > 0 )
4480+ {
4481+ break ; // try all is consuming
4482+ }
4483+ }
44134484 }
4485+ // sInfo.encoding = type; // show the matched encoding
44144486 }
44154487
44164488 SendMessage (*this , SEARCH_PROGRESS, (nCount >= 0 ), 0 );
0 commit comments