Skip to content

Commit 9d33492

Browse files
committed
cover more file encodings in raw mode
1 parent 61a216e commit 9d33492

File tree

1 file changed

+106
-34
lines changed

1 file changed

+106
-34
lines changed

src/SearchDlg.cpp

Lines changed: 106 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -4057,10 +4057,10 @@ class TextOffset
40574057
return start;
40584058
}
40594059

4060-
// UTF-16LE
4060+
// UTF-16LE, UTF-16BE
40614061
const wchar_t* SkipBOM(const wchar_t* start, const wchar_t* end)
40624062
{
4063-
if (end - start > 1 && *start == 0xFEFF)
4063+
if (end - start > 1 && (*start == 0xFEFF || *start == 0xFFFE))
40644064
{
40654065
lenBOM = 1;
40664066
return start + 1;
@@ -4129,23 +4129,46 @@ class TextOffset
41294129
}
41304130
};
41314131

4132+
/// Returns a copy of `str` in which the two bytes of every UTF-16 code unit
/// are exchanged, converting UTF-16LE data to UTF-16BE and vice versa.
///
/// Only the low 16 bits of each element are used and the result is masked to
/// 16 bits. With a 16-bit wchar_t this matches the implicit truncation on
/// assignment; with a wider wchar_t it prevents the shifted high byte from
/// leaking into the upper bits of the element.
///
/// @param str  Sequence of UTF-16 code units, one per wchar_t element.
/// @return     A string of the same length with each code unit byte-swapped.
static std::wstring utf16Swap(const std::wstring& str)
{
    std::wstring swapped = str;
    for (wchar_t& unit : swapped)
    {
        const unsigned int value = static_cast<unsigned int>(unit);
        // low byte moves up, high byte moves down; everything else is discarded
        unit = static_cast<wchar_t>(((value & 0x00FFu) << 8) | ((value >> 8) & 0x00FFu));
    }
    return swapped;
}
4141+
41324142
template<typename CharT = char>
4133-
std::basic_string<CharT> ConvertToString(const std::wstring& str, CTextFile::UnicodeType encoding, bool bForceChar, CharT* dummy = NULL)
4143+
std::basic_string<CharT> ConvertToString(const std::wstring& str, CTextFile::UnicodeType encoding, CharT* dummy = NULL)
41344144
{};
41354145

41364146
template<>
4137-
std::basic_string<char> ConvertToString<char>(const std::wstring& str, CTextFile::UnicodeType encoding, bool bForceChar, char*)
4147+
std::basic_string<char> ConvertToString<char>(const std::wstring& str, CTextFile::UnicodeType encoding, char*)
41384148
{
4139-
if (bForceChar)
4149+
switch (encoding)
41404150
{
4141-
return std::basic_string<char>(reinterpret_cast<const char*>(str.c_str()), 2 * str.length());
4151+
case CTextFile::Unicode_Le:
4152+
return std::basic_string<char>(reinterpret_cast<const char*>(str.c_str()), 2 * str.length());
4153+
case CTextFile::Unicode_Be:
4154+
{
4155+
std::wstring strBe = utf16Swap(str);
4156+
return std::basic_string<char>(reinterpret_cast<const char*>(strBe.c_str()), 2 * strBe.length());
4157+
}
4158+
case CTextFile::Ansi:
4159+
return CUnicodeUtils::StdGetANSI(str);
4160+
case CTextFile::UTF8:
4161+
return CUnicodeUtils::StdGetUTF8(str);
4162+
default:
4163+
return "";
41424164
}
4143-
return (encoding == CTextFile::Ansi) ? CUnicodeUtils::StdGetANSI(str) : CUnicodeUtils::StdGetUTF8(str);
41444165
};
41454166

41464167
template<>
4147-
std::basic_string<wchar_t> ConvertToString<wchar_t>(const std::wstring& str, CTextFile::UnicodeType, bool, wchar_t*)
4168+
std::basic_string<wchar_t> ConvertToString<wchar_t>(const std::wstring& str, CTextFile::UnicodeType encoding, wchar_t*)
41484169
{
4170+
if (encoding == CTextFile::Unicode_Be)
4171+
return utf16Swap(str);
41494172
return str;
41504173
};
41514174

@@ -4169,7 +4192,7 @@ int CSearchDlg::SearchByFilePath(CSearchInfo& sInfo, const std::wstring& searchR
41694192
const CharT* end = fBeg + inSize / sizeof(CharT);
41704193

41714194
TextOffset<CharT> textOffset;
4172-
if ((sInfo.encoding == CTextFile::UTF8) || (sInfo.encoding == CTextFile::Unicode_Le))
4195+
if ((sInfo.encoding == CTextFile::UTF8) || (sInfo.encoding == CTextFile::Unicode_Le) || (sInfo.encoding == CTextFile::Unicode_Be))
41734196
{
41744197
start = textOffset.SkipBOM(fBeg, end);
41754198
}
@@ -4199,9 +4222,7 @@ int CSearchDlg::SearchByFilePath(CSearchInfo& sInfo, const std::wstring& searchR
41994222
}
42004223
end = reinterpret_cast<const CharT*>(inData + skipSize + workSize);
42014224

4202-
bool bTreateWcharAsChars = sizeof(CharT) == 1 && !m_bUseRegex &&
4203-
(sInfo.encoding == CTextFile::Unicode_Le || sInfo.encoding == CTextFile::Binary);
4204-
std::basic_string<CharT> expr = ConvertToString<CharT>(searchExpression, sInfo.encoding, bTreateWcharAsChars);
4225+
std::basic_string<CharT> expr = ConvertToString<CharT>(searchExpression, sInfo.encoding);
42054226

42064227
if (!m_bUseRegex && m_bWholeWords)
42074228
{
@@ -4255,7 +4276,7 @@ int CSearchDlg::SearchByFilePath(CSearchInfo& sInfo, const std::wstring& searchR
42554276
return nFound;
42564277
}
42574278

4258-
std::basic_string<CharT> repl = ConvertToString<CharT>(replaceExpression, sInfo.encoding, bTreateWcharAsChars);
4279+
std::basic_string<CharT> repl = ConvertToString<CharT>(replaceExpression, sInfo.encoding);
42594280

42604281
std::wstring filePathTempW = sInfo.filePath + L".grepwinreplaced";
42614282
std::string filePathTempA = filePathA + ".grepwinreplaced";
@@ -4368,9 +4389,9 @@ void CSearchDlg::SearchFile(CSearchInfo sInfo, const std::wstring& searchRoot)
43684389
sInfo.readError = true;
43694390
nCount = -1;
43704391
}
4371-
else if (bLoadResult && ((type != CTextFile::Binary) || m_bIncludeBinary)) // loaded
4392+
else if (bLoadResult && ((type != CTextFile::Binary) || m_bIncludeBinary)) // transcoded
43724393
{
4373-
// for unrecognized, only `Binary` returns true
4394+
// for unrecognized encodings, only `Binary` returns true; the content is then treated as UTF-16LE, the same as the app's internal encoding
43744395
try
43754396
{
43764397
nCount = SearchOnTextFile(sInfo, searchRoot, searchExpression, replaceExpression, syntaxFlags, matchFlags, textFile);
@@ -4384,33 +4405,84 @@ void CSearchDlg::SearchFile(CSearchInfo sInfo, const std::wstring& searchRoot)
43844405
else if ((type != CTextFile::Binary) || m_bIncludeBinary || m_bForceBinary)
43854406
{
43864407
// file is either too big or binary.
4387-
try
4408+
// types: Ansi, UTF8, Unicode_Le, Unicode_Be and Binary
4409+
std::vector<CTextFile::UnicodeType> encodingTries;
4410+
if (!m_bUseRegex || type == CTextFile::Binary)
43884411
{
4389-
// Ansi, UTF8, Unicode_Le, Unicode_Be and Binary
4390-
if (type != CTextFile::Unicode_Le)
4412+
// Treating a multi-byte char as a sequence of single-byte chars
4413+
// means that part of it may be matched as a standalone char,
4414+
// so it has to be grouped when repeated to get accurate results.
4415+
// Unicode_Le and Unicode_Be in Regex mode are turned into wchar_t branch. UTF8 is still here.
4416+
// Since the file itself is not transcoded, transcoding the search input to other encodings is a trick to get a few more matches.
4417+
// It only works for raw text, not for escape sequences, which consist of pure ASCII chars!
4418+
switch (type)
43914419
{
4392-
// Treating a multi-byte char as single byte chars:
4393-
// yields part of it may be matched as a standalone char,
4394-
// so requires it grouped for repeats to get accurate results.
4395-
nCount = SearchByFilePath<char>(sInfo, searchRoot, searchExpression, replaceExpression, syntaxFlags, matchFlags, false);
4420+
case CTextFile::Binary:
4421+
{
4422+
if (m_bUseRegex)
4423+
encodingTries = {CTextFile::Ansi, CTextFile::UTF8};
4424+
else
4425+
encodingTries = {CTextFile::Ansi, CTextFile::UTF8, CTextFile::Unicode_Le, CTextFile::Unicode_Be};
4426+
}
4427+
break;
4428+
case CTextFile::Ansi:
4429+
case CTextFile::UTF8:
4430+
case CTextFile::Unicode_Le:
4431+
case CTextFile::Unicode_Be:
4432+
default:
4433+
encodingTries = {type};
4434+
break;
43964435
}
4397-
if (type == CTextFile::Unicode_Le || (type == CTextFile::Binary && nCount == 0))
4436+
for (auto assumption : encodingTries)
43984437
{
4399-
nCount = SearchByFilePath<wchar_t>(sInfo, searchRoot, searchExpression, replaceExpression, syntaxFlags, matchFlags, false);
4400-
if (type == CTextFile::Binary)
4401-
nCount += SearchByFilePath<wchar_t>(sInfo, searchRoot, searchExpression, replaceExpression, syntaxFlags, matchFlags, true);
4438+
sInfo.encoding = assumption;
4439+
try
4440+
{
4441+
nCount = SearchByFilePath<char>(sInfo, searchRoot, searchExpression, replaceExpression, syntaxFlags, matchFlags, false);
4442+
}
4443+
catch (...)
4444+
{
4445+
// regex error
4446+
}
4447+
if (nCount > 0)
4448+
{
4449+
break; // trying every encoding is time-consuming, so stop at the first one that matches
4450+
}
44024451
}
44034452
}
4404-
catch (const std::exception& ex)
4405-
{
4406-
sInfo.exception = CUnicodeUtils::StdGetUnicode(ex.what());
4407-
nCount = 1;
4408-
}
4409-
catch (...)
4453+
if (m_bUseRegex && (type == CTextFile::Unicode_Le || type == CTextFile::Unicode_Be || type == CTextFile::Binary))
44104454
{
4411-
nCount = -1;
4412-
return;
4455+
switch (type)
4456+
{
4457+
case CTextFile::Binary:
4458+
encodingTries = {CTextFile::Unicode_Le, CTextFile::Unicode_Be};
4459+
break;
4460+
case CTextFile::Unicode_Le:
4461+
case CTextFile::Unicode_Be:
4462+
default:
4463+
encodingTries = {type};
4464+
break;
4465+
}
4466+
for (auto assumption : encodingTries)
4467+
{
4468+
sInfo.encoding = assumption;
4469+
try
4470+
{
4471+
nCount = SearchByFilePath<wchar_t>(sInfo, searchRoot, searchExpression, replaceExpression, syntaxFlags, matchFlags, false);
4472+
if (type == CTextFile::Binary)
4473+
nCount += SearchByFilePath<wchar_t>(sInfo, searchRoot, searchExpression, replaceExpression, syntaxFlags, matchFlags, true);
4474+
}
4475+
catch (...)
4476+
{
4477+
// regex error
4478+
}
4479+
if (nCount > 0)
4480+
{
4481+
break; // trying every encoding is time-consuming, so stop at the first one that matches
4482+
}
4483+
}
44134484
}
4485+
// sInfo.encoding = type; // show the matched encoding
44144486
}
44154487

44164488
SendMessage(*this, SEARCH_PROGRESS, (nCount >= 0), 0);

0 commit comments

Comments
 (0)