-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMixedUtf8.cs
32 lines (29 loc) · 1.11 KB
/
MixedUtf8.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
/// <summary>
/// Expands hexadecimal character references in <paramref name="xml"/> in two passes:
/// first references with 4-6 hex digits are treated as Unicode code points, then runs of
/// two-digit references (e.g. <c>&amp;#xE2;&amp;#x97;&amp;#x8F;</c>) are treated as the
/// individual bytes of a UTF-8 sequence and decoded together.
/// </summary>
/// <param name="xml">Text that may contain a mix of both reference styles.</param>
/// <returns>
/// The text with every decodable reference replaced by the character(s) it denotes.
/// References that cannot be decoded (surrogate or out-of-range code points, byte runs
/// that are not well-formed UTF-8) are left exactly as they appeared in the input.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="xml"/> is null.</exception>
static string PreProcessMixedEntities(string xml)
{
    // NOTE(review): references that decode to markup characters such as '&' or '<'
    // are emitted as raw characters. If the result is fed to an XML parser afterwards
    // that may need re-escaping - confirm against the caller.

    // Pass 1: standard Unicode character references (4-6 hex digits).
    string processed = Regex.Replace(xml,
        @"&#x([0-9A-Fa-f]{4,6});",
        match =>
        {
            int codePoint = Convert.ToInt32(match.Groups[1].Value, 16);

            // char.ConvertFromUtf32 throws for surrogate code points and for values
            // above U+10FFFF; both can be spelled with 4-6 hex digits, so keep such
            // references untouched instead of crashing on them.
            bool isSurrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;
            if (isSurrogate || codePoint > 0x10FFFF)
            {
                return match.Value;
            }

            return char.ConvertFromUtf32(codePoint);
        });

    // Encoding.UTF8 silently substitutes U+FFFD for malformed input and never throws,
    // which would make the fallback below unreachable. A decoder constructed with
    // throwOnInvalidBytes = true raises DecoderFallbackException instead.
    var strictUtf8 = new UTF8Encoding(false, true);

    // Pass 2: runs of byte-sized references that together spell a UTF-8 sequence.
    processed = Regex.Replace(processed,
        @"(&#x[0-9A-Fa-f]{2};)+",
        match =>
        {
            // Collect the byte value of every reference in the run, in order.
            byte[] utf8Bytes = Regex.Matches(match.Value, @"&#x([0-9A-Fa-f]{2});")
                .Cast<Match>()
                .Select(m => Convert.ToByte(m.Groups[1].Value, 16))
                .ToArray();

            try
            {
                return strictUtf8.GetString(utf8Bytes);
            }
            catch (DecoderFallbackException)
            {
                // Not well-formed UTF-8 (for example a lone &#xE9;): keep the
                // original references rather than corrupting them.
                return match.Value;
            }
        });

    return processed;
}