Skip to content

Commit 8d11147

Browse files
committed
Added UTF-8 detection for headers
1 parent 07f8b33 commit 8d11147

8 files changed

Lines changed: 216 additions & 25 deletions

File tree

MimeKit/AsyncMimeReader.cs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,6 @@ async Task StepMboxMarkerAsync (CancellationToken cancellationToken)
131131

132132
async Task StepHeadersAsync (CancellationToken cancellationToken)
133133
{
134-
var options = DetectMimeComplianceViolations ? ByteDetectionOptions.Detect8Bit | ByteDetectionOptions.DetectNulls : ByteDetectionOptions.None;
135134
int headersBeginLineNumber = lineNumber;
136135
var eof = false;
137136

@@ -155,6 +154,7 @@ async Task StepHeadersAsync (CancellationToken cancellationToken)
155154
await ReadAheadAsync (ReadAheadSize, 0, cancellationToken).ConfigureAwait (false);
156155

157156
do {
157+
var options = DetectMimeComplianceViolations ? ByteDetectionOptions.Detect8Bit | ByteDetectionOptions.DetectNulls : ByteDetectionOptions.None;
158158
var beginOffset = GetOffset (inputIndex);
159159
var beginLineNumber = lineNumber;
160160
int left = inputEnd - inputIndex;
@@ -282,12 +282,13 @@ async Task StepHeadersAsync (CancellationToken cancellationToken)
282282
}
283283

284284
bool midline = true;
285+
bool ascii = true;
285286

286287
// Consume the header value.
287288
do {
288289
unsafe {
289290
fixed (byte* inbuf = input) {
290-
if (StepHeaderValue (inbuf, ref options, ref midline))
291+
if (StepHeaderValue (inbuf, ref options, ref midline, ref ascii))
291292
break;
292293
}
293294
}
@@ -310,7 +311,7 @@ async Task StepHeadersAsync (CancellationToken cancellationToken)
310311
return;
311312
}
312313

313-
var header = CreateHeader (beginOffset, beginLineNumber, fieldNameLength, headerFieldLength, invalid);
314+
var header = CreateHeader (beginOffset, beginLineNumber, fieldNameLength, headerFieldLength, invalid, ascii);
314315

315316
await OnHeaderReadAsync (header, beginLineNumber, cancellationToken).ConfigureAwait (false);
316317
} while (!eof);

MimeKit/MimeComplianceViolation.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,7 @@ public enum MimeComplianceViolation
182182
/// US-ASCII should be encoded using the encoding mechanism described in the MIME specification and/or should be
183183
/// valid UTF-8 as allowed in the Internationalized Email Headers specification.
184184
/// </remarks>
185-
Unexpected8BitBytesInHeaders,
185+
Unexpected8BitBytesInHeader,
186186

187187
/// <summary>
188188
/// A MIME part's body contained 8-bit content where only 7-bit content was expected.

MimeKit/MimeKit.csproj

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
<Project Sdk="Microsoft.NET.Sdk">
1+
<Project Sdk="Microsoft.NET.Sdk">
22

33
<PropertyGroup>
44
<Description>An Open Source library for creating and parsing MIME, S/MIME, PGP messages on desktop and mobile platforms.</Description>

MimeKit/MimeReader.cs

Lines changed: 28 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
#endif
3232
using System.Threading;
3333
using System.Diagnostics;
34+
using System.Text.Unicode;
3435
using System.Threading.Tasks;
3536
using System.Collections.Generic;
3637
using System.Diagnostics.CodeAnalysis;
@@ -1777,7 +1778,7 @@ void StepHeaderField (int headerFieldLength)
17771778
inputIndex += headerIndex;
17781779
}
17791780

1780-
unsafe bool StepHeaderValue (byte* inbuf, ref ByteDetectionOptions options, ref bool midline)
1781+
unsafe bool StepHeaderValue (byte* inbuf, ref ByteDetectionOptions options, ref bool midline, ref bool ascii)
17811782
{
17821783
byte* start = inbuf + inputIndex;
17831784
byte* inend = inbuf + inputEnd;
@@ -1792,8 +1793,11 @@ unsafe bool StepHeaderValue (byte* inbuf, ref ByteDetectionOptions options, ref
17921793
inptr = ParseUtils.EndOfLine (inptr, inend + 1, options, out var detected);
17931794

17941795
if ((detected & ByteDetectionResults.Detected8Bit) != 0) {
1795-
OnMimeComplianceViolation (MimeComplianceViolation.Unexpected8BitBytesInHeaders, lineBeginOffset, lineNumber);
1796+
// Note: we don't emit Unexpected8BitBytesInHeader here because 8-bit might only indicate UTF-8 which is valid in headers
1797+
// according to RFC 6532. Instead, we'll just track that this header value contains non-ASCII text and check for valid UTF-8
1798+
// once we've got the full value.
17961799
options &= ~ByteDetectionOptions.Detect8Bit;
1800+
ascii = false;
17971801
}
17981802

17991803
if ((detected & ByteDetectionResults.DetectedNulls) != 0) {
@@ -1922,7 +1926,7 @@ unsafe bool TryCheckMboxMarkerWithinHeaderBlock (byte* inbuf)
19221926
return true;
19231927
}
19241928

1925-
Header CreateHeader (long beginOffset, int beginLineNumber, int fieldNameLength, int headerFieldLength, bool invalid)
1929+
Header CreateHeader (long beginOffset, int beginLineNumber, int fieldNameLength, int headerFieldLength, bool invalid, bool ascii)
19261930
{
19271931
byte[] field, value;
19281932

@@ -1942,6 +1946,23 @@ Header CreateHeader (long beginOffset, int beginLineNumber, int fieldNameLength,
19421946
Offset = beginOffset
19431947
};
19441948

1949+
if (DetectMimeComplianceViolations) {
1950+
if (invalid) {
1951+
// This means that the field name itself contains all of the data and is invalid. Check for null bytes *and* non-UTF-8 text.
1952+
var fieldSpan = field.AsSpan ();
1953+
int index = fieldSpan.IndexOf ((byte) '\0');
1954+
1955+
if (index != -1)
1956+
OnMimeComplianceViolation (MimeComplianceViolation.UnexpectedNullBytesInHeader, beginOffset + index, beginLineNumber);
1957+
1958+
if (!Utf8.IsValid (fieldSpan))
1959+
OnMimeComplianceViolation (MimeComplianceViolation.Unexpected8BitBytesInHeader, beginOffset, beginLineNumber);
1960+
} else if (!ascii) {
1961+
if (!Utf8.IsValid (value))
1962+
OnMimeComplianceViolation (MimeComplianceViolation.Unexpected8BitBytesInHeader, beginOffset, beginLineNumber);
1963+
}
1964+
}
1965+
19451966
UpdateHeaderState (header, beginOffset, beginLineNumber);
19461967
headerCount++;
19471968

@@ -1950,7 +1971,6 @@ Header CreateHeader (long beginOffset, int beginLineNumber, int fieldNameLength,
19501971

19511972
unsafe void StepHeaders (byte* inbuf, CancellationToken cancellationToken)
19521973
{
1953-
var options = DetectMimeComplianceViolations ? ByteDetectionOptions.Detect8Bit | ByteDetectionOptions.DetectNulls : ByteDetectionOptions.None;
19541974
int headersBeginLineNumber = lineNumber;
19551975
var eof = false;
19561976

@@ -1974,6 +1994,7 @@ unsafe void StepHeaders (byte* inbuf, CancellationToken cancellationToken)
19741994
ReadAhead (ReadAheadSize, 0, cancellationToken);
19751995

19761996
do {
1997+
var options = DetectMimeComplianceViolations ? ByteDetectionOptions.Detect8Bit | ByteDetectionOptions.DetectNulls : ByteDetectionOptions.None;
19771998
var beginOffset = GetOffset (inputIndex);
19781999
var beginLineNumber = lineNumber;
19792000
int left = inputEnd - inputIndex;
@@ -2080,9 +2101,10 @@ unsafe void StepHeaders (byte* inbuf, CancellationToken cancellationToken)
20802101
}
20812102

20822103
bool midline = true;
2104+
bool ascii = true;
20832105

20842106
// Consume the header value.
2085-
while (!StepHeaderValue (inbuf, ref options, ref midline)) {
2107+
while (!StepHeaderValue (inbuf, ref options, ref midline, ref ascii)) {
20862108
if (ReadAhead (1, 0, cancellationToken) == 0) {
20872109
if (DetectMimeComplianceViolations) {
20882110
if (midline)
@@ -2101,7 +2123,7 @@ unsafe void StepHeaders (byte* inbuf, CancellationToken cancellationToken)
21012123
return;
21022124
}
21032125

2104-
var header = CreateHeader (beginOffset, beginLineNumber, fieldNameLength, headerFieldLength, invalid);
2126+
var header = CreateHeader (beginOffset, beginLineNumber, fieldNameLength, headerFieldLength, invalid, ascii);
21052127

21062128
OnHeaderRead (header, beginLineNumber, cancellationToken);
21072129
} while (!eof);

MimeKit/Utils/Utf8.cs

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
//
2+
// Utf8.cs
3+
//
4+
// Author: Jeffrey Stedfast <jestedfa@microsoft.com>
5+
//
6+
// Copyright (c) 2013-2026 .NET Foundation and Contributors
7+
//
8+
// Permission is hereby granted, free of charge, to any person obtaining a copy
9+
// of this software and associated documentation files (the "Software"), to deal
10+
// in the Software without restriction, including without limitation the rights
11+
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12+
// copies of the Software, and to permit persons to whom the Software is
13+
// furnished to do so, subject to the following conditions:
14+
//
15+
// The above copyright notice and this permission notice shall be included in
16+
// all copies or substantial portions of the Software.
17+
//
18+
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19+
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20+
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21+
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22+
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23+
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24+
// THE SOFTWARE.
25+
//
26+
27+
#if NETFRAMEWORK || NETSTANDARD || !NET8_0_OR_GREATER
28+
29+
using System;
30+
using System.Runtime.CompilerServices;
31+
32+
using Microsoft.Win32;
33+
34+
namespace System.Text.Unicode
35+
{
36+
static class Utf8
{
	/// <summary>
	/// Validate that the value is well-formed UTF-8.
	/// </summary>
	/// <remarks>
	/// <para>Validates that the value is well-formed UTF-8 as defined by RFC 3629 and
	/// Table 3-7 ("Well-Formed UTF-8 Byte Sequences") of the Unicode Standard.</para>
	/// <para>The following are all rejected: stray continuation bytes, truncated sequences,
	/// overlong encodings, encoded UTF-16 surrogates (U+D800..U+DFFF), code points above
	/// U+10FFFF and the obsolete 5- and 6-byte forms. Noncharacters such as U+FFFE are
	/// well-formed and therefore accepted.</para>
	/// </remarks>
	/// <param name="value">The byte string.</param>
	/// <returns><see langword="true"/> if the value is well-formed UTF-8; otherwise, <see langword="false"/>.</returns>
	public static bool IsValid (ReadOnlySpan<byte> value)
	{
		int index = 0;

		// Note: an empty value never enters the loop and is trivially well-formed.
		while (index < value.Length) {
			byte lead = value[index++];

			// Fast path: US-ASCII bytes are always complete sequences on their own.
			if (lead < 0x80)
				continue;

			// Only the *first* continuation byte has a lead-dependent range; restricting it is
			// what excludes overlong forms, surrogates and values beyond U+10FFFF without
			// having to decode the code point.
			byte min = 0x80, max = 0xbf;
			int trailing;

			if (lead >= 0xc2 && lead <= 0xdf) {
				// 2-byte sequence (0xc0 and 0xc1 could only encode overlong forms).
				trailing = 1;
			} else if (lead >= 0xe0 && lead <= 0xef) {
				// 3-byte sequence
				trailing = 2;

				if (lead == 0xe0)
					min = 0xa0; // anything lower would be an overlong encoding
				else if (lead == 0xed)
					max = 0x9f; // anything higher would encode a UTF-16 surrogate
			} else if (lead >= 0xf0 && lead <= 0xf4) {
				// 4-byte sequence
				trailing = 3;

				if (lead == 0xf0)
					min = 0x90; // anything lower would be an overlong encoding
				else if (lead == 0xf4)
					max = 0x8f; // anything higher would exceed U+10FFFF
			} else {
				// Stray continuation byte (0x80..0xbf) or an invalid lead byte
				// (0xc0, 0xc1 or 0xf5..0xff).
				return false;
			}

			// The sequence was truncated by the end of the value.
			if (value.Length - index < trailing)
				return false;

			byte next = value[index++];

			if (next < min || next > max)
				return false;

			// Any remaining continuation bytes must simply match 10xxxxxx.
			for (int i = 1; i < trailing; i++) {
				next = value[index++];

				if ((next & 0xc0) != 0x80)
					return false;
			}
		}

		return true;
	}
}
106+
}
107+
108+
#endif

0 commit comments

Comments
 (0)