From 68dec50924893b57bee456ff10bb3b7e2a3450e2 Mon Sep 17 00:00:00 2001 From: David Kilzer Date: Mon, 11 Nov 2024 11:55:44 -0800 Subject: [PATCH] tidy-html5:tidy_fuzzer: Heap-buffer-overflow in prvTidyEncodeCharToUTF8Bytes Found by oss-fuzz. Fixes potential out-of-bounds write in both NormalizeSpaces() and DowngradeTypography(). Adds assert() statements to catch more bugs with fuzzing. * src/clean.c: (NormalizeSpaces): (DowngradeTypography): - Use a temporary buffer when calling PutUTF8() to avoid a heap buffer overflow write and to avoid clobbering data in-place. - Handle all possible return values after calling PutUTF8(). * src/utf8.c: (DecodeUTF8BytesToChar): (GetUTF8): (PutUTF8): - Add assert() statements to catch bugs during fuzzing. --- src/clean.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++------ src/utf8.c | 5 ++++- 2 files changed, 58 insertions(+), 7 deletions(-) diff --git a/src/clean.c b/src/clean.c index e0cd3baae..2a28fafdd 100644 --- a/src/clean.c +++ b/src/clean.c @@ -1821,18 +1821,42 @@ void TY_(NormalizeSpaces)(Lexer *lexer, Node *node) for (i = node->start; i < node->end; ++i) { + uint utf8BytesRead = 1, utf8BytesWritten = 0; + tmbchar tempbuf[10] = {0}; + tmbstr result = NULL; c = (byte) lexer->lexbuf[i]; /* look for UTF-8 multibyte character */ if ( c > 0x7F ) - i += TY_(GetUTF8)( lexer->lexbuf + i, &c ); + utf8BytesRead += TY_(GetUTF8)( lexer->lexbuf + i, &c ); if ( c == 160 ) c = ' '; - p = TY_(PutUTF8)(p, c); + result = TY_(PutUTF8)(&tempbuf[0], c); + utf8BytesWritten = (result > &tempbuf[0]) ? (uint)(result - &tempbuf[0]) : 0; + if ( utf8BytesWritten == 0 ) { + (lexer->lexbuf + i)[0] = (tmbchar) c; + ++p; + } + else if ( utf8BytesRead >= utf8BytesWritten ) { + memmove(&(lexer->lexbuf + i)[0], &tempbuf[0], utf8BytesWritten); + i += utf8BytesRead - 1; /* Offset ++i in for loop. */ + p += utf8BytesWritten; + } else { + /* Error; keep this byte and move to the next. */ + if ( c != 0xFFFD && utf8BytesRead != utf8BytesWritten ) { +#if 1 && defined(_DEBUG) + fprintf(stderr, "utf8BytesRead = %u, utf8BytesWritten = %u\n", utf8BytesRead, utf8BytesWritten); + fprintf(stderr, "i = %d, c = %u\n", i, c); +#endif + assert( utf8BytesRead == utf8BytesWritten ); /* Can't extend buffer. */ + } + ++p; + } } - node->end = p - lexer->lexbuf; + intptr_t pos = (p > lexer->lexbuf) ? (p - lexer->lexbuf) : 0; + node->end = (pos >= node->start && pos <= node->end) ? (uint)pos : node->end; } node = node->next; @@ -2504,10 +2528,13 @@ void TY_(DowngradeTypography)(TidyDocImpl* doc, Node* node) for (i = node->start; i < node->end; ++i) { + uint utf8BytesRead = 1, utf8BytesWritten = 0; + tmbchar tempbuf[10] = {0}; + tmbstr result = NULL; c = (unsigned char) lexer->lexbuf[i]; if (c > 0x7F) - i += TY_(GetUTF8)(lexer->lexbuf + i, &c); + utf8BytesRead += TY_(GetUTF8)(lexer->lexbuf + i, &c); if (c >= 0x2013 && c <= 0x201E) { @@ -2530,10 +2557,31 @@ void TY_(DowngradeTypography)(TidyDocImpl* doc, Node* node) } } - p = TY_(PutUTF8)(p, c); + result = TY_(PutUTF8)(&tempbuf[0], c); + utf8BytesWritten = (result > &tempbuf[0]) ? (uint)(result - &tempbuf[0]) : 0; + if ( utf8BytesWritten == 0 ) { + (lexer->lexbuf + i)[0] = (tmbchar) c; + ++p; + } + else if ( utf8BytesRead >= utf8BytesWritten ) { + memmove(&(lexer->lexbuf + i)[0], &tempbuf[0], utf8BytesWritten); + i += utf8BytesRead - 1; /* Offset ++i in for loop. */ + p += utf8BytesWritten; + } else { + /* Error; keep this byte and move to the next. */ + if ( c != 0xFFFD && utf8BytesRead != utf8BytesWritten ) { +#if 1 && defined(_DEBUG) + fprintf(stderr, "utf8BytesRead = %u, utf8BytesWritten = %u\n", utf8BytesRead, utf8BytesWritten); + fprintf(stderr, "i = %d, c = %u\n", i, c); +#endif + assert( utf8BytesRead == utf8BytesWritten ); /* Can't extend buffer. */ + } + ++p; + } } - node->end = p - lexer->lexbuf; + intptr_t pos = (p > lexer->lexbuf) ? (p - lexer->lexbuf) : 0; + node->end = (pos >= node->start && pos <= node->end) ? (uint)pos : node->end; } if (node->content) diff --git a/src/utf8.c b/src/utf8.c index da90b95a7..9ac5bd19e 100644 --- a/src/utf8.c +++ b/src/utf8.c @@ -321,6 +321,7 @@ int TY_(DecodeUTF8BytesToChar)( uint* c, uint firstByte, ctmbstr successorBytes, } #endif + assert(bytes > 0); *count = bytes; *c = n; if ( hasError ) @@ -443,7 +444,8 @@ uint TY_(GetUTF8)( ctmbstr str, uint *ch ) } *ch = n; - return bytes - 1; + assert(bytes > 0); + return (bytes > 0) ? (bytes - 1) : 0; } /* store char c as UTF-8 encoded byte stream */ @@ -464,6 +466,7 @@ tmbstr TY_(PutUTF8)( tmbstr buf, uint c ) count = 3; } + assert(count >= 0); buf += count; return buf; }