From 68dec50924893b57bee456ff10bb3b7e2a3450e2 Mon Sep 17 00:00:00 2001
From: David Kilzer <ddkilzer@apple.com>
Date: Mon, 11 Nov 2024 11:55:44 -0800
Subject: [PATCH] tidy-html5:tidy_fuzzer: Heap-buffer-overflow in
 prvTidyEncodeCharToUTF8Bytes <https://issues.oss-fuzz.com/issues/42498297>

Found by oss-fuzz.

Fixes potential out-of-bounds write in both NormalizeSpaces() and
DowngradeTypography(). Adds assert() statements to catch more bugs with
fuzzing.

* src/clean.c:
(NormalizeSpaces):
(DowngradeTypography):
- Use a temporary buffer when calling PutUTF8() to avoid a heap buffer
  overflow write and to avoid clobbering data in-place.
- Handle all possible return values after calling PutUTF8().
* src/utf8.c:
(DecodeUTF8BytesToChar):
(GetUTF8):
(PutUTF8):
- Add assert() statements to catch bugs during fuzzing.
---
 src/clean.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++------
 src/utf8.c  |  5 ++++-
 2 files changed, 58 insertions(+), 7 deletions(-)

diff --git a/src/clean.c b/src/clean.c
index e0cd3baae..2a28fafdd 100644
--- a/src/clean.c
+++ b/src/clean.c
@@ -1821,18 +1821,42 @@ void TY_(NormalizeSpaces)(Lexer *lexer, Node *node)
 
             for (i = node->start; i < node->end; ++i)
             {
+                uint utf8BytesRead = 1, utf8BytesWritten = 0;
+                tmbchar tempbuf[10] = {0};
+                tmbstr result = NULL;
                 c = (byte) lexer->lexbuf[i];
 
                 /* look for UTF-8 multibyte character */
                 if ( c > 0x7F )
-                    i += TY_(GetUTF8)( lexer->lexbuf + i, &c );
+                    utf8BytesRead += TY_(GetUTF8)( lexer->lexbuf + i, &c );
 
                 if ( c == 160 )
                     c = ' ';
 
-                p = TY_(PutUTF8)(p, c);
+                result = TY_(PutUTF8)(&tempbuf[0], c);
+                utf8BytesWritten = (result > &tempbuf[0]) ? (uint)(result - &tempbuf[0]) : 0;
+                if ( utf8BytesWritten == 0 ) {
+                    (lexer->lexbuf + i)[0] = (tmbchar) c;
+                    ++p;
+                }
+                else if ( utf8BytesRead >= utf8BytesWritten ) {
+                    memmove(&(lexer->lexbuf + i)[0], &tempbuf[0], utf8BytesWritten);
+                    i += utf8BytesRead - 1; /* Offset ++i in for loop. */
+                    p += utf8BytesWritten;
+                } else {
+                    /* Error; keep this byte and move to the next. */
+                    if ( c != 0xFFFD && utf8BytesRead != utf8BytesWritten ) {
+#if 1 && defined(_DEBUG)
+                        fprintf(stderr, "utf8BytesRead = %u, utf8BytesWritten = %u\n", utf8BytesRead, utf8BytesWritten);
+                        fprintf(stderr, "i = %d, c = %u\n", i, c);
+#endif
+                        assert( utf8BytesRead == utf8BytesWritten ); /* Can't extend buffer. */
+                    }
+                    ++p;
+                }
             }
-            node->end = p - lexer->lexbuf;
+            intptr_t pos = (p > lexer->lexbuf) ? (p - lexer->lexbuf) : 0;
+            node->end = (pos >= node->start && pos <= node->end) ? (uint)pos : node->end;
         }
 
         node = node->next;
@@ -2504,10 +2528,13 @@ void TY_(DowngradeTypography)(TidyDocImpl* doc, Node* node)
 
             for (i = node->start; i < node->end; ++i)
             {
+                uint utf8BytesRead = 1, utf8BytesWritten = 0;
+                tmbchar tempbuf[10] = {0};
+                tmbstr result = NULL;
                 c = (unsigned char) lexer->lexbuf[i];
 
                 if (c > 0x7F)
-                    i += TY_(GetUTF8)(lexer->lexbuf + i, &c);
+                    utf8BytesRead += TY_(GetUTF8)(lexer->lexbuf + i, &c);
 
                 if (c >= 0x2013 && c <= 0x201E)
                 {
@@ -2530,10 +2557,31 @@ void TY_(DowngradeTypography)(TidyDocImpl* doc, Node* node)
                     }
                 }
 
-                p = TY_(PutUTF8)(p, c);
+                result = TY_(PutUTF8)(&tempbuf[0], c);
+                utf8BytesWritten = (result > &tempbuf[0]) ? (uint)(result - &tempbuf[0]) : 0;
+                if ( utf8BytesWritten == 0 ) {
+                    (lexer->lexbuf + i)[0] = (tmbchar) c;
+                    ++p;
+                }
+                else if ( utf8BytesRead >= utf8BytesWritten ) {
+                    memmove(&(lexer->lexbuf + i)[0], &tempbuf[0], utf8BytesWritten);
+                    i += utf8BytesRead - 1; /* Offset ++i in for loop. */
+                    p += utf8BytesWritten;
+                } else {
+                    /* Error; keep this byte and move to the next. */
+                    if ( c != 0xFFFD && utf8BytesRead != utf8BytesWritten ) {
+#if 1 && defined(_DEBUG)
+                        fprintf(stderr, "utf8BytesRead = %u, utf8BytesWritten = %u\n", utf8BytesRead, utf8BytesWritten);
+                        fprintf(stderr, "i = %d, c = %u\n", i, c);
+#endif
+                        assert( utf8BytesRead == utf8BytesWritten ); /* Can't extend buffer. */
+                    }
+                    ++p;
+                }
             }
 
-            node->end = p - lexer->lexbuf;
+            intptr_t pos = (p > lexer->lexbuf) ? (p - lexer->lexbuf) : 0;
+            node->end = (pos >= node->start && pos <= node->end) ? (uint)pos : node->end;
         }
 
         if (node->content)
diff --git a/src/utf8.c b/src/utf8.c
index da90b95a7..9ac5bd19e 100644
--- a/src/utf8.c
+++ b/src/utf8.c
@@ -321,6 +321,7 @@ int TY_(DecodeUTF8BytesToChar)( uint* c, uint firstByte, ctmbstr successorBytes,
     }
 #endif
 
+    assert(bytes > 0);
     *count = bytes;
     *c = n;
     if ( hasError )
@@ -443,7 +444,8 @@ uint TY_(GetUTF8)( ctmbstr str, uint *ch )
     }
 
     *ch = n;
-    return bytes - 1;
+    assert(bytes > 0);
+    return (bytes > 0) ? (bytes - 1) : 0;
 }
 
 /* store char c as UTF-8 encoded byte stream */
@@ -464,6 +466,7 @@ tmbstr TY_(PutUTF8)( tmbstr buf, uint c )
         count = 3;
     }
     
+    assert(count >= 0);
     buf += count;
     return buf;
 }