diff --git a/src/clean.c b/src/clean.c index e0cd3baa..2a28fafd 100644 --- a/src/clean.c +++ b/src/clean.c @@ -1821,18 +1821,42 @@ void TY_(NormalizeSpaces)(Lexer *lexer, Node *node) for (i = node->start; i < node->end; ++i) { + uint utf8BytesRead = 1, utf8BytesWritten = 0; + tmbchar tempbuf[10] = {0}; + tmbstr result = NULL; c = (byte) lexer->lexbuf[i]; /* look for UTF-8 multibyte character */ if ( c > 0x7F ) - i += TY_(GetUTF8)( lexer->lexbuf + i, &c ); + utf8BytesRead += TY_(GetUTF8)( lexer->lexbuf + i, &c ); if ( c == 160 ) c = ' '; - p = TY_(PutUTF8)(p, c); + result = TY_(PutUTF8)(&tempbuf[0], c); + utf8BytesWritten = (result > &tempbuf[0]) ? (uint)(result - &tempbuf[0]) : 0; + if ( utf8BytesWritten == 0 ) { + (lexer->lexbuf + i)[0] = (tmbchar) c; + ++p; + } + else if ( utf8BytesRead >= utf8BytesWritten ) { + memmove(&(lexer->lexbuf + i)[0], &tempbuf[0], utf8BytesWritten); + i += utf8BytesRead - 1; /* Offset ++i in for loop. */ + p += utf8BytesWritten; + } else { + /* Error; keep this byte and move to the next. */ + if ( c != 0xFFFD && utf8BytesRead != utf8BytesWritten ) { +#if 1 && defined(_DEBUG) + fprintf(stderr, "utf8BytesRead = %u, utf8BytesWritten = %u\n", utf8BytesRead, utf8BytesWritten); + fprintf(stderr, "i = %d, c = %u\n", i, c); +#endif + assert( utf8BytesRead == utf8BytesWritten ); /* Can't extend buffer. */ + } + ++p; + } } - node->end = p - lexer->lexbuf; + intptr_t pos = (p > lexer->lexbuf) ? (p - lexer->lexbuf) : 0; + node->end = (pos >= node->start && pos <= node->end) ? (uint)pos : node->end; } node = node->next; @@ -2504,10 +2528,13 @@ void TY_(DowngradeTypography)(TidyDocImpl* doc, Node* node) for (i = node->start; i < node->end; ++i) { + uint utf8BytesRead = 1, utf8BytesWritten = 0; + tmbchar tempbuf[10] = {0}; + tmbstr result = NULL; c = (unsigned char) lexer->lexbuf[i]; if (c > 0x7F) - i += TY_(GetUTF8)(lexer->lexbuf + i, &c); + utf8BytesRead += TY_(GetUTF8)(lexer->lexbuf + i, &c); if (c >= 0x2013 && c <= 0x201E) { @@ -2530,10 +2557,31 @@ void TY_(DowngradeTypography)(TidyDocImpl* doc, Node* node) } } - p = TY_(PutUTF8)(p, c); + result = TY_(PutUTF8)(&tempbuf[0], c); + utf8BytesWritten = (result > &tempbuf[0]) ? (uint)(result - &tempbuf[0]) : 0; + if ( utf8BytesWritten == 0 ) { + (lexer->lexbuf + i)[0] = (tmbchar) c; + ++p; + } + else if ( utf8BytesRead >= utf8BytesWritten ) { + memmove(&(lexer->lexbuf + i)[0], &tempbuf[0], utf8BytesWritten); + i += utf8BytesRead - 1; /* Offset ++i in for loop. */ + p += utf8BytesWritten; + } else { + /* Error; keep this byte and move to the next. */ + if ( c != 0xFFFD && utf8BytesRead != utf8BytesWritten ) { +#if 1 && defined(_DEBUG) + fprintf(stderr, "utf8BytesRead = %u, utf8BytesWritten = %u\n", utf8BytesRead, utf8BytesWritten); + fprintf(stderr, "i = %d, c = %u\n", i, c); +#endif + assert( utf8BytesRead == utf8BytesWritten ); /* Can't extend buffer. */ + } + ++p; + } } - node->end = p - lexer->lexbuf; + intptr_t pos = (p > lexer->lexbuf) ? (p - lexer->lexbuf) : 0; + node->end = (pos >= node->start && pos <= node->end) ? (uint)pos : node->end; } if (node->content) diff --git a/src/utf8.c b/src/utf8.c index da90b95a..9ac5bd19 100644 --- a/src/utf8.c +++ b/src/utf8.c @@ -321,6 +321,7 @@ int TY_(DecodeUTF8BytesToChar)( uint* c, uint firstByte, ctmbstr successorBytes, } #endif + assert(bytes > 0); *count = bytes; *c = n; if ( hasError ) @@ -443,7 +444,8 @@ uint TY_(GetUTF8)( ctmbstr str, uint *ch ) } *ch = n; - return bytes - 1; + assert(bytes > 0); + return (bytes > 0) ? (bytes - 1) : 0; } /* store char c as UTF-8 encoded byte stream */ @@ -464,6 +466,7 @@ tmbstr TY_(PutUTF8)( tmbstr buf, uint c ) count = 3; } + assert(count >= 0); buf += count; return buf; }