Skip to content

Commit

Permalink
Use new HTML5 parser available on PHP >= 8.4
Browse files Browse the repository at this point in the history
  • Loading branch information
alecpl committed Sep 1, 2024
1 parent da81079 commit 7c8968f
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 25 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
- Convert images in HTML content pasted into HTML editor to `data:` URIs (and later to attachments) (#6938)
- Add possibility to change ATTR_EMULATE_PREPARES via config file (#9213)
- Use draft settings (like DSN) on "Edit as new" (#9349)
- Use new HTML5 parser available on PHP >= 8.4
- Mailvelope: Add a button to enable the extension for webmail domain (#9498)
- OAuth: Add support for SMTP without authentication (#9183)
- OAuth: Add support for OAuth/OpenIDC discovery (#8201)
Expand Down Expand Up @@ -58,6 +59,9 @@
- Fix attachment name decoding when 'charset' parameter exists in the headers (#9376)
- Fix deprecated (in PHP 8.4) use of session_set_save_handler() (#9060)
- Fix potential HTTP protocol version mismatch (#8982)

## Release 1.6.9

- Fix regression where printing/scaling/rotating image attachments was broken (#9571)
- Fix regression where HTML messages were displayed unstyled (#9586)

Expand Down
42 changes: 33 additions & 9 deletions program/lib/Roundcube/rcube_washtml.php
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
<?php

use Dom\HTMLDocument;
use Masterminds\HTML5;

/*
Expand Down Expand Up @@ -299,7 +300,7 @@ private function wash_attribs($node)

// in SVG to/from attribs may contain anything, including URIs
if ($key == 'to' || $key == 'from') {
$key = strtolower($node->getAttribute('attributeName'));
$key = strtolower((string) $node->getAttribute('attributeName'));
if ($key && !isset($this->_html_attribs[$key])) {
$key = null;
}
Expand Down Expand Up @@ -328,6 +329,8 @@ private function wash_attribs($node)
&& (in_array($key, ['id', 'class', 'for']) || ($key == 'name' && $node->nodeName == 'a'))
) {
$out = preg_replace('/(\S+)/', $this->_css_prefix . '\1', $value);
} elseif ($key == 'xmlns' && !strpos($value, '://')) {
continue;
} elseif ($key) {
$out = $value;
}
Expand Down Expand Up @@ -579,12 +582,22 @@ private function dumpHtml($node, $level = 20)
$this->wash_attribs($node), $this->dumpHtml($node, $level), $this);
} elseif (isset($this->_html_elements[$tagName])) {
$content = $this->dumpHtml($node, $level);
$tag = '<' . $node->nodeName;
$tag = '<' . $tagName;

if ($tagName == 'svg') {
$xpath = new DOMXPath($node->ownerDocument);
foreach ($xpath->query('namespace::*') as $ns) {
if ($ns->nodeName != 'xmlns:xml') {
if (method_exists($node, 'getInScopeNamespaces')) {
$ns_nodes = $node->getInScopeNamespaces();
} else {
$xpath = new DOMXPath($node->ownerDocument);
$ns_nodes = $xpath->query('namespace::*');
}

foreach ($ns_nodes as $ns) {
if (isset($ns->nodeName) && isset($ns->nodeValue)
&& $ns->nodeName != 'xmlns:xml'
&& preg_match('/^[a-zA-Z:-]+$/', $ns->nodeName)
&& strpos($ns->nodeValue, '://')
) {
$tag .= sprintf(' %s="%s"',
$ns->nodeName,
htmlspecialchars($ns->nodeValue, \ENT_QUOTES, $this->config['charset'])
Expand All @@ -602,15 +615,15 @@ private function dumpHtml($node, $level = 20)
} elseif ($content === '' && ($this->is_xml || isset($this->_void_elements[$tagName]))) {
$dump .= $tag . ' />';
} else {
$dump .= $tag . '>' . $content . '</' . $node->nodeName . '>';
$dump .= $tag . '>' . $content . '</' . $tagName . '>';
}
} elseif (isset($this->_ignore_elements[$tagName])) {
if ($this->config['add_comments']) {
$dump .= '<!-- ' . htmlspecialchars($node->nodeName, \ENT_QUOTES, $this->config['charset']) . ' not allowed -->';
$dump .= '<!-- ' . htmlspecialchars($tagName, \ENT_QUOTES, $this->config['charset']) . ' not allowed -->';
}
} else {
if ($this->config['add_comments']) {
$dump .= '<!-- ' . htmlspecialchars($node->nodeName, \ENT_QUOTES, $this->config['charset']) . ' ignored -->';
$dump .= '<!-- ' . htmlspecialchars($tagName, \ENT_QUOTES, $this->config['charset']) . ' ignored -->';
}
$dump .= $this->dumpHtml($node, $level); // ignore the tag itself, but keep its content
}
Expand Down Expand Up @@ -657,8 +670,19 @@ public function wash($html)
$this->is_xml = !preg_match('/<(html|head|body)/i', $html) && stripos($html, '<svg') !== false;
$method = $this->is_xml ? 'loadXML' : 'loadHTML';

// Try HTML5 parser available in PHP >= 8.4
// TODO: Parse XML also with this new PHP parser (?)
if (!$this->is_xml && class_exists('Dom\HTMLDocument')) {
try {
$options = constant('Dom\HTML_NO_DEFAULT_NS') | \LIBXML_COMPACT | \LIBXML_NOERROR;
$node = HTMLDocument::createFromString($html, $options, $this->config['charset']);
} catch (Exception $e) {
// ignore, fall back to other methods
}
}

// DOMDocument does not support HTML5, try Masterminds parser if available
if (!$this->is_xml && class_exists('Masterminds\HTML5')) {
if (empty($node) && !$this->is_xml && class_exists('Masterminds\HTML5')) {
try {
// disable_html_ns=true is a workaround for the performance issue
// https://github.com/Masterminds/html5-php/issues/181
Expand Down
24 changes: 8 additions & 16 deletions tests/Framework/WashtmlTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -362,7 +362,7 @@ public function test_wash_svg()
<!-- script not allowed -->
<text x="10" y="25">An example text</text>
<a xlink:href="http://www.w.pl"><rect width="100%" height="100%" /></a>
<!-- foreignObject ignored -->
<!-- foreignobject ignored -->
<set attributeName="onmouseover" x-washed="to" />
<animate attributeName="onunload" x-washed="to" />
<animate attributeName="xlink:href" begin="0" x-washed="from" />
Expand Down Expand Up @@ -400,7 +400,7 @@ public static function provide_wash_svg_tests_cases(): iterable
],
[
'<svg xmlns="&quot; onload=&quot;alert(document.domain)" />',
'<svg xmlns="&quot; onload=&quot;alert(document.domain)" />',
'<svg />',
],
[
'<html><svg xmlns="&quot; onload=&quot;alert(document.domain)" />',
Expand Down Expand Up @@ -824,23 +824,15 @@ public function test_table_bug7356()
<tr><td></td></tr>
</table>';

$expected = '
<table id="t1">
<tr>
<td>
<table id="t2">
<tr>
<td></td>
</tr>
</table>
</td>
</tr>
<tr><td></td></tr>
</table>';
$expected = '<table id="t1"><tr><td><table id="t2"><tr><td></td></tr></table></td></tr><tr><td></td></tr></table>';

if (class_exists('Dom\HTMLDocument')) {
$expected = '<table id="t1"><tbody><tr><td><table id="t2"><tbody><tr></tr><tr><td></td></tr></tbody></table></td></tr><tr><td></td></tr></tbody></table>';
}

$washer = new \rcube_washtml();
$washed = $this->cleanupResult($washer->wash($html));

$this->assertSame(trim($expected), $washed);
$this->assertSame($expected, preg_replace('/>[^<>]+</', '><', $washed));
}
}

0 comments on commit 7c8968f

Please sign in to comment.