-HTML;
-
-// Create a new document with the above HTML.
-$document = new DOMDocument("1.0", "utf-8");
-$document->loadHTML($html);
-
-// Get reference to span tag.
-$span = $document->getElementsByTagName("span")->item(0);
-
-// Set the span's tag to user-supplied $name (malicious user can enter JavaScript!)
-$name = "";
-$span->textContent = $name;
-
-echo $document->saveHTML();
diff --git a/src/Document.php b/src/Document.php
index 06a6013d..03ff9caa 100644
--- a/src/Document.php
+++ b/src/Document.php
@@ -53,29 +53,28 @@ abstract class Document extends DOMDocument implements Stringable, StreamInterfa
DOMText::class => Text::class,
DOMProcessingInstruction::class => ProcessingInstruction::class,
];
+ const DOCTYPE = "";
public function __construct(
public readonly string $characterSet,
public readonly string $contentType,
) {
parent::__construct("1.0", $this->characterSet);
+ $this->encoding = $this->characterSet; // also set DOMDocument's encoding property to the same character set passed to the parent constructor
+ $this->substituteEntities = true; // libxml-specific DOMDocument property: substitute entities
$this->registerNodeClasses();
libxml_use_internal_errors(true);
}
public function __toString():string {
if(get_class($this) === HTMLDocument::class) {
- $string = $this->saveHTML();
+ $string = self::DOCTYPE . "\n"; // the doctype line is prepended by hand from the DOCTYPE constant
+ $string .= $this->saveHTML($this->documentElement); // passing a node makes saveHTML output only that subtree
}
else {
$string = $this->saveXML();
}
- $string = mb_convert_encoding(
- $string,
- "UTF-8",
- "HTML-ENTITIES"
- );
return trim($string) . "\n";
}
@@ -363,4 +362,47 @@ private function registerNodeClasses():void {
$this->registerNodeClass($nativeClass, $gtClass);
}
}
+
+ /**
+ * Replace the content of every non-empty script element with a unique
+ * placeholder and return the original content keyed by that placeholder.
+ * Rendering converts non-ASCII characters into their HTML-encoded
+ * counterparts, which breaks inline JavaScript. NOTE(review): the
+ * restoring step (injectScriptHTML) is not visible here - confirm.
+ *
+ * @return array<string, string> Key = a unique string of characters
+ * that the script tag's innerHTML is replaced with, before rendering
+ * the document. Value = the script's entity-decoded original
+ * innerHTML, intended to replace the key again after render.
+ */
+ private function extractScriptHTML():array {
+ $scriptHtmlList = [];
+
+ foreach($this->querySelectorAll("script") as $script) {
+ if(strlen($script->textContent) === 0) {
+ continue; // empty script: nothing to extract
+ }
+ $html = html_entity_decode($script->innerHTML ?? ""); // store the decoded form so the raw characters are kept
+ $key = str_repeat("@", 16)
+ . uniqid("---script-") . "---"
+ . str_repeat("@", 16);
+ $scriptHtmlList[$key] = $html;
+ $script->innerHTML = $key; // mutates the DOM: the script now contains only the placeholder
+ }
+
+ return $scriptHtmlList;
+ }
+
+// public function saveHTML(DOMNode $node = null):string {
+// $scriptHtmlList = $this->extractScriptHTML();
+// if(!$node) {
+// $node = $this->documentElement;
+// }
+// $html = parent::saveHTML((new \DOMXPath($this))->query('/')->item(0));
+// foreach($scriptHtmlList as $key => $js) {
+// $html = str_replace($key, $js, $html);
+// }
+// var_Dump($this->encoding, $this->substituteEntities);die();
+// return $html;
+// }
}
diff --git a/src/HTMLDocument.php b/src/HTMLDocument.php
index 0628c73e..ffb0d2f6 100644
--- a/src/HTMLDocument.php
+++ b/src/HTMLDocument.php
@@ -27,13 +27,20 @@ public function __construct(
"text/html",
);
- $html = mb_convert_encoding(
- $html,
- "HTML-ENTITIES",
- $this->characterSet,
- );
+// Workaround for handling UTF-8 encoding correctly.
+// @link https://stackoverflow.com/questions/8218230/php-domdocument-loadhtml-not-encoding-utf-8-correctly
+ $html = 'encoding)
+ . '" ?>'
+ . $html;
$this->loadHTML($html, LIBXML_SCHEMA_CREATE | LIBXML_COMPACT);
+ foreach($this->childNodes as $child) {
+ if($child instanceof ProcessingInstruction) {
+ $this->removeChild($child);
+ }
+ }
+ /** @var array $nonElementChildNodes */
$nonElementChildNodes = [];
foreach($this->childNodes as $child) {
if($child instanceof DocumentType
diff --git a/src/ParentNode.php b/src/ParentNode.php
index 439e3972..24ca07c4 100644
--- a/src/ParentNode.php
+++ b/src/ParentNode.php
@@ -258,7 +258,7 @@ public function getElementsByTagName(string $qualifiedName):HTMLCollection {
*
* @param Node|Element|Text|Comment $child
*/
- public function removeChild(Node|Element|Text|Comment|DOMNode $child):Node|Element|Text|Comment {
+ public function removeChild(Node|Element|Text|Comment|DOMNode|ProcessingInstruction $child):Node|Element|Text|Comment|CdataSection|ProcessingInstruction {
try {
/** @var Node|Element|Text|Comment $removed */
$removed = parent::removeChild($child);
diff --git a/test/phpunit/DocumentStreamTest.php b/test/phpunit/DocumentStreamTest.php
index 27b34d42..9926daa1 100644
--- a/test/phpunit/DocumentStreamTest.php
+++ b/test/phpunit/DocumentStreamTest.php
@@ -73,7 +73,7 @@ public function testEof():void {
while(!$sut->eof()) {
$bytes .= $sut->read(10);
}
- self::assertEquals("\n\n", $bytes);
+ self::assertEquals("\n\n", $bytes);
}
public function testIsSeekableBeforeOpen():void {
@@ -132,7 +132,7 @@ public function testGetContents():void {
$sut->body->appendChild($sut->createElement("example"));
$sut->open();
$contents = $sut->getContents();
- self::assertEquals("\n\n", $contents);
+ self::assertEquals("\n\n", $contents);
}
public function testGetMetaData():void {
diff --git a/test/phpunit/ElementTest.php b/test/phpunit/ElementTest.php
index 2f3cd657..d198e037 100644
--- a/test/phpunit/ElementTest.php
+++ b/test/phpunit/ElementTest.php
@@ -168,6 +168,34 @@ public function testInnerHTMLReset():void {
self::assertEquals("And another", $sut->children[1]->innerHTML);
}
+ public function testInnerText():void {
+ $document = new HTMLDocument();
+ $sut = $document->createElement("span");
+ $sut->innerText = "Hello, World!";
+ self::assertSame($sut->innerText, $sut->innerHTML); // text without markup characters: both properties must return the same string
+ }
+
+ public function testInnerText_containsHTML():void {
+ $document = new HTMLDocument();
+ $sut = $document->createElement("span");
+ $textWithHTML = "Hello, <b>World</b>!"; // fixture must contain markup, otherwise no escaping is exercised
+ $sut->innerText = $textWithHTML;
+ self::assertSame($textWithHTML, $sut->innerText); // innerText round-trips the literal characters
+ self::assertSame("Hello, &lt;b&gt;World&lt;/b&gt;!", $sut->innerHTML); // innerHTML exposes the entity-escaped form
+ }
+
+ public function testTextContent():void {
+ $document = new HTMLDocument();
+ $sut = $document->createElement("span");
+ $document->body->appendChild($sut);
+
+ $textWithHTML = "Hello, <b>World</b>!"; // fixture must contain markup so the escaped and raw forms differ
+ $sut->textContent = $textWithHTML;
+ self::assertNotSame($textWithHTML, $sut->innerHTML); // the markup must not survive verbatim in innerHTML
+ self::assertSame($textWithHTML, $sut->innerText); // the text view returns the literal characters
+ self::assertSame("Hello, &lt;b&gt;World&lt;/b&gt;!", $sut->innerHTML); // innerHTML holds the entity-escaped form
+ }
+
public function testOuterHTML():void {
$document = new HTMLDocument();
$sut = $document->createElement("example");
diff --git a/test/phpunit/HTMLDocumentTest.php b/test/phpunit/HTMLDocumentTest.php
index 30f8167e..1f65cc56 100644
--- a/test/phpunit/HTMLDocumentTest.php
+++ b/test/phpunit/HTMLDocumentTest.php
@@ -59,7 +59,7 @@ public function testAppendChild_createdElementsAreNotNamespaced():void {
public function testToString_emojiEncoding():void {
$html = "
I ❤️ my 🐈
";
$sut = new HTMLDocument($html);
- self::assertStringContainsString("$html", (string)$sut);
+ self::assertStringContainsString("
I ❤️ my 🐈
", (string)$sut);
}
public function testPropBody_readOnly():void {
@@ -83,13 +83,13 @@ public function testPropBody_instanceOfHTMLBodyElementDefaultHTML():void {
public function testToString_emptyHTML():void {
$sut = new HTMLDocument();
/** @noinspection HtmlRequiredLangAttribute */
- self::assertEquals("\n\n", (string)$sut);
+ self::assertEquals("\n\n", (string)$sut);
}
public function testToStringDefaultHTML():void {
$sut = new HTMLDocument(DocumentTestFactory::HTML_DEFAULT);
/** @noinspection HtmlRequiredLangAttribute */
- self::assertEquals("\n
Hello, PHP.Gt!
\n", (string)$sut);
+ self::assertEquals("\n
Hello, PHP.Gt!
\n", (string)$sut);
}
public function testPropCharacter_default():void {
@@ -262,7 +262,7 @@ public function testWriteHTMLDocument():void {
$contents = stream_get_contents($stream);
/** @noinspection HtmlRequiredLangAttribute */
$expected = <<
+
$message1
$message2
@@ -621,4 +621,164 @@ public function testGetElementById_afterIdChangedViaNode():void {
self::assertSame("changed", $child->getAttribute("id"));
self::assertSame($child, $sut->getElementById("changed"));
}
+
+ public function testSaveHTML_XSS():void {
+ $html = <<
+
+
Hello, you!
+ HTML;
+
+// Create a new document with the above HTML.
+ $document = new HTMLDocument($html);
+ $document->loadHTML($html);
+
+// Get reference to span tag.
+ $span = $document->getElementsByTagName("span")->item(0);
+
+// Set the span's text content to the user-supplied $name (a malicious user can enter JavaScript!)
+ $name = "";
+ $span->textContent = $name;
+
+ $script = $document->querySelector("script");
+ self::assertNull($script);
+
+ $documentString = (string)$document;
+ self::assertStringNotContainsString("
+