diff --git a/scratch.php b/scratch.php deleted file mode 100644 index 03414260..00000000 --- a/scratch.php +++ /dev/null @@ -1,20 +0,0 @@ - - -

Hello, you!

-HTML; - -// Create a new document with the above HTML. -$document = new DOMDocument("1.0", "utf-8"); -$document->loadHTML($html); - -// Get reference to span tag. -$span = $document->getElementsByTagName("span")->item(0); - -// Set the span's tag to user-supplied $name (malicious user can enter JavaScript!) -$name = ""; -$span->textContent = $name; - -echo $document->saveHTML(); diff --git a/src/Document.php b/src/Document.php index 06a6013d..03ff9caa 100644 --- a/src/Document.php +++ b/src/Document.php @@ -53,29 +53,28 @@ abstract class Document extends DOMDocument implements Stringable, StreamInterfa DOMText::class => Text::class, DOMProcessingInstruction::class => ProcessingInstruction::class, ]; + const DOCTYPE = ""; public function __construct( public readonly string $characterSet, public readonly string $contentType, ) { parent::__construct("1.0", $this->characterSet); + $this->encoding = $this->characterSet; + $this->substituteEntities = true; $this->registerNodeClasses(); libxml_use_internal_errors(true); } public function __toString():string { if(get_class($this) === HTMLDocument::class) { - $string = $this->saveHTML(); + $string = self::DOCTYPE . "\n"; + $string .= $this->saveHTML($this->documentElement); } else { $string = $this->saveXML(); } - $string = mb_convert_encoding( - $string, - "UTF-8", - "HTML-ENTITIES" - ); return trim($string) . "\n"; } @@ -363,4 +362,47 @@ private function registerNodeClasses():void { $this->registerNodeClass($nativeClass, $gtClass); } } + + /** + * Due to the way HTML is rendered, non-ASCII characters are converted + * into their HTML-encoded counterparts, but this behaviour breaks + * script tags that have inline JavaScript. This function extracts the + * raw innerHTML of each script, so injectScriptHTML can be called after + * page render, retaining the original characters. + * + * @return array Key = a unique string of characters + * that the script tag's innerHTML is replaced with, before rendering + * the document. This key will be replaced with the value of the array + * item after render. + */ + private function extractScriptHTML():array { + $scriptHtmlList = []; + + foreach($this->querySelectorAll("script") as $script) { + if(strlen($script->textContent) === 0) { + continue; + } + $html = html_entity_decode($script->innerHTML ?? ""); + $key = str_repeat("@", 16) + . uniqid("---script-") . "---" + . str_repeat("@", 16); + $scriptHtmlList[$key] = $html; + $script->innerHTML = $key; + } + + return $scriptHtmlList; + } + +// public function saveHTML(DOMNode $node = null):string { +// $scriptHtmlList = $this->extractScriptHTML(); +// if(!$node) { +// $node = $this->documentElement; +// } +// $html = parent::saveHTML((new \DOMXPath($this))->query('/')->item(0)); +// foreach($scriptHtmlList as $key => $js) { +// $html = str_replace($key, $js, $html); +// } +// var_Dump($this->encoding, $this->substituteEntities);die(); +// return $html; +// } } diff --git a/src/HTMLDocument.php b/src/HTMLDocument.php index 0628c73e..ffb0d2f6 100644 --- a/src/HTMLDocument.php +++ b/src/HTMLDocument.php @@ -27,13 +27,20 @@ public function __construct( "text/html", ); - $html = mb_convert_encoding( - $html, - "HTML-ENTITIES", - $this->characterSet, - ); +// Workaround for handling UTF-8 encoding correctly. +// @link https://stackoverflow.com/questions/8218230/php-domdocument-loadhtml-not-encoding-utf-8-correctly + $html = 'encoding) + . '" ?>' + . $html; $this->loadHTML($html, LIBXML_SCHEMA_CREATE | LIBXML_COMPACT); + foreach($this->childNodes as $child) { + if($child instanceof ProcessingInstruction) { + $this->removeChild($child); + } + } + /** @var array $nonElementChildNodes */ $nonElementChildNodes = []; foreach($this->childNodes as $child) { if($child instanceof DocumentType diff --git a/src/ParentNode.php b/src/ParentNode.php index 439e3972..24ca07c4 100644 --- a/src/ParentNode.php +++ b/src/ParentNode.php @@ -258,7 +258,7 @@ public function getElementsByTagName(string $qualifiedName):HTMLCollection { * * @param Node|Element|Text|Comment $child */ - public function removeChild(Node|Element|Text|Comment|DOMNode $child):Node|Element|Text|Comment { + public function removeChild(Node|Element|Text|Comment|DOMNode|ProcessingInstruction $child):Node|Element|Text|Comment|CdataSection|ProcessingInstruction { try { /** @var Node|Element|Text|Comment $removed */ $removed = parent::removeChild($child); diff --git a/test/phpunit/DocumentStreamTest.php b/test/phpunit/DocumentStreamTest.php index 27b34d42..9926daa1 100644 --- a/test/phpunit/DocumentStreamTest.php +++ b/test/phpunit/DocumentStreamTest.php @@ -73,7 +73,7 @@ public function testEof():void { while(!$sut->eof()) { $bytes .= $sut->read(10); } - self::assertEquals("\n\n", $bytes); + self::assertEquals("\n\n", $bytes); } public function testIsSeekableBeforeOpen():void { @@ -132,7 +132,7 @@ public function testGetContents():void { $sut->body->appendChild($sut->createElement("example")); $sut->open(); $contents = $sut->getContents(); - self::assertEquals("\n\n", $contents); + self::assertEquals("\n\n", $contents); } public function testGetMetaData():void { diff --git a/test/phpunit/ElementTest.php b/test/phpunit/ElementTest.php index 2f3cd657..d198e037 100644 --- a/test/phpunit/ElementTest.php +++ b/test/phpunit/ElementTest.php @@ -168,6 +168,34 @@ public function testInnerHTMLReset():void { self::assertEquals("And another", $sut->children[1]->innerHTML); } + public function testInnerText():void { + $document = new HTMLDocument(); + $sut = $document->createElement("span"); + $sut->innerText = "Hello, World!"; + self::assertSame($sut->innerText, $sut->innerHTML); + } + + public function testInnerText_containsHTML():void { + $document = new HTMLDocument(); + $sut = $document->createElement("span"); + $textWithHTML = "Hello, World!"; + $sut->innerText = $textWithHTML; + self::assertSame($textWithHTML, $sut->innerText); + self::assertSame("Hello, <b>World</b>!", $sut->innerHTML); + } + + public function testTextContent():void { + $document = new HTMLDocument(); + $sut = $document->createElement("span"); + $document->body->appendChild($sut); + + $textWithHTML = "Hello, World!"; + $sut->textContent = $textWithHTML; + self::assertNotSame($textWithHTML, $sut->innerHTML); + self::assertSame($textWithHTML, $sut->innerText); + self::assertSame("Hello, <b>World</b>!", $sut->innerHTML); + } + public function testOuterHTML():void { $document = new HTMLDocument(); $sut = $document->createElement("example"); diff --git a/test/phpunit/HTMLDocumentTest.php b/test/phpunit/HTMLDocumentTest.php index 30f8167e..1f65cc56 100644 --- a/test/phpunit/HTMLDocumentTest.php +++ b/test/phpunit/HTMLDocumentTest.php @@ -59,7 +59,7 @@ public function testAppendChild_createdElementsAreNotNamespaced():void { public function testToString_emojiEncoding():void { $html = "

I ❤️ my 🐈

"; $sut = new HTMLDocument($html); - self::assertStringContainsString("$html", (string)$sut); + self::assertStringContainsString("

I ❤️ my 🐈

", (string)$sut); } public function testPropBody_readOnly():void { @@ -83,13 +83,13 @@ public function testPropBody_instanceOfHTMLBodyElementDefaultHTML():void { public function testToString_emptyHTML():void { $sut = new HTMLDocument(); /** @noinspection HtmlRequiredLangAttribute */ - self::assertEquals("\n\n", (string)$sut); + self::assertEquals("\n\n", (string)$sut); } public function testToStringDefaultHTML():void { $sut = new HTMLDocument(DocumentTestFactory::HTML_DEFAULT); /** @noinspection HtmlRequiredLangAttribute */ - self::assertEquals("\n

Hello, PHP.Gt!

\n", (string)$sut); + self::assertEquals("\n

Hello, PHP.Gt!

\n", (string)$sut); } public function testPropCharacter_default():void { @@ -262,7 +262,7 @@ public function testWriteHTMLDocument():void { $contents = stream_get_contents($stream); /** @noinspection HtmlRequiredLangAttribute */ $expected = << +

Hello, PHP.Gt!

$message HTML; @@ -281,7 +281,7 @@ public function testWritelnHTMLDocument():void { $contents = stream_get_contents($stream); /** @noinspection HtmlRequiredLangAttribute */ $expected = << +

Hello, PHP.Gt!

$message1 $message2 @@ -621,4 +621,164 @@ public function testGetElementById_afterIdChangedViaNode():void { self::assertSame("changed", $child->getAttribute("id")); self::assertSame($child, $sut->getElementById("changed")); } + + public function testSaveHTML_XSS():void { + $html = << + +

Hello, you!

+ HTML; + +// Create a new document with the above HTML. + $document = new HTMLDocument($html); + $document->loadHTML($html); + +// Get reference to span tag. + $span = $document->getElementsByTagName("span")->item(0); + +// Set the span's tag to user-supplied $name (malicious user can enter JavaScript!) + $name = ""; + $span->textContent = $name; + + $script = $document->querySelector("script"); + self::assertNull($script); + + $documentString = (string)$document; + self::assertStringNotContainsString(" +

This is the page title

+ HTML; + + $sut = new HTMLDocument($content); + $h1 = $sut->querySelector("#pageTitle"); + $div = $sut->createElement("div"); + $div->innerHTML = "lorem"; + $h1->after($div); + + $htmlString = (string)$sut; + self::assertStringContainsString("", $htmlString); + } + + public function testEscapedCharacters_insideScriptTag():void { + $content = << +

+ Hello, Marcin! +

+ + + + HTML; + + $sut = new HTMLDocument($content); + $renderedHTML = (string)$sut; + + self::assertStringContainsString('p.append(" są ");', $renderedHTML); + self::assertStringNotContainsString('p.append(" są ");', $renderedHTML); + self::assertStringContainsString('document.createTextNode("fajne");', $renderedHTML); + self::assertStringContainsString('word1.textContent = "Koty też";', $renderedHTML); + self::assertStringNotContainsString('word1.textContent = "Koty też";', $renderedHTML); + } + + public function testEscapedCharacters_multipleScriptTagsShouldNotBeSlow():void { + $content = << + + + + Speed test using lots of script tags + + +

Speed test using lots of script tags

+ + + HTML; + + $sut = new HTMLDocument($content); + + for($i = 0; $i < 1000; $i++) { + $script = $sut->createElement("script"); + $script->innerHTML = "console.log('Polski jest pięknym językiem');"; + if($i % 2 === 0) { + $sut->head->appendChild($script); + } + else { + $sut->body->appendChild($script); + } + } + + $timeStart = microtime(true); + $renderedHTML = (string)$sut; + $timeEnd = microtime(true); + self::assertLessThan( + 1, + $timeEnd - $timeStart, + "It should never take a second to render the HTML, even with 1,000 script nodes" + ); + + self::assertStringContainsString("Polski jest pięknym językiem", $renderedHTML); + self::assertEquals(1000, substr_count($renderedHTML, "Polski jest pięknym językiem")); + } + + public function testEscapedCharacters_entireDom():void { + $content = << + +

Tworzenie i usuwanie elementów

+

+			Koty też
+			
+ + + HTML; + + $stringsToExpect = [ + "Tworzenie i usuwanie elementów", // within the h1 + "Koty też", // within the pre + "zobaczyć co możemy użyć", // within the script tag + ]; + $stringsToNotExpect = [ + "ó", + "ż", + ]; + + $sut = new HTMLDocument($content); + $domString = (string)$sut; + + foreach($stringsToExpect as $needle) { + self::assertStringContainsString($needle, $domString); + } + foreach($stringsToNotExpect as $needle) { + self::assertStringNotContainsString($needle, $domString); + } + } }