diff --git a/composer.json b/composer.json
index 166886f7..024c8e95 100755
--- a/composer.json
+++ b/composer.json
@@ -24,15 +24,20 @@
         "myclabs/php-enum": "^1.7"
     },
     "require-dev": {
-        "phpunit/phpunit": "^7.5.1",
+        "phpunit/phpunit": "^10.5",
         "mockery/mockery": "^1.2",
-        "infection/infection": "^0.13.4",
-        "phan/phan": "^2.4",
+        "infection/infection": ">=0.13.4",
+        "phan/phan": ">=2.4",
         "friendsofphp/php-cs-fixer": "^2.16"
     },
     "autoload": {
         "psr-4": {
             "PHPHtmlParser\\": "src/PHPHtmlParser"
         }
+    },
+    "config": {
+        "allow-plugins": {
+            "infection/extension-installer": true
+        }
     }
 }
diff --git a/src/PHPHtmlParser/DTO/Selector/RuleDTO.php b/src/PHPHtmlParser/DTO/Selector/RuleDTO.php
index 5299e3a0..6a3b0b3c 100644
--- a/src/PHPHtmlParser/DTO/Selector/RuleDTO.php
+++ b/src/PHPHtmlParser/DTO/Selector/RuleDTO.php
@@ -36,6 +36,11 @@ final class RuleDTO
      */
     private $alterNext;
 
+    /**
+     * @var bool
+     */
+    private $isNthOfType;
+
     private function __construct(array $values)
     {
         $this->tag = $values['tag'];
@@ -44,21 +49,24 @@ private function __construct(array $values)
         $this->value = $values['value'];
         $this->noKey = $values['noKey'];
         $this->alterNext = $values['alterNext'];
+        $this->isNthOfType = $values['isNthOfType'];
     }
 
     /**
-     * @param string|array|null $key
+     * @param string|array|int|null $key   an int is a sibling position (negative counts from the end)
      * @param string|array|null $value
+     * @param bool $isNthOfType            true when a numeric $key counts only siblings with the same tag
      */
-    public static function makeFromPrimitives(string $tag, string $operator, $key, $value, bool $noKey, bool $alterNext): RuleDTO
+    public static function makeFromPrimitives(string $tag, string $operator, $key, $value, bool $noKey, bool $alterNext, bool $isNthOfType = false): RuleDTO
     {
         return new RuleDTO([
-            'tag'       => $tag,
-            'operator'  => $operator,
-            'key'       => $key,
-            'value'     => $value,
-            'noKey'     => $noKey,
-            'alterNext' => $alterNext,
+            'tag'         => $tag,
+            'operator'    => $operator,
+            'key'         => $key,
+            'value'       => $value,
+            'noKey'       => $noKey,
+            'alterNext'   => $alterNext,
+            'isNthOfType' => $isNthOfType,
         ]);
     }
 
@@ -97,4 +105,13 @@ public function isAlterNext(): bool
     {
         return $this->alterNext;
     }
+
+    /**
+     * Whether a numeric key counts only siblings with the same tag
+     * (:nth-of-type) instead of all element siblings (:nth-child).
+     */
+    public function isNthOfType(): bool
+    {
+        return $this->isNthOfType;
+    }
 }
diff --git a/src/PHPHtmlParser/Dom/Node/InnerNode.php b/src/PHPHtmlParser/Dom/Node/InnerNode.php
index 448057a7..2b9fb305 100644
--- a/src/PHPHtmlParser/Dom/Node/InnerNode.php
+++ b/src/PHPHtmlParser/Dom/Node/InnerNode.php
@@ -100,6 +100,40 @@ public function countChildren(): int
         return \count($this->children);
     }
 
+    /**
+     * Returns all child nodes of this node, text nodes included.
+     *
+     * Delegates to getChildren(), like the other child* helpers below.
+     */
+    public function childNodes(): array
+    {
+        return $this->getChildren();
+    }
+
+    /**
+     * Returns the child nodes that are not text nodes, re-indexed from 0.
+     *
+     * @return AbstractNode[]
+     */
+    public function childElements(): array
+    {
+        return \array_values(\array_filter($this->getChildren(), static function ($el) {
+            return !$el->isTextNode();
+        }));
+    }
+
+    /**
+     * Returns the HtmlNode children whose tag name equals $tag, re-indexed from 0.
+     *
+     * @return HtmlNode[]
+     */
+    public function childElementsOfType(string $tag): array
+    {
+        return \array_values(\array_filter($this->getChildren(), static function ($el) use ($tag) {
+            return $el instanceof HtmlNode && $el->getTag()->name() === $tag;
+        }));
+    }
+
     /**
      * Adds a child node to this node and returns the id of the child for this
      * parent.
diff --git a/src/PHPHtmlParser/Selector/Parser.php b/src/PHPHtmlParser/Selector/Parser.php
index 4643c467..3f1be13b 100755
--- a/src/PHPHtmlParser/Selector/Parser.php
+++ b/src/PHPHtmlParser/Selector/Parser.php
@@ -19,7 +19,7 @@ class Parser implements ParserInterface
      *
      * @var string
      */
-    private $pattern = "/([\w\-:\*>]*)(?:\#([\w\-]+)|\.([\w\.\-]+))?(?:\[@?(!?[\w\-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
+    private $pattern = "/([\w:*>+~-]*(?:\([\w\d]+\))?)(?:#([\w-]+)|\.([\w\.-]+))?(?:\[@?(!?[\w:-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
 
     /**
      * Parses the selector string.
@@ -40,9 +40,10 @@ public function parseSelectorString(string $selector): ParsedSelectorCollectionD
             $value = null;
             $noKey = false;
             $alterNext = false;
+            $isNthOfType = false;
 
             // check for elements that alter the behavior of the next element
-            if ($tag == '>') {
+            if ($tag == '>' || $tag == '+' || $tag == '~') {
                 $alterNext = true;
             }
 
@@ -58,11 +59,36 @@ public function parseSelectorString(string $selector): ParsedSelectorCollectionD
                 $value = \explode('.', $match[3]);
             }
 
+            // check for pseudo-class selector, e.g. "p:first-child" or ":nth-of-type(2)";
+            // only the tag part is inspected so a ':' inside an attribute is ignored
+            $pos = \strpos($match[1], ':');
+            if ($pos !== false) {
+                $key = 'pseudoclass';
+                $tag = $pos > 0 ? \substr($match[1], 0, $pos) : '*';
+                $value = \substr($match[1], $pos + 1);
+
+                // rewrite the shorthand pseudo-classes to their nth-* equivalents
+                $aliases = [
+                    'first-child'   => 'nth-child(1)',
+                    'last-child'    => 'nth-last-child(1)',
+                    'first-of-type' => 'nth-of-type(1)',
+                    'last-of-type'  => 'nth-last-of-type(1)',
+                ];
+                $value = $aliases[$value] ?? $value;
+
+                // the index becomes the key: positive counts from the first
+                // sibling, negative (nth-last-*) counts from the last one
+                if (\preg_match('/^nth-(last-)?(child|of-type)\((\d+)\)$/', $value, $nth) === 1) {
+                    $key = $nth[1] === '' ? (int) $nth[3] : -(int) $nth[3];
+                    $isNthOfType = $nth[2] === 'of-type';
+                }
+            }
+
             // and final attribute selector
-            if (!empty($match[4])) {
+            if ($pos === false && !empty($match[4])) {
                 $key = \strtolower($match[4]);
             }
             if (!empty($match[5])) {
                 $operator = $match[5];
             }
             if (!empty($match[6])) {
@@ -98,7 +124,8 @@ public function parseSelectorString(string $selector): ParsedSelectorCollectionD
                 $key,
                 $value,
                 $noKey,
-                $alterNext
+                $alterNext,
+                $isNthOfType
             );
             if (isset($match[7]) && \is_string($match[7]) && \trim($match[7]) == ',') {
                 $selectors[] = ParsedSelectorDTO::makeFromRules($rules);
diff --git a/src/PHPHtmlParser/Selector/Seeker.php b/src/PHPHtmlParser/Selector/Seeker.php
index abd6dc4e..3be94cc0 100644
--- a/src/PHPHtmlParser/Selector/Seeker.php
+++ b/src/PHPHtmlParser/Selector/Seeker.php
@@ -6,6 +6,7 @@
 
 use PHPHtmlParser\Contracts\Selector\SeekerInterface;
 use PHPHtmlParser\Dom\Node\AbstractNode;
+use PHPHtmlParser\Dom\Node\HtmlNode;
 use PHPHtmlParser\Dom\Node\InnerNode;
 use PHPHtmlParser\Dom\Node\LeafNode;
 use PHPHtmlParser\DTO\Selector\RuleDTO;
@@ -19,27 +20,25 @@ class Seeker implements SeekerInterface
      *
      * @var InnerNode[]
      *
-     * @throws ChildNotFoundException
+     * @return AbstractNode[]
      */
     public function seek(array $nodes, RuleDTO $rule, array $options): array
     {
-        // XPath index
-        if ($rule->getTag() !== null && \is_numeric($rule->getKey())) {
-            $count = 0;
+        // sibling combinators collect following siblings of $nodes instead of descending
+        if ($rule->getTag() == '+' || $rule->getTag() == '~') {
+            $result = [];
             foreach ($nodes as $node) {
-                if ($rule->getTag() == '*'
-                    || $rule->getTag() == $node->getTag()
-                        ->name()
-                ) {
-                    ++$count;
-                    if ($count == $rule->getKey()) {
-                        // found the node we wanted
-                        return [$node];
+                if ($rule->getTag() == '+' && $node->hasNextSibling()) {
+                    $result[] = $node->nextSibling();
+                } elseif ($rule->getTag() == '~') {
+                    while ($node->hasNextSibling()) {
+                        $node = $node->nextSibling();
+                        $result[] = $node;
                     }
                 }
             }
 
-            return [];
+            return $result;
         }
 
         $options = $this->flattenOptions($options);
@@ -62,16 +61,38 @@ public function seek(array $nodes, RuleDTO $rule, array $options): array
                     continue;
                 }
 
-                $pass = $this->checkTag($rule, $child);
-                if ($pass && $rule->getKey() !== null) {
-                    $pass = $this->checkKey($rule, $child);
+                // only HtmlNode children are candidates for a match
+                if (!$child instanceof HtmlNode) {
+                    $child = $this->getNextChild($node, $child);
+                    continue;
                 }
-                if ($pass &&
-                    $rule->getKey() !== null &&
-                    $rule->getValue() !== null &&
-                    $rule->getValue() != '*'
-                ) {
-                    $pass = $this->checkComparison($rule, $child);
+
+                $pass = true;
+
+                // a numeric key is a sibling position (:nth-child / :nth-of-type);
+                // negative keys count from the last sibling
+                if ($rule->getTag() !== null && \is_numeric($rule->getKey()) && $node instanceof HtmlNode) {
+                    $children = $rule->isNthOfType() ?
+                        $node->childElementsOfType($child->getTag()->name()) :
+                        $node->childElements();
+                    $n = $rule->getKey() < 0 ? \count($children) + $rule->getKey() : $rule->getKey() - 1;
+                    // identity check: a loose == would compare node properties recursively
+                    $pass = isset($children[$n]) && $child === $children[$n];
+                }
+
+                if ($pass) {
+                    $pass = $this->checkTag($rule, $child);
+                    if ($pass && $rule->getKey() !== null && !\is_numeric($rule->getKey())) {
+                        $pass = $this->checkKey($rule, $child);
+                    }
+                    if ($pass &&
+                        $rule->getKey() !== null &&
+                        $rule->getValue() !== null &&
+                        $rule->getValue() != '*' &&
+                        !\is_numeric($rule->getKey())
+                    ) {
+                        $pass = $this->checkComparison($rule, $child);
+                    }
                 }
 
                 if ($pass) {
diff --git a/src/PHPHtmlParser/Selector/Selector.php b/src/PHPHtmlParser/Selector/Selector.php
index 697fb9cd..12150e8b 100755
--- a/src/PHPHtmlParser/Selector/Selector.php
+++ b/src/PHPHtmlParser/Selector/Selector.php
@@ -70,14 +70,19 @@ public function find(AbstractNode $node): Collection
             }
 
             $options = [];
+            $lastRule = null;
             foreach ($selector->getRules() as $rule) {
-                if ($rule->isAlterNext()) {
+                if ($rule->getTag() == '*' && $lastRule && ($lastRule->getTag() == '+' || $lastRule->getTag() == '~')) {
+                    continue;
+                }
+                if ($rule->isAlterNext() && $rule->getTag() == '>') {
                     $options[] = $this->alterNext($rule);
                     continue;
                 }
                 $nodes = $this->seeker->seek($nodes, $rule, $options);
                 // clear the options
                 $options = [];
+                $lastRule = $rule;
             }
 
             // this is the final set of nodes
diff --git a/tests/Selector/SeekerTest.php b/tests/Selector/SeekerTest.php
index d9e0e824..e4263bb0 100644
--- a/tests/Selector/SeekerTest.php
+++ b/tests/Selector/SeekerTest.php
@@ -2,6 +2,7 @@
 
 declare(strict_types=1);
 
+use PHPHtmlParser\Dom\Node\HtmlNode;
 use PHPHtmlParser\DTO\Selector\RuleDTO;
 use PHPHtmlParser\Selector\Seeker;
 use PHPUnit\Framework\TestCase;
@@ -22,4 +23,199 @@ public function testSeekReturnEmptyArray()
         $results = $seeker->seek([], $ruleDTO, []);
         $this->assertCount(0, $results);
     }
+
+    public function testSeekNthChild()
+    {
+        $ruleDTO = RuleDTO::makeFromPrimitives(
+            '*',
+            '=',
+            1,
+            null,
+            false,
+            false
+        );
+
+        $test = new HtmlNode('div');
+        $p1 = new HtmlNode('p');
+        $div = new HtmlNode('div');
+        $p2 = new HtmlNode('p');
+        $test->addChild($p1);
+        $test->addChild($div);
+        $test->addChild($p2);
+
+        $seeker = new Seeker();
+
+        $results = $seeker->seek([$test], $ruleDTO, []);
+        $this->assertCount(1, $results);
+        $this->assertEquals('p', $results[0]->getTag()->name());
+
+        $ruleDTO = RuleDTO::makeFromPrimitives(
+            '*',
+            '=',
+            -1,
+            null,
+            false,
+            false
+        );
+
+        $results = $seeker->seek([$test], $ruleDTO, []);
+        $this->assertCount(1, $results);
+        $this->assertEquals('p', $results[0]->getTag()->name());
+    }
+
+    public function testSeekNthOfType()
+    {
+        $ruleDTO = RuleDTO::makeFromPrimitives(
+            'div',
+            '=',
+            1,
+            null,
+            false,
+            false,
+            true
+        );
+
+        $test = new HtmlNode('div');
+        $p1 = new HtmlNode('p');
+        $div = new HtmlNode('div');
+        $p2 = new HtmlNode('p');
+        $test->addChild($p1);
+        $test->addChild($div);
+        $test->addChild($p2);
+
+        $seeker = new Seeker();
+
+        $results = $seeker->seek([$test], $ruleDTO, []);
+        $this->assertCount(1, $results);
+        $this->assertEquals('div', $results[0]->getTag()->name());
+
+        $ruleDTO = RuleDTO::makeFromPrimitives(
+            'p',
+            '=',
+            2,
+            null,
+            false,
+            false,
+            true
+        );
+
+        $results = $seeker->seek([$test], $ruleDTO, []);
+        $this->assertCount(1, $results);
+        $this->assertEquals('p', $results[0]->getTag()->name());
+        $this->assertSame($test->lastChild(), $results[0]);
+
+        $ruleDTO = RuleDTO::makeFromPrimitives(
+            'p',
+            '=',
+            -1,
+            null,
+            false,
+            false,
+            true
+        );
+
+        $results = $seeker->seek([$test], $ruleDTO, []);
+        $this->assertCount(1, $results);
+        $this->assertEquals('p', $results[0]->getTag()->name());
+        $this->assertSame($test->lastChild(), $results[0]);
+    }
+
+    public function testSeekNextOne()
+    {
+        $ruleDTO = RuleDTO::makeFromPrimitives(
+            '+',
+            '=',
+            null,
+            null,
+            false,
+            false
+        );
+
+        $test = new HtmlNode('div');
+        $p1 = new HtmlNode('p');
+        $div = new HtmlNode('div');
+        $p2 = new HtmlNode('p');
+        $test->addChild($p1);
+        $test->addChild($div);
+        $test->addChild($p2);
+
+        $seeker = new Seeker();
+
+        $results = $seeker->seek([$p1], $ruleDTO, []);
+        $this->assertCount(1, $results);
+        $this->assertEquals('div', $results[0]->getTag()->name());
+
+        $ruleDTO = RuleDTO::makeFromPrimitives(
+            '+',
+            '=',
+            null,
+            null,
+            false,
+            false
+        );
+
+        $results = $seeker->seek([$div], $ruleDTO, []);
+        $this->assertCount(1, $results);
+        $this->assertEquals('p', $results[0]->getTag()->name());
+    }
+
+    public function testSeekNextOneWithoutSibling()
+    {
+        $ruleDTO = RuleDTO::makeFromPrimitives(
+            '+',
+            '=',
+            null,
+            null,
+            false,
+            false
+        );
+
+        $test = new HtmlNode('div');
+        $p1 = new HtmlNode('p');
+        $test->addChild($p1);
+
+        $seeker = new Seeker();
+
+        $results = $seeker->seek([$p1], $ruleDTO, []);
+        $this->assertCount(0, $results);
+    }
+
+    public function testSeekNextAll()
+    {
+        $ruleDTO = RuleDTO::makeFromPrimitives(
+            '~',
+            '=',
+            null,
+            null,
+            false,
+            false
+        );
+
+        $test = new HtmlNode('div');
+        $p1 = new HtmlNode('p');
+        $div = new HtmlNode('div');
+        $p2 = new HtmlNode('p');
+        $test->addChild($p1);
+        $test->addChild($div);
+        $test->addChild($p2);
+
+        $seeker = new Seeker();
+
+        $results = $seeker->seek([$p1], $ruleDTO, []);
+        $this->assertCount(2, $results);
+        $this->assertEquals('p', $results[1]->getTag()->name());
+
+        $ruleDTO = RuleDTO::makeFromPrimitives(
+            '~',
+            '=',
+            null,
+            null,
+            false,
+            false
+        );
+
+        $results = $seeker->seek([$div], $ruleDTO, []);
+        $this->assertCount(1, $results);
+        $this->assertEquals('p', $results[0]->getTag()->name());
+    }
 }