From d2d7dcd476f9d76aaa0c48decfc64b102f6e014d Mon Sep 17 00:00:00 2001 From: Jan Tojnar Date: Sat, 16 Mar 2024 22:17:47 +0100 Subject: [PATCH] JSHtml --- src/Readability.php | 61 ++++++++++++++++++++++----------------------- 1 file changed, 30 insertions(+), 31 deletions(-) diff --git a/src/Readability.php b/src/Readability.php index 5221f51..81f223f 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -120,7 +120,7 @@ class Readability implements LoggerAwareInterface */ protected ?string $domainRegExp = null; - protected ?\DOMElement $body = null; + protected ?JSLikeHTMLElement $body = null; /** * @var ?string Cache the body HTML in case we need to re-use it later @@ -262,6 +262,7 @@ class Readability implements LoggerAwareInterface // Assume successful outcome $this->success = true; + /** @var \DOMNodeList */ $bodyElems = $this->dom->getElementsByTagName('body'); // WTF multiple body nodes? @@ -284,7 +285,7 @@ class Readability implements LoggerAwareInterface $articleTitle = $this->getArticleTitle(); $articleContent = $this->grabArticle(); - if (!$articleContent) { + if (null === $articleContent) { $this->success = false; $articleContent = $this->dom->createElement('div'); $articleContent->setAttribute('class', 'readability-content'); @@ -423,7 +424,7 @@ class Readability implements LoggerAwareInterface } // Remove service data-candidate attribute. - /** @var \DOMNodeList<\DOMElement> */ + /** @var \DOMNodeList */ $elems = $xpath->query('.//*[@data-candidate]', $articleContent); for ($i = $elems->length - 1; $i >= 0; --$i) { $elems->item($i)->removeAttribute('data-candidate'); @@ -519,7 +520,7 @@ class Readability implements LoggerAwareInterface /** * Remove the style attribute on every $e and under. */ - public function cleanStyles(\DOMElement $e): void + public function cleanStyles(JSLikeHTMLElement $e): void { if (\is_object($e)) { $elems = $e->getElementsByTagName('*'); @@ -552,7 +553,7 @@ class Readability implements LoggerAwareInterface * This is the amount of text that is inside a link divided by the total text in the node. * Can exclude external references to differentiate between simple text and menus/infoblocks. */ - public function getLinkDensity(\DOMElement $e, bool $excludeExternal = false): float + public function getLinkDensity(JSLikeHTMLElement $e, bool $excludeExternal = false): float { $links = $e->getElementsByTagName('a'); $textLength = mb_strlen($this->getInnerText($e, true, true)); @@ -575,7 +576,7 @@ class Readability implements LoggerAwareInterface /** * Get an element relative weight. */ - public function getWeight(\DOMElement $e): int + public function getWeight(JSLikeHTMLElement $e): int { if (!$this->flagIsActive(self::FLAG_WEIGHT_ATTRIBUTES)) { return 0; @@ -606,7 +607,7 @@ class Readability implements LoggerAwareInterface * * Updated 2012-09-18 to preserve youtube/vimeo iframes */ - public function clean(\DOMElement $e, string $tag): void + public function clean(JSLikeHTMLElement $e, string $tag): void { $targetList = $e->getElementsByTagName($tag); $isEmbed = ('audio' === $tag || 'video' === $tag || 'iframe' === $tag || 'object' === $tag || 'embed' === $tag); @@ -638,7 +639,7 @@ class Readability implements LoggerAwareInterface * "Fishy" is an algorithm based on content length, classnames, * link density, number of images & embeds, etc. */ - public function cleanConditionally(\DOMElement $e, string $tag): void + public function cleanConditionally(JSLikeHTMLElement $e, string $tag): void { if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) { return; @@ -751,7 +752,7 @@ class Readability implements LoggerAwareInterface /** * Clean out spurious headers from an Element. Checks things like classnames and link density. */ - public function cleanHeaders(\DOMElement $e): void + public function cleanHeaders(JSLikeHTMLElement $e): void { for ($headerIndex = 1; $headerIndex < 3; ++$headerIndex) { $headers = $e->getElementsByTagName('h' . $headerIndex); @@ -791,7 +792,7 @@ class Readability implements LoggerAwareInterface /** * Get the article title as an H1. */ - protected function getArticleTitle(): \DOMElement + protected function getArticleTitle(): JSLikeHTMLElement { try { $curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0)); @@ -861,7 +862,7 @@ class Readability implements LoggerAwareInterface * Initialize a node with the readability object. Also checks the * className/id for special names to add to its score. */ - protected function initializeNode(\DOMElement $node): void + protected function initializeNode(JSLikeHTMLElement $node): void { if (!isset($node->tagName)) { return; @@ -929,10 +930,8 @@ class Readability implements LoggerAwareInterface /** * Using a variety of metrics (content score, classname, element types), find the content that is * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. - * - * @return \DOMElement|false */ - protected function grabArticle(?\DOMElement $page = null) + protected function grabArticle(?JSLikeHTMLElement $page = null): ?JSLikeHTMLElement { if (!$page) { $page = $this->dom; @@ -1078,7 +1077,7 @@ class Readability implements LoggerAwareInterface foreach ($ancestors as $level => $ancestor) { if (!$ancestor->nodeName || !$ancestor->parentNode) { - return false; + return null; } if (!$ancestor->hasAttribute('readability')) { @@ -1103,13 +1102,13 @@ class Readability implements LoggerAwareInterface * This is faster to do before scoring but safer after. */ if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS) && $xpath) { - /** @var \DOMNodeList<\DOMElement> */ + /** @var \DOMNodeList */ $candidates = $xpath->query('.//*[(self::footer and count(//footer)<2) or (self::aside and count(//aside)<2)]', $page->documentElement); for ($c = $candidates->length - 1; $c >= 0; --$c) { $node = $candidates->item($c); // node should be readable but not inside of an article otherwise it's probably non-readable block - if ($node->hasAttribute('readability') && (int) $node->getAttributeNode('readability')->value < 40 && ($node->parentNode instanceof \DOMElement ? 0 !== strcasecmp($node->parentNode->tagName, 'article') : true)) { + if ($node->hasAttribute('readability') && (int) $node->getAttributeNode('readability')->value < 40 && ($node->parentNode instanceof JSLikeHTMLElement ? 0 !== strcasecmp($node->parentNode->tagName, 'article') : true)) { $this->logger->debug('Removing unlikely candidate (using note) ' . $node->getNodePath() . ' by "' . $node->tagName . '" with readability ' . self::getContentScore($node)); $node->parentNode->removeChild($node); } @@ -1130,7 +1129,7 @@ class Readability implements LoggerAwareInterface $topCandidates = array_fill(0, 5, null); if ($xpath) { // Using array of DOMElements after deletion is a path to DOOMElement. - /** @var \DOMNodeList<\DOMElement> */ + /** @var \DOMNodeList */ $candidates = $xpath->query('.//*[@data-candidate]', $page->documentElement); $this->logger->debug('Candidates: ' . $candidates->length); @@ -1157,7 +1156,7 @@ class Readability implements LoggerAwareInterface } } - /** @var \DOMNodeList<\DOMElement> */ + /** @var \DOMNodeList */ $topCandidates = array_filter( $topCandidates, fn ($v, $idx) => 0 === $idx || null !== $v, @@ -1250,7 +1249,7 @@ class Readability implements LoggerAwareInterface if (0 === strcasecmp($tagName, 'td') || 0 === strcasecmp($tagName, 'tr')) { $up = $topCandidate; - if ($up->parentNode instanceof \DOMElement) { + if ($up->parentNode instanceof JSLikeHTMLElement) { $up = $up->parentNode; if (0 === strcasecmp($up->tagName, 'table')) { @@ -1280,7 +1279,7 @@ class Readability implements LoggerAwareInterface $siblingNode = $siblingNodes->item($s); $siblingNodeName = $siblingNode->nodeName; $append = false; - $this->logger->debug('Looking at sibling node: ' . $siblingNode->getNodePath() . (($siblingNode instanceof \DOMElement && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : '')); + $this->logger->debug('Looking at sibling node: ' . $siblingNode->getNodePath() . (($siblingNode instanceof JSLikeHTMLElement && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : '')); if ($siblingNode->isSameNode($topCandidate)) { $append = true; @@ -1288,11 +1287,11 @@ class Readability implements LoggerAwareInterface $contentBonus = 0; // Give a bonus if sibling nodes and top candidates have the same classname. - if ($siblingNode instanceof \DOMElement && $siblingNode->getAttribute('class') === $topCandidate->getAttribute('class') && '' !== $topCandidate->getAttribute('class')) { + if ($siblingNode instanceof JSLikeHTMLElement && $siblingNode->getAttribute('class') === $topCandidate->getAttribute('class') && '' !== $topCandidate->getAttribute('class')) { $contentBonus += ((int) $topCandidate->getAttribute('readability')) * 0.2; } - if ($siblingNode instanceof \DOMElement && $siblingNode->hasAttribute('readability') && (((int) $siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold) { + if ($siblingNode instanceof JSLikeHTMLElement && $siblingNode->hasAttribute('readability') && (((int) $siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold) { $append = true; } elseif (0 === strcasecmp($siblingNodeName, 'p')) { $linkDensity = (int) $this->getLinkDensity($siblingNode); @@ -1369,7 +1368,7 @@ class Readability implements LoggerAwareInterface return $this->grabArticle($this->body); } - return false; + return null; } return $articleContent; @@ -1379,7 +1378,7 @@ class Readability implements LoggerAwareInterface * Get an element weight by attribute. * Uses regular expressions to tell if this element looks good or bad. */ - protected function weightAttribute(\DOMElement $element, string $attribute): int + protected function weightAttribute(JSLikeHTMLElement $element, string $attribute): int { if (!$element->hasAttribute($attribute)) { return 0; @@ -1423,7 +1422,7 @@ class Readability implements LoggerAwareInterface * * @param callable(float): float $f */ - private static function updateContentScore(\DOMElement $element, callable $f): void + private static function updateContentScore(JSLikeHTMLElement $element, callable $f): void { $readabilityAttr = $element->getAttributeNode('readability'); $prevScore = (float) $readabilityAttr->value; @@ -1433,7 +1432,7 @@ class Readability implements LoggerAwareInterface /** * Gets the content score for given element. */ - private static function getContentScore(\DOMElement $element): float + private static function getContentScore(JSLikeHTMLElement $element): float { return $element->hasAttribute('readability') ? (float) $element->getAttribute('readability') : 0; } @@ -1505,11 +1504,11 @@ class Readability implements LoggerAwareInterface $this->dom->registerNodeClass(\DOMElement::class, JSLikeHTMLElement::class); } - private function getAncestors(\DOMElement $node, int $maxDepth = 0): array + private function getAncestors(JSLikeHTMLElement $node, int $maxDepth = 0): array { $ancestors = []; $i = 0; - while ($node->parentNode instanceof \DOMElement) { + while ($node->parentNode instanceof JSLikeHTMLElement) { $ancestors[] = $node->parentNode; if (++$i === $maxDepth) { break; @@ -1537,7 +1536,7 @@ class Readability implements LoggerAwareInterface ); } - private function hasSingleTagInsideElement(\DOMElement $node, string $tag): bool + private function hasSingleTagInsideElement(JSLikeHTMLElement $node, string $tag): bool { if (1 !== $node->childNodes->length || $node->childNodes->item(0)->nodeName !== $tag) { return false; @@ -1557,7 +1556,7 @@ class Readability implements LoggerAwareInterface * Tidy must be configured to not clean the input for this function to * work as expected, see $this->tidy_config['clean'] */ - private function isNodeVisible(\DOMElement $node): bool + private function isNodeVisible(JSLikeHTMLElement $node): bool { return !( $node->hasAttribute('style')