fixup! JSHtml

pull/87/head
Jan Tojnar 2 years ago
parent ccdd16649a
commit 8cbb8a8589
  1. 44
      src/Readability.php

@ -287,6 +287,7 @@ class Readability implements LoggerAwareInterface
if (null === $articleContent) { if (null === $articleContent) {
$this->success = false; $this->success = false;
/** @var JSLikeHTMLElement */
$articleContent = $this->dom->createElement('div'); $articleContent = $this->dom->createElement('div');
$articleContent->setAttribute('class', 'readability-content'); $articleContent->setAttribute('class', 'readability-content');
$articleContent->setInnerHtml('<p>Sorry, Readability was unable to parse this page for content.</p>'); $articleContent->setInnerHtml('<p>Sorry, Readability was unable to parse this page for content.</p>');
@ -302,7 +303,9 @@ class Readability implements LoggerAwareInterface
// without tidy the body can (sometimes) be wiped, so re-create it // without tidy the body can (sometimes) be wiped, so re-create it
if (false === isset($this->body->childNodes)) { if (false === isset($this->body->childNodes)) {
$this->body = $this->dom->createElement('body'); /** @var JSLikeHTMLElement */
$body = $this->dom->createElement('body');
$this->body = $body;
} }
// Clear the old HTML, insert the new content. // Clear the old HTML, insert the new content.
@ -335,19 +338,23 @@ class Readability implements LoggerAwareInterface
*/ */
public function addFootnotes(\DOMElement $articleContent): void public function addFootnotes(\DOMElement $articleContent): void
{ {
/** @var JSLikeHTMLElement */
$footnotesWrapper = $this->dom->createElement('footer'); $footnotesWrapper = $this->dom->createElement('footer');
$footnotesWrapper->setAttribute('class', 'readability-footnotes'); $footnotesWrapper->setAttribute('class', 'readability-footnotes');
$footnotesWrapper->setInnerHtml('<h3>References</h3>'); $footnotesWrapper->setInnerHtml('<h3>References</h3>');
$articleFootnotes = $this->dom->createElement('ol'); $articleFootnotes = $this->dom->createElement('ol');
$articleFootnotes->setAttribute('class', 'readability-footnotes-list'); $articleFootnotes->setAttribute('class', 'readability-footnotes-list');
$footnotesWrapper->appendChild($articleFootnotes); $footnotesWrapper->appendChild($articleFootnotes);
/** @var \DOMNodeList<JSLikeHTMLElement> */
$articleLinks = $articleContent->getElementsByTagName('a'); $articleLinks = $articleContent->getElementsByTagName('a');
$linkCount = 0; $linkCount = 0;
for ($i = 0; $i < $articleLinks->length; ++$i) { for ($i = 0; $i < $articleLinks->length; ++$i) {
$articleLink = $articleLinks->item($i); $articleLink = $articleLinks->item($i);
$footnoteLink = $articleLink->cloneNode(true); $footnoteLink = $articleLink->cloneNode(true);
/** @var JSLikeHTMLElement */
$refLink = $this->dom->createElement('a'); $refLink = $this->dom->createElement('a');
/** @var JSLikeHTMLElement */
$footnote = $this->dom->createElement('li'); $footnote = $this->dom->createElement('li');
$linkDomain = @parse_url($footnoteLink->getAttribute('href'), \PHP_URL_HOST); $linkDomain = @parse_url($footnoteLink->getAttribute('href'), \PHP_URL_HOST);
if (!$linkDomain && isset($this->url)) { if (!$linkDomain && isset($this->url)) {
@ -609,6 +616,7 @@ class Readability implements LoggerAwareInterface
*/ */
public function clean(JSLikeHTMLElement $e, string $tag): void public function clean(JSLikeHTMLElement $e, string $tag): void
{ {
/** @var \DOMNodeList<JSLikeHTMLElement> */
$targetList = $e->getElementsByTagName($tag); $targetList = $e->getElementsByTagName($tag);
$isEmbed = ('audio' === $tag || 'video' === $tag || 'iframe' === $tag || 'object' === $tag || 'embed' === $tag); $isEmbed = ('audio' === $tag || 'video' === $tag || 'iframe' === $tag || 'object' === $tag || 'embed' === $tag);
@ -645,6 +653,7 @@ class Readability implements LoggerAwareInterface
return; return;
} }
/** @var \DOMNodeList<JSLikeHTMLElement> */
$tagsList = $e->getElementsByTagName($tag); $tagsList = $e->getElementsByTagName($tag);
$curTagsLength = $tagsList->length; $curTagsLength = $tagsList->length;
@ -755,6 +764,7 @@ class Readability implements LoggerAwareInterface
public function cleanHeaders(JSLikeHTMLElement $e): void public function cleanHeaders(JSLikeHTMLElement $e): void
{ {
for ($headerIndex = 1; $headerIndex < 3; ++$headerIndex) { for ($headerIndex = 1; $headerIndex < 3; ++$headerIndex) {
/** @var \DOMNodeList<JSLikeHTMLElement> */
$headers = $e->getElementsByTagName('h' . $headerIndex); $headers = $e->getElementsByTagName('h' . $headerIndex);
for ($i = $headers->length - 1; $i >= 0; --$i) { for ($i = $headers->length - 1; $i >= 0; --$i) {
@ -823,6 +833,7 @@ class Readability implements LoggerAwareInterface
$curTitle = $origTitle; $curTitle = $origTitle;
} }
/** @var JSLikeHTMLElement */
$articleTitle = $this->dom->createElement('h1'); $articleTitle = $this->dom->createElement('h1');
$articleTitle->setInnerHtml($curTitle); $articleTitle->setInnerHtml($curTitle);
@ -840,7 +851,9 @@ class Readability implements LoggerAwareInterface
* so we create a new body node and append it to the document. * so we create a new body node and append it to the document.
*/ */
if (null === $this->body) { if (null === $this->body) {
$this->body = $this->dom->createElement('body'); /** @var JSLikeHTMLElement */
$body = $this->dom->createElement('body');
$this->body = $body;
$this->dom->documentElement->appendChild($this->body); $this->dom->documentElement->appendChild($this->body);
} }
@ -944,6 +957,7 @@ class Readability implements LoggerAwareInterface
$xpath = new \DOMXPath($page); $xpath = new \DOMXPath($page);
} }
/** @var \DOMNodeList<JSLikeHTMLElement> */
$allElements = $page->getElementsByTagName('*'); $allElements = $page->getElementsByTagName('*');
for ($nodeIndex = 0; $allElements->item($nodeIndex); ++$nodeIndex) { for ($nodeIndex = 0; $allElements->item($nodeIndex); ++$nodeIndex) {
@ -986,6 +1000,7 @@ class Readability implements LoggerAwareInterface
// (as in, where they contain no other block level elements). // (as in, where they contain no other block level elements).
if ('div' === $tagName) { if ('div' === $tagName) {
if (!preg_match($this->regexps['divToPElements'], $nodeContent)) { if (!preg_match($this->regexps['divToPElements'], $nodeContent)) {
/** @var JSLikeHTMLElement */
$newNode = $this->dom->createElement('p'); $newNode = $this->dom->createElement('p');
try { try {
@ -1156,7 +1171,7 @@ class Readability implements LoggerAwareInterface
} }
} }
/** @var \DOMNodeList<JSLikeHTMLElement> */ /** @var non-empty-array<JSLikeHTMLElement|null> */
$topCandidates = array_filter( $topCandidates = array_filter(
$topCandidates, $topCandidates,
fn ($v, $idx) => 0 === $idx || null !== $v, fn ($v, $idx) => 0 === $idx || null !== $v,
@ -1169,18 +1184,21 @@ class Readability implements LoggerAwareInterface
* We also have to copy the body node so it is something we can modify. * We also have to copy the body node so it is something we can modify.
*/ */
if (null === $topCandidate || 0 === strcasecmp($topCandidate->tagName, 'body')) { if (null === $topCandidate || 0 === strcasecmp($topCandidate->tagName, 'body')) {
/** @var JSLikeHTMLElement */
$topCandidate = $this->dom->createElement('div'); $topCandidate = $this->dom->createElement('div');
if ($page instanceof \DOMDocument) { if ($page instanceof \DOMDocument) {
if (!isset($page->documentElement)) { /** @var ?JSLikeHTMLElement */
$documentElement = $page->documentElement;
if (null === $documentElement) {
// we don't have a body either? what a mess! :) // we don't have a body either? what a mess! :)
$this->logger->debug('The page has no body!'); $this->logger->debug('The page has no body!');
} else { } else {
$this->logger->debug('Setting body to a raw HTML of original page!'); $this->logger->debug('Setting body to a raw HTML of original page!');
$topCandidate->setInnerHtml($page->documentElement->getInnerHTML()); $topCandidate->setInnerHtml($documentElement->getInnerHTML());
$page->documentElement->setInnerHtml(''); $documentElement->setInnerHtml('');
$this->reinitBody(); $this->reinitBody();
$page->documentElement->appendChild($topCandidate); $documentElement->appendChild($topCandidate);
} }
} else { } else {
$topCandidate->setInnerHtml($page->getInnerHTML()); $topCandidate->setInnerHtml($page->getInnerHTML());
@ -1189,7 +1207,7 @@ class Readability implements LoggerAwareInterface
} }
$this->initializeNode($topCandidate); $this->initializeNode($topCandidate);
} elseif ($topCandidate) { } elseif (null !== $topCandidate) {
$alternativeCandidateAncestors = []; $alternativeCandidateAncestors = [];
foreach ($topCandidates as $candidate) { foreach ($topCandidates as $candidate) {
if ((int) $candidate->getAttribute('readability') / (int) $topCandidate->getAttribute('readability') >= 0.75) { if ((int) $candidate->getAttribute('readability') / (int) $topCandidate->getAttribute('readability') >= 0.75) {
@ -1200,7 +1218,7 @@ class Readability implements LoggerAwareInterface
} }
if (\count($alternativeCandidateAncestors) >= 3) { if (\count($alternativeCandidateAncestors) >= 3) {
$parentOfTopCandidate = $topCandidate->parentNode; $parentOfTopCandidate = $topCandidate->parentNode;
while ('body' !== $parentOfTopCandidate->nodeName) { while ('body' !== $parentOfTopCandidate->nodeName && $parentOfTopCandidate instanceof JSLikeHTMLElement) {
$listsContainingThisAncestor = 0; $listsContainingThisAncestor = 0;
for ($ancestorIndex = 0; $ancestorIndex < \count($alternativeCandidateAncestors) && $listsContainingThisAncestor < 3; ++$ancestorIndex) { for ($ancestorIndex = 0; $ancestorIndex < \count($alternativeCandidateAncestors) && $listsContainingThisAncestor < 3; ++$ancestorIndex) {
$listsContainingThisAncestor += (int) \in_array($parentOfTopCandidate, $alternativeCandidateAncestors[$ancestorIndex], true); $listsContainingThisAncestor += (int) \in_array($parentOfTopCandidate, $alternativeCandidateAncestors[$ancestorIndex], true);
@ -1264,6 +1282,7 @@ class Readability implements LoggerAwareInterface
* Now that we have the top candidate, look through its siblings for content that might also be related. * Now that we have the top candidate, look through its siblings for content that might also be related.
* Things like preambles, content split by ads that we removed, etc. * Things like preambles, content split by ads that we removed, etc.
*/ */
/** @var JSLikeHTMLElement */
$articleContent = $this->dom->createElement('div'); $articleContent = $this->dom->createElement('div');
$articleContent->setAttribute('class', 'readability-content'); $articleContent->setAttribute('class', 'readability-content');
$siblingScoreThreshold = max(10, ((int) $topCandidate->getAttribute('readability')) * 0.2); $siblingScoreThreshold = max(10, ((int) $topCandidate->getAttribute('readability')) * 0.2);
@ -1311,6 +1330,7 @@ class Readability implements LoggerAwareInterface
if (0 !== strcasecmp($siblingNodeName, 'div') && 0 !== strcasecmp($siblingNodeName, 'p')) { if (0 !== strcasecmp($siblingNodeName, 'div') && 0 !== strcasecmp($siblingNodeName, 'p')) {
// We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. // We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident.
$this->logger->debug('Altering siblingNode "' . $siblingNodeName . '" to "div".'); $this->logger->debug('Altering siblingNode "' . $siblingNodeName . '" to "div".');
/** @var JSLikeHTMLElement */
$nodeToAppend = $this->dom->createElement('div'); $nodeToAppend = $this->dom->createElement('div');
try { try {
@ -1412,7 +1432,9 @@ class Readability implements LoggerAwareInterface
protected function reinitBody(): void protected function reinitBody(): void
{ {
if (!isset($this->body->childNodes)) { if (!isset($this->body->childNodes)) {
$this->body = $this->dom->createElement('body'); /** @var JSLikeHTMLElement */
$body = $this->dom->createElement('body');
$this->body = $body;
$this->body->setInnerHtml($this->bodyCache); $this->body->setInnerHtml($this->bodyCache);
} }
} }
@ -1544,7 +1566,7 @@ class Readability implements LoggerAwareInterface
private function getSingleTagInsideElement(JSLikeHTMLElement $node, string $tag): ?JSLikeHTMLElement private function getSingleTagInsideElement(JSLikeHTMLElement $node, string $tag): ?JSLikeHTMLElement
{ {
$childNodes = iterator_to_array($node->childNodes); $childNodes = iterator_to_array($node->childNodes);
$children = array_filter($childNodes, fn ($childNode) => $childNode instanceof \DOMElement); $children = array_filter($childNodes, fn ($childNode) => $childNode instanceof JSLikeHTMLElement);
// There should be exactly 1 element child with given tag // There should be exactly 1 element child with given tag
if (1 !== \count($children) || $children[0]->nodeName !== $tag) { if (1 !== \count($children) || $children[0]->nodeName !== $tag) {

Loading…
Cancel
Save