|
|
|
@ -220,7 +220,7 @@ class Readability implements LoggerAwareInterface |
|
|
|
{ |
|
|
|
{ |
|
|
|
$this->loadHtml(); |
|
|
|
$this->loadHtml(); |
|
|
|
|
|
|
|
|
|
|
|
if (!isset($this->dom->documentElement)) { |
|
|
|
if (!(property_exists($this->dom, 'documentElement') && null !== $this->dom->documentElement)) { |
|
|
|
return false; |
|
|
|
return false; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
@ -236,7 +236,7 @@ class Readability implements LoggerAwareInterface |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
if ($bodyElems->length > 0 && null === $this->body) { |
|
|
|
if ($bodyElems->length > 0 && !$this->body instanceof \DOMElement) { |
|
|
|
$this->body = $bodyElems->item(0); |
|
|
|
$this->body = $bodyElems->item(0); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
@ -264,7 +264,7 @@ class Readability implements LoggerAwareInterface |
|
|
|
$overlay->appendChild($innerDiv); |
|
|
|
$overlay->appendChild($innerDiv); |
|
|
|
|
|
|
|
|
|
|
|
// without tidy the body can (sometimes) be wiped, so re-create it |
|
|
|
// without tidy the body can (sometimes) be wiped, so re-create it |
|
|
|
if (false === isset($this->body->childNodes)) { |
|
|
|
if (!(property_exists($this->body, 'childNodes') && null !== $this->body->childNodes)) { |
|
|
|
$this->body = $this->dom->createElement('body'); |
|
|
|
$this->body = $this->dom->createElement('body'); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
@ -313,7 +313,7 @@ class Readability implements LoggerAwareInterface |
|
|
|
$refLink = $this->dom->createElement('a'); |
|
|
|
$refLink = $this->dom->createElement('a'); |
|
|
|
$footnote = $this->dom->createElement('li'); |
|
|
|
$footnote = $this->dom->createElement('li'); |
|
|
|
$linkDomain = @parse_url($footnoteLink->getAttribute('href'), \PHP_URL_HOST); |
|
|
|
$linkDomain = @parse_url($footnoteLink->getAttribute('href'), \PHP_URL_HOST); |
|
|
|
if (!$linkDomain && isset($this->url)) { |
|
|
|
if (!$linkDomain && null !== $this->url) { |
|
|
|
$linkDomain = @parse_url($this->url, \PHP_URL_HOST); |
|
|
|
$linkDomain = @parse_url($this->url, \PHP_URL_HOST); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
@ -433,7 +433,7 @@ class Readability implements LoggerAwareInterface |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// add extra text to iframe tag to avoid an auto-closing iframe and then break the html code |
|
|
|
// add extra text to iframe tag to avoid an auto-closing iframe and then break the html code |
|
|
|
if ($iframeCount) { |
|
|
|
if (0 !== $iframeCount) { |
|
|
|
$iframe = $item->getElementsByTagName('iframe'); |
|
|
|
$iframe = $item->getElementsByTagName('iframe'); |
|
|
|
$iframe->item(0)->nodeValue = ' '; |
|
|
|
$iframe->item(0)->nodeValue = ' '; |
|
|
|
|
|
|
|
|
|
|
|
@ -463,7 +463,7 @@ class Readability implements LoggerAwareInterface |
|
|
|
*/ |
|
|
|
*/ |
|
|
|
public function getInnerText($e, bool $normalizeSpaces = true, bool $flattenLines = false): string |
|
|
|
public function getInnerText($e, bool $normalizeSpaces = true, bool $flattenLines = false): string |
|
|
|
{ |
|
|
|
{ |
|
|
|
if (null === $e || !isset($e->textContent) || '' === $e->textContent) { |
|
|
|
if (!$e instanceof \DOMNode || !(property_exists($e, 'textContent') && null !== $e->textContent) || '' === $e->textContent) { |
|
|
|
return ''; |
|
|
|
return ''; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
@ -679,8 +679,7 @@ class Readability implements LoggerAwareInterface |
|
|
|
$this->logger->debug(' more than 3 embeds'); |
|
|
|
$this->logger->debug(' more than 3 embeds'); |
|
|
|
$toRemove = true; |
|
|
|
$toRemove = true; |
|
|
|
} |
|
|
|
} |
|
|
|
} else { |
|
|
|
} elseif ($img > $p) { |
|
|
|
if ($img > $p) { |
|
|
|
|
|
|
|
$this->logger->debug(' more image elements than paragraph elements'); |
|
|
|
$this->logger->debug(' more image elements than paragraph elements'); |
|
|
|
$toRemove = true; |
|
|
|
$toRemove = true; |
|
|
|
} elseif (!$isList && $li > $p) { |
|
|
|
} elseif (!$isList && $li > $p) { |
|
|
|
@ -702,7 +701,6 @@ class Readability implements LoggerAwareInterface |
|
|
|
$this->logger->debug(' 1 embed and content length smaller than 75 chars, or more than one embed'); |
|
|
|
$this->logger->debug(' 1 embed and content length smaller than 75 chars, or more than one embed'); |
|
|
|
$toRemove = true; |
|
|
|
$toRemove = true; |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if ($toRemove) { |
|
|
|
if ($toRemove) { |
|
|
|
$this->logger->debug('Removing...'); |
|
|
|
$this->logger->debug('Removing...'); |
|
|
|
@ -741,7 +739,7 @@ class Readability implements LoggerAwareInterface |
|
|
|
*/ |
|
|
|
*/ |
|
|
|
public function addFlag(int $flag): void |
|
|
|
public function addFlag(int $flag): void |
|
|
|
{ |
|
|
|
{ |
|
|
|
$this->flags = $this->flags | $flag; |
|
|
|
$this->flags |= $flag; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
/** |
|
|
|
/** |
|
|
|
@ -749,7 +747,7 @@ class Readability implements LoggerAwareInterface |
|
|
|
*/ |
|
|
|
*/ |
|
|
|
public function removeFlag(int $flag): void |
|
|
|
public function removeFlag(int $flag): void |
|
|
|
{ |
|
|
|
{ |
|
|
|
$this->flags = $this->flags & ~$flag; |
|
|
|
$this->flags &= ~$flag; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
/** |
|
|
|
/** |
|
|
|
@ -829,7 +827,7 @@ class Readability implements LoggerAwareInterface |
|
|
|
*/ |
|
|
|
*/ |
|
|
|
protected function initializeNode(DOMElement $node): void |
|
|
|
protected function initializeNode(DOMElement $node): void |
|
|
|
{ |
|
|
|
{ |
|
|
|
if (!isset($node->tagName)) { |
|
|
|
if (!(property_exists($node, 'tagName') && null !== $node->tagName)) { |
|
|
|
return; |
|
|
|
return; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
@ -901,14 +899,14 @@ class Readability implements LoggerAwareInterface |
|
|
|
*/ |
|
|
|
*/ |
|
|
|
protected function grabArticle(DOMElement $page = null) |
|
|
|
protected function grabArticle(DOMElement $page = null) |
|
|
|
{ |
|
|
|
{ |
|
|
|
if (!$page) { |
|
|
|
if (null === $page) { |
|
|
|
$page = $this->dom; |
|
|
|
$page = $this->dom; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
$xpath = null; |
|
|
|
$xpath = null; |
|
|
|
$nodesToScore = []; |
|
|
|
$nodesToScore = []; |
|
|
|
|
|
|
|
|
|
|
|
if ($page instanceof \DOMDocument && isset($page->documentElement)) { |
|
|
|
if ($page instanceof \DOMDocument && (property_exists($page, 'documentElement') && null !== $page->documentElement)) { |
|
|
|
$xpath = new \DOMXPath($page); |
|
|
|
$xpath = new \DOMXPath($page); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
@ -1019,16 +1017,13 @@ class Readability implements LoggerAwareInterface |
|
|
|
* A score is determined by things like number of commas, class names, etc. |
|
|
|
* A score is determined by things like number of commas, class names, etc. |
|
|
|
* Maybe eventually link density. |
|
|
|
* Maybe eventually link density. |
|
|
|
*/ |
|
|
|
*/ |
|
|
|
for ($pt = 0, $scored = \count($nodesToScore); $pt < $scored; ++$pt) { |
|
|
|
foreach ($nodesToScore as $pt => $singleNodesToScore) { |
|
|
|
$ancestors = $this->getAncestors($nodesToScore[$pt], 5); |
|
|
|
$ancestors = $this->getAncestors($singleNodesToScore, 5); |
|
|
|
|
|
|
|
|
|
|
|
// No parent node? Move on... |
|
|
|
// No parent node? Move on... |
|
|
|
if (0 === \count($ancestors)) { |
|
|
|
if ([] === $ancestors) { |
|
|
|
continue; |
|
|
|
continue; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
$innerText = $this->getInnerText($singleNodesToScore); |
|
|
|
$innerText = $this->getInnerText($nodesToScore[$pt]); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// If this paragraph is less than MIN_PARAGRAPH_LENGTH (default:20) characters, don't even count it. |
|
|
|
// If this paragraph is less than MIN_PARAGRAPH_LENGTH (default:20) characters, don't even count it. |
|
|
|
if (mb_strlen($innerText) < self::MIN_PARAGRAPH_LENGTH) { |
|
|
|
if (mb_strlen($innerText) < self::MIN_PARAGRAPH_LENGTH) { |
|
|
|
continue; |
|
|
|
continue; |
|
|
|
@ -1074,7 +1069,7 @@ class Readability implements LoggerAwareInterface |
|
|
|
for ($c = $candidates->length - 1; $c >= 0; --$c) { |
|
|
|
for ($c = $candidates->length - 1; $c >= 0; --$c) { |
|
|
|
$node = $candidates->item($c); |
|
|
|
$node = $candidates->item($c); |
|
|
|
// node should be readable but not inside of an article otherwise it's probably non-readable block |
|
|
|
// node should be readable but not inside of an article otherwise it's probably non-readable block |
|
|
|
if ($node->hasAttribute('readability') && (int) $node->getAttributeNode('readability')->value < 40 && ($node->parentNode ? 0 !== strcasecmp($node->parentNode->tagName, 'article') : true)) { |
|
|
|
if ($node->hasAttribute('readability') && (int) $node->getAttributeNode('readability')->value < 40 && (null !== $node->parentNode ? 0 !== strcasecmp($node->parentNode->tagName, 'article') : true)) { |
|
|
|
$this->logger->debug('Removing unlikely candidate (using note) ' . $node->getNodePath() . ' by "' . $node->tagName . '" with readability ' . ($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0)); |
|
|
|
$this->logger->debug('Removing unlikely candidate (using note) ' . $node->getNodePath() . ' by "' . $node->tagName . '" with readability ' . ($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0)); |
|
|
|
$node->parentNode->removeChild($node); |
|
|
|
$node->parentNode->removeChild($node); |
|
|
|
} |
|
|
|
} |
|
|
|
@ -1093,7 +1088,7 @@ class Readability implements LoggerAwareInterface |
|
|
|
* and find the one with the highest score. |
|
|
|
* and find the one with the highest score. |
|
|
|
*/ |
|
|
|
*/ |
|
|
|
$topCandidates = array_fill(0, 5, null); |
|
|
|
$topCandidates = array_fill(0, 5, null); |
|
|
|
if ($xpath) { |
|
|
|
if (null !== $xpath) { |
|
|
|
// Using array of DOMElements after deletion is a path to DOOMElement. |
|
|
|
// Using array of DOMElements after deletion is a path to DOOMElement. |
|
|
|
$candidates = $xpath->query('.//*[@data-candidate]', $page->documentElement); |
|
|
|
$candidates = $xpath->query('.//*[@data-candidate]', $page->documentElement); |
|
|
|
$this->logger->debug('Candidates: ' . $candidates->length); |
|
|
|
$this->logger->debug('Candidates: ' . $candidates->length); |
|
|
|
@ -1135,7 +1130,7 @@ class Readability implements LoggerAwareInterface |
|
|
|
$topCandidate = $this->dom->createElement('div'); |
|
|
|
$topCandidate = $this->dom->createElement('div'); |
|
|
|
|
|
|
|
|
|
|
|
if ($page instanceof \DOMDocument) { |
|
|
|
if ($page instanceof \DOMDocument) { |
|
|
|
if (!isset($page->documentElement)) { |
|
|
|
if (!(property_exists($page, 'documentElement') && null !== $page->documentElement)) { |
|
|
|
// we don't have a body either? what a mess! :) |
|
|
|
// we don't have a body either? what a mess! :) |
|
|
|
$this->logger->debug('The page has no body!'); |
|
|
|
$this->logger->debug('The page has no body!'); |
|
|
|
} else { |
|
|
|
} else { |
|
|
|
@ -1165,7 +1160,8 @@ class Readability implements LoggerAwareInterface |
|
|
|
$parentOfTopCandidate = $topCandidate->parentNode; |
|
|
|
$parentOfTopCandidate = $topCandidate->parentNode; |
|
|
|
while ('body' !== $parentOfTopCandidate->nodeName) { |
|
|
|
while ('body' !== $parentOfTopCandidate->nodeName) { |
|
|
|
$listsContainingThisAncestor = 0; |
|
|
|
$listsContainingThisAncestor = 0; |
|
|
|
for ($ancestorIndex = 0; $ancestorIndex < \count($alternativeCandidateAncestors) && $listsContainingThisAncestor < 3; ++$ancestorIndex) { |
|
|
|
$alternativeCandidateAncestorsCount = \count($alternativeCandidateAncestors); |
|
|
|
|
|
|
|
for ($ancestorIndex = 0; $ancestorIndex < $alternativeCandidateAncestorsCount && $listsContainingThisAncestor < 3; ++$ancestorIndex) { |
|
|
|
$listsContainingThisAncestor += (int) \in_array($parentOfTopCandidate, $alternativeCandidateAncestors[$ancestorIndex], true); |
|
|
|
$listsContainingThisAncestor += (int) \in_array($parentOfTopCandidate, $alternativeCandidateAncestors[$ancestorIndex], true); |
|
|
|
} |
|
|
|
} |
|
|
|
if ($listsContainingThisAncestor >= 3) { |
|
|
|
if ($listsContainingThisAncestor >= 3) { |
|
|
|
@ -1374,7 +1370,7 @@ class Readability implements LoggerAwareInterface |
|
|
|
*/ |
|
|
|
*/ |
|
|
|
protected function reinitBody(): void |
|
|
|
protected function reinitBody(): void |
|
|
|
{ |
|
|
|
{ |
|
|
|
if (!isset($this->body->childNodes)) { |
|
|
|
if (!(property_exists($this->body, 'childNodes') && null !== $this->body->childNodes)) { |
|
|
|
$this->body = $this->dom->createElement('body'); |
|
|
|
$this->body = $this->dom->createElement('body'); |
|
|
|
$this->body->setInnerHtml($this->bodyCache); |
|
|
|
$this->body->setInnerHtml($this->bodyCache); |
|
|
|
} |
|
|
|
} |
|
|
|
@ -1482,7 +1478,7 @@ class Readability implements LoggerAwareInterface |
|
|
|
preg_match($this->regexps['hasContent'], $this->getInnerText($childNode)); |
|
|
|
preg_match($this->regexps['hasContent'], $this->getInnerText($childNode)); |
|
|
|
}); |
|
|
|
}); |
|
|
|
|
|
|
|
|
|
|
|
return 0 === \count($a); |
|
|
|
return [] === $a; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
/** |
|
|
|
/** |
|
|
|
|