@ -423,6 +423,7 @@ class Readability implements LoggerAwareInterface
}
}
// Remove service data-candidate attribute.
// Remove service data-candidate attribute.
/** @var \DOMNodeList< \DOMElement> */
$elems = $xpath->query('.//*[@data-candidate]', $articleContent);
$elems = $xpath->query('.//*[@data-candidate]', $articleContent);
for ($i = $elems->length - 1; $i >= 0; --$i) {
for ($i = $elems->length - 1; $i >= 0; --$i) {
$elems->item($i)->removeAttribute('data-candidate');
$elems->item($i)->removeAttribute('data-candidate');
@ -1102,12 +1103,13 @@ class Readability implements LoggerAwareInterface
* This is faster to do before scoring but safer after.
* This is faster to do before scoring but safer after.
*/
*/
if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS) & & $xpath) {
if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS) & & $xpath) {
/** @var \DOMNodeList< \DOMElement> */
$candidates = $xpath->query('.//*[(self::footer and count(//footer)< 2 ) or ( self::aside and count ( / / aside ) < 2 ) ] ' , $ page- > documentElement);
$candidates = $xpath->query('.//*[(self::footer and count(//footer)< 2 ) or ( self::aside and count ( / / aside ) < 2 ) ] ' , $ page- > documentElement);
for ($c = $candidates->length - 1; $c >= 0; --$c) {
for ($c = $candidates->length - 1; $c >= 0; --$c) {
$node = $candidates->item($c);
$node = $candidates->item($c);
// The node should be readable, but not inside an article; otherwise it is probably a non-readable block.
// The node should be readable, but not inside an article; otherwise it is probably a non-readable block.
if ($node->hasAttribute('readability') & & (int) $node->getAttributeNode('readability')->value < 40 & & ( $ node- > parentNode ? 0 !== strcasecmp($node->parentNode->tagName, 'article') : true)) {
if ($node->hasAttribute('readability') & & (int) $node->getAttributeNode('readability')->value < 40 & & ( $ node- > parentNode instanceof \DOMElement ? 0 !== strcasecmp($node->parentNode->tagName, 'article') : true)) {
$this->logger->debug('Removing unlikely candidate (using note) ' . $node->getNodePath() . ' by "' . $node->tagName . '" with readability ' . self::getContentScore($node));
$this->logger->debug('Removing unlikely candidate (using note) ' . $node->getNodePath() . ' by "' . $node->tagName . '" with readability ' . self::getContentScore($node));
$node->parentNode->removeChild($node);
$node->parentNode->removeChild($node);
}
}
@ -1128,6 +1130,7 @@ class Readability implements LoggerAwareInterface
$topCandidates = array_fill(0, 5, null);
$topCandidates = array_fill(0, 5, null);
if ($xpath) {
if ($xpath) {
// Using array of DOMElements after deletion is a path to DOOMElement.
// Using array of DOMElements after deletion is a path to DOOMElement.
/** @var \DOMNodeList< \DOMElement> */
$candidates = $xpath->query('.//*[@data-candidate]', $page->documentElement);
$candidates = $xpath->query('.//*[@data-candidate]', $page->documentElement);
$this->logger->debug('Candidates: ' . $candidates->length);
$this->logger->debug('Candidates: ' . $candidates->length);
@ -1154,6 +1157,7 @@ class Readability implements LoggerAwareInterface
}
}
}
}
/** @var array<int, \DOMElement|null> */
$topCandidates = array_filter(
$topCandidates = array_filter(
$topCandidates,
$topCandidates,
fn ($v, $idx) => 0 === $idx || null !== $v,
fn ($v, $idx) => 0 === $idx || null !== $v,
@ -1276,7 +1280,7 @@ class Readability implements LoggerAwareInterface
$siblingNode = $siblingNodes->item($s);
$siblingNode = $siblingNodes->item($s);
$siblingNodeName = $siblingNode->nodeName;
$siblingNodeName = $siblingNode->nodeName;
$append = false;
$append = false;
$this->logger->debug('Looking at sibling node: ' . $siblingNode->getNodePath() . ((\XML_ELEMENT_NODE === $siblingNode->nodeType & & $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : ''));
$this->logger->debug('Looking at sibling node: ' . $siblingNode->getNodePath() . (($siblingNode instanceof \DOMElement & & $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : ''));
if ($siblingNode->isSameNode($topCandidate)) {
if ($siblingNode->isSameNode($topCandidate)) {
$append = true;
$append = true;
@ -1284,11 +1288,11 @@ class Readability implements LoggerAwareInterface
$contentBonus = 0;
$contentBonus = 0;
// Give a bonus if sibling nodes and top candidates have the same classname.
// Give a bonus if sibling nodes and top candidates have the same classname.
if (\XML_ELEMENT_NODE === $siblingNode->nodeType & & $siblingNode->getAttribute('class') === $topCandidate->getAttribute('class') & & '' !== $topCandidate->getAttribute('class')) {
if ($siblingNode instanceof \DOMElement & & $siblingNode->getAttribute('class') === $topCandidate->getAttribute('class') & & '' !== $topCandidate->getAttribute('class')) {
$contentBonus += ((int) $topCandidate->getAttribute('readability')) * 0.2;
$contentBonus += ((int) $topCandidate->getAttribute('readability')) * 0.2;
}
}
if (\XML_ELEMENT_NODE === $siblingNode->nodeType & & $siblingNode->hasAttribute('readability') & & (((int) $siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold) {
if ($siblingNode instanceof \DOMElement & & $siblingNode->hasAttribute('readability') & & (((int) $siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold) {
$append = true;
$append = true;
} elseif (0 === strcasecmp($siblingNodeName, 'p')) {
} elseif (0 === strcasecmp($siblingNodeName, 'p')) {
$linkDensity = (int) $this->getLinkDensity($siblingNode);
$linkDensity = (int) $this->getLinkDensity($siblingNode);
@ -1518,7 +1522,7 @@ class Readability implements LoggerAwareInterface
private function isPhrasingContent($node): bool
private function isPhrasingContent($node): bool
{
{
return \XML_TEXT_NODE === $node->nodeType
return $node instanceof \DOMText
|| \in_array(strtoupper($node->nodeName), $this->phrasingElements, true)
|| \in_array(strtoupper($node->nodeName), $this->phrasingElements, true)
|| (
|| (
\in_array(strtoupper($node->nodeName), ['A', 'DEL', 'INS'], true)
\in_array(strtoupper($node->nodeName), ['A', 'DEL', 'INS'], true)