Merge pull request #64 from Kdecherf/improvements

pull/71/head
Jérémy Benoist 4 years ago committed by GitHub
commit ccf1b336c5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 333
      src/Readability.php
  2. 44
      tests/ReadabilityTest.php

@ -46,14 +46,27 @@ class Readability implements LoggerAwareInterface
* Defined up here so we don't instantiate them repeatedly in loops. * Defined up here so we don't instantiate them repeatedly in loops.
*/ */
public $regexps = [ public $regexps = [
'unlikelyCandidates' => '/display\s*:\s*none|ignore|\binfos?\b|annoy|clock|date|time|author|intro|hidd?e|about|archive|\bprint|bookmark|tags|tag-list|share|search|social|robot|published|combx|comment|mast(?:head)|subscri|community|category|disqus|extra|head|head(?:er|note)|floor|foot(?:er|note)|menu|tool\b|function|nav|remark|rss|shoutbox|widget|meta|banner|sponsor|adsense|inner-?ad|ad-|sponsor|\badv\b|\bads\b|agr?egate?|pager|sidebar|popup|tweet|twitter/i', 'unlikelyCandidates' => '/-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i',
'okMaybeItsACandidate' => '/article\b|contain|\bcontent|column|general|detail|shadow|lightbox|blog|body|entry|main|page|footnote/i', 'okMaybeItsACandidate' => '/article\b|contain|\bcontent|column|general|detail|shadow|lightbox|blog|body|entry|main|page|footnote|element/i',
'positive' => '/read|full|article|body|\bcontent|contain|entry|main|markdown|media|page|attach|pagination|post|text|blog|story/i', 'positive' => '/read|full|article|body|\bcontent|contain|entry|main|markdown|media|page|attach|pagination|post|text|blog|story/i',
'negative' => '/bottom|stat|info|discuss|e[\-]?mail|comment|reply|log.{2}(n|ed)|sign|single|combx|com-|contact|_nav|link|media|promo|\bad-|related|scroll|shoutbox|sidebar|sponsor|shopping|teaser|recommend/i', 'negative' => '/bottom|stat|info|discuss|e[\-]?mail|comment|reply|log.{2}(n|ed)|sign|single|combx|com-|contact|_nav|link|media|promo|\bad-|related|scroll|shoutbox|sidebar|sponsor|shopping|teaser|recommend/i',
'divToPElements' => '/<(?:blockquote|header|section|code|div|article|footer|aside|img|p|pre|dl|ol|ul)/mi', 'divToPElements' => '/<(?:blockquote|header|section|code|div|article|footer|aside|img|p|pre|dl|ol|ul)/mi',
'killBreaks' => '/(<br\s*\/?>([ \r\n\s]|&nbsp;?)*)+/', 'killBreaks' => '/(<br\s*\/?>([ \r\n\s]|&nbsp;?)*)+/',
'media' => '!//(?:[^\.\?/]+\.)?(?:youtu(?:be)?|giphy|soundcloud|dailymotion|vimeo|pornhub|xvideos|twitvid|rutube|openload\.co|viddler)\.(?:com|be|org|net)/!i', 'media' => '!//(?:[^\.\?/]+\.)?(?:youtu(?:be)?|giphy|soundcloud|dailymotion|vimeo|pornhub|xvideos|twitvid|rutube|openload\.co|viddler)\.(?:com|be|org|net)/!i',
'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i', 'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i',
'hasContent' => '/\S$/',
'isNotVisible' => '/display\s*:\s*none/',
];
public $defaultTagsToScore = ['section', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'td', 'pre'];
// The commented out elements qualify as phrasing content but tend to be
// removed by readability when put into paragraphs, so we ignore them here.
public $phrasingElements = [
// "CANVAS", "IFRAME", "SVG", "VIDEO",
'ABBR', 'AUDIO', 'B', 'BDO', 'BR', 'BUTTON', 'CITE', 'CODE', 'DATA',
'DATALIST', 'DFN', 'EM', 'EMBED', 'I', 'IMG', 'INPUT', 'KBD', 'LABEL',
'MARK', 'MATH', 'METER', 'NOSCRIPT', 'OBJECT', 'OUTPUT', 'PROGRESS', 'Q',
'RUBY', 'SAMP', 'SCRIPT', 'SELECT', 'SMALL', 'SPAN', 'STRONG', 'SUB',
'SUP', 'TEXTAREA', 'TIME', 'VAR', 'WBR',
]; ];
public $tidy_config = [ public $tidy_config = [
'tidy-mark' => false, 'tidy-mark' => false,
@ -62,7 +75,7 @@ class Readability implements LoggerAwareInterface
'numeric-entities' => false, 'numeric-entities' => false,
// 'preserve-entities' => true, // 'preserve-entities' => true,
'break-before-br' => false, 'break-before-br' => false,
'clean' => true, 'clean' => false,
'output-xhtml' => true, 'output-xhtml' => true,
'logical-emphasis' => true, 'logical-emphasis' => true,
'show-body-only' => false, 'show-body-only' => false,
@ -485,7 +498,7 @@ class Readability implements LoggerAwareInterface
*/ */
public function getCommaCount(string $text): int public function getCommaCount(string $text): int
{ {
return substr_count($text, ','); return \count(explode(',', $text));
} }
/** /**
@ -609,6 +622,9 @@ class Readability implements LoggerAwareInterface
$contentScore = ($node->hasAttribute('readability')) ? (int) $node->getAttribute('readability') : 0; $contentScore = ($node->hasAttribute('readability')) ? (int) $node->getAttribute('readability') : 0;
$this->logger->debug('Start conditional cleaning of ' . $node->getNodePath() . ' (class=' . $node->getAttribute('class') . '; id=' . $node->getAttribute('id') . ')' . (($node->hasAttribute('readability')) ? (' with score ' . $node->getAttribute('readability')) : '')); $this->logger->debug('Start conditional cleaning of ' . $node->getNodePath() . ' (class=' . $node->getAttribute('class') . '; id=' . $node->getAttribute('id') . ')' . (($node->hasAttribute('readability')) ? (' with score ' . $node->getAttribute('readability')) : ''));
// XXX Incomplete implementation
$isList = \in_array($node->tagName, ['ul', 'ol'], true);
if ($weight + $contentScore < 0) { if ($weight + $contentScore < 0) {
$this->logger->debug('Removing...'); $this->logger->debug('Removing...');
$node->parentNode->removeChild($node); $node->parentNode->removeChild($node);
@ -643,16 +659,16 @@ class Readability implements LoggerAwareInterface
$toRemove = false; $toRemove = false;
if ($this->lightClean) { if ($this->lightClean) {
if ($li > $p && 'ul' !== $tag && 'ol' !== $tag) { if (!$isList && $li > $p) {
$this->logger->debug(' too many <li> elements, and parent is not <ul> or <ol>'); $this->logger->debug(' too many <li> elements, and parent is not <ul> or <ol>');
$toRemove = true; $toRemove = true;
} elseif ($input > floor($p / 3)) { } elseif ($input > floor($p / 3)) {
$this->logger->debug(' too many <input> elements'); $this->logger->debug(' too many <input> elements');
$toRemove = true; $toRemove = true;
} elseif ($contentLength < 6 && (0 === $embedCount && (0 === $img || $img > 2))) { } elseif (!$isList && $contentLength < 6 && (0 === $embedCount && (0 === $img || $img > 2))) {
$this->logger->debug(' content length less than 6 chars, 0 embeds and either 0 images or more than 2 images'); $this->logger->debug(' content length less than 6 chars, 0 embeds and either 0 images or more than 2 images');
$toRemove = true; $toRemove = true;
} elseif ($weight < 25 && $linkDensity > 0.25) { } elseif (!$isList && $weight < 25 && $linkDensity > 0.25) {
$this->logger->debug(' weight is ' . $weight . ' < 25 and link density is ' . sprintf('%.2f', $linkDensity) . ' > 0.25'); $this->logger->debug(' weight is ' . $weight . ' < 25 and link density is ' . sprintf('%.2f', $linkDensity) . ' > 0.25');
$toRemove = true; $toRemove = true;
} elseif ($a > 2 && ($weight >= 25 && $linkDensity > 0.5)) { } elseif ($a > 2 && ($weight >= 25 && $linkDensity > 0.5)) {
@ -666,16 +682,16 @@ class Readability implements LoggerAwareInterface
if ($img > $p) { if ($img > $p) {
$this->logger->debug(' more image elements than paragraph elements'); $this->logger->debug(' more image elements than paragraph elements');
$toRemove = true; $toRemove = true;
} elseif ($li > $p && 'ul' !== $tag && 'ol' !== $tag) { } elseif (!$isList && $li > $p) {
$this->logger->debug(' too many <li> elements, and parent is not <ul> or <ol>'); $this->logger->debug(' too many <li> elements, and parent is not <ul> or <ol>');
$toRemove = true; $toRemove = true;
} elseif ($input > floor($p / 3)) { } elseif ($input > floor($p / 3)) {
$this->logger->debug(' too many <input> elements'); $this->logger->debug(' too many <input> elements');
$toRemove = true; $toRemove = true;
} elseif ($contentLength < 10 && (0 === $img || $img > 2)) { } elseif (!$isList && $contentLength < 10 && (0 === $img || $img > 2)) {
$this->logger->debug(' content length less than 10 chars and 0 images, or more than 2 images'); $this->logger->debug(' content length less than 10 chars and 0 images, or more than 2 images');
$toRemove = true; $toRemove = true;
} elseif ($weight < 25 && $linkDensity > 0.2) { } elseif (!$isList && $weight < 25 && $linkDensity > 0.2) {
$this->logger->debug(' weight is ' . $weight . ' lower than 0 and link density is ' . sprintf('%.2f', $linkDensity) . ' > 0.2'); $this->logger->debug(' weight is ' . $weight . ' lower than 0 and link density is ' . sprintf('%.2f', $linkDensity) . ' > 0.2');
$toRemove = true; $toRemove = true;
} elseif ($weight >= 25 && $linkDensity > 0.5) { } elseif ($weight >= 25 && $linkDensity > 0.5) {
@ -846,7 +862,7 @@ class Readability implements LoggerAwareInterface
case 'DD': case 'DD':
case 'DT': case 'DT':
case 'LI': case 'LI':
$readability->value -= 2 * round($this->getLinkDensity($node), 0, \PHP_ROUND_HALF_UP); $readability->value -= 3;
break; break;
case 'ASIDE': case 'ASIDE':
case 'FOOTER': case 'FOOTER':
@ -907,14 +923,35 @@ class Readability implements LoggerAwareInterface
continue; continue;
} }
// Remove invisible nodes
if (!$this->isNodeVisible($node)) {
$this->logger->debug('Removing invisible node ' . $node->getNodePath());
$node->parentNode->removeChild($node);
--$nodeIndex;
continue;
}
// Remove unlikely candidates
$unlikelyMatchString = $node->getAttribute('class') . ' ' . $node->getAttribute('id') . ' ' . $node->getAttribute('style');
if (mb_strlen($unlikelyMatchString) > 3 && // don't process "empty" strings
preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) &&
!preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString)
) {
$this->logger->debug('Removing unlikely candidate (using conf) ' . $node->getNodePath() . ' by "' . $unlikelyMatchString . '"');
$node->parentNode->removeChild($node);
--$nodeIndex;
continue;
}
// Some well known site uses sections as paragraphs. // Some well known site uses sections as paragraphs.
if (0 === strcasecmp($tagName, 'p') || 0 === strcasecmp($tagName, 'td') || 0 === strcasecmp($tagName, 'pre') || 0 === strcasecmp($tagName, 'section')) { if (\in_array($tagName, $this->defaultTagsToScore, true)) {
$nodesToScore[] = $node; $nodesToScore[] = $node;
} }
// Turn divs into P tags where they have been used inappropriately // Turn divs into P tags where they have been used inappropriately
// (as in, where they contain no other block level elements). // (as in, where they contain no other block level elements).
if (0 === strcasecmp($tagName, 'div') || 0 === strcasecmp($tagName, 'article') || 0 === strcasecmp($tagName, 'section')) { if ('div' === $tagName) {
if (!preg_match($this->regexps['divToPElements'], $nodeContent)) { if (!preg_match($this->regexps['divToPElements'], $nodeContent)) {
$newNode = $this->dom->createElement('p'); $newNode = $this->dom->createElement('p');
@ -929,28 +966,47 @@ class Readability implements LoggerAwareInterface
} }
} else { } else {
// Will change these P elements back to text nodes after processing. // Will change these P elements back to text nodes after processing.
for ($i = 0, $il = $node->childNodes->length; $i < $il; ++$i) { $p = null;
$childNode = $node->childNodes->item($i); // foreach does not handle removeChild very well
// See https://www.php.net/manual/en/domnode.removechild.php#90292
$childs = iterator_to_array($node->childNodes);
foreach ($childs as $childNode) {
// executable tags (<?php or <?xml) warning
if ($childNode instanceof \DOMProcessingInstruction) {
$childNode->parentNode->removeChild($childNode);
// it looks like sometimes the loop is going too far and we are retrieving a non-existant child
if (null === $childNode) {
continue; continue;
} }
// executable tags (<?php or <?xml) warning if ($childNode instanceof \DOMText && '' === $this->getInnerText($childNode, true, true)) {
if (\is_object($childNode) && 'DOMProcessingInstruction' === \get_class($childNode)) { /* $this->logger->debug('Remove empty text node'); */
$childNode->parentNode->removeChild($childNode); $childNode->parentNode->removeChild($childNode);
continue; continue;
} }
if (\XML_TEXT_NODE === $childNode->nodeType) { if ($this->isPhrasingContent($childNode)) {
$p = $this->dom->createElement('p'); if (null !== $p) {
$p->setInnerHtml($childNode->nodeValue); $p->appendChild($childNode);
$p->setAttribute('data-readability-styled', 'true'); } elseif ('' !== $this->getInnerText($childNode, true, true)) {
$childNode->parentNode->replaceChild($p, $childNode); $p = $this->dom->createElement('p');
$p->setInnerHtml($childNode->nodeValue);
$p->setAttribute('data-readability-styled', 'true');
$childNode->parentNode->replaceChild($p, $childNode);
}
} elseif (null !== $p) {
while ($p->lastChild && '' === $this->getInnerText($p->lastChild, true, true)) {
$p->removeChild($p->lastChild);
}
$p = null;
} }
} }
if ($this->hasSingleTagInsideElement($node, 'p') && $this->getLinkDensity($node) < 0.25) {
$newNode = $node->childNodes->item(0);
$node->parentNode->replaceChild($newNode, $node);
$nodesToScore[] = $newNode;
}
} }
} }
} }
@ -963,14 +1019,13 @@ class Readability implements LoggerAwareInterface
* Maybe eventually link density. * Maybe eventually link density.
*/ */
for ($pt = 0, $scored = \count($nodesToScore); $pt < $scored; ++$pt) { for ($pt = 0, $scored = \count($nodesToScore); $pt < $scored; ++$pt) {
$parentNode = $nodesToScore[$pt]->parentNode; $ancestors = $this->getAncestors($nodesToScore[$pt], 5);
// No parent node? Move on... // No parent node? Move on...
if (!$parentNode) { if (0 === \count($ancestors)) {
continue; continue;
} }
$grandParentNode = $parentNode->parentNode instanceof \DOMElement ? $parentNode->parentNode : null;
$innerText = $this->getInnerText($nodesToScore[$pt]); $innerText = $this->getInnerText($nodesToScore[$pt]);
// If this paragraph is less than MIN_PARAGRAPH_LENGTH (default:20) characters, don't even count it. // If this paragraph is less than MIN_PARAGRAPH_LENGTH (default:20) characters, don't even count it.
@ -978,17 +1033,6 @@ class Readability implements LoggerAwareInterface
continue; continue;
} }
// Initialize readability data for the parent.
if (!$parentNode->hasAttribute('readability')) {
$this->initializeNode($parentNode);
$parentNode->setAttribute('data-candidate', 'true');
}
// Initialize readability data for the grandparent.
if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName)) {
$this->initializeNode($grandParentNode);
$grandParentNode->setAttribute('data-candidate', 'true');
}
// Add a point for the paragraph itself as a base. // Add a point for the paragraph itself as a base.
$contentScore = 1; $contentScore = 1;
// Add points for any commas within this paragraph. // Add points for any commas within this paragraph.
@ -996,25 +1040,26 @@ class Readability implements LoggerAwareInterface
// For every SCORE_CHARS_IN_PARAGRAPH (default:100) characters in this paragraph, add another point. Up to 3 points. // For every SCORE_CHARS_IN_PARAGRAPH (default:100) characters in this paragraph, add another point. Up to 3 points.
$contentScore += min(floor(mb_strlen($innerText) / self::SCORE_CHARS_IN_PARAGRAPH), 3); $contentScore += min(floor(mb_strlen($innerText) / self::SCORE_CHARS_IN_PARAGRAPH), 3);
// For every SCORE_WORDS_IN_PARAGRAPH (default:20) words in this paragraph, add another point. Up to 3 points. // For every SCORE_WORDS_IN_PARAGRAPH (default:20) words in this paragraph, add another point. Up to 3 points.
$contentScore += min(floor($this->getWordCount($innerText) / self::SCORE_WORDS_IN_PARAGRAPH), 3); //$contentScore += min(floor($this->getWordCount($innerText) / self::SCORE_WORDS_IN_PARAGRAPH), 3);
/* TEST: For every positive/negative parent tag, add/substract half point. Up to 3 points. *\/
$up = $nodesToScore[$pt]; foreach ($ancestors as $level => $ancestor) {
$score = 0; if (!$ancestor->nodeName || !$ancestor->parentNode) {
while ($up->parentNode instanceof \DOMElement) { return;
$up = $up->parentNode;
if (preg_match($this->regexps['positive'], $up->getAttribute('class') . ' ' . $up->getAttribute('id'))) {
$score += 0.5;
} elseif (preg_match($this->regexps['negative'], $up->getAttribute('class') . ' ' . $up->getAttribute('id'))) {
$score -= 0.5;
} }
}
$score = floor($score);
$contentScore += max(min($score, 3), -3);/**/
// Add the score to the parent. The grandparent gets half. if (!$ancestor->hasAttribute('readability')) {
$parentNode->getAttributeNode('readability')->value = ((float) $parentNode->getAttributeNode('readability')->value) + $contentScore; $this->initializeNode($ancestor);
if ($grandParentNode) { $ancestor->setAttribute('data-candidate', 'true');
$grandParentNode->getAttributeNode('readability')->value += round($contentScore / self::GRANDPARENT_SCORE_DIVISOR); }
if (0 === $level) {
$scoreDivider = 1;
} elseif (1 === $level) {
$scoreDivider = 2;
} else {
$scoreDivider = $level * 3;
}
$ancestor->getAttributeNode('readability')->value += $contentScore / $scoreDivider;
} }
} }
@ -1038,18 +1083,6 @@ class Readability implements LoggerAwareInterface
for ($c = $candidates->length - 1; $c >= 0; --$c) { for ($c = $candidates->length - 1; $c >= 0; --$c) {
$node = $candidates->item($c); $node = $candidates->item($c);
// Remove unlikely candidates
$unlikelyMatchString = $node->getAttribute('class') . ' ' . $node->getAttribute('id') . ' ' . $node->getAttribute('style');
if (mb_strlen($unlikelyMatchString) > 3 && // don't process "empty" strings
preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) &&
!preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString)
) {
$this->logger->debug('Removing unlikely candidate (using conf) ' . $node->getNodePath() . ' by "' . $unlikelyMatchString . '" with readability ' . ($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0));
$node->parentNode->removeChild($node);
--$nodeIndex;
}
} }
unset($candidates); unset($candidates);
} }
@ -1058,10 +1091,11 @@ class Readability implements LoggerAwareInterface
* After we've calculated scores, loop through all of the possible candidate nodes we found * After we've calculated scores, loop through all of the possible candidate nodes we found
* and find the one with the highest score. * and find the one with the highest score.
*/ */
$topCandidate = null; $topCandidates = array_fill(0, 5, null);
if ($xpath) { if ($xpath) {
// Using array of DOMElements after deletion is a path to DOOMElement. // Using array of DOMElements after deletion is a path to DOOMElement.
$candidates = $xpath->query('.//*[@data-candidate]', $page->documentElement); $candidates = $xpath->query('.//*[@data-candidate]', $page->documentElement);
$this->logger->debug('Candidates: ' . $candidates->length);
for ($c = $candidates->length - 1; $c >= 0; --$c) { for ($c = $candidates->length - 1; $c >= 0; --$c) {
$item = $candidates->item($c); $item = $candidates->item($c);
@ -1072,15 +1106,26 @@ class Readability implements LoggerAwareInterface
$readability = $item->getAttributeNode('readability'); $readability = $item->getAttributeNode('readability');
$readability->value = round($readability->value * (1 - $this->getLinkDensity($item)), 0, \PHP_ROUND_HALF_UP); $readability->value = round($readability->value * (1 - $this->getLinkDensity($item)), 0, \PHP_ROUND_HALF_UP);
if (!$topCandidate || $readability->value > (int) $topCandidate->getAttribute('readability')) { for ($t = 0; $t < 5; ++$t) {
$this->logger->debug('Candidate: ' . $item->getNodePath() . ' (' . $item->getAttribute('class') . ':' . $item->getAttribute('id') . ') with score ' . $readability->value); $aTopCandidate = $topCandidates[$t];
$topCandidate = $item;
if (!$aTopCandidate || $readability->value > (int) $aTopCandidate->getAttribute('readability')) {
$this->logger->debug('Candidate: ' . $item->getNodePath() . ' (' . $item->getAttribute('class') . ':' . $item->getAttribute('id') . ') with score ' . $readability->value);
array_splice($topCandidates, $t, 0, [$item]);
if (\count($topCandidates) > 5) {
array_pop($topCandidates);
}
break;
}
} }
} }
unset($candidates);
} }
$topCandidates = array_filter($topCandidates, function ($v, $idx) {
return 0 === $idx || null !== $v;
}, \ARRAY_FILTER_USE_BOTH);
$topCandidate = $topCandidates[0];
/* /*
* If we still have no top candidate, just use the body as a last resort. * If we still have no top candidate, just use the body as a last resort.
* We also have to copy the body node so it is something we can modify. * We also have to copy the body node so it is something we can modify.
@ -1106,6 +1151,59 @@ class Readability implements LoggerAwareInterface
} }
$this->initializeNode($topCandidate); $this->initializeNode($topCandidate);
} elseif ($topCandidate) {
$alternativeCandidateAncestors = [];
foreach ($topCandidates as $candidate) {
if ((int) $candidate->getAttribute('readability') / (int) $topCandidate->getAttribute('readability') >= 0.75) {
$ancestors = $this->getAncestors($candidate);
$this->logger->debug('Adding ' . \count($ancestors) . ' alternative ancestors for ' . $candidate->getNodePath());
$alternativeCandidateAncestors[] = $ancestors;
}
}
if (\count($alternativeCandidateAncestors) >= 3) {
$parentOfTopCandidate = $topCandidate->parentNode;
while ('body' !== $parentOfTopCandidate->nodeName) {
$listsContainingThisAncestor = 0;
for ($ancestorIndex = 0; $ancestorIndex < \count($alternativeCandidateAncestors) && $listsContainingThisAncestor < 3; ++$ancestorIndex) {
$listsContainingThisAncestor += (int) \in_array($parentOfTopCandidate, $alternativeCandidateAncestors[$ancestorIndex], true);
}
if ($listsContainingThisAncestor >= 3) {
$topCandidate = $parentOfTopCandidate;
break;
}
$parentOfTopCandidate = $parentOfTopCandidate->parentNode;
}
}
if (!$topCandidate->hasAttribute('readability')) {
$this->initializeNode($topCandidate);
}
$parentOfTopCandidate = $topCandidate->parentNode;
$lastScore = (int) $topCandidate->getAttribute('readability');
$scoreThreshold = $lastScore / 3;
while ('body' !== $parentOfTopCandidate->nodeName) {
if (!$parentOfTopCandidate->hasAttribute('readability')) {
$parentOfTopCandidate = $parentOfTopCandidate->parentNode;
continue;
}
$parentScore = (int) $parentOfTopCandidate->getAttribute('readability');
if ($parentScore < $scoreThreshold) {
break;
}
if ($parentScore > $lastScore) {
$topCandidate = $parentOfTopCandidate;
break;
}
$lastScore = (int) $parentOfTopCandidate->getAttribute('readability');
$parentOfTopCandidate = $parentOfTopCandidate->parentNode;
}
$parentOfTopCandidate = $topCandidate->parentNode;
while ('body' !== $parentOfTopCandidate->nodeName && 1 === $parentOfTopCandidate->childNodes->length) {
$topCandidate = $parentOfTopCandidate;
$parentOfTopCandidate = $topCandidate->parentNode;
}
if (!$topCandidate->hasAttribute('readability')) {
$this->initializeNode($topCandidate);
}
} }
// Set table as the main node if resulted data is table element. // Set table as the main node if resulted data is table element.
@ -1131,7 +1229,8 @@ class Readability implements LoggerAwareInterface
$articleContent = $this->dom->createElement('div'); $articleContent = $this->dom->createElement('div');
$articleContent->setAttribute('class', 'readability-content'); $articleContent->setAttribute('class', 'readability-content');
$siblingScoreThreshold = max(10, ((int) $topCandidate->getAttribute('readability')) * 0.2); $siblingScoreThreshold = max(10, ((int) $topCandidate->getAttribute('readability')) * 0.2);
$siblingNodes = $topCandidate->parentNode->childNodes; $parentOfTopCandidate = $topCandidate->parentNode;
$siblingNodes = $parentOfTopCandidate->childNodes;
if (0 === $siblingNodes->length) { if (0 === $siblingNodes->length) {
$siblingNodes = new \stdClass(); $siblingNodes = new \stdClass();
@ -1146,27 +1245,25 @@ class Readability implements LoggerAwareInterface
if ($siblingNode->isSameNode($topCandidate)) { if ($siblingNode->isSameNode($topCandidate)) {
$append = true; $append = true;
} } else {
$contentBonus = 0;
$contentBonus = 0;
// Give a bonus if sibling nodes and top candidates have the same classname.
if (\XML_ELEMENT_NODE === $siblingNode->nodeType && $siblingNode->getAttribute('class') === $topCandidate->getAttribute('class') && '' !== $topCandidate->getAttribute('class')) {
$contentBonus += ((int) $topCandidate->getAttribute('readability')) * 0.2;
}
if (\XML_ELEMENT_NODE === $siblingNode->nodeType && $siblingNode->hasAttribute('readability') && (((int) $siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold) {
$append = true;
}
if (0 === strcasecmp($siblingNodeName, 'p')) { // Give a bonus if sibling nodes and top candidates have the same classname.
$linkDensity = $this->getLinkDensity($siblingNode); if (\XML_ELEMENT_NODE === $siblingNode->nodeType && $siblingNode->getAttribute('class') === $topCandidate->getAttribute('class') && '' !== $topCandidate->getAttribute('class')) {
$nodeContent = $this->getInnerText($siblingNode, true, true); $contentBonus += ((int) $topCandidate->getAttribute('readability')) * 0.2;
$nodeLength = mb_strlen($nodeContent); }
if (($nodeLength > self::MIN_NODE_LENGTH && $linkDensity < self::MAX_LINK_DENSITY) if (\XML_ELEMENT_NODE === $siblingNode->nodeType && $siblingNode->hasAttribute('readability') && (((int) $siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold) {
|| ($nodeLength < self::MIN_NODE_LENGTH && 0 === (int) $linkDensity && preg_match('/\.( |$)/', $nodeContent))) {
$append = true; $append = true;
} elseif (0 === strcasecmp($siblingNodeName, 'p')) {
$linkDensity = (int) $this->getLinkDensity($siblingNode);
$nodeContent = $this->getInnerText($siblingNode, true, true);
$nodeLength = mb_strlen($nodeContent);
if (($nodeLength > self::MIN_NODE_LENGTH && $linkDensity < self::MAX_LINK_DENSITY)
|| ($nodeLength < self::MIN_NODE_LENGTH && 0 === $nodeLength && 0 === $linkDensity && preg_match('/\.( |$)/', $nodeContent))) {
$append = true;
}
} }
} }
@ -1348,4 +1445,56 @@ class Readability implements LoggerAwareInterface
$this->dom->registerNodeClass('DOMElement', 'Readability\JSLikeHTMLElement'); $this->dom->registerNodeClass('DOMElement', 'Readability\JSLikeHTMLElement');
} }
private function getAncestors(\DOMElement $node, int $maxDepth = 0): array
{
$ancestors = [];
$i = 0;
while ($node->parentNode instanceof \DOMElement) {
$ancestors[] = $node->parentNode;
if (++$i === $maxDepth) {
break;
}
$node = $node->parentNode;
}
return $ancestors;
}
private function isPhrasingContent($node): bool
{
return \XML_TEXT_NODE === $node->nodeType
|| \in_array($node->nodeName, $this->phrasingElements, true)
|| (\in_array($node->nodeName, ['a', 'del', 'ins'], true) && !\in_array(false, array_map(function ($c) {
return $this->isPhrasingContent($c);
}, iterator_to_array($node->childNodes)), true));
}
private function hasSingleTagInsideElement(\DOMElement $node, string $tag): bool
{
if (1 !== $node->childNodes->length || $node->childNodes->item(0)->nodeName !== $tag) {
return false;
}
$a = array_filter(iterator_to_array($node->childNodes), function ($childNode) {
return $childNode instanceof \DOMText &&
preg_match($this->regexps['hasContent'], $this->getInnerText($childNode));
});
return 0 === \count($a);
}
/**
* Return whether a given node is visible or not.
*
* Tidy must be configured to not clean the input for this function to
* work as expected, see $this->tidy_config['clean']
*/
private function isNodeVisible(\DOMElement $node): bool
{
return !($node->hasAttribute('style')
&& preg_match($this->regexps['isNotVisible'], $node->getAttribute('style'))
)
&& !$node->hasAttribute('hidden');
}
} }

@ -201,7 +201,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertEmpty($readability->getTitle()->getInnerHtml()); $this->assertEmpty($readability->getTitle()->getInnerHtml());
$this->assertStringContainsString('This is an awesome text with some links, here there are', $readability->getContent()->getInnerHtml()); $this->assertStringContainsString('This is an awesome text with some links, here there are', $readability->getContent()->getInnerHtml());
$this->assertStringNotContainsString('<aside>', $readability->getContent()->getInnerHtml()); $this->assertStringNotContainsString('<aside>', $readability->getContent()->getInnerHtml());
$this->assertStringContainsString('<footer readability="5"/>', $readability->getContent()->getInnerHtml()); $this->assertStringContainsString('<footer readability="9"/>', $readability->getContent()->getInnerHtml());
} }
public function testWithClasses(): void public function testWithClasses(): void
@ -499,6 +499,48 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertStringContainsString('<a href="alice-I.html">Down the Rabbit-Hole</a>', $readability->getContent()->getInnerHtml()); $this->assertStringContainsString('<a href="alice-I.html">Down the Rabbit-Hole</a>', $readability->getContent()->getInnerHtml());
} }
public function dataForVisibleNode(): array
{
return [
'visible node' => [
'<div>' . str_repeat('<p>This <b>is</b> the awesome and WONDERFUL content :)</p>', 7) . '</div>',
true,
],
'display=none' => [
'<div style="display:none;">' . str_repeat('<p>This <b>is</b> the awesome and WONDERFUL content :)</p>', 7) . '</div>',
false,
],
'display=inline' => [
'<div style="display:inline;">' . str_repeat('<p>This <b>is</b> the awesome and WONDERFUL content :)</p>', 7) . '</div>',
true,
],
'hidden attribute' => [
'<div hidden>' . str_repeat('<p>This <b>is</b> the awesome and WONDERFUL content :)</p>', 7) . '</div>',
false,
],
'missing display' => [
'<div style="color:#ccc;">' . str_repeat('<p>This <b>is</b> the awesome and WONDERFUL content :)</p>', 7) . '</div>',
true,
],
];
}
/**
* @dataProvider dataForVisibleNode
*/
public function testVisibleNode(string $content, bool $shouldBeVisible): void
{
$readability = $this->getReadability($content, 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init();
if ($shouldBeVisible) {
$this->assertStringContainsString('WONDERFUL content', $readability->getContent()->getInnerHtml());
} else {
$this->assertStringNotContainsString('WONDERFUL content', $readability->getContent()->getInnerHtml());
}
}
private function getReadability(string $html, string $url = null, string $parser = 'libxml', bool $useTidy = true): Readability private function getReadability(string $html, string $url = null, string $parser = 'libxml', bool $useTidy = true): Readability
{ {
$readability = new Readability($html, $url, $parser, $useTidy); $readability = new Readability($html, $url, $parser, $useTidy);

Loading…
Cancel
Save