Use helpers for content score manipulation

`DOMAttr::$value` must be a `string`.

Let’s add helpers for manipulating the `readability` attribute
so that we do not have to keep casting it to and from `string`
in order to appease `strict_types`.
pull/93/head
Jan Tojnar 2 years ago
parent fae4e78845
commit 4f5360df90
  1. 59
      src/Readability.php

@ -618,7 +618,7 @@ class Readability implements LoggerAwareInterface
for ($i = $curTagsLength - 1; $i >= 0; --$i) { for ($i = $curTagsLength - 1; $i >= 0; --$i) {
$node = $tagsList->item($i); $node = $tagsList->item($i);
$weight = $this->getWeight($node); $weight = $this->getWeight($node);
$contentScore = ($node->hasAttribute('readability')) ? (int) $node->getAttribute('readability') : 0; $contentScore = self::getContentScore($node);
$this->logger->debug('Start conditional cleaning of ' . $node->getNodePath() . ' (class=' . $node->getAttribute('class') . '; id=' . $node->getAttribute('id') . ')' . (($node->hasAttribute('readability')) ? (' with score ' . $node->getAttribute('readability')) : '')); $this->logger->debug('Start conditional cleaning of ' . $node->getNodePath() . ' (class=' . $node->getAttribute('class') . '; id=' . $node->getAttribute('id') . ')' . (($node->hasAttribute('readability')) ? (' with score ' . $node->getAttribute('readability')) : ''));
// XXX Incomplete implementation // XXX Incomplete implementation
@ -834,29 +834,26 @@ class Readability implements LoggerAwareInterface
return; return;
} }
$readability = $this->dom->createAttribute('readability'); $contentScore = 0;
// this is our contentScore
$readability->value = 0;
$node->setAttributeNode($readability);
// using strtoupper just in case // using strtoupper just in case
switch (strtoupper($node->tagName)) { switch (strtoupper($node->tagName)) {
case 'ARTICLE': case 'ARTICLE':
$readability->value += 15; $contentScore += 15;
// no break // no break
case 'DIV': case 'DIV':
$readability->value += 5; $contentScore += 5;
break; break;
case 'PRE': case 'PRE':
case 'CODE': case 'CODE':
case 'TD': case 'TD':
case 'BLOCKQUOTE': case 'BLOCKQUOTE':
case 'FIGURE': case 'FIGURE':
$readability->value += 3; $contentScore += 3;
break; break;
case 'SECTION': case 'SECTION':
// often misused // often misused
// $readability->value += 2; // $contentScore += 2;
break; break;
case 'OL': case 'OL':
case 'UL': case 'UL':
@ -864,7 +861,7 @@ class Readability implements LoggerAwareInterface
case 'DD': case 'DD':
case 'DT': case 'DT':
case 'LI': case 'LI':
$readability->value -= 3; $contentScore -= 3;
break; break;
case 'ASIDE': case 'ASIDE':
case 'FOOTER': case 'FOOTER':
@ -875,7 +872,7 @@ class Readability implements LoggerAwareInterface
case 'TEXTAREA': case 'TEXTAREA':
case 'INPUT': case 'INPUT':
case 'NAV': case 'NAV':
$readability->value -= 3; $contentScore -= 3;
break; break;
case 'H1': case 'H1':
case 'H2': case 'H2':
@ -885,11 +882,15 @@ class Readability implements LoggerAwareInterface
case 'H6': case 'H6':
case 'TH': case 'TH':
case 'HGROUP': case 'HGROUP':
$readability->value -= 5; $contentScore -= 5;
break; break;
} }
$readability->value += $this->getWeight($node); $contentScore += $this->getWeight($node);
$readability = $this->dom->createAttribute('readability');
$readability->value = (string) $contentScore;
$node->setAttributeNode($readability);
} }
/** /**
@ -1059,7 +1060,8 @@ class Readability implements LoggerAwareInterface
} else { } else {
$scoreDivider = $level * 3; $scoreDivider = $level * 3;
} }
$ancestor->getAttributeNode('readability')->value += $contentScore / $scoreDivider;
self::updateContentScore($ancestor, fn ($prevScore) => $prevScore + $contentScore / $scoreDivider);
} }
} }
@ -1074,7 +1076,7 @@ class Readability implements LoggerAwareInterface
$node = $candidates->item($c); $node = $candidates->item($c);
// node should be readable but not inside of an article otherwise it's probably non-readable block // node should be readable but not inside of an article otherwise it's probably non-readable block
if ($node->hasAttribute('readability') && (int) $node->getAttributeNode('readability')->value < 40 && ($node->parentNode ? 0 !== strcasecmp($node->parentNode->tagName, 'article') : true)) { if ($node->hasAttribute('readability') && (int) $node->getAttributeNode('readability')->value < 40 && ($node->parentNode ? 0 !== strcasecmp($node->parentNode->tagName, 'article') : true)) {
$this->logger->debug('Removing unlikely candidate (using note) ' . $node->getNodePath() . ' by "' . $node->tagName . '" with readability ' . ($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0)); $this->logger->debug('Removing unlikely candidate (using note) ' . $node->getNodePath() . ' by "' . $node->tagName . '" with readability ' . self::getContentScore($node));
$node->parentNode->removeChild($node); $node->parentNode->removeChild($node);
} }
} }
@ -1098,14 +1100,13 @@ class Readability implements LoggerAwareInterface
// Scale the final candidates score based on link density. Good content should have a // Scale the final candidates score based on link density. Good content should have a
// relatively small link density (5% or less) and be mostly unaffected by this operation. // relatively small link density (5% or less) and be mostly unaffected by this operation.
// If not for this we would have used XPath to find maximum @readability. // If not for this we would have used XPath to find maximum @readability.
$readability = $item->getAttributeNode('readability'); self::updateContentScore($item, fn ($prevScore) => round($prevScore * (1 - $this->getLinkDensity($item)), 0, \PHP_ROUND_HALF_UP));
$readability->value = round($readability->value * (1 - $this->getLinkDensity($item)), 0, \PHP_ROUND_HALF_UP);
for ($t = 0; $t < 5; ++$t) { for ($t = 0; $t < 5; ++$t) {
$aTopCandidate = $topCandidates[$t]; $aTopCandidate = $topCandidates[$t];
if (!$aTopCandidate || $readability->value > (int) $aTopCandidate->getAttribute('readability')) { if (!$aTopCandidate || self::getContentScore($item) > self::getContentScore($aTopCandidate)) {
$this->logger->debug('Candidate: ' . $item->getNodePath() . ' (' . $item->getAttribute('class') . ':' . $item->getAttribute('id') . ') with score ' . $readability->value); $this->logger->debug('Candidate: ' . $item->getNodePath() . ' (' . $item->getAttribute('class') . ':' . $item->getAttribute('id') . ') with score ' . self::getContentScore($item));
array_splice($topCandidates, $t, 0, [$item]); array_splice($topCandidates, $t, 0, [$item]);
if (\count($topCandidates) > 5) { if (\count($topCandidates) > 5) {
array_pop($topCandidates); array_pop($topCandidates);
@ -1371,6 +1372,26 @@ class Readability implements LoggerAwareInterface
} }
} }
/**
 * Updates the content score for the given element using the provided function.
 *
 * The score is stored as the string value of the element's `readability`
 * attribute. The current value is cast to float, passed to `$f`, and the
 * result is cast back to string and written to the attribute.
 *
 * An element without a `readability` attribute is treated as having a
 * score of 0 (the same convention getContentScore() uses), and the
 * attribute is created with the computed value instead of failing on the
 * `false` that `getAttributeNode()` returns for a missing attribute.
 *
 * @param \DOMElement $element element whose score should be updated
 * @param callable(float): float $f receives the previous score and returns the new one
 */
private static function updateContentScore(\DOMElement $element, callable $f): void
{
    // getAttribute() yields '' for a missing attribute, which casts to 0.0.
    $prevScore = (float) $element->getAttribute('readability');

    // setAttribute() updates the existing attribute or creates it when absent;
    // the explicit string cast keeps this valid under strict_types.
    $element->setAttribute('readability', (string) $f($prevScore));
}
/**
 * Reads the content score stored on the given element.
 *
 * The score lives in the `readability` attribute as a string; it is cast
 * to float on the way out. Elements that carry no such attribute score 0.
 */
private static function getContentScore(\DOMElement $element): float
{
    if (!$element->hasAttribute('readability')) {
        return 0.0;
    }

    return (float) $element->getAttribute('readability');
}
/** /**
* Load HTML in a DOMDocument. * Load HTML in a DOMDocument.
* Apply Pre filters * Apply Pre filters

Loading…
Cancel
Save