Add routine to remove invisible nodes

Readability previously removed invisible nodes (or rather tried to, see
the next section) by matching a pattern from `unlikelyCandidates`. That
approach was quite hacky and was dropped during a backport of logic from
mozilla/readability. These nodes still need to be removed, so this commit
adds a dedicated routine. It still uses a pattern, but matches it
specifically against the `style` attribute. Nodes carrying the `hidden`
attribute are removed as well.

The `clean` feature of tidy replaces inline style attributes with CSS
classes, which prevents Readability from detecting invisible nodes;
see https://github.com/htacg/tidy-html5/blob/5.6.0/src/clean.c#L1488
We therefore set the `clean` configuration option to false.

Signed-off-by: Kevin Decherf <kevin@kdecherf.com>
pull/64/head
Kevin Decherf 4 years ago
parent b580cf216d
commit a44c4e5482
  1. 25
      src/Readability.php
  2. 42
      tests/ReadabilityTest.php

@ -55,6 +55,7 @@ class Readability implements LoggerAwareInterface
'media' => '!//(?:[^\.\?/]+\.)?(?:youtu(?:be)?|giphy|soundcloud|dailymotion|vimeo|pornhub|xvideos|twitvid|rutube|openload\.co|viddler)\.(?:com|be|org|net)/!i',
'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i',
'hasContent' => '/\S$/',
'isNotVisible' => '/display\s*:\s*none/',
];
public $defaultTagsToScore = ['section', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'td', 'pre'];
// The commented out elements qualify as phrasing content but tend to be
@ -74,7 +75,7 @@ class Readability implements LoggerAwareInterface
'numeric-entities' => false,
// 'preserve-entities' => true,
'break-before-br' => false,
'clean' => true,
'clean' => false,
'output-xhtml' => true,
'logical-emphasis' => true,
'show-body-only' => false,
@ -922,6 +923,14 @@ class Readability implements LoggerAwareInterface
continue;
}
// Remove invisible nodes
if (!$this->isNodeVisible($node)) {
$this->logger->debug('Removing invisible node ' . $node->getNodePath());
$node->parentNode->removeChild($node);
--$nodeIndex;
continue;
}
// Remove unlikely candidates
$unlikelyMatchString = $node->getAttribute('class') . ' ' . $node->getAttribute('id') . ' ' . $node->getAttribute('style');
@ -1474,4 +1483,18 @@ class Readability implements LoggerAwareInterface
return 0 === \count($a);
}
/**
 * Tell whether the given node should be treated as visible.
 *
 * A node is reported as not visible when its inline `style` attribute
 * matches $this->regexps['isNotVisible'], or when it carries a `hidden`
 * attribute.
 *
 * Tidy must be configured to not clean the input for this function to
 * work as expected, see $this->tidy_config['clean'].
 *
 * @param \DOMElement $node Element to inspect
 *
 * @return bool True when neither of the two "hidden" markers is found
 */
private function isNodeVisible(\DOMElement $node): bool
{
    // An inline style matching the "not visible" pattern hides the node.
    if ($node->hasAttribute('style')
        && preg_match($this->regexps['isNotVisible'], $node->getAttribute('style'))
    ) {
        return false;
    }

    // Otherwise visibility only depends on the absence of `hidden`.
    return !$node->hasAttribute('hidden');
}
}

@ -499,6 +499,48 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertStringContainsString('<a href="alice-I.html">Down the Rabbit-Hole</a>', $readability->getContent()->getInnerHtml());
}
/**
 * Data sets for testVisibleNode.
 *
 * Each named entry is a pair: the HTML content handed to Readability and
 * whether that content is expected to remain visible in the result.
 */
public function dataForVisibleNode(): array
{
    // Same seven paragraphs in every data set; only the wrapping <div> differs.
    $paragraphs = str_repeat('<p>This <b>is</b> the awesome and WONDERFUL content :)</p>', 7);

    $wrap = static function (string $openingTag) use ($paragraphs): string {
        return $openingTag . $paragraphs . '</div>';
    };

    return [
        'visible node' => [$wrap('<div>'), true],
        'display=none' => [$wrap('<div style="display:none;">'), false],
        'display=inline' => [$wrap('<div style="display:inline;">'), true],
        'hidden attribute' => [$wrap('<div hidden>'), false],
        'missing display' => [$wrap('<div style="color:#ccc;">'), true],
    ];
}
/**
 * Run Readability on the given content and assert that the marker text
 * "WONDERFUL content" is present in (or absent from) the extracted inner
 * HTML, depending on the expected visibility.
 *
 * @dataProvider dataForVisibleNode
 */
public function testVisibleNode(string $content, bool $shouldBeVisible): void
{
    $readability = $this->getReadability($content, 'http://0.0.0.0');
    $readability->debug = true;
    $readability->init();

    // Read the extracted markup once; both outcomes check the same marker.
    $innerHtml = $readability->getContent()->getInnerHtml();
    $marker = 'WONDERFUL content';

    if (!$shouldBeVisible) {
        $this->assertStringNotContainsString($marker, $innerHtml);

        return;
    }

    $this->assertStringContainsString($marker, $innerHtml);
}
private function getReadability(string $html, string $url = null, string $parser = 'libxml', bool $useTidy = true): Readability
{
$readability = new Readability($html, $url, $parser, $useTidy);

Loading…
Cancel
Save