From a44c4e548299a02ebb8461a17c95a6dac191f4e0 Mon Sep 17 00:00:00 2001 From: Kevin Decherf Date: Thu, 3 Feb 2022 02:46:14 +0100 Subject: [PATCH] Add routine to remove invisible nodes Readability previously removed (or rather tried to, see the next section) invisible nodes using a pattern from `unlikelyCandidates`. This was quite hacky and was removed during a backport of logic from mozilla/readability. There is still a need to remove them, so here we are. We still use a pattern, but it is now matched specifically against the style attribute. We also remove nodes with the `hidden` attribute. The clean feature of tidy actually replaces inline style attributes with CSS classes, thus preventing Readability from detecting invisible nodes, see https://github.com/htacg/tidy-html5/blob/5.6.0/src/clean.c#L1488 We therefore set the `clean` configuration option to false. Signed-off-by: Kevin Decherf --- src/Readability.php | 25 ++++++++++++++++++++++- tests/ReadabilityTest.php | 42 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 1 deletion(-) diff --git a/src/Readability.php b/src/Readability.php index b9d6241..c250f05 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -55,6 +55,7 @@ class Readability implements LoggerAwareInterface 'media' => '!//(?:[^\.\?/]+\.)?(?:youtu(?:be)?|giphy|soundcloud|dailymotion|vimeo|pornhub|xvideos|twitvid|rutube|openload\.co|viddler)\.(?:com|be|org|net)/!i', 'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i', 'hasContent' => '/\S$/', + 'isNotVisible' => '/display\s*:\s*none/', ]; public $defaultTagsToScore = ['section', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'td', 'pre']; // The commented out elements qualify as phrasing content but tend to be @@ -74,7 +75,7 @@ class Readability implements LoggerAwareInterface 'numeric-entities' => false, // 'preserve-entities' => true, 'break-before-br' => false, - 'clean' => true, + 'clean' => false, 'output-xhtml' => true, 'logical-emphasis' => true, 'show-body-only' => false, 
@@ -922,6 +923,14 @@ class Readability implements LoggerAwareInterface continue; } + // Remove invisible nodes + if (!$this->isNodeVisible($node)) { + $this->logger->debug('Removing invisible node ' . $node->getNodePath()); + $node->parentNode->removeChild($node); + --$nodeIndex; + continue; + } + // Remove unlikely candidates $unlikelyMatchString = $node->getAttribute('class') . ' ' . $node->getAttribute('id') . ' ' . $node->getAttribute('style'); @@ -1474,4 +1483,18 @@ class Readability implements LoggerAwareInterface return 0 === \count($a); } + + /** + * Return whether a given node is visible or not. + * + * Tidy must be configured to not clean the input for this function to + * work as expected, see $this->tidy_config['clean'] + */ + private function isNodeVisible(\DOMElement $node): bool + { + return !($node->hasAttribute('style') + && preg_match($this->regexps['isNotVisible'], $node->getAttribute('style')) + ) + && !$node->hasAttribute('hidden'); + } } diff --git a/tests/ReadabilityTest.php b/tests/ReadabilityTest.php index fb0bed8..04ac99c 100644 --- a/tests/ReadabilityTest.php +++ b/tests/ReadabilityTest.php @@ -499,6 +499,48 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase $this->assertStringContainsString('Down the Rabbit-Hole', $readability->getContent()->getInnerHtml()); } + public function dataForVisibleNode(): array + { + return [ + 'visible node' => [ + '
' . str_repeat('

This is the awesome and WONDERFUL content :)

', 7) . '
', + true, + ], + 'display=none' => [ + '
' . str_repeat('

This is the awesome and WONDERFUL content :)

', 7) . '
', + false, + ], + 'display=inline' => [ + '
' . str_repeat('

This is the awesome and WONDERFUL content :)

', 7) . '
', + true, + ], + 'hidden attribute' => [ + '', + false, + ], + 'missing display' => [ + '
' . str_repeat('

This is the awesome and WONDERFUL content :)

', 7) . '
', + true, + ], + ]; + } + + /** + * @dataProvider dataForVisibleNode + */ + public function testVisibleNode(string $content, bool $shouldBeVisible): void + { + $readability = $this->getReadability($content, 'http://0.0.0.0'); + $readability->debug = true; + $res = $readability->init(); + + if ($shouldBeVisible) { + $this->assertStringContainsString('WONDERFUL content', $readability->getContent()->getInnerHtml()); + } else { + $this->assertStringNotContainsString('WONDERFUL content', $readability->getContent()->getInnerHtml()); + } + } + private function getReadability(string $html, string $url = null, string $parser = 'libxml', bool $useTidy = true): Readability { $readability = new Readability($html, $url, $parser, $useTidy);