From 209c404d7b651ec292541debec20234967e0bd77 Mon Sep 17 00:00:00 2001 From: Jeremy Benoist Date: Mon, 29 Feb 2016 11:45:48 +0100 Subject: [PATCH] Fix instanceof DOMElement We previously checked `instanceof DOMElement` which was wrong since we are in the namespace class, the class `Readability\DOMElement` does not exists. --- src/Readability.php | 176 ++++++++++++++++++++++---------------- tests/ReadabilityTest.php | 10 +-- 2 files changed, 101 insertions(+), 85 deletions(-) diff --git a/src/Readability.php b/src/Readability.php index 644cf62..d658a97 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -53,16 +53,23 @@ class Readability public $articleContent; public $original_html; public $dom; - public $url = null; // optional - URL where HTML was retrieved - public $lightClean = true; // preserves more content (experimental) + // optional - URL where HTML was retrieved + public $url = null; + // preserves more content (experimental) + public $lightClean = true; public $debug = false; public $tidied = false; - protected $debugText = ''; // error text for one time output - protected $domainRegExp = null; // article domain regexp for calibration + // error text for one time output + protected $debugText = ''; + // article domain regexp for calibration + protected $domainRegExp = null; protected $body = null; // - protected $bodyCache = null; // Cache the body HTML in case we need to re-use it later - protected $flags = 7; // 1 | 2 | 4; // Start with all processing flags set. - protected $success = false; // indicates whether we were able to extract or not + // Cache the body HTML in case we need to re-use it later + protected $bodyCache = null; + // 1 | 2 | 4; // Start with all processing flags set. + protected $flags = 7; + // indicates whether we were able to extract or not + protected $success = false; /** * All of the regular expressions in use within readability. @@ -105,21 +112,33 @@ class Readability ); // raw HTML filters protected $pre_filters = array( - '!]*>(.*?)!is' => '', // remove obvious scripts - '!]*>(.*?)!is' => '', // remove obvious styles - '!]*>!is' => '', // remove spans as we redefine styles and they're probably special-styled - '!]*>\s*\[AD\]\s*!is' => '', // HACK: firewall-filtered content - '!(]*>[ \r\n\s]*){2,}!i' => '

', // HACK: replace linebreaks plus br's with p's - //'!!is' => '', // replace noscripts - '!<(/?)font[^>]*>!is' => '<\\1span>', // replace fonts to spans + // remove obvious scripts + '!]*>(.*?)!is' => '', + // remove obvious styles + '!]*>(.*?)!is' => '', + // remove spans as we redefine styles and they're probably special-styled + '!]*>!is' => '', + // HACK: firewall-filtered content + '!]*>\s*\[AD\]\s*!is' => '', + // HACK: replace linebreaks plus br's with p's + '!(]*>[ \r\n\s]*){2,}!i' => '

', + // replace noscripts + //'!!is' => '', + // replace fonts to spans + '!<(/?)font[^>]*>!is' => '<\\1span>', ); // output HTML filters protected $post_filters = array( - '/\s*

']+/>!is' => '', // replace empty tags that break layouts - //'!<(\s*/?\s*(?:blockquote|br|hr|code|div|article|span|footer|aside|p|pre|dl|li|ul|ol)) [^>]+>!is' => "<\\1>", // remove all attributes on text tags - "/\n+/" => "\n", //single newlines cleanup - '!]*>\s* '\s*

']+/>!is' => '', + // remove all attributes on text tags + //'!<(\s*/?\s*(?:blockquote|br|hr|code|div|article|span|footer|aside|p|pre|dl|li|ul|ol)) [^>]+>!is' => "<\\1>", + //single newlines cleanup + "/\n+/" => "\n", + // modern web... + '!]*>\s* '\s*!is' => '', '!<[hb]r>!is' => '<\\1 />', ); @@ -366,12 +385,12 @@ class Readability */ protected function getArticleTitle() { - $curTitle = ''; $origTitle = ''; try { $curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0)); - } catch (Exception $e) { + } catch (\Exception $e) { + $curTitle = ''; } if (preg_match('/ [\|\-] /', $curTitle)) { @@ -504,14 +523,10 @@ class Readability */ public function prepArticle(\DOMElement $articleContent) { - if ($this->lightClean) { - $this->dbg('Light clean enabled.'); - } else { - $this->dbg('Standard clean enabled.'); - } - + $this->dbg($this->lightClean ? 'Light clean enabled.' : 'Standard clean enabled.'); $this->cleanStyles($articleContent); $this->killBreaks($articleContent); + $xpath = new \DOMXPath($articleContent->ownerDocument); if ($this->revertForcedParagraphElements) { @@ -563,23 +578,25 @@ class Readability $articleParagraphs = $articleContent->getElementsByTagName('p'); for ($i = $articleParagraphs->length - 1; $i >= 0; --$i) { - $imgCount = $articleParagraphs->item($i)->getElementsByTagName('img')->length; - $embedCount = $articleParagraphs->item($i)->getElementsByTagName('embed')->length; - $objectCount = $articleParagraphs->item($i)->getElementsByTagName('object')->length; - $videoCount = $articleParagraphs->item($i)->getElementsByTagName('video')->length; - $audioCount = $articleParagraphs->item($i)->getElementsByTagName('audio')->length; - $iframeCount = $articleParagraphs->item($i)->getElementsByTagName('iframe')->length; - - if ($iframeCount === 0 && $imgCount === 0 && $embedCount === 0 && $objectCount === 0 && $videoCount === 0 && $audioCount === 0 && mb_strlen(preg_replace('/\s+/is', '', $this->getInnerText($articleParagraphs->item($i), false, false))) === 0) { - $articleParagraphs->item($i)->parentNode->removeChild($articleParagraphs->item($i)); + $item = $articleParagraphs->item($i); + + $imgCount = $item->getElementsByTagName('img')->length; + $embedCount = $item->getElementsByTagName('embed')->length; + $objectCount = $item->getElementsByTagName('object')->length; + $videoCount = $item->getElementsByTagName('video')->length; + $audioCount = $item->getElementsByTagName('audio')->length; + $iframeCount = $item->getElementsByTagName('iframe')->length; + + if ($iframeCount === 0 && $imgCount === 0 && $embedCount === 0 && $objectCount === 0 && $videoCount === 0 && $audioCount === 0 && mb_strlen(preg_replace('/\s+/is', '', $this->getInnerText($item, false, false))) === 0) { + $item->parentNode->removeChild($item); } // add extra text to iframe tag to avoid an auto-closing iframe and then break the html code if ($iframeCount) { - $iframe = $articleParagraphs->item($i)->getElementsByTagName('iframe'); + $iframe = $item->getElementsByTagName('iframe'); $iframe->item(0)->nodeValue = ' '; - $articleParagraphs->item($i)->parentNode->replaceChild($iframe->item(0), $articleParagraphs->item($i)); + $item->parentNode->replaceChild($iframe->item(0), $item); } } @@ -589,7 +606,7 @@ class Readability $articleContent->innerHTML = preg_replace($search, $replace, $articleContent->innerHTML); } unset($search, $replace); - } catch (Exception $e) { + } catch (\Exception $e) { $this->dbg('Cleaning output HTML failed. Ignoring: '.$e->getMessage()); } } @@ -626,10 +643,10 @@ class Readability case 'FIGURE': $readability->value += 3; break; -/* case 'SECTION': // often misused - $readability->value += 2; + case 'SECTION': + // often misused + // $readability->value += 2; break; -*/ case 'OL': case 'UL': case 'DL': @@ -665,12 +682,12 @@ class Readability } /** - * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is + * Using a variety of metrics (content score, classname, element types), find the content that is * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. * * @param \DOMElement $page * - * @return \DOMElement + * @return \DOMElement|bool */ protected function grabArticle(\DOMElement $page = null) { @@ -703,13 +720,11 @@ class Readability try { $newNode->innerHTML = $node->innerHTML; - // It's easier to debug using original attributes. - //$newNode->setAttribute('class', $node->getAttribute('class')); - //$newNode->setAttribute('id', $node->getAttribute('id')); - $node = $node->parentNode->replaceChild($newNode, $node); + + $node->parentNode->replaceChild($newNode, $node); --$nodeIndex; $nodesToScore[] = $newNode; - } catch (Exception $e) { + } catch (\Exception $e) { $this->dbg('Could not alter div/article to p, reverting back to div: '.$e->getMessage()); } } else { @@ -717,12 +732,15 @@ class Readability for ($i = 0, $il = $node->childNodes->length; $i < $il; ++$i) { $childNode = $node->childNodes->item($i); - if (is_object($childNode) && get_class($childNode) === 'DOMProcessingInstruction') { //executable tags (parentNode->removeChild($childNode); + continue; } - if ($childNode->nodeType == 3) { // XML_TEXT_NODE + // XML_TEXT_NODE + if ($childNode->nodeType == 3) { //$this->dbg('replacing text node with a P tag with the same content.'); $p = $this->dom->createElement('p'); $p->innerHTML = $childNode->nodeValue; @@ -743,12 +761,13 @@ class Readability */ for ($pt = 0, $scored = count($nodesToScore); $pt < $scored; ++$pt) { $parentNode = $nodesToScore[$pt]->parentNode; + // No parent node? Move on... if (!$parentNode) { continue; } - $grandParentNode = ($parentNode->parentNode instanceof DOMElement) ? $parentNode->parentNode : null; + $grandParentNode = ($parentNode->parentNode instanceof \DOMElement) ? $parentNode->parentNode : null; $innerText = $this->getInnerText($nodesToScore[$pt]); // If this paragraph is less than MIN_PARAGRAPH_LENGTH (default:20) characters, don't even count it. @@ -778,7 +797,7 @@ class Readability /* TEST: For every positive/negative parent tag, add/substract half point. Up to 3 points. *\/ $up = $nodesToScore[$pt]; $score = 0; - while ($up->parentNode instanceof DOMElement) { + while ($up->parentNode instanceof \DOMElement) { $up = $up->parentNode; if (preg_match($this->regexps['positive'], $up->getAttribute('class') . ' ' . $up->getAttribute('id'))) { $score += 0.5; @@ -802,8 +821,9 @@ class Readability */ if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS) && $xpath) { $candidates = $xpath->query('.//*[(self::footer and count(//footer)<2) or (self::aside and count(//aside)<2)]', $page->documentElement); + $node = null; - for ($node = null, $c = $candidates->length - 1; $c >= 0; --$c) { + for ($c = $candidates->length - 1; $c >= 0; --$c) { $node = $candidates->item($c); // node should be readable but not inside of an article otherwise it's probably non-readable block if ($node->hasAttribute('readability') && (int) $node->getAttributeNode('readability')->value < 40 && ($node->parentNode ? strcasecmp($node->parentNode->tagName, 'article') !== 0 : true)) { @@ -813,8 +833,9 @@ class Readability } $candidates = $xpath->query('.//*[not(self::body) and (@class or @id or @style) and ((number(@readability) < 40) or not(@readability))]', $page->documentElement); + $node = null; - for ($node = null, $c = $candidates->length - 1; $c >= 0; --$c) { + for ($c = $candidates->length - 1; $c >= 0; --$c) { $node = $candidates->item($c); // Remove unlikely candidates @@ -842,15 +863,17 @@ class Readability $candidates = $xpath->query('.//*[@data-candidate]', $page->documentElement); for ($c = $candidates->length - 1; $c >= 0; --$c) { + $item = $candidates->item($c); + // Scale the final candidates score based on link density. Good content should have a // relatively small link density (5% or less) and be mostly unaffected by this operation. // If not for this we would have used XPath to find maximum @readability. - $readability = $candidates->item($c)->getAttributeNode('readability'); - $readability->value = round($readability->value * (1 - $this->getLinkDensity($candidates->item($c))), 0, PHP_ROUND_HALF_UP); + $readability = $item->getAttributeNode('readability'); + $readability->value = round($readability->value * (1 - $this->getLinkDensity($item)), 0, PHP_ROUND_HALF_UP); if (!$topCandidate || $readability->value > (int) $topCandidate->getAttribute('readability')) { - $this->dbg('Candidate: '.$candidates->item($c)->getNodePath().' ('.$candidates->item($c)->getAttribute('class').':'.$candidates->item($c)->getAttribute('id').') with score '.$readability->value); - $topCandidate = $candidates->item($c); + $this->dbg('Candidate: '.$item->getNodePath().' ('.$item->getAttribute('class').':'.$item->getAttribute('id').') with score '.$readability->value); + $topCandidate = $item; } } @@ -889,7 +912,7 @@ class Readability if (strcasecmp($tagName, 'td') === 0 || strcasecmp($tagName, 'tr') === 0) { $up = $topCandidate; - if ($up->parentNode instanceof DOMElement) { + if ($up->parentNode instanceof \DOMElement) { $up = $up->parentNode; if (strcasecmp($up->tagName, 'table') === 0) { @@ -949,7 +972,6 @@ class Readability if ($append) { $this->dbg('Appending node: '.$siblingNode->getNodePath()); - $nodeToAppend = null; if (strcasecmp($siblingNodeName, 'div') !== 0 && strcasecmp($siblingNodeName, 'p') !== 0) { // We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. @@ -959,7 +981,7 @@ class Readability try { $nodeToAppend->setAttribute('alt', $siblingNodeName); $nodeToAppend->innerHTML = $siblingNode->innerHTML; - } catch (Exception $e) { + } catch (\Exception $e) { $this->dbg('Could not alter siblingNode "'.$siblingNodeName.'" to "div", reverting to original.'); $nodeToAppend = $siblingNode; --$s; @@ -1133,19 +1155,20 @@ class Readability } $weight = 0; - //$attribute_val = trim($element->getAttribute('class')." ".$element->getAttribute('id')); - $attribute_val = trim($element->getAttribute($attribute)); - if ($attribute_val != '') { - if (preg_match($this->regexps['negative'], $attribute_val)) { + // $attributeValue = trim($element->getAttribute('class')." ".$element->getAttribute('id')); + $attributeValue = trim($element->getAttribute($attribute)); + + if ($attributeValue != '') { + if (preg_match($this->regexps['negative'], $attributeValue)) { $weight -= 25; } - if (preg_match($this->regexps['positive'], $attribute_val)) { + if (preg_match($this->regexps['positive'], $attributeValue)) { $weight += 25; } - if (preg_match($this->regexps['unlikelyCandidates'], $attribute_val)) { + if (preg_match($this->regexps['unlikelyCandidates'], $attributeValue)) { $weight -= 5; } - if (preg_match($this->regexps['okMaybeItsACandidate'], $attribute_val)) { + if (preg_match($this->regexps['okMaybeItsACandidate'], $attributeValue)) { $weight += 5; } } @@ -1198,15 +1221,16 @@ class Readability */ public function clean(\DOMElement $e, $tag) { + $currentItem = null; $targetList = $e->getElementsByTagName($tag); $isEmbed = ($tag === 'audio' || $tag === 'video' || $tag === 'iframe' || $tag === 'object' || $tag === 'embed'); - for ($cur_item = null, $y = $targetList->length - 1; $y >= 0; --$y) { + for ($y = $targetList->length - 1; $y >= 0; --$y) { // Allow youtube and vimeo videos through as people usually want to see those. - $cur_item = $targetList->item($y); + $currentItem = $targetList->item($y); if ($isEmbed) { - $attributeValues = $cur_item->getAttribute('src').' '.$cur_item->getAttribute('href'); + $attributeValues = $currentItem->getAttribute('src').' '.$currentItem->getAttribute('href'); // First, check the elements attributes to see if any of them contain known media hosts if (preg_match($this->regexps['media'], $attributeValues)) { @@ -1219,7 +1243,7 @@ class Readability } } - $cur_item->parentNode->removeChild($cur_item); + $currentItem->parentNode->removeChild($currentItem); } } @@ -1239,6 +1263,7 @@ class Readability $tagsList = $e->getElementsByTagName($tag); $curTagsLength = $tagsList->length; + $node = null; /* * Gather counts for other typical elements embedded within. @@ -1246,9 +1271,8 @@ class Readability * * TODO: Consider taking into account original contentScore here. */ - for ($node = null, $i = $curTagsLength - 1; $i >= 0; --$i) { + for ($i = $curTagsLength - 1; $i >= 0; --$i) { $node = $tagsList->item($i); - //$class = $node->getAttribute('class').' '.$node->getAttribute('id'); //debug $weight = $this->getWeight($node); $contentScore = ($node->hasAttribute('readability')) ? (int) $node->getAttribute('readability') : 0; $this->dbg('Start conditional cleaning of '.$node->getNodePath().' (class='.$node->getAttribute('class').'; id='.$node->getAttribute('id').')'.(($node->hasAttribute('readability')) ? (' with score '.$node->getAttribute('readability')) : '')); @@ -1332,7 +1356,6 @@ class Readability } if ($toRemove) { - //$this->dbg('Removing: '.$node->innerHTML); $this->dbg('Removing...'); $node->parentNode->removeChild($node); } @@ -1349,6 +1372,7 @@ class Readability { for ($headerIndex = 1; $headerIndex < 3; ++$headerIndex) { $headers = $e->getElementsByTagName('h'.$headerIndex); + for ($i = $headers->length - 1; $i >= 0; --$i) { if ($this->getWeight($headers->item($i)) < 0 || $this->getLinkDensity($headers->item($i)) > 0.33) { $headers->item($i)->parentNode->removeChild($headers->item($i)); diff --git a/tests/ReadabilityTest.php b/tests/ReadabilityTest.php index f1fd94a..e9a0ef3 100644 --- a/tests/ReadabilityTest.php +++ b/tests/ReadabilityTest.php @@ -194,11 +194,10 @@ class ReadabilityTest extends \PHPUnit_Framework_TestCase $this->assertTrue($res); $this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getContent()); $this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getTitle()); - $this->assertContains('alt="article"', $readability->getContent()->innerHTML); $this->assertEmpty($readability->getTitle()->innerHTML); $this->assertContains('This is an awesome text with some links, here there are', $readability->getContent()->innerHTML); $this->assertNotContains('