Fix instanceof DOMElement

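We previously checked `instanceof DOMElement`, which was wrong: because the
code lives inside the `Readability` namespace, the unqualified name resolves
to `Readability\DOMElement`, a class that does not exist. The checks now use
the fully qualified `\DOMElement` instead.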
pull/13/head
Jeremy Benoist 10 years ago
parent 6e3f2e8c0b
commit 209c404d7b
  1. 174
      src/Readability.php
  2. 10
      tests/ReadabilityTest.php

@ -53,16 +53,23 @@ class Readability
public $articleContent;
public $original_html;
public $dom;
public $url = null; // optional - URL where HTML was retrieved
public $lightClean = true; // preserves more content (experimental)
// optional - URL where HTML was retrieved
public $url = null;
// preserves more content (experimental)
public $lightClean = true;
public $debug = false;
public $tidied = false;
protected $debugText = ''; // error text for one time output
protected $domainRegExp = null; // article domain regexp for calibration
// error text for one time output
protected $debugText = '';
// article domain regexp for calibration
protected $domainRegExp = null;
protected $body = null; //
protected $bodyCache = null; // Cache the body HTML in case we need to re-use it later
protected $flags = 7; // 1 | 2 | 4; // Start with all processing flags set.
protected $success = false; // indicates whether we were able to extract or not
// Cache the body HTML in case we need to re-use it later
protected $bodyCache = null;
// 1 | 2 | 4; // Start with all processing flags set.
protected $flags = 7;
// indicates whether we were able to extract or not
protected $success = false;
/**
* All of the regular expressions in use within readability.
@ -105,21 +112,33 @@ class Readability
);
// raw HTML filters
protected $pre_filters = array(
'!<script[^>]*>(.*?)</script>!is' => '', // remove obvious scripts
'!<style[^>]*>(.*?)</style>!is' => '', // remove obvious styles
'!</?span[^>]*>!is' => '', // remove spans as we redefine styles and they're probably special-styled
'!<font[^>]*>\s*\[AD\]\s*</font>!is' => '', // HACK: firewall-filtered content
'!(<br[^>]*>[ \r\n\s]*){2,}!i' => '</p><p>', // HACK: replace linebreaks plus br's with p's
//'!</?noscript>!is' => '', // replace noscripts
'!<(/?)font[^>]*>!is' => '<\\1span>', // replace fonts to spans
// remove obvious scripts
'!<script[^>]*>(.*?)</script>!is' => '',
// remove obvious styles
'!<style[^>]*>(.*?)</style>!is' => '',
// remove spans as we redefine styles and they're probably special-styled
'!</?span[^>]*>!is' => '',
// HACK: firewall-filtered content
'!<font[^>]*>\s*\[AD\]\s*</font>!is' => '',
// HACK: replace linebreaks plus br's with p's
'!(<br[^>]*>[ \r\n\s]*){2,}!i' => '</p><p>',
// replace noscripts
//'!</?noscript>!is' => '',
// replace fonts to spans
'!<(/?)font[^>]*>!is' => '<\\1span>',
);
// output HTML filters
protected $post_filters = array(
'/<br\s*\/?>\s*<p/i' => '<p', // replace excessive br's
'!<(?:a|div|p)[^>]+/>!is' => '', // replace empty tags that break layouts
//'!<(\s*/?\s*(?:blockquote|br|hr|code|div|article|span|footer|aside|p|pre|dl|li|ul|ol)) [^>]+>!is' => "<\\1>", // remove all attributes on text tags
"/\n+/" => "\n", //single newlines cleanup
'!<pre[^>]*>\s*<code!is' => '<pre', // modern web...
// replace excessive br's
'/<br\s*\/?>\s*<p/i' => '<p',
// replace empty tags that break layouts
'!<(?:a|div|p)[^>]+/>!is' => '',
// remove all attributes on text tags
//'!<(\s*/?\s*(?:blockquote|br|hr|code|div|article|span|footer|aside|p|pre|dl|li|ul|ol)) [^>]+>!is' => "<\\1>",
//single newlines cleanup
"/\n+/" => "\n",
// modern web...
'!<pre[^>]*>\s*<code!is' => '<pre',
'!</code>\s*</pre>!is' => '</pre>',
'!<[hb]r>!is' => '<\\1 />',
);
@ -366,12 +385,12 @@ class Readability
*/
protected function getArticleTitle()
{
$curTitle = '';
$origTitle = '';
try {
$curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0));
} catch (Exception $e) {
} catch (\Exception $e) {
$curTitle = '';
}
if (preg_match('/ [\|\-] /', $curTitle)) {
@ -504,14 +523,10 @@ class Readability
*/
public function prepArticle(\DOMElement $articleContent)
{
if ($this->lightClean) {
$this->dbg('Light clean enabled.');
} else {
$this->dbg('Standard clean enabled.');
}
$this->dbg($this->lightClean ? 'Light clean enabled.' : 'Standard clean enabled.');
$this->cleanStyles($articleContent);
$this->killBreaks($articleContent);
$xpath = new \DOMXPath($articleContent->ownerDocument);
if ($this->revertForcedParagraphElements) {
@ -563,23 +578,25 @@ class Readability
$articleParagraphs = $articleContent->getElementsByTagName('p');
for ($i = $articleParagraphs->length - 1; $i >= 0; --$i) {
$imgCount = $articleParagraphs->item($i)->getElementsByTagName('img')->length;
$embedCount = $articleParagraphs->item($i)->getElementsByTagName('embed')->length;
$objectCount = $articleParagraphs->item($i)->getElementsByTagName('object')->length;
$videoCount = $articleParagraphs->item($i)->getElementsByTagName('video')->length;
$audioCount = $articleParagraphs->item($i)->getElementsByTagName('audio')->length;
$iframeCount = $articleParagraphs->item($i)->getElementsByTagName('iframe')->length;
$item = $articleParagraphs->item($i);
if ($iframeCount === 0 && $imgCount === 0 && $embedCount === 0 && $objectCount === 0 && $videoCount === 0 && $audioCount === 0 && mb_strlen(preg_replace('/\s+/is', '', $this->getInnerText($articleParagraphs->item($i), false, false))) === 0) {
$articleParagraphs->item($i)->parentNode->removeChild($articleParagraphs->item($i));
$imgCount = $item->getElementsByTagName('img')->length;
$embedCount = $item->getElementsByTagName('embed')->length;
$objectCount = $item->getElementsByTagName('object')->length;
$videoCount = $item->getElementsByTagName('video')->length;
$audioCount = $item->getElementsByTagName('audio')->length;
$iframeCount = $item->getElementsByTagName('iframe')->length;
if ($iframeCount === 0 && $imgCount === 0 && $embedCount === 0 && $objectCount === 0 && $videoCount === 0 && $audioCount === 0 && mb_strlen(preg_replace('/\s+/is', '', $this->getInnerText($item, false, false))) === 0) {
$item->parentNode->removeChild($item);
}
// add extra text to iframe tag to avoid an auto-closing iframe and then break the html code
if ($iframeCount) {
$iframe = $articleParagraphs->item($i)->getElementsByTagName('iframe');
$iframe = $item->getElementsByTagName('iframe');
$iframe->item(0)->nodeValue = ' ';
$articleParagraphs->item($i)->parentNode->replaceChild($iframe->item(0), $articleParagraphs->item($i));
$item->parentNode->replaceChild($iframe->item(0), $item);
}
}
@ -589,7 +606,7 @@ class Readability
$articleContent->innerHTML = preg_replace($search, $replace, $articleContent->innerHTML);
}
unset($search, $replace);
} catch (Exception $e) {
} catch (\Exception $e) {
$this->dbg('Cleaning output HTML failed. Ignoring: '.$e->getMessage());
}
}
@ -626,10 +643,10 @@ class Readability
case 'FIGURE':
$readability->value += 3;
break;
/* case 'SECTION': // often misused
$readability->value += 2;
case 'SECTION':
// often misused
// $readability->value += 2;
break;
*/
case 'OL':
case 'UL':
case 'DL':
@ -665,12 +682,12 @@ class Readability
}
/**
* grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
* Using a variety of metrics (content score, classname, element types), find the content that is
* most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
*
* @param \DOMElement $page
*
* @return \DOMElement
* @return \DOMElement|bool
*/
protected function grabArticle(\DOMElement $page = null)
{
@ -703,13 +720,11 @@ class Readability
try {
$newNode->innerHTML = $node->innerHTML;
// It's easier to debug using original attributes.
//$newNode->setAttribute('class', $node->getAttribute('class'));
//$newNode->setAttribute('id', $node->getAttribute('id'));
$node = $node->parentNode->replaceChild($newNode, $node);
$node->parentNode->replaceChild($newNode, $node);
--$nodeIndex;
$nodesToScore[] = $newNode;
} catch (Exception $e) {
} catch (\Exception $e) {
$this->dbg('Could not alter div/article to p, reverting back to div: '.$e->getMessage());
}
} else {
@ -717,12 +732,15 @@ class Readability
for ($i = 0, $il = $node->childNodes->length; $i < $il; ++$i) {
$childNode = $node->childNodes->item($i);
if (is_object($childNode) && get_class($childNode) === 'DOMProcessingInstruction') { //executable tags (<?php or <?xml) warning
// executable tags (<?php or <?xml) warning
if (is_object($childNode) && get_class($childNode) === 'DOMProcessingInstruction') {
$childNode->parentNode->removeChild($childNode);
continue;
}
if ($childNode->nodeType == 3) { // XML_TEXT_NODE
// XML_TEXT_NODE
if ($childNode->nodeType == 3) {
//$this->dbg('replacing text node with a P tag with the same content.');
$p = $this->dom->createElement('p');
$p->innerHTML = $childNode->nodeValue;
@ -743,12 +761,13 @@ class Readability
*/
for ($pt = 0, $scored = count($nodesToScore); $pt < $scored; ++$pt) {
$parentNode = $nodesToScore[$pt]->parentNode;
// No parent node? Move on...
if (!$parentNode) {
continue;
}
$grandParentNode = ($parentNode->parentNode instanceof DOMElement) ? $parentNode->parentNode : null;
$grandParentNode = ($parentNode->parentNode instanceof \DOMElement) ? $parentNode->parentNode : null;
$innerText = $this->getInnerText($nodesToScore[$pt]);
// If this paragraph is less than MIN_PARAGRAPH_LENGTH (default:20) characters, don't even count it.
@ -778,7 +797,7 @@ class Readability
/* TEST: For every positive/negative parent tag, add/substract half point. Up to 3 points. *\/
$up = $nodesToScore[$pt];
$score = 0;
while ($up->parentNode instanceof DOMElement) {
while ($up->parentNode instanceof \DOMElement) {
$up = $up->parentNode;
if (preg_match($this->regexps['positive'], $up->getAttribute('class') . ' ' . $up->getAttribute('id'))) {
$score += 0.5;
@ -802,8 +821,9 @@ class Readability
*/
if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS) && $xpath) {
$candidates = $xpath->query('.//*[(self::footer and count(//footer)<2) or (self::aside and count(//aside)<2)]', $page->documentElement);
$node = null;
for ($node = null, $c = $candidates->length - 1; $c >= 0; --$c) {
for ($c = $candidates->length - 1; $c >= 0; --$c) {
$node = $candidates->item($c);
// node should be readable but not inside of an article otherwise it's probably non-readable block
if ($node->hasAttribute('readability') && (int) $node->getAttributeNode('readability')->value < 40 && ($node->parentNode ? strcasecmp($node->parentNode->tagName, 'article') !== 0 : true)) {
@ -813,8 +833,9 @@ class Readability
}
$candidates = $xpath->query('.//*[not(self::body) and (@class or @id or @style) and ((number(@readability) < 40) or not(@readability))]', $page->documentElement);
$node = null;
for ($node = null, $c = $candidates->length - 1; $c >= 0; --$c) {
for ($c = $candidates->length - 1; $c >= 0; --$c) {
$node = $candidates->item($c);
// Remove unlikely candidates
@ -842,15 +863,17 @@ class Readability
$candidates = $xpath->query('.//*[@data-candidate]', $page->documentElement);
for ($c = $candidates->length - 1; $c >= 0; --$c) {
$item = $candidates->item($c);
// Scale the final candidates score based on link density. Good content should have a
// relatively small link density (5% or less) and be mostly unaffected by this operation.
// If not for this we would have used XPath to find maximum @readability.
$readability = $candidates->item($c)->getAttributeNode('readability');
$readability->value = round($readability->value * (1 - $this->getLinkDensity($candidates->item($c))), 0, PHP_ROUND_HALF_UP);
$readability = $item->getAttributeNode('readability');
$readability->value = round($readability->value * (1 - $this->getLinkDensity($item)), 0, PHP_ROUND_HALF_UP);
if (!$topCandidate || $readability->value > (int) $topCandidate->getAttribute('readability')) {
$this->dbg('Candidate: '.$candidates->item($c)->getNodePath().' ('.$candidates->item($c)->getAttribute('class').':'.$candidates->item($c)->getAttribute('id').') with score '.$readability->value);
$topCandidate = $candidates->item($c);
$this->dbg('Candidate: '.$item->getNodePath().' ('.$item->getAttribute('class').':'.$item->getAttribute('id').') with score '.$readability->value);
$topCandidate = $item;
}
}
@ -889,7 +912,7 @@ class Readability
if (strcasecmp($tagName, 'td') === 0 || strcasecmp($tagName, 'tr') === 0) {
$up = $topCandidate;
if ($up->parentNode instanceof DOMElement) {
if ($up->parentNode instanceof \DOMElement) {
$up = $up->parentNode;
if (strcasecmp($up->tagName, 'table') === 0) {
@ -949,7 +972,6 @@ class Readability
if ($append) {
$this->dbg('Appending node: '.$siblingNode->getNodePath());
$nodeToAppend = null;
if (strcasecmp($siblingNodeName, 'div') !== 0 && strcasecmp($siblingNodeName, 'p') !== 0) {
// We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident.
@ -959,7 +981,7 @@ class Readability
try {
$nodeToAppend->setAttribute('alt', $siblingNodeName);
$nodeToAppend->innerHTML = $siblingNode->innerHTML;
} catch (Exception $e) {
} catch (\Exception $e) {
$this->dbg('Could not alter siblingNode "'.$siblingNodeName.'" to "div", reverting to original.');
$nodeToAppend = $siblingNode;
--$s;
@ -1133,19 +1155,20 @@ class Readability
}
$weight = 0;
//$attribute_val = trim($element->getAttribute('class')." ".$element->getAttribute('id'));
$attribute_val = trim($element->getAttribute($attribute));
if ($attribute_val != '') {
if (preg_match($this->regexps['negative'], $attribute_val)) {
// $attributeValue = trim($element->getAttribute('class')." ".$element->getAttribute('id'));
$attributeValue = trim($element->getAttribute($attribute));
if ($attributeValue != '') {
if (preg_match($this->regexps['negative'], $attributeValue)) {
$weight -= 25;
}
if (preg_match($this->regexps['positive'], $attribute_val)) {
if (preg_match($this->regexps['positive'], $attributeValue)) {
$weight += 25;
}
if (preg_match($this->regexps['unlikelyCandidates'], $attribute_val)) {
if (preg_match($this->regexps['unlikelyCandidates'], $attributeValue)) {
$weight -= 5;
}
if (preg_match($this->regexps['okMaybeItsACandidate'], $attribute_val)) {
if (preg_match($this->regexps['okMaybeItsACandidate'], $attributeValue)) {
$weight += 5;
}
}
@ -1198,15 +1221,16 @@ class Readability
*/
public function clean(\DOMElement $e, $tag)
{
$currentItem = null;
$targetList = $e->getElementsByTagName($tag);
$isEmbed = ($tag === 'audio' || $tag === 'video' || $tag === 'iframe' || $tag === 'object' || $tag === 'embed');
for ($cur_item = null, $y = $targetList->length - 1; $y >= 0; --$y) {
for ($y = $targetList->length - 1; $y >= 0; --$y) {
// Allow youtube and vimeo videos through as people usually want to see those.
$cur_item = $targetList->item($y);
$currentItem = $targetList->item($y);
if ($isEmbed) {
$attributeValues = $cur_item->getAttribute('src').' '.$cur_item->getAttribute('href');
$attributeValues = $currentItem->getAttribute('src').' '.$currentItem->getAttribute('href');
// First, check the elements attributes to see if any of them contain known media hosts
if (preg_match($this->regexps['media'], $attributeValues)) {
@ -1219,7 +1243,7 @@ class Readability
}
}
$cur_item->parentNode->removeChild($cur_item);
$currentItem->parentNode->removeChild($currentItem);
}
}
@ -1239,6 +1263,7 @@ class Readability
$tagsList = $e->getElementsByTagName($tag);
$curTagsLength = $tagsList->length;
$node = null;
/*
* Gather counts for other typical elements embedded within.
@ -1246,9 +1271,8 @@ class Readability
*
* TODO: Consider taking into account original contentScore here.
*/
for ($node = null, $i = $curTagsLength - 1; $i >= 0; --$i) {
for ($i = $curTagsLength - 1; $i >= 0; --$i) {
$node = $tagsList->item($i);
//$class = $node->getAttribute('class').' '.$node->getAttribute('id'); //debug
$weight = $this->getWeight($node);
$contentScore = ($node->hasAttribute('readability')) ? (int) $node->getAttribute('readability') : 0;
$this->dbg('Start conditional cleaning of '.$node->getNodePath().' (class='.$node->getAttribute('class').'; id='.$node->getAttribute('id').')'.(($node->hasAttribute('readability')) ? (' with score '.$node->getAttribute('readability')) : ''));
@ -1332,7 +1356,6 @@ class Readability
}
if ($toRemove) {
//$this->dbg('Removing: '.$node->innerHTML);
$this->dbg('Removing...');
$node->parentNode->removeChild($node);
}
@ -1349,6 +1372,7 @@ class Readability
{
for ($headerIndex = 1; $headerIndex < 3; ++$headerIndex) {
$headers = $e->getElementsByTagName('h'.$headerIndex);
for ($i = $headers->length - 1; $i >= 0; --$i) {
if ($this->getWeight($headers->item($i)) < 0 || $this->getLinkDensity($headers->item($i)) > 0.33) {
$headers->item($i)->parentNode->removeChild($headers->item($i));

@ -194,11 +194,10 @@ class ReadabilityTest extends \PHPUnit_Framework_TestCase
$this->assertTrue($res);
$this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getContent());
$this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getTitle());
$this->assertContains('alt="article"', $readability->getContent()->innerHTML);
$this->assertEmpty($readability->getTitle()->innerHTML);
$this->assertContains('This is an awesome text with some links, here there are', $readability->getContent()->innerHTML);
$this->assertNotContains('<aside>', $readability->getContent()->innerHTML);
$this->assertContains('<footer/>', $readability->getContent()->innerHTML);
$this->assertContains('<footer readability="4"/>', $readability->getContent()->innerHTML);
}
public function testWithClasses()
@ -225,7 +224,6 @@ class ReadabilityTest extends \PHPUnit_Framework_TestCase
$this->assertTrue($res);
$this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getContent());
$this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getTitle());
$this->assertContains('alt="tr"', $readability->getContent()->innerHTML);
$this->assertEmpty($readability->getTitle()->innerHTML);
$this->assertContains('This is an awesome text with some links, here there are', $readability->getContent()->innerHTML);
}
@ -239,7 +237,6 @@ class ReadabilityTest extends \PHPUnit_Framework_TestCase
$this->assertTrue($res);
$this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getContent());
$this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getTitle());
$this->assertContains('alt="article"', $readability->getContent()->innerHTML);
$this->assertEmpty($readability->getTitle()->innerHTML);
$this->assertContains('This is an awesome text with some links, here there are', $readability->getContent()->innerHTML);
$this->assertContains('This text is also an awesome text and you should know that', $readability->getContent()->innerHTML);
@ -254,7 +251,6 @@ class ReadabilityTest extends \PHPUnit_Framework_TestCase
$this->assertTrue($res);
$this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getContent());
$this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getTitle());
$this->assertContains('alt="article"', $readability->getContent()->innerHTML);
$this->assertEmpty($readability->getTitle()->innerHTML);
$this->assertContains('This is an awesome text with some links, here there are', $readability->getContent()->innerHTML);
$this->assertNotContains('This text is also an awesome text and you should know that', $readability->getContent()->innerHTML);
@ -269,7 +265,6 @@ class ReadabilityTest extends \PHPUnit_Framework_TestCase
$this->assertTrue($res);
$this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getContent());
$this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getTitle());
$this->assertContains('alt="article"', $readability->getContent()->innerHTML);
$this->assertEquals('this is my title', $readability->getTitle()->innerHTML);
$this->assertContains('This is an awesome text with some links, here there are', $readability->getContent()->innerHTML);
$this->assertNotContains('This text is also an awesome text and you should know that', $readability->getContent()->innerHTML);
@ -284,7 +279,6 @@ class ReadabilityTest extends \PHPUnit_Framework_TestCase
$this->assertTrue($res);
$this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getContent());
$this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getTitle());
$this->assertContains('alt="article"', $readability->getContent()->innerHTML);
$this->assertEquals('title2 - title3', $readability->getTitle()->innerHTML);
$this->assertContains('This is an awesome text with some links, here there are', $readability->getContent()->innerHTML);
$this->assertNotContains('This text is also an awesome text and you should know that', $readability->getContent()->innerHTML);
@ -299,7 +293,6 @@ class ReadabilityTest extends \PHPUnit_Framework_TestCase
$this->assertTrue($res);
$this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getContent());
$this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getTitle());
$this->assertContains('alt="article"', $readability->getContent()->innerHTML);
$this->assertEquals('title2 : title3', $readability->getTitle()->innerHTML);
$this->assertContains('This is an awesome text with some links, here there are', $readability->getContent()->innerHTML);
$this->assertNotContains('This text is also an awesome text and you should know that', $readability->getContent()->innerHTML);
@ -314,7 +307,6 @@ class ReadabilityTest extends \PHPUnit_Framework_TestCase
$this->assertTrue($res);
$this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getContent());
$this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getTitle());
$this->assertContains('alt="article"', $readability->getContent()->innerHTML);
$this->assertEquals('this is my h1 title !', $readability->getTitle()->innerHTML);
$this->assertContains('This is an awesome text with some links, here there are', $readability->getContent()->innerHTML);
$this->assertNotContains('This text is also an awesome text and you should know that', $readability->getContent()->innerHTML);

Loading…
Cancel
Save