diff --git a/.travis.yml b/.travis.yml index 01483ae..e55b1c3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -23,6 +23,10 @@ install: - composer self-update before_script: + # disable TLS for composer because openssl is disabled for PHP 5.3.3 on travis + # see: https://blog.travis-ci.com/upcoming_ubuntu_11_10_migration/ + - if [[ $TRAVIS_PHP_VERSION = 5.3.3 ]]; then composer config -g -- disable-tls true; fi; + - if [[ $TRAVIS_PHP_VERSION = 5.3.3 ]]; then composer config -g -- secure-http false; fi; - composer install --prefer-dist --no-interaction script: diff --git a/src/Readability.php b/src/Readability.php index 5654eb9..644cf62 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -69,7 +69,7 @@ class Readability * Defined up here so we don't instantiate them repeatedly in loops. */ public $regexps = array( - 'unlikelyCandidates' => '/display\s*:\s*none|ignore|\binfo|annoy|clock|date|time|author|intro|links|hidd?e|about|archive|\bprint|bookmark|tags|share|search|social|robot|published|combx|comment|mast(?:head)|subscri|community|category|disqus|extra|head|head(?:er|note)|floor|foot(?:er|note)|menu|tool|function|nav|remark|rss|shoutbox|tool|widget|meta|banner|sponsor|adsense|inner-?ad|ad-|sponsor|\badv\b|\bads\b|agr?egate?|pager|sidebar|popup|tweet|twitter/i', + 'unlikelyCandidates' => '/display\s*:\s*none|ignore|\binfo|annoy|clock|date|time|author|intro|links|hidd?e|about|archive|\bprint|bookmark|tags|tag-list|share|search|social|robot|published|combx|comment|mast(?:head)|subscri|community|category|disqus|extra|head|head(?:er|note)|floor|foot(?:er|note)|menu|tool|function|nav|remark|rss|shoutbox|tool|widget|meta|banner|sponsor|adsense|inner-?ad|ad-|sponsor|\badv\b|\bads\b|agr?egate?|pager|sidebar|popup|tweet|twitter/i', 'okMaybeItsACandidate' => '/article\b|contain|\bcontent|column|general|detail|shadow|lightbox|blog|body|entry|main|page/i', 'positive' => 
'/read|full|article|body|\bcontent|contain|entry|main|markdown|page|attach|pagination|post|text|blog|story/i', 'negative' => '/bottom|stat|info|discuss|e[\-]?mail|comment|reply|log.{2}(n|ed)|sign|single|combx|com-|contact|_nav|link|media|\bout|promo|\bad-|related|scroll|shoutbox|sidebar|sponsor|shopping|teaser|recommend/i', @@ -214,7 +214,7 @@ class Readability /** * Get article title element. * - * @return DOMElement + * @return \DOMElement */ public function getTitle() { @@ -224,7 +224,7 @@ class Readability /** * Get article content element. * - * @return DOMElement + * @return \DOMElement */ public function getContent() { @@ -326,6 +326,8 @@ class Readability /** * Debug. + * + * @param string $msg */ protected function dbg($msg) //, $error=false) { @@ -348,11 +350,11 @@ class Readability /** * Run any post-process modifications to article content as necessary. * - * @param DOMElement + * @param \DOMElement $articleContent */ - public function postProcessContent($articleContent) + public function postProcessContent(\DOMElement $articleContent) { - if ($this->convertLinksToFootnotes && !preg_match('/\bwiki/', @$this->url)) { + if ($this->convertLinksToFootnotes && !preg_match('/\bwiki/', $this->url)) { $this->addFootnotes($articleContent); } } @@ -360,7 +362,7 @@ class Readability /** * Get the article title as an H1. * - * @return DOMElement + * @return \DOMElement */ protected function getArticleTitle() { @@ -433,8 +435,10 @@ class Readability * For easier reading, convert this document to have footnotes at the bottom rather than inline links. 
* * @see http://www.roughtype.com/archives/2010/05/experiments_in.php + * + * @param \DOMElement $articleContent */ - public function addFootnotes($articleContent) + public function addFootnotes(\DOMElement $articleContent) { $footnotesWrapper = $this->dom->createElement('footer'); $footnotesWrapper->setAttribute('class', 'readability-footnotes'); @@ -496,9 +500,9 @@ class Readability * Prepare the article node for display. Clean out any inline styles, * iframes, forms, strip extraneous

tags, etc. * - * @param DOMElement + * @param \DOMElement $articleContent */ - public function prepArticle($articleContent) + public function prepArticle(\DOMElement $articleContent) { if ($this->lightClean) { $this->dbg('Light clean enabled.'); @@ -595,9 +599,9 @@ class Readability * Initialize a node with the readability object. Also checks the * className/id for special names to add to its score. * - * @param Element + * @param \DOMElement $node */ - protected function initializeNode($node) + protected function initializeNode(\DOMElement $node) { if (!isset($node->tagName)) { return; @@ -664,9 +668,11 @@ class Readability * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. * - * @return DOMElement + * @param \DOMElement $page + * + * @return \DOMElement */ - protected function grabArticle($page = null) + protected function grabArticle(\DOMElement $page = null) { if (!$page) { $page = $this->dom; @@ -789,6 +795,7 @@ class Readability $grandParentNode->getAttributeNode('readability')->value += $contentScore / self::GRANDPARENT_SCORE_DIVISOR; } } + /* * Node prepping: trash nodes that look cruddy (like ones with the class name "comment", etc). * This is faster to do before scoring but safer after. @@ -800,7 +807,7 @@ class Readability $node = $candidates->item($c); // node should be readable but not inside of an article otherwise it's probably non-readable block if ($node->hasAttribute('readability') && (int) $node->getAttributeNode('readability')->value < 40 && ($node->parentNode ? strcasecmp($node->parentNode->tagName, 'article') !== 0 : true)) { - $this->dbg('Removing unlikely candidate '.$node->getNodePath().' by "'.$node->tagName.'" with readability '.($node->hasAttribute('readability') ? 
(int) $node->getAttributeNode('readability')->value : 0)); + $this->dbg('Removing unlikely candidate (using note) '.$node->getNodePath().' by "'.$node->tagName.'" with readability '.($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0)); $node->parentNode->removeChild($node); } } @@ -809,15 +816,15 @@ class Readability for ($node = null, $c = $candidates->length - 1; $c >= 0; --$c) { $node = $candidates->item($c); - $tagName = $node->tagName; - /* Remove unlikely candidates */ + + // Remove unlikely candidates $unlikelyMatchString = $node->getAttribute('class').' '.$node->getAttribute('id').' '.$node->getAttribute('style'); - //$this->dbg('Processing '.$node->getNodePath().' by "'. $unlikelyMatchString.'" with readability '.($node->hasAttribute('readability') ? (int)$node->getAttributeNode('readability')->value : 0)); + if (mb_strlen($unlikelyMatchString) > 3 && // don't process "empty" strings preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) && !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) ) { - $this->dbg('Removing unlikely candidate '.$node->getNodePath().' by "'.$unlikelyMatchString.'" with readability '.($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0)); + $this->dbg('Removing unlikely candidate (using conf) '.$node->getNodePath().' by "'.$unlikelyMatchString.'" with readability '.($node->hasAttribute('readability') ? 
(int) $node->getAttributeNode('readability')->value : 0)); $node->parentNode->removeChild($node); --$nodeIndex; } @@ -934,9 +941,8 @@ class Readability $nodeContent = $this->getInnerText($siblingNode, true, true); $nodeLength = mb_strlen($nodeContent); - if ($nodeLength > self::MIN_NODE_LENGTH && $linkDensity < self::MAX_LINK_DENSITY) { - $append = true; - } elseif ($nodeLength < self::MIN_NODE_LENGTH && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent)) { + if (($nodeLength > self::MIN_NODE_LENGTH && $linkDensity < self::MAX_LINK_DENSITY) + || ($nodeLength < self::MIN_NODE_LENGTH && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent))) { $append = true; } } @@ -946,19 +952,15 @@ class Readability $nodeToAppend = null; if (strcasecmp($siblingNodeName, 'div') !== 0 && strcasecmp($siblingNodeName, 'p') !== 0) { - /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */ - $this->dbg('Altering siblingNode '.$siblingNodeName.' to div.'); + // We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. + $this->dbg('Altering siblingNode "'.$siblingNodeName.'" to "div".'); $nodeToAppend = $this->dom->createElement('div'); try { - if ($siblingNode->getAttribute('id')) { - $nodeToAppend->setAttribute('id', $siblingNode->getAttribute('id')); - } - $nodeToAppend->setAttribute('alt', $siblingNodeName); $nodeToAppend->innerHTML = $siblingNode->innerHTML; } catch (Exception $e) { - $this->dbg('Could not alter siblingNode '.$siblingNodeName.' to div, reverting to original.'); + $this->dbg('Could not alter siblingNode "'.$siblingNodeName.'" to "div", reverting to original.'); $nodeToAppend = $siblingNode; --$s; --$sl; @@ -1019,13 +1021,13 @@ class Readability * Get the inner text of a node. * This also strips out any excess whitespace to be found. 
* - * @param DOMElement $e - * @param bool $normalizeSpaces (default: true) - * @param bool $flattenLines (default: false) + * @param \DOMElement $e + * @param bool $normalizeSpaces (default: true) + * @param bool $flattenLines (default: false) * * @return string */ - public function getInnerText($e, $normalizeSpaces = true, $flattenLines = false) + public function getInnerText(\DOMElement $e, $normalizeSpaces = true, $flattenLines = false) { if (!isset($e->textContent) || $e->textContent === '') { return ''; @@ -1045,9 +1047,9 @@ class Readability /** * Remove the style attribute on every $e and under. * - * @param DOMElement $e + * @param \DOMElement $e */ - public function cleanStyles($e) + public function cleanStyles(\DOMElement $e) { if (!is_object($e)) { return; @@ -1065,7 +1067,7 @@ class Readability * * @param string $text * - * @return number (integer) + * @return int */ public function getCommaCount($text) { @@ -1078,7 +1080,7 @@ class Readability * * @param string $text * - * @return number (integer) + * @return int */ public function getWordCount($text) { @@ -1090,12 +1092,12 @@ class Readability * This is the amount of text that is inside a link divided by the total text in the node. * Can exclude external references to differentiate between simple text and menus/infoblocks. * - * @param DOMElement $e - * @param string $excludeExternal + * @param \DOMElement $e + * @param bool $excludeExternal * - * @return number (float) + * @return float */ - public function getLinkDensity($e, $excludeExternal = false) + public function getLinkDensity(\DOMElement $e, $excludeExternal = false) { $links = $e->getElementsByTagName('a'); $textLength = mb_strlen($this->getInnerText($e, true, true)); @@ -1119,12 +1121,12 @@ class Readability * Get an element weight by attribute. * Uses regular expressions to tell if this element looks good or bad. 
* - * @param DOMElement $element - * @param string $attribute + * @param \DOMElement $element + * @param string $attribute * - * @return number (Integer) + * @return int */ - protected function weightAttribute($element, $attribute) + protected function weightAttribute(\DOMElement $element, $attribute) { if (!$element->hasAttribute($attribute)) { return 0; @@ -1154,20 +1156,20 @@ class Readability /** * Get an element relative weight. * - * @param DOMElement $e + * @param \DOMElement $e * - * @return number (Integer) + * @return int */ - public function getWeight($e) + public function getWeight(\DOMElement $e) { if (!$this->flagIsActive(self::FLAG_WEIGHT_ATTRIBUTES)) { return 0; } $weight = 0; - /* Look for a special classname */ + // Look for a special classname $weight += $this->weightAttribute($e, 'class'); - /* Look for a special ID */ + // Look for a special ID $weight += $this->weightAttribute($e, 'id'); return $weight; @@ -1176,9 +1178,9 @@ class Readability /** * Remove extraneous break tags from a node. * - * @param DOMElement $node + * @param \DOMElement $node */ - public function killBreaks($node) + public function killBreaks(\DOMElement $node) { $html = $node->innerHTML; $html = preg_replace($this->regexps['killBreaks'], '
', $html); @@ -1191,27 +1193,27 @@ class Readability * * Updated 2012-09-18 to preserve youtube/vimeo iframes * - * @param DOMElement $e - * @param string $tag + * @param \DOMElement $e + * @param string $tag */ - public function clean($e, $tag) + public function clean(\DOMElement $e, $tag) { $targetList = $e->getElementsByTagName($tag); $isEmbed = ($tag === 'audio' || $tag === 'video' || $tag === 'iframe' || $tag === 'object' || $tag === 'embed'); for ($cur_item = null, $y = $targetList->length - 1; $y >= 0; --$y) { - /* Allow youtube and vimeo videos through as people usually want to see those. */ + // Allow youtube and vimeo videos through as people usually want to see those. $cur_item = $targetList->item($y); if ($isEmbed) { $attributeValues = $cur_item->getAttribute('src').' '.$cur_item->getAttribute('href'); - /* First, check the elements attributes to see if any of them contain known media hosts */ + // First, check the elements attributes to see if any of them contain known media hosts if (preg_match($this->regexps['media'], $attributeValues)) { continue; } - /* Then check the elements inside this element for the same. */ + // Then check the elements inside this element for the same. if (preg_match($this->regexps['media'], $targetList->item($y)->innerHTML)) { continue; } @@ -1226,10 +1228,10 @@ class Readability * "Fishy" is an algorithm based on content length, classnames, * link density, number of images & embeds, etc. * - * @param DOMElement $e - * @param string $tag + * @param \DOMElement $e + * @param string $tag */ - public function cleanConditionally($e, $tag) + public function cleanConditionally(\DOMElement $e, $tag) { if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) { return; @@ -1341,9 +1343,9 @@ class Readability /** * Clean out spurious headers from an Element. Checks things like classnames and link density. 
* - * @param DOMElement $e + * @param \DOMElement $e */ - public function cleanHeaders($e) + public function cleanHeaders(\DOMElement $e) { for ($headerIndex = 1; $headerIndex < 3; ++$headerIndex) { $headers = $e->getElementsByTagName('h'.$headerIndex); @@ -1355,16 +1357,33 @@ class Readability } } + /** + * Check if the given flag is active. + * + * @param int $flag + * + * @return bool + */ public function flagIsActive($flag) { return ($this->flags & $flag) > 0; } + /** + * Add a flag. + * + * @param int $flag + */ public function addFlag($flag) { $this->flags = $this->flags | $flag; } + /** + * Remove a flag. + * + * @param int $flag + */ public function removeFlag($flag) { $this->flags = $this->flags & ~$flag; diff --git a/tests/ReadabilityTest.php b/tests/ReadabilityTest.php index e82c1d6..f1fd94a 100644 --- a/tests/ReadabilityTest.php +++ b/tests/ReadabilityTest.php @@ -379,4 +379,65 @@ class ReadabilityTest extends \PHPUnit_Framework_TestCase $this->assertContains('', $readability->getContent()->innerHTML); $this->assertContains('3D Touch', $readability->getTitle()->innerHTML); } + + /** + * This should generate an Exception "DOMElement::setAttribute(): ID post-60 already defined" + */ + public function testAppendIdAlreadyHere() + { + $data = ' + + + + + + +

+ + + + +
+
+ This is an awesome text with some links, here there are
+ This is an awesome text with some links, here there are
+ This is an awesome text with some links, here there are
+ This is an awesome text with some links, here there are
+ This is an awesome text with some links, here there are
+ This is an awesome text with some links, here there are
+ This is an awesome text with some links, here there are
+ This is an awesome text with some links, here there are
+ This is an awesome text with some links, here there are
+ This is an awesome text with some links, here there are
+ This is an awesome text with some links, here there are +
+ +
+
+ + '; + + $readability = new ReadabilityTested($data, 'http://0.0.0.0'); + $readability->debug = true; + + $res = $readability->init(); + + $this->assertTrue($res); + $this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getContent()); + $this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getTitle()); + // $this->assertContains('', $readability->getContent()->innerHTML); + // $this->assertContains('3D Touch', $readability->getTitle()->innerHTML); + } }