From f5d473780da96889eff75c8f08f71ba2052206a4 Mon Sep 17 00:00:00 2001 From: Jeremy Benoist Date: Tue, 28 Apr 2015 17:56:54 +0200 Subject: [PATCH 1/2] Fix javascript typo And add coverage --- .gitignore | 2 ++ phpunit.xml.dist | 5 ++++- src/Readability.php | 5 +++-- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 48b8bf9..d191143 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,3 @@ vendor/ +coverage/ +composer.lock diff --git a/phpunit.xml.dist b/phpunit.xml.dist index f727e46..e6a0903 100644 --- a/phpunit.xml.dist +++ b/phpunit.xml.dist @@ -19,11 +19,14 @@ - ./src/TubeLink/ + ./src/ ./tests + + + diff --git a/src/Readability.php b/src/Readability.php index 7f1837e..50e5d2f 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -52,6 +52,7 @@ class Readability public $revertForcedParagraphElements = true; public $articleTitle; public $articleContent; + public $original_html; public $dom; public $url = null; // optional - URL where HTML was retrieved public $lightClean = true; // preserves more content (experimental) @@ -179,7 +180,7 @@ class Readability $this->debugText .= 'Tidying document'."\n"; $tidy = tidy_parse_string($html, $this->tidy_config, 'UTF8'); if (tidy_clean_repair($tidy)) { - $original_html = $html; + $this->original_html = $html; $this->tidied = true; $html = $tidy->value; $html = preg_replace('/]+>/i', '', $html); @@ -258,7 +259,7 @@ class Readability if ($this->bodyCache == null) { $this->bodyCache = ''; foreach ($bodyElems as $bodyNode) { - $this->bodyCache += $bodyNode->innerHTML; + $this->bodyCache .= trim($bodyNode->innerHTML); } } if ($bodyElems->length > 0 && $this->body == null) { From 1963319a5561c6baff4d59ed11ff49bf7561dc11 Mon Sep 17 00:00:00 2001 From: Jeremy Benoist Date: Tue, 28 Apr 2015 17:57:11 +0200 Subject: [PATCH 2/2] Improve Travis & add Scrutinizer + CS + Update README --- .scrutinizer.yml | 3 + .travis.yml | 23 +++- README.md | 1 + src/JSLikeHTMLElement.php | 13 ++- src/Readability.php | 237 +++++++++++++++++++------------------- tests/ReadabilityTest.php | 1 - 6 files changed, 152 insertions(+), 126 deletions(-) create mode 100644 .scrutinizer.yml diff --git a/.scrutinizer.yml b/.scrutinizer.yml new file mode 100644 index 0000000..4bb53b4 --- /dev/null +++ b/.scrutinizer.yml @@ -0,0 +1,3 @@ +tools: + external_code_coverage: + timeout: 600 diff --git a/.travis.yml b/.travis.yml index 4e7dccd..190b404 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,10 +4,29 @@ php: - 5.4 - 5.5 - 5.6 + - nightly + - hhvm-nightly -before_script: +# run build against nightly but allow them to fail +matrix: + fast_finish: true + allow_failures: + - php: nightly + - php: hhvm-nightly + +# faster builds on new travis setup not using sudo +sudo: false + +install: - composer self-update + +before_script: - composer install --prefer-dist --no-interaction script: - - phpunit --coverage-text + - phpunit --coverage-clover=coverage.clover + +after_script: + - | + wget https://scrutinizer-ci.com/ocular.phar + php ocular.phar code-coverage:upload --format=php-clover coverage.clover diff --git a/README.md b/README.md index a5cf760..223d6ba 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,7 @@ # Readability [![Build Status](https://travis-ci.org/j0k3r/php-readability.svg?branch=master)](https://travis-ci.org/j0k3r/php-readability) +[![Code Coverage](https://scrutinizer-ci.com/g/j0k3r/php-readability/badges/coverage.png?b=master)](https://scrutinizer-ci.com/g/j0k3r/php-readability/?branch=master) This is an extract of the Readability class from the [full-text-rss](https://github.com/Dither/full-text-rss) fork. It kind be defined as a better version of the original [php-readability](http://code.fivefilters.org/php-readability). diff --git a/src/JSLikeHTMLElement.php b/src/JSLikeHTMLElement.php index 6a732cb..ef158fa 100644 --- a/src/JSLikeHTMLElement.php +++ b/src/JSLikeHTMLElement.php @@ -3,7 +3,7 @@ namespace Readability; /** - * JavaScript-like HTML DOM Element + * JavaScript-like HTML DOM Element. * * This class extends PHP's DOMElement to allow * users to get and set the innerHTML property of @@ -31,12 +31,14 @@ namespace Readability; * echo $doc->saveXML(); * * @author Keyvan Minoukadeh - http://www.keyvan.net - keyvan@keyvan.net + * * @see http://fivefilters.org (the project this was written for) */ class JSLikeHTMLElement extends \DOMElement { /** - * Used for setting innerHTML like it's done in JavaScript: + * Used for setting innerHTML like it's done in JavaScript:. + * * @code * $div->innerHTML = '

Chapter 2

The story begins...

'; * @endcode @@ -45,7 +47,7 @@ class JSLikeHTMLElement extends \DOMElement { if ($name == 'innerHTML') { // first, empty the element - for ($x=$this->childNodes->length-1; $x>=0; $x--) { + for ($x = $this->childNodes->length - 1; $x >= 0; $x--) { $this->removeChild($this->childNodes->item($x)); } // $value holds our new inner HTML @@ -86,7 +88,8 @@ class JSLikeHTMLElement extends \DOMElement } /** - * Used for getting innerHTML like it's done in JavaScript: + * Used for getting innerHTML like it's done in JavaScript:. + * * @code * $string = $div->innerHTML; * @endcode @@ -105,7 +108,7 @@ class JSLikeHTMLElement extends \DOMElement $trace = debug_backtrace(); trigger_error('Undefined property via __get(): '.$name.' in '.$trace[0]['file'].' on line '.$trace[0]['line'], E_USER_NOTICE); - return null; + return; } public function __toString() diff --git a/src/Readability.php b/src/Readability.php index 50e5d2f..eabcca8 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -14,7 +14,7 @@ namespace Readability; * More information: http://fivefilters.org/content-only/ * License: Apache License, Version 2.0 * Requires: PHP version 5.2.0+ - * Date: 2013-08-02 + * Date: 2013-08-02. * * Differences between the PHP port and the original * ------------------------------------------------------ @@ -76,7 +76,7 @@ class Readability 'divToPElements' => '/<(?:blockquote|code|div|article|footer|aside|img|p|pre|dl|ol|ul)/mi', 'killBreaks' => '/(([ \r\n\s]| ?)*)+/', 'media' => '!//(?:[^\.\?/]+\.)?(?:youtu(?:be)?|soundcloud|dailymotion|vimeo|pornhub|xvideos|twitvid|rutube|viddler)\.(?:com|be|org|net)/!i', - 'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i' + 'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i', ); public $tidy_config = array( 'tidy-mark' => false, @@ -101,7 +101,7 @@ class Readability // 'merge-spans' => true, 'input-encoding' => '????', 'output-encoding' => 'utf8', - 'hide-comments' => true + 'hide-comments' => true, ); // raw HTML filters protected $pre_filters = array( @@ -111,7 +111,7 @@ class Readability '!]*>\s*\[AD\]\s*!is' => '', // HACK: firewall-filtered content '!(]*>[ \r\n\s]*){2,}!i' => '

', // HACK: replace linebreaks plus br's with p's //'!!is' => '', // replace noscripts - '!<(/?)font[^>]*>!is' => '<\\1span>' // replace fonts to spans + '!<(/?)font[^>]*>!is' => '<\\1span>', // replace fonts to spans ); // output HTML filters protected $post_filters = array( @@ -121,7 +121,7 @@ class Readability "/\n+/" => "\n", //single newlines cleanup '!]*>\s* '\s*!is' => '', - '!<[hb]r>!is' => '<\\1 />' + '!<[hb]r>!is' => '<\\1 />', ); // flags const FLAG_STRIP_UNLIKELYS = 1; @@ -139,13 +139,14 @@ class Readability const MIN_NODE_LENGTH = 80; const MAX_LINK_DENSITY = 0.25; /** - * Create instance of Readability + * Create instance of Readability. + * * @param string UTF-8 encoded string * @param string (optional) URL associated with HTML (for footnotes) * @param string (optional) Which parser to use for turning raw HTML into a DOMDocument - * @param boolean (optional) Use tidy + * @param bool (optional) Use tidy */ - public function __construct($html, $url=null, $parser='libxml', $use_tidy=true) + public function __construct($html, $url = null, $parser = 'libxml', $use_tidy = true) { $this->url = $url; $this->debugText = 'Parsing URL: '.$url."\n"; @@ -154,9 +155,9 @@ class Readability $this->domainRegExp = '/'.strtr(preg_replace('/www\d*\./', '', parse_url($url, PHP_URL_HOST)), array('.' => '\.')).'/'; } - mb_internal_encoding("UTF-8"); - mb_http_output("UTF-8"); - mb_regex_encoding("UTF-8"); + mb_internal_encoding('UTF-8'); + mb_http_output('UTF-8'); + mb_regex_encoding('UTF-8'); // HACK: dirty cleanup to replace some stuff; shouldn't use regexps with HTML but well... if (!$this->flagIsActive(self::FLAG_DISABLE_PREFILTER)) { @@ -170,7 +171,7 @@ class Readability $html = ''; } - /** + /* * Use tidy (if it exists). * This fixes problems with some sites which would otherwise trouble DOMDocument's HTML parsing. * Although sometimes it makes matters worse, which is why there is an option to disable it. @@ -188,9 +189,9 @@ class Readability } unset($tidy); } - $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"); + $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'); - if (!($parser=='html5lib' && ($this->dom = \HTML5_Parser::parse($html)))) { + if (!($parser == 'html5lib' && ($this->dom = \HTML5_Parser::parse($html)))) { libxml_use_internal_errors(true); $this->dom = new \DOMDocument(); $this->dom->preserveWhiteSpace = false; @@ -201,7 +202,8 @@ class Readability $this->dom->registerNodeClass('DOMElement', 'Readability\JSLikeHTMLElement'); } /** - * Get article title element + * Get article title element. + * * @return DOMElement */ public function getTitle() @@ -209,7 +211,8 @@ class Readability return $this->articleTitle; } /** - * Get article content element + * Get article content element. + * * @return DOMElement */ public function getContent() @@ -217,20 +220,22 @@ class Readability return $this->articleContent; } /** - * Add pre filter for raw input HTML processing + * Add pre filter for raw input HTML processing. + * * @param string RegExp for replace * @param string (optional) Replacer */ - public function addPreFilter($filter, $replacer='') + public function addPreFilter($filter, $replacer = '') { $this->pre_filters[$filter] = $replacer; } /** - * Add post filter for raw output HTML processing + * Add post filter for raw output HTML processing. + * * @param string RegExp for replace * @param string (optional) Replacer */ - public function addPostFilter($filter, $replacer='') + public function addPostFilter($filter, $replacer = '') { $this->post_filters[$filter] = $replacer; } @@ -244,7 +249,7 @@ class Readability * 4. Replace the current DOM tree with the new one. * 5. Read peacefully. * - * @return boolean true if we found content, false otherwise + * @return bool true if we found content, false otherwise */ public function init() { @@ -296,7 +301,7 @@ class Readability return $this->success; } /** - * Debug + * Debug. */ protected function dbg($msg) //, $error=false) { @@ -306,12 +311,12 @@ class Readability } /** - * Dump debug info + * Dump debug info. */ protected function dump_dbg() { if ($this->debug) { - openlog("Readability PHP ", LOG_PID | LOG_PERROR, 0); + openlog('Readability PHP ', LOG_PID | LOG_PERROR, 0); syslog(6, $this->debugText); // 1 - error 6 - info } } @@ -319,7 +324,6 @@ class Readability * Run any post-process modifications to article content as necessary. * * @param DOMElement - * @return void */ public function postProcessContent($articleContent) { @@ -338,7 +342,8 @@ class Readability $origTitle = ''; try { $curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0)); - } catch (Exception $e) {} + } catch (Exception $e) { + } if (preg_match('/ [\|\-] /', $curTitle)) { $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle); if (count(explode(' ', $curTitle)) < 3) { @@ -347,7 +352,7 @@ class Readability } elseif (strpos($curTitle, ': ') !== false) { $curTitle = preg_replace('/.*:(.*)/i', '$1', $origTitle); if (count(explode(' ', $curTitle)) < 3) { - $curTitle = preg_replace('/[^:]*[:](.*)/i','$1', $origTitle); + $curTitle = preg_replace('/[^:]*[:](.*)/i', '$1', $origTitle); } } elseif (mb_strlen($curTitle) > 150 || mb_strlen($curTitle) < 15) { $hOnes = $this->dom->getElementsByTagName('h1'); @@ -367,12 +372,10 @@ class Readability /** * Prepare the HTML document for readability to scrape it. * This includes things like stripping javascript, CSS, and handling terrible markup. - * - * @return void */ protected function prepDocument() { - /** + /* * In some cases a body element can't be found (if the HTML is totally hosed for example) * so we create a new body node and append it to the document. */ @@ -383,19 +386,18 @@ class Readability $this->body->setAttribute('id', 'readabilityBody'); // Remove all style tags in head. $styleTags = $this->dom->getElementsByTagName('style'); - for ($i = $styleTags->length-1; $i >= 0; $i--) { + for ($i = $styleTags->length - 1; $i >= 0; $i--) { $styleTags->item($i)->parentNode->removeChild($styleTags->item($i)); } $linkTags = $this->dom->getElementsByTagName('link'); - for ($i = $linkTags->length-1; $i >= 0; $i--) { + for ($i = $linkTags->length - 1; $i >= 0; $i--) { $linkTags->item($i)->parentNode->removeChild($linkTags->item($i)); } } /** * For easier reading, convert this document to have footnotes at the bottom rather than inline links. - * @see http://www.roughtype.com/archives/2010/05/experiments_in.php * - * @return void + * @see http://www.roughtype.com/archives/2010/05/experiments_in.php */ public function addFootnotes($articleContent) { @@ -422,8 +424,8 @@ class Readability } $linkCount++; // Add a superscript reference after the article link. - $refLink->setAttribute('href', '#readabilityFootnoteLink-' . $linkCount); - $refLink->innerHTML = '[' . $linkCount . ']'; + $refLink->setAttribute('href', '#readabilityFootnoteLink-'.$linkCount); + $refLink->innerHTML = '['.$linkCount.']'; $refLink->setAttribute('class', 'readability-DoNotFootnote'); $refLink->setAttribute('style', 'color: inherit;'); if ($articleLink->parentNode->lastChild->isSameNode($articleLink)) { @@ -432,13 +434,13 @@ class Readability $articleLink->parentNode->insertBefore($refLink, $articleLink->nextSibling); } $articleLink->setAttribute('style', 'color: inherit; text-decoration: none;'); - $articleLink->setAttribute('name', 'readabilityLink-' . $linkCount); - $footnote->innerHTML = '^ '; + $articleLink->setAttribute('name', 'readabilityLink-'.$linkCount); + $footnote->innerHTML = '^ '; $footnoteLink->innerHTML = ($footnoteLink->getAttribute('title') != '' ? $footnoteLink->getAttribute('title') : $linkText); - $footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount); + $footnoteLink->setAttribute('name', 'readabilityFootnoteLink-'.$linkCount); $footnote->appendChild($footnoteLink); if ($linkDomain) { - $footnote->innerHTML = $footnote->innerHTML . ' (' . $linkDomain . ')'; + $footnote->innerHTML = $footnote->innerHTML.' ('.$linkDomain.')'; } $articleFootnotes->appendChild($footnote); } @@ -451,7 +453,6 @@ class Readability * iframes, forms, strip extraneous

tags, etc. * * @param DOMElement - * @return void */ public function prepArticle($articleContent) { @@ -464,25 +465,25 @@ class Readability $this->killBreaks($articleContent); $xpath = new \DOMXPath($articleContent->ownerDocument); if ($this->revertForcedParagraphElements) { - /** + /* * Reverts P elements with class 'readability-styled' to text nodes: * which is what they were before. */ $elems = $xpath->query('.//p[@data-readability-styled]', $articleContent); - for ($i = $elems->length-1; $i >= 0; $i--) { + for ($i = $elems->length - 1; $i >= 0; $i--) { $e = $elems->item($i); $e->parentNode->replaceChild($articleContent->ownerDocument->createTextNode($e->textContent), $e); } } // Remove service data-candidate attribute. $elems = $xpath->query('.//*[@data-candidate]', $articleContent); - for ($i = $elems->length-1; $i >= 0; $i--) { + for ($i = $elems->length - 1; $i >= 0; $i--) { $elems->item($i)->removeAttribute('data-candidate'); } // Remove unrelated links and other unneded stuff. // (not(*) and not(text()[normalize-space()])) or // What's wrong here? $elems = $xpath->query('.//a[@rel="nofollow"]', $articleContent); - for ($i = $elems->length-1; $i >= 0; $i--) { + for ($i = $elems->length - 1; $i >= 0; $i--) { $elems->item($i)->parentNode->removeChild($elems->item($i)); } // Clean out junk from the article content. @@ -494,7 +495,7 @@ class Readability $this->clean($articleContent, 'canvas'); $this->clean($articleContent, 'h1'); - /** + /* * If there is only one h2, they are probably using it as a main header, so remove it since we * already have a header. */ @@ -511,7 +512,7 @@ class Readability $this->cleanConditionally($articleContent, 'div'); // Remove extra paragraphs. $articleParagraphs = $articleContent->getElementsByTagName('p'); - for ($i = $articleParagraphs->length-1; $i >= 0; $i--) { + for ($i = $articleParagraphs->length - 1; $i >= 0; $i--) { $imgCount = $articleParagraphs->item($i)->getElementsByTagName('img')->length; $embedCount = $articleParagraphs->item($i)->getElementsByTagName('embed')->length; $objectCount = $articleParagraphs->item($i)->getElementsByTagName('object')->length; @@ -537,7 +538,7 @@ class Readability } unset($search, $replace); } catch (Exception $e) { - $this->dbg("Cleaning output HTML failed. Ignoring: " . $e->getMessage()); + $this->dbg('Cleaning output HTML failed. Ignoring: '.$e->getMessage()); } } } @@ -546,7 +547,6 @@ class Readability * className/id for special names to add to its score. * * @param Element - * @return void */ protected function initializeNode($node) { @@ -615,7 +615,7 @@ class Readability * * @return DOMElement */ - protected function grabArticle($page=null) + protected function grabArticle($page = null) { if (!$page) { $page = $this->dom; @@ -647,7 +647,7 @@ class Readability $nodeIndex--; $nodesToScore[] = $newNode; } catch (Exception $e) { - $this->dbg('Could not alter div/article to p, reverting back to div: ' . $e->getMessage()); + $this->dbg('Could not alter div/article to p, reverting back to div: '.$e->getMessage()); } } else { // Will change these P elements back to text nodes after processing. @@ -668,14 +668,14 @@ class Readability } } } - /** + /* * Loop through all paragraphs, and assign a score to them based on how content-y they look. * Then add their score to their parent node. * * A score is determined by things like number of commas, class names, etc. * Maybe eventually link density. */ - for ($pt=0, $scored = count($nodesToScore); $pt < $scored; $pt++) { + for ($pt = 0, $scored = count($nodesToScore); $pt < $scored; $pt++) { $parentNode = $nodesToScore[$pt]->parentNode; // No parent node? Move on... if (!$parentNode) { @@ -690,12 +690,12 @@ class Readability // Initialize readability data for the parent. if (!$parentNode->hasAttribute('readability')) { $this->initializeNode($parentNode); - $parentNode->setAttribute('data-candidate','true'); + $parentNode->setAttribute('data-candidate', 'true'); } // Initialize readability data for the grandparent. if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName)) { $this->initializeNode($grandParentNode); - $grandParentNode->setAttribute('data-candidate','true'); + $grandParentNode->setAttribute('data-candidate', 'true'); } // Add a point for the paragraph itself as a base. $contentScore = 1; @@ -704,7 +704,7 @@ class Readability // For every SCORE_CHARS_IN_PARAGRAPH (default:100) characters in this paragraph, add another point. Up to 3 points. $contentScore += min(floor(mb_strlen($innerText) / self::SCORE_CHARS_IN_PARAGRAPH), 3); // For every SCORE_WORDS_IN_PARAGRAPH (default:20) words in this paragraph, add another point. Up to 3 points. - $contentScore += min(floor($this->getWordCount($innerText)/ self::SCORE_WORDS_IN_PARAGRAPH), 3); + $contentScore += min(floor($this->getWordCount($innerText) / self::SCORE_WORDS_IN_PARAGRAPH), 3); /* TEST: For every positive/negative parent tag, add/substract half point. Up to 3 points. *\/ $up = $nodesToScore[$pt]; $score = 0; @@ -724,13 +724,13 @@ class Readability $grandParentNode->getAttributeNode('readability')->value += $contentScore / self::GRANDPARENT_SCORE_DIVISOR; } } - /** + /* * Node prepping: trash nodes that look cruddy (like ones with the class name "comment", etc). * This is faster to do before scoring but safer after. */ if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS) && $xpath) { $candidates = $xpath->query('.//*[(self::footer and count(//footer)<2) or (self::aside and count(//aside)<2)]', $page->documentElement); - for ($node = null, $c = $candidates->length-1; $c >= 0; $c--) { + for ($node = null, $c = $candidates->length - 1; $c >= 0; $c--) { $node = $candidates->item($c); // node should be readable but not inside of an article otherwise it's probably non-readable block if ($node->hasAttribute('readability') && (int) $node->getAttributeNode('readability')->value < 40 && ($node->parentNode ? strcasecmp($node->parentNode->tagName, 'article') !== 0 : true)) { @@ -739,24 +739,24 @@ class Readability } } $candidates = $xpath->query('.//*[not(self::body) and (@class or @id or @style) and ((number(@readability) < 40) or not(@readability))]', $page->documentElement); - for ($node = null, $c = $candidates->length-1; $c >= 0; $c--) { + for ($node = null, $c = $candidates->length - 1; $c >= 0; $c--) { $node = $candidates->item($c); $tagName = $node->tagName; /* Remove unlikely candidates */ - $unlikelyMatchString = $node->getAttribute('class')." ".$node->getAttribute('id')." ".$node->getAttribute('style'); + $unlikelyMatchString = $node->getAttribute('class').' '.$node->getAttribute('id').' '.$node->getAttribute('style'); //$this->dbg('Processing '.$node->getNodePath().' by "'. $unlikelyMatchString.'" with readability '.($node->hasAttribute('readability') ? (int)$node->getAttributeNode('readability')->value : 0)); if (mb_strlen($unlikelyMatchString) > 3 && // don't process "empty" strings preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) && !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) ) { - $this->dbg('Removing unlikely candidate '.$node->getNodePath().' by "'. $unlikelyMatchString.'" with readability '. ($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0)); + $this->dbg('Removing unlikely candidate '.$node->getNodePath().' by "'.$unlikelyMatchString.'" with readability '.($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0)); $node->parentNode->removeChild($node); $nodeIndex--; } } unset($candidates); } - /** + /* * After we've calculated scores, loop through all of the possible candidate nodes we found * and find the one with the highest score. */ @@ -764,7 +764,7 @@ class Readability if ($xpath) { // Using array of DOMElements after deletion is a path to DOOMElement. $candidates = $xpath->query('.//*[@data-candidate]', $page->documentElement); - for ($c = $candidates->length-1; $c >= 0; $c--) { + for ($c = $candidates->length - 1; $c >= 0; $c--) { // Scale the final candidates score based on link density. Good content should have a // relatively small link density (5% or less) and be mostly unaffected by this operation. // If not for this we would have used XPath to find maximum @readability. @@ -777,7 +777,7 @@ class Readability } unset($candidates); } - /** + /* * If we still have no top candidate, just use the body as a last resort. * We also have to copy the body node so it is something we can modify. */ @@ -812,7 +812,7 @@ class Readability } } $this->dbg('Top candidate: '.$topCandidate->getNodePath()); - /** + /* * Now that we have the top candidate, look through its siblings for content that might also be related. * Things like preambles, content split by ads that we removed, etc. */ @@ -828,7 +828,7 @@ class Readability $siblingNode = $siblingNodes->item($s); $siblingNodeName = $siblingNode->nodeName; $append = false; - $this->dbg('Looking at sibling node: ' . $siblingNode->getNodePath() . (($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : '')); + $this->dbg('Looking at sibling node: '.$siblingNode->getNodePath().(($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score '.$siblingNode->getAttribute('readability')) : '')); //$this->dbg('Sibling has score ' . ($siblingNode->readability ? siblingNode.readability.contentScore : 'Unknown')); if ($siblingNode->isSameNode($topCandidate)) { $append = true; @@ -852,18 +852,18 @@ class Readability } } if ($append) { - $this->dbg('Appending node: ' . $siblingNode->getNodePath()); + $this->dbg('Appending node: '.$siblingNode->getNodePath()); $nodeToAppend = null; if (strcasecmp($siblingNodeName, 'div') !== 0 && strcasecmp($siblingNodeName, 'p') !== 0) { /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */ - $this->dbg('Altering siblingNode ' . $siblingNodeName . ' to div.'); + $this->dbg('Altering siblingNode '.$siblingNodeName.' to div.'); $nodeToAppend = $this->dom->createElement('div'); try { $nodeToAppend->setAttribute('id', $siblingNode->getAttribute('id')); $nodeToAppend->setAttribute('alt', $siblingNodeName); $nodeToAppend->innerHTML = $siblingNode->innerHTML; } catch (Exception $e) { - $this->dbg('Could not alter siblingNode ' . $siblingNodeName . ' to div, reverting to original.'); + $this->dbg('Could not alter siblingNode '.$siblingNodeName.' to div, reverting to original.'); $nodeToAppend = $siblingNode; $s--; $sl--; @@ -884,7 +884,7 @@ class Readability unset($xpath); // So we have all of the content that we need. Now we clean it up for presentation. $this->prepArticle($articleContent); - /** + /* * Now that we've gone through the full algorithm, check to see if we got any meaningful content. * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher * likelihood of finding the content, and the sieve approach gives us a higher likelihood of @@ -897,17 +897,17 @@ class Readability $this->body->innerHTML = $this->bodyCache; if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) { $this->removeFlag(self::FLAG_STRIP_UNLIKELYS); - $this->dbg("...content is shorter than ".self::MIN_ARTICLE_LENGTH." letters, trying not to strip unlikely content.\n"); + $this->dbg('...content is shorter than '.self::MIN_ARTICLE_LENGTH." letters, trying not to strip unlikely content.\n"); return $this->grabArticle($this->body); } elseif ($this->flagIsActive(self::FLAG_WEIGHT_ATTRIBUTES)) { $this->removeFlag(self::FLAG_WEIGHT_ATTRIBUTES); - $this->dbg("...content is shorter than ".self::MIN_ARTICLE_LENGTH." letters, trying not to weight attributes.\n"); + $this->dbg('...content is shorter than '.self::MIN_ARTICLE_LENGTH." letters, trying not to weight attributes.\n"); return $this->grabArticle($this->body); } elseif ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) { $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY); - $this->dbg("...content is shorter than ".self::MIN_ARTICLE_LENGTH." letters, trying not to clean at all.\n"); + $this->dbg('...content is shorter than '.self::MIN_ARTICLE_LENGTH." letters, trying not to clean at all.\n"); return $this->grabArticle($this->body); } else { @@ -921,12 +921,13 @@ class Readability * Get the inner text of a node. * This also strips out any excess whitespace to be found. * - * @param DOMElement $e - * @param boolean $normalizeSpaces (default: true) - * @param boolean $flattenLines (default: false) + * @param DOMElement $e + * @param bool $normalizeSpaces (default: true) + * @param bool $flattenLines (default: false) + * * @return string */ - public function getInnerText($e, $normalizeSpaces=true, $flattenLines=false) + public function getInnerText($e, $normalizeSpaces = true, $flattenLines = false) { if (!isset($e->textContent) || $e->textContent === '') { return ''; @@ -943,8 +944,7 @@ class Readability /** * Remove the style attribute on every $e and under. * - * @param DOMElement $e - * @return void + * @param DOMElement $e */ public function cleanStyles($e) { @@ -959,7 +959,8 @@ class Readability /** * Get comma number for a given text. * - * @param string $text + * @param string $text + * * @return number (integer) */ public function getCommaCount($text) @@ -970,7 +971,8 @@ class Readability * Get words number for a given text if words separated by a space. * Input string should be normalized. * - * @param string $text + * @param string $text + * * @return number (integer) */ public function getWordCount($text) @@ -982,16 +984,17 @@ class Readability * This is the amount of text that is inside a link divided by the total text in the node. * Can exclude external references to differentiate between simple text and menus/infoblocks. * - * @param DOMElement $e - * @param string $excludeExternal - * @return number (float) + * @param DOMElement $e + * @param string $excludeExternal + * + * @return number (float) */ - public function getLinkDensity($e, $excludeExternal=false) + public function getLinkDensity($e, $excludeExternal = false) { $links = $e->getElementsByTagName('a'); $textLength = mb_strlen($this->getInnerText($e, true, true)); $linkLength = 0; - for ($dRe = $this->domainRegExp, $i=0, $il=$links->length; $i < $il; $i++) { + for ($dRe = $this->domainRegExp, $i = 0, $il = $links->length; $i < $il; $i++) { if ($excludeExternal && $dRe && !preg_match($dRe, $links->item($i)->getAttribute('href'))) { continue; } @@ -1007,9 +1010,10 @@ class Readability * Get an element weight by attribute. * Uses regular expressions to tell if this element looks good or bad. * - * @param DOMElement $element - * @param string $attribute - * @return number (Integer) + * @param DOMElement $element + * @param string $attribute + * + * @return number (Integer) */ protected function weightAttribute($element, $attribute) { @@ -1039,8 +1043,9 @@ class Readability /** * Get an element relative weight. * - * @param DOMElement $e - * @return number (Integer) + * @param DOMElement $e + * + * @return number (Integer) */ public function getWeight($e) { @@ -1058,8 +1063,7 @@ class Readability /** * Remove extraneous break tags from a node. * - * @param DOMElement $node - * @return void + * @param DOMElement $node */ public function killBreaks($node) { @@ -1069,19 +1073,18 @@ class Readability } /** * Clean a node of all elements of type "tag". - * (Unless it's a youtube/vimeo video. People love movies.) + * (Unless it's a youtube/vimeo video. People love movies.). * * Updated 2012-09-18 to preserve youtube/vimeo iframes * - * @param DOMElement $e - * @param string $tag - * @return void + * @param DOMElement $e + * @param string $tag */ public function clean($e, $tag) { $targetList = $e->getElementsByTagName($tag); $isEmbed = ($tag === 'audio' || $tag === 'video' || $tag === 'iframe' || $tag === 'object' || $tag === 'embed'); - for ($cur_item = null, $y = $targetList->length-1; $y >= 0; $y--) { + for ($cur_item = null, $y = $targetList->length - 1; $y >= 0; $y--) { /* Allow youtube and vimeo videos through as people usually want to see those. */ $cur_item = $targetList->item($y); if ($isEmbed) { @@ -1103,9 +1106,8 @@ class Readability * "Fishy" is an algorithm based on content length, classnames, * link density, number of images & embeds, etc. * - * @param DOMElement $e - * @param string $tag - * @return void + * @param DOMElement $e + * @param string $tag */ public function cleanConditionally($e, $tag) { @@ -1114,7 +1116,7 @@ class Readability } $tagsList = $e->getElementsByTagName($tag); $curTagsLength = $tagsList->length; - /** + /* * Gather counts for other typical elements embedded within. * Traverse backwards so we can remove nodes at the same time without effecting the traversal. * @@ -1125,29 +1127,29 @@ class Readability //$class = $node->getAttribute('class').' '.$node->getAttribute('id'); //debug $weight = $this->getWeight($node); $contentScore = ($node->hasAttribute('readability')) ? (int) $node->getAttribute('readability') : 0; - $this->dbg('Start conditional cleaning of ' . $node->getNodePath() . ' (class=' . $node->getAttribute('class') . '; id=' . $node->getAttribute('id') . ')' . (($node->hasAttribute('readability')) ? (' with score ' . $node->getAttribute('readability')) : '')); + $this->dbg('Start conditional cleaning of '.$node->getNodePath().' (class='.$node->getAttribute('class').'; id='.$node->getAttribute('id').')'.(($node->hasAttribute('readability')) ? (' with score '.$node->getAttribute('readability')) : '')); if ($weight + $contentScore < 0) { $this->dbg('Removing...'); $node->parentNode->removeChild($node); } elseif ($this->getCommaCount($this->getInnerText($node)) < self::MIN_COMMAS_IN_PARAGRAPH) { - /** + /* * If there are not very many commas, and the number of * non-paragraph elements is more than paragraphs or other ominous signs, remove the element. */ $p = $node->getElementsByTagName('p')->length; $img = $node->getElementsByTagName('img')->length; - $li = $node->getElementsByTagName('li')->length-100; + $li = $node->getElementsByTagName('li')->length - 100; $input = $node->getElementsByTagName('input')->length; $a = $node->getElementsByTagName('a')->length; $embedCount = 0; $embeds = $node->getElementsByTagName('embed'); - for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) { + for ($ei = 0, $il = $embeds->length; $ei < $il; $ei++) { if (preg_match($this->regexps['media'], $embeds->item($ei)->getAttribute('src'))) { $embedCount++; } } $embeds = $node->getElementsByTagName('iframe'); - for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) { + for ($ei = 0, $il = $embeds->length; $ei < $il; $ei++) { if (preg_match($this->regexps['media'], $embeds->item($ei)->getAttribute('src'))) { $embedCount++; } @@ -1159,17 +1161,17 @@ class Readability if ($li > $p && $tag != 'ul' && $tag != 'ol') { $this->dbg(' too many

  • elements, and parent is not
      or
        '); $toRemove = true; - } elseif ( $input > floor($p/3) ) { + } elseif ($input > floor($p / 3)) { $this->dbg(' too many elements'); $toRemove = true; } elseif ($contentLength < 6 && ($embedCount === 0 && ($img === 0 || $img > 2))) { $this->dbg(' content length less than 6 chars, 0 embeds and either 0 images or more than 2 images'); $toRemove = true; } elseif ($weight < 25 && $linkDensity > 0.25) { - $this->dbg(' weight is '.$weight.' < 25 and link density is '.sprintf("%.2f", $linkDensity).' > 0.25'); + $this->dbg(' weight is '.$weight.' < 25 and link density is '.sprintf('%.2f', $linkDensity).' > 0.25'); $toRemove = true; } elseif ($a > 2 && ($weight >= 25 && $linkDensity > 0.5)) { - $this->dbg(' more than 2 links and weight is '.$weight.' > 25 but link density is '.sprintf("%.2f", $linkDensity).' > 0.5'); + $this->dbg(' more than 2 links and weight is '.$weight.' > 25 but link density is '.sprintf('%.2f', $linkDensity).' > 0.5'); $toRemove = true; } elseif ($embedCount > 3) { $this->dbg(' more than 3 embeds'); @@ -1182,17 +1184,17 @@ class Readability } elseif ($li > $p && $tag != 'ul' && $tag != 'ol') { $this->dbg(' too many
      1. elements, and parent is not
          or
            '); $toRemove = true; - } elseif ( $input > floor($p/3) ) { + } elseif ($input > floor($p / 3)) { $this->dbg(' too many elements'); $toRemove = true; - } elseif ($contentLength < 25 && ($img === 0 || $img > 2) ) { + } elseif ($contentLength < 25 && ($img === 0 || $img > 2)) { $this->dbg(' content length less than 25 chars and 0 images, or more than 2 images'); $toRemove = true; } elseif ($weight < 25 && $linkDensity > 0.2) { - $this->dbg(' weight is '.$weight.' lower than 0 and link density is '.sprintf("%.2f", $linkDensity).' > 0.2'); + $this->dbg(' weight is '.$weight.' lower than 0 and link density is '.sprintf('%.2f', $linkDensity).' > 0.2'); $toRemove = true; } elseif ($weight >= 25 && $linkDensity > 0.5) { - $this->dbg(' weight above 25 but link density is '.sprintf("%.2f", $linkDensity).' > 0.5'); + $this->dbg(' weight above 25 but link density is '.sprintf('%.2f', $linkDensity).' > 0.5'); $toRemove = true; } elseif (($embedCount == 1 && $contentLength < 75) || $embedCount > 1) { $this->dbg(' 1 embed and content length smaller than 75 chars, or more than one embed'); @@ -1210,14 +1212,13 @@ class Readability /** * Clean out spurious headers from an Element. Checks things like classnames and link density. * - * @param DOMElement $e - * @return void + * @param DOMElement $e */ public function cleanHeaders($e) { for ($headerIndex = 1; $headerIndex < 3; $headerIndex++) { - $headers = $e->getElementsByTagName('h' . $headerIndex); - for ($i=$headers->length-1; $i >=0; $i--) { + $headers = $e->getElementsByTagName('h'.$headerIndex); + for ($i = $headers->length - 1; $i >= 0; $i--) { if ($this->getWeight($headers->item($i)) < 0 || $this->getLinkDensity($headers->item($i)) > 0.33) { $headers->item($i)->parentNode->removeChild($headers->item($i)); } diff --git a/tests/ReadabilityTest.php b/tests/ReadabilityTest.php index 8107e63..52eddb0 100644 --- a/tests/ReadabilityTest.php +++ b/tests/ReadabilityTest.php @@ -3,7 +3,6 @@ namespace Tests\Readability; use Readability\Readability; -use Readability\JSLikeHTMLElement; class ReadabilityTested extends Readability {