Improve Travis & add Scrutinizer

+ CS
+ Update README
pull/3/head
Jeremy Benoist 11 years ago
parent f5d473780d
commit 1963319a55
  1. 3
      .scrutinizer.yml
  2. 23
      .travis.yml
  3. 1
      README.md
  4. 13
      src/JSLikeHTMLElement.php
  5. 237
      src/Readability.php
  6. 1
      tests/ReadabilityTest.php

@ -0,0 +1,3 @@
tools:
external_code_coverage:
timeout: 600

@ -4,10 +4,29 @@ php:
- 5.4 - 5.4
- 5.5 - 5.5
- 5.6 - 5.6
- nightly
- hhvm-nightly
before_script: # run build against nightly but allow them to fail
matrix:
fast_finish: true
allow_failures:
- php: nightly
- php: hhvm-nightly
# faster builds on new travis setup not using sudo
sudo: false
install:
- composer self-update - composer self-update
before_script:
- composer install --prefer-dist --no-interaction - composer install --prefer-dist --no-interaction
script: script:
- phpunit --coverage-text - phpunit --coverage-clover=coverage.clover
after_script:
- |
wget https://scrutinizer-ci.com/ocular.phar
php ocular.phar code-coverage:upload --format=php-clover coverage.clover

@ -1,6 +1,7 @@
# Readability # Readability
[![Build Status](https://travis-ci.org/j0k3r/php-readability.svg?branch=master)](https://travis-ci.org/j0k3r/php-readability) [![Build Status](https://travis-ci.org/j0k3r/php-readability.svg?branch=master)](https://travis-ci.org/j0k3r/php-readability)
[![Code Coverage](https://scrutinizer-ci.com/g/j0k3r/php-readability/badges/coverage.png?b=master)](https://scrutinizer-ci.com/g/j0k3r/php-readability/?branch=master)
This is an extract of the Readability class from the [full-text-rss](https://github.com/Dither/full-text-rss) fork. It can be defined as a better version of the original [php-readability](http://code.fivefilters.org/php-readability). This is an extract of the Readability class from the [full-text-rss](https://github.com/Dither/full-text-rss) fork. It can be defined as a better version of the original [php-readability](http://code.fivefilters.org/php-readability).

@ -3,7 +3,7 @@
namespace Readability; namespace Readability;
/** /**
* JavaScript-like HTML DOM Element * JavaScript-like HTML DOM Element.
* *
* This class extends PHP's DOMElement to allow * This class extends PHP's DOMElement to allow
* users to get and set the innerHTML property of * users to get and set the innerHTML property of
@ -31,12 +31,14 @@ namespace Readability;
* echo $doc->saveXML(); * echo $doc->saveXML();
* *
* @author Keyvan Minoukadeh - http://www.keyvan.net - keyvan@keyvan.net * @author Keyvan Minoukadeh - http://www.keyvan.net - keyvan@keyvan.net
*
* @see http://fivefilters.org (the project this was written for) * @see http://fivefilters.org (the project this was written for)
*/ */
class JSLikeHTMLElement extends \DOMElement class JSLikeHTMLElement extends \DOMElement
{ {
/** /**
* Used for setting innerHTML like it's done in JavaScript: * Used for setting innerHTML like it's done in JavaScript:.
*
* @code * @code
* $div->innerHTML = '<h2>Chapter 2</h2><p>The story begins...</p>'; * $div->innerHTML = '<h2>Chapter 2</h2><p>The story begins...</p>';
* @endcode * @endcode
@ -45,7 +47,7 @@ class JSLikeHTMLElement extends \DOMElement
{ {
if ($name == 'innerHTML') { if ($name == 'innerHTML') {
// first, empty the element // first, empty the element
for ($x=$this->childNodes->length-1; $x>=0; $x--) { for ($x = $this->childNodes->length - 1; $x >= 0; $x--) {
$this->removeChild($this->childNodes->item($x)); $this->removeChild($this->childNodes->item($x));
} }
// $value holds our new inner HTML // $value holds our new inner HTML
@ -86,7 +88,8 @@ class JSLikeHTMLElement extends \DOMElement
} }
/** /**
* Used for getting innerHTML like it's done in JavaScript: * Used for getting innerHTML like it's done in JavaScript:.
*
* @code * @code
* $string = $div->innerHTML; * $string = $div->innerHTML;
* @endcode * @endcode
@ -105,7 +108,7 @@ class JSLikeHTMLElement extends \DOMElement
$trace = debug_backtrace(); $trace = debug_backtrace();
trigger_error('Undefined property via __get(): '.$name.' in '.$trace[0]['file'].' on line '.$trace[0]['line'], E_USER_NOTICE); trigger_error('Undefined property via __get(): '.$name.' in '.$trace[0]['file'].' on line '.$trace[0]['line'], E_USER_NOTICE);
return null; return;
} }
public function __toString() public function __toString()

@ -14,7 +14,7 @@ namespace Readability;
* More information: http://fivefilters.org/content-only/ * More information: http://fivefilters.org/content-only/
* License: Apache License, Version 2.0 * License: Apache License, Version 2.0
* Requires: PHP version 5.2.0+ * Requires: PHP version 5.2.0+
* Date: 2013-08-02 * Date: 2013-08-02.
* *
* Differences between the PHP port and the original * Differences between the PHP port and the original
* ------------------------------------------------------ * ------------------------------------------------------
@ -76,7 +76,7 @@ class Readability
'divToPElements' => '/<(?:blockquote|code|div|article|footer|aside|img|p|pre|dl|ol|ul)/mi', 'divToPElements' => '/<(?:blockquote|code|div|article|footer|aside|img|p|pre|dl|ol|ul)/mi',
'killBreaks' => '/(<br\s*\/?>([ \r\n\s]|&nbsp;?)*)+/', 'killBreaks' => '/(<br\s*\/?>([ \r\n\s]|&nbsp;?)*)+/',
'media' => '!//(?:[^\.\?/]+\.)?(?:youtu(?:be)?|soundcloud|dailymotion|vimeo|pornhub|xvideos|twitvid|rutube|viddler)\.(?:com|be|org|net)/!i', 'media' => '!//(?:[^\.\?/]+\.)?(?:youtu(?:be)?|soundcloud|dailymotion|vimeo|pornhub|xvideos|twitvid|rutube|viddler)\.(?:com|be|org|net)/!i',
'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i' 'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i',
); );
public $tidy_config = array( public $tidy_config = array(
'tidy-mark' => false, 'tidy-mark' => false,
@ -101,7 +101,7 @@ class Readability
// 'merge-spans' => true, // 'merge-spans' => true,
'input-encoding' => '????', 'input-encoding' => '????',
'output-encoding' => 'utf8', 'output-encoding' => 'utf8',
'hide-comments' => true 'hide-comments' => true,
); );
// raw HTML filters // raw HTML filters
protected $pre_filters = array( protected $pre_filters = array(
@ -111,7 +111,7 @@ class Readability
'!<font[^>]*>\s*\[AD\]\s*</font>!is' => '', // HACK: firewall-filtered content '!<font[^>]*>\s*\[AD\]\s*</font>!is' => '', // HACK: firewall-filtered content
'!(<br[^>]*>[ \r\n\s]*){2,}!i' => '</p><p>', // HACK: replace linebreaks plus br's with p's '!(<br[^>]*>[ \r\n\s]*){2,}!i' => '</p><p>', // HACK: replace linebreaks plus br's with p's
//'!</?noscript>!is' => '', // replace noscripts //'!</?noscript>!is' => '', // replace noscripts
'!<(/?)font[^>]*>!is' => '<\\1span>' // replace fonts to spans '!<(/?)font[^>]*>!is' => '<\\1span>', // replace fonts to spans
); );
// output HTML filters // output HTML filters
protected $post_filters = array( protected $post_filters = array(
@ -121,7 +121,7 @@ class Readability
"/\n+/" => "\n", //single newlines cleanup "/\n+/" => "\n", //single newlines cleanup
'!<pre[^>]*>\s*<code!is' => '<pre', // modern web... '!<pre[^>]*>\s*<code!is' => '<pre', // modern web...
'!</code>\s*</pre>!is' => '</pre>', '!</code>\s*</pre>!is' => '</pre>',
'!<[hb]r>!is' => '<\\1 />' '!<[hb]r>!is' => '<\\1 />',
); );
// flags // flags
const FLAG_STRIP_UNLIKELYS = 1; const FLAG_STRIP_UNLIKELYS = 1;
@ -139,13 +139,14 @@ class Readability
const MIN_NODE_LENGTH = 80; const MIN_NODE_LENGTH = 80;
const MAX_LINK_DENSITY = 0.25; const MAX_LINK_DENSITY = 0.25;
/** /**
* Create instance of Readability * Create instance of Readability.
*
* @param string UTF-8 encoded string * @param string UTF-8 encoded string
* @param string (optional) URL associated with HTML (for footnotes) * @param string (optional) URL associated with HTML (for footnotes)
* @param string (optional) Which parser to use for turning raw HTML into a DOMDocument * @param string (optional) Which parser to use for turning raw HTML into a DOMDocument
* @param boolean (optional) Use tidy * @param bool (optional) Use tidy
*/ */
public function __construct($html, $url=null, $parser='libxml', $use_tidy=true) public function __construct($html, $url = null, $parser = 'libxml', $use_tidy = true)
{ {
$this->url = $url; $this->url = $url;
$this->debugText = 'Parsing URL: '.$url."\n"; $this->debugText = 'Parsing URL: '.$url."\n";
@ -154,9 +155,9 @@ class Readability
$this->domainRegExp = '/'.strtr(preg_replace('/www\d*\./', '', parse_url($url, PHP_URL_HOST)), array('.' => '\.')).'/'; $this->domainRegExp = '/'.strtr(preg_replace('/www\d*\./', '', parse_url($url, PHP_URL_HOST)), array('.' => '\.')).'/';
} }
mb_internal_encoding("UTF-8"); mb_internal_encoding('UTF-8');
mb_http_output("UTF-8"); mb_http_output('UTF-8');
mb_regex_encoding("UTF-8"); mb_regex_encoding('UTF-8');
// HACK: dirty cleanup to replace some stuff; shouldn't use regexps with HTML but well... // HACK: dirty cleanup to replace some stuff; shouldn't use regexps with HTML but well...
if (!$this->flagIsActive(self::FLAG_DISABLE_PREFILTER)) { if (!$this->flagIsActive(self::FLAG_DISABLE_PREFILTER)) {
@ -170,7 +171,7 @@ class Readability
$html = '<html></html>'; $html = '<html></html>';
} }
/** /*
* Use tidy (if it exists). * Use tidy (if it exists).
* This fixes problems with some sites which would otherwise trouble DOMDocument's HTML parsing. * This fixes problems with some sites which would otherwise trouble DOMDocument's HTML parsing.
* Although sometimes it makes matters worse, which is why there is an option to disable it. * Although sometimes it makes matters worse, which is why there is an option to disable it.
@ -188,9 +189,9 @@ class Readability
} }
unset($tidy); unset($tidy);
} }
$html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"); $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8');
if (!($parser=='html5lib' && ($this->dom = \HTML5_Parser::parse($html)))) { if (!($parser == 'html5lib' && ($this->dom = \HTML5_Parser::parse($html)))) {
libxml_use_internal_errors(true); libxml_use_internal_errors(true);
$this->dom = new \DOMDocument(); $this->dom = new \DOMDocument();
$this->dom->preserveWhiteSpace = false; $this->dom->preserveWhiteSpace = false;
@ -201,7 +202,8 @@ class Readability
$this->dom->registerNodeClass('DOMElement', 'Readability\JSLikeHTMLElement'); $this->dom->registerNodeClass('DOMElement', 'Readability\JSLikeHTMLElement');
} }
/** /**
* Get article title element * Get article title element.
*
* @return DOMElement * @return DOMElement
*/ */
public function getTitle() public function getTitle()
@ -209,7 +211,8 @@ class Readability
return $this->articleTitle; return $this->articleTitle;
} }
/** /**
* Get article content element * Get article content element.
*
* @return DOMElement * @return DOMElement
*/ */
public function getContent() public function getContent()
@ -217,20 +220,22 @@ class Readability
return $this->articleContent; return $this->articleContent;
} }
/** /**
* Add pre filter for raw input HTML processing * Add pre filter for raw input HTML processing.
*
* @param string RegExp for replace * @param string RegExp for replace
* @param string (optional) Replacer * @param string (optional) Replacer
*/ */
public function addPreFilter($filter, $replacer='') public function addPreFilter($filter, $replacer = '')
{ {
$this->pre_filters[$filter] = $replacer; $this->pre_filters[$filter] = $replacer;
} }
/** /**
* Add post filter for raw output HTML processing * Add post filter for raw output HTML processing.
*
* @param string RegExp for replace * @param string RegExp for replace
* @param string (optional) Replacer * @param string (optional) Replacer
*/ */
public function addPostFilter($filter, $replacer='') public function addPostFilter($filter, $replacer = '')
{ {
$this->post_filters[$filter] = $replacer; $this->post_filters[$filter] = $replacer;
} }
@ -244,7 +249,7 @@ class Readability
* 4. Replace the current DOM tree with the new one. * 4. Replace the current DOM tree with the new one.
* 5. Read peacefully. * 5. Read peacefully.
* *
* @return boolean true if we found content, false otherwise * @return bool true if we found content, false otherwise
*/ */
public function init() public function init()
{ {
@ -296,7 +301,7 @@ class Readability
return $this->success; return $this->success;
} }
/** /**
* Debug * Debug.
*/ */
protected function dbg($msg) //, $error=false) protected function dbg($msg) //, $error=false)
{ {
@ -306,12 +311,12 @@ class Readability
} }
/** /**
* Dump debug info * Dump debug info.
*/ */
protected function dump_dbg() protected function dump_dbg()
{ {
if ($this->debug) { if ($this->debug) {
openlog("Readability PHP ", LOG_PID | LOG_PERROR, 0); openlog('Readability PHP ', LOG_PID | LOG_PERROR, 0);
syslog(6, $this->debugText); // 1 - error 6 - info syslog(6, $this->debugText); // 1 - error 6 - info
} }
} }
@ -319,7 +324,6 @@ class Readability
* Run any post-process modifications to article content as necessary. * Run any post-process modifications to article content as necessary.
* *
* @param DOMElement * @param DOMElement
* @return void
*/ */
public function postProcessContent($articleContent) public function postProcessContent($articleContent)
{ {
@ -338,7 +342,8 @@ class Readability
$origTitle = ''; $origTitle = '';
try { try {
$curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0)); $curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0));
} catch (Exception $e) {} } catch (Exception $e) {
}
if (preg_match('/ [\|\-] /', $curTitle)) { if (preg_match('/ [\|\-] /', $curTitle)) {
$curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle); $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle);
if (count(explode(' ', $curTitle)) < 3) { if (count(explode(' ', $curTitle)) < 3) {
@ -347,7 +352,7 @@ class Readability
} elseif (strpos($curTitle, ': ') !== false) { } elseif (strpos($curTitle, ': ') !== false) {
$curTitle = preg_replace('/.*:(.*)/i', '$1', $origTitle); $curTitle = preg_replace('/.*:(.*)/i', '$1', $origTitle);
if (count(explode(' ', $curTitle)) < 3) { if (count(explode(' ', $curTitle)) < 3) {
$curTitle = preg_replace('/[^:]*[:](.*)/i','$1', $origTitle); $curTitle = preg_replace('/[^:]*[:](.*)/i', '$1', $origTitle);
} }
} elseif (mb_strlen($curTitle) > 150 || mb_strlen($curTitle) < 15) { } elseif (mb_strlen($curTitle) > 150 || mb_strlen($curTitle) < 15) {
$hOnes = $this->dom->getElementsByTagName('h1'); $hOnes = $this->dom->getElementsByTagName('h1');
@ -367,12 +372,10 @@ class Readability
/** /**
* Prepare the HTML document for readability to scrape it. * Prepare the HTML document for readability to scrape it.
* This includes things like stripping javascript, CSS, and handling terrible markup. * This includes things like stripping javascript, CSS, and handling terrible markup.
*
* @return void
*/ */
protected function prepDocument() protected function prepDocument()
{ {
/** /*
* In some cases a body element can't be found (if the HTML is totally hosed for example) * In some cases a body element can't be found (if the HTML is totally hosed for example)
* so we create a new body node and append it to the document. * so we create a new body node and append it to the document.
*/ */
@ -383,19 +386,18 @@ class Readability
$this->body->setAttribute('id', 'readabilityBody'); $this->body->setAttribute('id', 'readabilityBody');
// Remove all style tags in head. // Remove all style tags in head.
$styleTags = $this->dom->getElementsByTagName('style'); $styleTags = $this->dom->getElementsByTagName('style');
for ($i = $styleTags->length-1; $i >= 0; $i--) { for ($i = $styleTags->length - 1; $i >= 0; $i--) {
$styleTags->item($i)->parentNode->removeChild($styleTags->item($i)); $styleTags->item($i)->parentNode->removeChild($styleTags->item($i));
} }
$linkTags = $this->dom->getElementsByTagName('link'); $linkTags = $this->dom->getElementsByTagName('link');
for ($i = $linkTags->length-1; $i >= 0; $i--) { for ($i = $linkTags->length - 1; $i >= 0; $i--) {
$linkTags->item($i)->parentNode->removeChild($linkTags->item($i)); $linkTags->item($i)->parentNode->removeChild($linkTags->item($i));
} }
} }
/** /**
* For easier reading, convert this document to have footnotes at the bottom rather than inline links. * For easier reading, convert this document to have footnotes at the bottom rather than inline links.
* @see http://www.roughtype.com/archives/2010/05/experiments_in.php
* *
* @return void * @see http://www.roughtype.com/archives/2010/05/experiments_in.php
*/ */
public function addFootnotes($articleContent) public function addFootnotes($articleContent)
{ {
@ -422,8 +424,8 @@ class Readability
} }
$linkCount++; $linkCount++;
// Add a superscript reference after the article link. // Add a superscript reference after the article link.
$refLink->setAttribute('href', '#readabilityFootnoteLink-' . $linkCount); $refLink->setAttribute('href', '#readabilityFootnoteLink-'.$linkCount);
$refLink->innerHTML = '<small><sup>[' . $linkCount . ']</sup></small>'; $refLink->innerHTML = '<small><sup>['.$linkCount.']</sup></small>';
$refLink->setAttribute('class', 'readability-DoNotFootnote'); $refLink->setAttribute('class', 'readability-DoNotFootnote');
$refLink->setAttribute('style', 'color: inherit;'); $refLink->setAttribute('style', 'color: inherit;');
if ($articleLink->parentNode->lastChild->isSameNode($articleLink)) { if ($articleLink->parentNode->lastChild->isSameNode($articleLink)) {
@ -432,13 +434,13 @@ class Readability
$articleLink->parentNode->insertBefore($refLink, $articleLink->nextSibling); $articleLink->parentNode->insertBefore($refLink, $articleLink->nextSibling);
} }
$articleLink->setAttribute('style', 'color: inherit; text-decoration: none;'); $articleLink->setAttribute('style', 'color: inherit; text-decoration: none;');
$articleLink->setAttribute('name', 'readabilityLink-' . $linkCount); $articleLink->setAttribute('name', 'readabilityLink-'.$linkCount);
$footnote->innerHTML = '<small><sup><a href="#readabilityLink-' . $linkCount . '" title="Jump to Link in Article">^</a></sup></small> '; $footnote->innerHTML = '<small><sup><a href="#readabilityLink-'.$linkCount.'" title="Jump to Link in Article">^</a></sup></small> ';
$footnoteLink->innerHTML = ($footnoteLink->getAttribute('title') != '' ? $footnoteLink->getAttribute('title') : $linkText); $footnoteLink->innerHTML = ($footnoteLink->getAttribute('title') != '' ? $footnoteLink->getAttribute('title') : $linkText);
$footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount); $footnoteLink->setAttribute('name', 'readabilityFootnoteLink-'.$linkCount);
$footnote->appendChild($footnoteLink); $footnote->appendChild($footnoteLink);
if ($linkDomain) { if ($linkDomain) {
$footnote->innerHTML = $footnote->innerHTML . '<small> (' . $linkDomain . ')</small>'; $footnote->innerHTML = $footnote->innerHTML.'<small> ('.$linkDomain.')</small>';
} }
$articleFootnotes->appendChild($footnote); $articleFootnotes->appendChild($footnote);
} }
@ -451,7 +453,6 @@ class Readability
* iframes, forms, strip extraneous <p> tags, etc. * iframes, forms, strip extraneous <p> tags, etc.
* *
* @param DOMElement * @param DOMElement
* @return void
*/ */
public function prepArticle($articleContent) public function prepArticle($articleContent)
{ {
@ -464,25 +465,25 @@ class Readability
$this->killBreaks($articleContent); $this->killBreaks($articleContent);
$xpath = new \DOMXPath($articleContent->ownerDocument); $xpath = new \DOMXPath($articleContent->ownerDocument);
if ($this->revertForcedParagraphElements) { if ($this->revertForcedParagraphElements) {
/** /*
* Reverts P elements with class 'readability-styled' to text nodes: * Reverts P elements with class 'readability-styled' to text nodes:
* which is what they were before. * which is what they were before.
*/ */
$elems = $xpath->query('.//p[@data-readability-styled]', $articleContent); $elems = $xpath->query('.//p[@data-readability-styled]', $articleContent);
for ($i = $elems->length-1; $i >= 0; $i--) { for ($i = $elems->length - 1; $i >= 0; $i--) {
$e = $elems->item($i); $e = $elems->item($i);
$e->parentNode->replaceChild($articleContent->ownerDocument->createTextNode($e->textContent), $e); $e->parentNode->replaceChild($articleContent->ownerDocument->createTextNode($e->textContent), $e);
} }
} }
// Remove service data-candidate attribute. // Remove service data-candidate attribute.
$elems = $xpath->query('.//*[@data-candidate]', $articleContent); $elems = $xpath->query('.//*[@data-candidate]', $articleContent);
for ($i = $elems->length-1; $i >= 0; $i--) { for ($i = $elems->length - 1; $i >= 0; $i--) {
$elems->item($i)->removeAttribute('data-candidate'); $elems->item($i)->removeAttribute('data-candidate');
} }
// Remove unrelated links and other unneeded stuff. // Remove unrelated links and other unneeded stuff.
// (not(*) and not(text()[normalize-space()])) or // What's wrong here? // (not(*) and not(text()[normalize-space()])) or // What's wrong here?
$elems = $xpath->query('.//a[@rel="nofollow"]', $articleContent); $elems = $xpath->query('.//a[@rel="nofollow"]', $articleContent);
for ($i = $elems->length-1; $i >= 0; $i--) { for ($i = $elems->length - 1; $i >= 0; $i--) {
$elems->item($i)->parentNode->removeChild($elems->item($i)); $elems->item($i)->parentNode->removeChild($elems->item($i));
} }
// Clean out junk from the article content. // Clean out junk from the article content.
@ -494,7 +495,7 @@ class Readability
$this->clean($articleContent, 'canvas'); $this->clean($articleContent, 'canvas');
$this->clean($articleContent, 'h1'); $this->clean($articleContent, 'h1');
/** /*
* If there is only one h2, they are probably using it as a main header, so remove it since we * If there is only one h2, they are probably using it as a main header, so remove it since we
* already have a header. * already have a header.
*/ */
@ -511,7 +512,7 @@ class Readability
$this->cleanConditionally($articleContent, 'div'); $this->cleanConditionally($articleContent, 'div');
// Remove extra paragraphs. // Remove extra paragraphs.
$articleParagraphs = $articleContent->getElementsByTagName('p'); $articleParagraphs = $articleContent->getElementsByTagName('p');
for ($i = $articleParagraphs->length-1; $i >= 0; $i--) { for ($i = $articleParagraphs->length - 1; $i >= 0; $i--) {
$imgCount = $articleParagraphs->item($i)->getElementsByTagName('img')->length; $imgCount = $articleParagraphs->item($i)->getElementsByTagName('img')->length;
$embedCount = $articleParagraphs->item($i)->getElementsByTagName('embed')->length; $embedCount = $articleParagraphs->item($i)->getElementsByTagName('embed')->length;
$objectCount = $articleParagraphs->item($i)->getElementsByTagName('object')->length; $objectCount = $articleParagraphs->item($i)->getElementsByTagName('object')->length;
@ -537,7 +538,7 @@ class Readability
} }
unset($search, $replace); unset($search, $replace);
} catch (Exception $e) { } catch (Exception $e) {
$this->dbg("Cleaning output HTML failed. Ignoring: " . $e->getMessage()); $this->dbg('Cleaning output HTML failed. Ignoring: '.$e->getMessage());
} }
} }
} }
@ -546,7 +547,6 @@ class Readability
* className/id for special names to add to its score. * className/id for special names to add to its score.
* *
* @param Element * @param Element
* @return void
*/ */
protected function initializeNode($node) protected function initializeNode($node)
{ {
@ -615,7 +615,7 @@ class Readability
* *
* @return DOMElement * @return DOMElement
*/ */
protected function grabArticle($page=null) protected function grabArticle($page = null)
{ {
if (!$page) { if (!$page) {
$page = $this->dom; $page = $this->dom;
@ -647,7 +647,7 @@ class Readability
$nodeIndex--; $nodeIndex--;
$nodesToScore[] = $newNode; $nodesToScore[] = $newNode;
} catch (Exception $e) { } catch (Exception $e) {
$this->dbg('Could not alter div/article to p, reverting back to div: ' . $e->getMessage()); $this->dbg('Could not alter div/article to p, reverting back to div: '.$e->getMessage());
} }
} else { } else {
// Will change these P elements back to text nodes after processing. // Will change these P elements back to text nodes after processing.
@ -668,14 +668,14 @@ class Readability
} }
} }
} }
/** /*
* Loop through all paragraphs, and assign a score to them based on how content-y they look. * Loop through all paragraphs, and assign a score to them based on how content-y they look.
* Then add their score to their parent node. * Then add their score to their parent node.
* *
* A score is determined by things like number of commas, class names, etc. * A score is determined by things like number of commas, class names, etc.
* Maybe eventually link density. * Maybe eventually link density.
*/ */
for ($pt=0, $scored = count($nodesToScore); $pt < $scored; $pt++) { for ($pt = 0, $scored = count($nodesToScore); $pt < $scored; $pt++) {
$parentNode = $nodesToScore[$pt]->parentNode; $parentNode = $nodesToScore[$pt]->parentNode;
// No parent node? Move on... // No parent node? Move on...
if (!$parentNode) { if (!$parentNode) {
@ -690,12 +690,12 @@ class Readability
// Initialize readability data for the parent. // Initialize readability data for the parent.
if (!$parentNode->hasAttribute('readability')) { if (!$parentNode->hasAttribute('readability')) {
$this->initializeNode($parentNode); $this->initializeNode($parentNode);
$parentNode->setAttribute('data-candidate','true'); $parentNode->setAttribute('data-candidate', 'true');
} }
// Initialize readability data for the grandparent. // Initialize readability data for the grandparent.
if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName)) { if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName)) {
$this->initializeNode($grandParentNode); $this->initializeNode($grandParentNode);
$grandParentNode->setAttribute('data-candidate','true'); $grandParentNode->setAttribute('data-candidate', 'true');
} }
// Add a point for the paragraph itself as a base. // Add a point for the paragraph itself as a base.
$contentScore = 1; $contentScore = 1;
@ -704,7 +704,7 @@ class Readability
// For every SCORE_CHARS_IN_PARAGRAPH (default:100) characters in this paragraph, add another point. Up to 3 points. // For every SCORE_CHARS_IN_PARAGRAPH (default:100) characters in this paragraph, add another point. Up to 3 points.
$contentScore += min(floor(mb_strlen($innerText) / self::SCORE_CHARS_IN_PARAGRAPH), 3); $contentScore += min(floor(mb_strlen($innerText) / self::SCORE_CHARS_IN_PARAGRAPH), 3);
// For every SCORE_WORDS_IN_PARAGRAPH (default:20) words in this paragraph, add another point. Up to 3 points. // For every SCORE_WORDS_IN_PARAGRAPH (default:20) words in this paragraph, add another point. Up to 3 points.
$contentScore += min(floor($this->getWordCount($innerText)/ self::SCORE_WORDS_IN_PARAGRAPH), 3); $contentScore += min(floor($this->getWordCount($innerText) / self::SCORE_WORDS_IN_PARAGRAPH), 3);
/* TEST: For every positive/negative parent tag, add/subtract half point. Up to 3 points. *\/ /* TEST: For every positive/negative parent tag, add/subtract half point. Up to 3 points. *\/
$up = $nodesToScore[$pt]; $up = $nodesToScore[$pt];
$score = 0; $score = 0;
@ -724,13 +724,13 @@ class Readability
$grandParentNode->getAttributeNode('readability')->value += $contentScore / self::GRANDPARENT_SCORE_DIVISOR; $grandParentNode->getAttributeNode('readability')->value += $contentScore / self::GRANDPARENT_SCORE_DIVISOR;
} }
} }
/** /*
* Node prepping: trash nodes that look cruddy (like ones with the class name "comment", etc). * Node prepping: trash nodes that look cruddy (like ones with the class name "comment", etc).
* This is faster to do before scoring but safer after. * This is faster to do before scoring but safer after.
*/ */
if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS) && $xpath) { if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS) && $xpath) {
$candidates = $xpath->query('.//*[(self::footer and count(//footer)<2) or (self::aside and count(//aside)<2)]', $page->documentElement); $candidates = $xpath->query('.//*[(self::footer and count(//footer)<2) or (self::aside and count(//aside)<2)]', $page->documentElement);
for ($node = null, $c = $candidates->length-1; $c >= 0; $c--) { for ($node = null, $c = $candidates->length - 1; $c >= 0; $c--) {
$node = $candidates->item($c); $node = $candidates->item($c);
// node should be readable but not inside of an article otherwise it's probably non-readable block // node should be readable but not inside of an article otherwise it's probably non-readable block
if ($node->hasAttribute('readability') && (int) $node->getAttributeNode('readability')->value < 40 && ($node->parentNode ? strcasecmp($node->parentNode->tagName, 'article') !== 0 : true)) { if ($node->hasAttribute('readability') && (int) $node->getAttributeNode('readability')->value < 40 && ($node->parentNode ? strcasecmp($node->parentNode->tagName, 'article') !== 0 : true)) {
@ -739,24 +739,24 @@ class Readability
} }
} }
$candidates = $xpath->query('.//*[not(self::body) and (@class or @id or @style) and ((number(@readability) < 40) or not(@readability))]', $page->documentElement); $candidates = $xpath->query('.//*[not(self::body) and (@class or @id or @style) and ((number(@readability) < 40) or not(@readability))]', $page->documentElement);
for ($node = null, $c = $candidates->length-1; $c >= 0; $c--) { for ($node = null, $c = $candidates->length - 1; $c >= 0; $c--) {
$node = $candidates->item($c); $node = $candidates->item($c);
$tagName = $node->tagName; $tagName = $node->tagName;
/* Remove unlikely candidates */ /* Remove unlikely candidates */
$unlikelyMatchString = $node->getAttribute('class')." ".$node->getAttribute('id')." ".$node->getAttribute('style'); $unlikelyMatchString = $node->getAttribute('class').' '.$node->getAttribute('id').' '.$node->getAttribute('style');
//$this->dbg('Processing '.$node->getNodePath().' by "'. $unlikelyMatchString.'" with readability '.($node->hasAttribute('readability') ? (int)$node->getAttributeNode('readability')->value : 0)); //$this->dbg('Processing '.$node->getNodePath().' by "'. $unlikelyMatchString.'" with readability '.($node->hasAttribute('readability') ? (int)$node->getAttributeNode('readability')->value : 0));
if (mb_strlen($unlikelyMatchString) > 3 && // don't process "empty" strings if (mb_strlen($unlikelyMatchString) > 3 && // don't process "empty" strings
preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) && preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) &&
!preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString)
) { ) {
$this->dbg('Removing unlikely candidate '.$node->getNodePath().' by "'. $unlikelyMatchString.'" with readability '. ($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0)); $this->dbg('Removing unlikely candidate '.$node->getNodePath().' by "'.$unlikelyMatchString.'" with readability '.($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0));
$node->parentNode->removeChild($node); $node->parentNode->removeChild($node);
$nodeIndex--; $nodeIndex--;
} }
} }
unset($candidates); unset($candidates);
} }
/** /*
* After we've calculated scores, loop through all of the possible candidate nodes we found * After we've calculated scores, loop through all of the possible candidate nodes we found
* and find the one with the highest score. * and find the one with the highest score.
*/ */
@ -764,7 +764,7 @@ class Readability
if ($xpath) { if ($xpath) {
// Using array of DOMElements after deletion is a path to DOOMElement. // Using array of DOMElements after deletion is a path to DOOMElement.
$candidates = $xpath->query('.//*[@data-candidate]', $page->documentElement); $candidates = $xpath->query('.//*[@data-candidate]', $page->documentElement);
for ($c = $candidates->length-1; $c >= 0; $c--) { for ($c = $candidates->length - 1; $c >= 0; $c--) {
// Scale the final candidates score based on link density. Good content should have a // Scale the final candidates score based on link density. Good content should have a
// relatively small link density (5% or less) and be mostly unaffected by this operation. // relatively small link density (5% or less) and be mostly unaffected by this operation.
// If not for this we would have used XPath to find maximum @readability. // If not for this we would have used XPath to find maximum @readability.
@ -777,7 +777,7 @@ class Readability
} }
unset($candidates); unset($candidates);
} }
/** /*
* If we still have no top candidate, just use the body as a last resort. * If we still have no top candidate, just use the body as a last resort.
* We also have to copy the body node so it is something we can modify. * We also have to copy the body node so it is something we can modify.
*/ */
@ -812,7 +812,7 @@ class Readability
} }
} }
$this->dbg('Top candidate: '.$topCandidate->getNodePath()); $this->dbg('Top candidate: '.$topCandidate->getNodePath());
/** /*
* Now that we have the top candidate, look through its siblings for content that might also be related. * Now that we have the top candidate, look through its siblings for content that might also be related.
* Things like preambles, content split by ads that we removed, etc. * Things like preambles, content split by ads that we removed, etc.
*/ */
@ -828,7 +828,7 @@ class Readability
$siblingNode = $siblingNodes->item($s); $siblingNode = $siblingNodes->item($s);
$siblingNodeName = $siblingNode->nodeName; $siblingNodeName = $siblingNode->nodeName;
$append = false; $append = false;
$this->dbg('Looking at sibling node: ' . $siblingNode->getNodePath() . (($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : '')); $this->dbg('Looking at sibling node: '.$siblingNode->getNodePath().(($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score '.$siblingNode->getAttribute('readability')) : ''));
//$this->dbg('Sibling has score ' . ($siblingNode->readability ? siblingNode.readability.contentScore : 'Unknown')); //$this->dbg('Sibling has score ' . ($siblingNode->readability ? siblingNode.readability.contentScore : 'Unknown'));
if ($siblingNode->isSameNode($topCandidate)) { if ($siblingNode->isSameNode($topCandidate)) {
$append = true; $append = true;
@ -852,18 +852,18 @@ class Readability
} }
} }
if ($append) { if ($append) {
$this->dbg('Appending node: ' . $siblingNode->getNodePath()); $this->dbg('Appending node: '.$siblingNode->getNodePath());
$nodeToAppend = null; $nodeToAppend = null;
if (strcasecmp($siblingNodeName, 'div') !== 0 && strcasecmp($siblingNodeName, 'p') !== 0) { if (strcasecmp($siblingNodeName, 'div') !== 0 && strcasecmp($siblingNodeName, 'p') !== 0) {
/* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */ /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */
$this->dbg('Altering siblingNode ' . $siblingNodeName . ' to div.'); $this->dbg('Altering siblingNode '.$siblingNodeName.' to div.');
$nodeToAppend = $this->dom->createElement('div'); $nodeToAppend = $this->dom->createElement('div');
try { try {
$nodeToAppend->setAttribute('id', $siblingNode->getAttribute('id')); $nodeToAppend->setAttribute('id', $siblingNode->getAttribute('id'));
$nodeToAppend->setAttribute('alt', $siblingNodeName); $nodeToAppend->setAttribute('alt', $siblingNodeName);
$nodeToAppend->innerHTML = $siblingNode->innerHTML; $nodeToAppend->innerHTML = $siblingNode->innerHTML;
} catch (Exception $e) { } catch (Exception $e) {
$this->dbg('Could not alter siblingNode ' . $siblingNodeName . ' to div, reverting to original.'); $this->dbg('Could not alter siblingNode '.$siblingNodeName.' to div, reverting to original.');
$nodeToAppend = $siblingNode; $nodeToAppend = $siblingNode;
$s--; $s--;
$sl--; $sl--;
@ -884,7 +884,7 @@ class Readability
unset($xpath); unset($xpath);
// So we have all of the content that we need. Now we clean it up for presentation. // So we have all of the content that we need. Now we clean it up for presentation.
$this->prepArticle($articleContent); $this->prepArticle($articleContent);
/** /*
* Now that we've gone through the full algorithm, check to see if we got any meaningful content. * Now that we've gone through the full algorithm, check to see if we got any meaningful content.
* If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
* likelihood of finding the content, and the sieve approach gives us a higher likelihood of * likelihood of finding the content, and the sieve approach gives us a higher likelihood of
@ -897,17 +897,17 @@ class Readability
$this->body->innerHTML = $this->bodyCache; $this->body->innerHTML = $this->bodyCache;
if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) { if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) {
$this->removeFlag(self::FLAG_STRIP_UNLIKELYS); $this->removeFlag(self::FLAG_STRIP_UNLIKELYS);
$this->dbg("...content is shorter than ".self::MIN_ARTICLE_LENGTH." letters, trying not to strip unlikely content.\n"); $this->dbg('...content is shorter than '.self::MIN_ARTICLE_LENGTH." letters, trying not to strip unlikely content.\n");
return $this->grabArticle($this->body); return $this->grabArticle($this->body);
} elseif ($this->flagIsActive(self::FLAG_WEIGHT_ATTRIBUTES)) { } elseif ($this->flagIsActive(self::FLAG_WEIGHT_ATTRIBUTES)) {
$this->removeFlag(self::FLAG_WEIGHT_ATTRIBUTES); $this->removeFlag(self::FLAG_WEIGHT_ATTRIBUTES);
$this->dbg("...content is shorter than ".self::MIN_ARTICLE_LENGTH." letters, trying not to weight attributes.\n"); $this->dbg('...content is shorter than '.self::MIN_ARTICLE_LENGTH." letters, trying not to weight attributes.\n");
return $this->grabArticle($this->body); return $this->grabArticle($this->body);
} elseif ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) { } elseif ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
$this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY); $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY);
$this->dbg("...content is shorter than ".self::MIN_ARTICLE_LENGTH." letters, trying not to clean at all.\n"); $this->dbg('...content is shorter than '.self::MIN_ARTICLE_LENGTH." letters, trying not to clean at all.\n");
return $this->grabArticle($this->body); return $this->grabArticle($this->body);
} else { } else {
@ -921,12 +921,13 @@ class Readability
* Get the inner text of a node. * Get the inner text of a node.
* This also strips out any excess whitespace to be found. * This also strips out any excess whitespace to be found.
* *
* @param DOMElement $e * @param DOMElement $e
* @param boolean $normalizeSpaces (default: true) * @param bool $normalizeSpaces (default: true)
* @param boolean $flattenLines (default: false) * @param bool $flattenLines (default: false)
*
* @return string * @return string
*/ */
public function getInnerText($e, $normalizeSpaces=true, $flattenLines=false) public function getInnerText($e, $normalizeSpaces = true, $flattenLines = false)
{ {
if (!isset($e->textContent) || $e->textContent === '') { if (!isset($e->textContent) || $e->textContent === '') {
return ''; return '';
@ -943,8 +944,7 @@ class Readability
/** /**
* Remove the style attribute on every $e and under. * Remove the style attribute on every $e and under.
* *
* @param DOMElement $e * @param DOMElement $e
* @return void
*/ */
public function cleanStyles($e) public function cleanStyles($e)
{ {
@ -959,7 +959,8 @@ class Readability
/** /**
* Get comma number for a given text. * Get comma number for a given text.
* *
* @param string $text * @param string $text
*
* @return number (integer) * @return number (integer)
*/ */
public function getCommaCount($text) public function getCommaCount($text)
@ -970,7 +971,8 @@ class Readability
* Get words number for a given text if words separated by a space. * Get words number for a given text if words separated by a space.
* Input string should be normalized. * Input string should be normalized.
* *
* @param string $text * @param string $text
*
* @return number (integer) * @return number (integer)
*/ */
public function getWordCount($text) public function getWordCount($text)
@ -982,16 +984,17 @@ class Readability
* This is the amount of text that is inside a link divided by the total text in the node. * This is the amount of text that is inside a link divided by the total text in the node.
* Can exclude external references to differentiate between simple text and menus/infoblocks. * Can exclude external references to differentiate between simple text and menus/infoblocks.
* *
* @param DOMElement $e * @param DOMElement $e
* @param string $excludeExternal * @param string $excludeExternal
* @return number (float) *
* @return number (float)
*/ */
public function getLinkDensity($e, $excludeExternal=false) public function getLinkDensity($e, $excludeExternal = false)
{ {
$links = $e->getElementsByTagName('a'); $links = $e->getElementsByTagName('a');
$textLength = mb_strlen($this->getInnerText($e, true, true)); $textLength = mb_strlen($this->getInnerText($e, true, true));
$linkLength = 0; $linkLength = 0;
for ($dRe = $this->domainRegExp, $i=0, $il=$links->length; $i < $il; $i++) { for ($dRe = $this->domainRegExp, $i = 0, $il = $links->length; $i < $il; $i++) {
if ($excludeExternal && $dRe && !preg_match($dRe, $links->item($i)->getAttribute('href'))) { if ($excludeExternal && $dRe && !preg_match($dRe, $links->item($i)->getAttribute('href'))) {
continue; continue;
} }
@ -1007,9 +1010,10 @@ class Readability
* Get an element weight by attribute. * Get an element weight by attribute.
* Uses regular expressions to tell if this element looks good or bad. * Uses regular expressions to tell if this element looks good or bad.
* *
* @param DOMElement $element * @param DOMElement $element
* @param string $attribute * @param string $attribute
* @return number (Integer) *
* @return number (Integer)
*/ */
protected function weightAttribute($element, $attribute) protected function weightAttribute($element, $attribute)
{ {
@ -1039,8 +1043,9 @@ class Readability
/** /**
* Get an element relative weight. * Get an element relative weight.
* *
* @param DOMElement $e * @param DOMElement $e
* @return number (Integer) *
* @return number (Integer)
*/ */
public function getWeight($e) public function getWeight($e)
{ {
@ -1058,8 +1063,7 @@ class Readability
/** /**
* Remove extraneous break tags from a node. * Remove extraneous break tags from a node.
* *
* @param DOMElement $node * @param DOMElement $node
* @return void
*/ */
public function killBreaks($node) public function killBreaks($node)
{ {
@ -1069,19 +1073,18 @@ class Readability
} }
/** /**
* Clean a node of all elements of type "tag". * Clean a node of all elements of type "tag".
* (Unless it's a youtube/vimeo video. People love movies.) * (Unless it's a youtube/vimeo video. People love movies.).
* *
* Updated 2012-09-18 to preserve youtube/vimeo iframes * Updated 2012-09-18 to preserve youtube/vimeo iframes
* *
* @param DOMElement $e * @param DOMElement $e
* @param string $tag * @param string $tag
* @return void
*/ */
public function clean($e, $tag) public function clean($e, $tag)
{ {
$targetList = $e->getElementsByTagName($tag); $targetList = $e->getElementsByTagName($tag);
$isEmbed = ($tag === 'audio' || $tag === 'video' || $tag === 'iframe' || $tag === 'object' || $tag === 'embed'); $isEmbed = ($tag === 'audio' || $tag === 'video' || $tag === 'iframe' || $tag === 'object' || $tag === 'embed');
for ($cur_item = null, $y = $targetList->length-1; $y >= 0; $y--) { for ($cur_item = null, $y = $targetList->length - 1; $y >= 0; $y--) {
/* Allow youtube and vimeo videos through as people usually want to see those. */ /* Allow youtube and vimeo videos through as people usually want to see those. */
$cur_item = $targetList->item($y); $cur_item = $targetList->item($y);
if ($isEmbed) { if ($isEmbed) {
@ -1103,9 +1106,8 @@ class Readability
* "Fishy" is an algorithm based on content length, classnames, * "Fishy" is an algorithm based on content length, classnames,
* link density, number of images & embeds, etc. * link density, number of images & embeds, etc.
* *
* @param DOMElement $e * @param DOMElement $e
* @param string $tag * @param string $tag
* @return void
*/ */
public function cleanConditionally($e, $tag) public function cleanConditionally($e, $tag)
{ {
@ -1114,7 +1116,7 @@ class Readability
} }
$tagsList = $e->getElementsByTagName($tag); $tagsList = $e->getElementsByTagName($tag);
$curTagsLength = $tagsList->length; $curTagsLength = $tagsList->length;
/** /*
* Gather counts for other typical elements embedded within. * Gather counts for other typical elements embedded within.
* Traverse backwards so we can remove nodes at the same time without effecting the traversal. * Traverse backwards so we can remove nodes at the same time without effecting the traversal.
* *
@ -1125,29 +1127,29 @@ class Readability
//$class = $node->getAttribute('class').' '.$node->getAttribute('id'); //debug //$class = $node->getAttribute('class').' '.$node->getAttribute('id'); //debug
$weight = $this->getWeight($node); $weight = $this->getWeight($node);
$contentScore = ($node->hasAttribute('readability')) ? (int) $node->getAttribute('readability') : 0; $contentScore = ($node->hasAttribute('readability')) ? (int) $node->getAttribute('readability') : 0;
$this->dbg('Start conditional cleaning of ' . $node->getNodePath() . ' (class=' . $node->getAttribute('class') . '; id=' . $node->getAttribute('id') . ')' . (($node->hasAttribute('readability')) ? (' with score ' . $node->getAttribute('readability')) : '')); $this->dbg('Start conditional cleaning of '.$node->getNodePath().' (class='.$node->getAttribute('class').'; id='.$node->getAttribute('id').')'.(($node->hasAttribute('readability')) ? (' with score '.$node->getAttribute('readability')) : ''));
if ($weight + $contentScore < 0) { if ($weight + $contentScore < 0) {
$this->dbg('Removing...'); $this->dbg('Removing...');
$node->parentNode->removeChild($node); $node->parentNode->removeChild($node);
} elseif ($this->getCommaCount($this->getInnerText($node)) < self::MIN_COMMAS_IN_PARAGRAPH) { } elseif ($this->getCommaCount($this->getInnerText($node)) < self::MIN_COMMAS_IN_PARAGRAPH) {
/** /*
* If there are not very many commas, and the number of * If there are not very many commas, and the number of
* non-paragraph elements is more than paragraphs or other ominous signs, remove the element. * non-paragraph elements is more than paragraphs or other ominous signs, remove the element.
*/ */
$p = $node->getElementsByTagName('p')->length; $p = $node->getElementsByTagName('p')->length;
$img = $node->getElementsByTagName('img')->length; $img = $node->getElementsByTagName('img')->length;
$li = $node->getElementsByTagName('li')->length-100; $li = $node->getElementsByTagName('li')->length - 100;
$input = $node->getElementsByTagName('input')->length; $input = $node->getElementsByTagName('input')->length;
$a = $node->getElementsByTagName('a')->length; $a = $node->getElementsByTagName('a')->length;
$embedCount = 0; $embedCount = 0;
$embeds = $node->getElementsByTagName('embed'); $embeds = $node->getElementsByTagName('embed');
for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) { for ($ei = 0, $il = $embeds->length; $ei < $il; $ei++) {
if (preg_match($this->regexps['media'], $embeds->item($ei)->getAttribute('src'))) { if (preg_match($this->regexps['media'], $embeds->item($ei)->getAttribute('src'))) {
$embedCount++; $embedCount++;
} }
} }
$embeds = $node->getElementsByTagName('iframe'); $embeds = $node->getElementsByTagName('iframe');
for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) { for ($ei = 0, $il = $embeds->length; $ei < $il; $ei++) {
if (preg_match($this->regexps['media'], $embeds->item($ei)->getAttribute('src'))) { if (preg_match($this->regexps['media'], $embeds->item($ei)->getAttribute('src'))) {
$embedCount++; $embedCount++;
} }
@ -1159,17 +1161,17 @@ class Readability
if ($li > $p && $tag != 'ul' && $tag != 'ol') { if ($li > $p && $tag != 'ul' && $tag != 'ol') {
$this->dbg(' too many <li> elements, and parent is not <ul> or <ol>'); $this->dbg(' too many <li> elements, and parent is not <ul> or <ol>');
$toRemove = true; $toRemove = true;
} elseif ( $input > floor($p/3) ) { } elseif ($input > floor($p / 3)) {
$this->dbg(' too many <input> elements'); $this->dbg(' too many <input> elements');
$toRemove = true; $toRemove = true;
} elseif ($contentLength < 6 && ($embedCount === 0 && ($img === 0 || $img > 2))) { } elseif ($contentLength < 6 && ($embedCount === 0 && ($img === 0 || $img > 2))) {
$this->dbg(' content length less than 6 chars, 0 embeds and either 0 images or more than 2 images'); $this->dbg(' content length less than 6 chars, 0 embeds and either 0 images or more than 2 images');
$toRemove = true; $toRemove = true;
} elseif ($weight < 25 && $linkDensity > 0.25) { } elseif ($weight < 25 && $linkDensity > 0.25) {
$this->dbg(' weight is '.$weight.' < 25 and link density is '.sprintf("%.2f", $linkDensity).' > 0.25'); $this->dbg(' weight is '.$weight.' < 25 and link density is '.sprintf('%.2f', $linkDensity).' > 0.25');
$toRemove = true; $toRemove = true;
} elseif ($a > 2 && ($weight >= 25 && $linkDensity > 0.5)) { } elseif ($a > 2 && ($weight >= 25 && $linkDensity > 0.5)) {
$this->dbg(' more than 2 links and weight is '.$weight.' > 25 but link density is '.sprintf("%.2f", $linkDensity).' > 0.5'); $this->dbg(' more than 2 links and weight is '.$weight.' > 25 but link density is '.sprintf('%.2f', $linkDensity).' > 0.5');
$toRemove = true; $toRemove = true;
} elseif ($embedCount > 3) { } elseif ($embedCount > 3) {
$this->dbg(' more than 3 embeds'); $this->dbg(' more than 3 embeds');
@ -1182,17 +1184,17 @@ class Readability
} elseif ($li > $p && $tag != 'ul' && $tag != 'ol') { } elseif ($li > $p && $tag != 'ul' && $tag != 'ol') {
$this->dbg(' too many <li> elements, and parent is not <ul> or <ol>'); $this->dbg(' too many <li> elements, and parent is not <ul> or <ol>');
$toRemove = true; $toRemove = true;
} elseif ( $input > floor($p/3) ) { } elseif ($input > floor($p / 3)) {
$this->dbg(' too many <input> elements'); $this->dbg(' too many <input> elements');
$toRemove = true; $toRemove = true;
} elseif ($contentLength < 25 && ($img === 0 || $img > 2) ) { } elseif ($contentLength < 25 && ($img === 0 || $img > 2)) {
$this->dbg(' content length less than 25 chars and 0 images, or more than 2 images'); $this->dbg(' content length less than 25 chars and 0 images, or more than 2 images');
$toRemove = true; $toRemove = true;
} elseif ($weight < 25 && $linkDensity > 0.2) { } elseif ($weight < 25 && $linkDensity > 0.2) {
$this->dbg(' weight is '.$weight.' lower than 0 and link density is '.sprintf("%.2f", $linkDensity).' > 0.2'); $this->dbg(' weight is '.$weight.' lower than 0 and link density is '.sprintf('%.2f', $linkDensity).' > 0.2');
$toRemove = true; $toRemove = true;
} elseif ($weight >= 25 && $linkDensity > 0.5) { } elseif ($weight >= 25 && $linkDensity > 0.5) {
$this->dbg(' weight above 25 but link density is '.sprintf("%.2f", $linkDensity).' > 0.5'); $this->dbg(' weight above 25 but link density is '.sprintf('%.2f', $linkDensity).' > 0.5');
$toRemove = true; $toRemove = true;
} elseif (($embedCount == 1 && $contentLength < 75) || $embedCount > 1) { } elseif (($embedCount == 1 && $contentLength < 75) || $embedCount > 1) {
$this->dbg(' 1 embed and content length smaller than 75 chars, or more than one embed'); $this->dbg(' 1 embed and content length smaller than 75 chars, or more than one embed');
@ -1210,14 +1212,13 @@ class Readability
/** /**
* Clean out spurious headers from an Element. Checks things like classnames and link density. * Clean out spurious headers from an Element. Checks things like classnames and link density.
* *
* @param DOMElement $e * @param DOMElement $e
* @return void
*/ */
public function cleanHeaders($e) public function cleanHeaders($e)
{ {
for ($headerIndex = 1; $headerIndex < 3; $headerIndex++) { for ($headerIndex = 1; $headerIndex < 3; $headerIndex++) {
$headers = $e->getElementsByTagName('h' . $headerIndex); $headers = $e->getElementsByTagName('h'.$headerIndex);
for ($i=$headers->length-1; $i >=0; $i--) { for ($i = $headers->length - 1; $i >= 0; $i--) {
if ($this->getWeight($headers->item($i)) < 0 || $this->getLinkDensity($headers->item($i)) > 0.33) { if ($this->getWeight($headers->item($i)) < 0 || $this->getLinkDensity($headers->item($i)) > 0.33) {
$headers->item($i)->parentNode->removeChild($headers->item($i)); $headers->item($i)->parentNode->removeChild($headers->item($i));
} }

@ -3,7 +3,6 @@
namespace Tests\Readability; namespace Tests\Readability;
use Readability\Readability; use Readability\Readability;
use Readability\JSLikeHTMLElement;
class ReadabilityTested extends Readability class ReadabilityTested extends Readability
{ {

Loading…
Cancel
Save