@ -14,7 +14,7 @@ namespace Readability;
* More information: http://fivefilters.org/content-only/
* More information: http://fivefilters.org/content-only/
* License: Apache License, Version 2.0
* License: Apache License, Version 2.0
* Requires: PHP version 5.2.0+
* Requires: PHP version 5.2.0+
* Date: 2013-08-02
* Date: 2013-08-02.
*
*
* Differences between the PHP port and the original
* Differences between the PHP port and the original
* ------------------------------------------------------
* ------------------------------------------------------
@ -76,7 +76,7 @@ class Readability
'divToPElements' => '/< (?:blockquote|code|div|article|footer|aside|img|p|pre|dl|ol|ul)/mi',
'divToPElements' => '/< (?:blockquote|code|div|article|footer|aside|img|p|pre|dl|ol|ul)/mi',
'killBreaks' => '/(< br \ s * \ / ? > ([ \r\n\s]| ?)*)+/',
'killBreaks' => '/(< br \ s * \ / ? > ([ \r\n\s]| ?)*)+/',
'media' => '!//(?:[^\.\?/]+\.)?(?:youtu(?:be)?|soundcloud|dailymotion|vimeo|pornhub|xvideos|twitvid|rutube|viddler)\.(?:com|be|org|net)/!i',
'media' => '!//(?:[^\.\?/]+\.)?(?:youtu(?:be)?|soundcloud|dailymotion|vimeo|pornhub|xvideos|twitvid|rutube|viddler)\.(?:com|be|org|net)/!i',
'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i'
'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i',
);
);
public $tidy_config = array(
public $tidy_config = array(
'tidy-mark' => false,
'tidy-mark' => false,
@ -101,7 +101,7 @@ class Readability
// 'merge-spans' => true,
// 'merge-spans' => true,
'input-encoding' => '????',
'input-encoding' => '????',
'output-encoding' => 'utf8',
'output-encoding' => 'utf8',
'hide-comments' => true
'hide-comments' => true,
);
);
// raw HTML filters
// raw HTML filters
protected $pre_filters = array(
protected $pre_filters = array(
@ -111,7 +111,7 @@ class Readability
'!< font [ ^ > ]*>\s*\[AD\]\s*< / font > !is' => '', // HACK: firewall-filtered content
'!< font [ ^ > ]*>\s*\[AD\]\s*< / font > !is' => '', // HACK: firewall-filtered content
'!(< br [ ^ > ]*>[ \r\n\s]*){2,}!i' => '< / p > < p > ', // HACK: replace linebreaks plus br's with p's
'!(< br [ ^ > ]*>[ \r\n\s]*){2,}!i' => '< / p > < p > ', // HACK: replace linebreaks plus br's with p's
//'!< /?noscript>!is' => '', // replace noscripts
//'!< /?noscript>!is' => '', // replace noscripts
'!< (/?)font[^>]*>!is' => '< \\1span>' // replace fonts to spans
'!< (/?)font[^>]*>!is' => '< \\1span>', // replace fonts to spans
);
);
// output HTML filters
// output HTML filters
protected $post_filters = array(
protected $post_filters = array(
@ -121,7 +121,7 @@ class Readability
"/\n+/" => "\n", //single newlines cleanup
"/\n+/" => "\n", //single newlines cleanup
'!< pre [ ^ > ]*>\s*< code ! is ' = > '< pre ' , / / modern web . . .
'!< pre [ ^ > ]*>\s*< code ! is ' = > '< pre ' , / / modern web . . .
'!< / code > \s*< / pre > !is' => '< / pre > ',
'!< / code > \s*< / pre > !is' => '< / pre > ',
'!< [hb]r>!is' => '< \\1 />'
'!< [hb]r>!is' => '< \\1 />',
);
);
// flags
// flags
const FLAG_STRIP_UNLIKELYS = 1;
const FLAG_STRIP_UNLIKELYS = 1;
@ -139,11 +139,12 @@ class Readability
const MIN_NODE_LENGTH = 80;
const MIN_NODE_LENGTH = 80;
const MAX_LINK_DENSITY = 0.25;
const MAX_LINK_DENSITY = 0.25;
/**
/**
* Create instance of Readability
* Create instance of Readability.
*
* @param string UTF-8 encoded string
* @param string UTF-8 encoded string
* @param string (optional) URL associated with HTML (for footnotes)
* @param string (optional) URL associated with HTML (for footnotes)
* @param string (optional) Which parser to use for turning raw HTML into a DOMDocument
* @param string (optional) Which parser to use for turning raw HTML into a DOMDocument
* @param boolean (optional) Use tidy
* @param bool (optional) Use tidy
*/
*/
public function __construct($html, $url = null, $parser = 'libxml', $use_tidy = true)
public function __construct($html, $url = null, $parser = 'libxml', $use_tidy = true)
{
{
@ -154,9 +155,9 @@ class Readability
$this->domainRegExp = '/'.strtr(preg_replace('/www\d*\./', '', parse_url($url, PHP_URL_HOST)), array('.' => '\.')).'/';
$this->domainRegExp = '/'.strtr(preg_replace('/www\d*\./', '', parse_url($url, PHP_URL_HOST)), array('.' => '\.')).'/';
}
}
mb_internal_encoding("UTF-8" );
mb_internal_encoding('UTF-8' );
mb_http_output("UTF-8" );
mb_http_output('UTF-8' );
mb_regex_encoding("UTF-8" );
mb_regex_encoding('UTF-8' );
// HACK: dirty cleanup to replace some stuff; shouldn't use regexps with HTML but well...
// HACK: dirty cleanup to replace some stuff; shouldn't use regexps with HTML but well...
if (!$this->flagIsActive(self::FLAG_DISABLE_PREFILTER)) {
if (!$this->flagIsActive(self::FLAG_DISABLE_PREFILTER)) {
@ -170,7 +171,7 @@ class Readability
$html = '< html > < / html > ';
$html = '< html > < / html > ';
}
}
/**
/*
* Use tidy (if it exists).
* Use tidy (if it exists).
* This fixes problems with some sites which would otherwise trouble DOMDocument's HTML parsing.
* This fixes problems with some sites which would otherwise trouble DOMDocument's HTML parsing.
* Although sometimes it makes matters worse, which is why there is an option to disable it.
* Although sometimes it makes matters worse, which is why there is an option to disable it.
@ -188,7 +189,7 @@ class Readability
}
}
unset($tidy);
unset($tidy);
}
}
$html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8" );
$html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8' );
if (!($parser == 'html5lib' & & ($this->dom = \HTML5_Parser::parse($html)))) {
if (!($parser == 'html5lib' & & ($this->dom = \HTML5_Parser::parse($html)))) {
libxml_use_internal_errors(true);
libxml_use_internal_errors(true);
@ -201,7 +202,8 @@ class Readability
$this->dom->registerNodeClass('DOMElement', 'Readability\JSLikeHTMLElement');
$this->dom->registerNodeClass('DOMElement', 'Readability\JSLikeHTMLElement');
}
}
/**
/**
* Get article title element
* Get article title element.
*
* @return DOMElement
* @return DOMElement
*/
*/
public function getTitle()
public function getTitle()
@ -209,7 +211,8 @@ class Readability
return $this->articleTitle;
return $this->articleTitle;
}
}
/**
/**
* Get article content element
* Get article content element.
*
* @return DOMElement
* @return DOMElement
*/
*/
public function getContent()
public function getContent()
@ -217,7 +220,8 @@ class Readability
return $this->articleContent;
return $this->articleContent;
}
}
/**
/**
* Add pre filter for raw input HTML processing
* Add pre filter for raw input HTML processing.
*
* @param string RegExp for replace
* @param string RegExp for replace
* @param string (optional) Replacer
* @param string (optional) Replacer
*/
*/
@ -226,7 +230,8 @@ class Readability
$this->pre_filters[$filter] = $replacer;
$this->pre_filters[$filter] = $replacer;
}
}
/**
/**
* Add post filter for raw output HTML processing
* Add post filter for raw output HTML processing.
*
* @param string RegExp for replace
* @param string RegExp for replace
* @param string (optional) Replacer
* @param string (optional) Replacer
*/
*/
@ -244,7 +249,7 @@ class Readability
* 4. Replace the current DOM tree with the new one.
* 4. Replace the current DOM tree with the new one.
* 5. Read peacefully.
* 5. Read peacefully.
*
*
* @return boolean true if we found content, false otherwise
* @return bool true if we found content, false otherwise
*/
*/
public function init()
public function init()
{
{
@ -296,7 +301,7 @@ class Readability
return $this->success;
return $this->success;
}
}
/**
/**
* Debug
* Debug.
*/
*/
protected function dbg($msg) //, $error=false)
protected function dbg($msg) //, $error=false)
{
{
@ -306,12 +311,12 @@ class Readability
}
}
/**
 * Dump debug info to syslog.
 *
 * Does nothing unless $this->debug is truthy. Otherwise it opens a syslog
 * connection tagged 'Readability PHP ' with the LOG_PID | LOG_PERROR options
 * and facility 0, then writes $this->debugText at priority 6.
 *
 * NOTE(review): every line of this span appears twice (old side, then new
 * side of a diff); the two sides differ only in the quoting style of the
 * openlog() tag. Confirm which side is authoritative before reusing it.
 */
protected function dump_dbg()
protected function dump_dbg()
{
{
if ($this->debug) {
if ($this->debug) {
openlog("Readability PHP " , LOG_PID | LOG_PERROR, 0);
openlog('Readability PHP ' , LOG_PID | LOG_PERROR, 0);
syslog(6, $this->debugText); // 1 - error 6 - info
syslog(6, $this->debugText); // 1 - error 6 - info
}
}
}
}
@ -319,7 +324,6 @@ class Readability
* Run any post-process modifications to article content as necessary.
* Run any post-process modifications to article content as necessary.
*
*
* @param DOMElement
* @param DOMElement
* @return void
*/
*/
public function postProcessContent($articleContent)
public function postProcessContent($articleContent)
{
{
@ -338,7 +342,8 @@ class Readability
$origTitle = '';
$origTitle = '';
try {
try {
$curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0));
$curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0));
} catch (Exception $e) {}
} catch (Exception $e) {
}
if (preg_match('/ [\|\-] /', $curTitle)) {
if (preg_match('/ [\|\-] /', $curTitle)) {
$curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle);
$curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle);
if (count(explode(' ', $curTitle)) < 3 ) {
if (count(explode(' ', $curTitle)) < 3 ) {
@ -367,12 +372,10 @@ class Readability
/**
/**
* Prepare the HTML document for readability to scrape it.
* Prepare the HTML document for readability to scrape it.
* This includes things like stripping javascript, CSS, and handling terrible markup.
* This includes things like stripping javascript, CSS, and handling terrible markup.
*
* @return void
*/
*/
protected function prepDocument()
protected function prepDocument()
{
{
/**
/*
* In some cases a body element can't be found (if the HTML is totally hosed for example)
* In some cases a body element can't be found (if the HTML is totally hosed for example)
* so we create a new body node and append it to the document.
* so we create a new body node and append it to the document.
*/
*/
@ -393,9 +396,8 @@ class Readability
}
}
/**
/**
* For easier reading, convert this document to have footnotes at the bottom rather than inline links.
* For easier reading, convert this document to have footnotes at the bottom rather than inline links.
* @see http://www.roughtype.com/archives/2010/05/experiments_in.php
*
*
* @return void
* @see http://www.roughtype.com/archives/2010/05/experiments_in.php
*/
*/
public function addFootnotes($articleContent)
public function addFootnotes($articleContent)
{
{
@ -451,7 +453,6 @@ class Readability
* iframes, forms, strip extraneous < p > tags, etc.
* iframes, forms, strip extraneous < p > tags, etc.
*
*
* @param DOMElement
* @param DOMElement
* @return void
*/
*/
public function prepArticle($articleContent)
public function prepArticle($articleContent)
{
{
@ -464,7 +465,7 @@ class Readability
$this->killBreaks($articleContent);
$this->killBreaks($articleContent);
$xpath = new \DOMXPath($articleContent->ownerDocument);
$xpath = new \DOMXPath($articleContent->ownerDocument);
if ($this->revertForcedParagraphElements) {
if ($this->revertForcedParagraphElements) {
/**
/*
* Reverts P elements with class 'readability-styled' to text nodes:
* Reverts P elements with class 'readability-styled' to text nodes:
* which is what they were before.
* which is what they were before.
*/
*/
@ -494,7 +495,7 @@ class Readability
$this->clean($articleContent, 'canvas');
$this->clean($articleContent, 'canvas');
$this->clean($articleContent, 'h1');
$this->clean($articleContent, 'h1');
/**
/*
* If there is only one h2, they are probably using it as a main header, so remove it since we
* If there is only one h2, they are probably using it as a main header, so remove it since we
* already have a header.
* already have a header.
*/
*/
@ -537,7 +538,7 @@ class Readability
}
}
unset($search, $replace);
unset($search, $replace);
} catch (Exception $e) {
} catch (Exception $e) {
$this->dbg("Cleaning output HTML failed. Ignoring: " . $e->getMessage());
$this->dbg('Cleaning output HTML failed. Ignoring: '. $e->getMessage());
}
}
}
}
}
}
@ -546,7 +547,6 @@ class Readability
* className/id for special names to add to its score.
* className/id for special names to add to its score.
*
*
* @param Element
* @param Element
* @return void
*/
*/
protected function initializeNode($node)
protected function initializeNode($node)
{
{
@ -668,7 +668,7 @@ class Readability
}
}
}
}
}
}
/**
/*
* Loop through all paragraphs, and assign a score to them based on how content-y they look.
* Loop through all paragraphs, and assign a score to them based on how content-y they look.
* Then add their score to their parent node.
* Then add their score to their parent node.
*
*
@ -724,7 +724,7 @@ class Readability
$grandParentNode->getAttributeNode('readability')->value += $contentScore / self::GRANDPARENT_SCORE_DIVISOR;
$grandParentNode->getAttributeNode('readability')->value += $contentScore / self::GRANDPARENT_SCORE_DIVISOR;
}
}
}
}
/**
/*
* Node prepping: trash nodes that look cruddy (like ones with the class name "comment", etc).
* Node prepping: trash nodes that look cruddy (like ones with the class name "comment", etc).
* This is faster to do before scoring but safer after.
* This is faster to do before scoring but safer after.
*/
*/
@ -743,7 +743,7 @@ class Readability
$node = $candidates->item($c);
$node = $candidates->item($c);
$tagName = $node->tagName;
$tagName = $node->tagName;
/* Remove unlikely candidates */
/* Remove unlikely candidates */
$unlikelyMatchString = $node->getAttribute('class')." ".$node->getAttribute('id')." " .$node->getAttribute('style');
$unlikelyMatchString = $node->getAttribute('class').' '.$node->getAttribute('id').' ' .$node->getAttribute('style');
//$this->dbg('Processing '.$node->getNodePath().' by "'. $unlikelyMatchString.'" with readability '.($node->hasAttribute('readability') ? (int)$node->getAttributeNode('readability')->value : 0));
//$this->dbg('Processing '.$node->getNodePath().' by "'. $unlikelyMatchString.'" with readability '.($node->hasAttribute('readability') ? (int)$node->getAttributeNode('readability')->value : 0));
if (mb_strlen($unlikelyMatchString) > 3 & & // don't process "empty" strings
if (mb_strlen($unlikelyMatchString) > 3 & & // don't process "empty" strings
preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) & &
preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) & &
@ -756,7 +756,7 @@ class Readability
}
}
unset($candidates);
unset($candidates);
}
}
/**
/*
* After we've calculated scores, loop through all of the possible candidate nodes we found
* After we've calculated scores, loop through all of the possible candidate nodes we found
* and find the one with the highest score.
* and find the one with the highest score.
*/
*/
@ -777,7 +777,7 @@ class Readability
}
}
unset($candidates);
unset($candidates);
}
}
/**
/*
* If we still have no top candidate, just use the body as a last resort.
* If we still have no top candidate, just use the body as a last resort.
* We also have to copy the body node so it is something we can modify.
* We also have to copy the body node so it is something we can modify.
*/
*/
@ -812,7 +812,7 @@ class Readability
}
}
}
}
$this->dbg('Top candidate: '.$topCandidate->getNodePath());
$this->dbg('Top candidate: '.$topCandidate->getNodePath());
/**
/*
* Now that we have the top candidate, look through its siblings for content that might also be related.
* Now that we have the top candidate, look through its siblings for content that might also be related.
* Things like preambles, content split by ads that we removed, etc.
* Things like preambles, content split by ads that we removed, etc.
*/
*/
@ -884,7 +884,7 @@ class Readability
unset($xpath);
unset($xpath);
// So we have all of the content that we need. Now we clean it up for presentation.
// So we have all of the content that we need. Now we clean it up for presentation.
$this->prepArticle($articleContent);
$this->prepArticle($articleContent);
/**
/*
* Now that we've gone through the full algorithm, check to see if we got any meaningful content.
* Now that we've gone through the full algorithm, check to see if we got any meaningful content.
* If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
* If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
* likelihood of finding the content, and the sieve approach gives us a higher likelihood of
* likelihood of finding the content, and the sieve approach gives us a higher likelihood of
@ -897,17 +897,17 @@ class Readability
$this->body->innerHTML = $this->bodyCache;
$this->body->innerHTML = $this->bodyCache;
if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) {
if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) {
$this->removeFlag(self::FLAG_STRIP_UNLIKELYS);
$this->removeFlag(self::FLAG_STRIP_UNLIKELYS);
$this->dbg("...content is shorter than " .self::MIN_ARTICLE_LENGTH." letters, trying not to strip unlikely content.\n");
$this->dbg('...content is shorter than ' .self::MIN_ARTICLE_LENGTH." letters, trying not to strip unlikely content.\n");
return $this->grabArticle($this->body);
return $this->grabArticle($this->body);
} elseif ($this->flagIsActive(self::FLAG_WEIGHT_ATTRIBUTES)) {
} elseif ($this->flagIsActive(self::FLAG_WEIGHT_ATTRIBUTES)) {
$this->removeFlag(self::FLAG_WEIGHT_ATTRIBUTES);
$this->removeFlag(self::FLAG_WEIGHT_ATTRIBUTES);
$this->dbg("...content is shorter than " .self::MIN_ARTICLE_LENGTH." letters, trying not to weight attributes.\n");
$this->dbg('...content is shorter than ' .self::MIN_ARTICLE_LENGTH." letters, trying not to weight attributes.\n");
return $this->grabArticle($this->body);
return $this->grabArticle($this->body);
} elseif ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
} elseif ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
$this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY);
$this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY);
$this->dbg("...content is shorter than " .self::MIN_ARTICLE_LENGTH." letters, trying not to clean at all.\n");
$this->dbg('...content is shorter than ' .self::MIN_ARTICLE_LENGTH." letters, trying not to clean at all.\n");
return $this->grabArticle($this->body);
return $this->grabArticle($this->body);
} else {
} else {
@ -922,8 +922,9 @@ class Readability
* This also strips out any excess whitespace to be found.
* This also strips out any excess whitespace to be found.
*
*
* @param DOMElement $e
* @param DOMElement $e
* @param boolean $normalizeSpaces (default: true)
* @param bool $normalizeSpaces (default: true)
* @param boolean $flattenLines (default: false)
* @param bool $flattenLines (default: false)
*
* @return string
* @return string
*/
*/
public function getInnerText($e, $normalizeSpaces = true, $flattenLines = false)
public function getInnerText($e, $normalizeSpaces = true, $flattenLines = false)
@ -944,7 +945,6 @@ class Readability
* Remove the style attribute on every $e and under.
* Remove the style attribute on every $e and under.
*
*
* @param DOMElement $e
* @param DOMElement $e
* @return void
*/
*/
public function cleanStyles($e)
public function cleanStyles($e)
{
{
@ -960,6 +960,7 @@ class Readability
* Get the number of commas in a given text.
* Get the number of commas in a given text.
*
*
* @param string $text
* @param string $text
*
* @return number (integer)
* @return number (integer)
*/
*/
public function getCommaCount($text)
public function getCommaCount($text)
@ -971,6 +972,7 @@ class Readability
* Input string should be normalized.
* Input string should be normalized.
*
*
* @param string $text
* @param string $text
*
* @return number (integer)
* @return number (integer)
*/
*/
public function getWordCount($text)
public function getWordCount($text)
@ -984,6 +986,7 @@ class Readability
*
*
* @param DOMElement $e
* @param DOMElement $e
* @param string $excludeExternal
* @param string $excludeExternal
*
* @return number (float)
* @return number (float)
*/
*/
public function getLinkDensity($e, $excludeExternal = false)
public function getLinkDensity($e, $excludeExternal = false)
@ -1009,6 +1012,7 @@ class Readability
*
*
* @param DOMElement $element
* @param DOMElement $element
* @param string $attribute
* @param string $attribute
*
* @return number (Integer)
* @return number (Integer)
*/
*/
protected function weightAttribute($element, $attribute)
protected function weightAttribute($element, $attribute)
@ -1040,6 +1044,7 @@ class Readability
* Get an element relative weight.
* Get an element relative weight.
*
*
* @param DOMElement $e
* @param DOMElement $e
*
* @return number (Integer)
* @return number (Integer)
*/
*/
public function getWeight($e)
public function getWeight($e)
@ -1059,7 +1064,6 @@ class Readability
* Remove extraneous break tags from a node.
* Remove extraneous break tags from a node.
*
*
* @param DOMElement $node
* @param DOMElement $node
* @return void
*/
*/
public function killBreaks($node)
public function killBreaks($node)
{
{
@ -1069,13 +1073,12 @@ class Readability
}
}
/**
/**
* Clean a node of all elements of type "tag".
* Clean a node of all elements of type "tag".
* (Unless it's a youtube/vimeo video. People love movies.)
* (Unless it's a youtube/vimeo video. People love movies.).
*
*
* Updated 2012-09-18 to preserve youtube/vimeo iframes
* Updated 2012-09-18 to preserve youtube/vimeo iframes
*
*
* @param DOMElement $e
* @param DOMElement $e
* @param string $tag
* @param string $tag
* @return void
*/
*/
public function clean($e, $tag)
public function clean($e, $tag)
{
{
@ -1105,7 +1108,6 @@ class Readability
*
*
* @param DOMElement $e
* @param DOMElement $e
* @param string $tag
* @param string $tag
* @return void
*/
*/
public function cleanConditionally($e, $tag)
public function cleanConditionally($e, $tag)
{
{
@ -1114,7 +1116,7 @@ class Readability
}
}
$tagsList = $e->getElementsByTagName($tag);
$tagsList = $e->getElementsByTagName($tag);
$curTagsLength = $tagsList->length;
$curTagsLength = $tagsList->length;
/**
/*
* Gather counts for other typical elements embedded within.
* Gather counts for other typical elements embedded within.
* Traverse backwards so we can remove nodes at the same time without affecting the traversal.
* Traverse backwards so we can remove nodes at the same time without affecting the traversal.
*
*
@ -1130,7 +1132,7 @@ class Readability
$this->dbg('Removing...');
$this->dbg('Removing...');
$node->parentNode->removeChild($node);
$node->parentNode->removeChild($node);
} elseif ($this->getCommaCount($this->getInnerText($node)) < self::MIN_COMMAS_IN_PARAGRAPH ) {
} elseif ($this->getCommaCount($this->getInnerText($node)) < self::MIN_COMMAS_IN_PARAGRAPH ) {
/**
/*
* If there are not very many commas, and the number of
* If there are not very many commas, and the number of
* non-paragraph elements is more than paragraphs or other ominous signs, remove the element.
* non-paragraph elements is more than paragraphs or other ominous signs, remove the element.
*/
*/
@ -1166,10 +1168,10 @@ class Readability
$this->dbg(' content length less than 6 chars, 0 embeds and either 0 images or more than 2 images');
$this->dbg(' content length less than 6 chars, 0 embeds and either 0 images or more than 2 images');
$toRemove = true;
$toRemove = true;
} elseif ($weight < 25 & & $ linkDensity > 0.25) {
} elseif ($weight < 25 & & $ linkDensity > 0.25) {
$this->dbg(' weight is '.$weight.' < 25 and link density is ' . sprintf ( " % . 2f " , $ linkDensity ) . ' > 0.25');
$this->dbg(' weight is '.$weight.' < 25 and link density is ' . sprintf ( ' % . 2f ' , $ linkDensity ) . ' > 0.25');
$toRemove = true;
$toRemove = true;
} elseif ($a > 2 & & ($weight >= 25 & & $linkDensity > 0.5)) {
} elseif ($a > 2 & & ($weight >= 25 & & $linkDensity > 0.5)) {
$this->dbg(' more than 2 links and weight is '.$weight.' > 25 but link density is '.sprintf("%.2f" , $linkDensity).' > 0.5');
$this->dbg(' more than 2 links and weight is '.$weight.' > 25 but link density is '.sprintf('%.2f' , $linkDensity).' > 0.5');
$toRemove = true;
$toRemove = true;
} elseif ($embedCount > 3) {
} elseif ($embedCount > 3) {
$this->dbg(' more than 3 embeds');
$this->dbg(' more than 3 embeds');
@ -1189,10 +1191,10 @@ class Readability
$this->dbg(' content length less than 25 chars and 0 images, or more than 2 images');
$this->dbg(' content length less than 25 chars and 0 images, or more than 2 images');
$toRemove = true;
$toRemove = true;
} elseif ($weight < 25 & & $ linkDensity > 0.2) {
} elseif ($weight < 25 & & $ linkDensity > 0.2) {
$this->dbg(' weight is '.$weight.' lower than 0 and link density is '.sprintf("%.2f" , $linkDensity).' > 0.2');
$this->dbg(' weight is '.$weight.' lower than 0 and link density is '.sprintf('%.2f' , $linkDensity).' > 0.2');
$toRemove = true;
$toRemove = true;
} elseif ($weight >= 25 & & $linkDensity > 0.5) {
} elseif ($weight >= 25 & & $linkDensity > 0.5) {
$this->dbg(' weight above 25 but link density is '.sprintf("%.2f" , $linkDensity).' > 0.5');
$this->dbg(' weight above 25 but link density is '.sprintf('%.2f' , $linkDensity).' > 0.5');
$toRemove = true;
$toRemove = true;
} elseif (($embedCount == 1 & & $contentLength < 75 ) | | $ embedCount > 1) {
} elseif (($embedCount == 1 & & $contentLength < 75 ) | | $ embedCount > 1) {
$this->dbg(' 1 embed and content length smaller than 75 chars, or more than one embed');
$this->dbg(' 1 embed and content length smaller than 75 chars, or more than one embed');
@ -1211,7 +1213,6 @@ class Readability
* Clean out spurious headers from an Element. Checks things like classnames and link density.
* Clean out spurious headers from an Element. Checks things like classnames and link density.
*
*
* @param DOMElement $e
* @param DOMElement $e
* @return void
*/
*/
public function cleanHeaders($e)
public function cleanHeaders($e)
{
{