diff --git a/.php_cs b/.php_cs
index 240846c..8aac3e6 100644
--- a/.php_cs
+++ b/.php_cs
@@ -1,20 +1,28 @@
<?php
-return Symfony\CS\Config\Config::create()
- ->setUsingCache(true)
- ->level(Symfony\CS\FixerInterface::SYMFONY_LEVEL)
- // use default SYMFONY_LEVEL and extra fixers:
- ->fixers(array(
- 'concat_with_spaces',
- 'ordered_use',
- 'phpdoc_order',
- 'strict',
- 'strict_param',
- 'long_array_syntax',
- ))
- ->finder(
- Symfony\CS\Finder\DefaultFinder::create()
- ->in(__DIR__)
+return PhpCsFixer\Config::create()
+ ->setRiskyAllowed(true)
+ ->setRules([
+ '@Symfony' => true,
+ '@Symfony:risky' => true,
+ 'combine_consecutive_unsets' => true,
+ 'heredoc_to_nowdoc' => true,
+ 'no_extra_consecutive_blank_lines' => array('break', 'continue', 'extra', 'return', 'throw', 'use', 'parenthesis_brace_block', 'square_brace_block', 'curly_brace_block'),
+ 'no_unreachable_default_argument_value' => true,
+ 'no_useless_else' => true,
+ 'no_useless_return' => true,
+ 'ordered_class_elements' => true,
+ 'ordered_imports' => true,
+ 'php_unit_strict' => false,
+ 'phpdoc_order' => true,
+ // 'psr4' => true,
+ 'strict_comparison' => true,
+ 'strict_param' => true,
+ 'concat_space' => array('spacing' => 'one'),
+ ])
+ ->setFinder(
+ PhpCsFixer\Finder::create()
->exclude(array('vendor'))
+ ->in(__DIR__)
)
;
diff --git a/composer.json b/composer.json
index 8f39405..4d9bd16 100644
--- a/composer.json
+++ b/composer.json
@@ -31,7 +31,7 @@
},
"require-dev": {
"satooshi/php-coveralls": "~0.6",
- "friendsofphp/php-cs-fixer": "<2",
+ "friendsofphp/php-cs-fixer": "~2.0",
"monolog/monolog": "^1.13",
"symfony/phpunit-bridge": "^3.2"
},
diff --git a/src/JSLikeHTMLElement.php b/src/JSLikeHTMLElement.php
index 15e7281..b908d06 100644
--- a/src/JSLikeHTMLElement.php
+++ b/src/JSLikeHTMLElement.php
@@ -45,7 +45,7 @@ class JSLikeHTMLElement extends \DOMElement
*/
public function __set($name, $value)
{
- if ($name !== 'innerHTML') {
+ if ('innerHTML' !== $name) {
$trace = debug_backtrace();
trigger_error('Undefined property via __set(): ' . $name . ' in ' . $trace[0]['file'] . ' on line ' . $trace[0]['line'], E_USER_NOTICE);
@@ -109,7 +109,7 @@ class JSLikeHTMLElement extends \DOMElement
*/
public function __get($name)
{
- if ($name === 'innerHTML') {
+ if ('innerHTML' === $name) {
$inner = '';
foreach ($this->childNodes as $child) {
@@ -121,8 +121,6 @@ class JSLikeHTMLElement extends \DOMElement
$trace = debug_backtrace();
trigger_error('Undefined property via __get(): ' . $name . ' in ' . $trace[0]['file'] . ' on line ' . $trace[0]['line'], E_USER_NOTICE);
-
- return;
}
public function __toString()
diff --git a/src/Readability.php b/src/Readability.php
index 9a2e9ba..5fdf85d 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -52,6 +52,23 @@ use Psr\Log\NullLogger;
*/
class Readability implements LoggerAwareInterface
{
+ // flags
+ const FLAG_STRIP_UNLIKELYS = 1;
+ const FLAG_WEIGHT_ATTRIBUTES = 2;
+ const FLAG_CLEAN_CONDITIONALLY = 4;
+ const FLAG_DISABLE_PREFILTER = 8;
+ const FLAG_DISABLE_POSTFILTER = 16;
+
+ // constants
+ const SCORE_CHARS_IN_PARAGRAPH = 100;
+ const SCORE_WORDS_IN_PARAGRAPH = 20;
+ const GRANDPARENT_SCORE_DIVISOR = 2.2;
+ const MIN_PARAGRAPH_LENGTH = 20;
+ const MIN_COMMAS_IN_PARAGRAPH = 6;
+ const MIN_ARTICLE_LENGTH = 200;
+ const MIN_NODE_LENGTH = 80;
+ const MAX_LINK_DENSITY = 0.25;
+
public $convertLinksToFootnotes = false;
public $revertForcedParagraphElements = true;
public $articleTitle;
@@ -65,19 +82,6 @@ class Readability implements LoggerAwareInterface
// no more used, keept to avoid BC
public $debug = false;
public $tidied = false;
- // article domain regexp for calibration
- protected $domainRegExp = null;
- protected $body = null;
- // Cache the body HTML in case we need to re-use it later
- protected $bodyCache = null;
- // 1 | 2 | 4; // Start with all processing flags set.
- protected $flags = 7;
- // indicates whether we were able to extract or not
- protected $success = false;
- protected $logger;
- protected $parser;
- protected $html;
- protected $useTidy;
/**
* All of the regular expressions in use within readability.
@@ -118,6 +122,19 @@ class Readability implements LoggerAwareInterface
'output-encoding' => 'utf8',
'hide-comments' => true,
);
+ // article domain regexp for calibration
+ protected $domainRegExp = null;
+ protected $body = null;
+ // Cache the body HTML in case we need to re-use it later
+ protected $bodyCache = null;
+ // 1 | 2 | 4; // Start with all processing flags set.
+ protected $flags = 7;
+ // indicates whether we were able to extract or not
+ protected $success = false;
+ protected $logger;
+ protected $parser;
+ protected $html;
+ protected $useTidy;
// raw HTML filters
protected $pre_filters = array(
// remove obvious scripts
@@ -151,22 +168,6 @@ class Readability implements LoggerAwareInterface
'!<[hb]r>!is' => '<\\1 />',
);
- // flags
- const FLAG_STRIP_UNLIKELYS = 1;
- const FLAG_WEIGHT_ATTRIBUTES = 2;
- const FLAG_CLEAN_CONDITIONALLY = 4;
- const FLAG_DISABLE_PREFILTER = 8;
- const FLAG_DISABLE_POSTFILTER = 16;
- // constants
- const SCORE_CHARS_IN_PARAGRAPH = 100;
- const SCORE_WORDS_IN_PARAGRAPH = 20;
- const GRANDPARENT_SCORE_DIVISOR = 2.2;
- const MIN_PARAGRAPH_LENGTH = 20;
- const MIN_COMMAS_IN_PARAGRAPH = 6;
- const MIN_ARTICLE_LENGTH = 200;
- const MIN_NODE_LENGTH = 80;
- const MAX_LINK_DENSITY = 0.25;
-
/**
* Create instance of Readability.
*
@@ -233,76 +234,6 @@ class Readability implements LoggerAwareInterface
$this->post_filters[$filter] = $replacer;
}
- /**
- * Load HTML in a DOMDocument.
- * Apply Pre filters
- * Cleanup HTML using Tidy (or not).
- *
- * @todo This should be called in init() instead of from __construct
- */
- private function loadHtml()
- {
- $this->original_html = $this->html;
-
- $this->logger->debug('Parsing URL: ' . $this->url);
-
- if ($this->url) {
- $this->domainRegExp = '/' . strtr(preg_replace('/www\d*\./', '', parse_url($this->url, PHP_URL_HOST)), array('.' => '\.')) . '/';
- }
-
- mb_internal_encoding('UTF-8');
- mb_http_output('UTF-8');
- mb_regex_encoding('UTF-8');
-
- // HACK: dirty cleanup to replace some stuff; shouldn't use regexps with HTML but well...
- if (!$this->flagIsActive(self::FLAG_DISABLE_PREFILTER)) {
- foreach ($this->pre_filters as $search => $replace) {
- $this->html = preg_replace($search, $replace, $this->html);
- }
- unset($search, $replace);
- }
-
- if (trim($this->html) === '') {
- $this->html = '';
- }
-
- /*
- * Use tidy (if it exists).
- * This fixes problems with some sites which would otherwise trouble DOMDocument's HTML parsing.
- * Although sometimes it makes matters worse, which is why there is an option to disable it.
- */
- if ($this->useTidy) {
- $this->logger->debug('Tidying document');
-
- $tidy = tidy_parse_string($this->html, $this->tidy_config, 'UTF8');
- if (tidy_clean_repair($tidy)) {
- $this->tidied = true;
- $this->html = $tidy->value;
- $this->html = preg_replace('/[\r\n]+/is', "\n", $this->html);
- }
- unset($tidy);
- }
-
- $this->html = mb_convert_encoding($this->html, 'HTML-ENTITIES', 'UTF-8');
-
- if (!($this->parser === 'html5lib' && ($this->dom = Parser::parse($this->html)))) {
- libxml_use_internal_errors(true);
-
- $this->dom = new \DOMDocument();
- $this->dom->preserveWhiteSpace = false;
-
- if (PHP_VERSION_ID >= 50400) {
- $this->dom->loadHTML($this->html, LIBXML_NOBLANKS | LIBXML_COMPACT | LIBXML_NOERROR);
- } else {
- $this->dom->loadHTML($this->html);
- }
-
- libxml_use_internal_errors(false);
- }
-
- $this->dom->registerNodeClass('DOMElement', 'Readability\JSLikeHTMLElement');
- }
-
/**
* Runs readability.
*
@@ -326,14 +257,14 @@ class Readability implements LoggerAwareInterface
$bodyElems = $this->dom->getElementsByTagName('body');
// WTF multiple body nodes?
- if ($this->bodyCache === null) {
+ if (null === $this->bodyCache) {
$this->bodyCache = '';
foreach ($bodyElems as $bodyNode) {
$this->bodyCache .= trim($bodyNode->innerHTML);
}
}
- if ($bodyElems->length > 0 && $this->body === null) {
+ if ($bodyElems->length > 0 && null === $this->body) {
$this->body = $bodyElems->item(0);
}
@@ -373,27 +304,6 @@ class Readability implements LoggerAwareInterface
return $this->success;
}
- /**
- * Debug.
- *
- * @deprecated use $this->logger->debug() instead
- * @codeCoverageIgnore
- */
- protected function dbg($msg)
- {
- $this->logger->debug($msg);
- }
-
- /**
- * Dump debug info.
- *
- * @deprecated since Monolog gather log, we don't need it
- * @codeCoverageIgnore
- */
- protected function dump_dbg()
- {
- }
-
/**
* Run any post-process modifications to article content as necessary.
*
@@ -406,77 +316,6 @@ class Readability implements LoggerAwareInterface
}
}
- /**
- * Get the article title as an H1.
- *
- * @return \DOMElement
- */
- protected function getArticleTitle()
- {
- try {
- $curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0));
- } catch (\Exception $e) {
- $curTitle = '';
- $origTitle = '';
- }
-
- if (preg_match('/ [\|\-] /', $curTitle)) {
- $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle);
- if (count(explode(' ', $curTitle)) < 3) {
- $curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle);
- }
- } elseif (strpos($curTitle, ': ') !== false) {
- $curTitle = preg_replace('/.*:(.*)/i', '$1', $origTitle);
- if (count(explode(' ', $curTitle)) < 3) {
- $curTitle = preg_replace('/[^:]*[:](.*)/i', '$1', $origTitle);
- }
- } elseif (mb_strlen($curTitle) > 150 || mb_strlen($curTitle) < 15) {
- $hOnes = $this->dom->getElementsByTagName('h1');
- if ($hOnes->length === 1) {
- $curTitle = $this->getInnerText($hOnes->item(0));
- }
- }
-
- $curTitle = trim($curTitle);
- if (count(explode(' ', $curTitle)) <= 4) {
- $curTitle = $origTitle;
- }
-
- $articleTitle = $this->dom->createElement('h1');
- $articleTitle->innerHTML = $curTitle;
-
- return $articleTitle;
- }
-
- /**
- * Prepare the HTML document for readability to scrape it.
- * This includes things like stripping javascript, CSS, and handling terrible markup.
- */
- protected function prepDocument()
- {
- /*
- * In some cases a body element can't be found (if the HTML is totally hosed for example)
- * so we create a new body node and append it to the document.
- */
- if ($this->body === null) {
- $this->body = $this->dom->createElement('body');
- $this->dom->documentElement->appendChild($this->body);
- }
-
- $this->body->setAttribute('class', 'readabilityBody');
-
- // Remove all style tags in head.
- $styleTags = $this->dom->getElementsByTagName('style');
- for ($i = $styleTags->length - 1; $i >= 0; --$i) {
- $styleTags->item($i)->parentNode->removeChild($styleTags->item($i));
- }
-
- $linkTags = $this->dom->getElementsByTagName('link');
- for ($i = $linkTags->length - 1; $i >= 0; --$i) {
- $linkTags->item($i)->parentNode->removeChild($linkTags->item($i));
- }
- }
-
/**
* For easier reading, convert this document to have footnotes at the bottom rather than inline links.
*
@@ -506,7 +345,7 @@ class Readability implements LoggerAwareInterface
}
$linkText = $this->getInnerText($articleLink);
- if ((strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote') !== false) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) {
+ if ((false !== strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote')) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) {
continue;
}
@@ -527,7 +366,7 @@ class Readability implements LoggerAwareInterface
$articleLink->setAttribute('style', 'color: inherit; text-decoration: none;');
$articleLink->setAttribute('name', 'readabilityLink-' . $linkCount);
$footnote->innerHTML = '^ ';
- $footnoteLink->innerHTML = ($footnoteLink->getAttribute('title') !== '' ? $footnoteLink->getAttribute('title') : $linkText);
+ $footnoteLink->innerHTML = ('' !== $footnoteLink->getAttribute('title') ? $footnoteLink->getAttribute('title') : $linkText);
$footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount);
$footnote->appendChild($footnoteLink);
@@ -589,7 +428,7 @@ class Readability implements LoggerAwareInterface
* already have a header.
*/
$h2s = $articleContent->getElementsByTagName('h2');
- if ($h2s->length === 1 && mb_strlen($this->getInnerText($h2s->item(0), true, true)) < 100) {
+ if (1 === $h2s->length && mb_strlen($this->getInnerText($h2s->item(0), true, true)) < 100) {
$this->clean($articleContent, 'h2');
}
@@ -614,7 +453,7 @@ class Readability implements LoggerAwareInterface
$audioCount = $item->getElementsByTagName('audio')->length;
$iframeCount = $item->getElementsByTagName('iframe')->length;
- if ($iframeCount === 0 && $imgCount === 0 && $embedCount === 0 && $objectCount === 0 && $videoCount === 0 && $audioCount === 0 && mb_strlen(preg_replace('/\s+/is', '', $this->getInnerText($item, false, false))) === 0) {
+ if (0 === $iframeCount && 0 === $imgCount && 0 === $embedCount && 0 === $objectCount && 0 === $videoCount && 0 === $audioCount && 0 === mb_strlen(preg_replace('/\s+/is', '', $this->getInnerText($item, false, false)))) {
$item->parentNode->removeChild($item);
}
@@ -640,806 +479,899 @@ class Readability implements LoggerAwareInterface
}
/**
- * Initialize a node with the readability object. Also checks the
- * className/id for special names to add to its score.
+ * Get the inner text of a node.
+ * This also strips out any excess whitespace to be found.
*
- * @param \DOMElement $node
+ * @param \DOMElement $e
+ * @param bool $normalizeSpaces (default: true)
+ * @param bool $flattenLines (default: false)
+ *
+ * @return string
*/
- protected function initializeNode($node)
+ public function getInnerText($e, $normalizeSpaces = true, $flattenLines = false)
{
- if (!isset($node->tagName)) {
+ if (null === $e || !isset($e->textContent) || '' === $e->textContent) {
+ return '';
+ }
+
+ $textContent = trim($e->textContent);
+
+ if ($flattenLines) {
+ $textContent = mb_ereg_replace('(?:[\r\n](?:\s|&nbsp;)*)+', '', $textContent);
+ } elseif ($normalizeSpaces) {
+ $textContent = mb_ereg_replace('\s\s+', ' ', $textContent);
+ }
+
+ return $textContent;
+ }
+
+ /**
+ * Remove the style attribute on every $e and under.
+ *
+ * @param \DOMElement $e
+ */
+ public function cleanStyles($e)
+ {
+ if (!is_object($e)) {
return;
}
- $readability = $this->dom->createAttribute('readability');
- // this is our contentScore
- $readability->value = 0;
- $node->setAttributeNode($readability);
+ $elems = $e->getElementsByTagName('*');
- // using strtoupper just in case
- switch (strtoupper($node->tagName)) {
- case 'ARTICLE':
- $readability->value += 15;
- case 'DIV':
- $readability->value += 5;
- break;
- case 'PRE':
- case 'CODE':
- case 'TD':
- case 'BLOCKQUOTE':
- case 'FIGURE':
- $readability->value += 3;
- break;
- case 'SECTION':
- // often misused
- // $readability->value += 2;
- break;
- case 'OL':
- case 'UL':
- case 'DL':
- case 'DD':
- case 'DT':
- case 'LI':
- $readability->value -= 2 * round($this->getLinkDensity($node), 0, PHP_ROUND_HALF_UP);
- break;
- case 'ASIDE':
- case 'FOOTER':
- case 'HEADER':
- case 'ADDRESS':
- case 'FORM':
- case 'BUTTON':
- case 'TEXTAREA':
- case 'INPUT':
- case 'NAV':
- $readability->value -= 3;
- break;
- case 'H1':
- case 'H2':
- case 'H3':
- case 'H4':
- case 'H5':
- case 'H6':
- case 'TH':
- case 'HGROUP':
- $readability->value -= 5;
- break;
+ foreach ($elems as $elem) {
+ $elem->removeAttribute('style');
}
+ }
- $readability->value += $this->getWeight($node);
+ /**
+ * Get the number of commas in a given text.
+ *
+ * @param string $text
+ *
+ * @return int
+ */
+ public function getCommaCount($text)
+ {
+ return substr_count($text, ',');
}
/**
- * Using a variety of metrics (content score, classname, element types), find the content that is
- * most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
+ * Get the number of words in a given text, approximated by counting the spaces that separate them.
+ * Input string should be normalized.
*
- * @param \DOMElement $page
+ * @param string $text
*
- * @return \DOMElement|bool
+ * @return int
*/
- protected function grabArticle($page = null)
+ public function getWordCount($text)
{
- if (!$page) {
- $page = $this->dom;
- }
+ return substr_count($text, ' ');
+ }
- $xpath = null;
- $nodesToScore = array();
+ /**
+ * Get the density of links as a percentage of the content
+ * This is the amount of text that is inside a link divided by the total text in the node.
+ * Can exclude external references to differentiate between simple text and menus/infoblocks.
+ *
+ * @param \DOMElement $e
+ * @param bool $excludeExternal
+ *
+ * @return float|int
+ */
+ public function getLinkDensity($e, $excludeExternal = false)
+ {
+ $links = $e->getElementsByTagName('a');
+ $textLength = mb_strlen($this->getInnerText($e, true, true));
+ $linkLength = 0;
- if ($page instanceof \DOMDocument && isset($page->documentElement)) {
- $xpath = new \DOMXPath($page);
+ for ($dRe = $this->domainRegExp, $i = 0, $il = $links->length; $i < $il; ++$i) {
+ if ($excludeExternal && $dRe && !preg_match($dRe, $links->item($i)->getAttribute('href'))) {
+ continue;
+ }
+ $linkLength += mb_strlen($this->getInnerText($links->item($i)));
}
- $allElements = $page->getElementsByTagName('*');
+ if ($textLength > 0 && $linkLength > 0) {
+ return $linkLength / $textLength;
+ }
- for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); ++$nodeIndex) {
- $tagName = $node->tagName;
+ return 0;
+ }
- // Some well known site uses sections as paragraphs.
- if (strcasecmp($tagName, 'p') === 0 || strcasecmp($tagName, 'td') === 0 || strcasecmp($tagName, 'pre') === 0 || strcasecmp($tagName, 'section') === 0) {
- $nodesToScore[] = $node;
- }
+ /**
+ * Get an element relative weight.
+ *
+ * @param \DOMElement $e
+ *
+ * @return int
+ */
+ public function getWeight($e)
+ {
+ if (!$this->flagIsActive(self::FLAG_WEIGHT_ATTRIBUTES)) {
+ return 0;
+ }
- // Turn divs into P tags where they have been used inappropriately
- // (as in, where they contain no other block level elements).
- if (strcasecmp($tagName, 'div') === 0 || strcasecmp($tagName, 'article') === 0 || strcasecmp($tagName, 'section') === 0) {
- if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) {
- $newNode = $this->dom->createElement('p');
+ $weight = 0;
+ // Look for a special classname
+ $weight += $this->weightAttribute($e, 'class');
+ // Look for a special ID
+ $weight += $this->weightAttribute($e, 'id');
- try {
- $newNode->innerHTML = $node->innerHTML;
+ return $weight;
+ }
- $node->parentNode->replaceChild($newNode, $node);
- --$nodeIndex;
- $nodesToScore[] = $newNode;
- } catch (\Exception $e) {
- $this->logger->error('Could not alter div/article to p, reverting back to div: ' . $e->getMessage());
- }
- } else {
- // Will change these P elements back to text nodes after processing.
- for ($i = 0, $il = $node->childNodes->length; $i < $il; ++$i) {
- $childNode = $node->childNodes->item($i);
+ /**
+ * Remove extraneous break tags from a node.
+ *
+ * @param \DOMElement $node
+ */
+ public function killBreaks($node)
+ {
+ $html = $node->innerHTML;
+ $html = preg_replace($this->regexps['killBreaks'], '<br />', $html);
+ $node->innerHTML = $html;
+ }
- // it looks like sometimes the loop is going too far and we are retrieving a non-existant child
- if (null === $childNode) {
- continue;
- }
+ /**
+ * Clean a node of all elements of type "tag".
+ * (Unless it's a youtube/vimeo video. People love movies.).
+ *
+ * Updated 2012-09-18 to preserve youtube/vimeo iframes
+ *
+ * @param \DOMElement $e
+ * @param string $tag
+ */
+ public function clean($e, $tag)
+ {
+ $currentItem = null;
+ $targetList = $e->getElementsByTagName($tag);
+ $isEmbed = ('audio' === $tag || 'video' === $tag || 'iframe' === $tag || 'object' === $tag || 'embed' === $tag);
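- // executable tags (<?php or <?xml) warning
- if (is_object($childNode) && get_class($childNode) === 'DOMProcessingInstruction') {
- $childNode->parentNode->removeChild($childNode);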
+ for ($y = $targetList->length - 1; $y >= 0; --$y) {
+ // Allow youtube and vimeo videos through as people usually want to see those.
+ $currentItem = $targetList->item($y);
- continue;
- }
+ if ($isEmbed) {
+ $attributeValues = $currentItem->getAttribute('src') . ' ' . $currentItem->getAttribute('href');
- if ($childNode->nodeType === XML_TEXT_NODE) {
- $p = $this->dom->createElement('p');
- $p->innerHTML = $childNode->nodeValue;
- $p->setAttribute('data-readability-styled', 'true');
- $childNode->parentNode->replaceChild($p, $childNode);
- }
- }
+ // First, check the elements attributes to see if any of them contain known media hosts
+ if (preg_match($this->regexps['media'], $attributeValues)) {
+ continue;
+ }
+
+ // Then check the elements inside this element for the same.
+ if (preg_match($this->regexps['media'], $targetList->item($y)->innerHTML)) {
+ continue;
}
}
+
+ $currentItem->parentNode->removeChild($currentItem);
+ }
+ }
+
+ /**
+ * Clean an element of all tags of type "tag" if they look fishy.
+ * "Fishy" is an algorithm based on content length, classnames,
+ * link density, number of images & embeds, etc.
+ *
+ * @param \DOMElement $e
+ * @param string $tag
+ */
+ public function cleanConditionally($e, $tag)
+ {
+ if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
+ return;
}
+ $tagsList = $e->getElementsByTagName($tag);
+ $curTagsLength = $tagsList->length;
+ $node = null;
+
/*
- * Loop through all paragraphs, and assign a score to them based on how content-y they look.
- * Then add their score to their parent node.
+ * Gather counts for other typical elements embedded within.
+ * Traverse backwards so we can remove nodes at the same time without affecting the traversal.
*
- * A score is determined by things like number of commas, class names, etc.
- * Maybe eventually link density.
+ * TODO: Consider taking into account original contentScore here.
*/
- for ($pt = 0, $scored = count($nodesToScore); $pt < $scored; ++$pt) {
- $parentNode = $nodesToScore[$pt]->parentNode;
-
- // No parent node? Move on...
- if (!$parentNode) {
- continue;
- }
+ for ($i = $curTagsLength - 1; $i >= 0; --$i) {
+ $node = $tagsList->item($i);
+ $weight = $this->getWeight($node);
+ $contentScore = ($node->hasAttribute('readability')) ? (int) $node->getAttribute('readability') : 0;
+ $this->logger->debug('Start conditional cleaning of ' . $node->getNodePath() . ' (class=' . $node->getAttribute('class') . '; id=' . $node->getAttribute('id') . ')' . (($node->hasAttribute('readability')) ? (' with score ' . $node->getAttribute('readability')) : ''));
- $grandParentNode = $parentNode->parentNode instanceof \DOMElement ? $parentNode->parentNode : null;
- $innerText = $this->getInnerText($nodesToScore[$pt]);
+ if ($weight + $contentScore < 0) {
+ $this->logger->debug('Removing...');
+ $node->parentNode->removeChild($node);
+ } elseif ($this->getCommaCount($this->getInnerText($node)) < self::MIN_COMMAS_IN_PARAGRAPH) {
+ /*
+ * If there are not very many commas, and the number of
+ * non-paragraph elements is more than paragraphs or other ominous signs, remove the element.
+ */
+ $p = $node->getElementsByTagName('p')->length;
+ $img = $node->getElementsByTagName('img')->length;
+ $li = $node->getElementsByTagName('li')->length - 100;
+ $input = $node->getElementsByTagName('input')->length;
+ $a = $node->getElementsByTagName('a')->length;
+ $embedCount = 0;
+ $embeds = $node->getElementsByTagName('embed');
- // If this paragraph is less than MIN_PARAGRAPH_LENGTH (default:20) characters, don't even count it.
- if (mb_strlen($innerText) < self::MIN_PARAGRAPH_LENGTH) {
- continue;
- }
-
- // Initialize readability data for the parent.
- if (!$parentNode->hasAttribute('readability')) {
- $this->initializeNode($parentNode);
- $parentNode->setAttribute('data-candidate', 'true');
- }
-
- // Initialize readability data for the grandparent.
- if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName)) {
- $this->initializeNode($grandParentNode);
- $grandParentNode->setAttribute('data-candidate', 'true');
- }
- // Add a point for the paragraph itself as a base.
- $contentScore = 1;
- // Add points for any commas within this paragraph.
- $contentScore += $this->getCommaCount($innerText);
- // For every SCORE_CHARS_IN_PARAGRAPH (default:100) characters in this paragraph, add another point. Up to 3 points.
- $contentScore += min(floor(mb_strlen($innerText) / self::SCORE_CHARS_IN_PARAGRAPH), 3);
- // For every SCORE_WORDS_IN_PARAGRAPH (default:20) words in this paragraph, add another point. Up to 3 points.
- $contentScore += min(floor($this->getWordCount($innerText) / self::SCORE_WORDS_IN_PARAGRAPH), 3);
- /* TEST: For every positive/negative parent tag, add/substract half point. Up to 3 points. *\/
- $up = $nodesToScore[$pt];
- $score = 0;
- while ($up->parentNode instanceof \DOMElement) {
- $up = $up->parentNode;
- if (preg_match($this->regexps['positive'], $up->getAttribute('class') . ' ' . $up->getAttribute('id'))) {
- $score += 0.5;
- } elseif (preg_match($this->regexps['negative'], $up->getAttribute('class') . ' ' . $up->getAttribute('id'))) {
- $score -= 0.5;
- }
- }
- $score = floor($score);
- $contentScore += max(min($score, 3), -3);/**/
-
- // Add the score to the parent. The grandparent gets half.
- $parentNode->getAttributeNode('readability')->value += $contentScore;
- if ($grandParentNode) {
- $grandParentNode->getAttributeNode('readability')->value += $contentScore / self::GRANDPARENT_SCORE_DIVISOR;
- }
- }
-
- /*
- * Node prepping: trash nodes that look cruddy (like ones with the class name "comment", etc).
- * This is faster to do before scoring but safer after.
- */
- if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS) && $xpath) {
- $candidates = $xpath->query('.//*[(self::footer and count(//footer)<2) or (self::aside and count(//aside)<2)]', $page->documentElement);
- $node = null;
-
- for ($c = $candidates->length - 1; $c >= 0; --$c) {
- $node = $candidates->item($c);
- // node should be readable but not inside of an article otherwise it's probably non-readable block
- if ($node->hasAttribute('readability') && (int) $node->getAttributeNode('readability')->value < 40 && ($node->parentNode ? strcasecmp($node->parentNode->tagName, 'article') !== 0 : true)) {
- $this->logger->debug('Removing unlikely candidate (using note) ' . $node->getNodePath() . ' by "' . $node->tagName . '" with readability ' . ($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0));
- $node->parentNode->removeChild($node);
- }
- }
-
- $candidates = $xpath->query('.//*[not(self::body) and (@class or @id or @style) and ((number(@readability) < 40) or not(@readability))]', $page->documentElement);
- $node = null;
-
- for ($c = $candidates->length - 1; $c >= 0; --$c) {
- $node = $candidates->item($c);
-
- // Remove unlikely candidates
- $unlikelyMatchString = $node->getAttribute('class') . ' ' . $node->getAttribute('id') . ' ' . $node->getAttribute('style');
-
- if (mb_strlen($unlikelyMatchString) > 3 && // don't process "empty" strings
- preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) &&
- !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString)
- ) {
- $this->logger->debug('Removing unlikely candidate (using conf) ' . $node->getNodePath() . ' by "' . $unlikelyMatchString . '" with readability ' . ($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0));
- $node->parentNode->removeChild($node);
- --$nodeIndex;
+ for ($ei = 0, $il = $embeds->length; $ei < $il; ++$ei) {
+ if (preg_match($this->regexps['media'], $embeds->item($ei)->getAttribute('src'))) {
+ ++$embedCount;
+ }
}
- }
- unset($candidates);
- }
- /*
- * After we've calculated scores, loop through all of the possible candidate nodes we found
- * and find the one with the highest score.
- */
- $topCandidate = null;
- if ($xpath) {
- // Using array of DOMElements after deletion is a path to DOOMElement.
- $candidates = $xpath->query('.//*[@data-candidate]', $page->documentElement);
-
- for ($c = $candidates->length - 1; $c >= 0; --$c) {
- $item = $candidates->item($c);
-
- // Scale the final candidates score based on link density. Good content should have a
- // relatively small link density (5% or less) and be mostly unaffected by this operation.
- // If not for this we would have used XPath to find maximum @readability.
- $readability = $item->getAttributeNode('readability');
- $readability->value = round($readability->value * (1 - $this->getLinkDensity($item)), 0, PHP_ROUND_HALF_UP);
-
- if (!$topCandidate || $readability->value > (int) $topCandidate->getAttribute('readability')) {
- $this->logger->debug('Candidate: ' . $item->getNodePath() . ' (' . $item->getAttribute('class') . ':' . $item->getAttribute('id') . ') with score ' . $readability->value);
- $topCandidate = $item;
+ $embeds = $node->getElementsByTagName('iframe');
+ for ($ei = 0, $il = $embeds->length; $ei < $il; ++$ei) {
+ if (preg_match($this->regexps['media'], $embeds->item($ei)->getAttribute('src'))) {
+ ++$embedCount;
+ }
}
- }
-
- unset($candidates);
- }
- /*
- * If we still have no top candidate, just use the body as a last resort.
- * We also have to copy the body node so it is something we can modify.
- */
- if ($topCandidate === null || strcasecmp($topCandidate->tagName, 'body') === 0) {
- $topCandidate = $this->dom->createElement('div');
+ $linkDensity = $this->getLinkDensity($node, true);
+ $contentLength = mb_strlen($this->getInnerText($node));
+ $toRemove = false;
- if ($page instanceof \DOMDocument) {
- if (!isset($page->documentElement)) {
- // we don't have a body either? what a mess! :)
- $this->logger->debug('The page has no body!');
+ if ($this->lightClean) {
+ if ($li > $p && 'ul' !== $tag && 'ol' !== $tag) {
+ $this->logger->debug(' too many