diff --git a/.php_cs b/.php_cs index 240846c..8aac3e6 100644 --- a/.php_cs +++ b/.php_cs @@ -1,20 +1,28 @@ setUsingCache(true) - ->level(Symfony\CS\FixerInterface::SYMFONY_LEVEL) - // use default SYMFONY_LEVEL and extra fixers: - ->fixers(array( - 'concat_with_spaces', - 'ordered_use', - 'phpdoc_order', - 'strict', - 'strict_param', - 'long_array_syntax', - )) - ->finder( - Symfony\CS\Finder\DefaultFinder::create() - ->in(__DIR__) +return PhpCsFixer\Config::create() + ->setRiskyAllowed(true) + ->setRules([ + '@Symfony' => true, + '@Symfony:risky' => true, + 'combine_consecutive_unsets' => true, + 'heredoc_to_nowdoc' => true, + 'no_extra_consecutive_blank_lines' => array('break', 'continue', 'extra', 'return', 'throw', 'use', 'parenthesis_brace_block', 'square_brace_block', 'curly_brace_block'), + 'no_unreachable_default_argument_value' => true, + 'no_useless_else' => true, + 'no_useless_return' => true, + 'ordered_class_elements' => true, + 'ordered_imports' => true, + 'php_unit_strict' => false, + 'phpdoc_order' => true, + // 'psr4' => true, + 'strict_comparison' => true, + 'strict_param' => true, + 'concat_space' => array('spacing' => 'one'), + ]) + ->setFinder( + PhpCsFixer\Finder::create() ->exclude(array('vendor')) + ->in(__DIR__) ) ; diff --git a/composer.json b/composer.json index 8f39405..4d9bd16 100644 --- a/composer.json +++ b/composer.json @@ -31,7 +31,7 @@ }, "require-dev": { "satooshi/php-coveralls": "~0.6", - "friendsofphp/php-cs-fixer": "<2", + "friendsofphp/php-cs-fixer": "~2.0", "monolog/monolog": "^1.13", "symfony/phpunit-bridge": "^3.2" }, diff --git a/src/JSLikeHTMLElement.php b/src/JSLikeHTMLElement.php index 15e7281..b908d06 100644 --- a/src/JSLikeHTMLElement.php +++ b/src/JSLikeHTMLElement.php @@ -45,7 +45,7 @@ class JSLikeHTMLElement extends \DOMElement */ public function __set($name, $value) { - if ($name !== 'innerHTML') { + if ('innerHTML' !== $name) { $trace = debug_backtrace(); trigger_error('Undefined property via __set(): ' . $name . ' in ' . $trace[0]['file'] . ' on line ' . $trace[0]['line'], E_USER_NOTICE); @@ -109,7 +109,7 @@ class JSLikeHTMLElement extends \DOMElement */ public function __get($name) { - if ($name === 'innerHTML') { + if ('innerHTML' === $name) { $inner = ''; foreach ($this->childNodes as $child) { @@ -121,8 +121,6 @@ class JSLikeHTMLElement extends \DOMElement $trace = debug_backtrace(); trigger_error('Undefined property via __get(): ' . $name . ' in ' . $trace[0]['file'] . ' on line ' . $trace[0]['line'], E_USER_NOTICE); - - return; } public function __toString() diff --git a/src/Readability.php b/src/Readability.php index 9a2e9ba..5fdf85d 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -52,6 +52,23 @@ use Psr\Log\NullLogger; */ class Readability implements LoggerAwareInterface { + // flags + const FLAG_STRIP_UNLIKELYS = 1; + const FLAG_WEIGHT_ATTRIBUTES = 2; + const FLAG_CLEAN_CONDITIONALLY = 4; + const FLAG_DISABLE_PREFILTER = 8; + const FLAG_DISABLE_POSTFILTER = 16; + + // constants + const SCORE_CHARS_IN_PARAGRAPH = 100; + const SCORE_WORDS_IN_PARAGRAPH = 20; + const GRANDPARENT_SCORE_DIVISOR = 2.2; + const MIN_PARAGRAPH_LENGTH = 20; + const MIN_COMMAS_IN_PARAGRAPH = 6; + const MIN_ARTICLE_LENGTH = 200; + const MIN_NODE_LENGTH = 80; + const MAX_LINK_DENSITY = 0.25; + public $convertLinksToFootnotes = false; public $revertForcedParagraphElements = true; public $articleTitle; @@ -65,19 +82,6 @@ class Readability implements LoggerAwareInterface // no more used, keept to avoid BC public $debug = false; public $tidied = false; - // article domain regexp for calibration - protected $domainRegExp = null; - protected $body = null; - // Cache the body HTML in case we need to re-use it later - protected $bodyCache = null; - // 1 | 2 | 4; // Start with all processing flags set. - protected $flags = 7; - // indicates whether we were able to extract or not - protected $success = false; - protected $logger; - protected $parser; - protected $html; - protected $useTidy; /** * All of the regular expressions in use within readability. @@ -118,6 +122,19 @@ class Readability implements LoggerAwareInterface 'output-encoding' => 'utf8', 'hide-comments' => true, ); + // article domain regexp for calibration + protected $domainRegExp = null; + protected $body = null; + // Cache the body HTML in case we need to re-use it later + protected $bodyCache = null; + // 1 | 2 | 4; // Start with all processing flags set. + protected $flags = 7; + // indicates whether we were able to extract or not + protected $success = false; + protected $logger; + protected $parser; + protected $html; + protected $useTidy; // raw HTML filters protected $pre_filters = array( // remove obvious scripts @@ -151,22 +168,6 @@ class Readability implements LoggerAwareInterface '!<[hb]r>!is' => '<\\1 />', ); - // flags - const FLAG_STRIP_UNLIKELYS = 1; - const FLAG_WEIGHT_ATTRIBUTES = 2; - const FLAG_CLEAN_CONDITIONALLY = 4; - const FLAG_DISABLE_PREFILTER = 8; - const FLAG_DISABLE_POSTFILTER = 16; - // constants - const SCORE_CHARS_IN_PARAGRAPH = 100; - const SCORE_WORDS_IN_PARAGRAPH = 20; - const GRANDPARENT_SCORE_DIVISOR = 2.2; - const MIN_PARAGRAPH_LENGTH = 20; - const MIN_COMMAS_IN_PARAGRAPH = 6; - const MIN_ARTICLE_LENGTH = 200; - const MIN_NODE_LENGTH = 80; - const MAX_LINK_DENSITY = 0.25; - /** * Create instance of Readability. * @@ -233,76 +234,6 @@ class Readability implements LoggerAwareInterface $this->post_filters[$filter] = $replacer; } - /** - * Load HTML in a DOMDocument. - * Apply Pre filters - * Cleanup HTML using Tidy (or not). - * - * @todo This should be called in init() instead of from __construct - */ - private function loadHtml() - { - $this->original_html = $this->html; - - $this->logger->debug('Parsing URL: ' . $this->url); - - if ($this->url) { - $this->domainRegExp = '/' . strtr(preg_replace('/www\d*\./', '', parse_url($this->url, PHP_URL_HOST)), array('.' => '\.')) . '/'; - } - - mb_internal_encoding('UTF-8'); - mb_http_output('UTF-8'); - mb_regex_encoding('UTF-8'); - - // HACK: dirty cleanup to replace some stuff; shouldn't use regexps with HTML but well... - if (!$this->flagIsActive(self::FLAG_DISABLE_PREFILTER)) { - foreach ($this->pre_filters as $search => $replace) { - $this->html = preg_replace($search, $replace, $this->html); - } - unset($search, $replace); - } - - if (trim($this->html) === '') { - $this->html = ''; - } - - /* - * Use tidy (if it exists). - * This fixes problems with some sites which would otherwise trouble DOMDocument's HTML parsing. - * Although sometimes it makes matters worse, which is why there is an option to disable it. - */ - if ($this->useTidy) { - $this->logger->debug('Tidying document'); - - $tidy = tidy_parse_string($this->html, $this->tidy_config, 'UTF8'); - if (tidy_clean_repair($tidy)) { - $this->tidied = true; - $this->html = $tidy->value; - $this->html = preg_replace('/[\r\n]+/is', "\n", $this->html); - } - unset($tidy); - } - - $this->html = mb_convert_encoding($this->html, 'HTML-ENTITIES', 'UTF-8'); - - if (!($this->parser === 'html5lib' && ($this->dom = Parser::parse($this->html)))) { - libxml_use_internal_errors(true); - - $this->dom = new \DOMDocument(); - $this->dom->preserveWhiteSpace = false; - - if (PHP_VERSION_ID >= 50400) { - $this->dom->loadHTML($this->html, LIBXML_NOBLANKS | LIBXML_COMPACT | LIBXML_NOERROR); - } else { - $this->dom->loadHTML($this->html); - } - - libxml_use_internal_errors(false); - } - - $this->dom->registerNodeClass('DOMElement', 'Readability\JSLikeHTMLElement'); - } - /** * Runs readability. * @@ -326,14 +257,14 @@ class Readability implements LoggerAwareInterface $bodyElems = $this->dom->getElementsByTagName('body'); // WTF multiple body nodes? - if ($this->bodyCache === null) { + if (null === $this->bodyCache) { $this->bodyCache = ''; foreach ($bodyElems as $bodyNode) { $this->bodyCache .= trim($bodyNode->innerHTML); } } - if ($bodyElems->length > 0 && $this->body === null) { + if ($bodyElems->length > 0 && null === $this->body) { $this->body = $bodyElems->item(0); } @@ -373,27 +304,6 @@ class Readability implements LoggerAwareInterface return $this->success; } - /** - * Debug. - * - * @deprecated use $this->logger->debug() instead - * @codeCoverageIgnore - */ - protected function dbg($msg) - { - $this->logger->debug($msg); - } - - /** - * Dump debug info. - * - * @deprecated since Monolog gather log, we don't need it - * @codeCoverageIgnore - */ - protected function dump_dbg() - { - } - /** * Run any post-process modifications to article content as necessary. * @@ -406,77 +316,6 @@ class Readability implements LoggerAwareInterface } } - /** - * Get the article title as an H1. - * - * @return \DOMElement - */ - protected function getArticleTitle() - { - try { - $curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0)); - } catch (\Exception $e) { - $curTitle = ''; - $origTitle = ''; - } - - if (preg_match('/ [\|\-] /', $curTitle)) { - $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle); - if (count(explode(' ', $curTitle)) < 3) { - $curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle); - } - } elseif (strpos($curTitle, ': ') !== false) { - $curTitle = preg_replace('/.*:(.*)/i', '$1', $origTitle); - if (count(explode(' ', $curTitle)) < 3) { - $curTitle = preg_replace('/[^:]*[:](.*)/i', '$1', $origTitle); - } - } elseif (mb_strlen($curTitle) > 150 || mb_strlen($curTitle) < 15) { - $hOnes = $this->dom->getElementsByTagName('h1'); - if ($hOnes->length === 1) { - $curTitle = $this->getInnerText($hOnes->item(0)); - } - } - - $curTitle = trim($curTitle); - if (count(explode(' ', $curTitle)) <= 4) { - $curTitle = $origTitle; - } - - $articleTitle = $this->dom->createElement('h1'); - $articleTitle->innerHTML = $curTitle; - - return $articleTitle; - } - - /** - * Prepare the HTML document for readability to scrape it. - * This includes things like stripping javascript, CSS, and handling terrible markup. - */ - protected function prepDocument() - { - /* - * In some cases a body element can't be found (if the HTML is totally hosed for example) - * so we create a new body node and append it to the document. - */ - if ($this->body === null) { - $this->body = $this->dom->createElement('body'); - $this->dom->documentElement->appendChild($this->body); - } - - $this->body->setAttribute('class', 'readabilityBody'); - - // Remove all style tags in head. - $styleTags = $this->dom->getElementsByTagName('style'); - for ($i = $styleTags->length - 1; $i >= 0; --$i) { - $styleTags->item($i)->parentNode->removeChild($styleTags->item($i)); - } - - $linkTags = $this->dom->getElementsByTagName('link'); - for ($i = $linkTags->length - 1; $i >= 0; --$i) { - $linkTags->item($i)->parentNode->removeChild($linkTags->item($i)); - } - } - /** * For easier reading, convert this document to have footnotes at the bottom rather than inline links. * @@ -506,7 +345,7 @@ class Readability implements LoggerAwareInterface } $linkText = $this->getInnerText($articleLink); - if ((strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote') !== false) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) { + if ((false !== strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote')) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) { continue; } @@ -527,7 +366,7 @@ class Readability implements LoggerAwareInterface $articleLink->setAttribute('style', 'color: inherit; text-decoration: none;'); $articleLink->setAttribute('name', 'readabilityLink-' . $linkCount); $footnote->innerHTML = '^ '; - $footnoteLink->innerHTML = ($footnoteLink->getAttribute('title') !== '' ? $footnoteLink->getAttribute('title') : $linkText); + $footnoteLink->innerHTML = ('' !== $footnoteLink->getAttribute('title') ? $footnoteLink->getAttribute('title') : $linkText); $footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount); $footnote->appendChild($footnoteLink); @@ -589,7 +428,7 @@ class Readability implements LoggerAwareInterface * already have a header. */ $h2s = $articleContent->getElementsByTagName('h2'); - if ($h2s->length === 1 && mb_strlen($this->getInnerText($h2s->item(0), true, true)) < 100) { + if (1 === $h2s->length && mb_strlen($this->getInnerText($h2s->item(0), true, true)) < 100) { $this->clean($articleContent, 'h2'); } @@ -614,7 +453,7 @@ class Readability implements LoggerAwareInterface $audioCount = $item->getElementsByTagName('audio')->length; $iframeCount = $item->getElementsByTagName('iframe')->length; - if ($iframeCount === 0 && $imgCount === 0 && $embedCount === 0 && $objectCount === 0 && $videoCount === 0 && $audioCount === 0 && mb_strlen(preg_replace('/\s+/is', '', $this->getInnerText($item, false, false))) === 0) { + if (0 === $iframeCount && 0 === $imgCount && 0 === $embedCount && 0 === $objectCount && 0 === $videoCount && 0 === $audioCount && 0 === mb_strlen(preg_replace('/\s+/is', '', $this->getInnerText($item, false, false)))) { $item->parentNode->removeChild($item); } @@ -640,806 +479,899 @@ class Readability implements LoggerAwareInterface } /** - * Initialize a node with the readability object. Also checks the - * className/id for special names to add to its score. + * Get the inner text of a node. + * This also strips out any excess whitespace to be found. * - * @param \DOMElement $node + * @param \DOMElement $e + * @param bool $normalizeSpaces (default: true) + * @param bool $flattenLines (default: false) + * + * @return string */ - protected function initializeNode($node) + public function getInnerText($e, $normalizeSpaces = true, $flattenLines = false) { - if (!isset($node->tagName)) { + if (null === $e || !isset($e->textContent) || '' === $e->textContent) { + return ''; + } + + $textContent = trim($e->textContent); + + if ($flattenLines) { + $textContent = mb_ereg_replace('(?:[\r\n](?:\s| )*)+', '', $textContent); + } elseif ($normalizeSpaces) { + $textContent = mb_ereg_replace('\s\s+', ' ', $textContent); + } + + return $textContent; + } + + /** + * Remove the style attribute on every $e and under. + * + * @param \DOMElement $e + */ + public function cleanStyles($e) + { + if (!is_object($e)) { return; } - $readability = $this->dom->createAttribute('readability'); - // this is our contentScore - $readability->value = 0; - $node->setAttributeNode($readability); + $elems = $e->getElementsByTagName('*'); - // using strtoupper just in case - switch (strtoupper($node->tagName)) { - case 'ARTICLE': - $readability->value += 15; - case 'DIV': - $readability->value += 5; - break; - case 'PRE': - case 'CODE': - case 'TD': - case 'BLOCKQUOTE': - case 'FIGURE': - $readability->value += 3; - break; - case 'SECTION': - // often misused - // $readability->value += 2; - break; - case 'OL': - case 'UL': - case 'DL': - case 'DD': - case 'DT': - case 'LI': - $readability->value -= 2 * round($this->getLinkDensity($node), 0, PHP_ROUND_HALF_UP); - break; - case 'ASIDE': - case 'FOOTER': - case 'HEADER': - case 'ADDRESS': - case 'FORM': - case 'BUTTON': - case 'TEXTAREA': - case 'INPUT': - case 'NAV': - $readability->value -= 3; - break; - case 'H1': - case 'H2': - case 'H3': - case 'H4': - case 'H5': - case 'H6': - case 'TH': - case 'HGROUP': - $readability->value -= 5; - break; + foreach ($elems as $elem) { + $elem->removeAttribute('style'); } + } - $readability->value += $this->getWeight($node); + /** + * Get comma number for a given text. + * + * @param string $text + * + * @return int + */ + public function getCommaCount($text) + { + return substr_count($text, ','); } /** - * Using a variety of metrics (content score, classname, element types), find the content that is - * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. + * Get words number for a given text if words separated by a space. + * Input string should be normalized. * - * @param \DOMElement $page + * @param string $text * - * @return \DOMElement|bool + * @return int */ - protected function grabArticle($page = null) + public function getWordCount($text) { - if (!$page) { - $page = $this->dom; - } + return substr_count($text, ' '); + } - $xpath = null; - $nodesToScore = array(); + /** + * Get the density of links as a percentage of the content + * This is the amount of text that is inside a link divided by the total text in the node. + * Can exclude external references to differentiate between simple text and menus/infoblocks. + * + * @param \DOMElement $e + * @param string $excludeExternal + * + * @return int + */ + public function getLinkDensity($e, $excludeExternal = false) + { + $links = $e->getElementsByTagName('a'); + $textLength = mb_strlen($this->getInnerText($e, true, true)); + $linkLength = 0; - if ($page instanceof \DOMDocument && isset($page->documentElement)) { - $xpath = new \DOMXPath($page); + for ($dRe = $this->domainRegExp, $i = 0, $il = $links->length; $i < $il; ++$i) { + if ($excludeExternal && $dRe && !preg_match($dRe, $links->item($i)->getAttribute('href'))) { + continue; + } + $linkLength += mb_strlen($this->getInnerText($links->item($i))); } - $allElements = $page->getElementsByTagName('*'); + if ($textLength > 0 && $linkLength > 0) { + return $linkLength / $textLength; + } - for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); ++$nodeIndex) { - $tagName = $node->tagName; + return 0; + } - // Some well known site uses sections as paragraphs. - if (strcasecmp($tagName, 'p') === 0 || strcasecmp($tagName, 'td') === 0 || strcasecmp($tagName, 'pre') === 0 || strcasecmp($tagName, 'section') === 0) { - $nodesToScore[] = $node; - } + /** + * Get an element relative weight. + * + * @param \DOMElement $e + * + * @return int + */ + public function getWeight($e) + { + if (!$this->flagIsActive(self::FLAG_WEIGHT_ATTRIBUTES)) { + return 0; + } - // Turn divs into P tags where they have been used inappropriately - // (as in, where they contain no other block level elements). - if (strcasecmp($tagName, 'div') === 0 || strcasecmp($tagName, 'article') === 0 || strcasecmp($tagName, 'section') === 0) { - if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) { - $newNode = $this->dom->createElement('p'); + $weight = 0; + // Look for a special classname + $weight += $this->weightAttribute($e, 'class'); + // Look for a special ID + $weight += $this->weightAttribute($e, 'id'); - try { - $newNode->innerHTML = $node->innerHTML; + return $weight; + } - $node->parentNode->replaceChild($newNode, $node); - --$nodeIndex; - $nodesToScore[] = $newNode; - } catch (\Exception $e) { - $this->logger->error('Could not alter div/article to p, reverting back to div: ' . $e->getMessage()); - } - } else { - // Will change these P elements back to text nodes after processing. - for ($i = 0, $il = $node->childNodes->length; $i < $il; ++$i) { - $childNode = $node->childNodes->item($i); + /** + * Remove extraneous break tags from a node. + * + * @param \DOMElement $node + */ + public function killBreaks($node) + { + $html = $node->innerHTML; + $html = preg_replace($this->regexps['killBreaks'], '
', $html); + $node->innerHTML = $html; + } - // it looks like sometimes the loop is going too far and we are retrieving a non-existant child - if (null === $childNode) { - continue; - } + /** + * Clean a node of all elements of type "tag". + * (Unless it's a youtube/vimeo video. People love movies.). + * + * Updated 2012-09-18 to preserve youtube/vimeo iframes + * + * @param \DOMElement $e + * @param string $tag + */ + public function clean($e, $tag) + { + $currentItem = null; + $targetList = $e->getElementsByTagName($tag); + $isEmbed = ('audio' === $tag || 'video' === $tag || 'iframe' === $tag || 'object' === $tag || 'embed' === $tag); - // executable tags (parentNode->removeChild($childNode); + for ($y = $targetList->length - 1; $y >= 0; --$y) { + // Allow youtube and vimeo videos through as people usually want to see those. + $currentItem = $targetList->item($y); - continue; - } + if ($isEmbed) { + $attributeValues = $currentItem->getAttribute('src') . ' ' . $currentItem->getAttribute('href'); - if ($childNode->nodeType === XML_TEXT_NODE) { - $p = $this->dom->createElement('p'); - $p->innerHTML = $childNode->nodeValue; - $p->setAttribute('data-readability-styled', 'true'); - $childNode->parentNode->replaceChild($p, $childNode); - } - } + // First, check the elements attributes to see if any of them contain known media hosts + if (preg_match($this->regexps['media'], $attributeValues)) { + continue; + } + + // Then check the elements inside this element for the same. + if (preg_match($this->regexps['media'], $targetList->item($y)->innerHTML)) { + continue; } } + + $currentItem->parentNode->removeChild($currentItem); + } + } + + /** + * Clean an element of all tags of type "tag" if they look fishy. + * "Fishy" is an algorithm based on content length, classnames, + * link density, number of images & embeds, etc. + * + * @param \DOMElement $e + * @param string $tag + */ + public function cleanConditionally($e, $tag) + { + if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) { + return; } + $tagsList = $e->getElementsByTagName($tag); + $curTagsLength = $tagsList->length; + $node = null; + /* - * Loop through all paragraphs, and assign a score to them based on how content-y they look. - * Then add their score to their parent node. + * Gather counts for other typical elements embedded within. + * Traverse backwards so we can remove nodes at the same time without effecting the traversal. * - * A score is determined by things like number of commas, class names, etc. - * Maybe eventually link density. + * TODO: Consider taking into account original contentScore here. */ - for ($pt = 0, $scored = count($nodesToScore); $pt < $scored; ++$pt) { - $parentNode = $nodesToScore[$pt]->parentNode; - - // No parent node? Move on... - if (!$parentNode) { - continue; - } + for ($i = $curTagsLength - 1; $i >= 0; --$i) { + $node = $tagsList->item($i); + $weight = $this->getWeight($node); + $contentScore = ($node->hasAttribute('readability')) ? (int) $node->getAttribute('readability') : 0; + $this->logger->debug('Start conditional cleaning of ' . $node->getNodePath() . ' (class=' . $node->getAttribute('class') . '; id=' . $node->getAttribute('id') . ')' . (($node->hasAttribute('readability')) ? (' with score ' . $node->getAttribute('readability')) : '')); - $grandParentNode = $parentNode->parentNode instanceof \DOMElement ? $parentNode->parentNode : null; - $innerText = $this->getInnerText($nodesToScore[$pt]); + if ($weight + $contentScore < 0) { + $this->logger->debug('Removing...'); + $node->parentNode->removeChild($node); + } elseif ($this->getCommaCount($this->getInnerText($node)) < self::MIN_COMMAS_IN_PARAGRAPH) { + /* + * If there are not very many commas, and the number of + * non-paragraph elements is more than paragraphs or other ominous signs, remove the element. + */ + $p = $node->getElementsByTagName('p')->length; + $img = $node->getElementsByTagName('img')->length; + $li = $node->getElementsByTagName('li')->length - 100; + $input = $node->getElementsByTagName('input')->length; + $a = $node->getElementsByTagName('a')->length; + $embedCount = 0; + $embeds = $node->getElementsByTagName('embed'); - // If this paragraph is less than MIN_PARAGRAPH_LENGTH (default:20) characters, don't even count it. - if (mb_strlen($innerText) < self::MIN_PARAGRAPH_LENGTH) { - continue; - } - - // Initialize readability data for the parent. - if (!$parentNode->hasAttribute('readability')) { - $this->initializeNode($parentNode); - $parentNode->setAttribute('data-candidate', 'true'); - } - - // Initialize readability data for the grandparent. - if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName)) { - $this->initializeNode($grandParentNode); - $grandParentNode->setAttribute('data-candidate', 'true'); - } - // Add a point for the paragraph itself as a base. - $contentScore = 1; - // Add points for any commas within this paragraph. - $contentScore += $this->getCommaCount($innerText); - // For every SCORE_CHARS_IN_PARAGRAPH (default:100) characters in this paragraph, add another point. Up to 3 points. - $contentScore += min(floor(mb_strlen($innerText) / self::SCORE_CHARS_IN_PARAGRAPH), 3); - // For every SCORE_WORDS_IN_PARAGRAPH (default:20) words in this paragraph, add another point. Up to 3 points. - $contentScore += min(floor($this->getWordCount($innerText) / self::SCORE_WORDS_IN_PARAGRAPH), 3); - /* TEST: For every positive/negative parent tag, add/substract half point. Up to 3 points. *\/ - $up = $nodesToScore[$pt]; - $score = 0; - while ($up->parentNode instanceof \DOMElement) { - $up = $up->parentNode; - if (preg_match($this->regexps['positive'], $up->getAttribute('class') . ' ' . $up->getAttribute('id'))) { - $score += 0.5; - } elseif (preg_match($this->regexps['negative'], $up->getAttribute('class') . ' ' . $up->getAttribute('id'))) { - $score -= 0.5; - } - } - $score = floor($score); - $contentScore += max(min($score, 3), -3);/**/ - - // Add the score to the parent. The grandparent gets half. - $parentNode->getAttributeNode('readability')->value += $contentScore; - if ($grandParentNode) { - $grandParentNode->getAttributeNode('readability')->value += $contentScore / self::GRANDPARENT_SCORE_DIVISOR; - } - } - - /* - * Node prepping: trash nodes that look cruddy (like ones with the class name "comment", etc). - * This is faster to do before scoring but safer after. - */ - if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS) && $xpath) { - $candidates = $xpath->query('.//*[(self::footer and count(//footer)<2) or (self::aside and count(//aside)<2)]', $page->documentElement); - $node = null; - - for ($c = $candidates->length - 1; $c >= 0; --$c) { - $node = $candidates->item($c); - // node should be readable but not inside of an article otherwise it's probably non-readable block - if ($node->hasAttribute('readability') && (int) $node->getAttributeNode('readability')->value < 40 && ($node->parentNode ? strcasecmp($node->parentNode->tagName, 'article') !== 0 : true)) { - $this->logger->debug('Removing unlikely candidate (using note) ' . $node->getNodePath() . ' by "' . $node->tagName . '" with readability ' . ($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0)); - $node->parentNode->removeChild($node); - } - } - - $candidates = $xpath->query('.//*[not(self::body) and (@class or @id or @style) and ((number(@readability) < 40) or not(@readability))]', $page->documentElement); - $node = null; - - for ($c = $candidates->length - 1; $c >= 0; --$c) { - $node = $candidates->item($c); - - // Remove unlikely candidates - $unlikelyMatchString = $node->getAttribute('class') . ' ' . $node->getAttribute('id') . ' ' . $node->getAttribute('style'); - - if (mb_strlen($unlikelyMatchString) > 3 && // don't process "empty" strings - preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) && - !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) - ) { - $this->logger->debug('Removing unlikely candidate (using conf) ' . $node->getNodePath() . ' by "' . $unlikelyMatchString . '" with readability ' . ($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0)); - $node->parentNode->removeChild($node); - --$nodeIndex; + for ($ei = 0, $il = $embeds->length; $ei < $il; ++$ei) { + if (preg_match($this->regexps['media'], $embeds->item($ei)->getAttribute('src'))) { + ++$embedCount; + } } - } - unset($candidates); - } - /* - * After we've calculated scores, loop through all of the possible candidate nodes we found - * and find the one with the highest score. - */ - $topCandidate = null; - if ($xpath) { - // Using array of DOMElements after deletion is a path to DOOMElement. - $candidates = $xpath->query('.//*[@data-candidate]', $page->documentElement); - - for ($c = $candidates->length - 1; $c >= 0; --$c) { - $item = $candidates->item($c); - - // Scale the final candidates score based on link density. Good content should have a - // relatively small link density (5% or less) and be mostly unaffected by this operation. - // If not for this we would have used XPath to find maximum @readability. - $readability = $item->getAttributeNode('readability'); - $readability->value = round($readability->value * (1 - $this->getLinkDensity($item)), 0, PHP_ROUND_HALF_UP); - - if (!$topCandidate || $readability->value > (int) $topCandidate->getAttribute('readability')) { - $this->logger->debug('Candidate: ' . $item->getNodePath() . ' (' . $item->getAttribute('class') . ':' . $item->getAttribute('id') . ') with score ' . $readability->value); - $topCandidate = $item; + $embeds = $node->getElementsByTagName('iframe'); + for ($ei = 0, $il = $embeds->length; $ei < $il; ++$ei) { + if (preg_match($this->regexps['media'], $embeds->item($ei)->getAttribute('src'))) { + ++$embedCount; + } } - } - - unset($candidates); - } - /* - * If we still have no top candidate, just use the body as a last resort. - * We also have to copy the body node so it is something we can modify. - */ - if ($topCandidate === null || strcasecmp($topCandidate->tagName, 'body') === 0) { - $topCandidate = $this->dom->createElement('div'); + $linkDensity = $this->getLinkDensity($node, true); + $contentLength = mb_strlen($this->getInnerText($node)); + $toRemove = false; - if ($page instanceof \DOMDocument) { - if (!isset($page->documentElement)) { - // we don't have a body either? what a mess! :) - $this->logger->debug('The page has no body!'); + if ($this->lightClean) { + if ($li > $p && 'ul' !== $tag && 'ol' !== $tag) { + $this->logger->debug(' too many
  • elements, and parent is not