From 2dce2879bfe6f7cb57d5073f58a2c0a34d9b6f27 Mon Sep 17 00:00:00 2001 From: Jeremy Benoist Date: Mon, 4 Feb 2019 11:21:31 +0100 Subject: [PATCH] Update fixer rules Following graby, wallabag, etc. --- .php_cs | 23 +- src/JSLikeHTMLElement.php | 6 +- src/Readability.php | 1114 ++++++++++++++++++------------------- tests/ReadabilityTest.php | 24 +- 4 files changed, 585 insertions(+), 582 deletions(-) diff --git a/.php_cs b/.php_cs index 8340221..5dc5396 100644 --- a/.php_cs +++ b/.php_cs @@ -4,22 +4,27 @@ return PhpCsFixer\Config::create() ->setUsingCache(true) ->setRiskyAllowed(true) ->setRules([ - 'concat_space' => [ - 'spacing' => 'one', - ], + '@Symfony' => true, + '@Symfony:risky' => true, + 'array_syntax' => ['syntax' => 'short'], + 'combine_consecutive_unsets' => true, + 'heredoc_to_nowdoc' => true, + 'no_extra_consecutive_blank_lines' => ['break', 'continue', 'extra', 'return', 'throw', 'use', 'parenthesis_brace_block', 'square_brace_block', 'curly_brace_block'], + 'no_unreachable_default_argument_value' => true, + 'no_useless_else' => true, + 'no_useless_return' => true, + 'ordered_class_elements' => true, 'ordered_imports' => true, + 'php_unit_strict' => false, 'phpdoc_order' => true, + // 'psr4' => true, 'strict_comparison' => true, 'strict_param' => true, - 'array_syntax' => [ - 'syntax' => 'long', - ], + 'concat_space' => ['spacing' => 'one'], ]) ->setFinder( PhpCsFixer\Finder::create() - ->exclude([ - 'vendor', - ]) + ->exclude(['vendor']) ->in(__DIR__) ) ; diff --git a/src/JSLikeHTMLElement.php b/src/JSLikeHTMLElement.php index 15e7281..b908d06 100644 --- a/src/JSLikeHTMLElement.php +++ b/src/JSLikeHTMLElement.php @@ -45,7 +45,7 @@ class JSLikeHTMLElement extends \DOMElement */ public function __set($name, $value) { - if ($name !== 'innerHTML') { + if ('innerHTML' !== $name) { $trace = debug_backtrace(); trigger_error('Undefined property via __set(): ' . $name . ' in ' . $trace[0]['file'] . ' on line ' . $trace[0]['line'], E_USER_NOTICE); @@ -109,7 +109,7 @@ class JSLikeHTMLElement extends \DOMElement */ public function __get($name) { - if ($name === 'innerHTML') { + if ('innerHTML' === $name) { $inner = ''; foreach ($this->childNodes as $child) { @@ -121,8 +121,6 @@ class JSLikeHTMLElement extends \DOMElement $trace = debug_backtrace(); trigger_error('Undefined property via __get(): ' . $name . ' in ' . $trace[0]['file'] . ' on line ' . $trace[0]['line'], E_USER_NOTICE); - - return; } public function __toString() diff --git a/src/Readability.php b/src/Readability.php index 986cce5..8d0aa33 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -52,6 +52,21 @@ use Psr\Log\NullLogger; */ class Readability implements LoggerAwareInterface { + // flags + const FLAG_STRIP_UNLIKELYS = 1; + const FLAG_WEIGHT_ATTRIBUTES = 2; + const FLAG_CLEAN_CONDITIONALLY = 4; + const FLAG_DISABLE_PREFILTER = 8; + const FLAG_DISABLE_POSTFILTER = 16; + // constants + const SCORE_CHARS_IN_PARAGRAPH = 100; + const SCORE_WORDS_IN_PARAGRAPH = 20; + const GRANDPARENT_SCORE_DIVISOR = 2.2; + const MIN_PARAGRAPH_LENGTH = 20; + const MIN_COMMAS_IN_PARAGRAPH = 6; + const MIN_ARTICLE_LENGTH = 200; + const MIN_NODE_LENGTH = 80; + const MAX_LINK_DENSITY = 0.25; public $convertLinksToFootnotes = false; public $revertForcedParagraphElements = true; public $articleTitle; @@ -65,25 +80,12 @@ class Readability implements LoggerAwareInterface // no more used, keept to avoid BC public $debug = false; public $tidied = false; - // article domain regexp for calibration - protected $domainRegExp = null; - protected $body = null; - // Cache the body HTML in case we need to re-use it later - protected $bodyCache = null; - // 1 | 2 | 4; // Start with all processing flags set. - protected $flags = 7; - // indicates whether we were able to extract or not - protected $success = false; - protected $logger; - protected $parser; - protected $html; - protected $useTidy; /** * All of the regular expressions in use within readability. * Defined up here so we don't instantiate them repeatedly in loops. */ - public $regexps = array( + public $regexps = [ 'unlikelyCandidates' => '/display\s*:\s*none|ignore|\binfos?\b|annoy|clock|date|time|author|intro|hidd?e|about|archive|\bprint|bookmark|tags|tag-list|share|search|social|robot|published|combx|comment|mast(?:head)|subscri|community|category|disqus|extra|head|head(?:er|note)|floor|foot(?:er|note)|menu|tool\b|function|nav|remark|rss|shoutbox|widget|meta|banner|sponsor|adsense|inner-?ad|ad-|sponsor|\badv\b|\bads\b|agr?egate?|pager|sidebar|popup|tweet|twitter/i', 'okMaybeItsACandidate' => '/article\b|contain|\bcontent|column|general|detail|shadow|lightbox|blog|body|entry|main|page|footnote/i', 'positive' => '/read|full|article|body|\bcontent|contain|entry|main|markdown|media|page|attach|pagination|post|text|blog|story/i', @@ -92,8 +94,8 @@ class Readability implements LoggerAwareInterface 'killBreaks' => '/(([ \r\n\s]| ?)*)+/', 'media' => '!//(?:[^\.\?/]+\.)?(?:youtu(?:be)?|giphy|soundcloud|dailymotion|vimeo|pornhub|xvideos|twitvid|rutube|viddler)\.(?:com|be|org|net)/!i', 'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i', - ); - public $tidy_config = array( + ]; + public $tidy_config = [ 'tidy-mark' => false, 'vertical-space' => false, 'doctype' => 'omit', @@ -117,9 +119,22 @@ class Readability implements LoggerAwareInterface 'input-encoding' => '????', 'output-encoding' => 'utf8', 'hide-comments' => true, - ); + ]; + // article domain regexp for calibration + protected $domainRegExp = null; + protected $body = null; + // Cache the body HTML in case we need to re-use it later + protected $bodyCache = null; + // 1 | 2 | 4; // Start with all processing flags set. + protected $flags = 7; + // indicates whether we were able to extract or not + protected $success = false; + protected $logger; + protected $parser; + protected $html; + protected $useTidy; // raw HTML filters - protected $pre_filters = array( + protected $pre_filters = [ // remove obvious scripts '!]*>(.*?)!is' => '', // remove obvious styles @@ -134,9 +149,9 @@ class Readability implements LoggerAwareInterface //'!!is' => '', // replace fonts to spans '!<(/?)font[^>]*>!is' => '<\\1span>', - ); + ]; // output HTML filters - protected $post_filters = array( + protected $post_filters = [ // replace excessive br's '/\s*

']*>\s* '\s*!is' => '', '!<[hb]r>!is' => '<\\1 />', - ); - - // flags - const FLAG_STRIP_UNLIKELYS = 1; - const FLAG_WEIGHT_ATTRIBUTES = 2; - const FLAG_CLEAN_CONDITIONALLY = 4; - const FLAG_DISABLE_PREFILTER = 8; - const FLAG_DISABLE_POSTFILTER = 16; - // constants - const SCORE_CHARS_IN_PARAGRAPH = 100; - const SCORE_WORDS_IN_PARAGRAPH = 20; - const GRANDPARENT_SCORE_DIVISOR = 2.2; - const MIN_PARAGRAPH_LENGTH = 20; - const MIN_COMMAS_IN_PARAGRAPH = 6; - const MIN_ARTICLE_LENGTH = 200; - const MIN_NODE_LENGTH = 80; - const MAX_LINK_DENSITY = 0.25; + ]; /** * Create instance of Readability. @@ -180,7 +179,7 @@ class Readability implements LoggerAwareInterface $this->url = $url; $this->html = $html; $this->parser = $parser; - $this->useTidy = $use_tidy && function_exists('tidy_parse_string'); + $this->useTidy = $use_tidy && \function_exists('tidy_parse_string'); $this->logger = new NullLogger(); $this->loadHtml(); @@ -233,76 +232,6 @@ class Readability implements LoggerAwareInterface $this->post_filters[$filter] = $replacer; } - /** - * Load HTML in a DOMDocument. - * Apply Pre filters - * Cleanup HTML using Tidy (or not). - * - * @todo This should be called in init() instead of from __construct - */ - private function loadHtml() - { - $this->original_html = $this->html; - - $this->logger->debug('Parsing URL: ' . $this->url); - - if ($this->url) { - $this->domainRegExp = '/' . strtr(preg_replace('/www\d*\./', '', parse_url($this->url, PHP_URL_HOST)), array('.' => '\.')) . '/'; - } - - mb_internal_encoding('UTF-8'); - mb_http_output('UTF-8'); - mb_regex_encoding('UTF-8'); - - // HACK: dirty cleanup to replace some stuff; shouldn't use regexps with HTML but well... - if (!$this->flagIsActive(self::FLAG_DISABLE_PREFILTER)) { - foreach ($this->pre_filters as $search => $replace) { - $this->html = preg_replace($search, $replace, $this->html); - } - unset($search, $replace); - } - - if (trim($this->html) === '') { - $this->html = ''; - } - - /* - * Use tidy (if it exists). - * This fixes problems with some sites which would otherwise trouble DOMDocument's HTML parsing. - * Although sometimes it makes matters worse, which is why there is an option to disable it. - */ - if ($this->useTidy) { - $this->logger->debug('Tidying document'); - - $tidy = tidy_repair_string($this->html, $this->tidy_config, 'UTF8'); - if (false !== $tidy && $this->html !== $tidy) { - $this->tidied = true; - $this->html = $tidy; - $this->html = preg_replace('/[\r\n]+/is', "\n", $this->html); - } - unset($tidy); - } - - $this->html = mb_convert_encoding($this->html, 'HTML-ENTITIES', 'UTF-8'); - - if (!($this->parser === 'html5lib' && ($this->dom = Parser::parse($this->html)))) { - libxml_use_internal_errors(true); - - $this->dom = new \DOMDocument(); - $this->dom->preserveWhiteSpace = false; - - if (PHP_VERSION_ID >= 50400) { - $this->dom->loadHTML($this->html, LIBXML_NOBLANKS | LIBXML_COMPACT | LIBXML_NOERROR); - } else { - $this->dom->loadHTML($this->html); - } - - libxml_use_internal_errors(false); - } - - $this->dom->registerNodeClass('DOMElement', 'Readability\JSLikeHTMLElement'); - } - /** * Runs readability. * @@ -326,14 +255,14 @@ class Readability implements LoggerAwareInterface $bodyElems = $this->dom->getElementsByTagName('body'); // WTF multiple body nodes? - if ($this->bodyCache === null) { + if (null === $this->bodyCache) { $this->bodyCache = ''; foreach ($bodyElems as $bodyNode) { $this->bodyCache .= trim($bodyNode->innerHTML); } } - if ($bodyElems->length > 0 && $this->body === null) { + if ($bodyElems->length > 0 && null === $this->body) { $this->body = $bodyElems->item(0); } @@ -373,27 +302,6 @@ class Readability implements LoggerAwareInterface return $this->success; } - /** - * Debug. - * - * @deprecated use $this->logger->debug() instead - * @codeCoverageIgnore - */ - protected function dbg($msg) - { - $this->logger->debug($msg); - } - - /** - * Dump debug info. - * - * @deprecated since Monolog gather log, we don't need it - * @codeCoverageIgnore - */ - protected function dump_dbg() - { - } - /** * Run any post-process modifications to article content as necessary. * @@ -406,77 +314,6 @@ class Readability implements LoggerAwareInterface } } - /** - * Get the article title as an H1. - * - * @return \DOMElement - */ - protected function getArticleTitle() - { - try { - $curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0)); - } catch (\Exception $e) { - $curTitle = ''; - $origTitle = ''; - } - - if (preg_match('/ [\|\-] /', $curTitle)) { - $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle); - if (count(explode(' ', $curTitle)) < 3) { - $curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle); - } - } elseif (strpos($curTitle, ': ') !== false) { - $curTitle = preg_replace('/.*:(.*)/i', '$1', $origTitle); - if (count(explode(' ', $curTitle)) < 3) { - $curTitle = preg_replace('/[^:]*[:](.*)/i', '$1', $origTitle); - } - } elseif (mb_strlen($curTitle) > 150 || mb_strlen($curTitle) < 15) { - $hOnes = $this->dom->getElementsByTagName('h1'); - if ($hOnes->length === 1) { - $curTitle = $this->getInnerText($hOnes->item(0)); - } - } - - $curTitle = trim($curTitle); - if (count(explode(' ', $curTitle)) <= 4) { - $curTitle = $origTitle; - } - - $articleTitle = $this->dom->createElement('h1'); - $articleTitle->innerHTML = $curTitle; - - return $articleTitle; - } - - /** - * Prepare the HTML document for readability to scrape it. - * This includes things like stripping javascript, CSS, and handling terrible markup. - */ - protected function prepDocument() - { - /* - * In some cases a body element can't be found (if the HTML is totally hosed for example) - * so we create a new body node and append it to the document. - */ - if ($this->body === null) { - $this->body = $this->dom->createElement('body'); - $this->dom->documentElement->appendChild($this->body); - } - - $this->body->setAttribute('class', 'readabilityBody'); - - // Remove all style tags in head. - $styleTags = $this->dom->getElementsByTagName('style'); - for ($i = $styleTags->length - 1; $i >= 0; --$i) { - $styleTags->item($i)->parentNode->removeChild($styleTags->item($i)); - } - - $linkTags = $this->dom->getElementsByTagName('link'); - for ($i = $linkTags->length - 1; $i >= 0; --$i) { - $linkTags->item($i)->parentNode->removeChild($linkTags->item($i)); - } - } - /** * For easier reading, convert this document to have footnotes at the bottom rather than inline links. * @@ -506,7 +343,7 @@ class Readability implements LoggerAwareInterface } $linkText = $this->getInnerText($articleLink); - if ((strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote') !== false) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) { + if ((false !== strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote')) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) { continue; } @@ -527,7 +364,7 @@ class Readability implements LoggerAwareInterface $articleLink->setAttribute('style', 'color: inherit; text-decoration: none;'); $articleLink->setAttribute('name', 'readabilityLink-' . $linkCount); $footnote->innerHTML = '^ '; - $footnoteLink->innerHTML = ($footnoteLink->getAttribute('title') !== '' ? $footnoteLink->getAttribute('title') : $linkText); + $footnoteLink->innerHTML = ('' !== $footnoteLink->getAttribute('title') ? $footnoteLink->getAttribute('title') : $linkText); $footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount); $footnote->appendChild($footnoteLink); @@ -589,7 +426,7 @@ class Readability implements LoggerAwareInterface * already have a header. */ $h2s = $articleContent->getElementsByTagName('h2'); - if ($h2s->length === 1 && mb_strlen($this->getInnerText($h2s->item(0), true, true)) < 100) { + if (1 === $h2s->length && mb_strlen($this->getInnerText($h2s->item(0), true, true)) < 100) { $this->clean($articleContent, 'h2'); } @@ -614,7 +451,7 @@ class Readability implements LoggerAwareInterface $audioCount = $item->getElementsByTagName('audio')->length; $iframeCount = $item->getElementsByTagName('iframe')->length; - if ($iframeCount === 0 && $imgCount === 0 && $embedCount === 0 && $objectCount === 0 && $videoCount === 0 && $audioCount === 0 && mb_strlen(preg_replace('/\s+/is', '', $this->getInnerText($item, false, false))) === 0) { + if (0 === $iframeCount && 0 === $imgCount && 0 === $embedCount && 0 === $objectCount && 0 === $videoCount && 0 === $audioCount && 0 === mb_strlen(preg_replace('/\s+/is', '', $this->getInnerText($item, false, false)))) { $item->parentNode->removeChild($item); } @@ -640,30 +477,460 @@ class Readability implements LoggerAwareInterface } /** - * Initialize a node with the readability object. Also checks the - * className/id for special names to add to its score. + * Get the inner text of a node. + * This also strips out any excess whitespace to be found. * - * @param \DOMElement $node + * @param \DOMElement $e + * @param bool $normalizeSpaces (default: true) + * @param bool $flattenLines (default: false) + * + * @return string */ - protected function initializeNode($node) + public function getInnerText($e, $normalizeSpaces = true, $flattenLines = false) { - if (!isset($node->tagName)) { + if (null === $e || !isset($e->textContent) || '' === $e->textContent) { + return ''; + } + + $textContent = trim($e->textContent); + + if ($flattenLines) { + $textContent = mb_ereg_replace('(?:[\r\n](?:\s| )*)+', '', $textContent); + } elseif ($normalizeSpaces) { + $textContent = mb_ereg_replace('\s\s+', ' ', $textContent); + } + + return $textContent; + } + + /** + * Remove the style attribute on every $e and under. + * + * @param \DOMElement $e + */ + public function cleanStyles($e) + { + if (!\is_object($e)) { return; } - $readability = $this->dom->createAttribute('readability'); - // this is our contentScore - $readability->value = 0; - $node->setAttributeNode($readability); + $elems = $e->getElementsByTagName('*'); - // using strtoupper just in case - switch (strtoupper($node->tagName)) { - case 'ARTICLE': - $readability->value += 15; - case 'DIV': - $readability->value += 5; - break; - case 'PRE': + foreach ($elems as $elem) { + $elem->removeAttribute('style'); + } + } + + /** + * Get comma number for a given text. + * + * @param string $text + * + * @return int + */ + public function getCommaCount($text) + { + return substr_count($text, ','); + } + + /** + * Get words number for a given text if words separated by a space. + * Input string should be normalized. + * + * @param string $text + * + * @return int + */ + public function getWordCount($text) + { + return substr_count($text, ' '); + } + + /** + * Get the density of links as a percentage of the content + * This is the amount of text that is inside a link divided by the total text in the node. + * Can exclude external references to differentiate between simple text and menus/infoblocks. + * + * @param \DOMElement $e + * @param string $excludeExternal + * + * @return int + */ + public function getLinkDensity($e, $excludeExternal = false) + { + $links = $e->getElementsByTagName('a'); + $textLength = mb_strlen($this->getInnerText($e, true, true)); + $linkLength = 0; + + for ($dRe = $this->domainRegExp, $i = 0, $il = $links->length; $i < $il; ++$i) { + if ($excludeExternal && $dRe && !preg_match($dRe, $links->item($i)->getAttribute('href'))) { + continue; + } + $linkLength += mb_strlen($this->getInnerText($links->item($i))); + } + + if ($textLength > 0 && $linkLength > 0) { + return $linkLength / $textLength; + } + + return 0; + } + + /** + * Get an element relative weight. + * + * @param \DOMElement $e + * + * @return int + */ + public function getWeight($e) + { + if (!$this->flagIsActive(self::FLAG_WEIGHT_ATTRIBUTES)) { + return 0; + } + + $weight = 0; + // Look for a special classname + $weight += $this->weightAttribute($e, 'class'); + // Look for a special ID + $weight += $this->weightAttribute($e, 'id'); + + return $weight; + } + + /** + * Remove extraneous break tags from a node. + * + * @param \DOMElement $node + */ + public function killBreaks($node) + { + $html = $node->innerHTML; + $html = preg_replace($this->regexps['killBreaks'], '
', $html); + $node->innerHTML = $html; + } + + /** + * Clean a node of all elements of type "tag". + * (Unless it's a youtube/vimeo video. People love movies.). + * + * Updated 2012-09-18 to preserve youtube/vimeo iframes + * + * @param \DOMElement $e + * @param string $tag + */ + public function clean($e, $tag) + { + $currentItem = null; + $targetList = $e->getElementsByTagName($tag); + $isEmbed = ('audio' === $tag || 'video' === $tag || 'iframe' === $tag || 'object' === $tag || 'embed' === $tag); + + for ($y = $targetList->length - 1; $y >= 0; --$y) { + // Allow youtube and vimeo videos through as people usually want to see those. + $currentItem = $targetList->item($y); + + if ($isEmbed) { + $attributeValues = $currentItem->getAttribute('src') . ' ' . $currentItem->getAttribute('href'); + + // First, check the elements attributes to see if any of them contain known media hosts + if (preg_match($this->regexps['media'], $attributeValues)) { + continue; + } + + // Then check the elements inside this element for the same. + if (preg_match($this->regexps['media'], $targetList->item($y)->innerHTML)) { + continue; + } + } + + $currentItem->parentNode->removeChild($currentItem); + } + } + + /** + * Clean an element of all tags of type "tag" if they look fishy. + * "Fishy" is an algorithm based on content length, classnames, + * link density, number of images & embeds, etc. + * + * @param \DOMElement $e + * @param string $tag + */ + public function cleanConditionally($e, $tag) + { + if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) { + return; + } + + $tagsList = $e->getElementsByTagName($tag); + $curTagsLength = $tagsList->length; + $node = null; + + /* + * Gather counts for other typical elements embedded within. + * Traverse backwards so we can remove nodes at the same time without effecting the traversal. + * + * TODO: Consider taking into account original contentScore here. + */ + for ($i = $curTagsLength - 1; $i >= 0; --$i) { + $node = $tagsList->item($i); + $weight = $this->getWeight($node); + $contentScore = ($node->hasAttribute('readability')) ? (int) $node->getAttribute('readability') : 0; + $this->logger->debug('Start conditional cleaning of ' . $node->getNodePath() . ' (class=' . $node->getAttribute('class') . '; id=' . $node->getAttribute('id') . ')' . (($node->hasAttribute('readability')) ? (' with score ' . $node->getAttribute('readability')) : '')); + + if ($weight + $contentScore < 0) { + $this->logger->debug('Removing...'); + $node->parentNode->removeChild($node); + } elseif ($this->getCommaCount($this->getInnerText($node)) < self::MIN_COMMAS_IN_PARAGRAPH) { + /* + * If there are not very many commas, and the number of + * non-paragraph elements is more than paragraphs or other ominous signs, remove the element. + */ + $p = $node->getElementsByTagName('p')->length; + $img = $node->getElementsByTagName('img')->length; + $li = $node->getElementsByTagName('li')->length - 100; + $input = $node->getElementsByTagName('input')->length; + $a = $node->getElementsByTagName('a')->length; + $embedCount = 0; + $embeds = $node->getElementsByTagName('embed'); + + for ($ei = 0, $il = $embeds->length; $ei < $il; ++$ei) { + if (preg_match($this->regexps['media'], $embeds->item($ei)->getAttribute('src'))) { + ++$embedCount; + } + } + + $embeds = $node->getElementsByTagName('iframe'); + for ($ei = 0, $il = $embeds->length; $ei < $il; ++$ei) { + if (preg_match($this->regexps['media'], $embeds->item($ei)->getAttribute('src'))) { + ++$embedCount; + } + } + + $linkDensity = $this->getLinkDensity($node, true); + $contentLength = mb_strlen($this->getInnerText($node)); + $toRemove = false; + + if ($this->lightClean) { + if ($li > $p && 'ul' !== $tag && 'ol' !== $tag) { + $this->logger->debug(' too many

  • elements, and parent is not
      or
        '); + $toRemove = true; + } elseif ($input > floor($p / 3)) { + $this->logger->debug(' too many elements'); + $toRemove = true; + } elseif ($contentLength < 6 && (0 === $embedCount && (0 === $img || $img > 2))) { + $this->logger->debug(' content length less than 6 chars, 0 embeds and either 0 images or more than 2 images'); + $toRemove = true; + } elseif ($weight < 25 && $linkDensity > 0.25) { + $this->logger->debug(' weight is ' . $weight . ' < 25 and link density is ' . sprintf('%.2f', $linkDensity) . ' > 0.25'); + $toRemove = true; + } elseif ($a > 2 && ($weight >= 25 && $linkDensity > 0.5)) { + $this->logger->debug(' more than 2 links and weight is ' . $weight . ' > 25 but link density is ' . sprintf('%.2f', $linkDensity) . ' > 0.5'); + $toRemove = true; + } elseif ($embedCount > 3) { + $this->logger->debug(' more than 3 embeds'); + $toRemove = true; + } + } else { + if ($img > $p) { + $this->logger->debug(' more image elements than paragraph elements'); + $toRemove = true; + } elseif ($li > $p && 'ul' !== $tag && 'ol' !== $tag) { + $this->logger->debug(' too many
      1. elements, and parent is not
          or
            '); + $toRemove = true; + } elseif ($input > floor($p / 3)) { + $this->logger->debug(' too many elements'); + $toRemove = true; + } elseif ($contentLength < 10 && (0 === $img || $img > 2)) { + $this->logger->debug(' content length less than 10 chars and 0 images, or more than 2 images'); + $toRemove = true; + } elseif ($weight < 25 && $linkDensity > 0.2) { + $this->logger->debug(' weight is ' . $weight . ' lower than 0 and link density is ' . sprintf('%.2f', $linkDensity) . ' > 0.2'); + $toRemove = true; + } elseif ($weight >= 25 && $linkDensity > 0.5) { + $this->logger->debug(' weight above 25 but link density is ' . sprintf('%.2f', $linkDensity) . ' > 0.5'); + $toRemove = true; + } elseif ((1 === $embedCount && $contentLength < 75) || $embedCount > 1) { + $this->logger->debug(' 1 embed and content length smaller than 75 chars, or more than one embed'); + $toRemove = true; + } + } + + if ($toRemove) { + $this->logger->debug('Removing...'); + $node->parentNode->removeChild($node); + } + } + } + } + + /** + * Clean out spurious headers from an Element. Checks things like classnames and link density. + * + * @param \DOMElement $e + */ + public function cleanHeaders($e) + { + for ($headerIndex = 1; $headerIndex < 3; ++$headerIndex) { + $headers = $e->getElementsByTagName('h' . $headerIndex); + + for ($i = $headers->length - 1; $i >= 0; --$i) { + if ($this->getWeight($headers->item($i)) < 0 || $this->getLinkDensity($headers->item($i)) > 0.33) { + $headers->item($i)->parentNode->removeChild($headers->item($i)); + } + } + } + } + + /** + * Check if the given flag is active. + * + * @param int $flag + * + * @return bool + */ + public function flagIsActive($flag) + { + return ($this->flags & $flag) > 0; + } + + /** + * Add a flag. + * + * @param int $flag + */ + public function addFlag($flag) + { + $this->flags = $this->flags | $flag; + } + + /** + * Remove a flag. + * + * @param int $flag + */ + public function removeFlag($flag) + { + $this->flags = $this->flags & ~$flag; + } + + /** + * Debug. + * + * @deprecated use $this->logger->debug() instead + * @codeCoverageIgnore + */ + protected function dbg($msg) + { + $this->logger->debug($msg); + } + + /** + * Dump debug info. + * + * @deprecated since Monolog gather log, we don't need it + * @codeCoverageIgnore + */ + protected function dump_dbg() + { + } + + /** + * Get the article title as an H1. + * + * @return \DOMElement + */ + protected function getArticleTitle() + { + try { + $curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0)); + } catch (\Exception $e) { + $curTitle = ''; + $origTitle = ''; + } + + if (preg_match('/ [\|\-] /', $curTitle)) { + $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle); + if (\count(explode(' ', $curTitle)) < 3) { + $curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle); + } + } elseif (false !== strpos($curTitle, ': ')) { + $curTitle = preg_replace('/.*:(.*)/i', '$1', $origTitle); + if (\count(explode(' ', $curTitle)) < 3) { + $curTitle = preg_replace('/[^:]*[:](.*)/i', '$1', $origTitle); + } + } elseif (mb_strlen($curTitle) > 150 || mb_strlen($curTitle) < 15) { + $hOnes = $this->dom->getElementsByTagName('h1'); + if (1 === $hOnes->length) { + $curTitle = $this->getInnerText($hOnes->item(0)); + } + } + + $curTitle = trim($curTitle); + if (\count(explode(' ', $curTitle)) <= 4) { + $curTitle = $origTitle; + } + + $articleTitle = $this->dom->createElement('h1'); + $articleTitle->innerHTML = $curTitle; + + return $articleTitle; + } + + /** + * Prepare the HTML document for readability to scrape it. + * This includes things like stripping javascript, CSS, and handling terrible markup. + */ + protected function prepDocument() + { + /* + * In some cases a body element can't be found (if the HTML is totally hosed for example) + * so we create a new body node and append it to the document. + */ + if (null === $this->body) { + $this->body = $this->dom->createElement('body'); + $this->dom->documentElement->appendChild($this->body); + } + + $this->body->setAttribute('class', 'readabilityBody'); + + // Remove all style tags in head. + $styleTags = $this->dom->getElementsByTagName('style'); + for ($i = $styleTags->length - 1; $i >= 0; --$i) { + $styleTags->item($i)->parentNode->removeChild($styleTags->item($i)); + } + + $linkTags = $this->dom->getElementsByTagName('link'); + for ($i = $linkTags->length - 1; $i >= 0; --$i) { + $linkTags->item($i)->parentNode->removeChild($linkTags->item($i)); + } + } + + /** + * Initialize a node with the readability object. Also checks the + * className/id for special names to add to its score. + * + * @param \DOMElement $node + */ + protected function initializeNode($node) + { + if (!isset($node->tagName)) { + return; + } + + $readability = $this->dom->createAttribute('readability'); + // this is our contentScore + $readability->value = 0; + $node->setAttributeNode($readability); + + // using strtoupper just in case + switch (strtoupper($node->tagName)) { + case 'ARTICLE': + $readability->value += 15; + // no break + case 'DIV': + $readability->value += 5; + break; + case 'PRE': case 'CODE': case 'TD': case 'BLOCKQUOTE': @@ -723,7 +990,7 @@ class Readability implements LoggerAwareInterface } $xpath = null; - $nodesToScore = array(); + $nodesToScore = []; if ($page instanceof \DOMDocument && isset($page->documentElement)) { $xpath = new \DOMXPath($page); @@ -735,13 +1002,13 @@ class Readability implements LoggerAwareInterface $tagName = $node->tagName; // Some well known site uses sections as paragraphs. - if (strcasecmp($tagName, 'p') === 0 || strcasecmp($tagName, 'td') === 0 || strcasecmp($tagName, 'pre') === 0 || strcasecmp($tagName, 'section') === 0) { + if (0 === strcasecmp($tagName, 'p') || 0 === strcasecmp($tagName, 'td') || 0 === strcasecmp($tagName, 'pre') || 0 === strcasecmp($tagName, 'section')) { $nodesToScore[] = $node; } // Turn divs into P tags where they have been used inappropriately // (as in, where they contain no other block level elements). - if (strcasecmp($tagName, 'div') === 0 || strcasecmp($tagName, 'article') === 0 || strcasecmp($tagName, 'section') === 0) { + if (0 === strcasecmp($tagName, 'div') || 0 === strcasecmp($tagName, 'article') || 0 === strcasecmp($tagName, 'section')) { if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) { $newNode = $this->dom->createElement('p'); @@ -765,13 +1032,13 @@ class Readability implements LoggerAwareInterface } // executable tags (parentNode->removeChild($childNode); continue; } - if ($childNode->nodeType === XML_TEXT_NODE) { + if (XML_TEXT_NODE === $childNode->nodeType) { $p = $this->dom->createElement('p'); $p->innerHTML = $childNode->nodeValue; $p->setAttribute('data-readability-styled', 'true'); @@ -789,7 +1056,7 @@ class Readability implements LoggerAwareInterface * A score is determined by things like number of commas, class names, etc. * Maybe eventually link density. */ - for ($pt = 0, $scored = count($nodesToScore); $pt < $scored; ++$pt) { + for ($pt = 0, $scored = \count($nodesToScore); $pt < $scored; ++$pt) { $parentNode = $nodesToScore[$pt]->parentNode; // No parent node? Move on... @@ -856,7 +1123,7 @@ class Readability implements LoggerAwareInterface for ($c = $candidates->length - 1; $c >= 0; --$c) { $node = $candidates->item($c); // node should be readable but not inside of an article otherwise it's probably non-readable block - if ($node->hasAttribute('readability') && (int) $node->getAttributeNode('readability')->value < 40 && ($node->parentNode ? strcasecmp($node->parentNode->tagName, 'article') !== 0 : true)) { + if ($node->hasAttribute('readability') && (int) $node->getAttributeNode('readability')->value < 40 && ($node->parentNode ? 0 !== strcasecmp($node->parentNode->tagName, 'article') : true)) { $this->logger->debug('Removing unlikely candidate (using note) ' . $node->getNodePath() . ' by "' . $node->tagName . '" with readability ' . ($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0)); $node->parentNode->removeChild($node); } @@ -914,7 +1181,7 @@ class Readability implements LoggerAwareInterface * If we still have no top candidate, just use the body as a last resort. * We also have to copy the body node so it is something we can modify. */ - if ($topCandidate === null || strcasecmp($topCandidate->tagName, 'body') === 0) { + if (null === $topCandidate || 0 === strcasecmp($topCandidate->tagName, 'body')) { $topCandidate = $this->dom->createElement('div'); if ($page instanceof \DOMDocument) { @@ -939,13 +1206,13 @@ class Readability implements LoggerAwareInterface // Set table as the main node if resulted data is table element. $tagName = $topCandidate->tagName; - if (strcasecmp($tagName, 'td') === 0 || strcasecmp($tagName, 'tr') === 0) { + if (0 === strcasecmp($tagName, 'td') || 0 === strcasecmp($tagName, 'tr')) { $up = $topCandidate; if ($up->parentNode instanceof \DOMElement) { $up = $up->parentNode; - if (strcasecmp($up->tagName, 'table') === 0) { + if (0 === strcasecmp($up->tagName, 'table')) { $topCandidate = $up; } } @@ -971,7 +1238,7 @@ class Readability implements LoggerAwareInterface $siblingNode = $siblingNodes->item($s); $siblingNodeName = $siblingNode->nodeName; $append = false; - $this->logger->debug('Looking at sibling node: ' . $siblingNode->getNodePath() . (($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : '')); + $this->logger->debug('Looking at sibling node: ' . $siblingNode->getNodePath() . ((XML_ELEMENT_NODE === $siblingNode->nodeType && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : '')); if ($siblingNode->isSameNode($topCandidate)) { $append = true; @@ -980,21 +1247,21 @@ class Readability implements LoggerAwareInterface $contentBonus = 0; // Give a bonus if sibling nodes and top candidates have the same classname. - if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->getAttribute('class') === $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') !== '') { + if (XML_ELEMENT_NODE === $siblingNode->nodeType && $siblingNode->getAttribute('class') === $topCandidate->getAttribute('class') && '' !== $topCandidate->getAttribute('class')) { $contentBonus += ((int) $topCandidate->getAttribute('readability')) * 0.2; } - if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') && (((int) $siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold) { + if (XML_ELEMENT_NODE === $siblingNode->nodeType && $siblingNode->hasAttribute('readability') && (((int) $siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold) { $append = true; } - if (strcasecmp($siblingNodeName, 'p') === 0) { + if (0 === strcasecmp($siblingNodeName, 'p')) { $linkDensity = $this->getLinkDensity($siblingNode); $nodeContent = $this->getInnerText($siblingNode, true, true); $nodeLength = mb_strlen($nodeContent); if (($nodeLength > self::MIN_NODE_LENGTH && $linkDensity < self::MAX_LINK_DENSITY) - || ($nodeLength < self::MIN_NODE_LENGTH && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent))) { + || ($nodeLength < self::MIN_NODE_LENGTH && 0 === $linkDensity && preg_match('/\.( |$)/', $nodeContent))) { $append = true; } } @@ -1002,7 +1269,7 @@ class Readability implements LoggerAwareInterface if ($append) { $this->logger->debug('Appending node: ' . $siblingNode->getNodePath()); - if (strcasecmp($siblingNodeName, 'div') !== 0 && strcasecmp($siblingNodeName, 'p') !== 0) { + if (0 !== strcasecmp($siblingNodeName, 'div') && 0 !== strcasecmp($siblingNodeName, 'p')) { // We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. $this->logger->debug('Altering siblingNode "' . $siblingNodeName . '" to "div".'); $nodeToAppend = $this->dom->createElement('div'); @@ -1065,107 +1332,7 @@ class Readability implements LoggerAwareInterface return false; } - return $articleContent; - } - - /** - * Get the inner text of a node. - * This also strips out any excess whitespace to be found. - * - * @param \DOMElement $e - * @param bool $normalizeSpaces (default: true) - * @param bool $flattenLines (default: false) - * - * @return string - */ - public function getInnerText($e, $normalizeSpaces = true, $flattenLines = false) - { - if (null === $e || !isset($e->textContent) || $e->textContent === '') { - return ''; - } - - $textContent = trim($e->textContent); - - if ($flattenLines) { - $textContent = mb_ereg_replace('(?:[\r\n](?:\s| )*)+', '', $textContent); - } elseif ($normalizeSpaces) { - $textContent = mb_ereg_replace('\s\s+', ' ', $textContent); - } - - return $textContent; - } - - /** - * Remove the style attribute on every $e and under. - * - * @param \DOMElement $e - */ - public function cleanStyles($e) - { - if (!is_object($e)) { - return; - } - - $elems = $e->getElementsByTagName('*'); - - foreach ($elems as $elem) { - $elem->removeAttribute('style'); - } - } - - /** - * Get comma number for a given text. - * - * @param string $text - * - * @return int - */ - public function getCommaCount($text) - { - return substr_count($text, ','); - } - - /** - * Get words number for a given text if words separated by a space. - * Input string should be normalized. - * - * @param string $text - * - * @return int - */ - public function getWordCount($text) - { - return substr_count($text, ' '); - } - - /** - * Get the density of links as a percentage of the content - * This is the amount of text that is inside a link divided by the total text in the node. - * Can exclude external references to differentiate between simple text and menus/infoblocks. - * - * @param \DOMElement $e - * @param string $excludeExternal - * - * @return int - */ - public function getLinkDensity($e, $excludeExternal = false) - { - $links = $e->getElementsByTagName('a'); - $textLength = mb_strlen($this->getInnerText($e, true, true)); - $linkLength = 0; - - for ($dRe = $this->domainRegExp, $i = 0, $il = $links->length; $i < $il; ++$i) { - if ($excludeExternal && $dRe && !preg_match($dRe, $links->item($i)->getAttribute('href'))) { - continue; - } - $linkLength += mb_strlen($this->getInnerText($links->item($i))); - } - - if ($textLength > 0 && $linkLength > 0) { - return $linkLength / $textLength; - } - - return 0; + return $articleContent; } /** @@ -1187,7 +1354,7 @@ class Readability implements LoggerAwareInterface // $attributeValue = trim($element->getAttribute('class')." ".$element->getAttribute('id')); $attributeValue = trim($element->getAttribute($attribute)); - if ($attributeValue !== '') { + if ('' !== $attributeValue) { if (preg_match($this->regexps['negative'], $attributeValue)) { $weight -= 25; } @@ -1206,250 +1373,83 @@ class Readability implements LoggerAwareInterface } /** - * Get an element relative weight. - * - * @param \DOMElement $e - * - * @return int + * Will recreate previously deleted body property. */ - public function getWeight($e) + protected function reinitBody() { - if (!$this->flagIsActive(self::FLAG_WEIGHT_ATTRIBUTES)) { - return 0; + if (!isset($this->body->childNodes)) { + $this->body = $this->dom->createElement('body'); + $this->body->innerHTML = $this->bodyCache; } - - $weight = 0; - // Look for a special classname - $weight += $this->weightAttribute($e, 'class'); - // Look for a special ID - $weight += $this->weightAttribute($e, 'id'); - - return $weight; - } - - /** - * Remove extraneous break tags from a node. - * - * @param \DOMElement $node - */ - public function killBreaks($node) - { - $html = $node->innerHTML; - $html = preg_replace($this->regexps['killBreaks'], '
            ', $html); - $node->innerHTML = $html; } /** - * Clean a node of all elements of type "tag". - * (Unless it's a youtube/vimeo video. People love movies.). - * - * Updated 2012-09-18 to preserve youtube/vimeo iframes + * Load HTML in a DOMDocument. + * Apply Pre filters + * Cleanup HTML using Tidy (or not). * - * @param \DOMElement $e - * @param string $tag + * @todo This should be called in init() instead of from __construct */ - public function clean($e, $tag) + private function loadHtml() { - $currentItem = null; - $targetList = $e->getElementsByTagName($tag); - $isEmbed = ($tag === 'audio' || $tag === 'video' || $tag === 'iframe' || $tag === 'object' || $tag === 'embed'); + $this->original_html = $this->html; - for ($y = $targetList->length - 1; $y >= 0; --$y) { - // Allow youtube and vimeo videos through as people usually want to see those. - $currentItem = $targetList->item($y); + $this->logger->debug('Parsing URL: ' . $this->url); - if ($isEmbed) { - $attributeValues = $currentItem->getAttribute('src') . ' ' . $currentItem->getAttribute('href'); + if ($this->url) { + $this->domainRegExp = '/' . strtr(preg_replace('/www\d*\./', '', parse_url($this->url, PHP_URL_HOST)), ['.' => '\.']) . '/'; + } - // First, check the elements attributes to see if any of them contain known media hosts - if (preg_match($this->regexps['media'], $attributeValues)) { - continue; - } + mb_internal_encoding('UTF-8'); + mb_http_output('UTF-8'); + mb_regex_encoding('UTF-8'); - // Then check the elements inside this element for the same. - if (preg_match($this->regexps['media'], $targetList->item($y)->innerHTML)) { - continue; - } + // HACK: dirty cleanup to replace some stuff; shouldn't use regexps with HTML but well... + if (!$this->flagIsActive(self::FLAG_DISABLE_PREFILTER)) { + foreach ($this->pre_filters as $search => $replace) { + $this->html = preg_replace($search, $replace, $this->html); } - - $currentItem->parentNode->removeChild($currentItem); + unset($search, $replace); } - } - /** - * Clean an element of all tags of type "tag" if they look fishy. - * "Fishy" is an algorithm based on content length, classnames, - * link density, number of images & embeds, etc. - * - * @param \DOMElement $e - * @param string $tag - */ - public function cleanConditionally($e, $tag) - { - if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) { - return; + if ('' === trim($this->html)) { + $this->html = ''; } - $tagsList = $e->getElementsByTagName($tag); - $curTagsLength = $tagsList->length; - $node = null; - /* - * Gather counts for other typical elements embedded within. - * Traverse backwards so we can remove nodes at the same time without effecting the traversal. - * - * TODO: Consider taking into account original contentScore here. + * Use tidy (if it exists). + * This fixes problems with some sites which would otherwise trouble DOMDocument's HTML parsing. + * Although sometimes it makes matters worse, which is why there is an option to disable it. */ - for ($i = $curTagsLength - 1; $i >= 0; --$i) { - $node = $tagsList->item($i); - $weight = $this->getWeight($node); - $contentScore = ($node->hasAttribute('readability')) ? (int) $node->getAttribute('readability') : 0; - $this->logger->debug('Start conditional cleaning of ' . $node->getNodePath() . ' (class=' . $node->getAttribute('class') . '; id=' . $node->getAttribute('id') . ')' . (($node->hasAttribute('readability')) ? (' with score ' . $node->getAttribute('readability')) : '')); - - if ($weight + $contentScore < 0) { - $this->logger->debug('Removing...'); - $node->parentNode->removeChild($node); - } elseif ($this->getCommaCount($this->getInnerText($node)) < self::MIN_COMMAS_IN_PARAGRAPH) { - /* - * If there are not very many commas, and the number of - * non-paragraph elements is more than paragraphs or other ominous signs, remove the element. - */ - $p = $node->getElementsByTagName('p')->length; - $img = $node->getElementsByTagName('img')->length; - $li = $node->getElementsByTagName('li')->length - 100; - $input = $node->getElementsByTagName('input')->length; - $a = $node->getElementsByTagName('a')->length; - $embedCount = 0; - $embeds = $node->getElementsByTagName('embed'); - - for ($ei = 0, $il = $embeds->length; $ei < $il; ++$ei) { - if (preg_match($this->regexps['media'], $embeds->item($ei)->getAttribute('src'))) { - ++$embedCount; - } - } - - $embeds = $node->getElementsByTagName('iframe'); - for ($ei = 0, $il = $embeds->length; $ei < $il; ++$ei) { - if (preg_match($this->regexps['media'], $embeds->item($ei)->getAttribute('src'))) { - ++$embedCount; - } - } - - $linkDensity = $this->getLinkDensity($node, true); - $contentLength = mb_strlen($this->getInnerText($node)); - $toRemove = false; - - if ($this->lightClean) { - if ($li > $p && $tag !== 'ul' && $tag !== 'ol') { - $this->logger->debug(' too many
          1. elements, and parent is not
              or
                '); - $toRemove = true; - } elseif ($input > floor($p / 3)) { - $this->logger->debug(' too many elements'); - $toRemove = true; - } elseif ($contentLength < 6 && ($embedCount === 0 && ($img === 0 || $img > 2))) { - $this->logger->debug(' content length less than 6 chars, 0 embeds and either 0 images or more than 2 images'); - $toRemove = true; - } elseif ($weight < 25 && $linkDensity > 0.25) { - $this->logger->debug(' weight is ' . $weight . ' < 25 and link density is ' . sprintf('%.2f', $linkDensity) . ' > 0.25'); - $toRemove = true; - } elseif ($a > 2 && ($weight >= 25 && $linkDensity > 0.5)) { - $this->logger->debug(' more than 2 links and weight is ' . $weight . ' > 25 but link density is ' . sprintf('%.2f', $linkDensity) . ' > 0.5'); - $toRemove = true; - } elseif ($embedCount > 3) { - $this->logger->debug(' more than 3 embeds'); - $toRemove = true; - } - } else { - if ($img > $p) { - $this->logger->debug(' more image elements than paragraph elements'); - $toRemove = true; - } elseif ($li > $p && $tag !== 'ul' && $tag !== 'ol') { - $this->logger->debug(' too many
              1. elements, and parent is not
                  or
                    '); - $toRemove = true; - } elseif ($input > floor($p / 3)) { - $this->logger->debug(' too many elements'); - $toRemove = true; - } elseif ($contentLength < 10 && ($img === 0 || $img > 2)) { - $this->logger->debug(' content length less than 10 chars and 0 images, or more than 2 images'); - $toRemove = true; - } elseif ($weight < 25 && $linkDensity > 0.2) { - $this->logger->debug(' weight is ' . $weight . ' lower than 0 and link density is ' . sprintf('%.2f', $linkDensity) . ' > 0.2'); - $toRemove = true; - } elseif ($weight >= 25 && $linkDensity > 0.5) { - $this->logger->debug(' weight above 25 but link density is ' . sprintf('%.2f', $linkDensity) . ' > 0.5'); - $toRemove = true; - } elseif (($embedCount === 1 && $contentLength < 75) || $embedCount > 1) { - $this->logger->debug(' 1 embed and content length smaller than 75 chars, or more than one embed'); - $toRemove = true; - } - } + if ($this->useTidy) { + $this->logger->debug('Tidying document'); - if ($toRemove) { - $this->logger->debug('Removing...'); - $node->parentNode->removeChild($node); - } + $tidy = tidy_repair_string($this->html, $this->tidy_config, 'UTF8'); + if (false !== $tidy && $this->html !== $tidy) { + $this->tidied = true; + $this->html = $tidy; + $this->html = preg_replace('/[\r\n]+/is', "\n", $this->html); } + unset($tidy); } - } - - /** - * Clean out spurious headers from an Element. Checks things like classnames and link density. - * - * @param \DOMElement $e - */ - public function cleanHeaders($e) - { - for ($headerIndex = 1; $headerIndex < 3; ++$headerIndex) { - $headers = $e->getElementsByTagName('h' . $headerIndex); - for ($i = $headers->length - 1; $i >= 0; --$i) { - if ($this->getWeight($headers->item($i)) < 0 || $this->getLinkDensity($headers->item($i)) > 0.33) { - $headers->item($i)->parentNode->removeChild($headers->item($i)); - } - } - } - } + $this->html = mb_convert_encoding($this->html, 'HTML-ENTITIES', 'UTF-8'); - /** - * Check if the given flag is active. - * - * @param int $flag - * - * @return bool - */ - public function flagIsActive($flag) - { - return ($this->flags & $flag) > 0; - } + if (!('html5lib' === $this->parser && ($this->dom = Parser::parse($this->html)))) { + libxml_use_internal_errors(true); - /** - * Add a flag. - * - * @param int $flag - */ - public function addFlag($flag) - { - $this->flags = $this->flags | $flag; - } + $this->dom = new \DOMDocument(); + $this->dom->preserveWhiteSpace = false; - /** - * Remove a flag. - * - * @param int $flag - */ - public function removeFlag($flag) - { - $this->flags = $this->flags & ~$flag; - } + if (\PHP_VERSION_ID >= 50400) { + $this->dom->loadHTML($this->html, LIBXML_NOBLANKS | LIBXML_COMPACT | LIBXML_NOERROR); + } else { + $this->dom->loadHTML($this->html); + } - /** - * Will recreate previously deleted body property. - */ - protected function reinitBody() - { - if (!isset($this->body->childNodes)) { - $this->body = $this->dom->createElement('body'); - $this->body->innerHTML = $this->bodyCache; + libxml_use_internal_errors(false); } + + $this->dom->registerNodeClass('DOMElement', 'Readability\JSLikeHTMLElement'); } } diff --git a/tests/ReadabilityTest.php b/tests/ReadabilityTest.php index be03bd8..5a3efdd 100644 --- a/tests/ReadabilityTest.php +++ b/tests/ReadabilityTest.php @@ -11,17 +11,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase public $logHandler; public $logger; - private function getReadability($html, $url = null, $parser = 'libxml', $useTidy = true) - { - $readability = new Readability($html, $url, $parser, $useTidy); - - $this->logHandler = new TestHandler(); - $this->logger = new Logger('test', array($this->logHandler)); - $readability->setLogger($this->logger); - - return $readability; - } - /** * @requires extension tidy */ @@ -345,7 +334,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase { error_reporting(E_ALL | E_STRICT); ini_set('display_errors', true); - set_error_handler(array($this, 'error2Exception'), E_ALL | E_STRICT); + set_error_handler([$this, 'error2Exception'], E_ALL | E_STRICT); $data = ' @@ -493,4 +482,15 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase $this->assertContains('2', $readability->getContent()->innerHTML); $this->assertContains('getContent()->innerHTML); } + + private function getReadability($html, $url = null, $parser = 'libxml', $useTidy = true) + { + $readability = new Readability($html, $url, $parser, $useTidy); + + $this->logHandler = new TestHandler(); + $this->logger = new Logger('test', [$this->logHandler]); + $readability->setLogger($this->logger); + + return $readability; + } }