diff --git a/.php_cs b/.php_cs
index 8340221..5dc5396 100644
--- a/.php_cs
+++ b/.php_cs
@@ -4,22 +4,27 @@ return PhpCsFixer\Config::create()
->setUsingCache(true)
->setRiskyAllowed(true)
->setRules([
- 'concat_space' => [
- 'spacing' => 'one',
- ],
+ '@Symfony' => true,
+ '@Symfony:risky' => true,
+ 'array_syntax' => ['syntax' => 'short'],
+ 'combine_consecutive_unsets' => true,
+ 'heredoc_to_nowdoc' => true,
+ 'no_extra_consecutive_blank_lines' => ['break', 'continue', 'extra', 'return', 'throw', 'use', 'parenthesis_brace_block', 'square_brace_block', 'curly_brace_block'],
+ 'no_unreachable_default_argument_value' => true,
+ 'no_useless_else' => true,
+ 'no_useless_return' => true,
+ 'ordered_class_elements' => true,
'ordered_imports' => true,
+ 'php_unit_strict' => false,
'phpdoc_order' => true,
+ // 'psr4' => true,
'strict_comparison' => true,
'strict_param' => true,
- 'array_syntax' => [
- 'syntax' => 'long',
- ],
+ 'concat_space' => ['spacing' => 'one'],
])
->setFinder(
PhpCsFixer\Finder::create()
- ->exclude([
- 'vendor',
- ])
+ ->exclude(['vendor'])
->in(__DIR__)
)
;
diff --git a/src/JSLikeHTMLElement.php b/src/JSLikeHTMLElement.php
index 15e7281..b908d06 100644
--- a/src/JSLikeHTMLElement.php
+++ b/src/JSLikeHTMLElement.php
@@ -45,7 +45,7 @@ class JSLikeHTMLElement extends \DOMElement
*/
public function __set($name, $value)
{
- if ($name !== 'innerHTML') {
+ if ('innerHTML' !== $name) {
$trace = debug_backtrace();
trigger_error('Undefined property via __set(): ' . $name . ' in ' . $trace[0]['file'] . ' on line ' . $trace[0]['line'], E_USER_NOTICE);
@@ -109,7 +109,7 @@ class JSLikeHTMLElement extends \DOMElement
*/
public function __get($name)
{
- if ($name === 'innerHTML') {
+ if ('innerHTML' === $name) {
$inner = '';
foreach ($this->childNodes as $child) {
@@ -121,8 +121,6 @@ class JSLikeHTMLElement extends \DOMElement
$trace = debug_backtrace();
trigger_error('Undefined property via __get(): ' . $name . ' in ' . $trace[0]['file'] . ' on line ' . $trace[0]['line'], E_USER_NOTICE);
-
- return;
}
public function __toString()
diff --git a/src/Readability.php b/src/Readability.php
index 986cce5..8d0aa33 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -52,6 +52,21 @@ use Psr\Log\NullLogger;
*/
class Readability implements LoggerAwareInterface
{
+ // flags
+ const FLAG_STRIP_UNLIKELYS = 1;
+ const FLAG_WEIGHT_ATTRIBUTES = 2;
+ const FLAG_CLEAN_CONDITIONALLY = 4;
+ const FLAG_DISABLE_PREFILTER = 8;
+ const FLAG_DISABLE_POSTFILTER = 16;
+ // constants
+ const SCORE_CHARS_IN_PARAGRAPH = 100;
+ const SCORE_WORDS_IN_PARAGRAPH = 20;
+ const GRANDPARENT_SCORE_DIVISOR = 2.2;
+ const MIN_PARAGRAPH_LENGTH = 20;
+ const MIN_COMMAS_IN_PARAGRAPH = 6;
+ const MIN_ARTICLE_LENGTH = 200;
+ const MIN_NODE_LENGTH = 80;
+ const MAX_LINK_DENSITY = 0.25;
public $convertLinksToFootnotes = false;
public $revertForcedParagraphElements = true;
public $articleTitle;
@@ -65,25 +80,12 @@ class Readability implements LoggerAwareInterface
// no more used, keept to avoid BC
public $debug = false;
public $tidied = false;
- // article domain regexp for calibration
- protected $domainRegExp = null;
- protected $body = null;
- // Cache the body HTML in case we need to re-use it later
- protected $bodyCache = null;
- // 1 | 2 | 4; // Start with all processing flags set.
- protected $flags = 7;
- // indicates whether we were able to extract or not
- protected $success = false;
- protected $logger;
- protected $parser;
- protected $html;
- protected $useTidy;
/**
* All of the regular expressions in use within readability.
* Defined up here so we don't instantiate them repeatedly in loops.
*/
- public $regexps = array(
+ public $regexps = [
'unlikelyCandidates' => '/display\s*:\s*none|ignore|\binfos?\b|annoy|clock|date|time|author|intro|hidd?e|about|archive|\bprint|bookmark|tags|tag-list|share|search|social|robot|published|combx|comment|mast(?:head)|subscri|community|category|disqus|extra|head|head(?:er|note)|floor|foot(?:er|note)|menu|tool\b|function|nav|remark|rss|shoutbox|widget|meta|banner|sponsor|adsense|inner-?ad|ad-|sponsor|\badv\b|\bads\b|agr?egate?|pager|sidebar|popup|tweet|twitter/i',
'okMaybeItsACandidate' => '/article\b|contain|\bcontent|column|general|detail|shadow|lightbox|blog|body|entry|main|page|footnote/i',
'positive' => '/read|full|article|body|\bcontent|contain|entry|main|markdown|media|page|attach|pagination|post|text|blog|story/i',
@@ -92,8 +94,8 @@ class Readability implements LoggerAwareInterface
'killBreaks' => '/(
([ \r\n\s]| ?)*)+/',
'media' => '!//(?:[^\.\?/]+\.)?(?:youtu(?:be)?|giphy|soundcloud|dailymotion|vimeo|pornhub|xvideos|twitvid|rutube|viddler)\.(?:com|be|org|net)/!i',
'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i',
- );
- public $tidy_config = array(
+ ];
+ public $tidy_config = [
'tidy-mark' => false,
'vertical-space' => false,
'doctype' => 'omit',
@@ -117,9 +119,22 @@ class Readability implements LoggerAwareInterface
'input-encoding' => '????',
'output-encoding' => 'utf8',
'hide-comments' => true,
- );
+ ];
+ // article domain regexp for calibration
+ protected $domainRegExp = null;
+ protected $body = null;
+ // Cache the body HTML in case we need to re-use it later
+ protected $bodyCache = null;
+ // FLAG_STRIP_UNLIKELYS | FLAG_WEIGHT_ATTRIBUTES | FLAG_CLEAN_CONDITIONALLY (1 | 2 | 4): start with all processing flags set.
+ protected $flags = 7;
+ // indicates whether we were able to extract or not
+ protected $success = false;
+ protected $logger;
+ protected $parser;
+ protected $html;
+ protected $useTidy;
// raw HTML filters
- protected $pre_filters = array(
+ protected $pre_filters = [
// remove obvious scripts
'!!is' => '',
// remove obvious styles
@@ -134,9 +149,9 @@ class Readability implements LoggerAwareInterface
//'!?noscript>!is' => '',
// replace fonts to spans
'!<(/?)font[^>]*>!is' => '<\\1span>',
- );
+ ];
// output HTML filters
- protected $post_filters = array(
+ protected $post_filters = [
// replace excessive br's
'/
\s*
'
]*>\s* '\s*
!is' => '',
'!<[hb]r>!is' => '<\\1 />',
- );
-
- // flags
- const FLAG_STRIP_UNLIKELYS = 1;
- const FLAG_WEIGHT_ATTRIBUTES = 2;
- const FLAG_CLEAN_CONDITIONALLY = 4;
- const FLAG_DISABLE_PREFILTER = 8;
- const FLAG_DISABLE_POSTFILTER = 16;
- // constants
- const SCORE_CHARS_IN_PARAGRAPH = 100;
- const SCORE_WORDS_IN_PARAGRAPH = 20;
- const GRANDPARENT_SCORE_DIVISOR = 2.2;
- const MIN_PARAGRAPH_LENGTH = 20;
- const MIN_COMMAS_IN_PARAGRAPH = 6;
- const MIN_ARTICLE_LENGTH = 200;
- const MIN_NODE_LENGTH = 80;
- const MAX_LINK_DENSITY = 0.25;
+ ];
/**
* Create instance of Readability.
@@ -180,7 +179,7 @@ class Readability implements LoggerAwareInterface
$this->url = $url;
$this->html = $html;
$this->parser = $parser;
- $this->useTidy = $use_tidy && function_exists('tidy_parse_string');
+ $this->useTidy = $use_tidy && \function_exists('tidy_parse_string');
$this->logger = new NullLogger();
$this->loadHtml();
@@ -233,76 +232,6 @@ class Readability implements LoggerAwareInterface
$this->post_filters[$filter] = $replacer;
}
- /**
- * Load HTML in a DOMDocument.
- * Apply Pre filters
- * Cleanup HTML using Tidy (or not).
- *
- * @todo This should be called in init() instead of from __construct
- */
- private function loadHtml()
- {
- $this->original_html = $this->html;
-
- $this->logger->debug('Parsing URL: ' . $this->url);
-
- if ($this->url) {
- $this->domainRegExp = '/' . strtr(preg_replace('/www\d*\./', '', parse_url($this->url, PHP_URL_HOST)), array('.' => '\.')) . '/';
- }
-
- mb_internal_encoding('UTF-8');
- mb_http_output('UTF-8');
- mb_regex_encoding('UTF-8');
-
- // HACK: dirty cleanup to replace some stuff; shouldn't use regexps with HTML but well...
- if (!$this->flagIsActive(self::FLAG_DISABLE_PREFILTER)) {
- foreach ($this->pre_filters as $search => $replace) {
- $this->html = preg_replace($search, $replace, $this->html);
- }
- unset($search, $replace);
- }
-
- if (trim($this->html) === '') {
- $this->html = '';
- }
-
- /*
- * Use tidy (if it exists).
- * This fixes problems with some sites which would otherwise trouble DOMDocument's HTML parsing.
- * Although sometimes it makes matters worse, which is why there is an option to disable it.
- */
- if ($this->useTidy) {
- $this->logger->debug('Tidying document');
-
- $tidy = tidy_repair_string($this->html, $this->tidy_config, 'UTF8');
- if (false !== $tidy && $this->html !== $tidy) {
- $this->tidied = true;
- $this->html = $tidy;
- $this->html = preg_replace('/[\r\n]+/is', "\n", $this->html);
- }
- unset($tidy);
- }
-
- $this->html = mb_convert_encoding($this->html, 'HTML-ENTITIES', 'UTF-8');
-
- if (!($this->parser === 'html5lib' && ($this->dom = Parser::parse($this->html)))) {
- libxml_use_internal_errors(true);
-
- $this->dom = new \DOMDocument();
- $this->dom->preserveWhiteSpace = false;
-
- if (PHP_VERSION_ID >= 50400) {
- $this->dom->loadHTML($this->html, LIBXML_NOBLANKS | LIBXML_COMPACT | LIBXML_NOERROR);
- } else {
- $this->dom->loadHTML($this->html);
- }
-
- libxml_use_internal_errors(false);
- }
-
- $this->dom->registerNodeClass('DOMElement', 'Readability\JSLikeHTMLElement');
- }
-
/**
* Runs readability.
*
@@ -326,14 +255,14 @@ class Readability implements LoggerAwareInterface
$bodyElems = $this->dom->getElementsByTagName('body');
// WTF multiple body nodes?
- if ($this->bodyCache === null) {
+ if (null === $this->bodyCache) {
$this->bodyCache = '';
foreach ($bodyElems as $bodyNode) {
$this->bodyCache .= trim($bodyNode->innerHTML);
}
}
- if ($bodyElems->length > 0 && $this->body === null) {
+ if ($bodyElems->length > 0 && null === $this->body) {
$this->body = $bodyElems->item(0);
}
@@ -373,27 +302,6 @@ class Readability implements LoggerAwareInterface
return $this->success;
}
- /**
- * Debug.
- *
- * @deprecated use $this->logger->debug() instead
- * @codeCoverageIgnore
- */
- protected function dbg($msg)
- {
- $this->logger->debug($msg);
- }
-
- /**
- * Dump debug info.
- *
- * @deprecated since Monolog gather log, we don't need it
- * @codeCoverageIgnore
- */
- protected function dump_dbg()
- {
- }
-
/**
* Run any post-process modifications to article content as necessary.
*
@@ -406,77 +314,6 @@ class Readability implements LoggerAwareInterface
}
}
- /**
- * Get the article title as an H1.
- *
- * @return \DOMElement
- */
- protected function getArticleTitle()
- {
- try {
- $curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0));
- } catch (\Exception $e) {
- $curTitle = '';
- $origTitle = '';
- }
-
- if (preg_match('/ [\|\-] /', $curTitle)) {
- $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle);
- if (count(explode(' ', $curTitle)) < 3) {
- $curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle);
- }
- } elseif (strpos($curTitle, ': ') !== false) {
- $curTitle = preg_replace('/.*:(.*)/i', '$1', $origTitle);
- if (count(explode(' ', $curTitle)) < 3) {
- $curTitle = preg_replace('/[^:]*[:](.*)/i', '$1', $origTitle);
- }
- } elseif (mb_strlen($curTitle) > 150 || mb_strlen($curTitle) < 15) {
- $hOnes = $this->dom->getElementsByTagName('h1');
- if ($hOnes->length === 1) {
- $curTitle = $this->getInnerText($hOnes->item(0));
- }
- }
-
- $curTitle = trim($curTitle);
- if (count(explode(' ', $curTitle)) <= 4) {
- $curTitle = $origTitle;
- }
-
- $articleTitle = $this->dom->createElement('h1');
- $articleTitle->innerHTML = $curTitle;
-
- return $articleTitle;
- }
-
- /**
- * Prepare the HTML document for readability to scrape it.
- * This includes things like stripping javascript, CSS, and handling terrible markup.
- */
- protected function prepDocument()
- {
- /*
- * In some cases a body element can't be found (if the HTML is totally hosed for example)
- * so we create a new body node and append it to the document.
- */
- if ($this->body === null) {
- $this->body = $this->dom->createElement('body');
- $this->dom->documentElement->appendChild($this->body);
- }
-
- $this->body->setAttribute('class', 'readabilityBody');
-
- // Remove all style tags in head.
- $styleTags = $this->dom->getElementsByTagName('style');
- for ($i = $styleTags->length - 1; $i >= 0; --$i) {
- $styleTags->item($i)->parentNode->removeChild($styleTags->item($i));
- }
-
- $linkTags = $this->dom->getElementsByTagName('link');
- for ($i = $linkTags->length - 1; $i >= 0; --$i) {
- $linkTags->item($i)->parentNode->removeChild($linkTags->item($i));
- }
- }
-
/**
* For easier reading, convert this document to have footnotes at the bottom rather than inline links.
*
@@ -506,7 +343,7 @@ class Readability implements LoggerAwareInterface
}
$linkText = $this->getInnerText($articleLink);
- if ((strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote') !== false) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) {
+ if ((false !== strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote')) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) {
continue;
}
@@ -527,7 +364,7 @@ class Readability implements LoggerAwareInterface
$articleLink->setAttribute('style', 'color: inherit; text-decoration: none;');
$articleLink->setAttribute('name', 'readabilityLink-' . $linkCount);
$footnote->innerHTML = '^ ';
- $footnoteLink->innerHTML = ($footnoteLink->getAttribute('title') !== '' ? $footnoteLink->getAttribute('title') : $linkText);
+ $footnoteLink->innerHTML = ('' !== $footnoteLink->getAttribute('title') ? $footnoteLink->getAttribute('title') : $linkText);
$footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount);
$footnote->appendChild($footnoteLink);
@@ -589,7 +426,7 @@ class Readability implements LoggerAwareInterface
* already have a header.
*/
$h2s = $articleContent->getElementsByTagName('h2');
- if ($h2s->length === 1 && mb_strlen($this->getInnerText($h2s->item(0), true, true)) < 100) {
+ if (1 === $h2s->length && mb_strlen($this->getInnerText($h2s->item(0), true, true)) < 100) {
$this->clean($articleContent, 'h2');
}
@@ -614,7 +451,7 @@ class Readability implements LoggerAwareInterface
$audioCount = $item->getElementsByTagName('audio')->length;
$iframeCount = $item->getElementsByTagName('iframe')->length;
- if ($iframeCount === 0 && $imgCount === 0 && $embedCount === 0 && $objectCount === 0 && $videoCount === 0 && $audioCount === 0 && mb_strlen(preg_replace('/\s+/is', '', $this->getInnerText($item, false, false))) === 0) {
+ if (0 === $iframeCount && 0 === $imgCount && 0 === $embedCount && 0 === $objectCount && 0 === $videoCount && 0 === $audioCount && 0 === mb_strlen(preg_replace('/\s+/is', '', $this->getInnerText($item, false, false)))) {
$item->parentNode->removeChild($item);
}
@@ -640,30 +477,460 @@ class Readability implements LoggerAwareInterface
}
/**
- * Initialize a node with the readability object. Also checks the
- * className/id for special names to add to its score.
+ * Get the inner text of a node.
+ * This also strips out any excess whitespace to be found.
*
- * @param \DOMElement $node
+ * @param \DOMElement $e
+ * @param bool $normalizeSpaces (default: true)
+ * @param bool $flattenLines (default: false)
+ *
+ * @return string
*/
- protected function initializeNode($node)
+ public function getInnerText($e, $normalizeSpaces = true, $flattenLines = false)
{
- if (!isset($node->tagName)) {
+ if (null === $e || !isset($e->textContent) || '' === $e->textContent) {
+ return '';
+ }
+
+ $textContent = trim($e->textContent);
+
+ if ($flattenLines) {
+ $textContent = mb_ereg_replace('(?:[\r\n](?:\s| )*)+', '', $textContent);
+ } elseif ($normalizeSpaces) {
+ $textContent = mb_ereg_replace('\s\s+', ' ', $textContent);
+ }
+
+ return $textContent;
+ }
+
+ /**
+ * Remove the style attribute from every element nested under $e (descendants only).
+ *
+ * @param \DOMElement $e
+ */
+ public function cleanStyles($e)
+ {
+ if (!\is_object($e)) {
return;
}
- $readability = $this->dom->createAttribute('readability');
- // this is our contentScore
- $readability->value = 0;
- $node->setAttributeNode($readability);
+ $elems = $e->getElementsByTagName('*');
- // using strtoupper just in case
- switch (strtoupper($node->tagName)) {
- case 'ARTICLE':
- $readability->value += 15;
- case 'DIV':
- $readability->value += 5;
- break;
- case 'PRE':
+ foreach ($elems as $elem) {
+ $elem->removeAttribute('style');
+ }
+ }
+
+ /**
+ * Count the commas in the given text.
+ *
+ * @param string $text
+ *
+ * @return int
+ */
+ public function getCommaCount($text)
+ {
+ return substr_count($text, ',');
+ }
+
+ /**
+ * Estimate the number of words in a text by counting its space separators.
+ * Input string should be whitespace-normalized; the result is the number of spaces found.
+ *
+ * @param string $text
+ *
+ * @return int
+ */
+ public function getWordCount($text)
+ {
+ return substr_count($text, ' ');
+ }
+
+ /**
+ * Get the density of links as a fraction of the content.
+ * This is the amount of text that is inside a link divided by the total text in the node.
+ * Can exclude external references to differentiate between simple text and menus/infoblocks.
+ *
+ * @param \DOMElement $e
+ * @param bool $excludeExternal
+ *
+ * @return float|int
+ */
+ public function getLinkDensity($e, $excludeExternal = false)
+ {
+ $links = $e->getElementsByTagName('a');
+ $textLength = mb_strlen($this->getInnerText($e, true, true));
+ $linkLength = 0;
+
+ for ($dRe = $this->domainRegExp, $i = 0, $il = $links->length; $i < $il; ++$i) {
+ if ($excludeExternal && $dRe && !preg_match($dRe, $links->item($i)->getAttribute('href'))) {
+ continue;
+ }
+ $linkLength += mb_strlen($this->getInnerText($links->item($i)));
+ }
+
+ if ($textLength > 0 && $linkLength > 0) {
+ return $linkLength / $textLength;
+ }
+
+ return 0;
+ }
+
+ /**
+ * Get an element relative weight.
+ *
+ * @param \DOMElement $e
+ *
+ * @return int
+ */
+ public function getWeight($e)
+ {
+ if (!$this->flagIsActive(self::FLAG_WEIGHT_ATTRIBUTES)) {
+ return 0;
+ }
+
+ $weight = 0;
+ // Look for a special classname
+ $weight += $this->weightAttribute($e, 'class');
+ // Look for a special ID
+ $weight += $this->weightAttribute($e, 'id');
+
+ return $weight;
+ }
+
+ /**
+ * Remove extraneous break tags from a node.
+ *
+ * @param \DOMElement $node
+ */
+ public function killBreaks($node)
+ {
+ $html = $node->innerHTML;
+ $html = preg_replace($this->regexps['killBreaks'], '
', $html);
+ $node->innerHTML = $html;
+ }
+
+ /**
+ * Clean a node of all elements of type "tag".
+ * (Unless it's a youtube/vimeo video. People love movies.).
+ *
+ * Updated 2012-09-18 to preserve youtube/vimeo iframes
+ *
+ * @param \DOMElement $e
+ * @param string $tag
+ */
+ public function clean($e, $tag)
+ {
+ $currentItem = null;
+ $targetList = $e->getElementsByTagName($tag);
+ $isEmbed = ('audio' === $tag || 'video' === $tag || 'iframe' === $tag || 'object' === $tag || 'embed' === $tag);
+
+ for ($y = $targetList->length - 1; $y >= 0; --$y) {
+ // Allow youtube and vimeo videos through as people usually want to see those.
+ $currentItem = $targetList->item($y);
+
+ if ($isEmbed) {
+ $attributeValues = $currentItem->getAttribute('src') . ' ' . $currentItem->getAttribute('href');
+
+ // First, check the elements attributes to see if any of them contain known media hosts
+ if (preg_match($this->regexps['media'], $attributeValues)) {
+ continue;
+ }
+
+ // Then check the elements inside this element for the same.
+ if (preg_match($this->regexps['media'], $targetList->item($y)->innerHTML)) {
+ continue;
+ }
+ }
+
+ $currentItem->parentNode->removeChild($currentItem);
+ }
+ }
+
+ /**
+ * Clean an element of all tags of type "tag" if they look fishy.
+ * "Fishy" is an algorithm based on content length, classnames,
+ * link density, number of images & embeds, etc.
+ *
+ * @param \DOMElement $e
+ * @param string $tag
+ */
+ public function cleanConditionally($e, $tag)
+ {
+ if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
+ return;
+ }
+
+ $tagsList = $e->getElementsByTagName($tag);
+ $curTagsLength = $tagsList->length;
+ $node = null;
+
+ /*
+ * Gather counts for other typical elements embedded within.
+ * Traverse backwards so we can remove nodes at the same time without affecting the traversal.
+ *
+ * TODO: Consider taking into account original contentScore here.
+ */
+ for ($i = $curTagsLength - 1; $i >= 0; --$i) {
+ $node = $tagsList->item($i);
+ $weight = $this->getWeight($node);
+ $contentScore = ($node->hasAttribute('readability')) ? (int) $node->getAttribute('readability') : 0;
+ $this->logger->debug('Start conditional cleaning of ' . $node->getNodePath() . ' (class=' . $node->getAttribute('class') . '; id=' . $node->getAttribute('id') . ')' . (($node->hasAttribute('readability')) ? (' with score ' . $node->getAttribute('readability')) : ''));
+
+ if ($weight + $contentScore < 0) {
+ $this->logger->debug('Removing...');
+ $node->parentNode->removeChild($node);
+ } elseif ($this->getCommaCount($this->getInnerText($node)) < self::MIN_COMMAS_IN_PARAGRAPH) {
+ /*
+ * If there are not very many commas, and the number of
+ * non-paragraph elements is more than paragraphs or other ominous signs, remove the element.
+ */
+ $p = $node->getElementsByTagName('p')->length;
+ $img = $node->getElementsByTagName('img')->length;
+ $li = $node->getElementsByTagName('li')->length - 100;
+ $input = $node->getElementsByTagName('input')->length;
+ $a = $node->getElementsByTagName('a')->length;
+ $embedCount = 0;
+ $embeds = $node->getElementsByTagName('embed');
+
+ for ($ei = 0, $il = $embeds->length; $ei < $il; ++$ei) {
+ if (preg_match($this->regexps['media'], $embeds->item($ei)->getAttribute('src'))) {
+ ++$embedCount;
+ }
+ }
+
+ $embeds = $node->getElementsByTagName('iframe');
+ for ($ei = 0, $il = $embeds->length; $ei < $il; ++$ei) {
+ if (preg_match($this->regexps['media'], $embeds->item($ei)->getAttribute('src'))) {
+ ++$embedCount;
+ }
+ }
+
+ $linkDensity = $this->getLinkDensity($node, true);
+ $contentLength = mb_strlen($this->getInnerText($node));
+ $toRemove = false;
+
+ if ($this->lightClean) {
+ if ($li > $p && 'ul' !== $tag && 'ol' !== $tag) {
+ $this->logger->debug(' too many or
');
+ $toRemove = true;
+ } elseif ($input > floor($p / 3)) {
+ $this->logger->debug(' too many elements');
+ $toRemove = true;
+ } elseif ($contentLength < 6 && (0 === $embedCount && (0 === $img || $img > 2))) {
+ $this->logger->debug(' content length less than 6 chars, 0 embeds and either 0 images or more than 2 images');
+ $toRemove = true;
+ } elseif ($weight < 25 && $linkDensity > 0.25) {
+ $this->logger->debug(' weight is ' . $weight . ' < 25 and link density is ' . sprintf('%.2f', $linkDensity) . ' > 0.25');
+ $toRemove = true;
+ } elseif ($a > 2 && ($weight >= 25 && $linkDensity > 0.5)) {
+ $this->logger->debug(' more than 2 links and weight is ' . $weight . ' > 25 but link density is ' . sprintf('%.2f', $linkDensity) . ' > 0.5');
+ $toRemove = true;
+ } elseif ($embedCount > 3) {
+ $this->logger->debug(' more than 3 embeds');
+ $toRemove = true;
+ }
+ } else {
+ if ($img > $p) {
+ $this->logger->debug(' more image elements than paragraph elements');
+ $toRemove = true;
+ } elseif ($li > $p && 'ul' !== $tag && 'ol' !== $tag) {
+ $this->logger->debug(' too many
or
');
+ $toRemove = true;
+ } elseif ($input > floor($p / 3)) {
+ $this->logger->debug(' too many elements');
+ $toRemove = true;
+ } elseif ($contentLength < 10 && (0 === $img || $img > 2)) {
+ $this->logger->debug(' content length less than 10 chars and 0 images, or more than 2 images');
+ $toRemove = true;
+ } elseif ($weight < 25 && $linkDensity > 0.2) {
+ $this->logger->debug(' weight is ' . $weight . ' lower than 0 and link density is ' . sprintf('%.2f', $linkDensity) . ' > 0.2');
+ $toRemove = true;
+ } elseif ($weight >= 25 && $linkDensity > 0.5) {
+ $this->logger->debug(' weight above 25 but link density is ' . sprintf('%.2f', $linkDensity) . ' > 0.5');
+ $toRemove = true;
+ } elseif ((1 === $embedCount && $contentLength < 75) || $embedCount > 1) {
+ $this->logger->debug(' 1 embed and content length smaller than 75 chars, or more than one embed');
+ $toRemove = true;
+ }
+ }
+
+ if ($toRemove) {
+ $this->logger->debug('Removing...');
+ $node->parentNode->removeChild($node);
+ }
+ }
+ }
+ }
+
+ /**
+ * Clean out spurious headers from an Element. Checks things like classnames and link density.
+ *
+ * @param \DOMElement $e
+ */
+ public function cleanHeaders($e)
+ {
+ for ($headerIndex = 1; $headerIndex < 3; ++$headerIndex) {
+ $headers = $e->getElementsByTagName('h' . $headerIndex);
+
+ for ($i = $headers->length - 1; $i >= 0; --$i) {
+ if ($this->getWeight($headers->item($i)) < 0 || $this->getLinkDensity($headers->item($i)) > 0.33) {
+ $headers->item($i)->parentNode->removeChild($headers->item($i));
+ }
+ }
+ }
+ }
+
+ /**
+ * Check if the given flag is active.
+ *
+ * @param int $flag
+ *
+ * @return bool
+ */
+ public function flagIsActive($flag)
+ {
+ return ($this->flags & $flag) > 0;
+ }
+
+ /**
+ * Add a flag.
+ *
+ * @param int $flag
+ */
+ public function addFlag($flag)
+ {
+ $this->flags = $this->flags | $flag;
+ }
+
+ /**
+ * Remove a flag.
+ *
+ * @param int $flag
+ */
+ public function removeFlag($flag)
+ {
+ $this->flags = $this->flags & ~$flag;
+ }
+
+ /**
+ * Debug.
+ *
+ * @deprecated use $this->logger->debug() instead
+ * @codeCoverageIgnore
+ */
+ protected function dbg($msg)
+ {
+ $this->logger->debug($msg);
+ }
+
+ /**
+ * Dump debug info.
+ *
+ * @deprecated since Monolog gathers the logs, this is no longer needed
+ * @codeCoverageIgnore
+ */
+ protected function dump_dbg()
+ {
+ }
+
+ /**
+ * Get the article title as an H1.
+ *
+ * @return \DOMElement
+ */
+ protected function getArticleTitle()
+ {
+ try {
+ $curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0));
+ } catch (\Exception $e) {
+ $curTitle = '';
+ $origTitle = '';
+ }
+
+ if (preg_match('/ [\|\-] /', $curTitle)) {
+ $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle);
+ if (\count(explode(' ', $curTitle)) < 3) {
+ $curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle);
+ }
+ } elseif (false !== strpos($curTitle, ': ')) {
+ $curTitle = preg_replace('/.*:(.*)/i', '$1', $origTitle);
+ if (\count(explode(' ', $curTitle)) < 3) {
+ $curTitle = preg_replace('/[^:]*[:](.*)/i', '$1', $origTitle);
+ }
+ } elseif (mb_strlen($curTitle) > 150 || mb_strlen($curTitle) < 15) {
+ $hOnes = $this->dom->getElementsByTagName('h1');
+ if (1 === $hOnes->length) {
+ $curTitle = $this->getInnerText($hOnes->item(0));
+ }
+ }
+
+ $curTitle = trim($curTitle);
+ if (\count(explode(' ', $curTitle)) <= 4) {
+ $curTitle = $origTitle;
+ }
+
+ $articleTitle = $this->dom->createElement('h1');
+ $articleTitle->innerHTML = $curTitle;
+
+ return $articleTitle;
+ }
+
+ /**
+ * Prepare the HTML document for readability to scrape it.
+ * This includes things like stripping javascript, CSS, and handling terrible markup.
+ */
+ protected function prepDocument()
+ {
+ /*
+ * In some cases a body element can't be found (if the HTML is totally hosed for example)
+ * so we create a new body node and append it to the document.
+ */
+ if (null === $this->body) {
+ $this->body = $this->dom->createElement('body');
+ $this->dom->documentElement->appendChild($this->body);
+ }
+
+ $this->body->setAttribute('class', 'readabilityBody');
+
+ // Remove all style tags in head.
+ $styleTags = $this->dom->getElementsByTagName('style');
+ for ($i = $styleTags->length - 1; $i >= 0; --$i) {
+ $styleTags->item($i)->parentNode->removeChild($styleTags->item($i));
+ }
+
+ $linkTags = $this->dom->getElementsByTagName('link');
+ for ($i = $linkTags->length - 1; $i >= 0; --$i) {
+ $linkTags->item($i)->parentNode->removeChild($linkTags->item($i));
+ }
+ }
+
+ /**
+ * Initialize a node with the readability object. Also checks the
+ * className/id for special names to add to its score.
+ *
+ * @param \DOMElement $node
+ */
+ protected function initializeNode($node)
+ {
+ if (!isset($node->tagName)) {
+ return;
+ }
+
+ $readability = $this->dom->createAttribute('readability');
+ // this is our contentScore
+ $readability->value = 0;
+ $node->setAttributeNode($readability);
+
+ // using strtoupper just in case
+ switch (strtoupper($node->tagName)) {
+ case 'ARTICLE':
+ $readability->value += 15;
+ // no break
+ case 'DIV':
+ $readability->value += 5;
+ break;
+ case 'PRE':
case 'CODE':
case 'TD':
case 'BLOCKQUOTE':
@@ -723,7 +990,7 @@ class Readability implements LoggerAwareInterface
}
$xpath = null;
- $nodesToScore = array();
+ $nodesToScore = [];
if ($page instanceof \DOMDocument && isset($page->documentElement)) {
$xpath = new \DOMXPath($page);
@@ -735,13 +1002,13 @@ class Readability implements LoggerAwareInterface
$tagName = $node->tagName;
// Some well known site uses sections as paragraphs.
- if (strcasecmp($tagName, 'p') === 0 || strcasecmp($tagName, 'td') === 0 || strcasecmp($tagName, 'pre') === 0 || strcasecmp($tagName, 'section') === 0) {
+ if (0 === strcasecmp($tagName, 'p') || 0 === strcasecmp($tagName, 'td') || 0 === strcasecmp($tagName, 'pre') || 0 === strcasecmp($tagName, 'section')) {
$nodesToScore[] = $node;
}
// Turn divs into P tags where they have been used inappropriately
// (as in, where they contain no other block level elements).
- if (strcasecmp($tagName, 'div') === 0 || strcasecmp($tagName, 'article') === 0 || strcasecmp($tagName, 'section') === 0) {
+ if (0 === strcasecmp($tagName, 'div') || 0 === strcasecmp($tagName, 'article') || 0 === strcasecmp($tagName, 'section')) {
if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) {
$newNode = $this->dom->createElement('p');
@@ -765,13 +1032,13 @@ class Readability implements LoggerAwareInterface
}
// executable tags (parentNode->removeChild($childNode);
continue;
}
- if ($childNode->nodeType === XML_TEXT_NODE) {
+ if (XML_TEXT_NODE === $childNode->nodeType) {
$p = $this->dom->createElement('p');
$p->innerHTML = $childNode->nodeValue;
$p->setAttribute('data-readability-styled', 'true');
@@ -789,7 +1056,7 @@ class Readability implements LoggerAwareInterface
* A score is determined by things like number of commas, class names, etc.
* Maybe eventually link density.
*/
- for ($pt = 0, $scored = count($nodesToScore); $pt < $scored; ++$pt) {
+ for ($pt = 0, $scored = \count($nodesToScore); $pt < $scored; ++$pt) {
$parentNode = $nodesToScore[$pt]->parentNode;
// No parent node? Move on...
@@ -856,7 +1123,7 @@ class Readability implements LoggerAwareInterface
for ($c = $candidates->length - 1; $c >= 0; --$c) {
$node = $candidates->item($c);
// node should be readable but not inside of an article otherwise it's probably non-readable block
- if ($node->hasAttribute('readability') && (int) $node->getAttributeNode('readability')->value < 40 && ($node->parentNode ? strcasecmp($node->parentNode->tagName, 'article') !== 0 : true)) {
+ if ($node->hasAttribute('readability') && (int) $node->getAttributeNode('readability')->value < 40 && ($node->parentNode ? 0 !== strcasecmp($node->parentNode->tagName, 'article') : true)) {
$this->logger->debug('Removing unlikely candidate (using note) ' . $node->getNodePath() . ' by "' . $node->tagName . '" with readability ' . ($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0));
$node->parentNode->removeChild($node);
}
@@ -914,7 +1181,7 @@ class Readability implements LoggerAwareInterface
* If we still have no top candidate, just use the body as a last resort.
* We also have to copy the body node so it is something we can modify.
*/
- if ($topCandidate === null || strcasecmp($topCandidate->tagName, 'body') === 0) {
+ if (null === $topCandidate || 0 === strcasecmp($topCandidate->tagName, 'body')) {
$topCandidate = $this->dom->createElement('div');
if ($page instanceof \DOMDocument) {
@@ -939,13 +1206,13 @@ class Readability implements LoggerAwareInterface
// Set table as the main node if resulted data is table element.
$tagName = $topCandidate->tagName;
- if (strcasecmp($tagName, 'td') === 0 || strcasecmp($tagName, 'tr') === 0) {
+ if (0 === strcasecmp($tagName, 'td') || 0 === strcasecmp($tagName, 'tr')) {
$up = $topCandidate;
if ($up->parentNode instanceof \DOMElement) {
$up = $up->parentNode;
- if (strcasecmp($up->tagName, 'table') === 0) {
+ if (0 === strcasecmp($up->tagName, 'table')) {
$topCandidate = $up;
}
}
@@ -971,7 +1238,7 @@ class Readability implements LoggerAwareInterface
$siblingNode = $siblingNodes->item($s);
$siblingNodeName = $siblingNode->nodeName;
$append = false;
- $this->logger->debug('Looking at sibling node: ' . $siblingNode->getNodePath() . (($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : ''));
+ $this->logger->debug('Looking at sibling node: ' . $siblingNode->getNodePath() . ((XML_ELEMENT_NODE === $siblingNode->nodeType && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : ''));
if ($siblingNode->isSameNode($topCandidate)) {
$append = true;
@@ -980,21 +1247,21 @@ class Readability implements LoggerAwareInterface
$contentBonus = 0;
// Give a bonus if sibling nodes and top candidates have the same classname.
- if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->getAttribute('class') === $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') !== '') {
+ if (XML_ELEMENT_NODE === $siblingNode->nodeType && $siblingNode->getAttribute('class') === $topCandidate->getAttribute('class') && '' !== $topCandidate->getAttribute('class')) {
$contentBonus += ((int) $topCandidate->getAttribute('readability')) * 0.2;
}
- if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') && (((int) $siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold) {
+ if (XML_ELEMENT_NODE === $siblingNode->nodeType && $siblingNode->hasAttribute('readability') && (((int) $siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold) {
$append = true;
}
- if (strcasecmp($siblingNodeName, 'p') === 0) {
+ if (0 === strcasecmp($siblingNodeName, 'p')) {
$linkDensity = $this->getLinkDensity($siblingNode);
$nodeContent = $this->getInnerText($siblingNode, true, true);
$nodeLength = mb_strlen($nodeContent);
if (($nodeLength > self::MIN_NODE_LENGTH && $linkDensity < self::MAX_LINK_DENSITY)
- || ($nodeLength < self::MIN_NODE_LENGTH && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent))) {
+ || ($nodeLength < self::MIN_NODE_LENGTH && 0 === $linkDensity && preg_match('/\.( |$)/', $nodeContent))) {
$append = true;
}
}
@@ -1002,7 +1269,7 @@ class Readability implements LoggerAwareInterface
if ($append) {
$this->logger->debug('Appending node: ' . $siblingNode->getNodePath());
- if (strcasecmp($siblingNodeName, 'div') !== 0 && strcasecmp($siblingNodeName, 'p') !== 0) {
+ if (0 !== strcasecmp($siblingNodeName, 'div') && 0 !== strcasecmp($siblingNodeName, 'p')) {
// We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident.
$this->logger->debug('Altering siblingNode "' . $siblingNodeName . '" to "div".');
$nodeToAppend = $this->dom->createElement('div');
@@ -1065,107 +1332,7 @@ class Readability implements LoggerAwareInterface
return false;
}
- return $articleContent;
- }
-
- /**
- * Get the inner text of a node.
- * This also strips out any excess whitespace to be found.
- *
- * @param \DOMElement $e
- * @param bool $normalizeSpaces (default: true)
- * @param bool $flattenLines (default: false)
- *
- * @return string
- */
- public function getInnerText($e, $normalizeSpaces = true, $flattenLines = false)
- {
- if (null === $e || !isset($e->textContent) || $e->textContent === '') {
- return '';
- }
-
- $textContent = trim($e->textContent);
-
- if ($flattenLines) {
- $textContent = mb_ereg_replace('(?:[\r\n](?:\s| )*)+', '', $textContent);
- } elseif ($normalizeSpaces) {
- $textContent = mb_ereg_replace('\s\s+', ' ', $textContent);
- }
-
- return $textContent;
- }
-
- /**
- * Remove the style attribute on every $e and under.
- *
- * @param \DOMElement $e
- */
- public function cleanStyles($e)
- {
- if (!is_object($e)) {
- return;
- }
-
- $elems = $e->getElementsByTagName('*');
-
- foreach ($elems as $elem) {
- $elem->removeAttribute('style');
- }
- }
-
- /**
- * Get comma number for a given text.
- *
- * @param string $text
- *
- * @return int
- */
- public function getCommaCount($text)
- {
- return substr_count($text, ',');
- }
-
- /**
- * Get words number for a given text if words separated by a space.
- * Input string should be normalized.
- *
- * @param string $text
- *
- * @return int
- */
- public function getWordCount($text)
- {
- return substr_count($text, ' ');
- }
-
- /**
- * Get the density of links as a percentage of the content
- * This is the amount of text that is inside a link divided by the total text in the node.
- * Can exclude external references to differentiate between simple text and menus/infoblocks.
- *
- * @param \DOMElement $e
- * @param string $excludeExternal
- *
- * @return int
- */
- public function getLinkDensity($e, $excludeExternal = false)
- {
- $links = $e->getElementsByTagName('a');
- $textLength = mb_strlen($this->getInnerText($e, true, true));
- $linkLength = 0;
-
- for ($dRe = $this->domainRegExp, $i = 0, $il = $links->length; $i < $il; ++$i) {
- if ($excludeExternal && $dRe && !preg_match($dRe, $links->item($i)->getAttribute('href'))) {
- continue;
- }
- $linkLength += mb_strlen($this->getInnerText($links->item($i)));
- }
-
- if ($textLength > 0 && $linkLength > 0) {
- return $linkLength / $textLength;
- }
-
- return 0;
+ return $articleContent;
}
/**
@@ -1187,7 +1354,7 @@ class Readability implements LoggerAwareInterface
// $attributeValue = trim($element->getAttribute('class')." ".$element->getAttribute('id'));
$attributeValue = trim($element->getAttribute($attribute));
- if ($attributeValue !== '') {
+ if ('' !== $attributeValue) {
if (preg_match($this->regexps['negative'], $attributeValue)) {
$weight -= 25;
}
@@ -1206,250 +1373,83 @@ class Readability implements LoggerAwareInterface
}
/**
- * Get an element relative weight.
- *
- * @param \DOMElement $e
- *
- * @return int
+ * Will recreate previously deleted body property.
*/
- public function getWeight($e)
+ protected function reinitBody()
{
- if (!$this->flagIsActive(self::FLAG_WEIGHT_ATTRIBUTES)) {
- return 0;
+ if (!isset($this->body->childNodes)) {
+ $this->body = $this->dom->createElement('body');
+ $this->body->innerHTML = $this->bodyCache;
}
-
- $weight = 0;
- // Look for a special classname
- $weight += $this->weightAttribute($e, 'class');
- // Look for a special ID
- $weight += $this->weightAttribute($e, 'id');
-
- return $weight;
- }
-
- /**
- * Remove extraneous break tags from a node.
- *
- * @param \DOMElement $node
- */
- public function killBreaks($node)
- {
- $html = $node->innerHTML;
- $html = preg_replace($this->regexps['killBreaks'], '
', $html);
- $node->innerHTML = $html;
}
/**
- * Clean a node of all elements of type "tag".
- * (Unless it's a youtube/vimeo video. People love movies.).
- *
- * Updated 2012-09-18 to preserve youtube/vimeo iframes
+ * Load HTML in a DOMDocument.
+ * Apply Pre filters
+ * Cleanup HTML using Tidy (or not).
*
- * @param \DOMElement $e
- * @param string $tag
+ * @todo This should be called in init() instead of from __construct
*/
- public function clean($e, $tag)
+ private function loadHtml()
{
- $currentItem = null;
- $targetList = $e->getElementsByTagName($tag);
- $isEmbed = ($tag === 'audio' || $tag === 'video' || $tag === 'iframe' || $tag === 'object' || $tag === 'embed');
+ $this->original_html = $this->html;
- for ($y = $targetList->length - 1; $y >= 0; --$y) {
- // Allow youtube and vimeo videos through as people usually want to see those.
- $currentItem = $targetList->item($y);
+ $this->logger->debug('Parsing URL: ' . $this->url);
- if ($isEmbed) {
- $attributeValues = $currentItem->getAttribute('src') . ' ' . $currentItem->getAttribute('href');
+ if ($this->url) {
+ $this->domainRegExp = '/' . strtr(preg_replace('/www\d*\./', '', parse_url($this->url, PHP_URL_HOST)), ['.' => '\.']) . '/';
+ }
- // First, check the elements attributes to see if any of them contain known media hosts
- if (preg_match($this->regexps['media'], $attributeValues)) {
- continue;
- }
+ mb_internal_encoding('UTF-8');
+ mb_http_output('UTF-8');
+ mb_regex_encoding('UTF-8');
- // Then check the elements inside this element for the same.
- if (preg_match($this->regexps['media'], $targetList->item($y)->innerHTML)) {
- continue;
- }
+ // HACK: dirty cleanup to replace some stuff; shouldn't use regexps with HTML but well...
+ if (!$this->flagIsActive(self::FLAG_DISABLE_PREFILTER)) {
+ foreach ($this->pre_filters as $search => $replace) {
+ $this->html = preg_replace($search, $replace, $this->html);
}
-
- $currentItem->parentNode->removeChild($currentItem);
+ unset($search, $replace);
}
- }
- /**
- * Clean an element of all tags of type "tag" if they look fishy.
- * "Fishy" is an algorithm based on content length, classnames,
- * link density, number of images & embeds, etc.
- *
- * @param \DOMElement $e
- * @param string $tag
- */
- public function cleanConditionally($e, $tag)
- {
- if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
- return;
+ if ('' === trim($this->html)) {
+ $this->html = '';
}
- $tagsList = $e->getElementsByTagName($tag);
- $curTagsLength = $tagsList->length;
- $node = null;
-
/*
- * Gather counts for other typical elements embedded within.
- * Traverse backwards so we can remove nodes at the same time without effecting the traversal.
- *
- * TODO: Consider taking into account original contentScore here.
+ * Use tidy (if it exists).
+ * This fixes problems with some sites which would otherwise trouble DOMDocument's HTML parsing.
+ * Although sometimes it makes matters worse, which is why there is an option to disable it.
*/
- for ($i = $curTagsLength - 1; $i >= 0; --$i) {
- $node = $tagsList->item($i);
- $weight = $this->getWeight($node);
- $contentScore = ($node->hasAttribute('readability')) ? (int) $node->getAttribute('readability') : 0;
- $this->logger->debug('Start conditional cleaning of ' . $node->getNodePath() . ' (class=' . $node->getAttribute('class') . '; id=' . $node->getAttribute('id') . ')' . (($node->hasAttribute('readability')) ? (' with score ' . $node->getAttribute('readability')) : ''));
-
- if ($weight + $contentScore < 0) {
- $this->logger->debug('Removing...');
- $node->parentNode->removeChild($node);
- } elseif ($this->getCommaCount($this->getInnerText($node)) < self::MIN_COMMAS_IN_PARAGRAPH) {
- /*
- * If there are not very many commas, and the number of
- * non-paragraph elements is more than paragraphs or other ominous signs, remove the element.
- */
- $p = $node->getElementsByTagName('p')->length;
- $img = $node->getElementsByTagName('img')->length;
- $li = $node->getElementsByTagName('li')->length - 100;
- $input = $node->getElementsByTagName('input')->length;
- $a = $node->getElementsByTagName('a')->length;
- $embedCount = 0;
- $embeds = $node->getElementsByTagName('embed');
-
- for ($ei = 0, $il = $embeds->length; $ei < $il; ++$ei) {
- if (preg_match($this->regexps['media'], $embeds->item($ei)->getAttribute('src'))) {
- ++$embedCount;
- }
- }
-
- $embeds = $node->getElementsByTagName('iframe');
- for ($ei = 0, $il = $embeds->length; $ei < $il; ++$ei) {
- if (preg_match($this->regexps['media'], $embeds->item($ei)->getAttribute('src'))) {
- ++$embedCount;
- }
- }
-
- $linkDensity = $this->getLinkDensity($node, true);
- $contentLength = mb_strlen($this->getInnerText($node));
- $toRemove = false;
-
- if ($this->lightClean) {
- if ($li > $p && $tag !== 'ul' && $tag !== 'ol') {
- $this->logger->debug(' too many or
');
- $toRemove = true;
- } elseif ($input > floor($p / 3)) {
- $this->logger->debug(' too many elements');
- $toRemove = true;
- } elseif ($contentLength < 6 && ($embedCount === 0 && ($img === 0 || $img > 2))) {
- $this->logger->debug(' content length less than 6 chars, 0 embeds and either 0 images or more than 2 images');
- $toRemove = true;
- } elseif ($weight < 25 && $linkDensity > 0.25) {
- $this->logger->debug(' weight is ' . $weight . ' < 25 and link density is ' . sprintf('%.2f', $linkDensity) . ' > 0.25');
- $toRemove = true;
- } elseif ($a > 2 && ($weight >= 25 && $linkDensity > 0.5)) {
- $this->logger->debug(' more than 2 links and weight is ' . $weight . ' > 25 but link density is ' . sprintf('%.2f', $linkDensity) . ' > 0.5');
- $toRemove = true;
- } elseif ($embedCount > 3) {
- $this->logger->debug(' more than 3 embeds');
- $toRemove = true;
- }
- } else {
- if ($img > $p) {
- $this->logger->debug(' more image elements than paragraph elements');
- $toRemove = true;
- } elseif ($li > $p && $tag !== 'ul' && $tag !== 'ol') {
- $this->logger->debug(' too many
or
');
- $toRemove = true;
- } elseif ($input > floor($p / 3)) {
- $this->logger->debug(' too many elements');
- $toRemove = true;
- } elseif ($contentLength < 10 && ($img === 0 || $img > 2)) {
- $this->logger->debug(' content length less than 10 chars and 0 images, or more than 2 images');
- $toRemove = true;
- } elseif ($weight < 25 && $linkDensity > 0.2) {
- $this->logger->debug(' weight is ' . $weight . ' lower than 0 and link density is ' . sprintf('%.2f', $linkDensity) . ' > 0.2');
- $toRemove = true;
- } elseif ($weight >= 25 && $linkDensity > 0.5) {
- $this->logger->debug(' weight above 25 but link density is ' . sprintf('%.2f', $linkDensity) . ' > 0.5');
- $toRemove = true;
- } elseif (($embedCount === 1 && $contentLength < 75) || $embedCount > 1) {
- $this->logger->debug(' 1 embed and content length smaller than 75 chars, or more than one embed');
- $toRemove = true;
- }
- }
+ if ($this->useTidy) {
+ $this->logger->debug('Tidying document');
- if ($toRemove) {
- $this->logger->debug('Removing...');
- $node->parentNode->removeChild($node);
- }
+ $tidy = tidy_repair_string($this->html, $this->tidy_config, 'UTF8');
+ if (false !== $tidy && $this->html !== $tidy) {
+ $this->tidied = true;
+ $this->html = $tidy;
+ $this->html = preg_replace('/[\r\n]+/is', "\n", $this->html);
}
+ unset($tidy);
}
- }
-
- /**
- * Clean out spurious headers from an Element. Checks things like classnames and link density.
- *
- * @param \DOMElement $e
- */
- public function cleanHeaders($e)
- {
- for ($headerIndex = 1; $headerIndex < 3; ++$headerIndex) {
- $headers = $e->getElementsByTagName('h' . $headerIndex);
- for ($i = $headers->length - 1; $i >= 0; --$i) {
- if ($this->getWeight($headers->item($i)) < 0 || $this->getLinkDensity($headers->item($i)) > 0.33) {
- $headers->item($i)->parentNode->removeChild($headers->item($i));
- }
- }
- }
- }
+ $this->html = mb_convert_encoding($this->html, 'HTML-ENTITIES', 'UTF-8');
- /**
- * Check if the given flag is active.
- *
- * @param int $flag
- *
- * @return bool
- */
- public function flagIsActive($flag)
- {
- return ($this->flags & $flag) > 0;
- }
+ if (!('html5lib' === $this->parser && ($this->dom = Parser::parse($this->html)))) {
+ libxml_use_internal_errors(true);
- /**
- * Add a flag.
- *
- * @param int $flag
- */
- public function addFlag($flag)
- {
- $this->flags = $this->flags | $flag;
- }
+ $this->dom = new \DOMDocument();
+ $this->dom->preserveWhiteSpace = false;
- /**
- * Remove a flag.
- *
- * @param int $flag
- */
- public function removeFlag($flag)
- {
- $this->flags = $this->flags & ~$flag;
- }
+ if (\PHP_VERSION_ID >= 50400) {
+ $this->dom->loadHTML($this->html, LIBXML_NOBLANKS | LIBXML_COMPACT | LIBXML_NOERROR);
+ } else {
+ $this->dom->loadHTML($this->html);
+ }
- /**
- * Will recreate previously deleted body property.
- */
- protected function reinitBody()
- {
- if (!isset($this->body->childNodes)) {
- $this->body = $this->dom->createElement('body');
- $this->body->innerHTML = $this->bodyCache;
+ libxml_use_internal_errors(false);
}
+
+ $this->dom->registerNodeClass('DOMElement', 'Readability\JSLikeHTMLElement');
}
}
diff --git a/tests/ReadabilityTest.php b/tests/ReadabilityTest.php
index be03bd8..5a3efdd 100644
--- a/tests/ReadabilityTest.php
+++ b/tests/ReadabilityTest.php
@@ -11,17 +11,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public $logHandler;
public $logger;
- private function getReadability($html, $url = null, $parser = 'libxml', $useTidy = true)
- {
- $readability = new Readability($html, $url, $parser, $useTidy);
-
- $this->logHandler = new TestHandler();
- $this->logger = new Logger('test', array($this->logHandler));
- $readability->setLogger($this->logger);
-
- return $readability;
- }
-
/**
* @requires extension tidy
*/
@@ -345,7 +334,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
{
error_reporting(E_ALL | E_STRICT);
ini_set('display_errors', true);
- set_error_handler(array($this, 'error2Exception'), E_ALL | E_STRICT);
+ set_error_handler([$this, 'error2Exception'], E_ALL | E_STRICT);
$data = '
@@ -493,4 +482,15 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertContains('2', $readability->getContent()->innerHTML);
$this->assertContains('getContent()->innerHTML);
}
+
+ private function getReadability($html, $url = null, $parser = 'libxml', $useTidy = true)
+ {
+ $readability = new Readability($html, $url, $parser, $useTidy);
+
+ $this->logHandler = new TestHandler();
+ $this->logger = new Logger('test', [$this->logHandler]);
+ $readability->setLogger($this->logger);
+
+ return $readability;
+ }
}