|
|
|
@ -284,7 +284,7 @@ class Readability implements LoggerAwareInterface |
|
|
|
|
|
|
|
|
|
|
|
$this->html = mb_convert_encoding($this->html, 'HTML-ENTITIES', 'UTF-8'); |
|
|
|
$this->html = mb_convert_encoding($this->html, 'HTML-ENTITIES', 'UTF-8'); |
|
|
|
|
|
|
|
|
|
|
|
if (!($this->parser == 'html5lib' && ($this->dom = \HTML5_Parser::parse($this->html)))) { |
|
|
|
if (!($this->parser === 'html5lib' && ($this->dom = \HTML5_Parser::parse($this->html)))) { |
|
|
|
libxml_use_internal_errors(true); |
|
|
|
libxml_use_internal_errors(true); |
|
|
|
|
|
|
|
|
|
|
|
$this->dom = new \DOMDocument(); |
|
|
|
$this->dom = new \DOMDocument(); |
|
|
|
@ -325,7 +325,7 @@ class Readability implements LoggerAwareInterface |
|
|
|
$bodyElems = $this->dom->getElementsByTagName('body'); |
|
|
|
$bodyElems = $this->dom->getElementsByTagName('body'); |
|
|
|
|
|
|
|
|
|
|
|
// WTF multiple body nodes? |
|
|
|
// WTF multiple body nodes? |
|
|
|
if ($this->bodyCache == null) { |
|
|
|
if ($this->bodyCache === null) { |
|
|
|
$this->bodyCache = ''; |
|
|
|
$this->bodyCache = ''; |
|
|
|
foreach ($bodyElems as $bodyNode) { |
|
|
|
foreach ($bodyElems as $bodyNode) { |
|
|
|
$this->bodyCache .= trim($bodyNode->innerHTML); |
|
|
|
$this->bodyCache .= trim($bodyNode->innerHTML); |
|
|
|
@ -429,7 +429,7 @@ class Readability implements LoggerAwareInterface |
|
|
|
} |
|
|
|
} |
|
|
|
} elseif (mb_strlen($curTitle) > 150 || mb_strlen($curTitle) < 15) { |
|
|
|
} elseif (mb_strlen($curTitle) > 150 || mb_strlen($curTitle) < 15) { |
|
|
|
$hOnes = $this->dom->getElementsByTagName('h1'); |
|
|
|
$hOnes = $this->dom->getElementsByTagName('h1'); |
|
|
|
if ($hOnes->length == 1) { |
|
|
|
if ($hOnes->length === 1) { |
|
|
|
$curTitle = $this->getInnerText($hOnes->item(0)); |
|
|
|
$curTitle = $this->getInnerText($hOnes->item(0)); |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
@ -524,7 +524,7 @@ class Readability implements LoggerAwareInterface |
|
|
|
$articleLink->setAttribute('style', 'color: inherit; text-decoration: none;'); |
|
|
|
$articleLink->setAttribute('style', 'color: inherit; text-decoration: none;'); |
|
|
|
$articleLink->setAttribute('name', 'readabilityLink-' . $linkCount); |
|
|
|
$articleLink->setAttribute('name', 'readabilityLink-' . $linkCount); |
|
|
|
$footnote->innerHTML = '<small><sup><a href="#readabilityLink-' . $linkCount . '" title="Jump to Link in Article">^</a></sup></small> '; |
|
|
|
$footnote->innerHTML = '<small><sup><a href="#readabilityLink-' . $linkCount . '" title="Jump to Link in Article">^</a></sup></small> '; |
|
|
|
$footnoteLink->innerHTML = ($footnoteLink->getAttribute('title') != '' ? $footnoteLink->getAttribute('title') : $linkText); |
|
|
|
$footnoteLink->innerHTML = ($footnoteLink->getAttribute('title') !== '' ? $footnoteLink->getAttribute('title') : $linkText); |
|
|
|
$footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount); |
|
|
|
$footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount); |
|
|
|
$footnote->appendChild($footnoteLink); |
|
|
|
$footnote->appendChild($footnoteLink); |
|
|
|
|
|
|
|
|
|
|
|
@ -586,7 +586,7 @@ class Readability implements LoggerAwareInterface |
|
|
|
* already have a header. |
|
|
|
* already have a header. |
|
|
|
*/ |
|
|
|
*/ |
|
|
|
$h2s = $articleContent->getElementsByTagName('h2'); |
|
|
|
$h2s = $articleContent->getElementsByTagName('h2'); |
|
|
|
if ($h2s->length == 1 && mb_strlen($this->getInnerText($h2s->item(0), true, true)) < 100) { |
|
|
|
if ($h2s->length === 1 && mb_strlen($this->getInnerText($h2s->item(0), true, true)) < 100) { |
|
|
|
$this->clean($articleContent, 'h2'); |
|
|
|
$this->clean($articleContent, 'h2'); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
@ -971,7 +971,7 @@ class Readability implements LoggerAwareInterface |
|
|
|
$contentBonus = 0; |
|
|
|
$contentBonus = 0; |
|
|
|
|
|
|
|
|
|
|
|
// Give a bonus if sibling nodes and top candidates have the same classname. |
|
|
|
// Give a bonus if sibling nodes and top candidates have the same classname. |
|
|
|
if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->getAttribute('class') == $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') != '') { |
|
|
|
if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->getAttribute('class') === $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') !== '') { |
|
|
|
$contentBonus += ((int) $topCandidate->getAttribute('readability')) * 0.2; |
|
|
|
$contentBonus += ((int) $topCandidate->getAttribute('readability')) * 0.2; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
@ -1178,7 +1178,7 @@ class Readability implements LoggerAwareInterface |
|
|
|
// $attributeValue = trim($element->getAttribute('class')." ".$element->getAttribute('id')); |
|
|
|
// $attributeValue = trim($element->getAttribute('class')." ".$element->getAttribute('id')); |
|
|
|
$attributeValue = trim($element->getAttribute($attribute)); |
|
|
|
$attributeValue = trim($element->getAttribute($attribute)); |
|
|
|
|
|
|
|
|
|
|
|
if ($attributeValue != '') { |
|
|
|
if ($attributeValue !== '') { |
|
|
|
if (preg_match($this->regexps['negative'], $attributeValue)) { |
|
|
|
if (preg_match($this->regexps['negative'], $attributeValue)) { |
|
|
|
$weight -= 25; |
|
|
|
$weight -= 25; |
|
|
|
} |
|
|
|
} |
|
|
|
@ -1331,7 +1331,7 @@ class Readability implements LoggerAwareInterface |
|
|
|
$toRemove = false; |
|
|
|
$toRemove = false; |
|
|
|
|
|
|
|
|
|
|
|
if ($this->lightClean) { |
|
|
|
if ($this->lightClean) { |
|
|
|
if ($li > $p && $tag != 'ul' && $tag != 'ol') { |
|
|
|
if ($li > $p && $tag !== 'ul' && $tag !== 'ol') { |
|
|
|
$this->logger->debug(' too many <li> elements, and parent is not <ul> or <ol>'); |
|
|
|
$this->logger->debug(' too many <li> elements, and parent is not <ul> or <ol>'); |
|
|
|
$toRemove = true; |
|
|
|
$toRemove = true; |
|
|
|
} elseif ($input > floor($p / 3)) { |
|
|
|
} elseif ($input > floor($p / 3)) { |
|
|
|
@ -1354,7 +1354,7 @@ class Readability implements LoggerAwareInterface |
|
|
|
if ($img > $p) { |
|
|
|
if ($img > $p) { |
|
|
|
$this->logger->debug(' more image elements than paragraph elements'); |
|
|
|
$this->logger->debug(' more image elements than paragraph elements'); |
|
|
|
$toRemove = true; |
|
|
|
$toRemove = true; |
|
|
|
} elseif ($li > $p && $tag != 'ul' && $tag != 'ol') { |
|
|
|
} elseif ($li > $p && $tag !== 'ul' && $tag !== 'ol') { |
|
|
|
$this->logger->debug(' too many <li> elements, and parent is not <ul> or <ol>'); |
|
|
|
$this->logger->debug(' too many <li> elements, and parent is not <ul> or <ol>'); |
|
|
|
$toRemove = true; |
|
|
|
$toRemove = true; |
|
|
|
} elseif ($input > floor($p / 3)) { |
|
|
|
} elseif ($input > floor($p / 3)) { |
|
|
|
@ -1369,7 +1369,7 @@ class Readability implements LoggerAwareInterface |
|
|
|
} elseif ($weight >= 25 && $linkDensity > 0.5) { |
|
|
|
} elseif ($weight >= 25 && $linkDensity > 0.5) { |
|
|
|
$this->logger->debug(' weight above 25 but link density is ' . sprintf('%.2f', $linkDensity) . ' > 0.5'); |
|
|
|
$this->logger->debug(' weight above 25 but link density is ' . sprintf('%.2f', $linkDensity) . ' > 0.5'); |
|
|
|
$toRemove = true; |
|
|
|
$toRemove = true; |
|
|
|
} elseif (($embedCount == 1 && $contentLength < 75) || $embedCount > 1) { |
|
|
|
} elseif (($embedCount === 1 && $contentLength < 75) || $embedCount > 1) { |
|
|
|
$this->logger->debug(' 1 embed and content length smaller than 75 chars, or more than one embed'); |
|
|
|
$this->logger->debug(' 1 embed and content length smaller than 75 chars, or more than one embed'); |
|
|
|
$toRemove = true; |
|
|
|
$toRemove = true; |
|
|
|
} |
|
|
|
} |
|
|
|
|