@ -2,7 +2,6 @@
namespace Readability;
namespace Readability;
use DOMElement;
use Masterminds\HTML5;
use Masterminds\HTML5;
use Psr\Log\LoggerAwareInterface;
use Psr\Log\LoggerAwareInterface;
use Psr\Log\LoggerInterface;
use Psr\Log\LoggerInterface;
@ -115,7 +114,7 @@ class Readability implements LoggerAwareInterface
// HACK: replace linebreaks plus br's with p's
// HACK: replace linebreaks plus br's with p's
'!(< br [ ^ > ]*>[ \r\n\s]*){2,}!i' => '< / p > < p > ',
'!(< br [ ^ > ]*>[ \r\n\s]*){2,}!i' => '< / p > < p > ',
// replace noscripts
// replace noscripts
//'!< /?noscript>!is' => '',
// '!< /?noscript>!is' => '',
// replace fonts to spans
// replace fonts to spans
'!< (/?)font[^>]*>!is' => '< \\1span>',
'!< (/?)font[^>]*>!is' => '< \\1span>',
];
];
@ -126,8 +125,8 @@ class Readability implements LoggerAwareInterface
// replace empty tags that break layouts
// replace empty tags that break layouts
'!< (?:a|div|p|figure)[^>]+/>!is' => '',
'!< (?:a|div|p|figure)[^>]+/>!is' => '',
// remove all attributes on text tags
// remove all attributes on text tags
//'!< (\s*/?\s*(?:blockquote|br|hr|code|div|article|span|footer|aside|p|pre|dl|li|ul|ol)) [^>]+>!is' => "< \\1>",
// '!< (\s*/?\s*(?:blockquote|br|hr|code|div|article|span|footer|aside|p|pre|dl|li|ul|ol)) [^>]+>!is' => "< \\1>",
//single newlines cleanup
// single newlines cleanup
"/\n+/" => "\n",
"/\n+/" => "\n",
// modern web...
// modern web...
'!< pre [ ^ > ]*>\s*< code ! is ' = > '< pre ' ,
'!< pre [ ^ > ]*>\s*< code ! is ' = > '< pre ' ,
@ -161,7 +160,7 @@ class Readability implements LoggerAwareInterface
/**
/**
* Get article title element.
* Get article title element.
*
*
* @return DOMElement
* @return \ DOMElement
*/
*/
public function getTitle()
public function getTitle()
{
{
@ -171,7 +170,7 @@ class Readability implements LoggerAwareInterface
/**
/**
* Get article content element.
* Get article content element.
*
*
* @return DOMElement
* @return \ DOMElement
*/
*/
public function getContent()
public function getContent()
{
{
@ -280,7 +279,7 @@ class Readability implements LoggerAwareInterface
/**
/**
* Run any post-process modifications to article content as necessary.
* Run any post-process modifications to article content as necessary.
*/
*/
public function postProcessContent(DOMElement $articleContent): void
public function postProcessContent(\ DOMElement $articleContent): void
{
{
if ($this->convertLinksToFootnotes & & !preg_match('/\bwiki/', $this->url)) {
if ($this->convertLinksToFootnotes & & !preg_match('/\bwiki/', $this->url)) {
$this->addFootnotes($articleContent);
$this->addFootnotes($articleContent);
@ -292,7 +291,7 @@ class Readability implements LoggerAwareInterface
*
*
* @see http://www.roughtype.com/archives/2010/05/experiments_in.php
* @see http://www.roughtype.com/archives/2010/05/experiments_in.php
*/
*/
public function addFootnotes(DOMElement $articleContent): void
public function addFootnotes(\ DOMElement $articleContent): void
{
{
$footnotesWrapper = $this->dom->createElement('footer');
$footnotesWrapper = $this->dom->createElement('footer');
$footnotesWrapper->setAttribute('class', 'readability-footnotes');
$footnotesWrapper->setAttribute('class', 'readability-footnotes');
@ -335,7 +334,7 @@ class Readability implements LoggerAwareInterface
$articleLink->setAttribute('style', 'color: inherit; text-decoration: none;');
$articleLink->setAttribute('style', 'color: inherit; text-decoration: none;');
$articleLink->setAttribute('name', 'readabilityLink-' . $linkCount);
$articleLink->setAttribute('name', 'readabilityLink-' . $linkCount);
$footnote->setInnerHtml('< small > < sup > < a href = "#readabilityLink-' . $linkCount . '" title = "Jump to Link in Article" > ^< / a > < / sup > < / small > ');
$footnote->setInnerHtml('< small > < sup > < a href = "#readabilityLink-' . $linkCount . '" title = "Jump to Link in Article" > ^< / a > < / sup > < / small > ');
$footnoteLink->setInnerHtml(( '' !== $footnoteLink->getAttribute('title') ? $footnoteLink->getAttribute('title') : $linkText) );
$footnoteLink->setInnerHtml('' !== $footnoteLink->getAttribute('title') ? $footnoteLink->getAttribute('title') : $linkText);
$footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount);
$footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount);
$footnote->appendChild($footnoteLink);
$footnote->appendChild($footnoteLink);
@ -356,7 +355,7 @@ class Readability implements LoggerAwareInterface
*/
*/
public function prepArticle(\DOMNode $articleContent): void
public function prepArticle(\DOMNode $articleContent): void
{
{
if (!$articleContent instanceof DOMElement) {
if (!$articleContent instanceof \ DOMElement) {
return;
return;
}
}
@ -456,7 +455,7 @@ class Readability implements LoggerAwareInterface
* Get the inner text of a node.
* Get the inner text of a node.
* This also strips out any excess whitespace to be found.
* This also strips out any excess whitespace to be found.
*
*
* @param DOMElement $e
* @param \ DOMElement $e
* @param bool $normalizeSpaces (default: true)
* @param bool $normalizeSpaces (default: true)
* @param bool $flattenLines (default: false)
* @param bool $flattenLines (default: false)
*/
*/
@ -482,7 +481,7 @@ class Readability implements LoggerAwareInterface
/**
/**
* Remove the style attribute on every $e and under.
* Remove the style attribute on every $e and under.
*/
*/
public function cleanStyles(DOMElement $e): void
public function cleanStyles(\ DOMElement $e): void
{
{
if (\is_object($e)) {
if (\is_object($e)) {
$elems = $e->getElementsByTagName('*');
$elems = $e->getElementsByTagName('*');
@ -515,7 +514,7 @@ class Readability implements LoggerAwareInterface
* This is the amount of text that is inside a link divided by the total text in the node.
* This is the amount of text that is inside a link divided by the total text in the node.
* Can exclude external references to differentiate between simple text and menus/infoblocks.
* Can exclude external references to differentiate between simple text and menus/infoblocks.
*/
*/
public function getLinkDensity(DOMElement $e, bool $excludeExternal = false): float
public function getLinkDensity(\ DOMElement $e, bool $excludeExternal = false): float
{
{
$links = $e->getElementsByTagName('a');
$links = $e->getElementsByTagName('a');
$textLength = mb_strlen($this->getInnerText($e, true, true));
$textLength = mb_strlen($this->getInnerText($e, true, true));
@ -538,7 +537,7 @@ class Readability implements LoggerAwareInterface
/**
/**
* Get an element relative weight.
* Get an element relative weight.
*/
*/
public function getWeight(DOMElement $e): int
public function getWeight(\ DOMElement $e): int
{
{
if (!$this->flagIsActive(self::FLAG_WEIGHT_ATTRIBUTES)) {
if (!$this->flagIsActive(self::FLAG_WEIGHT_ATTRIBUTES)) {
return 0;
return 0;
@ -556,7 +555,7 @@ class Readability implements LoggerAwareInterface
/**
/**
* Remove extraneous break tags from a node.
* Remove extraneous break tags from a node.
*/
*/
public function killBreaks(DOMElement $node): void
public function killBreaks(\ DOMElement $node): void
{
{
$html = $node->getInnerHTML();
$html = $node->getInnerHTML();
$html = preg_replace($this->regexps['killBreaks'], '< br / > ', $html);
$html = preg_replace($this->regexps['killBreaks'], '< br / > ', $html);
@ -569,7 +568,7 @@ class Readability implements LoggerAwareInterface
*
*
* Updated 2012-09-18 to preserve youtube/vimeo iframes
* Updated 2012-09-18 to preserve youtube/vimeo iframes
*/
*/
public function clean(DOMElement $e, string $tag): void
public function clean(\ DOMElement $e, string $tag): void
{
{
$targetList = $e->getElementsByTagName($tag);
$targetList = $e->getElementsByTagName($tag);
$isEmbed = ('audio' === $tag || 'video' === $tag || 'iframe' === $tag || 'object' === $tag || 'embed' === $tag);
$isEmbed = ('audio' === $tag || 'video' === $tag || 'iframe' === $tag || 'object' === $tag || 'embed' === $tag);
@ -601,7 +600,7 @@ class Readability implements LoggerAwareInterface
* "Fishy" is an algorithm based on content length, classnames,
* "Fishy" is an algorithm based on content length, classnames,
* link density, number of images & embeds, etc.
* link density, number of images & embeds, etc.
*/
*/
public function cleanConditionally(DOMElement $e, string $tag): void
public function cleanConditionally(\ DOMElement $e, string $tag): void
{
{
if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
return;
return;
@ -714,7 +713,7 @@ class Readability implements LoggerAwareInterface
/**
/**
* Clean out spurious headers from an Element. Checks things like classnames and link density.
* Clean out spurious headers from an Element. Checks things like classnames and link density.
*/
*/
public function cleanHeaders(DOMElement $e): void
public function cleanHeaders(\ DOMElement $e): void
{
{
for ($headerIndex = 1; $headerIndex < 3 ; + + $ headerIndex ) {
for ($headerIndex = 1; $headerIndex < 3 ; + + $ headerIndex ) {
$headers = $e->getElementsByTagName('h' . $headerIndex);
$headers = $e->getElementsByTagName('h' . $headerIndex);
@ -754,7 +753,7 @@ class Readability implements LoggerAwareInterface
/**
/**
* Get the article title as an H1.
* Get the article title as an H1.
*
*
* @return DOMElement
* @return \ DOMElement
*/
*/
protected function getArticleTitle()
protected function getArticleTitle()
{
{
@ -826,7 +825,7 @@ class Readability implements LoggerAwareInterface
* Initialize a node with the readability object. Also checks the
* Initialize a node with the readability object. Also checks the
* className/id for special names to add to its score.
* className/id for special names to add to its score.
*/
*/
protected function initializeNode(DOMElement $node): void
protected function initializeNode(\ DOMElement $node): void
{
{
if (!isset($node->tagName)) {
if (!isset($node->tagName)) {
return;
return;
@ -894,11 +893,11 @@ class Readability implements LoggerAwareInterface
* Using a variety of metrics (content score, classname, element types), find the content that is
* Using a variety of metrics (content score, classname, element types), find the content that is
* most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
* most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
*
*
* @param DOMElement $page
* @param \ DOMElement $page
*
*
* @return DOMElement|false
* @return \ DOMElement|false
*/
*/
protected function grabArticle(DOMElement $page = null)
protected function grabArticle(\ DOMElement $page = null)
{
{
if (!$page) {
if (!$page) {
$page = $this->dom;
$page = $this->dom;
@ -1040,7 +1039,7 @@ class Readability implements LoggerAwareInterface
// For every SCORE_CHARS_IN_PARAGRAPH (default:100) characters in this paragraph, add another point. Up to 3 points.
// For every SCORE_CHARS_IN_PARAGRAPH (default:100) characters in this paragraph, add another point. Up to 3 points.
$contentScore += min(floor(mb_strlen($innerText) / self::SCORE_CHARS_IN_PARAGRAPH), 3);
$contentScore += min(floor(mb_strlen($innerText) / self::SCORE_CHARS_IN_PARAGRAPH), 3);
// For every SCORE_WORDS_IN_PARAGRAPH (default:20) words in this paragraph, add another point. Up to 3 points.
// For every SCORE_WORDS_IN_PARAGRAPH (default:20) words in this paragraph, add another point. Up to 3 points.
//$contentScore += min(floor($this->getWordCount($innerText) / self::SCORE_WORDS_IN_PARAGRAPH), 3);
// $contentScore += min(floor($this->getWordCount($innerText) / self::SCORE_WORDS_IN_PARAGRAPH), 3);
foreach ($ancestors as $level => $ancestor) {
foreach ($ancestors as $level => $ancestor) {
if (!$ancestor->nodeName || !$ancestor->parentNode) {
if (!$ancestor->nodeName || !$ancestor->parentNode) {
@ -1211,7 +1210,7 @@ class Readability implements LoggerAwareInterface
if (0 === strcasecmp($tagName, 'td') || 0 === strcasecmp($tagName, 'tr')) {
if (0 === strcasecmp($tagName, 'td') || 0 === strcasecmp($tagName, 'tr')) {
$up = $topCandidate;
$up = $topCandidate;
if ($up->parentNode instanceof DOMElement) {
if ($up->parentNode instanceof \ DOMElement) {
$up = $up->parentNode;
$up = $up->parentNode;
if (0 === strcasecmp($up->tagName, 'table')) {
if (0 === strcasecmp($up->tagName, 'table')) {
@ -1292,8 +1291,8 @@ class Readability implements LoggerAwareInterface
// To ensure a node does not interfere with readability styles, remove its classnames & ids.
// To ensure a node does not interfere with readability styles, remove its classnames & ids.
// Now done via RegExp post_filter.
// Now done via RegExp post_filter.
//$nodeToAppend->removeAttribute('class');
// $nodeToAppend->removeAttribute('class');
//$nodeToAppend->removeAttribute('id');
// $nodeToAppend->removeAttribute('id');
// Append sibling and subtract from our list as appending removes a node.
// Append sibling and subtract from our list as appending removes a node.
$articleContent->appendChild($nodeToAppend);
$articleContent->appendChild($nodeToAppend);
}
}
@ -1340,7 +1339,7 @@ class Readability implements LoggerAwareInterface
* Get an element weight by attribute.
* Get an element weight by attribute.
* Uses regular expressions to tell if this element looks good or bad.
* Uses regular expressions to tell if this element looks good or bad.
*/
*/
protected function weightAttribute(DOMElement $element, string $attribute): int
protected function weightAttribute(\ DOMElement $element, string $attribute): int
{
{
if (!$element->hasAttribute($attribute)) {
if (!$element->hasAttribute($attribute)) {
return 0;
return 0;
@ -1443,14 +1442,14 @@ class Readability implements LoggerAwareInterface
libxml_use_internal_errors(false);
libxml_use_internal_errors(false);
}
}
$this->dom->registerNodeClass(DOMElement::class, \Readability\JSLikeHTMLElement::class);
$this->dom->registerNodeClass(\ DOMElement::class, \Readability\JSLikeHTMLElement::class);
}
}
private function getAncestors(DOMElement $node, int $maxDepth = 0): array
private function getAncestors(\ DOMElement $node, int $maxDepth = 0): array
{
{
$ancestors = [];
$ancestors = [];
$i = 0;
$i = 0;
while ($node->parentNode instanceof DOMElement) {
while ($node->parentNode instanceof \ DOMElement) {
$ancestors[] = $node->parentNode;
$ancestors[] = $node->parentNode;
if (++$i === $maxDepth) {
if (++$i === $maxDepth) {
break;
break;
@ -1470,7 +1469,7 @@ class Readability implements LoggerAwareInterface
}, iterator_to_array($node->childNodes)), true));
}, iterator_to_array($node->childNodes)), true));
}
}
private function hasSingleTagInsideElement(DOMElement $node, string $tag): bool
private function hasSingleTagInsideElement(\ DOMElement $node, string $tag): bool
{
{
if (1 !== $node->childNodes->length || $node->childNodes->item(0)->nodeName !== $tag) {
if (1 !== $node->childNodes->length || $node->childNodes->item(0)->nodeName !== $tag) {
return false;
return false;
@ -1490,7 +1489,7 @@ class Readability implements LoggerAwareInterface
* Tidy must be configured to not clean the input for this function to
* Tidy must be configured to not clean the input for this function to
* work as expected, see $this->tidy_config['clean']
* work as expected, see $this->tidy_config['clean']
*/
*/
private function isNodeVisible(DOMElement $node): bool
private function isNodeVisible(\ DOMElement $node): bool
{
{
return !($node->hasAttribute('style')
return !($node->hasAttribute('style')
& & preg_match($this->regexps['isNotVisible'], $node->getAttribute('style'))
& & preg_match($this->regexps['isNotVisible'], $node->getAttribute('style'))