|
|
|
@ -366,7 +366,7 @@ class Readability implements LoggerAwareInterface |
|
|
|
$articleLink->setAttribute('style', 'color: inherit; text-decoration: none;'); |
|
|
|
$articleLink->setAttribute('style', 'color: inherit; text-decoration: none;'); |
|
|
|
$articleLink->setAttribute('name', 'readabilityLink-' . $linkCount); |
|
|
|
$articleLink->setAttribute('name', 'readabilityLink-' . $linkCount); |
|
|
|
$footnote->setInnerHtml('<small><sup><a href="#readabilityLink-' . $linkCount . '" title="Jump to Link in Article">^</a></sup></small> '); |
|
|
|
$footnote->setInnerHtml('<small><sup><a href="#readabilityLink-' . $linkCount . '" title="Jump to Link in Article">^</a></sup></small> '); |
|
|
|
$footnoteLink->setInnerHtml(('' !== $footnoteLink->getAttribute('title') ? $footnoteLink->getAttribute('title') : $linkText)); |
|
|
|
$footnoteLink->setInnerHtml('' !== $footnoteLink->getAttribute('title') ? $footnoteLink->getAttribute('title') : $linkText); |
|
|
|
$footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount); |
|
|
|
$footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount); |
|
|
|
$footnote->appendChild($footnoteLink); |
|
|
|
$footnote->appendChild($footnoteLink); |
|
|
|
|
|
|
|
|
|
|
|
@ -796,7 +796,7 @@ class Readability implements LoggerAwareInterface |
|
|
|
*/ |
|
|
|
*/ |
|
|
|
public function addFlag($flag) |
|
|
|
public function addFlag($flag) |
|
|
|
{ |
|
|
|
{ |
|
|
|
$this->flags = $this->flags | $flag; |
|
|
|
$this->flags |= $flag; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
/** |
|
|
|
/** |
|
|
|
@ -806,13 +806,14 @@ class Readability implements LoggerAwareInterface |
|
|
|
*/ |
|
|
|
*/ |
|
|
|
public function removeFlag($flag) |
|
|
|
public function removeFlag($flag) |
|
|
|
{ |
|
|
|
{ |
|
|
|
$this->flags = $this->flags & ~$flag; |
|
|
|
$this->flags &= ~$flag; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
/** |
|
|
|
/** |
|
|
|
* Debug. |
|
|
|
* Debug. |
|
|
|
* |
|
|
|
* |
|
|
|
* @deprecated use $this->logger->debug() instead |
|
|
|
* @deprecated use $this->logger->debug() instead |
|
|
|
|
|
|
|
* |
|
|
|
* @codeCoverageIgnore |
|
|
|
* @codeCoverageIgnore |
|
|
|
*/ |
|
|
|
*/ |
|
|
|
protected function dbg($msg) |
|
|
|
protected function dbg($msg) |
|
|
|
@ -824,6 +825,7 @@ class Readability implements LoggerAwareInterface |
|
|
|
* Dump debug info. |
|
|
|
* Dump debug info. |
|
|
|
* |
|
|
|
* |
|
|
|
* @deprecated since Monolog gather log, we don't need it |
|
|
|
* @deprecated since Monolog gather log, we don't need it |
|
|
|
|
|
|
|
* |
|
|
|
* @codeCoverageIgnore |
|
|
|
* @codeCoverageIgnore |
|
|
|
*/ |
|
|
|
*/ |
|
|
|
protected function dump_dbg() |
|
|
|
protected function dump_dbg() |
|
|
|
@ -973,11 +975,11 @@ class Readability implements LoggerAwareInterface |
|
|
|
* Using a variety of metrics (content score, classname, element types), find the content that is |
|
|
|
* Using a variety of metrics (content score, classname, element types), find the content that is |
|
|
|
* most likely to be the stuff a user wants to read. Then return it wrapped up in a div. |
|
|
|
* most likely to be the stuff a user wants to read. Then return it wrapped up in a div. |
|
|
|
* |
|
|
|
* |
|
|
|
* @param \DOMElement $page |
|
|
|
* @param ?\DOMElement $page |
|
|
|
* |
|
|
|
* |
|
|
|
* @return \DOMElement|false |
|
|
|
* @return \DOMElement|false |
|
|
|
*/ |
|
|
|
*/ |
|
|
|
protected function grabArticle(\DOMElement $page = null) |
|
|
|
protected function grabArticle($page = null) |
|
|
|
{ |
|
|
|
{ |
|
|
|
if (!$page) { |
|
|
|
if (!$page) { |
|
|
|
$page = $this->dom; |
|
|
|
$page = $this->dom; |
|
|
|
@ -992,7 +994,7 @@ class Readability implements LoggerAwareInterface |
|
|
|
|
|
|
|
|
|
|
|
$allElements = $page->getElementsByTagName('*'); |
|
|
|
$allElements = $page->getElementsByTagName('*'); |
|
|
|
|
|
|
|
|
|
|
|
for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); ++$nodeIndex) { |
|
|
|
for ($nodeIndex = 0; $node = $allElements->item($nodeIndex); ++$nodeIndex) { |
|
|
|
$tagName = $node->tagName; |
|
|
|
$tagName = $node->tagName; |
|
|
|
|
|
|
|
|
|
|
|
$nodeContent = $node->getInnerHTML(); |
|
|
|
$nodeContent = $node->getInnerHTML(); |
|
|
|
@ -1136,9 +1138,9 @@ class Readability implements LoggerAwareInterface |
|
|
|
// Remove unlikely candidates |
|
|
|
// Remove unlikely candidates |
|
|
|
$unlikelyMatchString = $node->getAttribute('class') . ' ' . $node->getAttribute('id') . ' ' . $node->getAttribute('style'); |
|
|
|
$unlikelyMatchString = $node->getAttribute('class') . ' ' . $node->getAttribute('id') . ' ' . $node->getAttribute('style'); |
|
|
|
|
|
|
|
|
|
|
|
if (mb_strlen($unlikelyMatchString) > 3 && // don't process "empty" strings |
|
|
|
if (mb_strlen($unlikelyMatchString) > 3 // don't process "empty" strings |
|
|
|
preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) && |
|
|
|
&& preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) |
|
|
|
!preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) |
|
|
|
&& !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) |
|
|
|
) { |
|
|
|
) { |
|
|
|
$this->logger->debug('Removing unlikely candidate (using conf) ' . $node->getNodePath() . ' by "' . $unlikelyMatchString . '" with readability ' . ($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0)); |
|
|
|
$this->logger->debug('Removing unlikely candidate (using conf) ' . $node->getNodePath() . ' by "' . $unlikelyMatchString . '" with readability ' . ($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0)); |
|
|
|
$node->parentNode->removeChild($node); |
|
|
|
$node->parentNode->removeChild($node); |
|
|
|
@ -1430,7 +1432,7 @@ class Readability implements LoggerAwareInterface |
|
|
|
unset($tidy); |
|
|
|
unset($tidy); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
$this->html = mb_convert_encoding($this->html, 'HTML-ENTITIES', 'UTF-8'); |
|
|
|
$this->html = '<meta charset="utf-8">' . (string) $this->html; |
|
|
|
|
|
|
|
|
|
|
|
if ('html5lib' === $this->parser || 'html5' === $this->parser) { |
|
|
|
if ('html5lib' === $this->parser || 'html5' === $this->parser) { |
|
|
|
$this->dom = (new HTML5())->loadHTML($this->html); |
|
|
|
$this->dom = (new HTML5())->loadHTML($this->html); |
|
|
|
|