pull/87/merge
Jan Tojnar 2 years ago committed by GitHub
commit de34bd194c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
  1. 2
      .php-cs-fixer.php
  2. 2
      phpstan.neon
  3. 100
      src/JSLikeHTMLElement.php
  4. 318
      src/Readability.php
  5. 72
      tests/ReadabilityTest.php

@ -28,8 +28,6 @@ return (new PhpCsFixer\Config())
'concat_space' => ['spacing' => 'one'], 'concat_space' => ['spacing' => 'one'],
// Pulled in by @Symfony:risky but we still support PHP 7.4 // Pulled in by @Symfony:risky but we still support PHP 7.4
'modernize_strpos' => false, 'modernize_strpos' => false,
// Pulled in by @Symfony, we cannot add property types until we bump PHP to ≥ 7.4
'no_null_property_initialization' => false,
]) ])
->setFinder($finder) ->setFinder($finder)
; ;

@ -1,5 +1,5 @@
parameters: parameters:
level: 1 level: 5
paths: paths:
- src - src
- tests - tests

@ -1,57 +1,43 @@
<?php <?php
declare(strict_types=1);
// SPDX-FileCopyrightText: 2011 Keyvan Minoukadeh - http://www.keyvan.net - keyvan@keyvan.net
// SPDX-License-Identifier: Apache-2.0
namespace Readability; namespace Readability;
/** /**
* JavaScript-like HTML DOM Element. * Wrapper for DOMElement adding methods for accessing string representation of inner HTML contents.
* *
* This class extends PHP's DOMElement to allow * Inspired by JavaScript innerHTML property.
* users to get and set the innerHTML property of * https://developer.mozilla.org/en-US/docs/Web/API/Element/innerHTML
* HTML elements in the same way it's done in
* JavaScript.
* *
* Example usage: * Example usage:
* require_once 'JSLikeHTMLElement.php';
* header('Content-Type: text/plain');
* $doc = new DOMDocument();
* $doc->registerNodeClass('DOMElement', 'JSLikeHTMLElement');
* $doc->loadHTML('<div><p>Para 1</p><p>Para 2</p></div>');
* $elem = $doc->getElementsByTagName('div')->item(0);
*
* // print innerHTML
* echo $elem->innerHTML; // prints '<p>Para 1</p><p>Para 2</p>'
* echo "\n\n";
* *
* // set innerHTML * ```php
* $elem->innerHTML = '<a href="http://fivefilters.org">FiveFilters.org</a>'; * $doc = new DOMDocument();
* echo $elem->innerHTML; // prints '<a href="http://fivefilters.org">FiveFilters.org</a>' * $doc->loadHTML('<div><p>Para 1</p><p>Para 2</p></div>');
* echo "\n\n"; * $elem = $doc->getElementsByTagName('div')->item(0);
* *
* // print document (with our changes) * // Get inner HTML
* echo $doc->saveXML(); * assert($elem->getInnerHtml() === '<p>Para 1</p><p>Para 2</p>');
* *
* @author Keyvan Minoukadeh - http://www.keyvan.net - keyvan@keyvan.net * // Set inner HTML
* $elem->setInnerHtml('<a href="http://fivefilters.org">FiveFilters.org</a>');
* assert($elem->getInnerHtml() === '<a href="http://fivefilters.org">FiveFilters.org</a>');
* *
* @see http://fivefilters.org (the project this was written for) * // print document (with our changes)
* echo $doc->saveXML();
* ```
*/ */
class JSLikeHTMLElement extends \DOMElement final class JSLikeHTMLElement extends \DOMElement
{ {
/** /**
* Used for setting innerHTML like it's done in JavaScript:. * Sets inner HTML.
*
* ```php
* $div->innerHTML = '<h2>Chapter 2</h2><p>The story begins...</p>';
* ```
*/ */
public function __set($name, $value) public function setInnerHtml(string $value): void
{ {
if ('innerHTML' !== $name) {
$trace = debug_backtrace();
trigger_error('Undefined property via __set(): ' . $name . ' in ' . $trace[0]['file'] . ' on line ' . $trace[0]['line'], \E_USER_NOTICE);
return;
}
// first, empty the element // first, empty the element
if (isset($this->childNodes)) { if (isset($this->childNodes)) {
for ($x = $this->childNodes->length - 1; $x >= 0; --$x) { for ($x = $this->childNodes->length - 1; $x >= 0; --$x) {
@ -81,7 +67,7 @@ class JSLikeHTMLElement extends \DOMElement
$f = new \DOMDocument(); $f = new \DOMDocument();
// Using <htmlfragment> will generate a warning, but so will bad HTML // Using <htmlfragment> will generate a warning, but so will bad HTML
// (and by this point, bad HTML is what we've got). // (and by this point, bad HTML is what we've got).
// We use it (and suppress the warning) because an HTML fragment will // We use it (and suppress the warning) because an HTML fragment will
// be wrapped around <html><body> tags which we don't really want to keep. // be wrapped around <html><body> tags which we don't really want to keep.
// Note: despite the warning, if loadHTML succeeds it will return true. // Note: despite the warning, if loadHTML succeeds it will return true.
@ -102,42 +88,18 @@ class JSLikeHTMLElement extends \DOMElement
} }
/** /**
* Used for getting innerHTML like it's done in JavaScript:. * Gets inner HTML.
*
* ```php
* $string = $div->innerHTML;
* ```
*/ */
public function __get($name) public function getInnerHtml(): string
{ {
if ('innerHTML' === $name) { $inner = '';
$inner = '';
if (isset($this->childNodes)) { if (isset($this->childNodes)) {
foreach ($this->childNodes as $child) { foreach ($this->childNodes as $child) {
$inner .= $this->ownerDocument->saveXML($child); $inner .= $this->ownerDocument->saveXML($child);
}
} }
return $inner;
} }
$trace = debug_backtrace(); return $inner;
trigger_error('Undefined property via __get(): ' . $name . ' in ' . $trace[0]['file'] . ' on line ' . $trace[0]['line'], \E_USER_NOTICE);
}
public function __toString()
{
return '[' . $this->tagName . ']';
}
public function getInnerHtml()
{
return $this->__get('innerHTML');
}
public function setInnerHtml($value)
{
return $this->__set('innerHTML', $value);
} }
} }

@ -1,5 +1,7 @@
<?php <?php
declare(strict_types=1);
namespace Readability; namespace Readability;
use Masterminds\HTML5; use Masterminds\HTML5;
@ -24,28 +26,36 @@ class Readability implements LoggerAwareInterface
public const MIN_ARTICLE_LENGTH = 200; public const MIN_ARTICLE_LENGTH = 200;
public const MIN_NODE_LENGTH = 80; public const MIN_NODE_LENGTH = 80;
public const MAX_LINK_DENSITY = 0.25; public const MAX_LINK_DENSITY = 0.25;
public $convertLinksToFootnotes = false;
public $revertForcedParagraphElements = false; public bool $convertLinksToFootnotes = false;
public $articleTitle; public bool $revertForcedParagraphElements = false;
public $articleContent;
public $original_html; public ?JSLikeHTMLElement $articleTitle;
public ?JSLikeHTMLElement $articleContent;
public ?string $original_html;
public ?\DOMDocument $dom;
/**
* @var ?string URL where HTML was retrieved
*/
public ?string $url = null;
/** /**
* @var \DOMDocument * @var bool preserves more content (experimental)
*/ */
public $dom; public bool $lightClean = true;
// optional - URL where HTML was retrieved
public $url = null; public bool $tidied = false;
// preserves more content (experimental)
public $lightClean = true;
// no longer used, kept to avoid a BC break
public $debug = false;
public $tidied = false;
/** /**
* All of the regular expressions in use within readability. * @var array<string, string> All of the regular expressions in use within readability.
*
* Defined up here so we don't instantiate them repeatedly in loops. * Defined up here so we don't instantiate them repeatedly in loops.
*/ */
public $regexps = [ public array $regexps = [
'unlikelyCandidates' => '/-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i', 'unlikelyCandidates' => '/-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i',
'okMaybeItsACandidate' => '/article\b|contain|\bcontent|column|general|detail|shadow|lightbox|blog|body|entry|main|page|footnote|element/i', 'okMaybeItsACandidate' => '/article\b|contain|\bcontent|column|general|detail|shadow|lightbox|blog|body|entry|main|page|footnote|element/i',
'positive' => '/read|full|article|body|\bcontent|contain|entry|main|markdown|media|page|attach|pagination|post|text|blog|story/i', 'positive' => '/read|full|article|body|\bcontent|contain|entry|main|markdown|media|page|attach|pagination|post|text|blog|story/i',
@ -57,10 +67,18 @@ class Readability implements LoggerAwareInterface
'hasContent' => '/\S$/', 'hasContent' => '/\S$/',
'isNotVisible' => '/display\s*:\s*none/', 'isNotVisible' => '/display\s*:\s*none/',
]; ];
public $defaultTagsToScore = ['section', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'td', 'pre'];
// The commented out elements qualify as phrasing content but tend to be /**
// removed by readability when put into paragraphs, so we ignore them here. * @var array<string>
public $phrasingElements = [ */
public array $defaultTagsToScore = ['section', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'td', 'pre'];
/**
* @var array<string>
*/
public array $phrasingElements = [
// The commented out elements qualify as phrasing content but tend to be
// removed by readability when put into paragraphs, so we ignore them here.
// "CANVAS", "IFRAME", "SVG", "VIDEO", // "CANVAS", "IFRAME", "SVG", "VIDEO",
'ABBR', 'AUDIO', 'B', 'BDO', 'BR', 'BUTTON', 'CITE', 'CODE', 'DATA', 'ABBR', 'AUDIO', 'B', 'BDO', 'BR', 'BUTTON', 'CITE', 'CODE', 'DATA',
'DATALIST', 'DFN', 'EM', 'EMBED', 'I', 'IMG', 'INPUT', 'KBD', 'LABEL', 'DATALIST', 'DFN', 'EM', 'EMBED', 'I', 'IMG', 'INPUT', 'KBD', 'LABEL',
@ -68,7 +86,11 @@ class Readability implements LoggerAwareInterface
'RUBY', 'SAMP', 'SCRIPT', 'SELECT', 'SMALL', 'SPAN', 'STRONG', 'SUB', 'RUBY', 'SAMP', 'SCRIPT', 'SELECT', 'SMALL', 'SPAN', 'STRONG', 'SUB',
'SUP', 'TEXTAREA', 'TIME', 'VAR', 'WBR', 'SUP', 'TEXTAREA', 'TIME', 'VAR', 'WBR',
]; ];
public $tidy_config = [
/**
* @var array<string, bool|string>
*/
public array $tidy_config = [
'tidy-mark' => false, 'tidy-mark' => false,
'vertical-space' => false, 'vertical-space' => false,
'doctype' => 'omit', 'doctype' => 'omit',
@ -92,21 +114,41 @@ class Readability implements LoggerAwareInterface
'output-encoding' => 'utf8', 'output-encoding' => 'utf8',
'hide-comments' => true, 'hide-comments' => true,
]; ];
// article domain regexp for calibration
protected $domainRegExp = null; /**
protected $body = null; * @var ?string article domain regexp for calibration
// Cache the body HTML in case we need to re-use it later */
protected $bodyCache = null; protected ?string $domainRegExp = null;
// 1 | 2 | 4; // Start with all processing flags set.
protected $flags = 7; protected ?JSLikeHTMLElement $body = null;
// indicates whether we were able to extract or not
protected $success = false; /**
protected $logger; * @var ?string Cache the body HTML in case we need to re-use it later
protected $parser; */
protected $html; protected ?string $bodyCache = null;
protected $useTidy;
// raw HTML filters /**
protected $pre_filters = [ * @var int-mask-of<self::FLAG_*> start with all processing flags set
*/
protected int $flags = self::FLAG_STRIP_UNLIKELYS | self::FLAG_WEIGHT_ATTRIBUTES | self::FLAG_CLEAN_CONDITIONALLY;
/**
* @var bool indicates whether we were able to extract or not
*/
protected bool $success = false;
protected LoggerInterface $logger;
protected string $parser;
protected string $html;
protected bool $useTidy;
/**
* @var array<string, string> raw HTML filters
*/
protected array $pre_filters = [
// remove spans as we redefine styles and they're probably special-styled // remove spans as we redefine styles and they're probably special-styled
'!</?span[^>]*>!is' => '', '!</?span[^>]*>!is' => '',
// HACK: firewall-filtered content // HACK: firewall-filtered content
@ -118,8 +160,11 @@ class Readability implements LoggerAwareInterface
// replace fonts to spans // replace fonts to spans
'!<(/?)font[^>]*>!is' => '<\\1span>', '!<(/?)font[^>]*>!is' => '<\\1span>',
]; ];
// output HTML filters
protected $post_filters = [ /**
* @var array<string, string> output HTML filters
*/
protected array $post_filters = [
// replace excessive br's // replace excessive br's
'/<br\s*\/?>\s*<p/i' => '<p', '/<br\s*\/?>\s*<p/i' => '<p',
// replace empty tags that break layouts // replace empty tags that break layouts
@ -159,20 +204,16 @@ class Readability implements LoggerAwareInterface
/** /**
* Get article title element. * Get article title element.
*
* @return \DOMElement
*/ */
public function getTitle() public function getTitle(): JSLikeHTMLElement
{ {
return $this->articleTitle; return $this->articleTitle;
} }
/** /**
* Get article content element. * Get article content element.
*
* @return \DOMElement
*/ */
public function getContent() public function getContent(): JSLikeHTMLElement
{ {
return $this->articleContent; return $this->articleContent;
} }
@ -221,6 +262,7 @@ class Readability implements LoggerAwareInterface
// Assume successful outcome // Assume successful outcome
$this->success = true; $this->success = true;
/** @var \DOMNodeList<JSLikeHTMLElement> */
$bodyElems = $this->dom->getElementsByTagName('body'); $bodyElems = $this->dom->getElementsByTagName('body');
// WTF multiple body nodes? // WTF multiple body nodes?
@ -243,8 +285,9 @@ class Readability implements LoggerAwareInterface
$articleTitle = $this->getArticleTitle(); $articleTitle = $this->getArticleTitle();
$articleContent = $this->grabArticle(); $articleContent = $this->grabArticle();
if (!$articleContent) { if (null === $articleContent) {
$this->success = false; $this->success = false;
/** @var JSLikeHTMLElement */
$articleContent = $this->dom->createElement('div'); $articleContent = $this->dom->createElement('div');
$articleContent->setAttribute('class', 'readability-content'); $articleContent->setAttribute('class', 'readability-content');
$articleContent->setInnerHtml('<p>Sorry, Readability was unable to parse this page for content.</p>'); $articleContent->setInnerHtml('<p>Sorry, Readability was unable to parse this page for content.</p>');
@ -260,7 +303,9 @@ class Readability implements LoggerAwareInterface
// without tidy the body can (sometimes) be wiped, so re-create it // without tidy the body can (sometimes) be wiped, so re-create it
if (false === isset($this->body->childNodes)) { if (false === isset($this->body->childNodes)) {
$this->body = $this->dom->createElement('body'); /** @var JSLikeHTMLElement */
$body = $this->dom->createElement('body');
$this->body = $body;
} }
// Clear the old HTML, insert the new content. // Clear the old HTML, insert the new content.
@ -293,19 +338,23 @@ class Readability implements LoggerAwareInterface
*/ */
public function addFootnotes(\DOMElement $articleContent): void public function addFootnotes(\DOMElement $articleContent): void
{ {
/** @var JSLikeHTMLElement */
$footnotesWrapper = $this->dom->createElement('footer'); $footnotesWrapper = $this->dom->createElement('footer');
$footnotesWrapper->setAttribute('class', 'readability-footnotes'); $footnotesWrapper->setAttribute('class', 'readability-footnotes');
$footnotesWrapper->setInnerHtml('<h3>References</h3>'); $footnotesWrapper->setInnerHtml('<h3>References</h3>');
$articleFootnotes = $this->dom->createElement('ol'); $articleFootnotes = $this->dom->createElement('ol');
$articleFootnotes->setAttribute('class', 'readability-footnotes-list'); $articleFootnotes->setAttribute('class', 'readability-footnotes-list');
$footnotesWrapper->appendChild($articleFootnotes); $footnotesWrapper->appendChild($articleFootnotes);
/** @var \DOMNodeList<JSLikeHTMLElement> */
$articleLinks = $articleContent->getElementsByTagName('a'); $articleLinks = $articleContent->getElementsByTagName('a');
$linkCount = 0; $linkCount = 0;
for ($i = 0; $i < $articleLinks->length; ++$i) { for ($i = 0; $i < $articleLinks->length; ++$i) {
$articleLink = $articleLinks->item($i); $articleLink = $articleLinks->item($i);
$footnoteLink = $articleLink->cloneNode(true); $footnoteLink = $articleLink->cloneNode(true);
/** @var JSLikeHTMLElement */
$refLink = $this->dom->createElement('a'); $refLink = $this->dom->createElement('a');
/** @var JSLikeHTMLElement */
$footnote = $this->dom->createElement('li'); $footnote = $this->dom->createElement('li');
$linkDomain = @parse_url($footnoteLink->getAttribute('href'), \PHP_URL_HOST); $linkDomain = @parse_url($footnoteLink->getAttribute('href'), \PHP_URL_HOST);
if (!$linkDomain && isset($this->url)) { if (!$linkDomain && isset($this->url)) {
@ -355,7 +404,7 @@ class Readability implements LoggerAwareInterface
*/ */
public function prepArticle(\DOMNode $articleContent): void public function prepArticle(\DOMNode $articleContent): void
{ {
if (!$articleContent instanceof \DOMElement) { if (!$articleContent instanceof JSLikeHTMLElement) {
return; return;
} }
@ -382,6 +431,7 @@ class Readability implements LoggerAwareInterface
} }
// Remove service data-candidate attribute. // Remove service data-candidate attribute.
/** @var \DOMNodeList<JSLikeHTMLElement> */
$elems = $xpath->query('.//*[@data-candidate]', $articleContent); $elems = $xpath->query('.//*[@data-candidate]', $articleContent);
for ($i = $elems->length - 1; $i >= 0; --$i) { for ($i = $elems->length - 1; $i >= 0; --$i) {
$elems->item($i)->removeAttribute('data-candidate'); $elems->item($i)->removeAttribute('data-candidate');
@ -454,12 +504,8 @@ class Readability implements LoggerAwareInterface
/** /**
* Get the inner text of a node. * Get the inner text of a node.
* This also strips out any excess whitespace to be found. * This also strips out any excess whitespace to be found.
*
* @param \DOMElement $e
* @param bool $normalizeSpaces (default: true)
* @param bool $flattenLines (default: false)
*/ */
public function getInnerText($e, bool $normalizeSpaces = true, bool $flattenLines = false): string public function getInnerText(?\DOMNode $e, bool $normalizeSpaces = true, bool $flattenLines = false): string
{ {
if (null === $e || !isset($e->textContent) || '' === $e->textContent) { if (null === $e || !isset($e->textContent) || '' === $e->textContent) {
return ''; return '';
@ -481,7 +527,7 @@ class Readability implements LoggerAwareInterface
/** /**
* Remove the style attribute on every $e and under. * Remove the style attribute on every $e and under.
*/ */
public function cleanStyles(\DOMElement $e): void public function cleanStyles(JSLikeHTMLElement $e): void
{ {
if (\is_object($e)) { if (\is_object($e)) {
$elems = $e->getElementsByTagName('*'); $elems = $e->getElementsByTagName('*');
@ -514,7 +560,7 @@ class Readability implements LoggerAwareInterface
* This is the amount of text that is inside a link divided by the total text in the node. * This is the amount of text that is inside a link divided by the total text in the node.
* Can exclude external references to differentiate between simple text and menus/infoblocks. * Can exclude external references to differentiate between simple text and menus/infoblocks.
*/ */
public function getLinkDensity(\DOMElement $e, bool $excludeExternal = false): float public function getLinkDensity(JSLikeHTMLElement $e, bool $excludeExternal = false): float
{ {
$links = $e->getElementsByTagName('a'); $links = $e->getElementsByTagName('a');
$textLength = mb_strlen($this->getInnerText($e, true, true)); $textLength = mb_strlen($this->getInnerText($e, true, true));
@ -537,7 +583,7 @@ class Readability implements LoggerAwareInterface
/** /**
* Get an element relative weight. * Get an element relative weight.
*/ */
public function getWeight(\DOMElement $e): int public function getWeight(JSLikeHTMLElement $e): int
{ {
if (!$this->flagIsActive(self::FLAG_WEIGHT_ATTRIBUTES)) { if (!$this->flagIsActive(self::FLAG_WEIGHT_ATTRIBUTES)) {
return 0; return 0;
@ -555,7 +601,7 @@ class Readability implements LoggerAwareInterface
/** /**
* Remove extraneous break tags from a node. * Remove extraneous break tags from a node.
*/ */
public function killBreaks(\DOMElement $node): void public function killBreaks(JSLikeHTMLElement $node): void
{ {
$html = $node->getInnerHTML(); $html = $node->getInnerHTML();
$html = preg_replace($this->regexps['killBreaks'], '<br />', $html); $html = preg_replace($this->regexps['killBreaks'], '<br />', $html);
@ -568,8 +614,9 @@ class Readability implements LoggerAwareInterface
* *
* Updated 2012-09-18 to preserve youtube/vimeo iframes * Updated 2012-09-18 to preserve youtube/vimeo iframes
*/ */
public function clean(\DOMElement $e, string $tag): void public function clean(JSLikeHTMLElement $e, string $tag): void
{ {
/** @var \DOMNodeList<JSLikeHTMLElement> */
$targetList = $e->getElementsByTagName($tag); $targetList = $e->getElementsByTagName($tag);
$isEmbed = ('audio' === $tag || 'video' === $tag || 'iframe' === $tag || 'object' === $tag || 'embed' === $tag); $isEmbed = ('audio' === $tag || 'video' === $tag || 'iframe' === $tag || 'object' === $tag || 'embed' === $tag);
@ -600,12 +647,13 @@ class Readability implements LoggerAwareInterface
* "Fishy" is an algorithm based on content length, classnames, * "Fishy" is an algorithm based on content length, classnames,
* link density, number of images & embeds, etc. * link density, number of images & embeds, etc.
*/ */
public function cleanConditionally(\DOMElement $e, string $tag): void public function cleanConditionally(JSLikeHTMLElement $e, string $tag): void
{ {
if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) { if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
return; return;
} }
/** @var \DOMNodeList<JSLikeHTMLElement> */
$tagsList = $e->getElementsByTagName($tag); $tagsList = $e->getElementsByTagName($tag);
$curTagsLength = $tagsList->length; $curTagsLength = $tagsList->length;
@ -618,7 +666,7 @@ class Readability implements LoggerAwareInterface
for ($i = $curTagsLength - 1; $i >= 0; --$i) { for ($i = $curTagsLength - 1; $i >= 0; --$i) {
$node = $tagsList->item($i); $node = $tagsList->item($i);
$weight = $this->getWeight($node); $weight = $this->getWeight($node);
$contentScore = ($node->hasAttribute('readability')) ? (int) $node->getAttribute('readability') : 0; $contentScore = self::getContentScore($node);
$this->logger->debug('Start conditional cleaning of ' . $node->getNodePath() . ' (class=' . $node->getAttribute('class') . '; id=' . $node->getAttribute('id') . ')' . (($node->hasAttribute('readability')) ? (' with score ' . $node->getAttribute('readability')) : '')); $this->logger->debug('Start conditional cleaning of ' . $node->getNodePath() . ' (class=' . $node->getAttribute('class') . '; id=' . $node->getAttribute('id') . ')' . (($node->hasAttribute('readability')) ? (' with score ' . $node->getAttribute('readability')) : ''));
// XXX Incomplete implementation // XXX Incomplete implementation
@ -713,9 +761,10 @@ class Readability implements LoggerAwareInterface
/** /**
* Clean out spurious headers from an Element. Checks things like classnames and link density. * Clean out spurious headers from an Element. Checks things like classnames and link density.
*/ */
public function cleanHeaders(\DOMElement $e): void public function cleanHeaders(JSLikeHTMLElement $e): void
{ {
for ($headerIndex = 1; $headerIndex < 3; ++$headerIndex) { for ($headerIndex = 1; $headerIndex < 3; ++$headerIndex) {
/** @var \DOMNodeList<JSLikeHTMLElement> */
$headers = $e->getElementsByTagName('h' . $headerIndex); $headers = $e->getElementsByTagName('h' . $headerIndex);
for ($i = $headers->length - 1; $i >= 0; --$i) { for ($i = $headers->length - 1; $i >= 0; --$i) {
@ -752,10 +801,8 @@ class Readability implements LoggerAwareInterface
/** /**
* Get the article title as an H1. * Get the article title as an H1.
*
* @return \DOMElement
*/ */
protected function getArticleTitle() protected function getArticleTitle(): JSLikeHTMLElement
{ {
try { try {
$curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0)); $curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0));
@ -786,6 +833,7 @@ class Readability implements LoggerAwareInterface
$curTitle = $origTitle; $curTitle = $origTitle;
} }
/** @var JSLikeHTMLElement */
$articleTitle = $this->dom->createElement('h1'); $articleTitle = $this->dom->createElement('h1');
$articleTitle->setInnerHtml($curTitle); $articleTitle->setInnerHtml($curTitle);
@ -803,7 +851,9 @@ class Readability implements LoggerAwareInterface
* so we create a new body node and append it to the document. * so we create a new body node and append it to the document.
*/ */
if (null === $this->body) { if (null === $this->body) {
$this->body = $this->dom->createElement('body'); /** @var JSLikeHTMLElement */
$body = $this->dom->createElement('body');
$this->body = $body;
$this->dom->documentElement->appendChild($this->body); $this->dom->documentElement->appendChild($this->body);
} }
@ -825,35 +875,32 @@ class Readability implements LoggerAwareInterface
* Initialize a node with the readability object. Also checks the * Initialize a node with the readability object. Also checks the
* className/id for special names to add to its score. * className/id for special names to add to its score.
*/ */
protected function initializeNode(\DOMElement $node): void protected function initializeNode(JSLikeHTMLElement $node): void
{ {
if (!isset($node->tagName)) { if (!isset($node->tagName)) {
return; return;
} }
$readability = $this->dom->createAttribute('readability'); $contentScore = 0;
// this is our contentScore
$readability->value = 0;
$node->setAttributeNode($readability);
// using strtoupper just in case // using strtoupper just in case
switch (strtoupper($node->tagName)) { switch (strtoupper($node->tagName)) {
case 'ARTICLE': case 'ARTICLE':
$readability->value += 15; $contentScore += 15;
// no break // no break
case 'DIV': case 'DIV':
$readability->value += 5; $contentScore += 5;
break; break;
case 'PRE': case 'PRE':
case 'CODE': case 'CODE':
case 'TD': case 'TD':
case 'BLOCKQUOTE': case 'BLOCKQUOTE':
case 'FIGURE': case 'FIGURE':
$readability->value += 3; $contentScore += 3;
break; break;
case 'SECTION': case 'SECTION':
// often misused // often misused
// $readability->value += 2; // $contentScore += 2;
break; break;
case 'OL': case 'OL':
case 'UL': case 'UL':
@ -861,7 +908,7 @@ class Readability implements LoggerAwareInterface
case 'DD': case 'DD':
case 'DT': case 'DT':
case 'LI': case 'LI':
$readability->value -= 3; $contentScore -= 3;
break; break;
case 'ASIDE': case 'ASIDE':
case 'FOOTER': case 'FOOTER':
@ -872,7 +919,7 @@ class Readability implements LoggerAwareInterface
case 'TEXTAREA': case 'TEXTAREA':
case 'INPUT': case 'INPUT':
case 'NAV': case 'NAV':
$readability->value -= 3; $contentScore -= 3;
break; break;
case 'H1': case 'H1':
case 'H2': case 'H2':
@ -882,20 +929,22 @@ class Readability implements LoggerAwareInterface
case 'H6': case 'H6':
case 'TH': case 'TH':
case 'HGROUP': case 'HGROUP':
$readability->value -= 5; $contentScore -= 5;
break; break;
} }
$readability->value += $this->getWeight($node); $contentScore += $this->getWeight($node);
$readability = $this->dom->createAttribute('readability');
$readability->value = (string) $contentScore;
$node->setAttributeNode($readability);
} }
/** /**
* Using a variety of metrics (content score, classname, element types), find the content that is * Using a variety of metrics (content score, classname, element types), find the content that is
* most likely to be the stuff a user wants to read. Then return it wrapped up in a div. * most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
*
* @return \DOMElement|false
*/ */
protected function grabArticle(?\DOMElement $page = null) protected function grabArticle(?JSLikeHTMLElement $page = null): ?JSLikeHTMLElement
{ {
if (!$page) { if (!$page) {
$page = $this->dom; $page = $this->dom;
@ -908,6 +957,7 @@ class Readability implements LoggerAwareInterface
$xpath = new \DOMXPath($page); $xpath = new \DOMXPath($page);
} }
/** @var \DOMNodeList<JSLikeHTMLElement> */
$allElements = $page->getElementsByTagName('*'); $allElements = $page->getElementsByTagName('*');
for ($nodeIndex = 0; $allElements->item($nodeIndex); ++$nodeIndex) { for ($nodeIndex = 0; $allElements->item($nodeIndex); ++$nodeIndex) {
@ -950,6 +1000,7 @@ class Readability implements LoggerAwareInterface
// (as in, where they contain no other block level elements). // (as in, where they contain no other block level elements).
if ('div' === $tagName) { if ('div' === $tagName) {
if (!preg_match($this->regexps['divToPElements'], $nodeContent)) { if (!preg_match($this->regexps['divToPElements'], $nodeContent)) {
/** @var JSLikeHTMLElement */
$newNode = $this->dom->createElement('p'); $newNode = $this->dom->createElement('p');
try { try {
@ -999,8 +1050,8 @@ class Readability implements LoggerAwareInterface
} }
} }
if ($this->hasSingleTagInsideElement($node, 'p') && $this->getLinkDensity($node) < 0.25) { $newNode = $this->getSingleTagInsideElement($node, 'p');
$newNode = $node->childNodes->item(0); if (null !== $newNode && $this->getLinkDensity($node) < 0.25) {
$node->parentNode->replaceChild($newNode, $node); $node->parentNode->replaceChild($newNode, $node);
$nodesToScore[] = $newNode; $nodesToScore[] = $newNode;
} }
@ -1041,7 +1092,7 @@ class Readability implements LoggerAwareInterface
foreach ($ancestors as $level => $ancestor) { foreach ($ancestors as $level => $ancestor) {
if (!$ancestor->nodeName || !$ancestor->parentNode) { if (!$ancestor->nodeName || !$ancestor->parentNode) {
return; return null;
} }
if (!$ancestor->hasAttribute('readability')) { if (!$ancestor->hasAttribute('readability')) {
@ -1056,7 +1107,8 @@ class Readability implements LoggerAwareInterface
} else { } else {
$scoreDivider = $level * 3; $scoreDivider = $level * 3;
} }
$ancestor->getAttributeNode('readability')->value += $contentScore / $scoreDivider;
self::updateContentScore($ancestor, fn ($prevScore) => $prevScore + $contentScore / $scoreDivider);
} }
} }
@ -1065,13 +1117,14 @@ class Readability implements LoggerAwareInterface
* This is faster to do before scoring but safer after. * This is faster to do before scoring but safer after.
*/ */
if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS) && $xpath) { if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS) && $xpath) {
/** @var \DOMNodeList<JSLikeHTMLElement> */
$candidates = $xpath->query('.//*[(self::footer and count(//footer)<2) or (self::aside and count(//aside)<2)]', $page->documentElement); $candidates = $xpath->query('.//*[(self::footer and count(//footer)<2) or (self::aside and count(//aside)<2)]', $page->documentElement);
for ($c = $candidates->length - 1; $c >= 0; --$c) { for ($c = $candidates->length - 1; $c >= 0; --$c) {
$node = $candidates->item($c); $node = $candidates->item($c);
// node should be readable but not inside of an article otherwise it's probably non-readable block // node should be readable but not inside of an article otherwise it's probably non-readable block
if ($node->hasAttribute('readability') && (int) $node->getAttributeNode('readability')->value < 40 && ($node->parentNode ? 0 !== strcasecmp($node->parentNode->tagName, 'article') : true)) { if ($node->hasAttribute('readability') && (int) $node->getAttributeNode('readability')->value < 40 && ($node->parentNode instanceof JSLikeHTMLElement ? 0 !== strcasecmp($node->parentNode->tagName, 'article') : true)) {
$this->logger->debug('Removing unlikely candidate (using note) ' . $node->getNodePath() . ' by "' . $node->tagName . '" with readability ' . ($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0)); $this->logger->debug('Removing unlikely candidate (using note) ' . $node->getNodePath() . ' by "' . $node->tagName . '" with readability ' . self::getContentScore($node));
$node->parentNode->removeChild($node); $node->parentNode->removeChild($node);
} }
} }
@ -1091,6 +1144,7 @@ class Readability implements LoggerAwareInterface
$topCandidates = array_fill(0, 5, null); $topCandidates = array_fill(0, 5, null);
if ($xpath) { if ($xpath) {
// Using array of DOMElements after deletion is a path to DOOMElement. // Using array of DOMElements after deletion is a path to DOOMElement.
/** @var \DOMNodeList<JSLikeHTMLElement> */
$candidates = $xpath->query('.//*[@data-candidate]', $page->documentElement); $candidates = $xpath->query('.//*[@data-candidate]', $page->documentElement);
$this->logger->debug('Candidates: ' . $candidates->length); $this->logger->debug('Candidates: ' . $candidates->length);
@ -1100,14 +1154,13 @@ class Readability implements LoggerAwareInterface
// Scale the final candidates score based on link density. Good content should have a // Scale the final candidates score based on link density. Good content should have a
// relatively small link density (5% or less) and be mostly unaffected by this operation. // relatively small link density (5% or less) and be mostly unaffected by this operation.
// If not for this we would have used XPath to find maximum @readability. // If not for this we would have used XPath to find maximum @readability.
$readability = $item->getAttributeNode('readability'); self::updateContentScore($item, fn ($prevScore) => round($prevScore * (1 - $this->getLinkDensity($item)), 0, \PHP_ROUND_HALF_UP));
$readability->value = round($readability->value * (1 - $this->getLinkDensity($item)), 0, \PHP_ROUND_HALF_UP);
for ($t = 0; $t < 5; ++$t) { for ($t = 0; $t < 5; ++$t) {
$aTopCandidate = $topCandidates[$t]; $aTopCandidate = $topCandidates[$t];
if (!$aTopCandidate || $readability->value > (int) $aTopCandidate->getAttribute('readability')) { if (!$aTopCandidate || self::getContentScore($item) > self::getContentScore($aTopCandidate)) {
$this->logger->debug('Candidate: ' . $item->getNodePath() . ' (' . $item->getAttribute('class') . ':' . $item->getAttribute('id') . ') with score ' . $readability->value); $this->logger->debug('Candidate: ' . $item->getNodePath() . ' (' . $item->getAttribute('class') . ':' . $item->getAttribute('id') . ') with score ' . self::getContentScore($item));
array_splice($topCandidates, $t, 0, [$item]); array_splice($topCandidates, $t, 0, [$item]);
if (\count($topCandidates) > 5) { if (\count($topCandidates) > 5) {
array_pop($topCandidates); array_pop($topCandidates);
@ -1118,6 +1171,7 @@ class Readability implements LoggerAwareInterface
} }
} }
/** @var non-empty-array<JSLikeHTMLElement|null> */
$topCandidates = array_filter( $topCandidates = array_filter(
$topCandidates, $topCandidates,
fn ($v, $idx) => 0 === $idx || null !== $v, fn ($v, $idx) => 0 === $idx || null !== $v,
@ -1130,18 +1184,21 @@ class Readability implements LoggerAwareInterface
* We also have to copy the body node so it is something we can modify. * We also have to copy the body node so it is something we can modify.
*/ */
if (null === $topCandidate || 0 === strcasecmp($topCandidate->tagName, 'body')) { if (null === $topCandidate || 0 === strcasecmp($topCandidate->tagName, 'body')) {
/** @var JSLikeHTMLElement */
$topCandidate = $this->dom->createElement('div'); $topCandidate = $this->dom->createElement('div');
if ($page instanceof \DOMDocument) { if ($page instanceof \DOMDocument) {
if (!isset($page->documentElement)) { /** @var ?JSLikeHTMLElement */
$documentElement = $page->documentElement;
if (null === $documentElement) {
// we don't have a body either? what a mess! :) // we don't have a body either? what a mess! :)
$this->logger->debug('The page has no body!'); $this->logger->debug('The page has no body!');
} else { } else {
$this->logger->debug('Setting body to a raw HTML of original page!'); $this->logger->debug('Setting body to a raw HTML of original page!');
$topCandidate->setInnerHtml($page->documentElement->getInnerHTML()); $topCandidate->setInnerHtml($documentElement->getInnerHTML());
$page->documentElement->setInnerHtml(''); $documentElement->setInnerHtml('');
$this->reinitBody(); $this->reinitBody();
$page->documentElement->appendChild($topCandidate); $documentElement->appendChild($topCandidate);
} }
} else { } else {
$topCandidate->setInnerHtml($page->getInnerHTML()); $topCandidate->setInnerHtml($page->getInnerHTML());
@ -1150,7 +1207,7 @@ class Readability implements LoggerAwareInterface
} }
$this->initializeNode($topCandidate); $this->initializeNode($topCandidate);
} elseif ($topCandidate) { } elseif (null !== $topCandidate) {
$alternativeCandidateAncestors = []; $alternativeCandidateAncestors = [];
foreach ($topCandidates as $candidate) { foreach ($topCandidates as $candidate) {
if ((int) $candidate->getAttribute('readability') / (int) $topCandidate->getAttribute('readability') >= 0.75) { if ((int) $candidate->getAttribute('readability') / (int) $topCandidate->getAttribute('readability') >= 0.75) {
@ -1161,7 +1218,7 @@ class Readability implements LoggerAwareInterface
} }
if (\count($alternativeCandidateAncestors) >= 3) { if (\count($alternativeCandidateAncestors) >= 3) {
$parentOfTopCandidate = $topCandidate->parentNode; $parentOfTopCandidate = $topCandidate->parentNode;
while ('body' !== $parentOfTopCandidate->nodeName) { while ('body' !== $parentOfTopCandidate->nodeName && $parentOfTopCandidate instanceof JSLikeHTMLElement) {
$listsContainingThisAncestor = 0; $listsContainingThisAncestor = 0;
for ($ancestorIndex = 0; $ancestorIndex < \count($alternativeCandidateAncestors) && $listsContainingThisAncestor < 3; ++$ancestorIndex) { for ($ancestorIndex = 0; $ancestorIndex < \count($alternativeCandidateAncestors) && $listsContainingThisAncestor < 3; ++$ancestorIndex) {
$listsContainingThisAncestor += (int) \in_array($parentOfTopCandidate, $alternativeCandidateAncestors[$ancestorIndex], true); $listsContainingThisAncestor += (int) \in_array($parentOfTopCandidate, $alternativeCandidateAncestors[$ancestorIndex], true);
@ -1210,7 +1267,7 @@ class Readability implements LoggerAwareInterface
if (0 === strcasecmp($tagName, 'td') || 0 === strcasecmp($tagName, 'tr')) { if (0 === strcasecmp($tagName, 'td') || 0 === strcasecmp($tagName, 'tr')) {
$up = $topCandidate; $up = $topCandidate;
if ($up->parentNode instanceof \DOMElement) { if ($up->parentNode instanceof JSLikeHTMLElement) {
$up = $up->parentNode; $up = $up->parentNode;
if (0 === strcasecmp($up->tagName, 'table')) { if (0 === strcasecmp($up->tagName, 'table')) {
@ -1225,6 +1282,7 @@ class Readability implements LoggerAwareInterface
* Now that we have the top candidate, look through its siblings for content that might also be related. * Now that we have the top candidate, look through its siblings for content that might also be related.
* Things like preambles, content split by ads that we removed, etc. * Things like preambles, content split by ads that we removed, etc.
*/ */
/** @var JSLikeHTMLElement */
$articleContent = $this->dom->createElement('div'); $articleContent = $this->dom->createElement('div');
$articleContent->setAttribute('class', 'readability-content'); $articleContent->setAttribute('class', 'readability-content');
$siblingScoreThreshold = max(10, ((int) $topCandidate->getAttribute('readability')) * 0.2); $siblingScoreThreshold = max(10, ((int) $topCandidate->getAttribute('readability')) * 0.2);
@ -1240,7 +1298,7 @@ class Readability implements LoggerAwareInterface
$siblingNode = $siblingNodes->item($s); $siblingNode = $siblingNodes->item($s);
$siblingNodeName = $siblingNode->nodeName; $siblingNodeName = $siblingNode->nodeName;
$append = false; $append = false;
$this->logger->debug('Looking at sibling node: ' . $siblingNode->getNodePath() . ((\XML_ELEMENT_NODE === $siblingNode->nodeType && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : '')); $this->logger->debug('Looking at sibling node: ' . $siblingNode->getNodePath() . (($siblingNode instanceof JSLikeHTMLElement && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : ''));
if ($siblingNode->isSameNode($topCandidate)) { if ($siblingNode->isSameNode($topCandidate)) {
$append = true; $append = true;
@ -1248,11 +1306,11 @@ class Readability implements LoggerAwareInterface
$contentBonus = 0; $contentBonus = 0;
// Give a bonus if sibling nodes and top candidates have the same classname. // Give a bonus if sibling nodes and top candidates have the same classname.
if (\XML_ELEMENT_NODE === $siblingNode->nodeType && $siblingNode->getAttribute('class') === $topCandidate->getAttribute('class') && '' !== $topCandidate->getAttribute('class')) { if ($siblingNode instanceof JSLikeHTMLElement && $siblingNode->getAttribute('class') === $topCandidate->getAttribute('class') && '' !== $topCandidate->getAttribute('class')) {
$contentBonus += ((int) $topCandidate->getAttribute('readability')) * 0.2; $contentBonus += ((int) $topCandidate->getAttribute('readability')) * 0.2;
} }
if (\XML_ELEMENT_NODE === $siblingNode->nodeType && $siblingNode->hasAttribute('readability') && (((int) $siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold) { if ($siblingNode instanceof JSLikeHTMLElement && $siblingNode->hasAttribute('readability') && (((int) $siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold) {
$append = true; $append = true;
} elseif (0 === strcasecmp($siblingNodeName, 'p')) { } elseif (0 === strcasecmp($siblingNodeName, 'p')) {
$linkDensity = (int) $this->getLinkDensity($siblingNode); $linkDensity = (int) $this->getLinkDensity($siblingNode);
@ -1272,6 +1330,7 @@ class Readability implements LoggerAwareInterface
if (0 !== strcasecmp($siblingNodeName, 'div') && 0 !== strcasecmp($siblingNodeName, 'p')) { if (0 !== strcasecmp($siblingNodeName, 'div') && 0 !== strcasecmp($siblingNodeName, 'p')) {
// We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. // We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident.
$this->logger->debug('Altering siblingNode "' . $siblingNodeName . '" to "div".'); $this->logger->debug('Altering siblingNode "' . $siblingNodeName . '" to "div".');
/** @var JSLikeHTMLElement */
$nodeToAppend = $this->dom->createElement('div'); $nodeToAppend = $this->dom->createElement('div');
try { try {
@ -1329,7 +1388,7 @@ class Readability implements LoggerAwareInterface
return $this->grabArticle($this->body); return $this->grabArticle($this->body);
} }
return false; return null;
} }
return $articleContent; return $articleContent;
@ -1339,7 +1398,7 @@ class Readability implements LoggerAwareInterface
* Get an element weight by attribute. * Get an element weight by attribute.
* Uses regular expressions to tell if this element looks good or bad. * Uses regular expressions to tell if this element looks good or bad.
*/ */
protected function weightAttribute(\DOMElement $element, string $attribute): int protected function weightAttribute(JSLikeHTMLElement $element, string $attribute): int
{ {
if (!$element->hasAttribute($attribute)) { if (!$element->hasAttribute($attribute)) {
return 0; return 0;
@ -1373,11 +1432,33 @@ class Readability implements LoggerAwareInterface
protected function reinitBody(): void
{
    // Nothing to rebuild while the current body still exposes a child node list.
    if (isset($this->body->childNodes)) {
        return;
    }

    // Create a fresh <body> element and refill it from the cached markup.
    // NOTE(review): $this->bodyCache is presumably the body HTML saved earlier — confirm where it is populated.
    /** @var JSLikeHTMLElement */
    $freshBody = $this->dom->createElement('body');
    $this->body = $freshBody;
    $this->body->setInnerHtml($this->bodyCache);
}
/**
 * Updates the content score for the given element using the provided function.
 *
 * The score is stored in the element's `readability` attribute. When the
 * attribute is absent, the previous score is treated as 0 and the attribute
 * is created, instead of failing on the missing attribute node.
 *
 * @param callable(float): float $f receives the previous score and returns the new one
 */
private static function updateContentScore(JSLikeHTMLElement $element, callable $f): void
{
    // getAttributeNode() yields no DOMAttr for a missing attribute, so read the
    // value through hasAttribute()/getAttribute() and write it via setAttribute().
    $prevScore = $element->hasAttribute('readability')
        ? (float) $element->getAttribute('readability')
        : 0.0;

    $element->setAttribute('readability', (string) $f($prevScore));
}
/**
 * Reads the content score stored in the `readability` attribute of the element.
 *
 * @return float the attribute value cast to float, or zero when the attribute is absent
 */
private static function getContentScore(JSLikeHTMLElement $element): float
{
    if (!$element->hasAttribute('readability')) {
        return 0.0;
    }

    return (float) $element->getAttribute('readability');
}
/** /**
* Load HTML in a DOMDocument. * Load HTML in a DOMDocument.
* Apply Pre filters * Apply Pre filters
@ -1445,11 +1526,11 @@ class Readability implements LoggerAwareInterface
$this->dom->registerNodeClass(\DOMElement::class, JSLikeHTMLElement::class); $this->dom->registerNodeClass(\DOMElement::class, JSLikeHTMLElement::class);
} }
private function getAncestors(\DOMElement $node, int $maxDepth = 0): array private function getAncestors(JSLikeHTMLElement $node, int $maxDepth = 0): array
{ {
$ancestors = []; $ancestors = [];
$i = 0; $i = 0;
while ($node->parentNode instanceof \DOMElement) { while ($node->parentNode instanceof JSLikeHTMLElement) {
$ancestors[] = $node->parentNode; $ancestors[] = $node->parentNode;
if (++$i === $maxDepth) { if (++$i === $maxDepth) {
break; break;
@ -1462,7 +1543,7 @@ class Readability implements LoggerAwareInterface
private function isPhrasingContent($node): bool private function isPhrasingContent($node): bool
{ {
return \XML_TEXT_NODE === $node->nodeType return $node instanceof \DOMText
|| \in_array(strtoupper($node->nodeName), $this->phrasingElements, true) || \in_array(strtoupper($node->nodeName), $this->phrasingElements, true)
|| ( || (
\in_array(strtoupper($node->nodeName), ['A', 'DEL', 'INS'], true) \in_array(strtoupper($node->nodeName), ['A', 'DEL', 'INS'], true)
@ -1477,18 +1558,25 @@ class Readability implements LoggerAwareInterface
); );
} }
/**
 * Checks if `$node` has only whitespace and a single element with `$tag` for the tag name.
 * Returns the matched element, or `null` if `$node` contains non-empty text nodes
 * or if it contains no element with given tag or more than 1 element.
 */
private function getSingleTagInsideElement(JSLikeHTMLElement $node, string $tag): ?JSLikeHTMLElement
{
    $childNodes = iterator_to_array($node->childNodes);

    // array_filter() preserves the original keys, so the only element child may sit
    // at any index (e.g. after a leading whitespace text node). Reindex the result
    // so that the element can be reliably read from offset 0.
    $children = array_values(array_filter(
        $childNodes,
        static fn ($childNode) => $childNode instanceof JSLikeHTMLElement
    ));

    // There should be exactly 1 element child with given tag
    if (1 !== \count($children) || $children[0]->nodeName !== $tag) {
        return null;
    }

    // And there should be no text nodes with real content
    foreach ($childNodes as $childNode) {
        if ($childNode instanceof \DOMText && preg_match($this->regexps['hasContent'], $this->getInnerText($childNode))) {
            return null;
        }
    }

    return $children[0];
}
/** /**
@ -1497,7 +1585,7 @@ class Readability implements LoggerAwareInterface
* Tidy must be configured to not clean the input for this function to * Tidy must be configured to not clean the input for this function to
* work as expected, see $this->tidy_config['clean'] * work as expected, see $this->tidy_config['clean']
*/ */
private function isNodeVisible(\DOMElement $node): bool private function isNodeVisible(JSLikeHTMLElement $node): bool
{ {
return !( return !(
$node->hasAttribute('style') $node->hasAttribute('style')

@ -1,19 +1,18 @@
<?php <?php
declare(strict_types=1);
namespace Tests\Readability; namespace Tests\Readability;
use Monolog\Handler\TestHandler; use Monolog\Handler\TestHandler;
use Monolog\Logger; use Monolog\Logger;
use Psr\Log\LoggerInterface; use Psr\Log\LoggerInterface;
use Readability\JSLikeHTMLElement;
use Readability\Readability; use Readability\Readability;
class ReadabilityTest extends \PHPUnit\Framework\TestCase class ReadabilityTest extends \PHPUnit\Framework\TestCase
{ {
/** @var TestHandler */ public TestHandler $logHandler;
public $logHandler; public LoggerInterface $logger;
/** @var LoggerInterface */
public $logger;
/** /**
* @requires extension tidy * @requires extension tidy
@ -80,8 +79,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$res = $readability->init(); $res = $readability->init();
$this->assertFalse($res); $this->assertFalse($res);
$this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getContent());
$this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getTitle());
$this->assertEmpty($readability->getTitle()->getInnerHtml()); $this->assertEmpty($readability->getTitle()->getInnerHtml());
$this->assertStringContainsString('Sorry, Readability was unable to parse this page for content.', $readability->getContent()->getInnerHtml()); $this->assertStringContainsString('Sorry, Readability was unable to parse this page for content.', $readability->getContent()->getInnerHtml());
} }
@ -92,8 +89,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$res = $readability->init(); $res = $readability->init();
$this->assertTrue($res); $this->assertTrue($res);
$this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getContent());
$this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getTitle());
$this->assertStringContainsString('<div readability=', $readability->getContent()->getInnerHtml()); $this->assertStringContainsString('<div readability=', $readability->getContent()->getInnerHtml());
$this->assertEmpty($readability->getTitle()->getInnerHtml()); $this->assertEmpty($readability->getTitle()->getInnerHtml());
$this->assertStringContainsString('This is the awesome content :)', $readability->getContent()->getInnerHtml()); $this->assertStringContainsString('This is the awesome content :)', $readability->getContent()->getInnerHtml());
@ -105,8 +100,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$res = $readability->init(); $res = $readability->init();
$this->assertTrue($res); $this->assertTrue($res);
$this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getContent());
$this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getTitle());
$this->assertStringContainsString('<div readability=', $readability->getContent()->getInnerHtml()); $this->assertStringContainsString('<div readability=', $readability->getContent()->getInnerHtml());
$this->assertEmpty($readability->getTitle()->getInnerHtml()); $this->assertEmpty($readability->getTitle()->getInnerHtml());
$this->assertStringContainsString('This is the awesome content :)', $readability->getContent()->getInnerHtml()); $this->assertStringContainsString('This is the awesome content :)', $readability->getContent()->getInnerHtml());
@ -115,12 +108,9 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testInitDiv(): void public function testInitDiv(): void
{ {
$readability = $this->getReadability('<div>' . str_repeat('This is the awesome content :)', 7) . '</div>', 'http://0.0.0.0'); $readability = $this->getReadability('<div>' . str_repeat('This is the awesome content :)', 7) . '</div>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init(); $res = $readability->init();
$this->assertTrue($res); $this->assertTrue($res);
$this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getContent());
$this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getTitle());
$this->assertStringContainsString('<div readability=', $readability->getContent()->getInnerHtml()); $this->assertStringContainsString('<div readability=', $readability->getContent()->getInnerHtml());
$this->assertEmpty($readability->getTitle()->getInnerHtml()); $this->assertEmpty($readability->getTitle()->getInnerHtml());
$this->assertStringContainsString('This is the awesome content :)', $readability->getContent()->getInnerHtml()); $this->assertStringContainsString('This is the awesome content :)', $readability->getContent()->getInnerHtml());
@ -129,13 +119,10 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testWithFootnotes(): void public function testWithFootnotes(): void
{ {
$readability = $this->getReadability('<div>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '</div>', 'http://0.0.0.0'); $readability = $this->getReadability('<div>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '</div>', 'http://0.0.0.0');
$readability->debug = true;
$readability->convertLinksToFootnotes = true; $readability->convertLinksToFootnotes = true;
$res = $readability->init(); $res = $readability->init();
$this->assertTrue($res); $this->assertTrue($res);
$this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getContent());
$this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getTitle());
$this->assertStringContainsString('<div readability=', $readability->getContent()->getInnerHtml()); $this->assertStringContainsString('<div readability=', $readability->getContent()->getInnerHtml());
$this->assertEmpty($readability->getTitle()->getInnerHtml()); $this->assertEmpty($readability->getTitle()->getInnerHtml());
$this->assertStringContainsString('This is an awesome text with some links, here there are', $readability->getContent()->getInnerHtml()); $this->assertStringContainsString('This is an awesome text with some links, here there are', $readability->getContent()->getInnerHtml());
@ -146,13 +133,10 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testStandardClean(): void public function testStandardClean(): void
{ {
$readability = $this->getReadability('<div><h2>Title</h2>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<a href="#nofollow" rel="nofollow">will NOT be removed</a></div>', 'http://0.0.0.0'); $readability = $this->getReadability('<div><h2>Title</h2>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<a href="#nofollow" rel="nofollow">will NOT be removed</a></div>', 'http://0.0.0.0');
$readability->debug = true;
$readability->lightClean = false; $readability->lightClean = false;
$res = $readability->init(); $res = $readability->init();
$this->assertTrue($res); $this->assertTrue($res);
$this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getContent());
$this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getTitle());
$this->assertStringContainsString('<div readability=', $readability->getContent()->getInnerHtml()); $this->assertStringContainsString('<div readability=', $readability->getContent()->getInnerHtml());
$this->assertEmpty($readability->getTitle()->getInnerHtml()); $this->assertEmpty($readability->getTitle()->getInnerHtml());
$this->assertStringContainsString('This is an awesome text with some links, here there are', $readability->getContent()->getInnerHtml()); $this->assertStringContainsString('This is an awesome text with some links, here there are', $readability->getContent()->getInnerHtml());
@ -163,12 +147,9 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testWithIframe(): void public function testWithIframe(): void
{ {
$readability = $this->getReadability('<div><h2>Title</h2>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<p>This is an awesome text with some links, here there are <iframe src="http://youtube.com/test" href="#nofollow" rel="nofollow"></iframe><iframe>http://soundcloud.com/test</iframe></p></div>', 'http://0.0.0.0'); $readability = $this->getReadability('<div><h2>Title</h2>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<p>This is an awesome text with some links, here there are <iframe src="http://youtube.com/test" href="#nofollow" rel="nofollow"></iframe><iframe>http://soundcloud.com/test</iframe></p></div>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init(); $res = $readability->init();
$this->assertTrue($res); $this->assertTrue($res);
$this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getContent());
$this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getTitle());
$this->assertStringContainsString('<div readability=', $readability->getContent()->getInnerHtml()); $this->assertStringContainsString('<div readability=', $readability->getContent()->getInnerHtml());
$this->assertEmpty($readability->getTitle()->getInnerHtml()); $this->assertEmpty($readability->getTitle()->getInnerHtml());
$this->assertStringContainsString('This is an awesome text with some links, here there are', $readability->getContent()->getInnerHtml()); $this->assertStringContainsString('This is an awesome text with some links, here there are', $readability->getContent()->getInnerHtml());
@ -178,12 +159,9 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testWithArticle(): void public function testWithArticle(): void
{ {
$readability = $this->getReadability('<article><p>' . str_repeat('This is an awesome text with some links, here there are: the awesome', 20) . '</p><p>This is an awesome text with some links, here there are <iframe src="http://youtube.com/test" href="#nofollow" rel="nofollow"></iframe></p></article>', 'http://0.0.0.0'); $readability = $this->getReadability('<article><p>' . str_repeat('This is an awesome text with some links, here there are: the awesome', 20) . '</p><p>This is an awesome text with some links, here there are <iframe src="http://youtube.com/test" href="#nofollow" rel="nofollow"></iframe></p></article>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init(); $res = $readability->init();
$this->assertTrue($res); $this->assertTrue($res);
$this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getContent());
$this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getTitle());
$this->assertStringContainsString('alt="article"', $readability->getContent()->getInnerHtml()); $this->assertStringContainsString('alt="article"', $readability->getContent()->getInnerHtml());
$this->assertEmpty($readability->getTitle()->getInnerHtml()); $this->assertEmpty($readability->getTitle()->getInnerHtml());
$this->assertStringContainsString('This is an awesome text with some links, here there are', $readability->getContent()->getInnerHtml()); $this->assertStringContainsString('This is an awesome text with some links, here there are', $readability->getContent()->getInnerHtml());
@ -193,12 +171,9 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testWithAside(): void public function testWithAside(): void
{ {
$readability = $this->getReadability('<article>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<footer><aside>' . str_repeat('<p>This is an awesome text with some links, here there are</p>', 8) . '</aside></footer></article>', 'http://0.0.0.0'); $readability = $this->getReadability('<article>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<footer><aside>' . str_repeat('<p>This is an awesome text with some links, here there are</p>', 8) . '</aside></footer></article>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init(); $res = $readability->init();
$this->assertTrue($res); $this->assertTrue($res);
$this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getContent());
$this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getTitle());
$this->assertEmpty($readability->getTitle()->getInnerHtml()); $this->assertEmpty($readability->getTitle()->getInnerHtml());
$this->assertStringContainsString('This is an awesome text with some links, here there are', $readability->getContent()->getInnerHtml()); $this->assertStringContainsString('This is an awesome text with some links, here there are', $readability->getContent()->getInnerHtml());
$this->assertStringNotContainsString('<aside>', $readability->getContent()->getInnerHtml()); $this->assertStringNotContainsString('<aside>', $readability->getContent()->getInnerHtml());
@ -208,12 +183,9 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testWithClasses(): void public function testWithClasses(): void
{ {
$readability = $this->getReadability('<article>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<div style="display:none">' . str_repeat('<p class="clock">This text should be removed</p>', 10) . '</div></article>', 'http://0.0.0.0'); $readability = $this->getReadability('<article>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<div style="display:none">' . str_repeat('<p class="clock">This text should be removed</p>', 10) . '</div></article>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init(); $res = $readability->init();
$this->assertTrue($res); $this->assertTrue($res);
$this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getContent());
$this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getTitle());
$this->assertStringContainsString('alt="article"', $readability->getContent()->getInnerHtml()); $this->assertStringContainsString('alt="article"', $readability->getContent()->getInnerHtml());
$this->assertEmpty($readability->getTitle()->getInnerHtml()); $this->assertEmpty($readability->getTitle()->getInnerHtml());
$this->assertStringContainsString('This is an awesome text with some links, here there are', $readability->getContent()->getInnerHtml()); $this->assertStringContainsString('This is an awesome text with some links, here there are', $readability->getContent()->getInnerHtml());
@ -223,13 +195,10 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testWithClassesWithoutLightClean(): void public function testWithClassesWithoutLightClean(): void
{ {
$readability = $this->getReadability('<article>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<div style="display:none">' . str_repeat('<p class="clock">This text should be removed</p>', 10) . '</div></article>', 'http://0.0.0.0'); $readability = $this->getReadability('<article>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<div style="display:none">' . str_repeat('<p class="clock">This text should be removed</p>', 10) . '</div></article>', 'http://0.0.0.0');
$readability->debug = true;
$readability->lightClean = false; $readability->lightClean = false;
$res = $readability->init(); $res = $readability->init();
$this->assertTrue($res); $this->assertTrue($res);
$this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getContent());
$this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getTitle());
$this->assertStringContainsString('alt="article"', $readability->getContent()->getInnerHtml()); $this->assertStringContainsString('alt="article"', $readability->getContent()->getInnerHtml());
$this->assertEmpty($readability->getTitle()->getInnerHtml()); $this->assertEmpty($readability->getTitle()->getInnerHtml());
$this->assertStringContainsString('This is an awesome text with some links, here there are', $readability->getContent()->getInnerHtml()); $this->assertStringContainsString('This is an awesome text with some links, here there are', $readability->getContent()->getInnerHtml());
@ -239,12 +208,9 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testWithTd(): void public function testWithTd(): void
{ {
$readability = $this->getReadability('<table><tr>' . str_repeat('<td><p>This is an awesome text with some links, here there are the awesome</td>', 7) . '</tr></table>', 'http://0.0.0.0'); $readability = $this->getReadability('<table><tr>' . str_repeat('<td><p>This is an awesome text with some links, here there are the awesome</td>', 7) . '</tr></table>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init(); $res = $readability->init();
$this->assertTrue($res); $this->assertTrue($res);
$this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getContent());
$this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getTitle());
$this->assertEmpty($readability->getTitle()->getInnerHtml()); $this->assertEmpty($readability->getTitle()->getInnerHtml());
$this->assertStringContainsString('This is an awesome text with some links, here there are', $readability->getContent()->getInnerHtml()); $this->assertStringContainsString('This is an awesome text with some links, here there are', $readability->getContent()->getInnerHtml());
} }
@ -252,12 +218,9 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testWithSameClasses(): void public function testWithSameClasses(): void
{ {
$readability = $this->getReadability('<article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<div class="awesomecontent">This text is also an awesome text and you should know that !</div></article>', 'http://0.0.0.0'); $readability = $this->getReadability('<article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<div class="awesomecontent">This text is also an awesome text and you should know that !</div></article>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init(); $res = $readability->init();
$this->assertTrue($res); $this->assertTrue($res);
$this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getContent());
$this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getTitle());
$this->assertEmpty($readability->getTitle()->getInnerHtml()); $this->assertEmpty($readability->getTitle()->getInnerHtml());
$this->assertStringContainsString('This is an awesome text with some links, here there are', $readability->getContent()->getInnerHtml()); $this->assertStringContainsString('This is an awesome text with some links, here there are', $readability->getContent()->getInnerHtml());
$this->assertStringContainsString('This text is also an awesome text and you should know that', $readability->getContent()->getInnerHtml()); $this->assertStringContainsString('This text is also an awesome text and you should know that', $readability->getContent()->getInnerHtml());
@ -266,12 +229,9 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testWithScript(): void public function testWithScript(): void
{ {
$readability = $this->getReadability('<article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p><script>This text is also an awesome text and you should know that !</script></p></article>', 'http://0.0.0.0'); $readability = $this->getReadability('<article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p><script>This text is also an awesome text and you should know that !</script></p></article>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init(); $res = $readability->init();
$this->assertTrue($res); $this->assertTrue($res);
$this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getContent());
$this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getTitle());
$this->assertEmpty($readability->getTitle()->getInnerHtml()); $this->assertEmpty($readability->getTitle()->getInnerHtml());
$this->assertStringContainsString('This is an awesome text with some links, here there are', $readability->getContent()->getInnerHtml()); $this->assertStringContainsString('This is an awesome text with some links, here there are', $readability->getContent()->getInnerHtml());
$this->assertStringNotContainsString('This text is also an awesome text and you should know that', $readability->getContent()->getInnerHtml()); $this->assertStringNotContainsString('This text is also an awesome text and you should know that', $readability->getContent()->getInnerHtml());
@ -280,12 +240,9 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testTitle(): void public function testTitle(): void
{ {
$readability = $this->getReadability('<title>this is my title</title><article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0'); $readability = $this->getReadability('<title>this is my title</title><article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init(); $res = $readability->init();
$this->assertTrue($res); $this->assertTrue($res);
$this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getContent());
$this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getTitle());
$this->assertSame('this is my title', $readability->getTitle()->getInnerHtml()); $this->assertSame('this is my title', $readability->getTitle()->getInnerHtml());
$this->assertStringContainsString('This is an awesome text with some links, here there are', $readability->getContent()->getInnerHtml()); $this->assertStringContainsString('This is an awesome text with some links, here there are', $readability->getContent()->getInnerHtml());
$this->assertStringNotContainsString('This text is also an awesome text and you should know that', $readability->getContent()->getInnerHtml()); $this->assertStringNotContainsString('This text is also an awesome text and you should know that', $readability->getContent()->getInnerHtml());
@ -294,12 +251,9 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testTitleWithDash(): void public function testTitleWithDash(): void
{ {
$readability = $this->getReadability('<title> title2 - title3 </title><article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0'); $readability = $this->getReadability('<title> title2 - title3 </title><article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init(); $res = $readability->init();
$this->assertTrue($res); $this->assertTrue($res);
$this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getContent());
$this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getTitle());
$this->assertSame('title2 - title3', $readability->getTitle()->getInnerHtml()); $this->assertSame('title2 - title3', $readability->getTitle()->getInnerHtml());
$this->assertStringContainsString('This is an awesome text with some links, here there are', $readability->getContent()->getInnerHtml()); $this->assertStringContainsString('This is an awesome text with some links, here there are', $readability->getContent()->getInnerHtml());
$this->assertStringNotContainsString('This text is also an awesome text and you should know that', $readability->getContent()->getInnerHtml()); $this->assertStringNotContainsString('This text is also an awesome text and you should know that', $readability->getContent()->getInnerHtml());
@ -308,12 +262,9 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testTitleWithDoubleDot(): void public function testTitleWithDoubleDot(): void
{ {
$readability = $this->getReadability('<title> title2 : title3 </title><article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0'); $readability = $this->getReadability('<title> title2 : title3 </title><article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init(); $res = $readability->init();
$this->assertTrue($res); $this->assertTrue($res);
$this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getContent());
$this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getTitle());
$this->assertSame('title2 : title3', $readability->getTitle()->getInnerHtml()); $this->assertSame('title2 : title3', $readability->getTitle()->getInnerHtml());
$this->assertStringContainsString('This is an awesome text with some links, here there are', $readability->getContent()->getInnerHtml()); $this->assertStringContainsString('This is an awesome text with some links, here there are', $readability->getContent()->getInnerHtml());
$this->assertStringNotContainsString('This text is also an awesome text and you should know that', $readability->getContent()->getInnerHtml()); $this->assertStringNotContainsString('This text is also an awesome text and you should know that', $readability->getContent()->getInnerHtml());
@ -322,12 +273,9 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testTitleTooShortUseH1(): void public function testTitleTooShortUseH1(): void
{ {
$readability = $this->getReadability('<title>too short</title><h1>this is my h1 title !</h1><article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0'); $readability = $this->getReadability('<title>too short</title><h1>this is my h1 title !</h1><article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init(); $res = $readability->init();
$this->assertTrue($res); $this->assertTrue($res);
$this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getContent());
$this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getTitle());
$this->assertSame('this is my h1 title !', $readability->getTitle()->getInnerHtml()); $this->assertSame('this is my h1 title !', $readability->getTitle()->getInnerHtml());
$this->assertStringContainsString('This is an awesome text with some links, here there are', $readability->getContent()->getInnerHtml()); $this->assertStringContainsString('This is an awesome text with some links, here there are', $readability->getContent()->getInnerHtml());
$this->assertStringNotContainsString('This text is also an awesome text and you should know that', $readability->getContent()->getInnerHtml()); $this->assertStringNotContainsString('This text is also an awesome text and you should know that', $readability->getContent()->getInnerHtml());
@ -338,7 +286,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$oldErrorReporting = error_reporting(\E_ALL | \E_STRICT); $oldErrorReporting = error_reporting(\E_ALL | \E_STRICT);
$oldDisplayErrors = ini_set('display_errors', '1'); $oldDisplayErrors = ini_set('display_errors', '1');
// dummy function to be used to the next test // dummy function to be used to the next test
set_error_handler(function (int $errno, string $errstr, string $errfile, int $errline, array $errcontext) { set_error_handler(function (int $errno, string $errstr, string $errfile, int $errline): bool {
throw new \Exception($errstr, $errno); throw new \Exception($errstr, $errno);
}, \E_ALL | \E_STRICT); }, \E_ALL | \E_STRICT);
@ -369,13 +317,10 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
</html>'; </html>';
$readability = $this->getReadability($data, 'http://iosgames.ru/?p=22030'); $readability = $this->getReadability($data, 'http://iosgames.ru/?p=22030');
$readability->debug = true;
$res = $readability->init(); $res = $readability->init();
$this->assertTrue($res); $this->assertTrue($res);
$this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getContent());
$this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getTitle());
$this->assertStringContainsString('<iframe src="https://www.youtube.com/embed/PUep6xNeKjA" width="560" height="315" frameborder="0" allowfullscreen="allowfullscreen"> </iframe>', $readability->getContent()->getInnerHtml()); $this->assertStringContainsString('<iframe src="https://www.youtube.com/embed/PUep6xNeKjA" width="560" height="315" frameborder="0" allowfullscreen="allowfullscreen"> </iframe>', $readability->getContent()->getInnerHtml());
$this->assertStringContainsString('3D Touch', $readability->getTitle()->getInnerHtml()); $this->assertStringContainsString('3D Touch', $readability->getTitle()->getInnerHtml());
} finally { } finally {
@ -437,13 +382,10 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
</html>'; </html>';
$readability = $this->getReadability($data, 'http://0.0.0.0'); $readability = $this->getReadability($data, 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init(); $res = $readability->init();
$this->assertTrue($res); $this->assertTrue($res);
$this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getContent());
$this->assertInstanceOf(JSLikeHTMLElement::class, $readability->getTitle());
} }
public function testPostFilters(): void public function testPostFilters(): void
@ -474,7 +416,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$html = (string) file_get_contents('tests/fixtures/childNodeGoesNull.html'); $html = (string) file_get_contents('tests/fixtures/childNodeGoesNull.html');
$readability = $this->getReadability($html, 'http://0.0.0.0'); $readability = $this->getReadability($html, 'http://0.0.0.0');
$readability->debug = true;
$readability->convertLinksToFootnotes = true; $readability->convertLinksToFootnotes = true;
$res = $readability->init(); $res = $readability->init();
@ -487,7 +428,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$html = (string) file_get_contents('tests/fixtures/keepFootnotes.html'); $html = (string) file_get_contents('tests/fixtures/keepFootnotes.html');
$readability = $this->getReadability($html, 'http://0.0.0.0'); $readability = $this->getReadability($html, 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init(); $res = $readability->init();
$this->assertTrue($res); $this->assertTrue($res);
@ -501,7 +441,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$html = (string) file_get_contents('tests/fixtures/wipedBody.html'); $html = (string) file_get_contents('tests/fixtures/wipedBody.html');
$readability = $this->getReadability($html, 'http://0.0.0.0', 'libxml', false); $readability = $this->getReadability($html, 'http://0.0.0.0', 'libxml', false);
$readability->debug = true;
$res = $readability->init(); $res = $readability->init();
$this->assertTrue($res); $this->assertTrue($res);
@ -540,7 +479,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testVisibleNode(string $content, bool $shouldBeVisible): void public function testVisibleNode(string $content, bool $shouldBeVisible): void
{ {
$readability = $this->getReadability($content, 'http://0.0.0.0'); $readability = $this->getReadability($content, 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init(); $res = $readability->init();
if ($shouldBeVisible) { if ($shouldBeVisible) {

Loading…
Cancel
Save