diff --git a/src/Readability.php b/src/Readability.php
index c554cb9..b9d6241 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -46,14 +46,26 @@ class Readability implements LoggerAwareInterface
* Defined up here so we don't instantiate them repeatedly in loops.
*/
public $regexps = [
- 'unlikelyCandidates' => '/display\s*:\s*none|ignore|\binfos?\b|annoy|clock|date|time|author|intro|hidd?e|about|archive|\bprint|bookmark|tags|tag-list|share|search|social|robot|published|combx|comment|mast(?:head)|subscri|community|category|disqus|extra|head|head(?:er|note)|floor|foot(?:er|note)|menu|tool\b|function|nav|remark|rss|shoutbox|widget|meta|banner|sponsor|adsense|inner-?ad|ad-|sponsor|\badv\b|\bads\b|agr?egate?|pager|sidebar|popup|tweet|twitter/i',
- 'okMaybeItsACandidate' => '/article\b|contain|\bcontent|column|general|detail|shadow|lightbox|blog|body|entry|main|page|footnote/i',
+ 'unlikelyCandidates' => '/-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i',
+ 'okMaybeItsACandidate' => '/article\b|contain|\bcontent|column|general|detail|shadow|lightbox|blog|body|entry|main|page|footnote|element/i',
'positive' => '/read|full|article|body|\bcontent|contain|entry|main|markdown|media|page|attach|pagination|post|text|blog|story/i',
'negative' => '/bottom|stat|info|discuss|e[\-]?mail|comment|reply|log.{2}(n|ed)|sign|single|combx|com-|contact|_nav|link|media|promo|\bad-|related|scroll|shoutbox|sidebar|sponsor|shopping|teaser|recommend/i',
'divToPElements' => '/<(?:blockquote|header|section|code|div|article|footer|aside|img|p|pre|dl|ol|ul)/mi',
'killBreaks' => '/(
([ \r\n\s]| ?)*)+/',
'media' => '!//(?:[^\.\?/]+\.)?(?:youtu(?:be)?|giphy|soundcloud|dailymotion|vimeo|pornhub|xvideos|twitvid|rutube|openload\.co|viddler)\.(?:com|be|org|net)/!i',
'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i',
+ 'hasContent' => '/\S$/',
+ ];
+ public $defaultTagsToScore = ['section', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'td', 'pre'];
+ // The commented out elements qualify as phrasing content but tend to be
+ // removed by readability when put into paragraphs, so we ignore them here.
+ public $phrasingElements = [
+ // "CANVAS", "IFRAME", "SVG", "VIDEO",
+ 'ABBR', 'AUDIO', 'B', 'BDO', 'BR', 'BUTTON', 'CITE', 'CODE', 'DATA',
+ 'DATALIST', 'DFN', 'EM', 'EMBED', 'I', 'IMG', 'INPUT', 'KBD', 'LABEL',
+ 'MARK', 'MATH', 'METER', 'NOSCRIPT', 'OBJECT', 'OUTPUT', 'PROGRESS', 'Q',
+ 'RUBY', 'SAMP', 'SCRIPT', 'SELECT', 'SMALL', 'SPAN', 'STRONG', 'SUB',
+ 'SUP', 'TEXTAREA', 'TIME', 'VAR', 'WBR',
];
public $tidy_config = [
'tidy-mark' => false,
@@ -485,7 +497,7 @@ class Readability implements LoggerAwareInterface
*/
public function getCommaCount(string $text): int
{
- return substr_count($text, ',');
+ return \count(explode(',', $text));
}
/**
@@ -609,6 +621,9 @@ class Readability implements LoggerAwareInterface
$contentScore = ($node->hasAttribute('readability')) ? (int) $node->getAttribute('readability') : 0;
$this->logger->debug('Start conditional cleaning of ' . $node->getNodePath() . ' (class=' . $node->getAttribute('class') . '; id=' . $node->getAttribute('id') . ')' . (($node->hasAttribute('readability')) ? (' with score ' . $node->getAttribute('readability')) : ''));
+ // XXX Incomplete implementation
+ $isList = \in_array($node->tagName, ['ul', 'ol'], true);
+
if ($weight + $contentScore < 0) {
$this->logger->debug('Removing...');
$node->parentNode->removeChild($node);
@@ -643,16 +658,16 @@ class Readability implements LoggerAwareInterface
$toRemove = false;
if ($this->lightClean) {
- if ($li > $p && 'ul' !== $tag && 'ol' !== $tag) {
+ if (!$isList && $li > $p) {
$this->logger->debug(' too many
elements, and parent is not or ');
$toRemove = true;
} elseif ($input > floor($p / 3)) {
$this->logger->debug(' too many elements');
$toRemove = true;
- } elseif ($contentLength < 6 && (0 === $embedCount && (0 === $img || $img > 2))) {
+ } elseif (!$isList && $contentLength < 6 && (0 === $embedCount && (0 === $img || $img > 2))) {
$this->logger->debug(' content length less than 6 chars, 0 embeds and either 0 images or more than 2 images');
$toRemove = true;
- } elseif ($weight < 25 && $linkDensity > 0.25) {
+ } elseif (!$isList && $weight < 25 && $linkDensity > 0.25) {
$this->logger->debug(' weight is ' . $weight . ' < 25 and link density is ' . sprintf('%.2f', $linkDensity) . ' > 0.25');
$toRemove = true;
} elseif ($a > 2 && ($weight >= 25 && $linkDensity > 0.5)) {
@@ -666,16 +681,16 @@ class Readability implements LoggerAwareInterface
if ($img > $p) {
$this->logger->debug(' more image elements than paragraph elements');
$toRemove = true;
- } elseif ($li > $p && 'ul' !== $tag && 'ol' !== $tag) {
+ } elseif (!$isList && $li > $p) {
$this->logger->debug(' too many - elements, and parent is not
or ');
$toRemove = true;
} elseif ($input > floor($p / 3)) {
$this->logger->debug(' too many elements');
$toRemove = true;
- } elseif ($contentLength < 10 && (0 === $img || $img > 2)) {
+ } elseif (!$isList && $contentLength < 10 && (0 === $img || $img > 2)) {
$this->logger->debug(' content length less than 10 chars and 0 images, or more than 2 images');
$toRemove = true;
- } elseif ($weight < 25 && $linkDensity > 0.2) {
+ } elseif (!$isList && $weight < 25 && $linkDensity > 0.2) {
$this->logger->debug(' weight is ' . $weight . ' lower than 0 and link density is ' . sprintf('%.2f', $linkDensity) . ' > 0.2');
$toRemove = true;
} elseif ($weight >= 25 && $linkDensity > 0.5) {
@@ -846,7 +861,7 @@ class Readability implements LoggerAwareInterface
case 'DD':
case 'DT':
case 'LI':
- $readability->value -= 2 * round($this->getLinkDensity($node), 0, \PHP_ROUND_HALF_UP);
+ $readability->value -= 3;
break;
case 'ASIDE':
case 'FOOTER':
@@ -907,14 +922,27 @@ class Readability implements LoggerAwareInterface
continue;
}
+ // Remove unlikely candidates
+ $unlikelyMatchString = $node->getAttribute('class') . ' ' . $node->getAttribute('id') . ' ' . $node->getAttribute('style');
+
+ if (mb_strlen($unlikelyMatchString) > 3 && // don't process "empty" strings
+ preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) &&
+ !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString)
+ ) {
+ $this->logger->debug('Removing unlikely candidate (using conf) ' . $node->getNodePath() . ' by "' . $unlikelyMatchString . '"');
+ $node->parentNode->removeChild($node);
+ --$nodeIndex;
+ continue;
+ }
+
// Some well known site uses sections as paragraphs.
- if (0 === strcasecmp($tagName, 'p') || 0 === strcasecmp($tagName, 'td') || 0 === strcasecmp($tagName, 'pre') || 0 === strcasecmp($tagName, 'section')) {
+ if (\in_array($tagName, $this->defaultTagsToScore, true)) {
$nodesToScore[] = $node;
}
// Turn divs into P tags where they have been used inappropriately
// (as in, where they contain no other block level elements).
- if (0 === strcasecmp($tagName, 'div') || 0 === strcasecmp($tagName, 'article') || 0 === strcasecmp($tagName, 'section')) {
+ if ('div' === $tagName) {
if (!preg_match($this->regexps['divToPElements'], $nodeContent)) {
$newNode = $this->dom->createElement('p');
@@ -929,28 +957,47 @@ class Readability implements LoggerAwareInterface
}
} else {
// Will change these P elements back to text nodes after processing.
- for ($i = 0, $il = $node->childNodes->length; $i < $il; ++$i) {
- $childNode = $node->childNodes->item($i);
+ $p = null;
+ // foreach does not handle removeChild very well
+ // See https://www.php.net/manual/en/domnode.removechild.php#90292
+ $childs = iterator_to_array($node->childNodes);
+ foreach ($childs as $childNode) {
+ // executable tags (parentNode->removeChild($childNode);
- // it looks like sometimes the loop is going too far and we are retrieving a non-existant child
- if (null === $childNode) {
continue;
}
- // executable tags (getInnerText($childNode, true, true)) {
+ /* $this->logger->debug('Remove empty text node'); */
$childNode->parentNode->removeChild($childNode);
continue;
}
- if (\XML_TEXT_NODE === $childNode->nodeType) {
- $p = $this->dom->createElement('p');
- $p->setInnerHtml($childNode->nodeValue);
- $p->setAttribute('data-readability-styled', 'true');
- $childNode->parentNode->replaceChild($p, $childNode);
+ if ($this->isPhrasingContent($childNode)) {
+ if (null !== $p) {
+ $p->appendChild($childNode);
+ } elseif ('' !== $this->getInnerText($childNode, true, true)) {
+ $p = $this->dom->createElement('p');
+ $p->setInnerHtml($childNode->nodeValue);
+ $p->setAttribute('data-readability-styled', 'true');
+ $childNode->parentNode->replaceChild($p, $childNode);
+ }
+ } elseif (null !== $p) {
+ while ($p->lastChild && '' === $this->getInnerText($p->lastChild, true, true)) {
+ $p->removeChild($p->lastChild);
+ }
+ $p = null;
}
}
+
+ if ($this->hasSingleTagInsideElement($node, 'p') && $this->getLinkDensity($node) < 0.25) {
+ $newNode = $node->childNodes->item(0);
+ $node->parentNode->replaceChild($newNode, $node);
+ $nodesToScore[] = $newNode;
+ }
}
}
}
@@ -963,14 +1010,13 @@ class Readability implements LoggerAwareInterface
* Maybe eventually link density.
*/
for ($pt = 0, $scored = \count($nodesToScore); $pt < $scored; ++$pt) {
- $parentNode = $nodesToScore[$pt]->parentNode;
+ $ancestors = $this->getAncestors($nodesToScore[$pt], 5);
// No parent node? Move on...
- if (!$parentNode) {
+ if (0 === \count($ancestors)) {
continue;
}
- $grandParentNode = $parentNode->parentNode instanceof \DOMElement ? $parentNode->parentNode : null;
$innerText = $this->getInnerText($nodesToScore[$pt]);
// If this paragraph is less than MIN_PARAGRAPH_LENGTH (default:20) characters, don't even count it.
@@ -978,17 +1024,6 @@ class Readability implements LoggerAwareInterface
continue;
}
- // Initialize readability data for the parent.
- if (!$parentNode->hasAttribute('readability')) {
- $this->initializeNode($parentNode);
- $parentNode->setAttribute('data-candidate', 'true');
- }
-
- // Initialize readability data for the grandparent.
- if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName)) {
- $this->initializeNode($grandParentNode);
- $grandParentNode->setAttribute('data-candidate', 'true');
- }
// Add a point for the paragraph itself as a base.
$contentScore = 1;
// Add points for any commas within this paragraph.
@@ -996,25 +1031,26 @@ class Readability implements LoggerAwareInterface
// For every SCORE_CHARS_IN_PARAGRAPH (default:100) characters in this paragraph, add another point. Up to 3 points.
$contentScore += min(floor(mb_strlen($innerText) / self::SCORE_CHARS_IN_PARAGRAPH), 3);
// For every SCORE_WORDS_IN_PARAGRAPH (default:20) words in this paragraph, add another point. Up to 3 points.
- $contentScore += min(floor($this->getWordCount($innerText) / self::SCORE_WORDS_IN_PARAGRAPH), 3);
- /* TEST: For every positive/negative parent tag, add/substract half point. Up to 3 points. *\/
- $up = $nodesToScore[$pt];
- $score = 0;
- while ($up->parentNode instanceof \DOMElement) {
- $up = $up->parentNode;
- if (preg_match($this->regexps['positive'], $up->getAttribute('class') . ' ' . $up->getAttribute('id'))) {
- $score += 0.5;
- } elseif (preg_match($this->regexps['negative'], $up->getAttribute('class') . ' ' . $up->getAttribute('id'))) {
- $score -= 0.5;
+ //$contentScore += min(floor($this->getWordCount($innerText) / self::SCORE_WORDS_IN_PARAGRAPH), 3);
+
+ foreach ($ancestors as $level => $ancestor) {
+ if (!$ancestor->nodeName || !$ancestor->parentNode) {
+ return;
}
- }
- $score = floor($score);
- $contentScore += max(min($score, 3), -3);/**/
- // Add the score to the parent. The grandparent gets half.
- $parentNode->getAttributeNode('readability')->value = ((float) $parentNode->getAttributeNode('readability')->value) + $contentScore;
- if ($grandParentNode) {
- $grandParentNode->getAttributeNode('readability')->value += round($contentScore / self::GRANDPARENT_SCORE_DIVISOR);
+ if (!$ancestor->hasAttribute('readability')) {
+ $this->initializeNode($ancestor);
+ $ancestor->setAttribute('data-candidate', 'true');
+ }
+
+ if (0 === $level) {
+ $scoreDivider = 1;
+ } elseif (1 === $level) {
+ $scoreDivider = 2;
+ } else {
+ $scoreDivider = $level * 3;
+ }
+ $ancestor->getAttributeNode('readability')->value += $contentScore / $scoreDivider;
}
}
@@ -1038,18 +1074,6 @@ class Readability implements LoggerAwareInterface
for ($c = $candidates->length - 1; $c >= 0; --$c) {
$node = $candidates->item($c);
-
- // Remove unlikely candidates
- $unlikelyMatchString = $node->getAttribute('class') . ' ' . $node->getAttribute('id') . ' ' . $node->getAttribute('style');
-
- if (mb_strlen($unlikelyMatchString) > 3 && // don't process "empty" strings
- preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) &&
- !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString)
- ) {
- $this->logger->debug('Removing unlikely candidate (using conf) ' . $node->getNodePath() . ' by "' . $unlikelyMatchString . '" with readability ' . ($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0));
- $node->parentNode->removeChild($node);
- --$nodeIndex;
- }
}
unset($candidates);
}
@@ -1058,10 +1082,11 @@ class Readability implements LoggerAwareInterface
* After we've calculated scores, loop through all of the possible candidate nodes we found
* and find the one with the highest score.
*/
- $topCandidate = null;
+ $topCandidates = array_fill(0, 5, null);
if ($xpath) {
// Using array of DOMElements after deletion is a path to DOOMElement.
$candidates = $xpath->query('.//*[@data-candidate]', $page->documentElement);
+ $this->logger->debug('Candidates: ' . $candidates->length);
for ($c = $candidates->length - 1; $c >= 0; --$c) {
$item = $candidates->item($c);
@@ -1072,15 +1097,26 @@ class Readability implements LoggerAwareInterface
$readability = $item->getAttributeNode('readability');
$readability->value = round($readability->value * (1 - $this->getLinkDensity($item)), 0, \PHP_ROUND_HALF_UP);
- if (!$topCandidate || $readability->value > (int) $topCandidate->getAttribute('readability')) {
- $this->logger->debug('Candidate: ' . $item->getNodePath() . ' (' . $item->getAttribute('class') . ':' . $item->getAttribute('id') . ') with score ' . $readability->value);
- $topCandidate = $item;
+ for ($t = 0; $t < 5; ++$t) {
+ $aTopCandidate = $topCandidates[$t];
+
+ if (!$aTopCandidate || $readability->value > (int) $aTopCandidate->getAttribute('readability')) {
+ $this->logger->debug('Candidate: ' . $item->getNodePath() . ' (' . $item->getAttribute('class') . ':' . $item->getAttribute('id') . ') with score ' . $readability->value);
+ array_splice($topCandidates, $t, 0, [$item]);
+ if (\count($topCandidates) > 5) {
+ array_pop($topCandidates);
+ }
+ break;
+ }
}
}
-
- unset($candidates);
}
+ $topCandidates = array_filter($topCandidates, function ($v, $idx) {
+ return 0 === $idx || null !== $v;
+ }, \ARRAY_FILTER_USE_BOTH);
+ $topCandidate = $topCandidates[0];
+
/*
* If we still have no top candidate, just use the body as a last resort.
* We also have to copy the body node so it is something we can modify.
@@ -1106,6 +1142,59 @@ class Readability implements LoggerAwareInterface
}
$this->initializeNode($topCandidate);
+ } elseif ($topCandidate) {
+ $alternativeCandidateAncestors = [];
+ foreach ($topCandidates as $candidate) {
+ if ((int) $candidate->getAttribute('readability') / (int) $topCandidate->getAttribute('readability') >= 0.75) {
+ $ancestors = $this->getAncestors($candidate);
+ $this->logger->debug('Adding ' . \count($ancestors) . ' alternative ancestors for ' . $candidate->getNodePath());
+ $alternativeCandidateAncestors[] = $ancestors;
+ }
+ }
+ if (\count($alternativeCandidateAncestors) >= 3) {
+ $parentOfTopCandidate = $topCandidate->parentNode;
+ while ('body' !== $parentOfTopCandidate->nodeName) {
+ $listsContainingThisAncestor = 0;
+ for ($ancestorIndex = 0; $ancestorIndex < \count($alternativeCandidateAncestors) && $listsContainingThisAncestor < 3; ++$ancestorIndex) {
+ $listsContainingThisAncestor += (int) \in_array($parentOfTopCandidate, $alternativeCandidateAncestors[$ancestorIndex], true);
+ }
+ if ($listsContainingThisAncestor >= 3) {
+ $topCandidate = $parentOfTopCandidate;
+ break;
+ }
+ $parentOfTopCandidate = $parentOfTopCandidate->parentNode;
+ }
+ }
+ if (!$topCandidate->hasAttribute('readability')) {
+ $this->initializeNode($topCandidate);
+ }
+ $parentOfTopCandidate = $topCandidate->parentNode;
+ $lastScore = (int) $topCandidate->getAttribute('readability');
+ $scoreThreshold = $lastScore / 3;
+ while ('body' !== $parentOfTopCandidate->nodeName) {
+ if (!$parentOfTopCandidate->hasAttribute('readability')) {
+ $parentOfTopCandidate = $parentOfTopCandidate->parentNode;
+ continue;
+ }
+ $parentScore = (int) $parentOfTopCandidate->getAttribute('readability');
+ if ($parentScore < $scoreThreshold) {
+ break;
+ }
+ if ($parentScore > $lastScore) {
+ $topCandidate = $parentOfTopCandidate;
+ break;
+ }
+ $lastScore = (int) $parentOfTopCandidate->getAttribute('readability');
+ $parentOfTopCandidate = $parentOfTopCandidate->parentNode;
+ }
+ $parentOfTopCandidate = $topCandidate->parentNode;
+ while ('body' !== $parentOfTopCandidate->nodeName && 1 === $parentOfTopCandidate->childNodes->length) {
+ $topCandidate = $parentOfTopCandidate;
+ $parentOfTopCandidate = $topCandidate->parentNode;
+ }
+ if (!$topCandidate->hasAttribute('readability')) {
+ $this->initializeNode($topCandidate);
+ }
}
// Set table as the main node if resulted data is table element.
@@ -1131,7 +1220,8 @@ class Readability implements LoggerAwareInterface
$articleContent = $this->dom->createElement('div');
$articleContent->setAttribute('class', 'readability-content');
$siblingScoreThreshold = max(10, ((int) $topCandidate->getAttribute('readability')) * 0.2);
- $siblingNodes = $topCandidate->parentNode->childNodes;
+ $parentOfTopCandidate = $topCandidate->parentNode;
+ $siblingNodes = $parentOfTopCandidate->childNodes;
if (0 === $siblingNodes->length) {
$siblingNodes = new \stdClass();
@@ -1146,27 +1236,25 @@ class Readability implements LoggerAwareInterface
if ($siblingNode->isSameNode($topCandidate)) {
$append = true;
- }
-
- $contentBonus = 0;
-
- // Give a bonus if sibling nodes and top candidates have the same classname.
- if (\XML_ELEMENT_NODE === $siblingNode->nodeType && $siblingNode->getAttribute('class') === $topCandidate->getAttribute('class') && '' !== $topCandidate->getAttribute('class')) {
- $contentBonus += ((int) $topCandidate->getAttribute('readability')) * 0.2;
- }
-
- if (\XML_ELEMENT_NODE === $siblingNode->nodeType && $siblingNode->hasAttribute('readability') && (((int) $siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold) {
- $append = true;
- }
+ } else {
+ $contentBonus = 0;
- if (0 === strcasecmp($siblingNodeName, 'p')) {
- $linkDensity = $this->getLinkDensity($siblingNode);
- $nodeContent = $this->getInnerText($siblingNode, true, true);
- $nodeLength = mb_strlen($nodeContent);
+ // Give a bonus if sibling nodes and top candidates have the same classname.
+ if (\XML_ELEMENT_NODE === $siblingNode->nodeType && $siblingNode->getAttribute('class') === $topCandidate->getAttribute('class') && '' !== $topCandidate->getAttribute('class')) {
+ $contentBonus += ((int) $topCandidate->getAttribute('readability')) * 0.2;
+ }
- if (($nodeLength > self::MIN_NODE_LENGTH && $linkDensity < self::MAX_LINK_DENSITY)
- || ($nodeLength < self::MIN_NODE_LENGTH && 0 === (int) $linkDensity && preg_match('/\.( |$)/', $nodeContent))) {
+ if (\XML_ELEMENT_NODE === $siblingNode->nodeType && $siblingNode->hasAttribute('readability') && (((int) $siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold) {
$append = true;
+ } elseif (0 === strcasecmp($siblingNodeName, 'p')) {
+ $linkDensity = (int) $this->getLinkDensity($siblingNode);
+ $nodeContent = $this->getInnerText($siblingNode, true, true);
+ $nodeLength = mb_strlen($nodeContent);
+
+ if (($nodeLength > self::MIN_NODE_LENGTH && $linkDensity < self::MAX_LINK_DENSITY)
+ || ($nodeLength < self::MIN_NODE_LENGTH && 0 === $nodeLength && 0 === $linkDensity && preg_match('/\.( |$)/', $nodeContent))) {
+ $append = true;
+ }
}
}
@@ -1348,4 +1436,42 @@ class Readability implements LoggerAwareInterface
$this->dom->registerNodeClass('DOMElement', 'Readability\JSLikeHTMLElement');
}
+
+ private function getAncestors(\DOMElement $node, int $maxDepth = 0): array
+ {
+ $ancestors = [];
+ $i = 0;
+ while ($node->parentNode instanceof \DOMElement) {
+ $ancestors[] = $node->parentNode;
+ if (++$i === $maxDepth) {
+ break;
+ }
+ $node = $node->parentNode;
+ }
+
+ return $ancestors;
+ }
+
+ private function isPhrasingContent($node): bool
+ {
+ return \XML_TEXT_NODE === $node->nodeType
+ || \in_array($node->nodeName, $this->phrasingElements, true)
+ || (\in_array($node->nodeName, ['a', 'del', 'ins'], true) && !\in_array(false, array_map(function ($c) {
+ return $this->isPhrasingContent($c);
+ }, iterator_to_array($node->childNodes)), true));
+ }
+
+ private function hasSingleTagInsideElement(\DOMElement $node, string $tag): bool
+ {
+ if (1 !== $node->childNodes->length || $node->childNodes->item(0)->nodeName !== $tag) {
+ return false;
+ }
+
+ $a = array_filter(iterator_to_array($node->childNodes), function ($childNode) {
+ return $childNode instanceof \DOMText &&
+ preg_match($this->regexps['hasContent'], $this->getInnerText($childNode));
+ });
+
+ return 0 === \count($a);
+ }
}
diff --git a/tests/ReadabilityTest.php b/tests/ReadabilityTest.php
index 5931b43..fb0bed8 100644
--- a/tests/ReadabilityTest.php
+++ b/tests/ReadabilityTest.php
@@ -201,7 +201,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertEmpty($readability->getTitle()->getInnerHtml());
$this->assertStringContainsString('This is an awesome text with some links, here there are', $readability->getContent()->getInnerHtml());
$this->assertStringNotContainsString('