All of the regular expressions in use within readability.
*
* Defined up here so we don't instantiate them repeatedly in loops.
*/
public $regexps = [
// Tested in grabArticle() against a node's "class id style" string; a match (without an
// 'okMaybeItsACandidate' match) gets the node removed as an unlikely content candidate.
'unlikelyCandidates' => '/-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i',
// Veto for 'unlikelyCandidates': if this also matches the same string, the node is kept.
'okMaybeItsACandidate' => '/article\b|contain|\bcontent|column|general|detail|shadow|lightbox|blog|body|entry|main|page|footnote|element/i',
// NOTE(review): 'positive' and 'negative' are not referenced in the visible code;
// presumably they feed the class/id weighting in getWeight() - confirm there.
'positive' => '/read|full|article|body|\bcontent|contain|entry|main|markdown|media|page|attach|pagination|post|text|blog|story/i',
'negative' => '/bottom|stat|info|discuss|e[\-]?mail|comment|reply|log.{2}(n|ed)|sign|single|combx|com-|contact|_nav|link|media|promo|\bad-|related|scroll|shoutbox|sidebar|sponsor|shopping|teaser|recommend/i',
// Tested in grabArticle() against a div's inner HTML: when no opening tag from this list
// is found, the div is replaced by a <p>.
'divToPElements' => '/<(?:blockquote|header|section|code|div|article|footer|aside|img|p|pre|dl|ol|ul)/mi',
// NOTE(review): this literal contains a raw line break and matches a newline followed by
// whitespace, not a <br> tag as the key name suggests. It looks like the pattern lost its
// tag/entity parts somewhere - compare against the upstream source before relying on it.
'killBreaks' => '/(
([ \r\n\s]| ?)*)+/',
// Known media hosts; clean() and cleanConditionally() use it to recognise embeds by URL.
'media' => '!//(?:[^\.\?/]+\.)?(?:youtu(?:be)?|giphy|soundcloud|dailymotion|vimeo|pornhub|xvideos|twitvid|rutube|openload\.co|viddler)\.(?:com|be|org|net)/!i',
// Link texts that do not get a footnote: empty text, one or two alphanumerics optionally
// in square brackets, "edit", or "citation needed".
'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i',
// NOTE(review): 'hasContent' and 'isNotVisible' are not referenced in the visible code;
// presumably used by content/visibility helpers elsewhere in the class - confirm.
'hasContent' => '/\S$/',
'isNotVisible' => '/display\s*:\s*none/',
];
/**
* @var array
]*>[ \r\n\s]*){2,}!i' => '
',
// replace noscripts
// '!?noscript>!is' => '',
// replace fonts to spans
'!<(/?)font[^>]*>!is' => '<\\1span>',
];
/**
* @var array ' ]+/>!is' => '',
// remove all attributes on text tags
// '!<(\s*/?\s*(?:blockquote|br|hr|code|div|article|span|footer|aside|p|pre|dl|li|ul|ol)) [^>]+>!is' => "<\\1>",
// single newlines cleanup
"/\n+/" => "\n",
// modern web...
'! Sorry, Readability was unable to parse this page for content. tags, etc.
*/
public function prepArticle(\DOMNode $articleContent): void
{
if (!$articleContent instanceof JSLikeHTMLElement) {
return;
}
$this->logger->debug($this->lightClean ? 'Light clean enabled.' : 'Standard clean enabled.');
$this->clean($articleContent, 'style');
$this->clean($articleContent, 'script');
$this->cleanStyles($articleContent);
$this->killBreaks($articleContent);
$xpath = new \DOMXPath($articleContent->ownerDocument);
if ($this->revertForcedParagraphElements) {
/*
* Reverts P elements with class 'readability-styled' to text nodes:
* which is what they were before.
*/
$elems = $xpath->query('.//p[@data-readability-styled]', $articleContent);
for ($i = $elems->length - 1; $i >= 0; --$i) {
$e = $elems->item($i);
$e->parentNode->replaceChild($articleContent->ownerDocument->createTextNode($e->textContent), $e);
}
}
// Remove service data-candidate attribute.
/** @var \DOMNodeList
\s*]*>\s*
',
'!<[hb]r>!is' => '<\\1 />',
];
/**
 * Create instance of Readability.
 *
 * The constructor only records its arguments and installs a NullLogger;
 * nothing is parsed here.
 *
 * @param string      $html    UTF-8 encoded string
 * @param string|null $url     URL associated with HTML (for footnotes), or null when unknown
 * @param string      $parser  Which parser to use for turning raw HTML into a DOMDocument
 * @param bool        $useTidy Use tidy; forced to false when tidy_parse_string() does not exist
 */
public function __construct(string $html, ?string $url = null, string $parser = 'libxml', bool $useTidy = true)
{
    // Tidy can only be honoured when the extension providing tidy_parse_string() is loaded.
    $tidyAvailable = \function_exists('tidy_parse_string');

    $this->html = $html;
    $this->url = $url;
    $this->parser = $parser;
    $this->useTidy = $useTidy && $tidyAvailable;
    $this->logger = new NullLogger();
}
/**
 * Replace the logger used by this instance (a NullLogger is installed by the constructor).
 */
public function setLogger(LoggerInterface $logger): void
{
$this->logger = $logger;
}
/**
 * Get article title element.
 *
 * @return JSLikeHTMLElement
 *
 * @throws \BadMethodCallException when no title element has been stored yet
 */
public function getTitle()
{
    if (null !== $this->articleTitle) {
        return $this->articleTitle;
    }

    throw new \BadMethodCallException('You need to successfully run Readability::init() before you can get title');
}
/**
 * Get article content element.
 *
 * @return JSLikeHTMLElement
 *
 * @throws \BadMethodCallException when no content element has been stored yet
 */
public function getContent()
{
    if (null !== $this->articleContent) {
        return $this->articleContent;
    }

    throw new \BadMethodCallException('You need to successfully run Readability::init() before you can get content');
}
/**
 * Add pre filter for raw input HTML processing.
 *
 * The filter is stored in $this->pre_filters under its pattern.
 * NOTE(review): where and how the stored filters are applied is not visible here;
 * presumably via preg_replace on the raw HTML before parsing - confirm in the loader.
 *
 * @param string $filter   RegExp for replace (used as the array key)
 * @param string $replacer Replacer; defaults to the empty string
 */
public function addPreFilter(string $filter, string $replacer = ''): void
{
// Keyed by pattern: registering the same pattern again overwrites the earlier replacer.
$this->pre_filters[$filter] = $replacer;
}
/**
 * Add post filter for raw output HTML processing.
 *
 * The filter is stored in $this->post_filters under its pattern.
 * NOTE(review): where and how the stored filters are applied is not visible here;
 * presumably via preg_replace on the generated HTML - confirm at the call site.
 *
 * @param string $filter   RegExp for replace (used as the array key)
 * @param string $replacer Replacer; defaults to the empty string
 */
public function addPostFilter(string $filter, string $replacer = ''): void
{
// Keyed by pattern: registering the same pattern again overwrites the earlier replacer.
$this->post_filters[$filter] = $replacer;
}
/**
* Runs readability.
*
* Workflow:
* 1. Prep the document by removing script tags, css, etc.
* 2. Build readability's DOM tree.
* 3. Grab the article content from the current dom tree.
* 4. Replace the current DOM tree with the new one.
* 5. Read peacefully.
*
* @return bool true if we found content, false otherwise
*/
public function init(): bool
{
$this->loadHtml();
if (!isset($this->dom->documentElement)) {
return false;
}
// Assume successful outcome
$this->success = true;
$bodyElems = $this->dom->getElementsByTagName('body');
// WTF multiple body nodes?
if (null === $this->bodyCache) {
$this->bodyCache = '';
foreach ($bodyElems as $bodyNode) {
$this->bodyCache .= trim($bodyNode->getInnerHTML());
}
}
if ($bodyElems->length > 0 && null === $this->body) {
$this->body = $bodyElems->item(0);
}
$this->prepDocument();
// Build readability's DOM tree.
$overlay = $this->dom->createElement('div');
$innerDiv = $this->dom->createElement('div');
$articleTitle = $this->getArticleTitle();
$articleContent = $this->grabArticle();
if (!$articleContent) {
$this->success = false;
$articleContent = $this->dom->createElement('div');
$articleContent->setAttribute('class', 'readability-content');
$articleContent->setInnerHtml(' '\s*
!is' => 'References
');
$articleFootnotes = $this->dom->createElement('ol');
$articleFootnotes->setAttribute('class', 'readability-footnotes-list');
$footnotesWrapper->appendChild($articleFootnotes);
$articleLinks = $articleContent->getElementsByTagName('a');
$linkCount = 0;
foreach ($articleLinks as $articleLink) {
$footnoteLink = $articleLink->cloneNode(true);
$refLink = $this->dom->createElement('a');
$footnote = $this->dom->createElement('li');
$linkDomain = @parse_url($footnoteLink->getAttribute('href'), \PHP_URL_HOST);
if (!$linkDomain && isset($this->url)) {
$linkDomain = @parse_url($this->url, \PHP_URL_HOST);
}
$linkText = $this->getInnerText($articleLink);
if ((false !== strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote')) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) {
continue;
}
++$linkCount;
// Add a superscript reference after the article link.
$refLink->setAttribute('href', '#readabilityFootnoteLink-' . $linkCount);
$refLink->setInnerHtml('[' . $linkCount . ']');
$refLink->setAttribute('class', 'readability-DoNotFootnote');
$refLink->setAttribute('style', 'color: inherit;');
if ($articleLink->parentNode->lastChild->isSameNode($articleLink)) {
$articleLink->parentNode->appendChild($refLink);
} else {
$articleLink->parentNode->insertBefore($refLink, $articleLink->nextSibling);
}
$articleLink->setAttribute('style', 'color: inherit; text-decoration: none;');
$articleLink->setAttribute('name', 'readabilityLink-' . $linkCount);
$footnote->setInnerHtml('^ ');
$footnoteLink->setInnerHtml('' !== $footnoteLink->getAttribute('title') ? $footnoteLink->getAttribute('title') : $linkText);
$footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount);
$footnote->appendChild($footnoteLink);
if ($linkDomain) {
$footnote->setInnerHtml($footnote->getInnerHTML() . ' (' . $linkDomain . ')');
}
$articleFootnotes->appendChild($footnote);
}
if ($linkCount > 0) {
$articleContent->appendChild($footnotesWrapper);
}
}
/**
* Prepare the article node for display. Clean out any inline styles,
* iframes, forms, strip extraneous
', $html);
$node->setInnerHtml($html);
}
/**
 * Remove every descendant of $e whose tag name is $tag.
 *
 * For embed-style tags (audio, video, iframe, object, embed) an element is
 * spared when its src/href attributes, or failing that its inner HTML,
 * match the 'media' pattern of known hosts.
 */
public function clean(\DOMElement $e, string $tag): void
{
    $matches = $e->getElementsByTagName($tag);
    $mayBeMedia = \in_array($tag, ['audio', 'video', 'iframe', 'object', 'embed'], true);

    // Visit the list from its end so that removing an entry never shifts the ones still to visit.
    $index = $matches->length;
    while (--$index >= 0) {
        $element = $matches->item($index);

        if ($mayBeMedia) {
            $sources = $element->getAttribute('src') . ' ' . $element->getAttribute('href');

            // Attributes are checked first; the inner HTML is only inspected when they do not match.
            $isKnownMedia = preg_match($this->regexps['media'], $sources)
                || preg_match($this->regexps['media'], $element->getInnerHTML());

            if ($isKnownMedia) {
                continue;
            }
        }

        $element->parentNode->removeChild($element);
    }
}
/**
 * Clean an element of all tags of type "tag" if they look fishy.
 * "Fishy" is an algorithm based on content length, classnames,
 * link density, number of images & embeds, etc.
 *
 * Does nothing unless FLAG_CLEAN_CONDITIONALLY is active. Each matching
 * descendant is removed outright when weight + content score is negative;
 * otherwise, if its text has fewer than MIN_COMMAS_IN_PARAGRAPH commas, it is
 * run through one of two rule chains (light or standard, chosen by
 * $this->lightClean) and removed when the first applicable rule fires.
 *
 * @param \DOMElement $e   Element whose descendants are examined
 * @param string      $tag Tag name of the descendants to examine
 */
public function cleanConditionally(\DOMElement $e, string $tag): void
{
if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
return;
}
$tagsList = $e->getElementsByTagName($tag);
$curTagsLength = $tagsList->length;
/*
* Gather counts for other typical elements embedded within.
* Traverse backwards so we can remove nodes at the same time without affecting the traversal.
*
* TODO: Consider taking into account original contentScore here.
*/
for ($i = $curTagsLength - 1; $i >= 0; --$i) {
$node = $tagsList->item($i);
$weight = $this->getWeight($node);
$contentScore = self::getContentScore($node);
$this->logger->debug('Start conditional cleaning of ' . $node->getNodePath() . ' (class=' . $node->getAttribute('class') . '; id=' . $node->getAttribute('id') . ')' . (($node->hasAttribute('readability')) ? (' with score ' . $node->getAttribute('readability')) : ''));
// XXX Incomplete implementation
// Lists are exempt from several of the rules below.
$isList = \in_array($node->tagName, ['ul', 'ol'], true);
if ($weight + $contentScore < 0) {
$this->logger->debug('Removing...');
$node->parentNode->removeChild($node);
} elseif ($this->getCommaCount($this->getInnerText($node)) < self::MIN_COMMAS_IN_PARAGRAPH) {
/*
* If there are not very many commas, and the number of
* non-paragraph elements is more than paragraphs or other ominous signs, remove the element.
*/
$p = $node->getElementsByTagName('p')->length;
$img = $node->getElementsByTagName('img')->length;
// The -100 offset means the "$li > $p" rules only fire when <li> outnumber <p> by more than 100.
// NOTE(review): presumably inherited from the original algorithm - confirm it is intended.
$li = $node->getElementsByTagName('li')->length - 100;
$input = $node->getElementsByTagName('input')->length;
$a = $node->getElementsByTagName('a')->length;
// Only <embed> and <iframe> elements whose src matches the 'media' pattern are counted.
$embedCount = 0;
$embeds = $node->getElementsByTagName('embed');
foreach ($embeds as $embed) {
if (preg_match($this->regexps['media'], $embed->getAttribute('src'))) {
++$embedCount;
}
}
$embeds = $node->getElementsByTagName('iframe');
foreach ($embeds as $embed) {
if (preg_match($this->regexps['media'], $embed->getAttribute('src'))) {
++$embedCount;
}
}
$linkDensity = $this->getLinkDensity($node, true);
$contentLength = mb_strlen($this->getInnerText($node));
$toRemove = false;
// Two rule chains follow; in each, the first rule that matches decides and the rest are skipped.
if ($this->lightClean) {
if (!$isList && $li > $p) {
// NOTE(review): this log literal spans a line break and reads as if tag names were lost
// from it - compare against the upstream source.
$this->logger->debug(' too many or
');
$toRemove = true;
} elseif ($input > floor($p / 3)) {
$this->logger->debug(' too many elements');
$toRemove = true;
} elseif (!$isList && $contentLength < 6 && (0 === $embedCount && (0 === $img || $img > 2))) {
$this->logger->debug(' content length less than 6 chars, 0 embeds and either 0 images or more than 2 images');
$toRemove = true;
} elseif (!$isList && $weight < 25 && $linkDensity > 0.25) {
$this->logger->debug(' weight is ' . $weight . ' < 25 and link density is ' . \sprintf('%.2f', $linkDensity) . ' > 0.25');
$toRemove = true;
} elseif ($a > 2 && ($weight >= 25 && $linkDensity > 0.5)) {
$this->logger->debug(' more than 2 links and weight is ' . $weight . ' > 25 but link density is ' . \sprintf('%.2f', $linkDensity) . ' > 0.5');
$toRemove = true;
} elseif ($embedCount > 3) {
$this->logger->debug(' more than 3 embeds');
$toRemove = true;
}
} else {
if ($img > $p) {
$this->logger->debug(' more image elements than paragraph elements');
$toRemove = true;
} elseif (!$isList && $li > $p) {
// NOTE(review): this log literal spans line breaks and reads as if tag names were lost
// from it - compare against the upstream source.
$this->logger->debug(' too many
or
');
$toRemove = true;
} elseif ($input > floor($p / 3)) {
$this->logger->debug(' too many elements');
$toRemove = true;
} elseif (!$isList && $contentLength < 10 && (0 === $img || $img > 2)) {
$this->logger->debug(' content length less than 10 chars and 0 images, or more than 2 images');
$toRemove = true;
} elseif (!$isList && $weight < 25 && $linkDensity > 0.2) {
// NOTE(review): the message says "lower than 0" but the condition tests $weight < 25.
$this->logger->debug(' weight is ' . $weight . ' lower than 0 and link density is ' . \sprintf('%.2f', $linkDensity) . ' > 0.2');
$toRemove = true;
} elseif ($weight >= 25 && $linkDensity > 0.5) {
$this->logger->debug(' weight above 25 but link density is ' . \sprintf('%.2f', $linkDensity) . ' > 0.5');
$toRemove = true;
} elseif ((1 === $embedCount && $contentLength < 75) || $embedCount > 1) {
$this->logger->debug(' 1 embed and content length smaller than 75 chars, or more than one embed');
$toRemove = true;
}
}
if ($toRemove) {
$this->logger->debug('Removing...');
$node->parentNode->removeChild($node);
}
}
}
}
/**
 * Drop spurious h1 and h2 headings below $e.
 *
 * A heading is removed when its weight is negative or its link density
 * exceeds 0.33. Only h1 and h2 are examined.
 */
public function cleanHeaders(\DOMElement $e): void
{
    foreach (['h1', 'h2'] as $headingTag) {
        $headings = $e->getElementsByTagName($headingTag);

        // Back-to-front so removals leave the not-yet-visited entries in place.
        $position = $headings->length - 1;
        while ($position >= 0) {
            $heading = $headings->item($position);

            $looksSpurious = $this->getWeight($heading) < 0
                || $this->getLinkDensity($heading) > 0.33;

            if ($looksSpurious) {
                $heading->parentNode->removeChild($heading);
            }

            --$position;
        }
    }
}
/**
 * Tell whether the given flag bit(s) are set in $this->flags.
 *
 * @return bool true when the bitwise AND of the flags and $flag is greater than zero
 */
public function flagIsActive(int $flag): bool
{
    $sharedBits = $this->flags & $flag;

    return $sharedBits > 0;
}
/**
 * Switch the given flag bit(s) on in $this->flags.
 */
public function addFlag(int $flag): void
{
    $this->flags = $this->flags | $flag;
}
/**
 * Switch the given flag bit(s) off in $this->flags.
 */
public function removeFlag(int $flag): void
{
    $this->flags = $this->flags & ~$flag;
}
/**
 * Get the article title as an H1.
 *
 * Starts from the text of the first <title> element and tries to cut off a
 * site-name part: around " | " / " - " separators first, else around ": ".
 * If neither applies and the title is longer than 150 or shorter than 15
 * characters, the text of the document's only <h1> (when there is exactly
 * one) is used. A result of four words or fewer is discarded in favour of
 * the untouched <title> text.
 *
 * @return JSLikeHTMLElement
 */
protected function getArticleTitle()
{
    // A failing <title> lookup is treated as an empty title.
    $origTitle = '';
    try {
        $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0));
    } catch (\Exception $e) {
        $origTitle = '';
    }
    $curTitle = $origTitle;

    $wordCount = static function ($text) {
        return \count(explode(' ', $text));
    };

    // Each separator style has a primary pattern and a fallback used when the primary leaves under 3 words.
    $patterns = null;
    if (preg_match('/ [\|\-] /', $curTitle)) {
        $patterns = ['/(.*)[\|\-] .*/i', '/[^\|\-]*[\|\-](.*)/i'];
    } elseif (false !== strpos($curTitle, ': ')) {
        $patterns = ['/.*:(.*)/i', '/[^:]*[:](.*)/i'];
    }

    if (null !== $patterns) {
        [$primary, $fallback] = $patterns;
        $curTitle = preg_replace($primary, '$1', $origTitle);
        if ($wordCount($curTitle) < 3) {
            $curTitle = preg_replace($fallback, '$1', $origTitle);
        }
    } else {
        $titleLength = mb_strlen($curTitle);
        if ($titleLength > 150 || $titleLength < 15) {
            $hOnes = $this->dom->getElementsByTagName('h1');
            if (1 === $hOnes->length) {
                $curTitle = $this->getInnerText($hOnes->item(0));
            }
        }
    }

    $curTitle = trim($curTitle);

    // Too short to be trusted: fall back to the unmodified <title> text.
    if ($wordCount($curTitle) <= 4) {
        $curTitle = $origTitle;
    }

    $articleTitle = $this->dom->createElement('h1');
    $articleTitle->setInnerHtml($curTitle);

    return $articleTitle;
}
/**
 * Prepare the HTML document for readability to scrape it.
 *
 * Guarantees that $this->body exists (creating and appending a <body> when
 * none was found), tags it with the class "readabilityBody", and removes
 * every <style> and <link> element from the whole document.
 */
protected function prepDocument(): void
{
    /*
     * In some cases a body element can't be found (if the HTML is totally hosed for example)
     * so we create a new body node and append it to the document.
     */
    if (null === $this->body) {
        $this->body = $this->dom->createElement('body');
        $this->dom->documentElement->appendChild($this->body);
    }

    $this->body->setAttribute('class', 'readabilityBody');

    // Strip stylesheet carriers; walk each live list backwards so removals do not disturb it.
    foreach (['style', 'link'] as $tagName) {
        $found = $this->dom->getElementsByTagName($tagName);
        $position = $found->length;
        while (--$position >= 0) {
            $element = $found->item($position);
            $element->parentNode->removeChild($element);
        }
    }
}
/**
 * Initialize a node with the readability object. Also checks the
 * className/id for special names to add to its score.
 *
 * The starting score is a per-tag base value plus getWeight($node); it is
 * stored, as a string, in a "readability" attribute on the node.
 */
protected function initializeNode(\DOMElement $node): void
{
    if (!isset($node->tagName)) {
        return;
    }

    // Base score per (upper-cased) tag name; tags not listed start at 0.
    $baseScores = [
        'ARTICLE' => 20, // 15 of its own plus the 5 every DIV receives
        'DIV' => 5,
        'PRE' => 3,
        'CODE' => 3,
        'TD' => 3,
        'BLOCKQUOTE' => 3,
        'FIGURE' => 3,
        'SECTION' => 0, // often misused, so no bonus
        'OL' => -3,
        'UL' => -3,
        'DL' => -3,
        'DD' => -3,
        'DT' => -3,
        'LI' => -3,
        'ASIDE' => -3,
        'FOOTER' => -3,
        'HEADER' => -3,
        'ADDRESS' => -3,
        'FORM' => -3,
        'BUTTON' => -3,
        'TEXTAREA' => -3,
        'INPUT' => -3,
        'NAV' => -3,
        'H1' => -5,
        'H2' => -5,
        'H3' => -5,
        'H4' => -5,
        'H5' => -5,
        'H6' => -5,
        'TH' => -5,
        'HGROUP' => -5,
    ];

    // using strtoupper just in case
    $tagKey = strtoupper($node->tagName);
    $contentScore = $baseScores[$tagKey] ?? 0;
    $contentScore += $this->getWeight($node);

    $readability = $this->dom->createAttribute('readability');
    $readability->value = (string) $contentScore;
    $node->setAttributeNode($readability);
}
/**
* Using a variety of metrics (content score, classname, element types), find the content that is
* most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
*
* @return JSLikeHTMLElement|false
*/
protected function grabArticle(?JSLikeHTMLElement $page = null)
{
if (!$page) {
$page = $this->dom;
}
$xpath = null;
$nodesToScore = [];
if ($page instanceof \DOMDocument && isset($page->documentElement)) {
$xpath = new \DOMXPath($page);
}
$allElements = $page->getElementsByTagName('*');
for ($nodeIndex = 0; $allElements->item($nodeIndex); ++$nodeIndex) {
$node = $allElements->item($nodeIndex);
$tagName = $node->tagName;
$nodeContent = $node->getInnerHTML();
if (empty($nodeContent)) {
$this->logger->debug('Skipping empty node');
continue;
}
// Remove invisible nodes
if (!$this->isNodeVisible($node)) {
$this->logger->debug('Removing invisible node ' . $node->getNodePath());
$node->parentNode->removeChild($node);
--$nodeIndex;
continue;
}
// Remove unlikely candidates
$unlikelyMatchString = $node->getAttribute('class') . ' ' . $node->getAttribute('id') . ' ' . $node->getAttribute('style');
if (mb_strlen($unlikelyMatchString) > 3 // don't process "empty" strings
&& preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString)
&& !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString)
) {
$this->logger->debug('Removing unlikely candidate (using conf) ' . $node->getNodePath() . ' by "' . $unlikelyMatchString . '"');
$node->parentNode->removeChild($node);
--$nodeIndex;
continue;
}
// Some well known site uses sections as paragraphs.
if (\in_array($tagName, $this->defaultTagsToScore, true)) {
$nodesToScore[] = $node;
}
// Turn divs into P tags where they have been used inappropriately
// (as in, where they contain no other block level elements).
if ('div' === $tagName) {
if (!preg_match($this->regexps['divToPElements'], $nodeContent)) {
$newNode = $this->dom->createElement('p');
try {
$newNode->setInnerHtml($nodeContent);
$node->parentNode->replaceChild($newNode, $node);
--$nodeIndex;
$nodesToScore[] = $newNode;
} catch (\Exception $e) {
$this->logger->error('Could not alter div/article to p, reverting back to div: ' . $e->getMessage());
}
} else {
// Will change these P elements back to text nodes after processing.
$p = null;
// foreach does not handle removeChild very well
// See https://www.php.net/manual/en/domnode.removechild.php#90292
$childs = iterator_to_array($node->childNodes);
foreach ($childs as $childNode) {
// executable tags (parentNode->removeChild($childNode);
continue;
}
if ($childNode instanceof \DOMText && '' === $this->getInnerText($childNode, true, true)) {
/* $this->logger->debug('Remove empty text node'); */
$childNode->parentNode->removeChild($childNode);
continue;
}
if ($this->isPhrasingContent($childNode)) {
if (null !== $p) {
$p->appendChild($childNode);
} elseif ('' !== $this->getInnerText($childNode, true, true)) {
$p = $this->dom->createElement('p');
$p->setAttribute('data-readability-styled', 'true');
$node->replaceChild($p, $childNode);
$p->appendChild($childNode);
}
} elseif (null !== $p) {
while ($p->lastChild && '' === $this->getInnerText($p->lastChild, true, true)) {
$p->removeChild($p->lastChild);
}
$p = null;
}
}
if ($this->hasSingleTagInsideElement($node, 'p') && $this->getLinkDensity($node) < 0.25) {
$newNode = $node->childNodes->item(0);
$node->parentNode->replaceChild($newNode, $node);
$nodesToScore[] = $newNode;
}
}
}
}
/*
* Loop through all paragraphs, and assign a score to them based on how content-y they look.
* Then add their score to their parent node.
*
* A score is determined by things like number of commas, class names, etc.
* Maybe eventually link density.
*/
foreach ($nodesToScore as $nodeToScore) {
$ancestors = $this->getAncestors($nodeToScore, 5);
// No parent node? Move on...
if (0 === \count($ancestors)) {
continue;
}
$innerText = $this->getInnerText($nodeToScore);
// If this paragraph is less than MIN_PARAGRAPH_LENGTH (default:20) characters, don't even count it.
if (mb_strlen($innerText) < self::MIN_PARAGRAPH_LENGTH) {
continue;
}
// Add a point for the paragraph itself as a base.
$contentScore = 1;
// Add points for any commas within this paragraph.
$contentScore += $this->getCommaCount($innerText);
// For every SCORE_CHARS_IN_PARAGRAPH (default:100) characters in this paragraph, add another point. Up to 3 points.
$contentScore += min(floor(mb_strlen($innerText) / self::SCORE_CHARS_IN_PARAGRAPH), 3);
// For every SCORE_WORDS_IN_PARAGRAPH (default:20) words in this paragraph, add another point. Up to 3 points.
// $contentScore += min(floor($this->getWordCount($innerText) / self::SCORE_WORDS_IN_PARAGRAPH), 3);
foreach ($ancestors as $level => $ancestor) {
if (!$ancestor->nodeName || !$ancestor->parentNode) {
return false;
}
if (!$ancestor->hasAttribute('readability')) {
$this->initializeNode($ancestor);
$ancestor->setAttribute('data-candidate', 'true');
}
if (0 === $level) {
$scoreDivider = 1;
} elseif (1 === $level) {
$scoreDivider = 2;
} else {
$scoreDivider = $level * 3;
}
self::updateContentScore($ancestor, fn ($prevScore) => $prevScore + $contentScore / $scoreDivider);
}
}
/*
* Node prepping: trash nodes that look cruddy (like ones with the class name "comment", etc).
* This is faster to do before scoring but safer after.
*/
if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS) && $xpath) {
/** @var \DOMNodeList