php-readability/src/Readability.php

<?php

namespace Readability;

use Masterminds\HTML5;
use Psr\Log\LoggerAwareInterface;
use Psr\Log\LoggerInterface;
use Psr\Log\NullLogger;

class Readability implements LoggerAwareInterface
{
    // flags
    public const FLAG_STRIP_UNLIKELYS = 1;
    public const FLAG_WEIGHT_ATTRIBUTES = 2;
    public const FLAG_CLEAN_CONDITIONALLY = 4;
    public const FLAG_DISABLE_PREFILTER = 8;
    public const FLAG_DISABLE_POSTFILTER = 16;
    // constants
    public const SCORE_CHARS_IN_PARAGRAPH = 100;
    public const SCORE_WORDS_IN_PARAGRAPH = 20;
    public const GRANDPARENT_SCORE_DIVISOR = 2;
    public const MIN_PARAGRAPH_LENGTH = 20;
    public const MIN_COMMAS_IN_PARAGRAPH = 6;
    public const MIN_ARTICLE_LENGTH = 200;
    public const MIN_NODE_LENGTH = 80;
    public const MAX_LINK_DENSITY = 0.25;

    /**
     * @var bool
     */
    public $convertLinksToFootnotes = false;

    /**
     * @var bool
     */
    public $revertForcedParagraphElements = false;

    /**
     * @var ?\DOMElement
     */
    public $articleTitle;

    /**
     * @var ?\DOMElement
     */
    public $articleContent;

    /**
     * @var ?string
     */
    public $original_html;

    /**
     * @var ?\DOMDocument
     */
    public $dom;

    /**
     * @var ?string URL where HTML was retrieved
     */
    public $url = null;

    /**
     * @var bool preserves more content (experimental)
     */
    public $lightClean = true;

    /**
     * @var bool no more used, keept to avoid BC
     */
    public $debug = false;

    /**
     * @var bool
     */
    public $tidied = false;

    /**
     * @var array<string, string> All of the regular expressions in use within readability.
     *
     * Defined up here so we don't instantiate them repeatedly in loops.
     */
    public $regexps = [
        'unlikelyCandidates' => '/-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i',
        'okMaybeItsACandidate' => '/article\b|contain|\bcontent|column|general|detail|shadow|lightbox|blog|body|entry|main|page|footnote|element/i',
        'positive' => '/read|full|article|body|\bcontent|contain|entry|main|markdown|media|page|attach|pagination|post|text|blog|story/i',
        'negative' => '/bottom|stat|info|discuss|e[\-]?mail|comment|reply|log.{2}(n|ed)|sign|single|combx|com-|contact|_nav|link|media|promo|\bad-|related|scroll|shoutbox|sidebar|sponsor|shopping|teaser|recommend/i',
        'divToPElements' => '/<(?:blockquote|header|section|code|div|article|footer|aside|img|p|pre|dl|ol|ul)/mi',
        'killBreaks' => '/(<br\s*\/?>([ \r\n\s]|&nbsp;?)*)+/',
        'media' => '!//(?:[^\.\?/]+\.)?(?:youtu(?:be)?|giphy|soundcloud|dailymotion|vimeo|pornhub|xvideos|twitvid|rutube|openload\.co|viddler)\.(?:com|be|org|net)/!i',
        'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i',
        'hasContent' => '/\S$/',
        'isNotVisible' => '/display\s*:\s*none/',
    ];

    /**
     * @var array<string>
     */
    public $defaultTagsToScore = ['section', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'td', 'pre'];

    /**
     * @var array<string>
     */
    public $phrasingElements = [
        // The commented out elements qualify as phrasing content but tend to be
        // removed by readability when put into paragraphs, so we ignore them here.
        // "CANVAS", "IFRAME", "SVG", "VIDEO",
        'ABBR', 'AUDIO', 'B', 'BDO', 'BR', 'BUTTON', 'CITE', 'CODE', 'DATA',
        'DATALIST', 'DFN', 'EM', 'EMBED', 'I', 'IMG', 'INPUT', 'KBD', 'LABEL',
        'MARK', 'MATH', 'METER', 'NOSCRIPT', 'OBJECT', 'OUTPUT', 'PROGRESS', 'Q',
        'RUBY', 'SAMP', 'SCRIPT', 'SELECT', 'SMALL', 'SPAN', 'STRONG', 'SUB',
        'SUP', 'TEXTAREA', 'TIME', 'VAR', 'WBR',
    ];

    /**
     * @var array<string, bool|string>
     */
    public $tidy_config = [
        'tidy-mark' => false,
        'vertical-space' => false,
        'doctype' => 'omit',
        'numeric-entities' => false,
        // 'preserve-entities' => true,
        'break-before-br' => false,
        'clean' => false,
        'output-xhtml' => true,
        'logical-emphasis' => true,
        'show-body-only' => false,
        'new-blocklevel-tags' => 'article aside audio bdi canvas details dialog figcaption figure footer header hgroup main menu menuitem nav section source summary template track video',
        'new-empty-tags' => 'command embed keygen source track wbr',
        'new-inline-tags' => 'audio command datalist embed keygen mark menuitem meter output progress source time video wbr',
        'wrap' => 0,
        'drop-empty-paras' => true,
        'drop-proprietary-attributes' => false,
        'enclose-text' => true,
        'merge-divs' => true,
        // 'merge-spans' => true,
        'input-encoding' => '????',
        'output-encoding' => 'utf8',
        'hide-comments' => true,
    ];

    /**
     * @var ?string article domain regexp for calibration
     */
    protected $domainRegExp = null;

    /**
     * @var ?\DOMElement
     */
    protected $body = null;

    /**
     * @var ?string Cache the body HTML in case we need to re-use it later
     */
    protected $bodyCache = null;

    /**
     * @var int-mask-of<self::FLAG_*> start with all processing flags set
     */
    protected $flags = self::FLAG_STRIP_UNLIKELYS | self::FLAG_WEIGHT_ATTRIBUTES | self::FLAG_CLEAN_CONDITIONALLY;

    /**
     * @var bool indicates whether we were able to extract or not
     */
    protected $success = false;

    /**
     * @var LoggerInterface
     */
    protected $logger;

    /**
     * @var string
     */
    protected $parser;

    /**
     * @var string
     */
    protected $html;

    /**
     * @var bool
     */
    protected $useTidy;

    /**
     * @var array<string, string> raw HTML filters
     */
    protected $pre_filters = [
        // remove spans as we redefine styles and they're probably special-styled
        '!</?span[^>]*>!is' => '',
        // HACK: firewall-filtered content
        '!<font[^>]*>\s*\[AD\]\s*</font>!is' => '',
        // HACK: replace linebreaks plus br's with p's
        '!(<br[^>]*>[ \r\n\s]*){2,}!i' => '</p><p>',
        // replace noscripts
        // '!</?noscript>!is' => '',
        // replace fonts to spans
        '!<(/?)font[^>]*>!is' => '<\\1span>',
    ];

    /**
     * @var array<string, string> output HTML filters
     */
    protected $post_filters = [
        // replace excessive br's
        '/<br\s*\/?>\s*<p/i' => '<p',
        // replace empty tags that break layouts
        '!<(?:a|div|p|figure)[^>]+/>!is' => '',
        // remove all attributes on text tags
        // '!<(\s*/?\s*(?:blockquote|br|hr|code|div|article|span|footer|aside|p|pre|dl|li|ul|ol)) [^>]+>!is' => "<\\1>",
        // single newlines cleanup
        "/\n+/" => "\n",
        // modern web...
        '!<pre[^>]*>\s*<code!is' => '<pre',
        '!</code>\s*</pre>!is' => '</pre>',
        '!<[hb]r>!is' => '<\\1 />',
    ];

    /**
     * Create instance of Readability.
     *
     * @param string $html    UTF-8 encoded string
     * @param string $url     URL associated with HTML (for footnotes)
     * @param string $parser  Which parser to use for turning raw HTML into a DOMDocument
     * @param bool   $useTidy Use tidy
     */
    public function __construct(string $html, ?string $url = null, string $parser = 'libxml', bool $useTidy = true)
    {
        $this->url = $url;
        $this->html = $html;
        $this->parser = $parser;
        $this->useTidy = $useTidy && \function_exists('tidy_parse_string');

        $this->logger = new NullLogger();
    }

    public function setLogger(LoggerInterface $logger): void
    {
        $this->logger = $logger;
    }

    /**
     * Get article title element.
     *
     * @return \DOMElement
     */
    public function getTitle()
    {
        if (null === $this->articleTitle) {
            throw new \BadMethodCallException('You need to successfully run Readability::init() before you can get title');
        }

        return $this->articleTitle;
    }

    /**
     * Get article content element.
     *
     * @return \DOMElement
     */
    public function getContent()
    {
        if (null === $this->articleContent) {
            throw new \BadMethodCallException('You need to successfully run Readability::init() before you can get content');
        }

        return $this->articleContent;
    }

    /**
     * Add pre filter for raw input HTML processing.
     *
     * @param string $filter   RegExp for replace
     * @param string $replacer Replacer
     */
    public function addPreFilter(string $filter, string $replacer = ''): void
    {
        $this->pre_filters[$filter] = $replacer;
    }

    /**
     * Add post filter for raw output HTML processing.
     *
     * @param string $filter   RegExp for replace
     * @param string $replacer Replacer
     */
    public function addPostFilter(string $filter, string $replacer = ''): void
    {
        $this->post_filters[$filter] = $replacer;
    }

    /**
     * Runs readability.
     *
     * Workflow:
     *  1. Prep the document by removing script tags, css, etc.
     *  2. Build readability's DOM tree.
     *  3. Grab the article content from the current dom tree.
     *  4. Replace the current DOM tree with the new one.
     *  5. Read peacefully.
     *
     * @return bool true if we found content, false otherwise
     */
    public function init(): bool
    {
        $this->loadHtml();

        if (!isset($this->dom->documentElement)) {
            return false;
        }

        // Assume successful outcome
        $this->success = true;
        $bodyElems = $this->dom->getElementsByTagName('body');

        // WTF multiple body nodes?
        if (null === $this->bodyCache) {
            $this->bodyCache = '';
            foreach ($bodyElems as $bodyNode) {
                $this->bodyCache .= trim($bodyNode->getInnerHTML());
            }
        }

        if ($bodyElems->length > 0 && null === $this->body) {
            $this->body = $bodyElems->item(0);
        }

        $this->prepDocument();

        // Build readability's DOM tree.
        $overlay = $this->dom->createElement('div');
        $innerDiv = $this->dom->createElement('div');
        $articleTitle = $this->getArticleTitle();
        $articleContent = $this->grabArticle();

        if (!$articleContent) {
            $this->success = false;
            $articleContent = $this->dom->createElement('div');
            $articleContent->setAttribute('class', 'readability-content');
            $articleContent->setInnerHtml('<p>Sorry, Readability was unable to parse this page for content.</p>');
        }

        $overlay->setAttribute('class', 'readOverlay');
        $innerDiv->setAttribute('class', 'readInner');

        // Glue the structure of our document together.
        $innerDiv->appendChild($articleTitle);
        $innerDiv->appendChild($articleContent);
        $overlay->appendChild($innerDiv);

        // without tidy the body can (sometimes) be wiped, so re-create it
        if (false === isset($this->body->childNodes)) {
            $this->body = $this->dom->createElement('body');
        }

        // Clear the old HTML, insert the new content.
        $this->body->setInnerHtml('');
        $this->body->appendChild($overlay);
        $this->body->removeAttribute('style');
        $this->postProcessContent($articleContent);

        // Set title and content instance variables.
        $this->articleTitle = $articleTitle;
        $this->articleContent = $articleContent;

        return $this->success;
    }

    /**
     * Run any post-process modifications to article content as necessary.
     */
    public function postProcessContent(\DOMElement $articleContent): void
    {
        if ($this->convertLinksToFootnotes && !preg_match('/\bwiki/', $this->url)) {
            $this->addFootnotes($articleContent);
        }
    }

    /**
     * For easier reading, convert this document to have footnotes at the bottom rather than inline links.
     *
     * @see http://www.roughtype.com/archives/2010/05/experiments_in.php
     */
    public function addFootnotes(\DOMElement $articleContent): void
    {
        $footnotesWrapper = $this->dom->createElement('footer');
        $footnotesWrapper->setAttribute('class', 'readability-footnotes');
        $footnotesWrapper->setInnerHtml('<h3>References</h3>');
        $articleFootnotes = $this->dom->createElement('ol');
        $articleFootnotes->setAttribute('class', 'readability-footnotes-list');
        $footnotesWrapper->appendChild($articleFootnotes);
        $articleLinks = $articleContent->getElementsByTagName('a');
        $linkCount = 0;

        foreach ($articleLinks as $articleLink) {
            $footnoteLink = $articleLink->cloneNode(true);
            $refLink = $this->dom->createElement('a');
            $footnote = $this->dom->createElement('li');
            $linkDomain = @parse_url($footnoteLink->getAttribute('href'), \PHP_URL_HOST);
            if (!$linkDomain && isset($this->url)) {
                $linkDomain = @parse_url($this->url, \PHP_URL_HOST);
            }

            $linkText = $this->getInnerText($articleLink);
            if ((false !== strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote')) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) {
                continue;
            }

            ++$linkCount;

            // Add a superscript reference after the article link.
            $refLink->setAttribute('href', '#readabilityFootnoteLink-' . $linkCount);
            $refLink->setInnerHtml('<small><sup>[' . $linkCount . ']</sup></small>');
            $refLink->setAttribute('class', 'readability-DoNotFootnote');
            $refLink->setAttribute('style', 'color: inherit;');

            if ($articleLink->parentNode->lastChild->isSameNode($articleLink)) {
                $articleLink->parentNode->appendChild($refLink);
            } else {
                $articleLink->parentNode->insertBefore($refLink, $articleLink->nextSibling);
            }

            $articleLink->setAttribute('style', 'color: inherit; text-decoration: none;');
            $articleLink->setAttribute('name', 'readabilityLink-' . $linkCount);
            $footnote->setInnerHtml('<small><sup><a href="#readabilityLink-' . $linkCount . '" title="Jump to Link in Article">^</a></sup></small> ');
            $footnoteLink->setInnerHtml('' !== $footnoteLink->getAttribute('title') ? $footnoteLink->getAttribute('title') : $linkText);
            $footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount);
            $footnote->appendChild($footnoteLink);

            if ($linkDomain) {
                $footnote->setInnerHtml($footnote->getInnerHTML() . '<small> (' . $linkDomain . ')</small>');
            }
            $articleFootnotes->appendChild($footnote);
        }

        if ($linkCount > 0) {
            $articleContent->appendChild($footnotesWrapper);
        }
    }

    /**
     * Prepare the article node for display. Clean out any inline styles,
     * iframes, forms, strip extraneous <p> tags, etc.
     */
    public function prepArticle(\DOMNode $articleContent): void
    {
        if (!$articleContent instanceof \DOMElement) {
            return;
        }

        $this->logger->debug($this->lightClean ? 'Light clean enabled.' : 'Standard clean enabled.');

        $this->clean($articleContent, 'style');
        $this->clean($articleContent, 'script');

        $this->cleanStyles($articleContent);
        $this->killBreaks($articleContent);

        $xpath = new \DOMXPath($articleContent->ownerDocument);

        if ($this->revertForcedParagraphElements) {
            /*
             * Reverts P elements with class 'readability-styled' to text nodes:
             * which is what they were before.
             */
            $elems = $xpath->query('.//p[@data-readability-styled]', $articleContent);
            for ($i = $elems->length - 1; $i >= 0; --$i) {
                $e = $elems->item($i);
                $e->parentNode->replaceChild($articleContent->ownerDocument->createTextNode($e->textContent), $e);
            }
        }

        // Remove service data-candidate attribute.
        $elems = $xpath->query('.//*[@data-candidate]', $articleContent);
        foreach ($elems as $elem) {
            $elem->removeAttribute('data-candidate');
        }

        // Clean out junk from the article content.
        $this->clean($articleContent, 'input');
        $this->clean($articleContent, 'button');
        $this->clean($articleContent, 'nav');
        $this->clean($articleContent, 'object');
        $this->clean($articleContent, 'iframe');
        $this->clean($articleContent, 'canvas');
        $this->clean($articleContent, 'h1');

        /*
         * If there is only one h2, they are probably using it as a main header, so remove it since we
         *  already have a header.
         */
        $h2s = $articleContent->getElementsByTagName('h2');
        if (1 === $h2s->length && mb_strlen($this->getInnerText($h2s->item(0), true, true)) < 100) {
            $this->clean($articleContent, 'h2');
        }

        $this->cleanHeaders($articleContent);

        // Do these last as the previous stuff may have removed junk that will affect these.
        $this->cleanConditionally($articleContent, 'form');
        $this->cleanConditionally($articleContent, 'table');
        $this->cleanConditionally($articleContent, 'ul');
        $this->cleanConditionally($articleContent, 'div');

        // Remove extra paragraphs.
        $articleParagraphs = $articleContent->getElementsByTagName('p');

        for ($i = $articleParagraphs->length - 1; $i >= 0; --$i) {
            $item = $articleParagraphs->item($i);

            $imgCount = $item->getElementsByTagName('img')->length;
            $embedCount = $item->getElementsByTagName('embed')->length;
            $objectCount = $item->getElementsByTagName('object')->length;
            $videoCount = $item->getElementsByTagName('video')->length;
            $audioCount = $item->getElementsByTagName('audio')->length;
            $iframeCount = $item->getElementsByTagName('iframe')->length;

            if (0 === $iframeCount && 0 === $imgCount && 0 === $embedCount && 0 === $objectCount && 0 === $videoCount && 0 === $audioCount && 0 === mb_strlen(preg_replace('/\s+/is', '', $this->getInnerText($item, false, false)))) {
                $item->parentNode->removeChild($item);
            }

            // add extra text to iframe tag to avoid an auto-closing iframe and then break the html code
            if ($iframeCount) {
                $iframe = $item->getElementsByTagName('iframe');
                $iframe->item(0)->nodeValue = ' ';

                $item->parentNode->replaceChild($iframe->item(0), $item);
            }
        }

        if (!$this->flagIsActive(self::FLAG_DISABLE_POSTFILTER)) {
            try {
                foreach ($this->post_filters as $search => $replace) {
                    $articleContent->setInnerHtml(preg_replace($search, $replace, $articleContent->getInnerHTML()));
                }
                unset($search, $replace);
            } catch (\Exception $e) {
                $this->logger->error('Cleaning output HTML failed. Ignoring: ' . $e->getMessage());
            }
        }
    }

    /**
     * Get the inner text of a node.
     * This also strips out any excess whitespace to be found.
     *
     * @param \DOMElement $e
     * @param bool        $normalizeSpaces (default: true)
     * @param bool        $flattenLines    (default: false)
     */
    public function getInnerText($e, bool $normalizeSpaces = true, bool $flattenLines = false): string
    {
        if (null === $e || !isset($e->textContent) || '' === $e->textContent) {
            return '';
        }

        $textContent = trim($e->textContent);

        if ($flattenLines) {
            return (string) mb_ereg_replace('(?:[\r\n](?:\s|&nbsp;)*)+', '', $textContent);
        }

        if ($normalizeSpaces) {
            return (string) mb_ereg_replace('\s\s+', ' ', $textContent);
        }

        return $textContent;
    }

    /**
     * Remove the style attribute on every $e and under.
     */
    public function cleanStyles(\DOMElement $e): void
    {
        if (\is_object($e)) {
            $elems = $e->getElementsByTagName('*');

            foreach ($elems as $elem) {
                $elem->removeAttribute('style');
            }
        }
    }

    /**
     * Get comma number for a given text.
     */
    public function getCommaCount(string $text): int
    {
        return \count(explode(',', $text));
    }

    /**
     * Get words number for a given text if words separated by a space.
     * Input string should be normalized.
     */
    public function getWordCount(string $text): int
    {
        return substr_count($text, ' ');
    }

    /**
     * Get the density of links as a percentage of the content
     * This is the amount of text that is inside a link divided by the total text in the node.
     * Can exclude external references to differentiate between simple text and menus/infoblocks.
     */
    public function getLinkDensity(\DOMElement $e, bool $excludeExternal = false): float
    {
        $links = $e->getElementsByTagName('a');
        $textLength = mb_strlen($this->getInnerText($e, true, true));
        $linkLength = 0;

        $dRe = $this->domainRegExp;
        foreach ($links as $link) {
            if ($excludeExternal && $dRe && !preg_match($dRe, $link->getAttribute('href'))) {
                continue;
            }
            $linkLength += mb_strlen($this->getInnerText($link));
        }

        if ($textLength > 0 && $linkLength > 0) {
            return $linkLength / $textLength;
        }

        return 0;
    }

    /**
     * Get an element relative weight.
     */
    public function getWeight(\DOMElement $e): int
    {
        if (!$this->flagIsActive(self::FLAG_WEIGHT_ATTRIBUTES)) {
            return 0;
        }

        $weight = 0;
        // Look for a special classname
        $weight += $this->weightAttribute($e, 'class');
        // Look for a special ID
        $weight += $this->weightAttribute($e, 'id');

        return $weight;
    }

    /**
     * Remove extraneous break tags from a node.
     */
    public function killBreaks(\DOMElement $node): void
    {
        $html = $node->getInnerHTML();
        $html = preg_replace($this->regexps['killBreaks'], '<br />', $html);
        $node->setInnerHtml($html);
    }

    /**
     * Clean a node of all elements of type "tag".
     * (Unless it's a youtube/vimeo video. People love movies.).
     *
     * Updated 2012-09-18 to preserve youtube/vimeo iframes
     */
    public function clean(\DOMElement $e, string $tag): void
    {
        $targetList = $e->getElementsByTagName($tag);
        $isEmbed = ('audio' === $tag || 'video' === $tag || 'iframe' === $tag || 'object' === $tag || 'embed' === $tag);

        for ($y = $targetList->length - 1; $y >= 0; --$y) {
            // Allow youtube and vimeo videos through as people usually want to see those.
            $currentItem = $targetList->item($y);

            if ($isEmbed) {
                $attributeValues = $currentItem->getAttribute('src') . ' ' . $currentItem->getAttribute('href');

                // First, check the elements attributes to see if any of them contain known media hosts
                if (preg_match($this->regexps['media'], $attributeValues)) {
                    continue;
                }

                // Then check the elements inside this element for the same.
                if (preg_match($this->regexps['media'], $currentItem->getInnerHTML())) {
                    continue;
                }
            }

            $currentItem->parentNode->removeChild($currentItem);
        }
    }

    /**
     * Clean an element of all tags of type "tag" if they look fishy.
     * "Fishy" is an algorithm based on content length, classnames,
     * link density, number of images & embeds, etc.
     */
    public function cleanConditionally(\DOMElement $e, string $tag): void
    {
        if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
            return;
        }

        $tagsList = $e->getElementsByTagName($tag);
        $curTagsLength = $tagsList->length;

        /*
         * Gather counts for other typical elements embedded within.
         * Traverse backwards so we can remove nodes at the same time without effecting the traversal.
         *
         * TODO: Consider taking into account original contentScore here.
         */
        for ($i = $curTagsLength - 1; $i >= 0; --$i) {
            $node = $tagsList->item($i);
            $weight = $this->getWeight($node);
            $contentScore = self::getContentScore($node);
            $this->logger->debug('Start conditional cleaning of ' . $node->getNodePath() . ' (class=' . $node->getAttribute('class') . '; id=' . $node->getAttribute('id') . ')' . (($node->hasAttribute('readability')) ? (' with score ' . $node->getAttribute('readability')) : ''));

            // XXX Incomplete implementation
            $isList = \in_array($node->tagName, ['ul', 'ol'], true);

            if ($weight + $contentScore < 0) {
                $this->logger->debug('Removing...');
                $node->parentNode->removeChild($node);
            } elseif ($this->getCommaCount($this->getInnerText($node)) < self::MIN_COMMAS_IN_PARAGRAPH) {
                /*
                 * If there are not very many commas, and the number of
                 * non-paragraph elements is more than paragraphs or other ominous signs, remove the element.
                 */
                $p = $node->getElementsByTagName('p')->length;
                $img = $node->getElementsByTagName('img')->length;
                $li = $node->getElementsByTagName('li')->length - 100;
                $input = $node->getElementsByTagName('input')->length;
                $a = $node->getElementsByTagName('a')->length;
                $embedCount = 0;
                $embeds = $node->getElementsByTagName('embed');

                foreach ($embeds as $embed) {
                    if (preg_match($this->regexps['media'], $embed->getAttribute('src'))) {
                        ++$embedCount;
                    }
                }

                $embeds = $node->getElementsByTagName('iframe');
                foreach ($embeds as $embed) {
                    if (preg_match($this->regexps['media'], $embed->getAttribute('src'))) {
                        ++$embedCount;
                    }
                }

                $linkDensity = $this->getLinkDensity($node, true);
                $contentLength = mb_strlen($this->getInnerText($node));
                $toRemove = false;

                if ($this->lightClean) {
                    if (!$isList && $li > $p) {
                        $this->logger->debug(' too many <li> elements, and parent is not <ul> or <ol>');
                        $toRemove = true;
                    } elseif ($input > floor($p / 3)) {
                        $this->logger->debug(' too many <input> elements');
                        $toRemove = true;
                    } elseif (!$isList && $contentLength < 6 && (0 === $embedCount && (0 === $img || $img > 2))) {
                        $this->logger->debug(' content length less than 6 chars, 0 embeds and either 0 images or more than 2 images');
                        $toRemove = true;
                    } elseif (!$isList && $weight < 25 && $linkDensity > 0.25) {
                        $this->logger->debug(' weight is ' . $weight . ' < 25 and link density is ' . \sprintf('%.2f', $linkDensity) . ' > 0.25');
                        $toRemove = true;
                    } elseif ($a > 2 && ($weight >= 25 && $linkDensity > 0.5)) {
                        $this->logger->debug('  more than 2 links and weight is ' . $weight . ' > 25 but link density is ' . \sprintf('%.2f', $linkDensity) . ' > 0.5');
                        $toRemove = true;
                    } elseif ($embedCount > 3) {
                        $this->logger->debug(' more than 3 embeds');
                        $toRemove = true;
                    }
                } else {
                    if ($img > $p) {
                        $this->logger->debug(' more image elements than paragraph elements');
                        $toRemove = true;
                    } elseif (!$isList && $li > $p) {
                        $this->logger->debug('  too many <li> elements, and parent is not <ul> or <ol>');
                        $toRemove = true;
                    } elseif ($input > floor($p / 3)) {
                        $this->logger->debug('  too many <input> elements');
                        $toRemove = true;
                    } elseif (!$isList && $contentLength < 10 && (0 === $img || $img > 2)) {
                        $this->logger->debug('  content length less than 10 chars and 0 images, or more than 2 images');
                        $toRemove = true;
                    } elseif (!$isList && $weight < 25 && $linkDensity > 0.2) {
                        $this->logger->debug('  weight is ' . $weight . ' lower than 0 and link density is ' . \sprintf('%.2f', $linkDensity) . ' > 0.2');
                        $toRemove = true;
                    } elseif ($weight >= 25 && $linkDensity > 0.5) {
                        $this->logger->debug('  weight above 25 but link density is ' . \sprintf('%.2f', $linkDensity) . ' > 0.5');
                        $toRemove = true;
                    } elseif ((1 === $embedCount && $contentLength < 75) || $embedCount > 1) {
                        $this->logger->debug('  1 embed and content length smaller than 75 chars, or more than one embed');
                        $toRemove = true;
                    }
                }

                if ($toRemove) {
                    $this->logger->debug('Removing...');
                    $node->parentNode->removeChild($node);
                }
            }
        }
    }

    /**
     * Clean out spurious headers from an Element. Checks things like classnames and link density.
     */
    public function cleanHeaders(\DOMElement $e): void
    {
        for ($headerIndex = 1; $headerIndex < 3; ++$headerIndex) {
            $headers = $e->getElementsByTagName('h' . $headerIndex);

            for ($i = $headers->length - 1; $i >= 0; --$i) {
                $header = $headers->item($i);
                if ($this->getWeight($header) < 0 || $this->getLinkDensity($header) > 0.33) {
                    $header->parentNode->removeChild($header);
                }
            }
        }
    }

    /**
     * Check if the given flag is active.
     */
    public function flagIsActive(int $flag): bool
    {
        return ($this->flags & $flag) > 0;
    }

    /**
     * Add a flag.
     */
    public function addFlag(int $flag): void
    {
        $this->flags |= $flag;
    }

    /**
     * Remove a flag.
     */
    public function removeFlag(int $flag): void
    {
        $this->flags &= ~$flag;
    }

    /**
     * Get the article title as an H1.
     *
     * @return \DOMElement
     */
    protected function getArticleTitle()
    {
        try {
            $curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0));
        } catch (\Exception $e) {
            $curTitle = '';
            $origTitle = '';
        }

        if (preg_match('/ [\|\-] /', $curTitle)) {
            $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle);
            if (\count(explode(' ', $curTitle)) < 3) {
                $curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle);
            }
        } elseif (false !== strpos($curTitle, ': ')) {
            $curTitle = preg_replace('/.*:(.*)/i', '$1', $origTitle);
            if (\count(explode(' ', $curTitle)) < 3) {
                $curTitle = preg_replace('/[^:]*[:](.*)/i', '$1', $origTitle);
            }
        } elseif (mb_strlen($curTitle) > 150 || mb_strlen($curTitle) < 15) {
            $hOnes = $this->dom->getElementsByTagName('h1');
            if (1 === $hOnes->length) {
                $curTitle = $this->getInnerText($hOnes->item(0));
            }
        }

        $curTitle = trim($curTitle);
        if (\count(explode(' ', $curTitle)) <= 4) {
            $curTitle = $origTitle;
        }

        $articleTitle = $this->dom->createElement('h1');
        $articleTitle->setInnerHtml($curTitle);

        return $articleTitle;
    }

    /**
     * Prepare the HTML document for readability to scrape it.
     * This includes things like stripping javascript, CSS, and handling terrible markup.
     */
    protected function prepDocument(): void
    {
        /*
         * In some cases a body element can't be found (if the HTML is totally hosed for example)
         * so we create a new body node and append it to the document.
         */
        if (null === $this->body) {
            $this->body = $this->dom->createElement('body');
            $this->dom->documentElement->appendChild($this->body);
        }

        $this->body->setAttribute('class', 'readabilityBody');

        // Remove all style tags in head.
        $styleTags = $this->dom->getElementsByTagName('style');
        for ($i = $styleTags->length - 1; $i >= 0; --$i) {
            $styleTag = $styleTags->item($i);
            $styleTag->parentNode->removeChild($styleTag);
        }

        $linkTags = $this->dom->getElementsByTagName('link');
        for ($i = $linkTags->length - 1; $i >= 0; --$i) {
            $linkTag = $linkTags->item($i);
            $linkTag->parentNode->removeChild($linkTag);
        }
    }

    /**
     * Initialize a node with the readability object. Also checks the
     * className/id for special names to add to its score.
     */
    protected function initializeNode(\DOMElement $node): void
    {
        if (!isset($node->tagName)) {
            return;
        }

        $contentScore = 0;

        // using strtoupper just in case
        switch (strtoupper($node->tagName)) {
            case 'ARTICLE':
                $contentScore += 15;
                // no break
            case 'DIV':
                $contentScore += 5;
                break;
            case 'PRE':
            case 'CODE':
            case 'TD':
            case 'BLOCKQUOTE':
            case 'FIGURE':
                $contentScore += 3;
                break;
            case 'SECTION':
                // often misused
                // $contentScore += 2;
                break;
            case 'OL':
            case 'UL':
            case 'DL':
            case 'DD':
            case 'DT':
            case 'LI':
                $contentScore -= 3;
                break;
            case 'ASIDE':
            case 'FOOTER':
            case 'HEADER':
            case 'ADDRESS':
            case 'FORM':
            case 'BUTTON':
            case 'TEXTAREA':
            case 'INPUT':
            case 'NAV':
                $contentScore -= 3;
                break;
            case 'H1':
            case 'H2':
            case 'H3':
            case 'H4':
            case 'H5':
            case 'H6':
            case 'TH':
            case 'HGROUP':
                $contentScore -= 5;
                break;
        }

        $contentScore += $this->getWeight($node);

        $readability = $this->dom->createAttribute('readability');
        $readability->value = (string) $contentScore;
        $node->setAttributeNode($readability);
    }

    /**
     * Using a variety of metrics (content score, classname, element types), find the content that is
     * most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
     *
     * @return \DOMElement|false
     */
    protected function grabArticle(?\DOMElement $page = null)
    {
        if (!$page) {
            $page = $this->dom;
        }

        $xpath = null;
        $nodesToScore = [];

        if ($page instanceof \DOMDocument && isset($page->documentElement)) {
            $xpath = new \DOMXPath($page);
        }

        $allElements = $page->getElementsByTagName('*');

        for ($nodeIndex = 0; $allElements->item($nodeIndex); ++$nodeIndex) {
            $node = $allElements->item($nodeIndex);
            $tagName = $node->tagName;

            $nodeContent = $node->getInnerHTML();
            if (empty($nodeContent)) {
                $this->logger->debug('Skipping empty node');
                continue;
            }

            // Remove invisible nodes
            if (!$this->isNodeVisible($node)) {
                $this->logger->debug('Removing invisible node ' . $node->getNodePath());
                $node->parentNode->removeChild($node);
                --$nodeIndex;
                continue;
            }

            // Remove unlikely candidates
            $unlikelyMatchString = $node->getAttribute('class') . ' ' . $node->getAttribute('id') . ' ' . $node->getAttribute('style');

            if (mb_strlen($unlikelyMatchString) > 3 // don't process "empty" strings
                && preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString)
                && !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString)
            ) {
                $this->logger->debug('Removing unlikely candidate (using conf) ' . $node->getNodePath() . ' by "' . $unlikelyMatchString . '"');
                $node->parentNode->removeChild($node);
                --$nodeIndex;
                continue;
            }

            // Some well known site uses sections as paragraphs.
            if (\in_array($tagName, $this->defaultTagsToScore, true)) {
                $nodesToScore[] = $node;
            }

            // Turn divs into P tags where they have been used inappropriately
            //  (as in, where they contain no other block level elements).
            if ('div' === $tagName) {
                if (!preg_match($this->regexps['divToPElements'], $nodeContent)) {
                    $newNode = $this->dom->createElement('p');

                    try {
                        $newNode->setInnerHtml($nodeContent);

                        $node->parentNode->replaceChild($newNode, $node);
                        --$nodeIndex;
                        $nodesToScore[] = $newNode;
                    } catch (\Exception $e) {
                        $this->logger->error('Could not alter div/article to p, reverting back to div: ' . $e->getMessage());
                    }
                } else {
                    // Will change these P elements back to text nodes after processing.
                    $p = null;
                    // foreach does not handle removeChild very well
                    // See https://www.php.net/manual/en/domnode.removechild.php#90292
                    $childs = iterator_to_array($node->childNodes);
                    foreach ($childs as $childNode) {
                        // executable tags (<?php or <?xml) warning
                        if ($childNode instanceof \DOMProcessingInstruction) {
                            $childNode->parentNode->removeChild($childNode);

                            continue;
                        }

                        if ($childNode instanceof \DOMText && '' === $this->getInnerText($childNode, true, true)) {
                            /* $this->logger->debug('Remove empty text node'); */
                            $childNode->parentNode->removeChild($childNode);

                            continue;
                        }

                        if ($this->isPhrasingContent($childNode)) {
                            if (null !== $p) {
                                $p->appendChild($childNode);
                            } elseif ('' !== $this->getInnerText($childNode, true, true)) {
                                $p = $this->dom->createElement('p');
                                $p->setAttribute('data-readability-styled', 'true');
                                $node->replaceChild($p, $childNode);
                                $p->appendChild($childNode);
                            }
                        } elseif (null !== $p) {
                            while ($p->lastChild && '' === $this->getInnerText($p->lastChild, true, true)) {
                                $p->removeChild($p->lastChild);
                            }
                            $p = null;
                        }
                    }

                    if ($this->hasSingleTagInsideElement($node, 'p') && $this->getLinkDensity($node) < 0.25) {
                        $newNode = $node->childNodes->item(0);
                        $node->parentNode->replaceChild($newNode, $node);
                        $nodesToScore[] = $newNode;
                    }
                }
            }
        }

        /*
         * Loop through all paragraphs, and assign a score to them based on how content-y they look.
         * Then add their score to their parent node.
         *
         * A score is determined by things like number of commas, class names, etc.
         * Maybe eventually link density.
         */
        foreach ($nodesToScore as $nodeToScore) {
            $ancestors = $this->getAncestors($nodeToScore, 5);

            // No parent node? Move on...
            if (0 === \count($ancestors)) {
                continue;
            }

            $innerText = $this->getInnerText($nodeToScore);

            // If this paragraph is less than MIN_PARAGRAPH_LENGTH (default:20) characters, don't even count it.
            if (mb_strlen($innerText) < self::MIN_PARAGRAPH_LENGTH) {
                continue;
            }

            // Add a point for the paragraph itself as a base.
            $contentScore = 1;
            // Add points for any commas within this paragraph.
            $contentScore += $this->getCommaCount($innerText);
            // For every SCORE_CHARS_IN_PARAGRAPH (default:100) characters in this paragraph, add another point. Up to 3 points.
            $contentScore += min(floor(mb_strlen($innerText) / self::SCORE_CHARS_IN_PARAGRAPH), 3);
            // For every SCORE_WORDS_IN_PARAGRAPH (default:20) words in this paragraph, add another point. Up to 3 points.
            // $contentScore += min(floor($this->getWordCount($innerText) / self::SCORE_WORDS_IN_PARAGRAPH), 3);

            foreach ($ancestors as $level => $ancestor) {
                if (!$ancestor->nodeName || !$ancestor->parentNode) {
                    return false;
                }

                if (!$ancestor->hasAttribute('readability')) {
                    $this->initializeNode($ancestor);
                    $ancestor->setAttribute('data-candidate', 'true');
                }

                if (0 === $level) {
                    $scoreDivider = 1;
                } elseif (1 === $level) {
                    $scoreDivider = 2;
                } else {
                    $scoreDivider = $level * 3;
                }

                self::updateContentScore($ancestor, fn ($prevScore) => $prevScore + $contentScore / $scoreDivider);
            }
        }

        /*
         * Node prepping: trash nodes that look cruddy (like ones with the class name "comment", etc).
         * This is faster to do before scoring but safer after.
         */
        if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS) && $xpath) {
            $candidates = $xpath->query('.//*[(self::footer and count(//footer)<2) or (self::aside and count(//aside)<2)]', $page->documentElement);

            for ($c = $candidates->length - 1; $c >= 0; --$c) {
                $node = $candidates->item($c);
                // node should be readable but not inside of an article otherwise it's probably non-readable block
                if ($node->hasAttribute('readability') && (int) $node->getAttributeNode('readability')->value < 40 && ($node->parentNode ? 0 !== strcasecmp($node->parentNode->tagName, 'article') : true)) {
                    $this->logger->debug('Removing unlikely candidate (using note) ' . $node->getNodePath() . ' by "' . $node->tagName . '" with readability ' . self::getContentScore($node));
                    $node->parentNode->removeChild($node);
                }
            }

            unset($candidates);
        }

        /*
         * After we've calculated scores, loop through all of the possible candidate nodes we found
         * and find the one with the highest score.
         */
        $topCandidates = array_fill(0, 5, null);
        if ($xpath) {
            // Using array of DOMElements after deletion is a path to DOOMElement.
            $candidates = $xpath->query('.//*[@data-candidate]', $page->documentElement);
            $this->logger->debug('Candidates: ' . $candidates->length);

            for ($c = $candidates->length - 1; $c >= 0; --$c) {
                $item = $candidates->item($c);

                // Scale the final candidates score based on link density. Good content should have a
                // relatively small link density (5% or less) and be mostly unaffected by this operation.
                // If not for this we would have used XPath to find maximum @readability.
                self::updateContentScore($item, fn ($prevScore) => round($prevScore * (1 - $this->getLinkDensity($item)), 0, \PHP_ROUND_HALF_UP));

                for ($t = 0; $t < 5; ++$t) {
                    $aTopCandidate = $topCandidates[$t];

                    if (!$aTopCandidate || self::getContentScore($item) > self::getContentScore($aTopCandidate)) {
                        $this->logger->debug('Candidate: ' . $item->getNodePath() . ' (' . $item->getAttribute('class') . ':' . $item->getAttribute('id') . ') with score ' . self::getContentScore($item));
                        array_splice($topCandidates, $t, 0, [$item]);
                        if (\count($topCandidates) > 5) {
                            array_pop($topCandidates);
                        }
                        break;
                    }
                }
            }
        }

        $topCandidates = array_filter(
            $topCandidates,
            fn ($v, $idx) => 0 === $idx || null !== $v,
            \ARRAY_FILTER_USE_BOTH
        );
        $topCandidate = $topCandidates[0];

        /*
         * If we still have no top candidate, just use the body as a last resort.
         * We also have to copy the body node so it is something we can modify.
         */
        if (null === $topCandidate || 0 === strcasecmp($topCandidate->tagName, 'body')) {
            $topCandidate = $this->dom->createElement('div');

            if ($page instanceof \DOMDocument) {
                if (!isset($page->documentElement)) {
                    // we don't have a body either? what a mess! :)
                    $this->logger->debug('The page has no body!');
                } else {
                    $this->logger->debug('Setting body to a raw HTML of original page!');
                    $topCandidate->setInnerHtml($page->documentElement->getInnerHTML());
                    $page->documentElement->setInnerHtml('');
                    $this->reinitBody();
                    $page->documentElement->appendChild($topCandidate);
                }
            } else {
                $topCandidate->setInnerHtml($page->getInnerHTML());
                $page->setInnerHtml('');
                $page->appendChild($topCandidate);
            }

            $this->initializeNode($topCandidate);
        } elseif ($topCandidate) {
            $alternativeCandidateAncestors = [];
            foreach ($topCandidates as $candidate) {
                if ((int) $candidate->getAttribute('readability') / (int) $topCandidate->getAttribute('readability') >= 0.75) {
                    $ancestors = $this->getAncestors($candidate);
                    $this->logger->debug('Adding ' . \count($ancestors) . ' alternative ancestors for ' . $candidate->getNodePath());
                    $alternativeCandidateAncestors[] = $ancestors;
                }
            }
            if (\count($alternativeCandidateAncestors) >= 3) {
                $parentOfTopCandidate = $topCandidate->parentNode;
                while ('body' !== $parentOfTopCandidate->nodeName) {
                    $listsContainingThisAncestor = 0;
                    for ($ancestorIndex = 0; $ancestorIndex < \count($alternativeCandidateAncestors) && $listsContainingThisAncestor < 3; ++$ancestorIndex) {
                        $listsContainingThisAncestor += (int) \in_array($parentOfTopCandidate, $alternativeCandidateAncestors[$ancestorIndex], true);
                    }
                    if ($listsContainingThisAncestor >= 3) {
                        $topCandidate = $parentOfTopCandidate;
                        break;
                    }
                    $parentOfTopCandidate = $parentOfTopCandidate->parentNode;
                }
            }
            if (!$topCandidate->hasAttribute('readability')) {
                $this->initializeNode($topCandidate);
            }
            $parentOfTopCandidate = $topCandidate->parentNode;
            $lastScore = (int) $topCandidate->getAttribute('readability');
            $scoreThreshold = $lastScore / 3;
            while ('body' !== $parentOfTopCandidate->nodeName) {
                if (!$parentOfTopCandidate->hasAttribute('readability')) {
                    $parentOfTopCandidate = $parentOfTopCandidate->parentNode;
                    continue;
                }
                $parentScore = (int) $parentOfTopCandidate->getAttribute('readability');
                if ($parentScore < $scoreThreshold) {
                    break;
                }
                if ($parentScore > $lastScore) {
                    $topCandidate = $parentOfTopCandidate;
                    break;
                }
                $lastScore = (int) $parentOfTopCandidate->getAttribute('readability');
                $parentOfTopCandidate = $parentOfTopCandidate->parentNode;
            }
            $parentOfTopCandidate = $topCandidate->parentNode;
            while ('body' !== $parentOfTopCandidate->nodeName && 1 === $parentOfTopCandidate->childNodes->length) {
                $topCandidate = $parentOfTopCandidate;
                $parentOfTopCandidate = $topCandidate->parentNode;
            }
            if (!$topCandidate->hasAttribute('readability')) {
                $this->initializeNode($topCandidate);
            }
        }

        // Set table as the main node if resulted data is table element.
        $tagName = $topCandidate->tagName;
        if (0 === strcasecmp($tagName, 'td') || 0 === strcasecmp($tagName, 'tr')) {
            $up = $topCandidate;

            if ($up->parentNode instanceof \DOMElement) {
                $up = $up->parentNode;

                if (0 === strcasecmp($up->tagName, 'table')) {
                    $topCandidate = $up;
                }
            }
        }

        $this->logger->debug('Top candidate: ' . $topCandidate->getNodePath());

        /*
         * Now that we have the top candidate, look through its siblings for content that might also be related.
         * Things like preambles, content split by ads that we removed, etc.
         */
        $articleContent = $this->dom->createElement('div');
        $articleContent->setAttribute('class', 'readability-content');
        $siblingScoreThreshold = max(10, ((int) $topCandidate->getAttribute('readability')) * 0.2);
        $parentOfTopCandidate = $topCandidate->parentNode;
        $siblingNodes = $parentOfTopCandidate->childNodes;

        for ($s = 0, $sl = $siblingNodes->length; $s < $sl; ++$s) {
            $siblingNode = $siblingNodes->item($s);
            $siblingNodeName = $siblingNode->nodeName;
            $append = false;
            $this->logger->debug('Looking at sibling node: ' . $siblingNode->getNodePath() . ((\XML_ELEMENT_NODE === $siblingNode->nodeType && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : ''));

            if ($siblingNode->isSameNode($topCandidate)) {
                $append = true;
            } else {
                $contentBonus = 0;

                // Give a bonus if sibling nodes and top candidates have the same classname.
                if (\XML_ELEMENT_NODE === $siblingNode->nodeType && $siblingNode->getAttribute('class') === $topCandidate->getAttribute('class') && '' !== $topCandidate->getAttribute('class')) {
                    $contentBonus += ((int) $topCandidate->getAttribute('readability')) * 0.2;
                }

                if (\XML_ELEMENT_NODE === $siblingNode->nodeType && $siblingNode->hasAttribute('readability') && (((int) $siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold) {
                    $append = true;
                } elseif (0 === strcasecmp($siblingNodeName, 'p')) {
                    $linkDensity = (int) $this->getLinkDensity($siblingNode);
                    $nodeContent = $this->getInnerText($siblingNode, true, true);
                    $nodeLength = mb_strlen($nodeContent);

                    if (($nodeLength > self::MIN_NODE_LENGTH && $linkDensity < self::MAX_LINK_DENSITY)
                        || ($nodeLength < self::MIN_NODE_LENGTH && 0 === $nodeLength && 0 === $linkDensity && preg_match('/\.( |$)/', $nodeContent))) {
                        $append = true;
                    }
                }
            }

            if ($append) {
                $this->logger->debug('Appending node: ' . $siblingNode->getNodePath());

                if (0 !== strcasecmp($siblingNodeName, 'div') && 0 !== strcasecmp($siblingNodeName, 'p')) {
                    // We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident.
                    $this->logger->debug('Altering siblingNode "' . $siblingNodeName . '" to "div".');
                    $nodeToAppend = $this->dom->createElement('div');

                    try {
                        $nodeToAppend->setAttribute('alt', $siblingNodeName);
                        $nodeToAppend->setInnerHtml($siblingNode->getInnerHTML());
                    } catch (\Exception $e) {
                        $this->logger->debug('Could not alter siblingNode "' . $siblingNodeName . '" to "div", reverting to original.');
                        $nodeToAppend = $siblingNode;
                        --$s;
                        --$sl;
                    }
                } else {
                    $nodeToAppend = $siblingNode;
                    --$s;
                    --$sl;
                }

                // To ensure a node does not interfere with readability styles, remove its classnames & ids.
                // Now done via RegExp post_filter.
                // $nodeToAppend->removeAttribute('class');
                // $nodeToAppend->removeAttribute('id');
                // Append sibling and subtract from our list as appending removes a node.
                $articleContent->appendChild($nodeToAppend);
            }
        }

        unset($xpath);

        // So we have all of the content that we need. Now we clean it up for presentation.
        $this->prepArticle($articleContent);

        /*
         * Now that we've gone through the full algorithm, check to see if we got any meaningful content.
         * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
         * likelihood of finding the content, and the sieve approach gives us a higher likelihood of
         * finding the -right- content.
         */
        if (mb_strlen($this->getInnerText($articleContent, false)) < self::MIN_ARTICLE_LENGTH) {
            $this->reinitBody();

            if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) {
                $this->removeFlag(self::FLAG_STRIP_UNLIKELYS);
                $this->logger->debug('...content is shorter than ' . self::MIN_ARTICLE_LENGTH . " letters, trying not to strip unlikely content.\n");

                return $this->grabArticle($this->body);
            } elseif ($this->flagIsActive(self::FLAG_WEIGHT_ATTRIBUTES)) {
                $this->removeFlag(self::FLAG_WEIGHT_ATTRIBUTES);
                $this->logger->debug('...content is shorter than ' . self::MIN_ARTICLE_LENGTH . " letters, trying not to weight attributes.\n");

                return $this->grabArticle($this->body);
            } elseif ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
                $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY);
                $this->logger->debug('...content is shorter than ' . self::MIN_ARTICLE_LENGTH . " letters, trying not to clean at all.\n");

                return $this->grabArticle($this->body);
            }

            return false;
        }

        return $articleContent;
    }

    /**
     * Get an element weight by attribute.
     * Uses regular expressions to tell if this element looks good or bad.
     */
    protected function weightAttribute(\DOMElement $element, string $attribute): int
    {
        if (!$element->hasAttribute($attribute)) {
            return 0;
        }
        $weight = 0;

        // $attributeValue = trim($element->getAttribute('class')." ".$element->getAttribute('id'));
        $attributeValue = trim($element->getAttribute($attribute));

        if ('' !== $attributeValue) {
            if (preg_match($this->regexps['negative'], $attributeValue)) {
                $weight -= 25;
            }
            if (preg_match($this->regexps['positive'], $attributeValue)) {
                $weight += 25;
            }
            if (preg_match($this->regexps['unlikelyCandidates'], $attributeValue)) {
                $weight -= 5;
            }
            if (preg_match($this->regexps['okMaybeItsACandidate'], $attributeValue)) {
                $weight += 5;
            }
        }

        return $weight;
    }

    /**
     * Will recreate previously deleted body property.
     */
    protected function reinitBody(): void
    {
        if (!isset($this->body->childNodes)) {
            $this->body = $this->dom->createElement('body');
            $this->body->setInnerHtml($this->bodyCache);
        }
    }

    /**
     * Updates the content score for the given element using the provided function.
     *
     * @param callable(float): float $f
     */
    private static function updateContentScore(\DOMElement $element, callable $f): void
    {
        $readabilityAttr = $element->getAttributeNode('readability');
        $prevScore = (float) $readabilityAttr->value;
        $readabilityAttr->value = (string) $f($prevScore);
    }

    /**
     * Gets the content score for given element.
     */
    private static function getContentScore(\DOMElement $element): float
    {
        return $element->hasAttribute('readability') ? (float) $element->getAttribute('readability') : 0;
    }

    /**
     * Load HTML in a DOMDocument.
     * Apply Pre filters
     * Cleanup HTML using Tidy (or not).
     */
    private function loadHtml(): void
    {
        $this->original_html = $this->html;

        $this->logger->debug('Parsing URL: ' . $this->url);

        if ($this->url) {
            $this->domainRegExp = '/' . strtr((string) preg_replace('/www\d*\./', '', (string) parse_url($this->url, \PHP_URL_HOST)), ['.' => '\.']) . '/';
        }

        mb_internal_encoding('UTF-8');
        mb_http_output('UTF-8');
        mb_regex_encoding('UTF-8');

        // HACK: dirty cleanup to replace some stuff; shouldn't use regexps with HTML but well...
        if (!$this->flagIsActive(self::FLAG_DISABLE_PREFILTER)) {
            foreach ($this->pre_filters as $search => $replace) {
                $this->html = preg_replace($search, $replace, $this->html);
            }
            unset($search, $replace);
        }

        if ('' === trim($this->html)) {
            $this->html = '<html></html>';
        }

        /*
         * Use tidy (if it exists).
         * This fixes problems with some sites which would otherwise trouble DOMDocument's HTML parsing.
         * Although sometimes it makes matters worse, which is why there is an option to disable it.
         */
        if ($this->useTidy) {
            $this->logger->debug('Tidying document');

            $tidy = tidy_repair_string($this->html, $this->tidy_config, 'UTF8');
            if (false !== $tidy && $this->html !== $tidy) {
                $this->tidied = true;
                $this->html = $tidy;
                $this->html = preg_replace('/[\r\n]+/is', "\n", $this->html);
            }
            unset($tidy);
        }

        $this->html = '<meta charset="utf-8">' . (string) $this->html;

        if ('html5lib' === $this->parser || 'html5' === $this->parser) {
            $this->dom = (new HTML5())->loadHTML($this->html);
        }

        if ('libxml' === $this->parser) {
            libxml_use_internal_errors(true);

            $this->dom = new \DOMDocument();
            $this->dom->preserveWhiteSpace = false;
            $this->dom->loadHTML($this->html, \LIBXML_NOBLANKS | \LIBXML_COMPACT | \LIBXML_NOERROR);

            libxml_use_internal_errors(false);
        }

        $this->dom->registerNodeClass(\DOMElement::class, JSLikeHTMLElement::class);
    }

    private function getAncestors(\DOMElement $node, int $maxDepth = 0): array
    {
        $ancestors = [];
        $i = 0;
        while ($node->parentNode instanceof \DOMElement) {
            $ancestors[] = $node->parentNode;
            if (++$i === $maxDepth) {
                break;
            }
            $node = $node->parentNode;
        }

        return $ancestors;
    }

    private function isPhrasingContent($node): bool
    {
        return \XML_TEXT_NODE === $node->nodeType
            || \in_array(strtoupper($node->nodeName), $this->phrasingElements, true)
            || (
                \in_array(strtoupper($node->nodeName), ['A', 'DEL', 'INS'], true)
                && !\in_array(
                    false,
                    array_map(
                        fn ($c) => $this->isPhrasingContent($c),
                        iterator_to_array($node->childNodes)
                    ),
                    true
                )
            );
    }

    /**
     * Checks if `$node` has only whitespace and a single element with `$tag` for the tag name.
     * Returns false if `$node` contains non-empty text nodes
     * or if it contains no element with given tag or more than 1 element.
     */
    private function hasSingleTagInsideElement(\DOMElement $node, string $tag): bool
    {
        $childNodes = iterator_to_array($node->childNodes);
        $children = array_filter($childNodes, fn ($childNode) => $childNode instanceof \DOMElement);

        // There should be exactly 1 element child with given tag
        if (1 !== \count($children) || $children[0]->nodeName !== $tag) {
            return false;
        }

        $a = array_filter(
            $childNodes,
            fn ($childNode) => $childNode instanceof \DOMText && preg_match($this->regexps['hasContent'], $this->getInnerText($childNode))
        );

        return 0 === \count($a);
    }

    /**
     * Return whether a given node is visible or not.
     *
     * Tidy must be configured to not clean the input for this function to
     * work as expected, see $this->tidy_config['clean']
     */
    private function isNodeVisible(\DOMElement $node): bool
    {
        return !(
            $node->hasAttribute('style')
            && preg_match($this->regexps['isNotVisible'], $node->getAttribute('style'))
        )
        && !$node->hasAttribute('hidden');
    }
}