From 2dce2879bfe6f7cb57d5073f58a2c0a34d9b6f27 Mon Sep 17 00:00:00 2001
From: Jeremy Benoist <jeremy.benoist@gmail.com>
Date: Mon, 4 Feb 2019 11:21:31 +0100
Subject: [PATCH] Update fixer rules

Align the php-cs-fixer rules with those used by graby, wallabag, etc., and apply the resulting fixes.
---
 .php_cs                   |   23 +-
 src/JSLikeHTMLElement.php |    6 +-
 src/Readability.php       | 1114 ++++++++++++++++++-------------------
 tests/ReadabilityTest.php |   24 +-
 4 files changed, 585 insertions(+), 582 deletions(-)

diff --git a/.php_cs b/.php_cs
index 8340221..5dc5396 100644
--- a/.php_cs
+++ b/.php_cs
@@ -4,22 +4,27 @@ return PhpCsFixer\Config::create()
     ->setUsingCache(true)
     ->setRiskyAllowed(true)
     ->setRules([
-        'concat_space' => [
-            'spacing' => 'one',
-        ],
+        '@Symfony' => true,
+        '@Symfony:risky' => true,
+        'array_syntax' => ['syntax' => 'short'],
+        'combine_consecutive_unsets' => true,
+        'heredoc_to_nowdoc' => true,
+        'no_extra_consecutive_blank_lines' => ['break', 'continue', 'extra', 'return', 'throw', 'use', 'parenthesis_brace_block', 'square_brace_block', 'curly_brace_block'],
+        'no_unreachable_default_argument_value' => true,
+        'no_useless_else' => true,
+        'no_useless_return' => true,
+        'ordered_class_elements' => true,
         'ordered_imports' => true,
+        'php_unit_strict' => false,
         'phpdoc_order' => true,
+        // 'psr4' => true,
         'strict_comparison' => true,
         'strict_param' => true,
-        'array_syntax' => [
-            'syntax' => 'long',
-        ],
+        'concat_space' => ['spacing' => 'one'],
     ])
     ->setFinder(
         PhpCsFixer\Finder::create()
-            ->exclude([
-                'vendor',
-            ])
+            ->exclude(['vendor'])
             ->in(__DIR__)
     )
 ;
diff --git a/src/JSLikeHTMLElement.php b/src/JSLikeHTMLElement.php
index 15e7281..b908d06 100644
--- a/src/JSLikeHTMLElement.php
+++ b/src/JSLikeHTMLElement.php
@@ -45,7 +45,7 @@ class JSLikeHTMLElement extends \DOMElement
      */
     public function __set($name, $value)
     {
-        if ($name !== 'innerHTML') {
+        if ('innerHTML' !== $name) {
             $trace = debug_backtrace();
             trigger_error('Undefined property via __set(): ' . $name . ' in ' . $trace[0]['file'] . ' on line ' . $trace[0]['line'], E_USER_NOTICE);
 
@@ -109,7 +109,7 @@ class JSLikeHTMLElement extends \DOMElement
      */
     public function __get($name)
     {
-        if ($name === 'innerHTML') {
+        if ('innerHTML' === $name) {
             $inner = '';
 
             foreach ($this->childNodes as $child) {
@@ -121,8 +121,6 @@ class JSLikeHTMLElement extends \DOMElement
 
         $trace = debug_backtrace();
         trigger_error('Undefined property via __get(): ' . $name . ' in ' . $trace[0]['file'] . ' on line ' . $trace[0]['line'], E_USER_NOTICE);
-
-        return;
     }
 
     public function __toString()
diff --git a/src/Readability.php b/src/Readability.php
index 986cce5..8d0aa33 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -52,6 +52,21 @@ use Psr\Log\NullLogger;
  */
 class Readability implements LoggerAwareInterface
 {
+    // flags
+    const FLAG_STRIP_UNLIKELYS = 1;
+    const FLAG_WEIGHT_ATTRIBUTES = 2;
+    const FLAG_CLEAN_CONDITIONALLY = 4;
+    const FLAG_DISABLE_PREFILTER = 8;
+    const FLAG_DISABLE_POSTFILTER = 16;
+    // constants
+    const SCORE_CHARS_IN_PARAGRAPH = 100;
+    const SCORE_WORDS_IN_PARAGRAPH = 20;
+    const GRANDPARENT_SCORE_DIVISOR = 2.2;
+    const MIN_PARAGRAPH_LENGTH = 20;
+    const MIN_COMMAS_IN_PARAGRAPH = 6;
+    const MIN_ARTICLE_LENGTH = 200;
+    const MIN_NODE_LENGTH = 80;
+    const MAX_LINK_DENSITY = 0.25;
     public $convertLinksToFootnotes = false;
     public $revertForcedParagraphElements = true;
     public $articleTitle;
@@ -65,25 +80,12 @@ class Readability implements LoggerAwareInterface
     // no more used, keept to avoid BC
     public $debug = false;
     public $tidied = false;
-    // article domain regexp for calibration
-    protected $domainRegExp = null;
-    protected $body = null;
-    // Cache the body HTML in case we need to re-use it later
-    protected $bodyCache = null;
-    // 1 | 2 | 4;   // Start with all processing flags set.
-    protected $flags = 7;
-    // indicates whether we were able to extract or not
-    protected $success = false;
-    protected $logger;
-    protected $parser;
-    protected $html;
-    protected $useTidy;
 
     /**
      * All of the regular expressions in use within readability.
      * Defined up here so we don't instantiate them repeatedly in loops.
      */
-    public $regexps = array(
+    public $regexps = [
         'unlikelyCandidates' => '/display\s*:\s*none|ignore|\binfos?\b|annoy|clock|date|time|author|intro|hidd?e|about|archive|\bprint|bookmark|tags|tag-list|share|search|social|robot|published|combx|comment|mast(?:head)|subscri|community|category|disqus|extra|head|head(?:er|note)|floor|foot(?:er|note)|menu|tool\b|function|nav|remark|rss|shoutbox|widget|meta|banner|sponsor|adsense|inner-?ad|ad-|sponsor|\badv\b|\bads\b|agr?egate?|pager|sidebar|popup|tweet|twitter/i',
         'okMaybeItsACandidate' => '/article\b|contain|\bcontent|column|general|detail|shadow|lightbox|blog|body|entry|main|page|footnote/i',
         'positive' => '/read|full|article|body|\bcontent|contain|entry|main|markdown|media|page|attach|pagination|post|text|blog|story/i',
@@ -92,8 +94,8 @@ class Readability implements LoggerAwareInterface
         'killBreaks' => '/(<br\s*\/?>([ \r\n\s]|&nbsp;?)*)+/',
         'media' => '!//(?:[^\.\?/]+\.)?(?:youtu(?:be)?|giphy|soundcloud|dailymotion|vimeo|pornhub|xvideos|twitvid|rutube|viddler)\.(?:com|be|org|net)/!i',
         'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i',
-    );
-    public $tidy_config = array(
+    ];
+    public $tidy_config = [
         'tidy-mark' => false,
         'vertical-space' => false,
         'doctype' => 'omit',
@@ -117,9 +119,22 @@ class Readability implements LoggerAwareInterface
         'input-encoding' => '????',
         'output-encoding' => 'utf8',
         'hide-comments' => true,
-    );
+    ];
+    // article domain regexp for calibration
+    protected $domainRegExp = null;
+    protected $body = null;
+    // Cache the body HTML in case we need to re-use it later
+    protected $bodyCache = null;
+    // 1 | 2 | 4;   // Start with all processing flags set.
+    protected $flags = 7;
+    // indicates whether we were able to extract or not
+    protected $success = false;
+    protected $logger;
+    protected $parser;
+    protected $html;
+    protected $useTidy;
     // raw HTML filters
-    protected $pre_filters = array(
+    protected $pre_filters = [
         // remove obvious scripts
         '!<script[^>]*>(.*?)</script>!is' => '',
         // remove obvious styles
@@ -134,9 +149,9 @@ class Readability implements LoggerAwareInterface
         //'!</?noscript>!is' => '',
         // replace fonts to spans
         '!<(/?)font[^>]*>!is' => '<\\1span>',
-    );
+    ];
     // output HTML filters
-    protected $post_filters = array(
+    protected $post_filters = [
         // replace excessive br's
         '/<br\s*\/?>\s*<p/i' => '<p',
         // replace empty tags that break layouts
@@ -149,23 +164,7 @@ class Readability implements LoggerAwareInterface
         '!<pre[^>]*>\s*<code!is' => '<pre',
         '!</code>\s*</pre>!is' => '</pre>',
         '!<[hb]r>!is' => '<\\1 />',
-    );
-
-    // flags
-    const FLAG_STRIP_UNLIKELYS = 1;
-    const FLAG_WEIGHT_ATTRIBUTES = 2;
-    const FLAG_CLEAN_CONDITIONALLY = 4;
-    const FLAG_DISABLE_PREFILTER = 8;
-    const FLAG_DISABLE_POSTFILTER = 16;
-    // constants
-    const SCORE_CHARS_IN_PARAGRAPH = 100;
-    const SCORE_WORDS_IN_PARAGRAPH = 20;
-    const GRANDPARENT_SCORE_DIVISOR = 2.2;
-    const MIN_PARAGRAPH_LENGTH = 20;
-    const MIN_COMMAS_IN_PARAGRAPH = 6;
-    const MIN_ARTICLE_LENGTH = 200;
-    const MIN_NODE_LENGTH = 80;
-    const MAX_LINK_DENSITY = 0.25;
+    ];
 
     /**
      * Create instance of Readability.
@@ -180,7 +179,7 @@ class Readability implements LoggerAwareInterface
         $this->url = $url;
         $this->html = $html;
         $this->parser = $parser;
-        $this->useTidy = $use_tidy && function_exists('tidy_parse_string');
+        $this->useTidy = $use_tidy && \function_exists('tidy_parse_string');
 
         $this->logger = new NullLogger();
         $this->loadHtml();
@@ -233,76 +232,6 @@ class Readability implements LoggerAwareInterface
         $this->post_filters[$filter] = $replacer;
     }
 
-    /**
-     * Load HTML in a DOMDocument.
-     * Apply Pre filters
-     * Cleanup HTML using Tidy (or not).
-     *
-     * @todo This should be called in init() instead of from __construct
-     */
-    private function loadHtml()
-    {
-        $this->original_html = $this->html;
-
-        $this->logger->debug('Parsing URL: ' . $this->url);
-
-        if ($this->url) {
-            $this->domainRegExp = '/' . strtr(preg_replace('/www\d*\./', '', parse_url($this->url, PHP_URL_HOST)), array('.' => '\.')) . '/';
-        }
-
-        mb_internal_encoding('UTF-8');
-        mb_http_output('UTF-8');
-        mb_regex_encoding('UTF-8');
-
-        // HACK: dirty cleanup to replace some stuff; shouldn't use regexps with HTML but well...
-        if (!$this->flagIsActive(self::FLAG_DISABLE_PREFILTER)) {
-            foreach ($this->pre_filters as $search => $replace) {
-                $this->html = preg_replace($search, $replace, $this->html);
-            }
-            unset($search, $replace);
-        }
-
-        if (trim($this->html) === '') {
-            $this->html = '<html></html>';
-        }
-
-        /*
-         * Use tidy (if it exists).
-         * This fixes problems with some sites which would otherwise trouble DOMDocument's HTML parsing.
-         * Although sometimes it makes matters worse, which is why there is an option to disable it.
-         */
-        if ($this->useTidy) {
-            $this->logger->debug('Tidying document');
-
-            $tidy = tidy_repair_string($this->html, $this->tidy_config, 'UTF8');
-            if (false !== $tidy && $this->html !== $tidy) {
-                $this->tidied = true;
-                $this->html = $tidy;
-                $this->html = preg_replace('/[\r\n]+/is', "\n", $this->html);
-            }
-            unset($tidy);
-        }
-
-        $this->html = mb_convert_encoding($this->html, 'HTML-ENTITIES', 'UTF-8');
-
-        if (!($this->parser === 'html5lib' && ($this->dom = Parser::parse($this->html)))) {
-            libxml_use_internal_errors(true);
-
-            $this->dom = new \DOMDocument();
-            $this->dom->preserveWhiteSpace = false;
-
-            if (PHP_VERSION_ID >= 50400) {
-                $this->dom->loadHTML($this->html, LIBXML_NOBLANKS | LIBXML_COMPACT | LIBXML_NOERROR);
-            } else {
-                $this->dom->loadHTML($this->html);
-            }
-
-            libxml_use_internal_errors(false);
-        }
-
-        $this->dom->registerNodeClass('DOMElement', 'Readability\JSLikeHTMLElement');
-    }
-
     /**
      * Runs readability.
      *
@@ -326,14 +255,14 @@ class Readability implements LoggerAwareInterface
         $bodyElems = $this->dom->getElementsByTagName('body');
 
         // WTF multiple body nodes?
-        if ($this->bodyCache === null) {
+        if (null === $this->bodyCache) {
             $this->bodyCache = '';
             foreach ($bodyElems as $bodyNode) {
                 $this->bodyCache .= trim($bodyNode->innerHTML);
             }
         }
 
-        if ($bodyElems->length > 0 && $this->body === null) {
+        if ($bodyElems->length > 0 && null === $this->body) {
             $this->body = $bodyElems->item(0);
         }
 
@@ -373,27 +302,6 @@ class Readability implements LoggerAwareInterface
         return $this->success;
     }
 
-    /**
-     * Debug.
-     *
-     * @deprecated use $this->logger->debug() instead
-     * @codeCoverageIgnore
-     */
-    protected function dbg($msg)
-    {
-        $this->logger->debug($msg);
-    }
-
-    /**
-     * Dump debug info.
-     *
-     * @deprecated since Monolog gather log, we don't need it
-     * @codeCoverageIgnore
-     */
-    protected function dump_dbg()
-    {
-    }
-
     /**
      * Run any post-process modifications to article content as necessary.
      *
@@ -406,77 +314,6 @@ class Readability implements LoggerAwareInterface
         }
     }
 
-    /**
-     * Get the article title as an H1.
-     *
-     * @return \DOMElement
-     */
-    protected function getArticleTitle()
-    {
-        try {
-            $curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0));
-        } catch (\Exception $e) {
-            $curTitle = '';
-            $origTitle = '';
-        }
-
-        if (preg_match('/ [\|\-] /', $curTitle)) {
-            $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle);
-            if (count(explode(' ', $curTitle)) < 3) {
-                $curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle);
-            }
-        } elseif (strpos($curTitle, ': ') !== false) {
-            $curTitle = preg_replace('/.*:(.*)/i', '$1', $origTitle);
-            if (count(explode(' ', $curTitle)) < 3) {
-                $curTitle = preg_replace('/[^:]*[:](.*)/i', '$1', $origTitle);
-            }
-        } elseif (mb_strlen($curTitle) > 150 || mb_strlen($curTitle) < 15) {
-            $hOnes = $this->dom->getElementsByTagName('h1');
-            if ($hOnes->length === 1) {
-                $curTitle = $this->getInnerText($hOnes->item(0));
-            }
-        }
-
-        $curTitle = trim($curTitle);
-        if (count(explode(' ', $curTitle)) <= 4) {
-            $curTitle = $origTitle;
-        }
-
-        $articleTitle = $this->dom->createElement('h1');
-        $articleTitle->innerHTML = $curTitle;
-
-        return $articleTitle;
-    }
-
-    /**
-     * Prepare the HTML document for readability to scrape it.
-     * This includes things like stripping javascript, CSS, and handling terrible markup.
-     */
-    protected function prepDocument()
-    {
-        /*
-         * In some cases a body element can't be found (if the HTML is totally hosed for example)
-         * so we create a new body node and append it to the document.
-         */
-        if ($this->body === null) {
-            $this->body = $this->dom->createElement('body');
-            $this->dom->documentElement->appendChild($this->body);
-        }
-
-        $this->body->setAttribute('class', 'readabilityBody');
-
-        // Remove all style tags in head.
-        $styleTags = $this->dom->getElementsByTagName('style');
-        for ($i = $styleTags->length - 1; $i >= 0; --$i) {
-            $styleTags->item($i)->parentNode->removeChild($styleTags->item($i));
-        }
-
-        $linkTags = $this->dom->getElementsByTagName('link');
-        for ($i = $linkTags->length - 1; $i >= 0; --$i) {
-            $linkTags->item($i)->parentNode->removeChild($linkTags->item($i));
-        }
-    }
-
     /**
      * For easier reading, convert this document to have footnotes at the bottom rather than inline links.
      *
@@ -506,7 +343,7 @@ class Readability implements LoggerAwareInterface
             }
 
             $linkText = $this->getInnerText($articleLink);
-            if ((strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote') !== false) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) {
+            if ((false !== strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote')) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) {
                 continue;
             }
 
@@ -527,7 +364,7 @@ class Readability implements LoggerAwareInterface
             $articleLink->setAttribute('style', 'color: inherit; text-decoration: none;');
             $articleLink->setAttribute('name', 'readabilityLink-' . $linkCount);
             $footnote->innerHTML = '<small><sup><a href="#readabilityLink-' . $linkCount . '" title="Jump to Link in Article">^</a></sup></small> ';
-            $footnoteLink->innerHTML = ($footnoteLink->getAttribute('title') !== '' ? $footnoteLink->getAttribute('title') : $linkText);
+            $footnoteLink->innerHTML = ('' !== $footnoteLink->getAttribute('title') ? $footnoteLink->getAttribute('title') : $linkText);
             $footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount);
             $footnote->appendChild($footnoteLink);
 
@@ -589,7 +426,7 @@ class Readability implements LoggerAwareInterface
          *  already have a header.
          */
         $h2s = $articleContent->getElementsByTagName('h2');
-        if ($h2s->length === 1 && mb_strlen($this->getInnerText($h2s->item(0), true, true)) < 100) {
+        if (1 === $h2s->length && mb_strlen($this->getInnerText($h2s->item(0), true, true)) < 100) {
             $this->clean($articleContent, 'h2');
         }
 
@@ -614,7 +451,7 @@ class Readability implements LoggerAwareInterface
             $audioCount = $item->getElementsByTagName('audio')->length;
             $iframeCount = $item->getElementsByTagName('iframe')->length;
 
-            if ($iframeCount === 0 && $imgCount === 0 && $embedCount === 0 && $objectCount === 0 && $videoCount === 0 && $audioCount === 0 && mb_strlen(preg_replace('/\s+/is', '', $this->getInnerText($item, false, false))) === 0) {
+            if (0 === $iframeCount && 0 === $imgCount && 0 === $embedCount && 0 === $objectCount && 0 === $videoCount && 0 === $audioCount && 0 === mb_strlen(preg_replace('/\s+/is', '', $this->getInnerText($item, false, false)))) {
                 $item->parentNode->removeChild($item);
             }
 
@@ -640,30 +477,460 @@ class Readability implements LoggerAwareInterface
     }
 
     /**
-     * Initialize a node with the readability object. Also checks the
-     * className/id for special names to add to its score.
+     * Get the inner text of a node.
+     * This also strips out any excess whitespace to be found.
      *
-     * @param \DOMElement $node
+     * @param \DOMElement $e
+     * @param bool        $normalizeSpaces (default: true)
+     * @param bool        $flattenLines    (default: false)
+     *
+     * @return string
      */
-    protected function initializeNode($node)
+    public function getInnerText($e, $normalizeSpaces = true, $flattenLines = false)
     {
-        if (!isset($node->tagName)) {
+        if (null === $e || !isset($e->textContent) || '' === $e->textContent) {
+            return '';
+        }
+
+        $textContent = trim($e->textContent);
+
+        if ($flattenLines) {
+            $textContent = mb_ereg_replace('(?:[\r\n](?:\s|&nbsp;)*)+', '', $textContent);
+        } elseif ($normalizeSpaces) {
+            $textContent = mb_ereg_replace('\s\s+', ' ', $textContent);
+        }
+
+        return $textContent;
+    }
+
+    /**
+     * Remove the style attribute on every $e and under.
+     *
+     * @param \DOMElement $e
+     */
+    public function cleanStyles($e)
+    {
+        if (!\is_object($e)) {
             return;
         }
 
-        $readability = $this->dom->createAttribute('readability');
-        // this is our contentScore
-        $readability->value = 0;
-        $node->setAttributeNode($readability);
+        $elems = $e->getElementsByTagName('*');
 
-        // using strtoupper just in case
-        switch (strtoupper($node->tagName)) {
-            case 'ARTICLE':
-                $readability->value += 15;
-            case 'DIV':
-                $readability->value += 5;
-                break;
-            case 'PRE':
+        foreach ($elems as $elem) {
+            $elem->removeAttribute('style');
+        }
+    }
+
+    /**
+     * Get comma number for a given text.
+     *
+     * @param string $text
+     *
+     * @return int
+     */
+    public function getCommaCount($text)
+    {
+        return substr_count($text, ',');
+    }
+
+    /**
+     * Get words number for a given text if words separated by a space.
+     * Input string should be normalized.
+     *
+     * @param string $text
+     *
+     * @return int
+     */
+    public function getWordCount($text)
+    {
+        return substr_count($text, ' ');
+    }
+
+    /**
+     * Get the density of links as a percentage of the content
+     * This is the amount of text that is inside a link divided by the total text in the node.
+     * Can exclude external references to differentiate between simple text and menus/infoblocks.
+     *
+     * @param \DOMElement $e
+     * @param bool        $excludeExternal
+     *
+     * @return float|int
+     */
+    public function getLinkDensity($e, $excludeExternal = false)
+    {
+        $links = $e->getElementsByTagName('a');
+        $textLength = mb_strlen($this->getInnerText($e, true, true));
+        $linkLength = 0;
+
+        for ($dRe = $this->domainRegExp, $i = 0, $il = $links->length; $i < $il; ++$i) {
+            if ($excludeExternal && $dRe && !preg_match($dRe, $links->item($i)->getAttribute('href'))) {
+                continue;
+            }
+            $linkLength += mb_strlen($this->getInnerText($links->item($i)));
+        }
+
+        if ($textLength > 0 && $linkLength > 0) {
+            return $linkLength / $textLength;
+        }
+
+        return 0;
+    }
+
+    /**
+     * Get an element relative weight.
+     *
+     * @param \DOMElement $e
+     *
+     * @return int
+     */
+    public function getWeight($e)
+    {
+        if (!$this->flagIsActive(self::FLAG_WEIGHT_ATTRIBUTES)) {
+            return 0;
+        }
+
+        $weight = 0;
+        // Look for a special classname
+        $weight += $this->weightAttribute($e, 'class');
+        // Look for a special ID
+        $weight += $this->weightAttribute($e, 'id');
+
+        return $weight;
+    }
+
+    /**
+     * Remove extraneous break tags from a node.
+     *
+     * @param \DOMElement $node
+     */
+    public function killBreaks($node)
+    {
+        $html = $node->innerHTML;
+        $html = preg_replace($this->regexps['killBreaks'], '<br />', $html);
+        $node->innerHTML = $html;
+    }
+
+    /**
+     * Clean a node of all elements of type "tag".
+     * (Unless it's a youtube/vimeo video. People love movies.).
+     *
+     * Updated 2012-09-18 to preserve youtube/vimeo iframes
+     *
+     * @param \DOMElement $e
+     * @param string      $tag
+     */
+    public function clean($e, $tag)
+    {
+        $currentItem = null;
+        $targetList = $e->getElementsByTagName($tag);
+        $isEmbed = ('audio' === $tag || 'video' === $tag || 'iframe' === $tag || 'object' === $tag || 'embed' === $tag);
+
+        for ($y = $targetList->length - 1; $y >= 0; --$y) {
+            // Allow youtube and vimeo videos through as people usually want to see those.
+            $currentItem = $targetList->item($y);
+
+            if ($isEmbed) {
+                $attributeValues = $currentItem->getAttribute('src') . ' ' . $currentItem->getAttribute('href');
+
+                // First, check the elements attributes to see if any of them contain known media hosts
+                if (preg_match($this->regexps['media'], $attributeValues)) {
+                    continue;
+                }
+
+                // Then check the elements inside this element for the same.
+                if (preg_match($this->regexps['media'], $targetList->item($y)->innerHTML)) {
+                    continue;
+                }
+            }
+
+            $currentItem->parentNode->removeChild($currentItem);
+        }
+    }
+
+    /**
+     * Clean an element of all tags of type "tag" if they look fishy.
+     * "Fishy" is an algorithm based on content length, classnames,
+     * link density, number of images & embeds, etc.
+     *
+     * @param \DOMElement $e
+     * @param string      $tag
+     */
+    public function cleanConditionally($e, $tag)
+    {
+        if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
+            return;
+        }
+
+        $tagsList = $e->getElementsByTagName($tag);
+        $curTagsLength = $tagsList->length;
+        $node = null;
+
+        /*
+         * Gather counts for other typical elements embedded within.
+         * Traverse backwards so we can remove nodes at the same time without effecting the traversal.
+         *
+         * TODO: Consider taking into account original contentScore here.
+         */
+        for ($i = $curTagsLength - 1; $i >= 0; --$i) {
+            $node = $tagsList->item($i);
+            $weight = $this->getWeight($node);
+            $contentScore = ($node->hasAttribute('readability')) ? (int) $node->getAttribute('readability') : 0;
+            $this->logger->debug('Start conditional cleaning of ' . $node->getNodePath() . ' (class=' . $node->getAttribute('class') . '; id=' . $node->getAttribute('id') . ')' . (($node->hasAttribute('readability')) ? (' with score ' . $node->getAttribute('readability')) : ''));
+
+            if ($weight + $contentScore < 0) {
+                $this->logger->debug('Removing...');
+                $node->parentNode->removeChild($node);
+            } elseif ($this->getCommaCount($this->getInnerText($node)) < self::MIN_COMMAS_IN_PARAGRAPH) {
+                /*
+                 * If there are not very many commas, and the number of
+                 * non-paragraph elements is more than paragraphs or other ominous signs, remove the element.
+                 */
+                $p = $node->getElementsByTagName('p')->length;
+                $img = $node->getElementsByTagName('img')->length;
+                $li = $node->getElementsByTagName('li')->length - 100;
+                $input = $node->getElementsByTagName('input')->length;
+                $a = $node->getElementsByTagName('a')->length;
+                $embedCount = 0;
+                $embeds = $node->getElementsByTagName('embed');
+
+                for ($ei = 0, $il = $embeds->length; $ei < $il; ++$ei) {
+                    if (preg_match($this->regexps['media'], $embeds->item($ei)->getAttribute('src'))) {
+                        ++$embedCount;
+                    }
+                }
+
+                $embeds = $node->getElementsByTagName('iframe');
+                for ($ei = 0, $il = $embeds->length; $ei < $il; ++$ei) {
+                    if (preg_match($this->regexps['media'], $embeds->item($ei)->getAttribute('src'))) {
+                        ++$embedCount;
+                    }
+                }
+
+                $linkDensity = $this->getLinkDensity($node, true);
+                $contentLength = mb_strlen($this->getInnerText($node));
+                $toRemove = false;
+
+                if ($this->lightClean) {
+                    if ($li > $p && 'ul' !== $tag && 'ol' !== $tag) {
+                        $this->logger->debug(' too many <li> elements, and parent is not <ul> or <ol>');
+                        $toRemove = true;
+                    } elseif ($input > floor($p / 3)) {
+                        $this->logger->debug(' too many <input> elements');
+                        $toRemove = true;
+                    } elseif ($contentLength < 6 && (0 === $embedCount && (0 === $img || $img > 2))) {
+                        $this->logger->debug(' content length less than 6 chars, 0 embeds and either 0 images or more than 2 images');
+                        $toRemove = true;
+                    } elseif ($weight < 25 && $linkDensity > 0.25) {
+                        $this->logger->debug(' weight is ' . $weight . ' < 25 and link density is ' . sprintf('%.2f', $linkDensity) . ' > 0.25');
+                        $toRemove = true;
+                    } elseif ($a > 2 && ($weight >= 25 && $linkDensity > 0.5)) {
+                        $this->logger->debug('  more than 2 links and weight is ' . $weight . ' > 25 but link density is ' . sprintf('%.2f', $linkDensity) . ' > 0.5');
+                        $toRemove = true;
+                    } elseif ($embedCount > 3) {
+                        $this->logger->debug(' more than 3 embeds');
+                        $toRemove = true;
+                    }
+                } else {
+                    if ($img > $p) {
+                        $this->logger->debug(' more image elements than paragraph elements');
+                        $toRemove = true;
+                    } elseif ($li > $p && 'ul' !== $tag && 'ol' !== $tag) {
+                        $this->logger->debug('  too many <li> elements, and parent is not <ul> or <ol>');
+                        $toRemove = true;
+                    } elseif ($input > floor($p / 3)) {
+                        $this->logger->debug('  too many <input> elements');
+                        $toRemove = true;
+                    } elseif ($contentLength < 10 && (0 === $img || $img > 2)) {
+                        $this->logger->debug('  content length less than 10 chars and 0 images, or more than 2 images');
+                        $toRemove = true;
+                    } elseif ($weight < 25 && $linkDensity > 0.2) {
+                        $this->logger->debug('  weight is ' . $weight . ' lower than 0 and link density is ' . sprintf('%.2f', $linkDensity) . ' > 0.2');
+                        $toRemove = true;
+                    } elseif ($weight >= 25 && $linkDensity > 0.5) {
+                        $this->logger->debug('  weight above 25 but link density is ' . sprintf('%.2f', $linkDensity) . ' > 0.5');
+                        $toRemove = true;
+                    } elseif ((1 === $embedCount && $contentLength < 75) || $embedCount > 1) {
+                        $this->logger->debug('  1 embed and content length smaller than 75 chars, or more than one embed');
+                        $toRemove = true;
+                    }
+                }
+
+                if ($toRemove) {
+                    $this->logger->debug('Removing...');
+                    $node->parentNode->removeChild($node);
+                }
+            }
+        }
+    }
+
+    /**
+     * Clean out spurious headers from an Element. Checks things like classnames and link density.
+     *
+     * @param \DOMElement $e
+     */
+    public function cleanHeaders($e)
+    {
+        for ($headerIndex = 1; $headerIndex < 3; ++$headerIndex) {
+            $headers = $e->getElementsByTagName('h' . $headerIndex);
+
+            for ($i = $headers->length - 1; $i >= 0; --$i) {
+                if ($this->getWeight($headers->item($i)) < 0 || $this->getLinkDensity($headers->item($i)) > 0.33) {
+                    $headers->item($i)->parentNode->removeChild($headers->item($i));
+                }
+            }
+        }
+    }
+
+    /**
+     * Check if the given flag is active.
+     *
+     * @param int $flag
+     *
+     * @return bool
+     */
+    public function flagIsActive($flag)
+    {
+        return ($this->flags & $flag) > 0;
+    }
+
+    /**
+     * Add a flag.
+     *
+     * @param int $flag
+     */
+    public function addFlag($flag)
+    {
+        $this->flags = $this->flags | $flag;
+    }
+
+    /**
+     * Remove a flag.
+     *
+     * @param int $flag
+     */
+    public function removeFlag($flag)
+    {
+        $this->flags = $this->flags & ~$flag;
+    }
+
+    /**
+     * Debug.
+     *
+     * @deprecated use $this->logger->debug() instead
+     * @codeCoverageIgnore
+     */
+    protected function dbg($msg)
+    {
+        $this->logger->debug($msg);
+    }
+
+    /**
+     * Dump debug info.
+     *
+     * @deprecated since Monolog gather log, we don't need it
+     * @codeCoverageIgnore
+     */
+    protected function dump_dbg()
+    {
+    }
+
+    /**
+     * Get the article title as an H1.
+     *
+     * @return \DOMElement
+     */
+    protected function getArticleTitle()
+    {
+        try {
+            $curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0));
+        } catch (\Exception $e) {
+            $curTitle = '';
+            $origTitle = '';
+        }
+
+        if (preg_match('/ [\|\-] /', $curTitle)) {
+            $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle);
+            if (\count(explode(' ', $curTitle)) < 3) {
+                $curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle);
+            }
+        } elseif (false !== strpos($curTitle, ': ')) {
+            $curTitle = preg_replace('/.*:(.*)/i', '$1', $origTitle);
+            if (\count(explode(' ', $curTitle)) < 3) {
+                $curTitle = preg_replace('/[^:]*[:](.*)/i', '$1', $origTitle);
+            }
+        } elseif (mb_strlen($curTitle) > 150 || mb_strlen($curTitle) < 15) {
+            $hOnes = $this->dom->getElementsByTagName('h1');
+            if (1 === $hOnes->length) {
+                $curTitle = $this->getInnerText($hOnes->item(0));
+            }
+        }
+
+        $curTitle = trim($curTitle);
+        if (\count(explode(' ', $curTitle)) <= 4) {
+            $curTitle = $origTitle;
+        }
+
+        $articleTitle = $this->dom->createElement('h1');
+        $articleTitle->innerHTML = $curTitle;
+
+        return $articleTitle;
+    }
+
+    /**
+     * Prepare the HTML document for readability to scrape it.
+     * This includes things like stripping javascript, CSS, and handling terrible markup.
+     */
+    protected function prepDocument()
+    {
+        /*
+         * In some cases a body element can't be found (if the HTML is totally hosed for example)
+         * so we create a new body node and append it to the document.
+         */
+        if (null === $this->body) {
+            $this->body = $this->dom->createElement('body');
+            $this->dom->documentElement->appendChild($this->body);
+        }
+
+        $this->body->setAttribute('class', 'readabilityBody');
+
+        // Remove all style tags in head.
+        $styleTags = $this->dom->getElementsByTagName('style');
+        for ($i = $styleTags->length - 1; $i >= 0; --$i) {
+            $styleTags->item($i)->parentNode->removeChild($styleTags->item($i));
+        }
+
+        $linkTags = $this->dom->getElementsByTagName('link');
+        for ($i = $linkTags->length - 1; $i >= 0; --$i) {
+            $linkTags->item($i)->parentNode->removeChild($linkTags->item($i));
+        }
+    }
+
+    /**
+     * Initialize a node with the readability object. Also checks the
+     * className/id for special names to add to its score.
+     *
+     * @param \DOMElement $node
+     */
+    protected function initializeNode($node)
+    {
+        if (!isset($node->tagName)) {
+            return;
+        }
+
+        $readability = $this->dom->createAttribute('readability');
+        // this is our contentScore
+        $readability->value = 0;
+        $node->setAttributeNode($readability);
+
+        // using strtoupper just in case
+        switch (strtoupper($node->tagName)) {
+            case 'ARTICLE':
+                $readability->value += 15;
+                // no break
+            case 'DIV':
+                $readability->value += 5;
+                break;
+            case 'PRE':
             case 'CODE':
             case 'TD':
             case 'BLOCKQUOTE':
@@ -723,7 +990,7 @@ class Readability implements LoggerAwareInterface
         }
 
         $xpath = null;
-        $nodesToScore = array();
+        $nodesToScore = [];
 
         if ($page instanceof \DOMDocument && isset($page->documentElement)) {
             $xpath = new \DOMXPath($page);
@@ -735,13 +1002,13 @@ class Readability implements LoggerAwareInterface
             $tagName = $node->tagName;
 
             // Some well known site uses sections as paragraphs.
-            if (strcasecmp($tagName, 'p') === 0 || strcasecmp($tagName, 'td') === 0 || strcasecmp($tagName, 'pre') === 0 || strcasecmp($tagName, 'section') === 0) {
+            if (0 === strcasecmp($tagName, 'p') || 0 === strcasecmp($tagName, 'td') || 0 === strcasecmp($tagName, 'pre') || 0 === strcasecmp($tagName, 'section')) {
                 $nodesToScore[] = $node;
             }
 
             // Turn divs into P tags where they have been used inappropriately
             //  (as in, where they contain no other block level elements).
-            if (strcasecmp($tagName, 'div') === 0 || strcasecmp($tagName, 'article') === 0 || strcasecmp($tagName, 'section') === 0) {
+            if (0 === strcasecmp($tagName, 'div') || 0 === strcasecmp($tagName, 'article') || 0 === strcasecmp($tagName, 'section')) {
                 if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) {
                     $newNode = $this->dom->createElement('p');
 
@@ -765,13 +1032,13 @@ class Readability implements LoggerAwareInterface
                         }
 
                         // executable tags (<?php or <?xml) warning
-                        if (is_object($childNode) && get_class($childNode) === 'DOMProcessingInstruction') {
+                        if (\is_object($childNode) && 'DOMProcessingInstruction' === \get_class($childNode)) {
                             $childNode->parentNode->removeChild($childNode);
 
                             continue;
                         }
 
-                        if ($childNode->nodeType === XML_TEXT_NODE) {
+                        if (XML_TEXT_NODE === $childNode->nodeType) {
                             $p = $this->dom->createElement('p');
                             $p->innerHTML = $childNode->nodeValue;
                             $p->setAttribute('data-readability-styled', 'true');
@@ -789,7 +1056,7 @@ class Readability implements LoggerAwareInterface
          * A score is determined by things like number of commas, class names, etc.
          * Maybe eventually link density.
          */
-        for ($pt = 0, $scored = count($nodesToScore); $pt < $scored; ++$pt) {
+        for ($pt = 0, $scored = \count($nodesToScore); $pt < $scored; ++$pt) {
             $parentNode = $nodesToScore[$pt]->parentNode;
 
             // No parent node? Move on...
@@ -856,7 +1123,7 @@ class Readability implements LoggerAwareInterface
             for ($c = $candidates->length - 1; $c >= 0; --$c) {
                 $node = $candidates->item($c);
                 // node should be readable but not inside of an article otherwise it's probably non-readable block
-                if ($node->hasAttribute('readability') && (int) $node->getAttributeNode('readability')->value < 40 && ($node->parentNode ? strcasecmp($node->parentNode->tagName, 'article') !== 0 : true)) {
+                if ($node->hasAttribute('readability') && (int) $node->getAttributeNode('readability')->value < 40 && ($node->parentNode ? 0 !== strcasecmp($node->parentNode->tagName, 'article') : true)) {
                     $this->logger->debug('Removing unlikely candidate (using note) ' . $node->getNodePath() . ' by "' . $node->tagName . '" with readability ' . ($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0));
                     $node->parentNode->removeChild($node);
                 }
@@ -914,7 +1181,7 @@ class Readability implements LoggerAwareInterface
          * If we still have no top candidate, just use the body as a last resort.
          * We also have to copy the body node so it is something we can modify.
          */
-        if ($topCandidate === null || strcasecmp($topCandidate->tagName, 'body') === 0) {
+        if (null === $topCandidate || 0 === strcasecmp($topCandidate->tagName, 'body')) {
             $topCandidate = $this->dom->createElement('div');
 
             if ($page instanceof \DOMDocument) {
@@ -939,13 +1206,13 @@ class Readability implements LoggerAwareInterface
 
         // Set table as the main node if resulted data is table element.
         $tagName = $topCandidate->tagName;
-        if (strcasecmp($tagName, 'td') === 0 || strcasecmp($tagName, 'tr') === 0) {
+        if (0 === strcasecmp($tagName, 'td') || 0 === strcasecmp($tagName, 'tr')) {
             $up = $topCandidate;
 
             if ($up->parentNode instanceof \DOMElement) {
                 $up = $up->parentNode;
 
-                if (strcasecmp($up->tagName, 'table') === 0) {
+                if (0 === strcasecmp($up->tagName, 'table')) {
                     $topCandidate = $up;
                 }
             }
@@ -971,7 +1238,7 @@ class Readability implements LoggerAwareInterface
             $siblingNode = $siblingNodes->item($s);
             $siblingNodeName = $siblingNode->nodeName;
             $append = false;
-            $this->logger->debug('Looking at sibling node: ' . $siblingNode->getNodePath() . (($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : ''));
+            $this->logger->debug('Looking at sibling node: ' . $siblingNode->getNodePath() . ((XML_ELEMENT_NODE === $siblingNode->nodeType && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : ''));
 
             if ($siblingNode->isSameNode($topCandidate)) {
                 $append = true;
@@ -980,21 +1247,21 @@ class Readability implements LoggerAwareInterface
             $contentBonus = 0;
 
             // Give a bonus if sibling nodes and top candidates have the same classname.
-            if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->getAttribute('class') === $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') !== '') {
+            if (XML_ELEMENT_NODE === $siblingNode->nodeType && $siblingNode->getAttribute('class') === $topCandidate->getAttribute('class') && '' !== $topCandidate->getAttribute('class')) {
                 $contentBonus += ((int) $topCandidate->getAttribute('readability')) * 0.2;
             }
 
-            if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') && (((int) $siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold) {
+            if (XML_ELEMENT_NODE === $siblingNode->nodeType && $siblingNode->hasAttribute('readability') && (((int) $siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold) {
                 $append = true;
             }
 
-            if (strcasecmp($siblingNodeName, 'p') === 0) {
+            if (0 === strcasecmp($siblingNodeName, 'p')) {
                 $linkDensity = $this->getLinkDensity($siblingNode);
                 $nodeContent = $this->getInnerText($siblingNode, true, true);
                 $nodeLength = mb_strlen($nodeContent);
 
                 if (($nodeLength > self::MIN_NODE_LENGTH && $linkDensity < self::MAX_LINK_DENSITY)
-                    || ($nodeLength < self::MIN_NODE_LENGTH && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent))) {
+                    || ($nodeLength < self::MIN_NODE_LENGTH && 0 === $linkDensity && preg_match('/\.( |$)/', $nodeContent))) {
                     $append = true;
                 }
             }
@@ -1002,7 +1269,7 @@ class Readability implements LoggerAwareInterface
             if ($append) {
                 $this->logger->debug('Appending node: ' . $siblingNode->getNodePath());
 
-                if (strcasecmp($siblingNodeName, 'div') !== 0 && strcasecmp($siblingNodeName, 'p') !== 0) {
+                if (0 !== strcasecmp($siblingNodeName, 'div') && 0 !== strcasecmp($siblingNodeName, 'p')) {
                     // We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident.
                     $this->logger->debug('Altering siblingNode "' . $siblingNodeName . '" to "div".');
                     $nodeToAppend = $this->dom->createElement('div');
@@ -1065,107 +1332,7 @@ class Readability implements LoggerAwareInterface
             return false;
         }
 
-        return $articleContent;
-    }
-
-    /**
-     * Get the inner text of a node.
-     * This also strips out any excess whitespace to be found.
-     *
-     * @param \DOMElement $e
-     * @param bool        $normalizeSpaces (default: true)
-     * @param bool        $flattenLines    (default: false)
-     *
-     * @return string
-     */
-    public function getInnerText($e, $normalizeSpaces = true, $flattenLines = false)
-    {
-        if (null === $e || !isset($e->textContent) || $e->textContent === '') {
-            return '';
-        }
-
-        $textContent = trim($e->textContent);
-
-        if ($flattenLines) {
-            $textContent = mb_ereg_replace('(?:[\r\n](?:\s|&nbsp;)*)+', '', $textContent);
-        } elseif ($normalizeSpaces) {
-            $textContent = mb_ereg_replace('\s\s+', ' ', $textContent);
-        }
-
-        return $textContent;
-    }
-
-    /**
-     * Remove the style attribute on every $e and under.
-     *
-     * @param \DOMElement $e
-     */
-    public function cleanStyles($e)
-    {
-        if (!is_object($e)) {
-            return;
-        }
-
-        $elems = $e->getElementsByTagName('*');
-
-        foreach ($elems as $elem) {
-            $elem->removeAttribute('style');
-        }
-    }
-
-    /**
-     * Get comma number for a given text.
-     *
-     * @param string $text
-     *
-     * @return int
-     */
-    public function getCommaCount($text)
-    {
-        return substr_count($text, ',');
-    }
-
-    /**
-     * Get words number for a given text if words separated by a space.
-     * Input string should be normalized.
-     *
-     * @param string $text
-     *
-     * @return int
-     */
-    public function getWordCount($text)
-    {
-        return substr_count($text, ' ');
-    }
-
-    /**
-     * Get the density of links as a percentage of the content
-     * This is the amount of text that is inside a link divided by the total text in the node.
-     * Can exclude external references to differentiate between simple text and menus/infoblocks.
-     *
-     * @param \DOMElement $e
-     * @param string      $excludeExternal
-     *
-     * @return int
-     */
-    public function getLinkDensity($e, $excludeExternal = false)
-    {
-        $links = $e->getElementsByTagName('a');
-        $textLength = mb_strlen($this->getInnerText($e, true, true));
-        $linkLength = 0;
-
-        for ($dRe = $this->domainRegExp, $i = 0, $il = $links->length; $i < $il; ++$i) {
-            if ($excludeExternal && $dRe && !preg_match($dRe, $links->item($i)->getAttribute('href'))) {
-                continue;
-            }
-            $linkLength += mb_strlen($this->getInnerText($links->item($i)));
-        }
-
-        if ($textLength > 0 && $linkLength > 0) {
-            return $linkLength / $textLength;
-        }
-
-        return 0;
+        return $articleContent;
     }
 
     /**
@@ -1187,7 +1354,7 @@ class Readability implements LoggerAwareInterface
         // $attributeValue = trim($element->getAttribute('class')." ".$element->getAttribute('id'));
         $attributeValue = trim($element->getAttribute($attribute));
 
-        if ($attributeValue !== '') {
+        if ('' !== $attributeValue) {
             if (preg_match($this->regexps['negative'], $attributeValue)) {
                 $weight -= 25;
             }
@@ -1206,250 +1373,83 @@ class Readability implements LoggerAwareInterface
     }
 
     /**
-     * Get an element relative weight.
-     *
-     * @param \DOMElement $e
-     *
-     * @return int
+     * Will recreate previously deleted body property.
      */
-    public function getWeight($e)
+    protected function reinitBody()
     {
-        if (!$this->flagIsActive(self::FLAG_WEIGHT_ATTRIBUTES)) {
-            return 0;
+        if (!isset($this->body->childNodes)) {
+            $this->body = $this->dom->createElement('body');
+            $this->body->innerHTML = $this->bodyCache;
         }
-
-        $weight = 0;
-        // Look for a special classname
-        $weight += $this->weightAttribute($e, 'class');
-        // Look for a special ID
-        $weight += $this->weightAttribute($e, 'id');
-
-        return $weight;
-    }
-
-    /**
-     * Remove extraneous break tags from a node.
-     *
-     * @param \DOMElement $node
-     */
-    public function killBreaks($node)
-    {
-        $html = $node->innerHTML;
-        $html = preg_replace($this->regexps['killBreaks'], '<br />', $html);
-        $node->innerHTML = $html;
     }
 
     /**
-     * Clean a node of all elements of type "tag".
-     * (Unless it's a youtube/vimeo video. People love movies.).
-     *
-     * Updated 2012-09-18 to preserve youtube/vimeo iframes
+     * Load the HTML into a DOMDocument.
+     * Applies the pre-filters first (unless FLAG_DISABLE_PREFILTER is set),
+     * then repairs the markup with Tidy when it is enabled.
      *
-     * @param \DOMElement $e
-     * @param string      $tag
+     * @todo This should be called in init() instead of from __construct
      */
-    public function clean($e, $tag)
+    private function loadHtml()
     {
-        $currentItem = null;
-        $targetList = $e->getElementsByTagName($tag);
-        $isEmbed = ($tag === 'audio' || $tag === 'video' || $tag === 'iframe' || $tag === 'object' || $tag === 'embed');
+        $this->original_html = $this->html;
 
-        for ($y = $targetList->length - 1; $y >= 0; --$y) {
-            // Allow youtube and vimeo videos through as people usually want to see those.
-            $currentItem = $targetList->item($y);
+        $this->logger->debug('Parsing URL: ' . $this->url);
 
-            if ($isEmbed) {
-                $attributeValues = $currentItem->getAttribute('src') . ' ' . $currentItem->getAttribute('href');
+        if ($this->url) {
+            $this->domainRegExp = '/' . strtr(preg_replace('/www\d*\./', '', parse_url($this->url, PHP_URL_HOST)), ['.' => '\.']) . '/';
+        }
 
-                // First, check the elements attributes to see if any of them contain known media hosts
-                if (preg_match($this->regexps['media'], $attributeValues)) {
-                    continue;
-                }
+        mb_internal_encoding('UTF-8');
+        mb_http_output('UTF-8');
+        mb_regex_encoding('UTF-8');
 
-                // Then check the elements inside this element for the same.
-                if (preg_match($this->regexps['media'], $targetList->item($y)->innerHTML)) {
-                    continue;
-                }
+        // HACK: dirty cleanup to replace some stuff; shouldn't use regexps with HTML but well...
+        if (!$this->flagIsActive(self::FLAG_DISABLE_PREFILTER)) {
+            foreach ($this->pre_filters as $search => $replace) {
+                $this->html = preg_replace($search, $replace, $this->html);
             }
-
-            $currentItem->parentNode->removeChild($currentItem);
+            unset($search, $replace);
         }
-    }
 
-    /**
-     * Clean an element of all tags of type "tag" if they look fishy.
-     * "Fishy" is an algorithm based on content length, classnames,
-     * link density, number of images & embeds, etc.
-     *
-     * @param \DOMElement $e
-     * @param string      $tag
-     */
-    public function cleanConditionally($e, $tag)
-    {
-        if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
-            return;
+        if ('' === trim($this->html)) {
+            $this->html = '<html></html>';
         }
 
-        $tagsList = $e->getElementsByTagName($tag);
-        $curTagsLength = $tagsList->length;
-        $node = null;
-
         /*
-         * Gather counts for other typical elements embedded within.
-         * Traverse backwards so we can remove nodes at the same time without effecting the traversal.
-         *
-         * TODO: Consider taking into account original contentScore here.
+         * Use tidy (if it exists).
+         * This fixes problems with some sites which would otherwise trouble DOMDocument's HTML parsing.
+         * Although sometimes it makes matters worse, which is why there is an option to disable it.
          */
-        for ($i = $curTagsLength - 1; $i >= 0; --$i) {
-            $node = $tagsList->item($i);
-            $weight = $this->getWeight($node);
-            $contentScore = ($node->hasAttribute('readability')) ? (int) $node->getAttribute('readability') : 0;
-            $this->logger->debug('Start conditional cleaning of ' . $node->getNodePath() . ' (class=' . $node->getAttribute('class') . '; id=' . $node->getAttribute('id') . ')' . (($node->hasAttribute('readability')) ? (' with score ' . $node->getAttribute('readability')) : ''));
-
-            if ($weight + $contentScore < 0) {
-                $this->logger->debug('Removing...');
-                $node->parentNode->removeChild($node);
-            } elseif ($this->getCommaCount($this->getInnerText($node)) < self::MIN_COMMAS_IN_PARAGRAPH) {
-                /*
-                 * If there are not very many commas, and the number of
-                 * non-paragraph elements is more than paragraphs or other ominous signs, remove the element.
-                 */
-                $p = $node->getElementsByTagName('p')->length;
-                $img = $node->getElementsByTagName('img')->length;
-                $li = $node->getElementsByTagName('li')->length - 100;
-                $input = $node->getElementsByTagName('input')->length;
-                $a = $node->getElementsByTagName('a')->length;
-                $embedCount = 0;
-                $embeds = $node->getElementsByTagName('embed');
-
-                for ($ei = 0, $il = $embeds->length; $ei < $il; ++$ei) {
-                    if (preg_match($this->regexps['media'], $embeds->item($ei)->getAttribute('src'))) {
-                        ++$embedCount;
-                    }
-                }
-
-                $embeds = $node->getElementsByTagName('iframe');
-                for ($ei = 0, $il = $embeds->length; $ei < $il; ++$ei) {
-                    if (preg_match($this->regexps['media'], $embeds->item($ei)->getAttribute('src'))) {
-                        ++$embedCount;
-                    }
-                }
-
-                $linkDensity = $this->getLinkDensity($node, true);
-                $contentLength = mb_strlen($this->getInnerText($node));
-                $toRemove = false;
-
-                if ($this->lightClean) {
-                    if ($li > $p && $tag !== 'ul' && $tag !== 'ol') {
-                        $this->logger->debug(' too many <li> elements, and parent is not <ul> or <ol>');
-                        $toRemove = true;
-                    } elseif ($input > floor($p / 3)) {
-                        $this->logger->debug(' too many <input> elements');
-                        $toRemove = true;
-                    } elseif ($contentLength < 6 && ($embedCount === 0 && ($img === 0 || $img > 2))) {
-                        $this->logger->debug(' content length less than 6 chars, 0 embeds and either 0 images or more than 2 images');
-                        $toRemove = true;
-                    } elseif ($weight < 25 && $linkDensity > 0.25) {
-                        $this->logger->debug(' weight is ' . $weight . ' < 25 and link density is ' . sprintf('%.2f', $linkDensity) . ' > 0.25');
-                        $toRemove = true;
-                    } elseif ($a > 2 && ($weight >= 25 && $linkDensity > 0.5)) {
-                        $this->logger->debug('  more than 2 links and weight is ' . $weight . ' > 25 but link density is ' . sprintf('%.2f', $linkDensity) . ' > 0.5');
-                        $toRemove = true;
-                    } elseif ($embedCount > 3) {
-                        $this->logger->debug(' more than 3 embeds');
-                        $toRemove = true;
-                    }
-                } else {
-                    if ($img > $p) {
-                        $this->logger->debug(' more image elements than paragraph elements');
-                        $toRemove = true;
-                    } elseif ($li > $p && $tag !== 'ul' && $tag !== 'ol') {
-                        $this->logger->debug('  too many <li> elements, and parent is not <ul> or <ol>');
-                        $toRemove = true;
-                    } elseif ($input > floor($p / 3)) {
-                        $this->logger->debug('  too many <input> elements');
-                        $toRemove = true;
-                    } elseif ($contentLength < 10 && ($img === 0 || $img > 2)) {
-                        $this->logger->debug('  content length less than 10 chars and 0 images, or more than 2 images');
-                        $toRemove = true;
-                    } elseif ($weight < 25 && $linkDensity > 0.2) {
-                        $this->logger->debug('  weight is ' . $weight . ' lower than 0 and link density is ' . sprintf('%.2f', $linkDensity) . ' > 0.2');
-                        $toRemove = true;
-                    } elseif ($weight >= 25 && $linkDensity > 0.5) {
-                        $this->logger->debug('  weight above 25 but link density is ' . sprintf('%.2f', $linkDensity) . ' > 0.5');
-                        $toRemove = true;
-                    } elseif (($embedCount === 1 && $contentLength < 75) || $embedCount > 1) {
-                        $this->logger->debug('  1 embed and content length smaller than 75 chars, or more than one embed');
-                        $toRemove = true;
-                    }
-                }
+        if ($this->useTidy) {
+            $this->logger->debug('Tidying document');
 
-                if ($toRemove) {
-                    $this->logger->debug('Removing...');
-                    $node->parentNode->removeChild($node);
-                }
+            $tidy = tidy_repair_string($this->html, $this->tidy_config, 'UTF8');
+            if (false !== $tidy && $this->html !== $tidy) {
+                $this->tidied = true;
+                $this->html = $tidy;
+                $this->html = preg_replace('/[\r\n]+/is', "\n", $this->html);
             }
+            unset($tidy);
         }
-    }
-
-    /**
-     * Clean out spurious headers from an Element. Checks things like classnames and link density.
-     *
-     * @param \DOMElement $e
-     */
-    public function cleanHeaders($e)
-    {
-        for ($headerIndex = 1; $headerIndex < 3; ++$headerIndex) {
-            $headers = $e->getElementsByTagName('h' . $headerIndex);
 
-            for ($i = $headers->length - 1; $i >= 0; --$i) {
-                if ($this->getWeight($headers->item($i)) < 0 || $this->getLinkDensity($headers->item($i)) > 0.33) {
-                    $headers->item($i)->parentNode->removeChild($headers->item($i));
-                }
-            }
-        }
-    }
+        $this->html = mb_convert_encoding($this->html, 'HTML-ENTITIES', 'UTF-8');
 
-    /**
-     * Check if the given flag is active.
-     *
-     * @param int $flag
-     *
-     * @return bool
-     */
-    public function flagIsActive($flag)
-    {
-        return ($this->flags & $flag) > 0;
-    }
+        if (!('html5lib' === $this->parser && ($this->dom = Parser::parse($this->html)))) {
+            libxml_use_internal_errors(true);
 
-    /**
-     * Add a flag.
-     *
-     * @param int $flag
-     */
-    public function addFlag($flag)
-    {
-        $this->flags = $this->flags | $flag;
-    }
+            $this->dom = new \DOMDocument();
+            $this->dom->preserveWhiteSpace = false;
 
-    /**
-     * Remove a flag.
-     *
-     * @param int $flag
-     */
-    public function removeFlag($flag)
-    {
-        $this->flags = $this->flags & ~$flag;
-    }
+            if (\PHP_VERSION_ID >= 50400) {
+                $this->dom->loadHTML($this->html, LIBXML_NOBLANKS | LIBXML_COMPACT | LIBXML_NOERROR);
+            } else {
+                $this->dom->loadHTML($this->html);
+            }
 
-    /**
-     * Will recreate previously deleted body property.
-     */
-    protected function reinitBody()
-    {
-        if (!isset($this->body->childNodes)) {
-            $this->body = $this->dom->createElement('body');
-            $this->body->innerHTML = $this->bodyCache;
+            libxml_use_internal_errors(false);
         }
+
+        $this->dom->registerNodeClass('DOMElement', 'Readability\JSLikeHTMLElement');
     }
 }
diff --git a/tests/ReadabilityTest.php b/tests/ReadabilityTest.php
index be03bd8..5a3efdd 100644
--- a/tests/ReadabilityTest.php
+++ b/tests/ReadabilityTest.php
@@ -11,17 +11,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
     public $logHandler;
     public $logger;
 
-    private function getReadability($html, $url = null, $parser = 'libxml', $useTidy = true)
-    {
-        $readability = new Readability($html, $url, $parser, $useTidy);
-
-        $this->logHandler = new TestHandler();
-        $this->logger = new Logger('test', array($this->logHandler));
-        $readability->setLogger($this->logger);
-
-        return $readability;
-    }
-
     /**
      * @requires extension tidy
      */
@@ -345,7 +334,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
     {
         error_reporting(E_ALL | E_STRICT);
         ini_set('display_errors', true);
-        set_error_handler(array($this, 'error2Exception'), E_ALL | E_STRICT);
+        set_error_handler([$this, 'error2Exception'], E_ALL | E_STRICT);
 
         $data = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
             <html xmlns="http://www.w3.org/1999/xhtml" lang="ru-RU" prefix="og: http://ogp.me/ns#">
@@ -493,4 +482,15 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
         $this->assertContains('<sup id="fnref1:fnfeed_2"><a href="#fn:fnfeed_2" class="footnote-ref">2</a></sup>', $readability->getContent()->innerHTML);
         $this->assertContains('<a href="#fnref1:fnfeed_2" rev="footnote"', $readability->getContent()->innerHTML);
     }
+
+    private function getReadability($html, $url = null, $parser = 'libxml', $useTidy = true)
+    {
+        // Wire a fresh TestHandler-backed logger into the instance; both stay on $this for the tests.
+        $extractor = new Readability($html, $url, $parser, $useTidy);
+        $this->logHandler = new TestHandler();
+        $this->logger = new Logger('test', [$this->logHandler]);
+        $extractor->setLogger($this->logger);
+
+        return $extractor;
+    }
 }