diff --git a/.php_cs b/.php_cs
index 240846c..8aac3e6 100644
--- a/.php_cs
+++ b/.php_cs
@@ -1,20 +1,28 @@
 <?php
 
-return Symfony\CS\Config\Config::create()
-    ->setUsingCache(true)
-    ->level(Symfony\CS\FixerInterface::SYMFONY_LEVEL)
-    // use default SYMFONY_LEVEL and extra fixers:
-    ->fixers(array(
-        'concat_with_spaces',
-        'ordered_use',
-        'phpdoc_order',
-        'strict',
-        'strict_param',
-        'long_array_syntax',
-    ))
-    ->finder(
-        Symfony\CS\Finder\DefaultFinder::create()
-            ->in(__DIR__)
+return PhpCsFixer\Config::create()
+    ->setRiskyAllowed(true) // required: some rules below are risky (e.g. '@Symfony:risky')
+    ->setRules(array(
+        '@Symfony' => true,
+        '@Symfony:risky' => true,
+        'combine_consecutive_unsets' => true,
+        'heredoc_to_nowdoc' => true,
+        'no_extra_consecutive_blank_lines' => array('break', 'continue', 'extra', 'return', 'throw', 'use', 'parenthesis_brace_block', 'square_brace_block', 'curly_brace_block'),
+        'no_unreachable_default_argument_value' => true,
+        'no_useless_else' => true,
+        'no_useless_return' => true,
+        'ordered_class_elements' => true,
+        'ordered_imports' => true,
+        'php_unit_strict' => false,
+        'phpdoc_order' => true,
+        // 'psr4' => true,
+        'strict_comparison' => true,
+        'strict_param' => true,
+        'concat_space' => array('spacing' => 'one'),
+    ))
+    ->setFinder(
+        PhpCsFixer\Finder::create()
             ->exclude(array('vendor'))
+            ->in(__DIR__)
     )
 ;
diff --git a/composer.json b/composer.json
index 8f39405..4d9bd16 100644
--- a/composer.json
+++ b/composer.json
@@ -31,7 +31,7 @@
     },
     "require-dev": {
         "satooshi/php-coveralls": "~0.6",
-        "friendsofphp/php-cs-fixer": "<2",
+        "friendsofphp/php-cs-fixer": "~2.0",
         "monolog/monolog": "^1.13",
         "symfony/phpunit-bridge": "^3.2"
     },
diff --git a/src/JSLikeHTMLElement.php b/src/JSLikeHTMLElement.php
index 15e7281..b908d06 100644
--- a/src/JSLikeHTMLElement.php
+++ b/src/JSLikeHTMLElement.php
@@ -45,7 +45,7 @@ class JSLikeHTMLElement extends \DOMElement
      */
     public function __set($name, $value)
     {
-        if ($name !== 'innerHTML') {
+        if ('innerHTML' !== $name) {
             $trace = debug_backtrace();
             trigger_error('Undefined property via __set(): ' . $name . ' in ' . $trace[0]['file'] . ' on line ' . $trace[0]['line'], E_USER_NOTICE);
 
@@ -109,7 +109,7 @@ class JSLikeHTMLElement extends \DOMElement
      */
     public function __get($name)
     {
-        if ($name === 'innerHTML') {
+        if ('innerHTML' === $name) {
             $inner = '';
 
             foreach ($this->childNodes as $child) {
@@ -121,8 +121,6 @@ class JSLikeHTMLElement extends \DOMElement
 
         $trace = debug_backtrace();
         trigger_error('Undefined property via __get(): ' . $name . ' in ' . $trace[0]['file'] . ' on line ' . $trace[0]['line'], E_USER_NOTICE);
-
-        return;
     }
 
     public function __toString()
diff --git a/src/Readability.php b/src/Readability.php
index 9a2e9ba..5fdf85d 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -52,6 +52,23 @@ use Psr\Log\NullLogger;
  */
 class Readability implements LoggerAwareInterface
 {
+    // flags
+    const FLAG_STRIP_UNLIKELYS = 1;
+    const FLAG_WEIGHT_ATTRIBUTES = 2;
+    const FLAG_CLEAN_CONDITIONALLY = 4;
+    const FLAG_DISABLE_PREFILTER = 8;
+    const FLAG_DISABLE_POSTFILTER = 16;
+
+    // constants
+    const SCORE_CHARS_IN_PARAGRAPH = 100;
+    const SCORE_WORDS_IN_PARAGRAPH = 20;
+    const GRANDPARENT_SCORE_DIVISOR = 2.2;
+    const MIN_PARAGRAPH_LENGTH = 20;
+    const MIN_COMMAS_IN_PARAGRAPH = 6;
+    const MIN_ARTICLE_LENGTH = 200;
+    const MIN_NODE_LENGTH = 80;
+    const MAX_LINK_DENSITY = 0.25;
+
     public $convertLinksToFootnotes = false;
     public $revertForcedParagraphElements = true;
     public $articleTitle;
@@ -65,19 +82,6 @@ class Readability implements LoggerAwareInterface
     // no more used, keept to avoid BC
     public $debug = false;
     public $tidied = false;
-    // article domain regexp for calibration
-    protected $domainRegExp = null;
-    protected $body = null;
-    // Cache the body HTML in case we need to re-use it later
-    protected $bodyCache = null;
-    // 1 | 2 | 4;   // Start with all processing flags set.
-    protected $flags = 7;
-    // indicates whether we were able to extract or not
-    protected $success = false;
-    protected $logger;
-    protected $parser;
-    protected $html;
-    protected $useTidy;
 
     /**
      * All of the regular expressions in use within readability.
@@ -118,6 +122,19 @@ class Readability implements LoggerAwareInterface
         'output-encoding' => 'utf8',
         'hide-comments' => true,
     );
+    // article domain regexp for calibration
+    protected $domainRegExp = null;
+    protected $body = null;
+    // Cache the body HTML in case we need to re-use it later
+    protected $bodyCache = null;
+    // 1 | 2 | 4 (strip unlikelys, weight attributes, clean conditionally): start with all processing flags set.
+    protected $flags = 7;
+    // indicates whether we were able to extract or not
+    protected $success = false;
+    protected $logger;
+    protected $parser;
+    protected $html;
+    protected $useTidy;
     // raw HTML filters
     protected $pre_filters = array(
         // remove obvious scripts
@@ -151,22 +168,6 @@ class Readability implements LoggerAwareInterface
         '!<[hb]r>!is' => '<\\1 />',
     );
 
-    // flags
-    const FLAG_STRIP_UNLIKELYS = 1;
-    const FLAG_WEIGHT_ATTRIBUTES = 2;
-    const FLAG_CLEAN_CONDITIONALLY = 4;
-    const FLAG_DISABLE_PREFILTER = 8;
-    const FLAG_DISABLE_POSTFILTER = 16;
-    // constants
-    const SCORE_CHARS_IN_PARAGRAPH = 100;
-    const SCORE_WORDS_IN_PARAGRAPH = 20;
-    const GRANDPARENT_SCORE_DIVISOR = 2.2;
-    const MIN_PARAGRAPH_LENGTH = 20;
-    const MIN_COMMAS_IN_PARAGRAPH = 6;
-    const MIN_ARTICLE_LENGTH = 200;
-    const MIN_NODE_LENGTH = 80;
-    const MAX_LINK_DENSITY = 0.25;
-
     /**
      * Create instance of Readability.
      *
@@ -233,76 +234,6 @@ class Readability implements LoggerAwareInterface
         $this->post_filters[$filter] = $replacer;
     }
 
-    /**
-     * Load HTML in a DOMDocument.
-     * Apply Pre filters
-     * Cleanup HTML using Tidy (or not).
-     *
-     * @todo This should be called in init() instead of from __construct
-     */
-    private function loadHtml()
-    {
-        $this->original_html = $this->html;
-
-        $this->logger->debug('Parsing URL: ' . $this->url);
-
-        if ($this->url) {
-            $this->domainRegExp = '/' . strtr(preg_replace('/www\d*\./', '', parse_url($this->url, PHP_URL_HOST)), array('.' => '\.')) . '/';
-        }
-
-        mb_internal_encoding('UTF-8');
-        mb_http_output('UTF-8');
-        mb_regex_encoding('UTF-8');
-
-        // HACK: dirty cleanup to replace some stuff; shouldn't use regexps with HTML but well...
-        if (!$this->flagIsActive(self::FLAG_DISABLE_PREFILTER)) {
-            foreach ($this->pre_filters as $search => $replace) {
-                $this->html = preg_replace($search, $replace, $this->html);
-            }
-            unset($search, $replace);
-        }
-
-        if (trim($this->html) === '') {
-            $this->html = '<html></html>';
-        }
-
-        /*
-         * Use tidy (if it exists).
-         * This fixes problems with some sites which would otherwise trouble DOMDocument's HTML parsing.
-         * Although sometimes it makes matters worse, which is why there is an option to disable it.
-         */
-        if ($this->useTidy) {
-            $this->logger->debug('Tidying document');
-
-            $tidy = tidy_parse_string($this->html, $this->tidy_config, 'UTF8');
-            if (tidy_clean_repair($tidy)) {
-                $this->tidied = true;
-                $this->html = $tidy->value;
-                $this->html = preg_replace('/[\r\n]+/is', "\n", $this->html);
-            }
-            unset($tidy);
-        }
-
-        $this->html = mb_convert_encoding($this->html, 'HTML-ENTITIES', 'UTF-8');
-
-        if (!($this->parser === 'html5lib' && ($this->dom = Parser::parse($this->html)))) {
-            libxml_use_internal_errors(true);
-
-            $this->dom = new \DOMDocument();
-            $this->dom->preserveWhiteSpace = false;
-
-            if (PHP_VERSION_ID >= 50400) {
-                $this->dom->loadHTML($this->html, LIBXML_NOBLANKS | LIBXML_COMPACT | LIBXML_NOERROR);
-            } else {
-                $this->dom->loadHTML($this->html);
-            }
-
-            libxml_use_internal_errors(false);
-        }
-
-        $this->dom->registerNodeClass('DOMElement', 'Readability\JSLikeHTMLElement');
-    }
-
     /**
      * Runs readability.
      *
@@ -326,14 +257,14 @@ class Readability implements LoggerAwareInterface
         $bodyElems = $this->dom->getElementsByTagName('body');
 
         // WTF multiple body nodes?
-        if ($this->bodyCache === null) {
+        if (null === $this->bodyCache) {
             $this->bodyCache = '';
             foreach ($bodyElems as $bodyNode) {
                 $this->bodyCache .= trim($bodyNode->innerHTML);
             }
         }
 
-        if ($bodyElems->length > 0 && $this->body === null) {
+        if ($bodyElems->length > 0 && null === $this->body) {
             $this->body = $bodyElems->item(0);
         }
 
@@ -373,27 +304,6 @@ class Readability implements LoggerAwareInterface
         return $this->success;
     }
 
-    /**
-     * Debug.
-     *
-     * @deprecated use $this->logger->debug() instead
-     * @codeCoverageIgnore
-     */
-    protected function dbg($msg)
-    {
-        $this->logger->debug($msg);
-    }
-
-    /**
-     * Dump debug info.
-     *
-     * @deprecated since Monolog gather log, we don't need it
-     * @codeCoverageIgnore
-     */
-    protected function dump_dbg()
-    {
-    }
-
     /**
      * Run any post-process modifications to article content as necessary.
      *
@@ -406,77 +316,6 @@ class Readability implements LoggerAwareInterface
         }
     }
 
-    /**
-     * Get the article title as an H1.
-     *
-     * @return \DOMElement
-     */
-    protected function getArticleTitle()
-    {
-        try {
-            $curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0));
-        } catch (\Exception $e) {
-            $curTitle = '';
-            $origTitle = '';
-        }
-
-        if (preg_match('/ [\|\-] /', $curTitle)) {
-            $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle);
-            if (count(explode(' ', $curTitle)) < 3) {
-                $curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle);
-            }
-        } elseif (strpos($curTitle, ': ') !== false) {
-            $curTitle = preg_replace('/.*:(.*)/i', '$1', $origTitle);
-            if (count(explode(' ', $curTitle)) < 3) {
-                $curTitle = preg_replace('/[^:]*[:](.*)/i', '$1', $origTitle);
-            }
-        } elseif (mb_strlen($curTitle) > 150 || mb_strlen($curTitle) < 15) {
-            $hOnes = $this->dom->getElementsByTagName('h1');
-            if ($hOnes->length === 1) {
-                $curTitle = $this->getInnerText($hOnes->item(0));
-            }
-        }
-
-        $curTitle = trim($curTitle);
-        if (count(explode(' ', $curTitle)) <= 4) {
-            $curTitle = $origTitle;
-        }
-
-        $articleTitle = $this->dom->createElement('h1');
-        $articleTitle->innerHTML = $curTitle;
-
-        return $articleTitle;
-    }
-
-    /**
-     * Prepare the HTML document for readability to scrape it.
-     * This includes things like stripping javascript, CSS, and handling terrible markup.
-     */
-    protected function prepDocument()
-    {
-        /*
-         * In some cases a body element can't be found (if the HTML is totally hosed for example)
-         * so we create a new body node and append it to the document.
-         */
-        if ($this->body === null) {
-            $this->body = $this->dom->createElement('body');
-            $this->dom->documentElement->appendChild($this->body);
-        }
-
-        $this->body->setAttribute('class', 'readabilityBody');
-
-        // Remove all style tags in head.
-        $styleTags = $this->dom->getElementsByTagName('style');
-        for ($i = $styleTags->length - 1; $i >= 0; --$i) {
-            $styleTags->item($i)->parentNode->removeChild($styleTags->item($i));
-        }
-
-        $linkTags = $this->dom->getElementsByTagName('link');
-        for ($i = $linkTags->length - 1; $i >= 0; --$i) {
-            $linkTags->item($i)->parentNode->removeChild($linkTags->item($i));
-        }
-    }
-
     /**
      * For easier reading, convert this document to have footnotes at the bottom rather than inline links.
      *
@@ -506,7 +345,7 @@ class Readability implements LoggerAwareInterface
             }
 
             $linkText = $this->getInnerText($articleLink);
-            if ((strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote') !== false) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) {
+            if ((false !== strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote')) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) {
                 continue;
             }
 
@@ -527,7 +366,7 @@ class Readability implements LoggerAwareInterface
             $articleLink->setAttribute('style', 'color: inherit; text-decoration: none;');
             $articleLink->setAttribute('name', 'readabilityLink-' . $linkCount);
             $footnote->innerHTML = '<small><sup><a href="#readabilityLink-' . $linkCount . '" title="Jump to Link in Article">^</a></sup></small> ';
-            $footnoteLink->innerHTML = ($footnoteLink->getAttribute('title') !== '' ? $footnoteLink->getAttribute('title') : $linkText);
+            $footnoteLink->innerHTML = ('' !== $footnoteLink->getAttribute('title') ? $footnoteLink->getAttribute('title') : $linkText);
             $footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount);
             $footnote->appendChild($footnoteLink);
 
@@ -589,7 +428,7 @@ class Readability implements LoggerAwareInterface
          *  already have a header.
          */
         $h2s = $articleContent->getElementsByTagName('h2');
-        if ($h2s->length === 1 && mb_strlen($this->getInnerText($h2s->item(0), true, true)) < 100) {
+        if (1 === $h2s->length && mb_strlen($this->getInnerText($h2s->item(0), true, true)) < 100) {
             $this->clean($articleContent, 'h2');
         }
 
@@ -614,7 +453,7 @@ class Readability implements LoggerAwareInterface
             $audioCount = $item->getElementsByTagName('audio')->length;
             $iframeCount = $item->getElementsByTagName('iframe')->length;
 
-            if ($iframeCount === 0 && $imgCount === 0 && $embedCount === 0 && $objectCount === 0 && $videoCount === 0 && $audioCount === 0 && mb_strlen(preg_replace('/\s+/is', '', $this->getInnerText($item, false, false))) === 0) {
+            if (0 === $iframeCount && 0 === $imgCount && 0 === $embedCount && 0 === $objectCount && 0 === $videoCount && 0 === $audioCount && 0 === mb_strlen(preg_replace('/\s+/is', '', $this->getInnerText($item, false, false)))) {
                 $item->parentNode->removeChild($item);
             }
 
@@ -640,806 +479,899 @@ class Readability implements LoggerAwareInterface
     }
 
     /**
-     * Initialize a node with the readability object. Also checks the
-     * className/id for special names to add to its score.
+     * Get the inner text of a node.
+     * This also strips out any excess whitespace to be found.
      *
-     * @param \DOMElement $node
+     * @param \DOMElement $e
+     * @param bool        $normalizeSpaces (default: true)
+     * @param bool        $flattenLines    (default: false)
+     *
+     * @return string
      */
-    protected function initializeNode($node)
+    public function getInnerText($e, $normalizeSpaces = true, $flattenLines = false)
     {
-        if (!isset($node->tagName)) {
+        if (null === $e || !isset($e->textContent) || '' === $e->textContent) {
+            return '';
+        }
+
+        $textContent = trim($e->textContent);
+
+        if ($flattenLines) {
+            $textContent = mb_ereg_replace('(?:[\r\n](?:\s|&nbsp;)*)+', '', $textContent);
+        } elseif ($normalizeSpaces) {
+            $textContent = mb_ereg_replace('\s\s+', ' ', $textContent);
+        }
+
+        return $textContent;
+    }
+
+    /**
+     * Remove the style attribute from every element under $e ($e itself is not touched).
+     *
+     * @param \DOMElement $e
+     */
+    public function cleanStyles($e)
+    {
+        if (!is_object($e)) {
             return;
         }
 
-        $readability = $this->dom->createAttribute('readability');
-        // this is our contentScore
-        $readability->value = 0;
-        $node->setAttributeNode($readability);
+        $elems = $e->getElementsByTagName('*');
 
-        // using strtoupper just in case
-        switch (strtoupper($node->tagName)) {
-            case 'ARTICLE':
-                $readability->value += 15;
-            case 'DIV':
-                $readability->value += 5;
-                break;
-            case 'PRE':
-            case 'CODE':
-            case 'TD':
-            case 'BLOCKQUOTE':
-            case 'FIGURE':
-                $readability->value += 3;
-                break;
-            case 'SECTION':
-                // often misused
-                // $readability->value += 2;
-                break;
-            case 'OL':
-            case 'UL':
-            case 'DL':
-            case 'DD':
-            case 'DT':
-            case 'LI':
-                $readability->value -= 2 * round($this->getLinkDensity($node), 0, PHP_ROUND_HALF_UP);
-                break;
-            case 'ASIDE':
-            case 'FOOTER':
-            case 'HEADER':
-            case 'ADDRESS':
-            case 'FORM':
-            case 'BUTTON':
-            case 'TEXTAREA':
-            case 'INPUT':
-            case 'NAV':
-                $readability->value -= 3;
-                break;
-            case 'H1':
-            case 'H2':
-            case 'H3':
-            case 'H4':
-            case 'H5':
-            case 'H6':
-            case 'TH':
-            case 'HGROUP':
-                $readability->value -= 5;
-                break;
+        foreach ($elems as $elem) {
+            $elem->removeAttribute('style');
         }
+    }
 
-        $readability->value += $this->getWeight($node);
+    /**
+     * Count the commas in a given text.
+     *
+     * @param string $text
+     *
+     * @return int
+     */
+    public function getCommaCount($text)
+    {
+        return substr_count($text, ',');
     }
 
     /**
-     * Using a variety of metrics (content score, classname, element types), find the content that is
-     * most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
+     * Approximate the number of words in a text by counting its spaces.
+     * Input string should be normalized.
      *
-     * @param \DOMElement $page
+     * @param string $text
      *
-     * @return \DOMElement|bool
+     * @return int
      */
-    protected function grabArticle($page = null)
+    public function getWordCount($text)
     {
-        if (!$page) {
-            $page = $this->dom;
-        }
+        return substr_count($text, ' ');
+    }
 
-        $xpath = null;
-        $nodesToScore = array();
+    /**
+     * Get the density of links as a fraction of the content.
+     * This is the amount of text that is inside a link divided by the total text in the node.
+     * Can exclude external references to differentiate between simple text and menus/infoblocks.
+     *
+     * @param \DOMElement $e
+     * @param bool        $excludeExternal
+     *
+     * @return float
+     */
+    public function getLinkDensity($e, $excludeExternal = false)
+    {
+        $links = $e->getElementsByTagName('a');
+        $textLength = mb_strlen($this->getInnerText($e, true, true));
+        $linkLength = 0;
 
-        if ($page instanceof \DOMDocument && isset($page->documentElement)) {
-            $xpath = new \DOMXPath($page);
+        for ($dRe = $this->domainRegExp, $i = 0, $il = $links->length; $i < $il; ++$i) {
+            if ($excludeExternal && $dRe && !preg_match($dRe, $links->item($i)->getAttribute('href'))) {
+                continue;
+            }
+            $linkLength += mb_strlen($this->getInnerText($links->item($i)));
         }
 
-        $allElements = $page->getElementsByTagName('*');
+        if ($textLength > 0 && $linkLength > 0) {
+            return $linkLength / $textLength;
+        }
 
-        for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); ++$nodeIndex) {
-            $tagName = $node->tagName;
+        return 0;
+    }
 
-            // Some well known site uses sections as paragraphs.
-            if (strcasecmp($tagName, 'p') === 0 || strcasecmp($tagName, 'td') === 0 || strcasecmp($tagName, 'pre') === 0 || strcasecmp($tagName, 'section') === 0) {
-                $nodesToScore[] = $node;
-            }
+    /**
+     * Get an element relative weight.
+     *
+     * @param \DOMElement $e
+     *
+     * @return int
+     */
+    public function getWeight($e)
+    {
+        if (!$this->flagIsActive(self::FLAG_WEIGHT_ATTRIBUTES)) {
+            return 0;
+        }
 
-            // Turn divs into P tags where they have been used inappropriately
-            //  (as in, where they contain no other block level elements).
-            if (strcasecmp($tagName, 'div') === 0 || strcasecmp($tagName, 'article') === 0 || strcasecmp($tagName, 'section') === 0) {
-                if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) {
-                    $newNode = $this->dom->createElement('p');
+        $weight = 0;
+        // Look for a special classname
+        $weight += $this->weightAttribute($e, 'class');
+        // Look for a special ID
+        $weight += $this->weightAttribute($e, 'id');
 
-                    try {
-                        $newNode->innerHTML = $node->innerHTML;
+        return $weight;
+    }
 
-                        $node->parentNode->replaceChild($newNode, $node);
-                        --$nodeIndex;
-                        $nodesToScore[] = $newNode;
-                    } catch (\Exception $e) {
-                        $this->logger->error('Could not alter div/article to p, reverting back to div: ' . $e->getMessage());
-                    }
-                } else {
-                    // Will change these P elements back to text nodes after processing.
-                    for ($i = 0, $il = $node->childNodes->length; $i < $il; ++$i) {
-                        $childNode = $node->childNodes->item($i);
+    /**
+     * Remove extraneous break tags from a node.
+     *
+     * @param \DOMElement $node
+     */
+    public function killBreaks($node)
+    {
+        $html = $node->innerHTML;
+        $html = preg_replace($this->regexps['killBreaks'], '<br />', $html);
+        $node->innerHTML = $html;
+    }
 
-                        // it looks like sometimes the loop is going too far and we are retrieving a non-existant child
-                        if (null === $childNode) {
-                            continue;
-                        }
+    /**
+     * Clean a node of all elements of type "tag".
+     * (Unless it's a youtube/vimeo video. People love movies.).
+     *
+     * Updated 2012-09-18 to preserve youtube/vimeo iframes
+     *
+     * @param \DOMElement $e
+     * @param string      $tag
+     */
+    public function clean($e, $tag)
+    {
+        $currentItem = null;
+        $targetList = $e->getElementsByTagName($tag);
+        $isEmbed = ('audio' === $tag || 'video' === $tag || 'iframe' === $tag || 'object' === $tag || 'embed' === $tag);
 
-                        // executable tags (<?php or <?xml) warning
-                        if (is_object($childNode) && get_class($childNode) === 'DOMProcessingInstruction') {
-                            $childNode->parentNode->removeChild($childNode);
+        for ($y = $targetList->length - 1; $y >= 0; --$y) {
+            // Allow youtube and vimeo videos through as people usually want to see those.
+            $currentItem = $targetList->item($y);
 
-                            continue;
-                        }
+            if ($isEmbed) {
+                $attributeValues = $currentItem->getAttribute('src') . ' ' . $currentItem->getAttribute('href');
 
-                        if ($childNode->nodeType === XML_TEXT_NODE) {
-                            $p = $this->dom->createElement('p');
-                            $p->innerHTML = $childNode->nodeValue;
-                            $p->setAttribute('data-readability-styled', 'true');
-                            $childNode->parentNode->replaceChild($p, $childNode);
-                        }
-                    }
+                // First, check the elements attributes to see if any of them contain known media hosts
+                if (preg_match($this->regexps['media'], $attributeValues)) {
+                    continue;
+                }
+
+                // Then check the elements inside this element for the same.
+                if (preg_match($this->regexps['media'], $targetList->item($y)->innerHTML)) {
+                    continue;
                 }
             }
+
+            $currentItem->parentNode->removeChild($currentItem);
+        }
+    }
+
+    /**
+     * Clean an element of all tags of type "tag" if they look fishy.
+     * "Fishy" is an algorithm based on content length, classnames,
+     * link density, number of images & embeds, etc.
+     *
+     * @param \DOMElement $e
+     * @param string      $tag
+     */
+    public function cleanConditionally($e, $tag)
+    {
+        if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
+            return;
         }
 
+        $tagsList = $e->getElementsByTagName($tag);
+        $curTagsLength = $tagsList->length;
+        $node = null;
+
         /*
-         * Loop through all paragraphs, and assign a score to them based on how content-y they look.
-         * Then add their score to their parent node.
+         * Gather counts for other typical elements embedded within.
+         * Traverse backwards so we can remove nodes at the same time without affecting the traversal.
          *
-         * A score is determined by things like number of commas, class names, etc.
-         * Maybe eventually link density.
+         * TODO: Consider taking into account original contentScore here.
          */
-        for ($pt = 0, $scored = count($nodesToScore); $pt < $scored; ++$pt) {
-            $parentNode = $nodesToScore[$pt]->parentNode;
-
-            // No parent node? Move on...
-            if (!$parentNode) {
-                continue;
-            }
+        for ($i = $curTagsLength - 1; $i >= 0; --$i) {
+            $node = $tagsList->item($i);
+            $weight = $this->getWeight($node);
+            $contentScore = ($node->hasAttribute('readability')) ? (int) $node->getAttribute('readability') : 0;
+            $this->logger->debug('Start conditional cleaning of ' . $node->getNodePath() . ' (class=' . $node->getAttribute('class') . '; id=' . $node->getAttribute('id') . ')' . (($node->hasAttribute('readability')) ? (' with score ' . $node->getAttribute('readability')) : ''));
 
-            $grandParentNode = $parentNode->parentNode instanceof \DOMElement ? $parentNode->parentNode : null;
-            $innerText = $this->getInnerText($nodesToScore[$pt]);
+            if ($weight + $contentScore < 0) {
+                $this->logger->debug('Removing...');
+                $node->parentNode->removeChild($node);
+            } elseif ($this->getCommaCount($this->getInnerText($node)) < self::MIN_COMMAS_IN_PARAGRAPH) {
+                /*
+                 * If there are not very many commas, and the number of
+                 * non-paragraph elements is more than paragraphs or other ominous signs, remove the element.
+                 */
+                $p = $node->getElementsByTagName('p')->length;
+                $img = $node->getElementsByTagName('img')->length;
+                $li = $node->getElementsByTagName('li')->length - 100;
+                $input = $node->getElementsByTagName('input')->length;
+                $a = $node->getElementsByTagName('a')->length;
+                $embedCount = 0;
+                $embeds = $node->getElementsByTagName('embed');
 
-            // If this paragraph is less than MIN_PARAGRAPH_LENGTH (default:20) characters, don't even count it.
-            if (mb_strlen($innerText) < self::MIN_PARAGRAPH_LENGTH) {
-                continue;
-            }
-
-            // Initialize readability data for the parent.
-            if (!$parentNode->hasAttribute('readability')) {
-                $this->initializeNode($parentNode);
-                $parentNode->setAttribute('data-candidate', 'true');
-            }
-
-            // Initialize readability data for the grandparent.
-            if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName)) {
-                $this->initializeNode($grandParentNode);
-                $grandParentNode->setAttribute('data-candidate', 'true');
-            }
-            // Add a point for the paragraph itself as a base.
-            $contentScore = 1;
-            // Add points for any commas within this paragraph.
-            $contentScore += $this->getCommaCount($innerText);
-            // For every SCORE_CHARS_IN_PARAGRAPH (default:100) characters in this paragraph, add another point. Up to 3 points.
-            $contentScore += min(floor(mb_strlen($innerText) / self::SCORE_CHARS_IN_PARAGRAPH), 3);
-            // For every SCORE_WORDS_IN_PARAGRAPH (default:20) words in this paragraph, add another point. Up to 3 points.
-            $contentScore += min(floor($this->getWordCount($innerText) / self::SCORE_WORDS_IN_PARAGRAPH), 3);
-            /* TEST: For every positive/negative parent tag, add/substract half point. Up to 3 points. *\/
-            $up = $nodesToScore[$pt];
-            $score = 0;
-            while ($up->parentNode instanceof \DOMElement) {
-                $up = $up->parentNode;
-                if (preg_match($this->regexps['positive'], $up->getAttribute('class') . ' ' . $up->getAttribute('id'))) {
-                    $score += 0.5;
-                } elseif (preg_match($this->regexps['negative'], $up->getAttribute('class') . ' ' . $up->getAttribute('id'))) {
-                    $score -= 0.5;
-                }
-            }
-            $score = floor($score);
-            $contentScore += max(min($score, 3), -3);/**/
-
-            // Add the score to the parent. The grandparent gets half.
-            $parentNode->getAttributeNode('readability')->value += $contentScore;
-            if ($grandParentNode) {
-                $grandParentNode->getAttributeNode('readability')->value += $contentScore / self::GRANDPARENT_SCORE_DIVISOR;
-            }
-        }
-
-        /*
-         * Node prepping: trash nodes that look cruddy (like ones with the class name "comment", etc).
-         * This is faster to do before scoring but safer after.
-         */
-        if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS) && $xpath) {
-            $candidates = $xpath->query('.//*[(self::footer and count(//footer)<2) or (self::aside and count(//aside)<2)]', $page->documentElement);
-            $node = null;
-
-            for ($c = $candidates->length - 1; $c >= 0; --$c) {
-                $node = $candidates->item($c);
-                // node should be readable but not inside of an article otherwise it's probably non-readable block
-                if ($node->hasAttribute('readability') && (int) $node->getAttributeNode('readability')->value < 40 && ($node->parentNode ? strcasecmp($node->parentNode->tagName, 'article') !== 0 : true)) {
-                    $this->logger->debug('Removing unlikely candidate (using note) ' . $node->getNodePath() . ' by "' . $node->tagName . '" with readability ' . ($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0));
-                    $node->parentNode->removeChild($node);
-                }
-            }
-
-            $candidates = $xpath->query('.//*[not(self::body) and (@class or @id or @style) and ((number(@readability) < 40) or not(@readability))]', $page->documentElement);
-            $node = null;
-
-            for ($c = $candidates->length - 1; $c >= 0; --$c) {
-                $node = $candidates->item($c);
-
-                // Remove unlikely candidates
-                $unlikelyMatchString = $node->getAttribute('class') . ' ' . $node->getAttribute('id') . ' ' . $node->getAttribute('style');
-
-                if (mb_strlen($unlikelyMatchString) > 3 && // don't process "empty" strings
-                    preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) &&
-                    !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString)
-                ) {
-                    $this->logger->debug('Removing unlikely candidate (using conf) ' . $node->getNodePath() . ' by "' . $unlikelyMatchString . '" with readability ' . ($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0));
-                    $node->parentNode->removeChild($node);
-                    --$nodeIndex;
+                for ($ei = 0, $il = $embeds->length; $ei < $il; ++$ei) {
+                    if (preg_match($this->regexps['media'], $embeds->item($ei)->getAttribute('src'))) {
+                        ++$embedCount;
+                    }
                 }
-            }
-            unset($candidates);
-        }
 
-        /*
-         * After we've calculated scores, loop through all of the possible candidate nodes we found
-         * and find the one with the highest score.
-         */
-        $topCandidate = null;
-        if ($xpath) {
-            // Using array of DOMElements after deletion is a path to DOOMElement.
-            $candidates = $xpath->query('.//*[@data-candidate]', $page->documentElement);
-
-            for ($c = $candidates->length - 1; $c >= 0; --$c) {
-                $item = $candidates->item($c);
-
-                // Scale the final candidates score based on link density. Good content should have a
-                // relatively small link density (5% or less) and be mostly unaffected by this operation.
-                // If not for this we would have used XPath to find maximum @readability.
-                $readability = $item->getAttributeNode('readability');
-                $readability->value = round($readability->value * (1 - $this->getLinkDensity($item)), 0, PHP_ROUND_HALF_UP);
-
-                if (!$topCandidate || $readability->value > (int) $topCandidate->getAttribute('readability')) {
-                    $this->logger->debug('Candidate: ' . $item->getNodePath() . ' (' . $item->getAttribute('class') . ':' . $item->getAttribute('id') . ') with score ' . $readability->value);
-                    $topCandidate = $item;
+                $embeds = $node->getElementsByTagName('iframe');
+                for ($ei = 0, $il = $embeds->length; $ei < $il; ++$ei) {
+                    if (preg_match($this->regexps['media'], $embeds->item($ei)->getAttribute('src'))) {
+                        ++$embedCount;
+                    }
                 }
-            }
-
-            unset($candidates);
-        }
 
-        /*
-         * If we still have no top candidate, just use the body as a last resort.
-         * We also have to copy the body node so it is something we can modify.
-         */
-        if ($topCandidate === null || strcasecmp($topCandidate->tagName, 'body') === 0) {
-            $topCandidate = $this->dom->createElement('div');
+                $linkDensity = $this->getLinkDensity($node, true);
+                $contentLength = mb_strlen($this->getInnerText($node));
+                $toRemove = false;
 
-            if ($page instanceof \DOMDocument) {
-                if (!isset($page->documentElement)) {
-                    // we don't have a body either? what a mess! :)
-                    $this->logger->debug('The page has no body!');
+                if ($this->lightClean) {
+                    if ($li > $p && 'ul' !== $tag && 'ol' !== $tag) {
+                        $this->logger->debug(' too many <li> elements, and parent is not <ul> or <ol>');
+                        $toRemove = true;
+                    } elseif ($input > floor($p / 3)) {
+                        $this->logger->debug(' too many <input> elements');
+                        $toRemove = true;
+                    } elseif ($contentLength < 6 && (0 === $embedCount && (0 === $img || $img > 2))) {
+                        $this->logger->debug(' content length less than 6 chars, 0 embeds and either 0 images or more than 2 images');
+                        $toRemove = true;
+                    } elseif ($weight < 25 && $linkDensity > 0.25) {
+                        $this->logger->debug(' weight is ' . $weight . ' < 25 and link density is ' . sprintf('%.2f', $linkDensity) . ' > 0.25');
+                        $toRemove = true;
+                    } elseif ($a > 2 && ($weight >= 25 && $linkDensity > 0.5)) {
+                        $this->logger->debug('  more than 2 links and weight is ' . $weight . ' >= 25 but link density is ' . sprintf('%.2f', $linkDensity) . ' > 0.5');
+                        $toRemove = true;
+                    } elseif ($embedCount > 3) {
+                        $this->logger->debug(' more than 3 embeds');
+                        $toRemove = true;
+                    }
                 } else {
-                    $this->logger->debug('Setting body to a raw HTML of original page!');
-                    $topCandidate->innerHTML = $page->documentElement->innerHTML;
-                    $page->documentElement->innerHTML = '';
-                    $this->reinitBody();
-                    $page->documentElement->appendChild($topCandidate);
+                    if ($img > $p) {
+                        $this->logger->debug(' more image elements than paragraph elements');
+                        $toRemove = true;
+                    } elseif ($li > $p && 'ul' !== $tag && 'ol' !== $tag) {
+                        $this->logger->debug('  too many <li> elements, and parent is not <ul> or <ol>');
+                        $toRemove = true;
+                    } elseif ($input > floor($p / 3)) {
+                        $this->logger->debug('  too many <input> elements');
+                        $toRemove = true;
+                    } elseif ($contentLength < 10 && (0 === $img || $img > 2)) {
+                        $this->logger->debug('  content length less than 10 chars and 0 images, or more than 2 images');
+                        $toRemove = true;
+                    } elseif ($weight < 25 && $linkDensity > 0.2) {
+                        $this->logger->debug('  weight is ' . $weight . ' lower than 25 and link density is ' . sprintf('%.2f', $linkDensity) . ' > 0.2');
+                        $toRemove = true;
+                    } elseif ($weight >= 25 && $linkDensity > 0.5) {
+                        $this->logger->debug('  weight at least 25 but link density is ' . sprintf('%.2f', $linkDensity) . ' > 0.5');
+                        $toRemove = true;
+                    } elseif ((1 === $embedCount && $contentLength < 75) || $embedCount > 1) {
+                        $this->logger->debug('  1 embed and content length smaller than 75 chars, or more than one embed');
+                        $toRemove = true;
+                    }
                 }
-            } else {
-                $topCandidate->innerHTML = $page->innerHTML;
-                $page->innerHTML = '';
-                $page->appendChild($topCandidate);
-            }
 
-            $this->initializeNode($topCandidate);
-        }
-
-        // Set table as the main node if resulted data is table element.
-        $tagName = $topCandidate->tagName;
-        if (strcasecmp($tagName, 'td') === 0 || strcasecmp($tagName, 'tr') === 0) {
-            $up = $topCandidate;
-
-            if ($up->parentNode instanceof \DOMElement) {
-                $up = $up->parentNode;
-
-                if (strcasecmp($up->tagName, 'table') === 0) {
-                    $topCandidate = $up;
+                if ($toRemove) {
+                    $this->logger->debug('Removing...');
+                    $node->parentNode->removeChild($node);
                 }
             }
         }
+    }
 
-        $this->logger->debug('Top candidate: ' . $topCandidate->getNodePath());
-
-        /*
-         * Now that we have the top candidate, look through its siblings for content that might also be related.
-         * Things like preambles, content split by ads that we removed, etc.
-         */
-        $articleContent = $this->dom->createElement('div');
-        $articleContent->setAttribute('class', 'readability-content');
-        $siblingScoreThreshold = max(10, ((int) $topCandidate->getAttribute('readability')) * 0.2);
-        $siblingNodes = $topCandidate->parentNode->childNodes;
-
-        if (!isset($siblingNodes)) {
-            $siblingNodes = new stdClass();
-            $siblingNodes->length = 0;
-        }
-
-        for ($s = 0, $sl = $siblingNodes->length; $s < $sl; ++$s) {
-            $siblingNode = $siblingNodes->item($s);
-            $siblingNodeName = $siblingNode->nodeName;
-            $append = false;
-            $this->logger->debug('Looking at sibling node: ' . $siblingNode->getNodePath() . (($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : ''));
-
-            if ($siblingNode->isSameNode($topCandidate)) {
-                $append = true;
-            }
-
-            $contentBonus = 0;
-
-            // Give a bonus if sibling nodes and top candidates have the same classname.
-            if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->getAttribute('class') === $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') !== '') {
-                $contentBonus += ((int) $topCandidate->getAttribute('readability')) * 0.2;
-            }
-
-            if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') && (((int) $siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold) {
-                $append = true;
-            }
-
-            if (strcasecmp($siblingNodeName, 'p') === 0) {
-                $linkDensity = $this->getLinkDensity($siblingNode);
-                $nodeContent = $this->getInnerText($siblingNode, true, true);
-                $nodeLength = mb_strlen($nodeContent);
-
-                if (($nodeLength > self::MIN_NODE_LENGTH && $linkDensity < self::MAX_LINK_DENSITY)
-                    || ($nodeLength < self::MIN_NODE_LENGTH && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent))) {
-                    $append = true;
-                }
-            }
-
-            if ($append) {
-                $this->logger->debug('Appending node: ' . $siblingNode->getNodePath());
-
-                if (strcasecmp($siblingNodeName, 'div') !== 0 && strcasecmp($siblingNodeName, 'p') !== 0) {
-                    // We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident.
-                    $this->logger->debug('Altering siblingNode "' . $siblingNodeName . '" to "div".');
-                    $nodeToAppend = $this->dom->createElement('div');
+    /**
+     * Clean out spurious headers from an Element. Checks things like classnames and link density.
+     *
+     * @param \DOMElement $e
+     */
+    public function cleanHeaders($e)
+    {
+        for ($headerIndex = 1; $headerIndex < 3; ++$headerIndex) {
+            $headers = $e->getElementsByTagName('h' . $headerIndex);
 
-                    try {
-                        $nodeToAppend->setAttribute('alt', $siblingNodeName);
-                        $nodeToAppend->innerHTML = $siblingNode->innerHTML;
-                    } catch (\Exception $e) {
-                        $this->logger->debug('Could not alter siblingNode "' . $siblingNodeName . '" to "div", reverting to original.');
-                        $nodeToAppend = $siblingNode;
-                        --$s;
-                        --$sl;
-                    }
-                } else {
-                    $nodeToAppend = $siblingNode;
-                    --$s;
-                    --$sl;
+            for ($i = $headers->length - 1; $i >= 0; --$i) {
+                if ($this->getWeight($headers->item($i)) < 0 || $this->getLinkDensity($headers->item($i)) > 0.33) {
+                    $headers->item($i)->parentNode->removeChild($headers->item($i));
                 }
-
-                // To ensure a node does not interfere with readability styles, remove its classnames & ids.
-                // Now done via RegExp post_filter.
-                //$nodeToAppend->removeAttribute('class');
-                //$nodeToAppend->removeAttribute('id');
-                // Append sibling and subtract from our list as appending removes a node.
-                $articleContent->appendChild($nodeToAppend);
             }
         }
+    }
 
-        unset($xpath);
+    /**
+     * Tell whether the given flag is currently switched on.
+     *
+     * @param int $flag bit mask tested against the current flags
+     *
+     * @return bool true when the bitwise AND of the flags and $flag is positive
+     */
+    public function flagIsActive($flag)
+    {
+        return 0 < ($this->flags & $flag);
+    }
 
-        // So we have all of the content that we need. Now we clean it up for presentation.
-        $this->prepArticle($articleContent);
+    /**
+     * Switch a flag on by OR-ing it into the current flags.
+     *
+     * @param int $flag bit mask to enable
+     */
+    public function addFlag($flag)
+    {
+        $this->flags |= $flag;
+    }
 
-        /*
-         * Now that we've gone through the full algorithm, check to see if we got any meaningful content.
-         * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
-         * likelihood of finding the content, and the sieve approach gives us a higher likelihood of
-         * finding the -right- content.
-         */
-        if (mb_strlen($this->getInnerText($articleContent, false)) < self::MIN_ARTICLE_LENGTH) {
-            $this->reinitBody();
+    /**
+     * Switch a flag off by AND-ing the current flags with its complement.
+     *
+     * @param int $flag bit mask to disable
+     */
+    public function removeFlag($flag)
+    {
+        $this->flags &= ~$flag;
+    }
 
-            if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) {
-                $this->removeFlag(self::FLAG_STRIP_UNLIKELYS);
-                $this->logger->debug('...content is shorter than ' . self::MIN_ARTICLE_LENGTH . " letters, trying not to strip unlikely content.\n");
+    /**
+     * Pass $msg straight through to the logger's debug() method.
+     *
+     * @deprecated use $this->logger->debug() instead
+     * @codeCoverageIgnore
+     */
+    protected function dbg($msg)
+    {
+        $this->logger->debug($msg);
+    }
 
-                return $this->grabArticle($this->body);
-            } elseif ($this->flagIsActive(self::FLAG_WEIGHT_ATTRIBUTES)) {
-                $this->removeFlag(self::FLAG_WEIGHT_ATTRIBUTES);
-                $this->logger->debug('...content is shorter than ' . self::MIN_ARTICLE_LENGTH . " letters, trying not to weight attributes.\n");
+    /**
+     * Dump debug info. No-op: the method body is empty.
+     *
+     * @deprecated since Monolog gathers the logs, this is no longer needed
+     * @codeCoverageIgnore
+     */
+    protected function dump_dbg()
+    {
+    }
 
-                return $this->grabArticle($this->body);
-            } elseif ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
-                $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY);
-                $this->logger->debug('...content is shorter than ' . self::MIN_ARTICLE_LENGTH . " letters, trying not to clean at all.\n");
+    /**
+     * Get the article title as an H1.
+     *
+     * @return \DOMElement
+     */
+    protected function getArticleTitle()
+    {
+        try {
+            $curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0));
+        } catch (\Exception $e) {
+            $curTitle = '';
+            $origTitle = '';
+        }
 
-                return $this->grabArticle($this->body);
+        if (preg_match('/ [\|\-] /', $curTitle)) {
+            $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle);
+            if (count(explode(' ', $curTitle)) < 3) {
+                $curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle);
+            }
+        } elseif (false !== strpos($curTitle, ': ')) {
+            $curTitle = preg_replace('/.*:(.*)/i', '$1', $origTitle);
+            if (count(explode(' ', $curTitle)) < 3) {
+                $curTitle = preg_replace('/[^:]*[:](.*)/i', '$1', $origTitle);
+            }
+        } elseif (mb_strlen($curTitle) > 150 || mb_strlen($curTitle) < 15) {
+            $hOnes = $this->dom->getElementsByTagName('h1');
+            if (1 === $hOnes->length) {
+                $curTitle = $this->getInnerText($hOnes->item(0));
             }
+        }
 
-            return false;
+        $curTitle = trim($curTitle);
+        if (count(explode(' ', $curTitle)) <= 4) {
+            $curTitle = $origTitle;
         }
 
-        return $articleContent;
+        $articleTitle = $this->dom->createElement('h1');
+        $articleTitle->innerHTML = $curTitle;
+
+        return $articleTitle;
     }
 
     /**
-     * Get the inner text of a node.
-     * This also strips out any excess whitespace to be found.
-     *
-     * @param \DOMElement $e
-     * @param bool        $normalizeSpaces (default: true)
-     * @param bool        $flattenLines    (default: false)
-     *
-     * @return string
+     * Prepare the HTML document for readability to scrape it.
+     * This includes things like stripping javascript, CSS, and handling terrible markup.
      */
-    public function getInnerText($e, $normalizeSpaces = true, $flattenLines = false)
+    protected function prepDocument()
     {
-        if (null === $e || !isset($e->textContent) || $e->textContent === '') {
-            return '';
+        /*
+         * In some cases a body element can't be found (if the HTML is totally hosed for example)
+         * so we create a new body node and append it to the document.
+         */
+        if (null === $this->body) {
+            $this->body = $this->dom->createElement('body');
+            $this->dom->documentElement->appendChild($this->body);
         }
 
-        $textContent = trim($e->textContent);
+        $this->body->setAttribute('class', 'readabilityBody');
 
-        if ($flattenLines) {
-            $textContent = mb_ereg_replace('(?:[\r\n](?:\s|&nbsp;)*)+', '', $textContent);
-        } elseif ($normalizeSpaces) {
-            $textContent = mb_ereg_replace('\s\s+', ' ', $textContent);
+        // Remove all style tags in head.
+        $styleTags = $this->dom->getElementsByTagName('style');
+        for ($i = $styleTags->length - 1; $i >= 0; --$i) {
+            $styleTags->item($i)->parentNode->removeChild($styleTags->item($i));
         }
 
-        return $textContent;
+        $linkTags = $this->dom->getElementsByTagName('link');
+        for ($i = $linkTags->length - 1; $i >= 0; --$i) {
+            $linkTags->item($i)->parentNode->removeChild($linkTags->item($i));
+        }
     }
 
     /**
-     * Remove the style attribute on every $e and under.
+     * Initialize a node with the readability object. Also checks the
+     * className/id for special names to add to its score.
      *
-     * @param \DOMElement $e
+     * @param \DOMElement $node
      */
-    public function cleanStyles($e)
+    protected function initializeNode($node)
     {
-        if (!is_object($e)) {
+        if (!isset($node->tagName)) {
             return;
         }
 
-        $elems = $e->getElementsByTagName('*');
+        $readability = $this->dom->createAttribute('readability');
+        // this is our contentScore
+        $readability->value = 0;
+        $node->setAttributeNode($readability);
 
-        foreach ($elems as $elem) {
-            $elem->removeAttribute('style');
+        // using strtoupper just in case
+        switch (strtoupper($node->tagName)) {
+            case 'ARTICLE':
+                $readability->value += 15;
+                // no break
+            case 'DIV':
+                $readability->value += 5;
+                break;
+            case 'PRE':
+            case 'CODE':
+            case 'TD':
+            case 'BLOCKQUOTE':
+            case 'FIGURE':
+                $readability->value += 3;
+                break;
+            case 'SECTION':
+                // often misused
+                // $readability->value += 2;
+                break;
+            case 'OL':
+            case 'UL':
+            case 'DL':
+            case 'DD':
+            case 'DT':
+            case 'LI':
+                $readability->value -= 2 * round($this->getLinkDensity($node), 0, PHP_ROUND_HALF_UP);
+                break;
+            case 'ASIDE':
+            case 'FOOTER':
+            case 'HEADER':
+            case 'ADDRESS':
+            case 'FORM':
+            case 'BUTTON':
+            case 'TEXTAREA':
+            case 'INPUT':
+            case 'NAV':
+                $readability->value -= 3;
+                break;
+            case 'H1':
+            case 'H2':
+            case 'H3':
+            case 'H4':
+            case 'H5':
+            case 'H6':
+            case 'TH':
+            case 'HGROUP':
+                $readability->value -= 5;
+                break;
         }
-    }
 
-    /**
-     * Get comma number for a given text.
-     *
-     * @param string $text
-     *
-     * @return int
-     */
-    public function getCommaCount($text)
-    {
-        return substr_count($text, ',');
+        $readability->value += $this->getWeight($node);
     }
 
     /**
-     * Get words number for a given text if words separated by a space.
-     * Input string should be normalized.
+     * Using a variety of metrics (content score, classname, element types), find the content that is
+     * most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
      *
-     * @param string $text
+     * @param \DOMElement $page
      *
-     * @return int
+     * @return \DOMElement|bool
      */
-    public function getWordCount($text)
+    protected function grabArticle($page = null)
     {
-        return substr_count($text, ' ');
-    }
+        if (!$page) {
+            $page = $this->dom;
+        }
 
-    /**
-     * Get the density of links as a percentage of the content
-     * This is the amount of text that is inside a link divided by the total text in the node.
-     * Can exclude external references to differentiate between simple text and menus/infoblocks.
-     *
-     * @param \DOMElement $e
-     * @param string      $excludeExternal
-     *
-     * @return int
-     */
-    public function getLinkDensity($e, $excludeExternal = false)
-    {
-        $links = $e->getElementsByTagName('a');
-        $textLength = mb_strlen($this->getInnerText($e, true, true));
-        $linkLength = 0;
+        $xpath = null;
+        $nodesToScore = array();
 
-        for ($dRe = $this->domainRegExp, $i = 0, $il = $links->length; $i < $il; ++$i) {
-            if ($excludeExternal && $dRe && !preg_match($dRe, $links->item($i)->getAttribute('href'))) {
-                continue;
-            }
-            $linkLength += mb_strlen($this->getInnerText($links->item($i)));
+        if ($page instanceof \DOMDocument && isset($page->documentElement)) {
+            $xpath = new \DOMXPath($page);
         }
 
-        if ($textLength > 0 && $linkLength > 0) {
-            return $linkLength / $textLength;
-        }
+        $allElements = $page->getElementsByTagName('*');
 
-        return 0;
-    }
+        for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); ++$nodeIndex) {
+            $tagName = $node->tagName;
 
-    /**
-     * Get an element weight by attribute.
-     * Uses regular expressions to tell if this element looks good or bad.
-     *
-     * @param \DOMElement $element
-     * @param string      $attribute
-     *
-     * @return int
-     */
-    protected function weightAttribute($element, $attribute)
-    {
-        if (!$element->hasAttribute($attribute)) {
-            return 0;
+            // Some well-known sites use sections as paragraphs.
+            if (0 === strcasecmp($tagName, 'p') || 0 === strcasecmp($tagName, 'td') || 0 === strcasecmp($tagName, 'pre') || 0 === strcasecmp($tagName, 'section')) {
+                $nodesToScore[] = $node;
+            }
+
+            // Turn divs into P tags where they have been used inappropriately
+            //  (as in, where they contain no other block level elements).
+            if (0 === strcasecmp($tagName, 'div') || 0 === strcasecmp($tagName, 'article') || 0 === strcasecmp($tagName, 'section')) {
+                if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) {
+                    $newNode = $this->dom->createElement('p');
+
+                    try {
+                        $newNode->innerHTML = $node->innerHTML;
+
+                        $node->parentNode->replaceChild($newNode, $node);
+                        --$nodeIndex;
+                        $nodesToScore[] = $newNode;
+                    } catch (\Exception $e) {
+                        $this->logger->error('Could not alter div/article to p, reverting back to div: ' . $e->getMessage());
+                    }
+                } else {
+                    // Will change these P elements back to text nodes after processing.
+                    for ($i = 0, $il = $node->childNodes->length; $i < $il; ++$i) {
+                        $childNode = $node->childNodes->item($i);
+
+                        // it looks like sometimes the loop is going too far and we are retrieving a non-existent child
+                        if (null === $childNode) {
+                            continue;
+                        }
+
+                        // executable tags (<?php or <?xml) warning
+                        if (is_object($childNode) && 'DOMProcessingInstruction' === get_class($childNode)) {
+                            $childNode->parentNode->removeChild($childNode);
+
+                            continue;
+                        }
+
+                        if (XML_TEXT_NODE === $childNode->nodeType) {
+                            $p = $this->dom->createElement('p');
+                            $p->innerHTML = $childNode->nodeValue;
+                            $p->setAttribute('data-readability-styled', 'true');
+                            $childNode->parentNode->replaceChild($p, $childNode);
+                        }
+                    }
+                }
+            }
         }
-        $weight = 0;
 
-        // $attributeValue = trim($element->getAttribute('class')." ".$element->getAttribute('id'));
-        $attributeValue = trim($element->getAttribute($attribute));
+        /*
+         * Loop through all paragraphs, and assign a score to them based on how content-y they look.
+         * Then add their score to their parent node.
+         *
+         * A score is determined by things like number of commas, class names, etc.
+         * Maybe eventually link density.
+         */
+        for ($pt = 0, $scored = count($nodesToScore); $pt < $scored; ++$pt) {
+            $parentNode = $nodesToScore[$pt]->parentNode;
+
+            // No parent node? Move on...
+            if (!$parentNode) {
+                continue;
+            }
+
+            $grandParentNode = $parentNode->parentNode instanceof \DOMElement ? $parentNode->parentNode : null;
+            $innerText = $this->getInnerText($nodesToScore[$pt]);
 
-        if ($attributeValue !== '') {
-            if (preg_match($this->regexps['negative'], $attributeValue)) {
-                $weight -= 25;
+            // If this paragraph is less than MIN_PARAGRAPH_LENGTH (default:20) characters, don't even count it.
+            if (mb_strlen($innerText) < self::MIN_PARAGRAPH_LENGTH) {
+                continue;
             }
-            if (preg_match($this->regexps['positive'], $attributeValue)) {
-                $weight += 25;
+
+            // Initialize readability data for the parent.
+            if (!$parentNode->hasAttribute('readability')) {
+                $this->initializeNode($parentNode);
+                $parentNode->setAttribute('data-candidate', 'true');
             }
-            if (preg_match($this->regexps['unlikelyCandidates'], $attributeValue)) {
-                $weight -= 5;
+
+            // Initialize readability data for the grandparent.
+            if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName)) {
+                $this->initializeNode($grandParentNode);
+                $grandParentNode->setAttribute('data-candidate', 'true');
             }
-            if (preg_match($this->regexps['okMaybeItsACandidate'], $attributeValue)) {
-                $weight += 5;
+            // Add a point for the paragraph itself as a base.
+            $contentScore = 1;
+            // Add points for any commas within this paragraph.
+            $contentScore += $this->getCommaCount($innerText);
+            // For every SCORE_CHARS_IN_PARAGRAPH (default:100) characters in this paragraph, add another point. Up to 3 points.
+            $contentScore += min(floor(mb_strlen($innerText) / self::SCORE_CHARS_IN_PARAGRAPH), 3);
+            // For every SCORE_WORDS_IN_PARAGRAPH (default:20) words in this paragraph, add another point. Up to 3 points.
+            $contentScore += min(floor($this->getWordCount($innerText) / self::SCORE_WORDS_IN_PARAGRAPH), 3);
+            /* TEST: For every positive/negative parent tag, add/subtract half a point. Up to 3 points. *\/
+            $up = $nodesToScore[$pt];
+            $score = 0;
+            while ($up->parentNode instanceof \DOMElement) {
+                $up = $up->parentNode;
+                if (preg_match($this->regexps['positive'], $up->getAttribute('class') . ' ' . $up->getAttribute('id'))) {
+                    $score += 0.5;
+                } elseif (preg_match($this->regexps['negative'], $up->getAttribute('class') . ' ' . $up->getAttribute('id'))) {
+                    $score -= 0.5;
+                }
+            }
+            $score = floor($score);
+            $contentScore += max(min($score, 3), -3);/**/
+
+            // Add the score to the parent. The grandparent gets half.
+            $parentNode->getAttributeNode('readability')->value += $contentScore;
+            if ($grandParentNode) {
+                $grandParentNode->getAttributeNode('readability')->value += $contentScore / self::GRANDPARENT_SCORE_DIVISOR;
             }
         }
 
-        return $weight;
-    }
+        /*
+         * Node prepping: trash nodes that look cruddy (like ones with the class name "comment", etc).
+         * This is faster to do before scoring but safer after.
+         */
+        if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS) && $xpath) {
+            $candidates = $xpath->query('.//*[(self::footer and count(//footer)<2) or (self::aside and count(//aside)<2)]', $page->documentElement);
+            $node = null;
 
-    /**
-     * Get an element relative weight.
-     *
-     * @param \DOMElement $e
-     *
-     * @return int
-     */
-    public function getWeight($e)
-    {
-        if (!$this->flagIsActive(self::FLAG_WEIGHT_ATTRIBUTES)) {
-            return 0;
-        }
+            for ($c = $candidates->length - 1; $c >= 0; --$c) {
+                $node = $candidates->item($c);
+                // node should be readable but not inside of an article otherwise it's probably non-readable block
+                if ($node->hasAttribute('readability') && (int) $node->getAttributeNode('readability')->value < 40 && ($node->parentNode ? 0 !== strcasecmp($node->parentNode->tagName, 'article') : true)) {
+                    $this->logger->debug('Removing unlikely candidate (using note) ' . $node->getNodePath() . ' by "' . $node->tagName . '" with readability ' . ($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0));
+                    $node->parentNode->removeChild($node);
+                }
+            }
 
-        $weight = 0;
-        // Look for a special classname
-        $weight += $this->weightAttribute($e, 'class');
-        // Look for a special ID
-        $weight += $this->weightAttribute($e, 'id');
+            $candidates = $xpath->query('.//*[not(self::body) and (@class or @id or @style) and ((number(@readability) < 40) or not(@readability))]', $page->documentElement);
+            $node = null;
 
-        return $weight;
-    }
+            for ($c = $candidates->length - 1; $c >= 0; --$c) {
+                $node = $candidates->item($c);
 
-    /**
-     * Remove extraneous break tags from a node.
-     *
-     * @param \DOMElement $node
-     */
-    public function killBreaks($node)
-    {
-        $html = $node->innerHTML;
-        $html = preg_replace($this->regexps['killBreaks'], '<br />', $html);
-        $node->innerHTML = $html;
-    }
+                // Remove unlikely candidates
+                $unlikelyMatchString = $node->getAttribute('class') . ' ' . $node->getAttribute('id') . ' ' . $node->getAttribute('style');
 
-    /**
-     * Clean a node of all elements of type "tag".
-     * (Unless it's a youtube/vimeo video. People love movies.).
-     *
-     * Updated 2012-09-18 to preserve youtube/vimeo iframes
-     *
-     * @param \DOMElement $e
-     * @param string      $tag
-     */
-    public function clean($e, $tag)
-    {
-        $currentItem = null;
-        $targetList = $e->getElementsByTagName($tag);
-        $isEmbed = ($tag === 'audio' || $tag === 'video' || $tag === 'iframe' || $tag === 'object' || $tag === 'embed');
+                if (mb_strlen($unlikelyMatchString) > 3 && // don't process "empty" strings
+                    preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) &&
+                    !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString)
+                ) {
+                    $this->logger->debug('Removing unlikely candidate (using conf) ' . $node->getNodePath() . ' by "' . $unlikelyMatchString . '" with readability ' . ($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0));
+                    $node->parentNode->removeChild($node);
+                    --$nodeIndex;
+                }
+            }
+            unset($candidates);
+        }
 
-        for ($y = $targetList->length - 1; $y >= 0; --$y) {
-            // Allow youtube and vimeo videos through as people usually want to see those.
-            $currentItem = $targetList->item($y);
+        /*
+         * After we've calculated scores, loop through all of the possible candidate nodes we found
+         * and find the one with the highest score.
+         */
+        $topCandidate = null;
+        if ($xpath) {
+            // Using array of DOMElements after deletion is a path to DOOMElement.
+            $candidates = $xpath->query('.//*[@data-candidate]', $page->documentElement);
 
-            if ($isEmbed) {
-                $attributeValues = $currentItem->getAttribute('src') . ' ' . $currentItem->getAttribute('href');
+            for ($c = $candidates->length - 1; $c >= 0; --$c) {
+                $item = $candidates->item($c);
 
-                // First, check the elements attributes to see if any of them contain known media hosts
-                if (preg_match($this->regexps['media'], $attributeValues)) {
-                    continue;
+                // Scale the final candidates score based on link density. Good content should have a
+                // relatively small link density (5% or less) and be mostly unaffected by this operation.
+                // If not for this we would have used XPath to find maximum @readability.
+                $readability = $item->getAttributeNode('readability');
+                $readability->value = round($readability->value * (1 - $this->getLinkDensity($item)), 0, PHP_ROUND_HALF_UP);
+
+                if (!$topCandidate || $readability->value > (int) $topCandidate->getAttribute('readability')) {
+                    $this->logger->debug('Candidate: ' . $item->getNodePath() . ' (' . $item->getAttribute('class') . ':' . $item->getAttribute('id') . ') with score ' . $readability->value);
+                    $topCandidate = $item;
                 }
+            }
 
-                // Then check the elements inside this element for the same.
-                if (preg_match($this->regexps['media'], $targetList->item($y)->innerHTML)) {
-                    continue;
+            unset($candidates);
+        }
+
+        /*
+         * If we still have no top candidate, just use the body as a last resort.
+         * We also have to copy the body node so it is something we can modify.
+         */
+        if (null === $topCandidate || 0 === strcasecmp($topCandidate->tagName, 'body')) {
+            $topCandidate = $this->dom->createElement('div');
+
+            if ($page instanceof \DOMDocument) {
+                if (!isset($page->documentElement)) {
+                    // we don't have a body either? what a mess! :)
+                    $this->logger->debug('The page has no body!');
+                } else {
+                    $this->logger->debug('Setting body to a raw HTML of original page!');
+                    $topCandidate->innerHTML = $page->documentElement->innerHTML;
+                    $page->documentElement->innerHTML = '';
+                    $this->reinitBody();
+                    $page->documentElement->appendChild($topCandidate);
                 }
+            } else {
+                $topCandidate->innerHTML = $page->innerHTML;
+                $page->innerHTML = '';
+                $page->appendChild($topCandidate);
             }
 
-            $currentItem->parentNode->removeChild($currentItem);
+            $this->initializeNode($topCandidate);
         }
-    }
 
-    /**
-     * Clean an element of all tags of type "tag" if they look fishy.
-     * "Fishy" is an algorithm based on content length, classnames,
-     * link density, number of images & embeds, etc.
-     *
-     * @param \DOMElement $e
-     * @param string      $tag
-     */
-    public function cleanConditionally($e, $tag)
-    {
-        if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
-            return;
+        // Set table as the main node if resulted data is table element.
+        $tagName = $topCandidate->tagName;
+        if (0 === strcasecmp($tagName, 'td') || 0 === strcasecmp($tagName, 'tr')) {
+            $up = $topCandidate;
+
+            if ($up->parentNode instanceof \DOMElement) {
+                $up = $up->parentNode;
+
+                if (0 === strcasecmp($up->tagName, 'table')) {
+                    $topCandidate = $up;
+                }
+            }
         }
 
-        $tagsList = $e->getElementsByTagName($tag);
-        $curTagsLength = $tagsList->length;
-        $node = null;
+        $this->logger->debug('Top candidate: ' . $topCandidate->getNodePath());
 
         /*
-         * Gather counts for other typical elements embedded within.
-         * Traverse backwards so we can remove nodes at the same time without effecting the traversal.
-         *
-         * TODO: Consider taking into account original contentScore here.
+         * Now that we have the top candidate, look through its siblings for content that might also be related.
+         * Things like preambles, content split by ads that we removed, etc.
          */
-        for ($i = $curTagsLength - 1; $i >= 0; --$i) {
-            $node = $tagsList->item($i);
-            $weight = $this->getWeight($node);
-            $contentScore = ($node->hasAttribute('readability')) ? (int) $node->getAttribute('readability') : 0;
-            $this->logger->debug('Start conditional cleaning of ' . $node->getNodePath() . ' (class=' . $node->getAttribute('class') . '; id=' . $node->getAttribute('id') . ')' . (($node->hasAttribute('readability')) ? (' with score ' . $node->getAttribute('readability')) : ''));
+        $articleContent = $this->dom->createElement('div');
+        $articleContent->setAttribute('class', 'readability-content');
+        $siblingScoreThreshold = max(10, ((int) $topCandidate->getAttribute('readability')) * 0.2);
+        $siblingNodes = $topCandidate->parentNode->childNodes;
 
-            if ($weight + $contentScore < 0) {
-                $this->logger->debug('Removing...');
-                $node->parentNode->removeChild($node);
-            } elseif ($this->getCommaCount($this->getInnerText($node)) < self::MIN_COMMAS_IN_PARAGRAPH) {
-                /*
-                 * If there are not very many commas, and the number of
-                 * non-paragraph elements is more than paragraphs or other ominous signs, remove the element.
-                 */
-                $p = $node->getElementsByTagName('p')->length;
-                $img = $node->getElementsByTagName('img')->length;
-                $li = $node->getElementsByTagName('li')->length - 100;
-                $input = $node->getElementsByTagName('input')->length;
-                $a = $node->getElementsByTagName('a')->length;
-                $embedCount = 0;
-                $embeds = $node->getElementsByTagName('embed');
+        if (!isset($siblingNodes)) {
+            $siblingNodes = new stdClass();
+            $siblingNodes->length = 0;
+        }
+
+        for ($s = 0, $sl = $siblingNodes->length; $s < $sl; ++$s) {
+            $siblingNode = $siblingNodes->item($s);
+            $siblingNodeName = $siblingNode->nodeName;
+            $append = false;
+            $this->logger->debug('Looking at sibling node: ' . $siblingNode->getNodePath() . ((XML_ELEMENT_NODE === $siblingNode->nodeType && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : ''));
+
+            if ($siblingNode->isSameNode($topCandidate)) {
+                $append = true;
+            }
 
-                for ($ei = 0, $il = $embeds->length; $ei < $il; ++$ei) {
-                    if (preg_match($this->regexps['media'], $embeds->item($ei)->getAttribute('src'))) {
-                        ++$embedCount;
-                    }
-                }
+            $contentBonus = 0;
 
-                $embeds = $node->getElementsByTagName('iframe');
-                for ($ei = 0, $il = $embeds->length; $ei < $il; ++$ei) {
-                    if (preg_match($this->regexps['media'], $embeds->item($ei)->getAttribute('src'))) {
-                        ++$embedCount;
-                    }
+            // Give a bonus if sibling nodes and top candidates have the same classname.
+            if (XML_ELEMENT_NODE === $siblingNode->nodeType && $siblingNode->getAttribute('class') === $topCandidate->getAttribute('class') && '' !== $topCandidate->getAttribute('class')) {
+                $contentBonus += ((int) $topCandidate->getAttribute('readability')) * 0.2;
+            }
+
+            if (XML_ELEMENT_NODE === $siblingNode->nodeType && $siblingNode->hasAttribute('readability') && (((int) $siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold) {
+                $append = true;
+            }
+
+            if (0 === strcasecmp($siblingNodeName, 'p')) {
+                $linkDensity = $this->getLinkDensity($siblingNode);
+                $nodeContent = $this->getInnerText($siblingNode, true, true);
+                $nodeLength = mb_strlen($nodeContent);
+
+                if (($nodeLength > self::MIN_NODE_LENGTH && $linkDensity < self::MAX_LINK_DENSITY)
+                    || ($nodeLength < self::MIN_NODE_LENGTH && 0 === $linkDensity && preg_match('/\.( |$)/', $nodeContent))) {
+                    $append = true;
                 }
+            }
 
-                $linkDensity = $this->getLinkDensity($node, true);
-                $contentLength = mb_strlen($this->getInnerText($node));
-                $toRemove = false;
+            if ($append) {
+                $this->logger->debug('Appending node: ' . $siblingNode->getNodePath());
 
-                if ($this->lightClean) {
-                    if ($li > $p && $tag !== 'ul' && $tag !== 'ol') {
-                        $this->logger->debug(' too many <li> elements, and parent is not <ul> or <ol>');
-                        $toRemove = true;
-                    } elseif ($input > floor($p / 3)) {
-                        $this->logger->debug(' too many <input> elements');
-                        $toRemove = true;
-                    } elseif ($contentLength < 6 && ($embedCount === 0 && ($img === 0 || $img > 2))) {
-                        $this->logger->debug(' content length less than 6 chars, 0 embeds and either 0 images or more than 2 images');
-                        $toRemove = true;
-                    } elseif ($weight < 25 && $linkDensity > 0.25) {
-                        $this->logger->debug(' weight is ' . $weight . ' < 25 and link density is ' . sprintf('%.2f', $linkDensity) . ' > 0.25');
-                        $toRemove = true;
-                    } elseif ($a > 2 && ($weight >= 25 && $linkDensity > 0.5)) {
-                        $this->logger->debug('  more than 2 links and weight is ' . $weight . ' > 25 but link density is ' . sprintf('%.2f', $linkDensity) . ' > 0.5');
-                        $toRemove = true;
-                    } elseif ($embedCount > 3) {
-                        $this->logger->debug(' more than 3 embeds');
-                        $toRemove = true;
+                if (0 !== strcasecmp($siblingNodeName, 'div') && 0 !== strcasecmp($siblingNodeName, 'p')) {
+                    // We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident.
+                    $this->logger->debug('Altering siblingNode "' . $siblingNodeName . '" to "div".');
+                    $nodeToAppend = $this->dom->createElement('div');
+
+                    try {
+                        $nodeToAppend->setAttribute('alt', $siblingNodeName);
+                        $nodeToAppend->innerHTML = $siblingNode->innerHTML;
+                    } catch (\Exception $e) {
+                        $this->logger->debug('Could not alter siblingNode "' . $siblingNodeName . '" to "div", reverting to original.');
+                        $nodeToAppend = $siblingNode;
+                        --$s;
+                        --$sl;
                     }
                 } else {
-                    if ($img > $p) {
-                        $this->logger->debug(' more image elements than paragraph elements');
-                        $toRemove = true;
-                    } elseif ($li > $p && $tag !== 'ul' && $tag !== 'ol') {
-                        $this->logger->debug('  too many <li> elements, and parent is not <ul> or <ol>');
-                        $toRemove = true;
-                    } elseif ($input > floor($p / 3)) {
-                        $this->logger->debug('  too many <input> elements');
-                        $toRemove = true;
-                    } elseif ($contentLength < 10 && ($img === 0 || $img > 2)) {
-                        $this->logger->debug('  content length less than 10 chars and 0 images, or more than 2 images');
-                        $toRemove = true;
-                    } elseif ($weight < 25 && $linkDensity > 0.2) {
-                        $this->logger->debug('  weight is ' . $weight . ' lower than 0 and link density is ' . sprintf('%.2f', $linkDensity) . ' > 0.2');
-                        $toRemove = true;
-                    } elseif ($weight >= 25 && $linkDensity > 0.5) {
-                        $this->logger->debug('  weight above 25 but link density is ' . sprintf('%.2f', $linkDensity) . ' > 0.5');
-                        $toRemove = true;
-                    } elseif (($embedCount === 1 && $contentLength < 75) || $embedCount > 1) {
-                        $this->logger->debug('  1 embed and content length smaller than 75 chars, or more than one embed');
-                        $toRemove = true;
-                    }
+                    $nodeToAppend = $siblingNode;
+                    --$s;
+                    --$sl;
                 }
 
-                if ($toRemove) {
-                    $this->logger->debug('Removing...');
-                    $node->parentNode->removeChild($node);
-                }
+                // To ensure a node does not interfere with readability styles, remove its classnames & ids.
+                // Now done via RegExp post_filter.
+                //$nodeToAppend->removeAttribute('class');
+                //$nodeToAppend->removeAttribute('id');
+                // Append sibling and subtract from our list as appending removes a node.
+                $articleContent->appendChild($nodeToAppend);
             }
         }
-    }
 
-    /**
-     * Clean out spurious headers from an Element. Checks things like classnames and link density.
-     *
-     * @param \DOMElement $e
-     */
-    public function cleanHeaders($e)
-    {
-        for ($headerIndex = 1; $headerIndex < 3; ++$headerIndex) {
-            $headers = $e->getElementsByTagName('h' . $headerIndex);
+        unset($xpath);
 
-            for ($i = $headers->length - 1; $i >= 0; --$i) {
-                if ($this->getWeight($headers->item($i)) < 0 || $this->getLinkDensity($headers->item($i)) > 0.33) {
-                    $headers->item($i)->parentNode->removeChild($headers->item($i));
-                }
+        // So we have all of the content that we need. Now we clean it up for presentation.
+        $this->prepArticle($articleContent);
+
+        /*
+         * Now that we've gone through the full algorithm, check to see if we got any meaningful content.
+         * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
+         * likelihood of finding the content, and the sieve approach gives us a higher likelihood of
+         * finding the -right- content.
+         */
+        if (mb_strlen($this->getInnerText($articleContent, false)) < self::MIN_ARTICLE_LENGTH) {
+            $this->reinitBody();
+
+            if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) {
+                $this->removeFlag(self::FLAG_STRIP_UNLIKELYS);
+                $this->logger->debug('...content is shorter than ' . self::MIN_ARTICLE_LENGTH . " letters, trying not to strip unlikely content.\n");
+
+                return $this->grabArticle($this->body);
+            } elseif ($this->flagIsActive(self::FLAG_WEIGHT_ATTRIBUTES)) {
+                $this->removeFlag(self::FLAG_WEIGHT_ATTRIBUTES);
+                $this->logger->debug('...content is shorter than ' . self::MIN_ARTICLE_LENGTH . " letters, trying not to weight attributes.\n");
+
+                return $this->grabArticle($this->body);
+            } elseif ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
+                $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY);
+                $this->logger->debug('...content is shorter than ' . self::MIN_ARTICLE_LENGTH . " letters, trying not to clean at all.\n");
+
+                return $this->grabArticle($this->body);
             }
+
+            return false;
         }
+
+        return $articleContent;
     }
 
     /**
-     * Check if the given flag is active.
+     * Get an element weight by attribute.
+     * Uses regular expressions to tell if this element looks good or bad.
      *
-     * @param int $flag
+     * @param \DOMElement $element
+     * @param string      $attribute
      *
-     * @return bool
+     * @return int
      */
-    public function flagIsActive($flag)
+    protected function weightAttribute($element, $attribute)
     {
-        return ($this->flags & $flag) > 0;
-    }
+        if (!$element->hasAttribute($attribute)) {
+            return 0;
+        }
 
-    /**
-     * Add a flag.
-     *
-     * @param int $flag
-     */
-    public function addFlag($flag)
-    {
-        $this->flags = $this->flags | $flag;
-    }
+        $value = trim($element->getAttribute($attribute));
+        if ('' === $value) {
+            return 0;
+        }
 
-    /**
-     * Remove a flag.
-     *
-     * @param int $flag
-     */
-    public function removeFlag($flag)
-    {
-        $this->flags = $this->flags & ~$flag;
+        // Key in $this->regexps => points added to the weight when that pattern matches the value.
+        $points = array(
+            'negative' => -25,
+            'positive' => 25,
+            'unlikelyCandidates' => -5,
+            'okMaybeItsACandidate' => 5,
+        );
+        $total = 0;
+        foreach ($points as $name => $delta) {
+            if (preg_match($this->regexps[$name], $value)) {
+                $total += $delta;
+            }
+        }
+
+        return $total;
     }
 
     /**
@@ -1452,4 +1384,74 @@ class Readability implements LoggerAwareInterface
             $this->body->innerHTML = $this->bodyCache;
         }
     }
+
+    /**
+     * Parse $this->html into $this->dom: remember the raw markup, derive the domain regexp from the URL,
+     * apply the pre filters and Tidy (when enabled), then parse with html5lib or fall back to DOMDocument.
+     *
+     * @todo This should be called in init() instead of from __construct
+     */
+    private function loadHtml()
+    {
+        $this->original_html = $this->html;
+        $this->logger->debug('Parsing URL: ' . $this->url);
+
+        if ($this->url) {
+            // Escape every regex metacharacter of the host (not only dots) so it only matches literally.
+            $host = preg_replace('/www\d*\./', '', (string) parse_url($this->url, PHP_URL_HOST));
+            $this->domainRegExp = '/' . preg_quote($host, '/') . '/';
+        }
+
+        mb_internal_encoding('UTF-8');
+        mb_http_output('UTF-8');
+        mb_regex_encoding('UTF-8');
+
+        // HACK: dirty cleanup to replace some stuff; shouldn't use regexps with HTML but well...
+        if (!$this->flagIsActive(self::FLAG_DISABLE_PREFILTER)) {
+            foreach ($this->pre_filters as $search => $replace) {
+                $this->html = preg_replace($search, $replace, $this->html);
+            }
+            unset($search, $replace);
+        }
+
+        if ('' === trim($this->html)) {
+            $this->html = '<html></html>';
+        }
+
+        /*
+         * Use tidy (if it exists).
+         * This fixes problems with some sites which would otherwise trouble DOMDocument's HTML parsing.
+         * Although sometimes it makes matters worse, which is why there is an option to disable it.
+         */
+        if ($this->useTidy) {
+            $this->logger->debug('Tidying document');
+            $tidy = tidy_parse_string($this->html, $this->tidy_config, 'UTF8');
+            // tidy_parse_string() yields false on failure; keep the untidied HTML in that case.
+            if (false !== $tidy && tidy_clean_repair($tidy)) {
+                $this->tidied = true;
+                $this->html = preg_replace('/[\r\n]+/is', "\n", $tidy->value);
+            }
+            unset($tidy);
+        }
+
+        $this->html = mb_convert_encoding($this->html, 'HTML-ENTITIES', 'UTF-8');
+
+        if (!('html5lib' === $this->parser && ($this->dom = Parser::parse($this->html)))) {
+            // Collect libxml errors internally while parsing instead of emitting PHP warnings.
+            $previousErrorMode = libxml_use_internal_errors(true);
+            $this->dom = new \DOMDocument();
+            $this->dom->preserveWhiteSpace = false;
+
+            if (PHP_VERSION_ID >= 50400) {
+                $this->dom->loadHTML($this->html, LIBXML_NOBLANKS | LIBXML_COMPACT | LIBXML_NOERROR);
+            } else {
+                $this->dom->loadHTML($this->html);
+            }
+            // Drop the collected errors and restore the caller's setting rather than forcing it off.
+            libxml_clear_errors();
+            libxml_use_internal_errors($previousErrorMode);
+        }
+
+        $this->dom->registerNodeClass('DOMElement', 'Readability\JSLikeHTMLElement');
+    }
 }
diff --git a/tests/ReadabilityTest.php b/tests/ReadabilityTest.php
index e7d7bcd..a910e6e 100644
--- a/tests/ReadabilityTest.php
+++ b/tests/ReadabilityTest.php
@@ -11,17 +11,6 @@ class ReadabilityTest extends \PHPUnit_Framework_TestCase
     public $logHandler;
     public $logger;
 
-    private function getReadability($html, $url = null, $parser = 'libxml', $useTidy = true)
-    {
-        $readability = new Readability($html, $url, $parser, $useTidy);
-
-        $this->logHandler = new TestHandler();
-        $this->logger = new Logger('test', array($this->logHandler));
-        $readability->setLogger($this->logger);
-
-        return $readability;
-    }
-
     /**
      * @requires extension tidy
      */
@@ -479,4 +468,15 @@ class ReadabilityTest extends \PHPUnit_Framework_TestCase
 
         $this->assertTrue($res);
     }
+
+    private function getReadability($html, $url = null, $parser = 'libxml', $useTidy = true)
+    {
+        $instance = new Readability($html, $url, $parser, $useTidy);
+        // Attach a logger backed by a new TestHandler; both are stored on the test case.
+        $this->logHandler = new TestHandler();
+        $this->logger = new Logger('test', array($this->logHandler));
+        $instance->setLogger($this->logger);
+
+        return $instance;
+    }
 }